Merge branch 'for-2.6.27' of git://linux-nfs.org/~bfields/linux
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 21 Jul 2008 04:21:46 +0000 (21:21 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 21 Jul 2008 04:21:46 +0000 (21:21 -0700)
* 'for-2.6.27' of git://linux-nfs.org/~bfields/linux: (51 commits)
  nfsd: nfs4xdr.c do-while is not a compound statement
  nfsd: Use C99 initializers in fs/nfsd/nfs4xdr.c
  lockd: Pass "struct sockaddr *" to new failover-by-IP function
  lockd: get host reference in nlmsvc_create_block() instead of callers
  lockd: minor svclock.c style fixes
  lockd: eliminate duplicate nlmsvc_lookup_host call from nlmsvc_lock
  lockd: eliminate duplicate nlmsvc_lookup_host call from nlmsvc_testlock
  lockd: nlm_release_host() checks for NULL, caller needn't
  file lock: reorder struct file_lock to save space on 64 bit builds
  nfsd: take file and mnt write in nfs4_upgrade_open
  nfsd: document open share bit tracking
  nfsd: tabulate nfs4 xdr encoding functions
  nfsd: dprint operation names
  svcrdma: Change WR context get/put to use the kmem cache
  svcrdma: Create a kmem cache for the WR contexts
  svcrdma: Add flush_scheduled_work to module exit function
  svcrdma: Limit ORD based on client's advertised IRD
  svcrdma: Remove unused wait q from svcrdma_xprt structure
  svcrdma: Remove unneeded spin locks from __svc_rdma_free
  svcrdma: Add dma map count and WARN_ON
  ...

36 files changed:
Documentation/filesystems/nfs-rdma.txt
fs/lockd/svc.c
fs/lockd/svc4proc.c
fs/lockd/svclock.c
fs/lockd/svcproc.c
fs/lockd/svcsubs.c
fs/nfsd/lockd.c
fs/nfsd/nfs2acl.c
fs/nfsd/nfs3acl.c
fs/nfsd/nfs3proc.c
fs/nfsd/nfs4proc.c
fs/nfsd/nfs4state.c
fs/nfsd/nfs4xdr.c
fs/nfsd/nfsctl.c
fs/nfsd/nfsfh.c
fs/nfsd/nfsproc.c
fs/nfsd/nfssvc.c
fs/nfsd/vfs.c
include/linux/fs.h
include/linux/lockd/lockd.h
include/linux/nfs4.h
include/linux/nfsd/nfsd.h
include/linux/nfsd/state.h
include/linux/sunrpc/gss_krb5.h
include/linux/sunrpc/svc.h
include/linux/sunrpc/svc_rdma.h
net/sunrpc/auth_gss/Makefile
net/sunrpc/auth_gss/gss_krb5_crypto.c
net/sunrpc/auth_gss/gss_krb5_seal.c
net/sunrpc/auth_gss/gss_krb5_unseal.c
net/sunrpc/auth_gss/gss_krb5_wrap.c
net/sunrpc/svc.c
net/sunrpc/xprtrdma/svc_rdma.c
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
net/sunrpc/xprtrdma/svc_rdma_sendto.c
net/sunrpc/xprtrdma/svc_rdma_transport.c

index d0ec45a..44bd766 100644 (file)
@@ -5,7 +5,7 @@
 ################################################################################
 
  Author: NetApp and Open Grid Computing
- Date: April 15, 2008
+ Date: May 29, 2008
 
 Table of Contents
 ~~~~~~~~~~~~~~~~~
@@ -60,16 +60,18 @@ Installation
     The procedures described in this document have been tested with
     distributions from Red Hat's Fedora Project (http://fedora.redhat.com/).
 
-  - Install nfs-utils-1.1.1 or greater on the client
+  - Install nfs-utils-1.1.2 or greater on the client
 
-    An NFS/RDMA mount point can only be obtained by using the mount.nfs
-    command in nfs-utils-1.1.1 or greater. To see which version of mount.nfs
-    you are using, type:
+    An NFS/RDMA mount point can be obtained by using the mount.nfs command in
+    nfs-utils-1.1.2 or greater (nfs-utils-1.1.1 was the first nfs-utils
+    version with support for NFS/RDMA mounts, but for various reasons we
+    recommend using nfs-utils-1.1.2 or greater). To see which version of
+    mount.nfs you are using, type:
 
-    > /sbin/mount.nfs -V
+    $ /sbin/mount.nfs -V
 
-    If the version is less than 1.1.1 or the command does not exist,
-    then you will need to install the latest version of nfs-utils.
+    If the version is less than 1.1.2 or the command does not exist,
+    you should install the latest version of nfs-utils.
 
     Download the latest package from:
 
@@ -77,22 +79,33 @@ Installation
 
     Uncompress the package and follow the installation instructions.
 
-    If you will not be using GSS and NFSv4, the installation process
-    can be simplified by disabling these features when running configure:
+    If you will not need the idmapper and gssd executables (you do not need
+    these to create an NFS/RDMA enabled mount command), the installation
+    process can be simplified by disabling these features when running
+    configure:
 
-    > ./configure --disable-gss --disable-nfsv4
+    $ ./configure --disable-gss --disable-nfsv4
 
-    For more information on this see the package's README and INSTALL files.
+    To build nfs-utils you will need the tcp_wrappers package installed. For
+    more information on this see the package's README and INSTALL files.
 
     After building the nfs-utils package, there will be a mount.nfs binary in
     the utils/mount directory. This binary can be used to initiate NFS v2, v3,
-    or v4 mounts. To initiate a v4 mount, the binary must be called mount.nfs4.
-    The standard technique is to create a symlink called mount.nfs4 to mount.nfs.
+    or v4 mounts. To initiate a v4 mount, the binary must be called
+    mount.nfs4.  The standard technique is to create a symlink called
+    mount.nfs4 to mount.nfs.
 
-    NOTE: mount.nfs and therefore nfs-utils-1.1.1 or greater is only needed
+    This mount.nfs binary should be installed at /sbin/mount.nfs as follows:
+
+    $ sudo cp utils/mount/mount.nfs /sbin/mount.nfs
+
+    In this location, mount.nfs will be invoked automatically for NFS mounts
+    by the system mount command.
+
+    NOTE: mount.nfs and therefore nfs-utils-1.1.2 or greater is only needed
     on the NFS client machine. You do not need this specific version of
     nfs-utils on the server. Furthermore, only the mount.nfs command from
-    nfs-utils-1.1.1 is needed on the client.
+    nfs-utils-1.1.2 is needed on the client.
 
   - Install a Linux kernel with NFS/RDMA
 
@@ -156,8 +169,8 @@ Check RDMA and NFS Setup
     this time. For example, if you are using a Mellanox Tavor/Sinai/Arbel
     card:
 
-    > modprobe ib_mthca
-    > modprobe ib_ipoib
+    $ modprobe ib_mthca
+    $ modprobe ib_ipoib
 
     If you are using InfiniBand, make sure there is a Subnet Manager (SM)
     running on the network. If your IB switch has an embedded SM, you can
@@ -166,7 +179,7 @@ Check RDMA and NFS Setup
 
     If an SM is running on your network, you should see the following:
 
-    > cat /sys/class/infiniband/driverX/ports/1/state
+    $ cat /sys/class/infiniband/driverX/ports/1/state
     4: ACTIVE
 
     where driverX is mthca0, ipath5, ehca3, etc.
@@ -174,10 +187,10 @@ Check RDMA and NFS Setup
     To further test the InfiniBand software stack, use IPoIB (this
     assumes you have two IB hosts named host1 and host2):
 
-    host1> ifconfig ib0 a.b.c.x
-    host2> ifconfig ib0 a.b.c.y
-    host1> ping a.b.c.y
-    host2> ping a.b.c.x
+    host1$ ifconfig ib0 a.b.c.x
+    host2$ ifconfig ib0 a.b.c.y
+    host1$ ping a.b.c.y
+    host2$ ping a.b.c.x
 
     For other device types, follow the appropriate procedures.
 
@@ -202,11 +215,11 @@ NFS/RDMA Setup
     /vol0   192.168.0.47(fsid=0,rw,async,insecure,no_root_squash)
     /vol0   192.168.0.0/255.255.255.0(fsid=0,rw,async,insecure,no_root_squash)
 
-    The IP address(es) is(are) the client's IPoIB address for an InfiniBand HCA or the
-    cleint's iWARP address(es) for an RNIC.
+    The IP address(es) is(are) the client's IPoIB address for an InfiniBand
+    HCA or the client's iWARP address(es) for an RNIC.
 
-    NOTE: The "insecure" option must be used because the NFS/RDMA client does not
-    use a reserved port.
+    NOTE: The "insecure" option must be used because the NFS/RDMA client does
+    not use a reserved port.
 
  Each time a machine boots:
 
@@ -214,43 +227,45 @@ NFS/RDMA Setup
 
     For InfiniBand using a Mellanox adapter:
 
-    > modprobe ib_mthca
-    > modprobe ib_ipoib
-    > ifconfig ib0 a.b.c.d
+    $ modprobe ib_mthca
+    $ modprobe ib_ipoib
+    $ ifconfig ib0 a.b.c.d
 
     NOTE: use unique addresses for the client and server
 
   - Start the NFS server
 
-    If the NFS/RDMA server was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in kernel config),
-    load the RDMA transport module:
+    If the NFS/RDMA server was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in
+    kernel config), load the RDMA transport module:
 
-    > modprobe svcrdma
+    $ modprobe svcrdma
 
-    Regardless of how the server was built (module or built-in), start the server:
+    Regardless of how the server was built (module or built-in), start the
+    server:
 
-    > /etc/init.d/nfs start
+    $ /etc/init.d/nfs start
 
     or
 
-    > service nfs start
+    $ service nfs start
 
     Instruct the server to listen on the RDMA transport:
 
-    > echo rdma 2050 > /proc/fs/nfsd/portlist
+    $ echo rdma 2050 > /proc/fs/nfsd/portlist
 
   - On the client system
 
-    If the NFS/RDMA client was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in kernel config),
-    load the RDMA client module:
+    If the NFS/RDMA client was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in
+    kernel config), load the RDMA client module:
 
-    > modprobe xprtrdma.ko
+    $ modprobe xprtrdma.ko
 
-    Regardless of how the client was built (module or built-in), issue the mount.nfs command:
+    Regardless of how the client was built (module or built-in), use this
+    command to mount the NFS/RDMA server:
 
-    > /path/to/your/mount.nfs <IPoIB-server-name-or-address>:/<export> /mnt -i -o rdma,port=2050
+    $ mount -o rdma,port=2050 <IPoIB-server-name-or-address>:/<export> /mnt
 
-    To verify that the mount is using RDMA, run "cat /proc/mounts" and check the
-    "proto" field for the given mount.
+    To verify that the mount is using RDMA, run "cat /proc/mounts" and check
+    the "proto" field for the given mount.
 
   Congratulations! You're using NFS/RDMA!
index 2169af4..5bd9bf0 100644 (file)
@@ -50,7 +50,7 @@ EXPORT_SYMBOL(nlmsvc_ops);
 static DEFINE_MUTEX(nlmsvc_mutex);
 static unsigned int            nlmsvc_users;
 static struct task_struct      *nlmsvc_task;
-static struct svc_serv         *nlmsvc_serv;
+static struct svc_rqst         *nlmsvc_rqst;
 int                            nlmsvc_grace_period;
 unsigned long                  nlmsvc_timeout;
 
@@ -194,20 +194,11 @@ lockd(void *vrqstp)
 
                svc_process(rqstp);
        }
-
        flush_signals(current);
        if (nlmsvc_ops)
                nlmsvc_invalidate_all();
        nlm_shutdown_hosts();
-
        unlock_kernel();
-
-       nlmsvc_task = NULL;
-       nlmsvc_serv = NULL;
-
-       /* Exit the RPC thread */
-       svc_exit_thread(rqstp);
-
        return 0;
 }
 
@@ -254,16 +245,15 @@ int
 lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
 {
        struct svc_serv *serv;
-       struct svc_rqst *rqstp;
        int             error = 0;
 
        mutex_lock(&nlmsvc_mutex);
        /*
         * Check whether we're already up and running.
         */
-       if (nlmsvc_serv) {
+       if (nlmsvc_rqst) {
                if (proto)
-                       error = make_socks(nlmsvc_serv, proto);
+                       error = make_socks(nlmsvc_rqst->rq_server, proto);
                goto out;
        }
 
@@ -288,9 +278,10 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
        /*
         * Create the kernel thread and wait for it to start.
         */
-       rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]);
-       if (IS_ERR(rqstp)) {
-               error = PTR_ERR(rqstp);
+       nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0]);
+       if (IS_ERR(nlmsvc_rqst)) {
+               error = PTR_ERR(nlmsvc_rqst);
+               nlmsvc_rqst = NULL;
                printk(KERN_WARNING
                        "lockd_up: svc_rqst allocation failed, error=%d\n",
                        error);
@@ -298,16 +289,15 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
        }
 
        svc_sock_update_bufs(serv);
-       nlmsvc_serv = rqstp->rq_server;
 
-       nlmsvc_task = kthread_run(lockd, rqstp, serv->sv_name);
+       nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name);
        if (IS_ERR(nlmsvc_task)) {
                error = PTR_ERR(nlmsvc_task);
+               svc_exit_thread(nlmsvc_rqst);
                nlmsvc_task = NULL;
-               nlmsvc_serv = NULL;
+               nlmsvc_rqst = NULL;
                printk(KERN_WARNING
                        "lockd_up: kthread_run failed, error=%d\n", error);
-               svc_exit_thread(rqstp);
                goto destroy_and_out;
        }
 
@@ -346,6 +336,9 @@ lockd_down(void)
                BUG();
        }
        kthread_stop(nlmsvc_task);
+       svc_exit_thread(nlmsvc_rqst);
+       nlmsvc_task = NULL;
+       nlmsvc_rqst = NULL;
 out:
        mutex_unlock(&nlmsvc_mutex);
 }
index 2e27176..3994446 100644 (file)
@@ -58,8 +58,7 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
        return 0;
 
 no_locks:
-       if (host)
-               nlm_release_host(host);
+       nlm_release_host(host);
        if (error)
                return error;   
        return nlm_lck_denied_nolocks;
@@ -100,7 +99,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
                return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
 
        /* Now check for conflicting locks */
-       resp->status = nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie);
+       resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie);
        if (resp->status == nlm_drop_reply)
                rc = rpc_drop_reply;
        else
@@ -146,7 +145,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 #endif
 
        /* Now try to lock the file */
-       resp->status = nlmsvc_lock(rqstp, file, &argp->lock,
+       resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock,
                                        argp->block, &argp->cookie);
        if (resp->status == nlm_drop_reply)
                rc = rpc_drop_reply;
index 56a08ab..821b9ac 100644 (file)
@@ -129,9 +129,9 @@ nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock)
 
 static inline int nlm_cookie_match(struct nlm_cookie *a, struct nlm_cookie *b)
 {
-       if(a->len != b->len)
+       if (a->len != b->len)
                return 0;
-       if(memcmp(a->data,b->data,a->len))
+       if (memcmp(a->data, b->data, a->len))
                return 0;
        return 1;
 }
@@ -180,6 +180,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host,
        struct nlm_block        *block;
        struct nlm_rqst         *call = NULL;
 
+       nlm_get_host(host);
        call = nlm_alloc_call(host);
        if (call == NULL)
                return NULL;
@@ -358,10 +359,10 @@ nlmsvc_defer_lock_rqst(struct svc_rqst *rqstp, struct nlm_block *block)
  */
 __be32
 nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
-                       struct nlm_lock *lock, int wait, struct nlm_cookie *cookie)
+           struct nlm_host *host, struct nlm_lock *lock, int wait,
+           struct nlm_cookie *cookie)
 {
        struct nlm_block        *block = NULL;
-       struct nlm_host         *host;
        int                     error;
        __be32                  ret;
 
@@ -373,11 +374,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
                                (long long)lock->fl.fl_end,
                                wait);
 
-       /* Create host handle for callback */
-       host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len);
-       if (host == NULL)
-               return nlm_lck_denied_nolocks;
-
        /* Lock file against concurrent access */
        mutex_lock(&file->f_mutex);
        /* Get existing block (in case client is busy-waiting)
@@ -385,8 +381,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
         */
        block = nlmsvc_lookup_block(file, lock);
        if (block == NULL) {
-               block = nlmsvc_create_block(rqstp, nlm_get_host(host), file,
-                               lock, cookie);
+               block = nlmsvc_create_block(rqstp, host, file, lock, cookie);
                ret = nlm_lck_denied_nolocks;
                if (block == NULL)
                        goto out;
@@ -417,7 +412,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
        lock->fl.fl_flags &= ~FL_SLEEP;
 
        dprintk("lockd: vfs_lock_file returned %d\n", error);
-       switch(error) {
+       switch (error) {
                case 0:
                        ret = nlm_granted;
                        goto out;
@@ -450,7 +445,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 out:
        mutex_unlock(&file->f_mutex);
        nlmsvc_release_block(block);
-       nlm_release_host(host);
        dprintk("lockd: nlmsvc_lock returned %u\n", ret);
        return ret;
 }
@@ -460,8 +454,8 @@ out:
  */
 __be32
 nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
-               struct nlm_lock *lock, struct nlm_lock *conflock,
-               struct nlm_cookie *cookie)
+               struct nlm_host *host, struct nlm_lock *lock,
+               struct nlm_lock *conflock, struct nlm_cookie *cookie)
 {
        struct nlm_block        *block = NULL;
        int                     error;
@@ -479,16 +473,9 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
 
        if (block == NULL) {
                struct file_lock *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
-               struct nlm_host *host;
 
                if (conf == NULL)
                        return nlm_granted;
-               /* Create host handle for callback */
-               host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len);
-               if (host == NULL) {
-                       kfree(conf);
-                       return nlm_lck_denied_nolocks;
-               }
                block = nlmsvc_create_block(rqstp, host, file, lock, cookie);
                if (block == NULL) {
                        kfree(conf);
@@ -897,7 +884,7 @@ nlmsvc_retry_blocked(void)
 
                if (block->b_when == NLM_NEVER)
                        break;
-               if (time_after(block->b_when,jiffies)) {
+               if (time_after(block->b_when, jiffies)) {
                        timeout = block->b_when - jiffies;
                        break;
                }
index ce6952b..76019d2 100644 (file)
@@ -87,8 +87,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
        return 0;
 
 no_locks:
-       if (host)
-               nlm_release_host(host);
+       nlm_release_host(host);
        if (error)
                return error;
        return nlm_lck_denied_nolocks;
@@ -129,7 +128,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
                return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
 
        /* Now check for conflicting locks */
-       resp->status = cast_status(nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie));
+       resp->status = cast_status(nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie));
        if (resp->status == nlm_drop_reply)
                rc = rpc_drop_reply;
        else
@@ -176,7 +175,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 #endif
 
        /* Now try to lock the file */
-       resp->status = cast_status(nlmsvc_lock(rqstp, file, &argp->lock,
+       resp->status = cast_status(nlmsvc_lock(rqstp, file, host, &argp->lock,
                                               argp->block, &argp->cookie));
        if (resp->status == nlm_drop_reply)
                rc = rpc_drop_reply;
index d1c48b5..198b4e5 100644 (file)
@@ -373,13 +373,16 @@ nlmsvc_free_host_resources(struct nlm_host *host)
        }
 }
 
-/*
- * Remove all locks held for clients
+/**
+ * nlmsvc_invalidate_all - remove all locks held for clients
+ *
+ * Release all locks held by NFS clients.
+ *
  */
 void
 nlmsvc_invalidate_all(void)
 {
-       /* Release all locks held by NFS clients.
+       /*
         * Previously, the code would call
         * nlmsvc_free_host_resources for each client in
         * turn, which is about as inefficient as it gets.
@@ -396,6 +399,12 @@ nlmsvc_match_sb(void *datap, struct nlm_file *file)
        return sb == file->f_file->f_path.mnt->mnt_sb;
 }
 
+/**
+ * nlmsvc_unlock_all_by_sb - release locks held on this file system
+ * @sb: super block
+ *
+ * Release all locks held by clients accessing this file system.
+ */
 int
 nlmsvc_unlock_all_by_sb(struct super_block *sb)
 {
@@ -409,17 +418,22 @@ EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb);
 static int
 nlmsvc_match_ip(void *datap, struct nlm_host *host)
 {
-       __be32 *server_addr = datap;
-
-       return host->h_saddr.sin_addr.s_addr == *server_addr;
+       return nlm_cmp_addr(&host->h_saddr, datap);
 }
 
+/**
+ * nlmsvc_unlock_all_by_ip - release local locks by IP address
+ * @server_addr: server's IP address as seen by clients
+ *
+ * Release all locks held by clients accessing this host
+ * via the passed in IP address.
+ */
 int
-nlmsvc_unlock_all_by_ip(__be32 server_addr)
+nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr)
 {
        int ret;
-       ret = nlm_traverse_files(&server_addr, nlmsvc_match_ip, NULL);
-       return ret ? -EIO : 0;
 
+       ret = nlm_traverse_files(server_addr, nlmsvc_match_ip, NULL);
+       return ret ? -EIO : 0;
 }
 EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_ip);
index 9e4a568..6b6225a 100644 (file)
@@ -35,7 +35,7 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp)
        fh.fh_export = NULL;
 
        exp_readlock();
-       nfserr = nfsd_open(rqstp, &fh, S_IFREG, MAY_LOCK, filp);
+       nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp);
        fh_put(&fh);
        rqstp->rq_client = NULL;
        exp_readunlock();
index 1c3b765..4e3219e 100644 (file)
@@ -40,7 +40,8 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
        dprintk("nfsd: GETACL(2acl)   %s\n", SVCFH_fmt(&argp->fh));
 
        fh = fh_copy(&resp->fh, &argp->fh);
-       if ((nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP)))
+       nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
+       if (nfserr)
                RETURN_STATUS(nfserr);
 
        if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
@@ -107,7 +108,7 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp,
        dprintk("nfsd: SETACL(2acl)   %s\n", SVCFH_fmt(&argp->fh));
 
        fh = fh_copy(&resp->fh, &argp->fh);
-       nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_SATTR);
+       nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR);
 
        if (!nfserr) {
                nfserr = nfserrno( nfsd_set_posix_acl(
@@ -134,7 +135,7 @@ static __be32 nfsacld_proc_getattr(struct svc_rqst * rqstp,
        dprintk("nfsd: GETATTR  %s\n", SVCFH_fmt(&argp->fh));
 
        fh_copy(&resp->fh, &argp->fh);
-       return fh_verify(rqstp, &resp->fh, 0, MAY_NOP);
+       return fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
 }
 
 /*
index b647f2f..9981dbb 100644 (file)
@@ -36,7 +36,8 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp,
        __be32 nfserr = 0;
 
        fh = fh_copy(&resp->fh, &argp->fh);
-       if ((nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP)))
+       nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
+       if (nfserr)
                RETURN_STATUS(nfserr);
 
        if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
@@ -101,7 +102,7 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp,
        __be32 nfserr = 0;
 
        fh = fh_copy(&resp->fh, &argp->fh);
-       nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_SATTR);
+       nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR);
 
        if (!nfserr) {
                nfserr = nfserrno( nfsd_set_posix_acl(
index c721a1e..4d617ea 100644 (file)
@@ -63,7 +63,7 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle  *argp,
                SVCFH_fmt(&argp->fh));
 
        fh_copy(&resp->fh, &argp->fh);
-       nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP);
+       nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
        if (nfserr)
                RETURN_STATUS(nfserr);
 
@@ -242,7 +242,7 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
        attr   = &argp->attrs;
 
        /* Get the directory inode */
-       nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, MAY_CREATE);
+       nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_CREATE);
        if (nfserr)
                RETURN_STATUS(nfserr);
 
@@ -558,7 +558,7 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle    *argp,
        resp->f_maxfilesize = ~(u32) 0;
        resp->f_properties = NFS3_FSF_DEFAULT;
 
-       nfserr = fh_verify(rqstp, &argp->fh, 0, MAY_NOP);
+       nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP);
 
        /* Check special features of the file system. May request
         * different read/write sizes for file systems known to have
@@ -597,7 +597,7 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle      *argp,
        resp->p_case_insensitive = 0;
        resp->p_case_preserving = 1;
 
-       nfserr = fh_verify(rqstp, &argp->fh, 0, MAY_NOP);
+       nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP);
 
        if (nfserr == 0) {
                struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb;
index c309c88..eef1629 100644 (file)
@@ -71,11 +71,11 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs
                return nfserr_inval;
 
        if (open->op_share_access & NFS4_SHARE_ACCESS_READ)
-               accmode |= MAY_READ;
+               accmode |= NFSD_MAY_READ;
        if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
-               accmode |= (MAY_WRITE | MAY_TRUNC);
+               accmode |= (NFSD_MAY_WRITE | NFSD_MAY_TRUNC);
        if (open->op_share_deny & NFS4_SHARE_DENY_WRITE)
-               accmode |= MAY_WRITE;
+               accmode |= NFSD_MAY_WRITE;
 
        status = fh_verify(rqstp, current_fh, S_IFREG, accmode);
 
@@ -126,7 +126,8 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
                        &resfh.fh_handle.fh_base, resfh.fh_handle.fh_size);
 
        if (!created)
-               status = do_open_permission(rqstp, current_fh, open, MAY_NOP);
+               status = do_open_permission(rqstp, current_fh, open,
+                                           NFSD_MAY_NOP);
 
 out:
        fh_put(&resfh);
@@ -157,7 +158,8 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
        open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) &&
                (open->op_iattr.ia_size == 0);
 
-       status = do_open_permission(rqstp, current_fh, open, MAY_OWNER_OVERRIDE);
+       status = do_open_permission(rqstp, current_fh, open,
+                                   NFSD_MAY_OWNER_OVERRIDE);
 
        return status;
 }
@@ -186,7 +188,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                cstate->current_fh.fh_handle.fh_size = rp->rp_openfh_len;
                memcpy(&cstate->current_fh.fh_handle.fh_base, rp->rp_openfh,
                                rp->rp_openfh_len);
-               status = fh_verify(rqstp, &cstate->current_fh, 0, MAY_NOP);
+               status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
                if (status)
                        dprintk("nfsd4_open: replay failed"
                                " restoring previous filehandle\n");
@@ -285,7 +287,7 @@ nfsd4_putfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen;
        memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval,
               putfh->pf_fhlen);
-       return fh_verify(rqstp, &cstate->current_fh, 0, MAY_NOP);
+       return fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
 }
 
 static __be32
@@ -363,7 +365,8 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
        fh_init(&resfh, NFS4_FHSIZE);
 
-       status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, MAY_CREATE);
+       status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR,
+                          NFSD_MAY_CREATE);
        if (status == nfserr_symlink)
                status = nfserr_notdir;
        if (status)
@@ -445,7 +448,7 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 {
        __be32 status;
 
-       status = fh_verify(rqstp, &cstate->current_fh, 0, MAY_NOP);
+       status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
        if (status)
                return status;
 
@@ -730,7 +733,7 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        int count;
        __be32 status;
 
-       status = fh_verify(rqstp, &cstate->current_fh, 0, MAY_NOP);
+       status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
        if (status)
                return status;
 
@@ -843,10 +846,13 @@ struct nfsd4_operation {
 #define ALLOWED_WITHOUT_FH 1
 /* GETATTR and ops not listed as returning NFS4ERR_MOVED: */
 #define ALLOWED_ON_ABSENT_FS 2
+       char *op_name;
 };
 
 static struct nfsd4_operation nfsd4_ops[];
 
+static inline char *nfsd4_op_name(unsigned opnum);
+
 /*
  * COMPOUND call.
  */
@@ -888,7 +894,9 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
        while (!status && resp->opcnt < args->opcnt) {
                op = &args->ops[resp->opcnt++];
 
-               dprintk("nfsv4 compound op #%d: %d\n", resp->opcnt, op->opnum);
+               dprintk("nfsv4 compound op #%d/%d: %d (%s)\n",
+                       resp->opcnt, args->opcnt, op->opnum,
+                       nfsd4_op_name(op->opnum));
 
                /*
                 * The XDR decode routines may have pre-set op->status;
@@ -952,126 +960,170 @@ encode_op:
 out:
        nfsd4_release_compoundargs(args);
        cstate_free(cstate);
+       dprintk("nfsv4 compound returned %d\n", ntohl(status));
        return status;
 }
 
 static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
        [OP_ACCESS] = {
                .op_func = (nfsd4op_func)nfsd4_access,
+               .op_name = "OP_ACCESS",
        },
        [OP_CLOSE] = {
                .op_func = (nfsd4op_func)nfsd4_close,
+               .op_name = "OP_CLOSE",
        },
        [OP_COMMIT] = {
                .op_func = (nfsd4op_func)nfsd4_commit,
+               .op_name = "OP_COMMIT",
        },
        [OP_CREATE] = {
                .op_func = (nfsd4op_func)nfsd4_create,
+               .op_name = "OP_CREATE",
        },
        [OP_DELEGRETURN] = {
                .op_func = (nfsd4op_func)nfsd4_delegreturn,
+               .op_name = "OP_DELEGRETURN",
        },
        [OP_GETATTR] = {
                .op_func = (nfsd4op_func)nfsd4_getattr,
                .op_flags = ALLOWED_ON_ABSENT_FS,
+               .op_name = "OP_GETATTR",
        },
        [OP_GETFH] = {
                .op_func = (nfsd4op_func)nfsd4_getfh,
+               .op_name = "OP_GETFH",
        },
        [OP_LINK] = {
                .op_func = (nfsd4op_func)nfsd4_link,
+               .op_name = "OP_LINK",
        },
        [OP_LOCK] = {
                .op_func = (nfsd4op_func)nfsd4_lock,
+               .op_name = "OP_LOCK",
        },
        [OP_LOCKT] = {
                .op_func = (nfsd4op_func)nfsd4_lockt,
+               .op_name = "OP_LOCKT",
        },
        [OP_LOCKU] = {
                .op_func = (nfsd4op_func)nfsd4_locku,
+               .op_name = "OP_LOCKU",
        },
        [OP_LOOKUP] = {
                .op_func = (nfsd4op_func)nfsd4_lookup,
+               .op_name = "OP_LOOKUP",
        },
        [OP_LOOKUPP] = {
                .op_func = (nfsd4op_func)nfsd4_lookupp,
+               .op_name = "OP_LOOKUPP",
        },
        [OP_NVERIFY] = {
                .op_func = (nfsd4op_func)nfsd4_nverify,
+               .op_name = "OP_NVERIFY",
        },
        [OP_OPEN] = {
                .op_func = (nfsd4op_func)nfsd4_open,
+               .op_name = "OP_OPEN",
        },
        [OP_OPEN_CONFIRM] = {
                .op_func = (nfsd4op_func)nfsd4_open_confirm,
+               .op_name = "OP_OPEN_CONFIRM",
        },
        [OP_OPEN_DOWNGRADE] = {
                .op_func = (nfsd4op_func)nfsd4_open_downgrade,
+               .op_name = "OP_OPEN_DOWNGRADE",
        },
        [OP_PUTFH] = {
                .op_func = (nfsd4op_func)nfsd4_putfh,
                .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+               .op_name = "OP_PUTFH",
        },
        [OP_PUTPUBFH] = {
-               /* unsupported; just for future reference: */
+               /* unsupported, just for future reference: */
                .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+               .op_name = "OP_PUTPUBFH",
        },
        [OP_PUTROOTFH] = {
                .op_func = (nfsd4op_func)nfsd4_putrootfh,
                .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+               .op_name = "OP_PUTROOTFH",
        },
        [OP_READ] = {
                .op_func = (nfsd4op_func)nfsd4_read,
+               .op_name = "OP_READ",
        },
        [OP_READDIR] = {
                .op_func = (nfsd4op_func)nfsd4_readdir,
+               .op_name = "OP_READDIR",
        },
        [OP_READLINK] = {
                .op_func = (nfsd4op_func)nfsd4_readlink,
+               .op_name = "OP_READLINK",
        },
        [OP_REMOVE] = {
                .op_func = (nfsd4op_func)nfsd4_remove,
+               .op_name = "OP_REMOVE",
        },
        [OP_RENAME] = {
+               .op_name = "OP_RENAME",
                .op_func = (nfsd4op_func)nfsd4_rename,
        },
        [OP_RENEW] = {
                .op_func = (nfsd4op_func)nfsd4_renew,
                .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+               .op_name = "OP_RENEW",
        },
        [OP_RESTOREFH] = {
                .op_func = (nfsd4op_func)nfsd4_restorefh,
                .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+               .op_name = "OP_RESTOREFH",
        },
        [OP_SAVEFH] = {
                .op_func = (nfsd4op_func)nfsd4_savefh,
+               .op_name = "OP_SAVEFH",
        },
        [OP_SECINFO] = {
                .op_func = (nfsd4op_func)nfsd4_secinfo,
+               .op_name = "OP_SECINFO",
        },
        [OP_SETATTR] = {
                .op_func = (nfsd4op_func)nfsd4_setattr,
+               .op_name = "OP_SETATTR",
        },
        [OP_SETCLIENTID] = {
                .op_func = (nfsd4op_func)nfsd4_setclientid,
                .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+               .op_name = "OP_SETCLIENTID",
        },
        [OP_SETCLIENTID_CONFIRM] = {
                .op_func = (nfsd4op_func)nfsd4_setclientid_confirm,
                .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+               .op_name = "OP_SETCLIENTID_CONFIRM",
        },
        [OP_VERIFY] = {
                .op_func = (nfsd4op_func)nfsd4_verify,
+               .op_name = "OP_VERIFY",
        },
        [OP_WRITE] = {
                .op_func = (nfsd4op_func)nfsd4_write,
+               .op_name = "OP_WRITE",
        },
        [OP_RELEASE_LOCKOWNER] = {
                .op_func = (nfsd4op_func)nfsd4_release_lockowner,
                .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+               .op_name = "OP_RELEASE_LOCKOWNER",
        },
 };
 
+static inline char *
+nfsd4_op_name(unsigned opnum)
+{
+       if (opnum < ARRAY_SIZE(nfsd4_ops) && nfsd4_ops[opnum].op_name)
+               return nfsd4_ops[opnum].op_name; /* op has a named table entry */
+       return "unknown_operation"; /* out of range, or slot without a name */
+}
+
 #define nfs4svc_decode_voidargs                NULL
 #define nfs4svc_release_void           NULL
 #define nfsd4_voidres                  nfsd4_voidargs
index 8799b87..1578d7a 100644 (file)
@@ -1173,6 +1173,24 @@ static inline int deny_valid(u32 x)
        return x <= NFS4_SHARE_DENY_BOTH;
 }
 
+/*
+ * We store the NONE, READ, WRITE, and BOTH bits separately in the
+ * st_{access,deny}_bmap field of the stateid, in order to track not
+ * only what share bits are currently in force, but also what
+ * combinations of share bits previous opens have used.  This allows us
+ * to enforce the recommendation of rfc 3530 14.2.19 that the server
+ * return an error if the client attempts to downgrade to a combination
+ * of share bits not explicable by closing some of its previous opens.
+ *
+ * XXX: This enforcement is actually incomplete, since we don't keep
+ * track of access/deny bit combinations; so, e.g., we allow:
+ *
+ *     OPEN allow read, deny write
+ *     OPEN allow both, deny none
+ *     DOWNGRADE allow read, deny none
+ *
+ * which we should reject.
+ */
 static void
 set_access(unsigned int *access, unsigned long bmap) {
        int i;
@@ -1570,6 +1588,10 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct svc_fh *cur_fh, struct nfs4_sta
                int err = get_write_access(inode);
                if (err)
                        return nfserrno(err);
+               err = mnt_want_write(cur_fh->fh_export->ex_path.mnt);
+               if (err)
+                       return nfserrno(err);
+               file_take_write(filp);
        }
        status = nfsd4_truncate(rqstp, cur_fh, open);
        if (status) {
@@ -1579,8 +1601,8 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct svc_fh *cur_fh, struct nfs4_sta
        }
        /* remember the open */
        filp->f_mode |= open->op_share_access;
-       set_bit(open->op_share_access, &stp->st_access_bmap);
-       set_bit(open->op_share_deny, &stp->st_deny_bmap);
+       __set_bit(open->op_share_access, &stp->st_access_bmap);
+       __set_bit(open->op_share_deny, &stp->st_deny_bmap);
 
        return nfs_ok;
 }
@@ -1722,9 +1744,9 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
                /* Stateid was not found, this is a new OPEN */
                int flags = 0;
                if (open->op_share_access & NFS4_SHARE_ACCESS_READ)
-                       flags |= MAY_READ;
+                       flags |= NFSD_MAY_READ;
                if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
-                       flags |= MAY_WRITE;
+                       flags |= NFSD_MAY_WRITE;
                status = nfs4_new_open(rqstp, &stp, dp, current_fh, flags);
                if (status)
                        goto out;
@@ -2610,7 +2632,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                 return nfserr_inval;
 
        if ((status = fh_verify(rqstp, &cstate->current_fh,
-                               S_IFREG, MAY_LOCK))) {
+                               S_IFREG, NFSD_MAY_LOCK))) {
                dprintk("NFSD: nfsd4_lock: permission denied!\n");
                return status;
        }
@@ -3249,12 +3271,14 @@ nfs4_state_shutdown(void)
        nfs4_unlock_state();
 }
 
+/*
+ * user_recovery_dirname is protected by the nfsd_mutex since it's only
+ * accessed when nfsd is starting.
+ */
 static void
 nfs4_set_recdir(char *recdir)
 {
-       nfs4_lock_state();
        strcpy(user_recovery_dirname, recdir);
-       nfs4_unlock_state();
 }
 
 /*
@@ -3278,6 +3302,12 @@ nfs4_reset_recoverydir(char *recdir)
        return status;
 }
 
+char *
+nfs4_recoverydir(void)
+{
+       return user_recovery_dirname; /* the stored name itself, not a copy */
+}
+
 /*
  * Called when leasetime is changed.
  *
@@ -3286,11 +3316,12 @@ nfs4_reset_recoverydir(char *recdir)
  * we start to register any changes in lease time.  If the administrator
  * really wants to change the lease time *now*, they can go ahead and bring
  * nfsd down and then back up again after changing the lease time.
+ *
+ * user_lease_time is protected by nfsd_mutex since it's only really accessed
+ * when nfsd is starting.
  */
 void
 nfs4_reset_lease(time_t leasetime)
 {
-       lock_kernel();
        user_lease_time = leasetime;
-       unlock_kernel();
 }
index c513bbd..14ba4d9 100644 (file)
@@ -985,11 +985,75 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel
        DECODE_TAIL;
 }
 
+static __be32
+nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
+{
+       return nfs_ok; /* always succeeds; argp and p are left untouched */
+}
+
+static __be32
+nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p)
+{
+       return nfserr_opnotsupp; /* unconditional; argp and p are left untouched */
+}
+
+typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *);
+
+static nfsd4_dec nfsd4_dec_ops[] = { /* slot index is the OP_* number */
+       [OP_ACCESS]             = (nfsd4_dec)nfsd4_decode_access,
+       [OP_CLOSE]              = (nfsd4_dec)nfsd4_decode_close,
+       [OP_COMMIT]             = (nfsd4_dec)nfsd4_decode_commit,
+       [OP_CREATE]             = (nfsd4_dec)nfsd4_decode_create,
+       [OP_DELEGPURGE]         = (nfsd4_dec)nfsd4_decode_notsupp,
+       [OP_DELEGRETURN]        = (nfsd4_dec)nfsd4_decode_delegreturn,
+       [OP_GETATTR]            = (nfsd4_dec)nfsd4_decode_getattr,
+       [OP_GETFH]              = (nfsd4_dec)nfsd4_decode_noop,
+       [OP_LINK]               = (nfsd4_dec)nfsd4_decode_link,
+       [OP_LOCK]               = (nfsd4_dec)nfsd4_decode_lock,
+       [OP_LOCKT]              = (nfsd4_dec)nfsd4_decode_lockt,
+       [OP_LOCKU]              = (nfsd4_dec)nfsd4_decode_locku,
+       [OP_LOOKUP]             = (nfsd4_dec)nfsd4_decode_lookup,
+       [OP_LOOKUPP]            = (nfsd4_dec)nfsd4_decode_noop,
+       [OP_NVERIFY]            = (nfsd4_dec)nfsd4_decode_verify, /* same decoder as OP_VERIFY */
+       [OP_OPEN]               = (nfsd4_dec)nfsd4_decode_open,
+       [OP_OPENATTR]           = (nfsd4_dec)nfsd4_decode_notsupp,
+       [OP_OPEN_CONFIRM]       = (nfsd4_dec)nfsd4_decode_open_confirm,
+       [OP_OPEN_DOWNGRADE]     = (nfsd4_dec)nfsd4_decode_open_downgrade,
+       [OP_PUTFH]              = (nfsd4_dec)nfsd4_decode_putfh,
+       [OP_PUTPUBFH]           = (nfsd4_dec)nfsd4_decode_notsupp,
+       [OP_PUTROOTFH]          = (nfsd4_dec)nfsd4_decode_noop,
+       [OP_READ]               = (nfsd4_dec)nfsd4_decode_read,
+       [OP_READDIR]            = (nfsd4_dec)nfsd4_decode_readdir,
+       [OP_READLINK]           = (nfsd4_dec)nfsd4_decode_noop,
+       [OP_REMOVE]             = (nfsd4_dec)nfsd4_decode_remove,
+       [OP_RENAME]             = (nfsd4_dec)nfsd4_decode_rename,
+       [OP_RENEW]              = (nfsd4_dec)nfsd4_decode_renew,
+       [OP_RESTOREFH]          = (nfsd4_dec)nfsd4_decode_noop,
+       [OP_SAVEFH]             = (nfsd4_dec)nfsd4_decode_noop,
+       [OP_SECINFO]            = (nfsd4_dec)nfsd4_decode_secinfo,
+       [OP_SETATTR]            = (nfsd4_dec)nfsd4_decode_setattr,
+       [OP_SETCLIENTID]        = (nfsd4_dec)nfsd4_decode_setclientid,
+       [OP_SETCLIENTID_CONFIRM] = (nfsd4_dec)nfsd4_decode_setclientid_confirm,
+       [OP_VERIFY]             = (nfsd4_dec)nfsd4_decode_verify,
+       [OP_WRITE]              = (nfsd4_dec)nfsd4_decode_write,
+       [OP_RELEASE_LOCKOWNER]  = (nfsd4_dec)nfsd4_decode_release_lockowner,
+};
+
+struct nfsd4_minorversion_ops {
+       nfsd4_dec *decoders; /* decoder table, looked up by opnum */
+       int nops; /* number of slots in decoders */
+};
+
+static struct nfsd4_minorversion_ops nfsd4_minorversion[] = { /* indexed by argp->minorversion */
+       [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) },
+};
+
 static __be32
 nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 {
        DECODE_HEAD;
        struct nfsd4_op *op;
+       struct nfsd4_minorversion_ops *ops;
        int i;
 
        /*
@@ -1019,6 +1083,10 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
                }
        }
 
+       if (argp->minorversion >= ARRAY_SIZE(nfsd4_minorversion))
+               argp->opcnt = 0;
+
+       ops = &nfsd4_minorversion[argp->minorversion];
        for (i = 0; i < argp->opcnt; i++) {
                op = &argp->ops[i];
                op->replay = NULL;
@@ -1056,120 +1124,11 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
                }
                op->opnum = ntohl(*argp->p++);
 
-               switch (op->opnum) {
-               case 2: /* Reserved operation */
-                       op->opnum = OP_ILLEGAL;
-                       if (argp->minorversion == 0)
-                               op->status = nfserr_op_illegal;
-                       else
-                               op->status = nfserr_minor_vers_mismatch;
-                       break;
-               case OP_ACCESS:
-                       op->status = nfsd4_decode_access(argp, &op->u.access);
-                       break;
-               case OP_CLOSE:
-                       op->status = nfsd4_decode_close(argp, &op->u.close);
-                       break;
-               case OP_COMMIT:
-                       op->status = nfsd4_decode_commit(argp, &op->u.commit);
-                       break;
-               case OP_CREATE:
-                       op->status = nfsd4_decode_create(argp, &op->u.create);
-                       break;
-               case OP_DELEGRETURN:
-                       op->status = nfsd4_decode_delegreturn(argp, &op->u.delegreturn);
-                       break;
-               case OP_GETATTR:
-                       op->status = nfsd4_decode_getattr(argp, &op->u.getattr);
-                       break;
-               case OP_GETFH:
-                       op->status = nfs_ok;
-                       break;
-               case OP_LINK:
-                       op->status = nfsd4_decode_link(argp, &op->u.link);
-                       break;
-               case OP_LOCK:
-                       op->status = nfsd4_decode_lock(argp, &op->u.lock);
-                       break;
-               case OP_LOCKT:
-                       op->status = nfsd4_decode_lockt(argp, &op->u.lockt);
-                       break;
-               case OP_LOCKU:
-                       op->status = nfsd4_decode_locku(argp, &op->u.locku);
-                       break;
-               case OP_LOOKUP:
-                       op->status = nfsd4_decode_lookup(argp, &op->u.lookup);
-                       break;
-               case OP_LOOKUPP:
-                       op->status = nfs_ok;
-                       break;
-               case OP_NVERIFY:
-                       op->status = nfsd4_decode_verify(argp, &op->u.nverify);
-                       break;
-               case OP_OPEN:
-                       op->status = nfsd4_decode_open(argp, &op->u.open);
-                       break;
-               case OP_OPEN_CONFIRM:
-                       op->status = nfsd4_decode_open_confirm(argp, &op->u.open_confirm);
-                       break;
-               case OP_OPEN_DOWNGRADE:
-                       op->status = nfsd4_decode_open_downgrade(argp, &op->u.open_downgrade);
-                       break;
-               case OP_PUTFH:
-                       op->status = nfsd4_decode_putfh(argp, &op->u.putfh);
-                       break;
-               case OP_PUTROOTFH:
-                       op->status = nfs_ok;
-                       break;
-               case OP_READ:
-                       op->status = nfsd4_decode_read(argp, &op->u.read);
-                       break;
-               case OP_READDIR:
-                       op->status = nfsd4_decode_readdir(argp, &op->u.readdir);
-                       break;
-               case OP_READLINK:
-                       op->status = nfs_ok;
-                       break;
-               case OP_REMOVE:
-                       op->status = nfsd4_decode_remove(argp, &op->u.remove);
-                       break;
-               case OP_RENAME:
-                       op->status = nfsd4_decode_rename(argp, &op->u.rename);
-                       break;
-               case OP_RESTOREFH:
-                       op->status = nfs_ok;
-                       break;
-               case OP_RENEW:
-                       op->status = nfsd4_decode_renew(argp, &op->u.renew);
-                       break;
-               case OP_SAVEFH:
-                       op->status = nfs_ok;
-                       break;
-               case OP_SECINFO:
-                       op->status = nfsd4_decode_secinfo(argp, &op->u.secinfo);
-                       break;
-               case OP_SETATTR:
-                       op->status = nfsd4_decode_setattr(argp, &op->u.setattr);
-                       break;
-               case OP_SETCLIENTID:
-                       op->status = nfsd4_decode_setclientid(argp, &op->u.setclientid);
-                       break;
-               case OP_SETCLIENTID_CONFIRM:
-                       op->status = nfsd4_decode_setclientid_confirm(argp, &op->u.setclientid_confirm);
-                       break;
-               case OP_VERIFY:
-                       op->status = nfsd4_decode_verify(argp, &op->u.verify);
-                       break;
-               case OP_WRITE:
-                       op->status = nfsd4_decode_write(argp, &op->u.write);
-                       break;
-               case OP_RELEASE_LOCKOWNER:
-                       op->status = nfsd4_decode_release_lockowner(argp, &op->u.release_lockowner);
-                       break;
-               default:
+               if (op->opnum >= OP_ACCESS && op->opnum < ops->nops)
+                       op->status = ops->decoders[op->opnum](argp, &op->u);
+               else {
                        op->opnum = OP_ILLEGAL;
                        op->status = nfserr_op_illegal;
-                       break;
                }
 
                if (op->status) {
@@ -1201,11 +1160,11 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
        *p++ = htonl((u32)((n) >> 32));                         \
        *p++ = htonl((u32)(n));                                 \
 } while (0)
-#define WRITEMEM(ptr,nbytes)     do {                          \
+#define WRITEMEM(ptr,nbytes)     do { if ((nbytes) > 0) {              \
        *(p + XDR_QUADLEN(nbytes) -1) = 0;                      \
        memcpy(p, ptr, nbytes);                                 \
        p += XDR_QUADLEN(nbytes);                               \
-} while (0)
+}} while (0) /* nbytes <= 0 writes nothing and leaves p alone */
 #define WRITECINFO(c)          do {                            \
        *p++ = htonl(c.atomic);                                 \
        *p++ = htonl(c.before_ctime_sec);                               \
@@ -1991,7 +1950,7 @@ fail:
        return -EINVAL;
 }
 
-static void
+static __be32
 nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access)
 {
        ENCODE_HEAD;
@@ -2002,9 +1961,10 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
                WRITE32(access->ac_resp_access);
                ADJUST_ARGS();
        }
+       return nfserr;
 }
 
-static void
+static __be32
 nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close)
 {
        ENCODE_SEQID_OP_HEAD;
@@ -2016,10 +1976,11 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c
                ADJUST_ARGS();
        }
        ENCODE_SEQID_OP_TAIL(close->cl_stateowner);
+       return nfserr;
 }
 
 
-static void
+static __be32
 nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit)
 {
        ENCODE_HEAD;
@@ -2029,9 +1990,10 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
                WRITEMEM(commit->co_verf.data, 8);
                ADJUST_ARGS();
        }
+       return nfserr;
 }
 
-static void
+static __be32
 nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create)
 {
        ENCODE_HEAD;
@@ -2044,6 +2006,7 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
                WRITE32(create->cr_bmval[1]);
                ADJUST_ARGS();
        }
+       return nfserr;
 }
 
 static __be32
@@ -2064,9 +2027,10 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
        return nfserr;
 }
 
-static void
-nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh *fhp)
+static __be32
+nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh **fhpp)
 {
+       struct svc_fh *fhp = *fhpp;
        unsigned int len;
        ENCODE_HEAD;
 
@@ -2077,6 +2041,7 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh
                WRITEMEM(&fhp->fh_handle.fh_base, len);
                ADJUST_ARGS();
        }
+       return nfserr;
 }
 
 /*
@@ -2104,7 +2069,7 @@ nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denie
        ADJUST_ARGS();
 }
 
-static void
+static __be32
 nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock)
 {
        ENCODE_SEQID_OP_HEAD;
@@ -2118,16 +2083,18 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo
                nfsd4_encode_lock_denied(resp, &lock->lk_denied);
 
        ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner);
+       return nfserr;
 }
 
-static void
+static __be32
 nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt)
 {
        if (nfserr == nfserr_denied)
                nfsd4_encode_lock_denied(resp, &lockt->lt_denied);
+       return nfserr;
 }
 
-static void
+static __be32
 nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku)
 {
        ENCODE_SEQID_OP_HEAD;
@@ -2140,10 +2107,11 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
        }
                                        
        ENCODE_SEQID_OP_TAIL(locku->lu_stateowner);
+       return nfserr;
 }
 
 
-static void
+static __be32
 nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link)
 {
        ENCODE_HEAD;
@@ -2153,10 +2121,11 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li
                WRITECINFO(link->li_cinfo);
                ADJUST_ARGS();
        }
+       return nfserr;
 }
 
 
-static void
+static __be32
 nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open)
 {
        ENCODE_SEQID_OP_HEAD;
@@ -2219,9 +2188,10 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
        /* XXX save filehandle here */
 out:
        ENCODE_SEQID_OP_TAIL(open->op_stateowner);
+       return nfserr;
 }
 
-static void
+static __be32
 nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc)
 {
        ENCODE_SEQID_OP_HEAD;
@@ -2234,9 +2204,10 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct
        }
 
        ENCODE_SEQID_OP_TAIL(oc->oc_stateowner);
+       return nfserr;
 }
 
-static void
+static __be32
 nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od)
 {
        ENCODE_SEQID_OP_HEAD;
@@ -2249,6 +2220,7 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struc
        }
 
        ENCODE_SEQID_OP_TAIL(od->od_stateowner);
+       return nfserr;
 }
 
 static __be32
@@ -2443,7 +2415,7 @@ err_no_verf:
        return nfserr;
 }
 
-static void
+static __be32
 nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove)
 {
        ENCODE_HEAD;
@@ -2453,9 +2425,10 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
                WRITECINFO(remove->rm_cinfo);
                ADJUST_ARGS();
        }
+       return nfserr;
 }
 
-static void
+static __be32
 nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename)
 {
        ENCODE_HEAD;
@@ -2466,9 +2439,10 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
                WRITECINFO(rename->rn_tinfo);
                ADJUST_ARGS();
        }
+       return nfserr;
 }
 
-static void
+static __be32
 nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
                     struct nfsd4_secinfo *secinfo)
 {
@@ -2532,13 +2506,14 @@ nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
 out:
        if (exp)
                exp_put(exp);
+       return nfserr;
 }
 
 /*
  * The SETATTR encode routine is special -- it always encodes a bitmap,
  * regardless of the error status.
  */
-static void
+static __be32
 nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr)
 {
        ENCODE_HEAD;
@@ -2555,9 +2530,10 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
                WRITE32(setattr->sa_bmval[1]);
        }
        ADJUST_ARGS();
+       return nfserr;
 }
 
-static void
+static __be32
 nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd)
 {
        ENCODE_HEAD;
@@ -2574,9 +2550,10 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct n
                WRITE32(0);
                ADJUST_ARGS();
        }
+       return nfserr;
 }
 
-static void
+static __be32
 nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write)
 {
        ENCODE_HEAD;
@@ -2588,8 +2565,56 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
                WRITEMEM(write->wr_verifier.data, 8);
                ADJUST_ARGS();
        }
+       return nfserr;
 }
 
+static __be32
+nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
+{
+       return nfserr; /* status passes through; resp and p are not touched */
+}
+
+typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *);
+
+static nfsd4_enc nfsd4_enc_ops[] = { /* slot index is the OP_* number */
+       [OP_ACCESS]             = (nfsd4_enc)nfsd4_encode_access,
+       [OP_CLOSE]              = (nfsd4_enc)nfsd4_encode_close,
+       [OP_COMMIT]             = (nfsd4_enc)nfsd4_encode_commit,
+       [OP_CREATE]             = (nfsd4_enc)nfsd4_encode_create,
+       [OP_DELEGPURGE]         = (nfsd4_enc)nfsd4_encode_noop,
+       [OP_DELEGRETURN]        = (nfsd4_enc)nfsd4_encode_noop,
+       [OP_GETATTR]            = (nfsd4_enc)nfsd4_encode_getattr,
+       [OP_GETFH]              = (nfsd4_enc)nfsd4_encode_getfh,
+       [OP_LINK]               = (nfsd4_enc)nfsd4_encode_link,
+       [OP_LOCK]               = (nfsd4_enc)nfsd4_encode_lock,
+       [OP_LOCKT]              = (nfsd4_enc)nfsd4_encode_lockt,
+       [OP_LOCKU]              = (nfsd4_enc)nfsd4_encode_locku,
+       [OP_LOOKUP]             = (nfsd4_enc)nfsd4_encode_noop,
+       [OP_LOOKUPP]            = (nfsd4_enc)nfsd4_encode_noop,
+       [OP_NVERIFY]            = (nfsd4_enc)nfsd4_encode_noop,
+       [OP_OPEN]               = (nfsd4_enc)nfsd4_encode_open,
+       [OP_OPENATTR]           = (nfsd4_enc)nfsd4_encode_noop, /* decoder returns opnotsupp; status only */
+       [OP_OPEN_CONFIRM]       = (nfsd4_enc)nfsd4_encode_open_confirm,
+       [OP_OPEN_DOWNGRADE]     = (nfsd4_enc)nfsd4_encode_open_downgrade,
+       [OP_PUTFH]              = (nfsd4_enc)nfsd4_encode_noop,
+       [OP_PUTPUBFH]           = (nfsd4_enc)nfsd4_encode_noop,
+       [OP_PUTROOTFH]          = (nfsd4_enc)nfsd4_encode_noop,
+       [OP_READ]               = (nfsd4_enc)nfsd4_encode_read,
+       [OP_READDIR]            = (nfsd4_enc)nfsd4_encode_readdir,
+       [OP_READLINK]           = (nfsd4_enc)nfsd4_encode_readlink,
+       [OP_REMOVE]             = (nfsd4_enc)nfsd4_encode_remove,
+       [OP_RENAME]             = (nfsd4_enc)nfsd4_encode_rename,
+       [OP_RENEW]              = (nfsd4_enc)nfsd4_encode_noop,
+       [OP_RESTOREFH]          = (nfsd4_enc)nfsd4_encode_noop,
+       [OP_SAVEFH]             = (nfsd4_enc)nfsd4_encode_noop,
+       [OP_SECINFO]            = (nfsd4_enc)nfsd4_encode_secinfo,
+       [OP_SETATTR]            = (nfsd4_enc)nfsd4_encode_setattr,
+       [OP_SETCLIENTID]        = (nfsd4_enc)nfsd4_encode_setclientid,
+       [OP_SETCLIENTID_CONFIRM] = (nfsd4_enc)nfsd4_encode_noop,
+       [OP_VERIFY]             = (nfsd4_enc)nfsd4_encode_noop,
+       [OP_WRITE]              = (nfsd4_enc)nfsd4_encode_write,
+       [OP_RELEASE_LOCKOWNER]  = (nfsd4_enc)nfsd4_encode_noop,
+};
+
 void
 nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
 {
@@ -2601,101 +2626,12 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
        statp = p++;    /* to be backfilled at the end */
        ADJUST_ARGS();
 
-       switch (op->opnum) {
-       case OP_ACCESS:
-               nfsd4_encode_access(resp, op->status, &op->u.access);
-               break;
-       case OP_CLOSE:
-               nfsd4_encode_close(resp, op->status, &op->u.close);
-               break;
-       case OP_COMMIT:
-               nfsd4_encode_commit(resp, op->status, &op->u.commit);
-               break;
-       case OP_CREATE:
-               nfsd4_encode_create(resp, op->status, &op->u.create);
-               break;
-       case OP_DELEGRETURN:
-               break;
-       case OP_GETATTR:
-               op->status = nfsd4_encode_getattr(resp, op->status, &op->u.getattr);
-               break;
-       case OP_GETFH:
-               nfsd4_encode_getfh(resp, op->status, op->u.getfh);
-               break;
-       case OP_LINK:
-               nfsd4_encode_link(resp, op->status, &op->u.link);
-               break;
-       case OP_LOCK:
-               nfsd4_encode_lock(resp, op->status, &op->u.lock);
-               break;
-       case OP_LOCKT:
-               nfsd4_encode_lockt(resp, op->status, &op->u.lockt);
-               break;
-       case OP_LOCKU:
-               nfsd4_encode_locku(resp, op->status, &op->u.locku);
-               break;
-       case OP_LOOKUP:
-               break;
-       case OP_LOOKUPP:
-               break;
-       case OP_NVERIFY:
-               break;
-       case OP_OPEN:
-               nfsd4_encode_open(resp, op->status, &op->u.open);
-               break;
-       case OP_OPEN_CONFIRM:
-               nfsd4_encode_open_confirm(resp, op->status, &op->u.open_confirm);
-               break;
-       case OP_OPEN_DOWNGRADE:
-               nfsd4_encode_open_downgrade(resp, op->status, &op->u.open_downgrade);
-               break;
-       case OP_PUTFH:
-               break;
-       case OP_PUTROOTFH:
-               break;
-       case OP_READ:
-               op->status = nfsd4_encode_read(resp, op->status, &op->u.read);
-               break;
-       case OP_READDIR:
-               op->status = nfsd4_encode_readdir(resp, op->status, &op->u.readdir);
-               break;
-       case OP_READLINK:
-               op->status = nfsd4_encode_readlink(resp, op->status, &op->u.readlink);
-               break;
-       case OP_REMOVE:
-               nfsd4_encode_remove(resp, op->status, &op->u.remove);
-               break;
-       case OP_RENAME:
-               nfsd4_encode_rename(resp, op->status, &op->u.rename);
-               break;
-       case OP_RENEW:
-               break;
-       case OP_RESTOREFH:
-               break;
-       case OP_SAVEFH:
-               break;
-       case OP_SECINFO:
-               nfsd4_encode_secinfo(resp, op->status, &op->u.secinfo);
-               break;
-       case OP_SETATTR:
-               nfsd4_encode_setattr(resp, op->status, &op->u.setattr);
-               break;
-       case OP_SETCLIENTID:
-               nfsd4_encode_setclientid(resp, op->status, &op->u.setclientid);
-               break;
-       case OP_SETCLIENTID_CONFIRM:
-               break;
-       case OP_VERIFY:
-               break;
-       case OP_WRITE:
-               nfsd4_encode_write(resp, op->status, &op->u.write);
-               break;
-       case OP_RELEASE_LOCKOWNER:
-               break;
-       default:
-               break;
-       }
-
+       if (op->opnum == OP_ILLEGAL)
+               goto status;
+       BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) ||
+              !nfsd4_enc_ops[op->opnum]);
+       op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u);
+status:
        /*
         * Note: We write the status directly, instead of using WRITE32(),
         * since it is already in network byte order.
index 5ac00c4..1955a27 100644 (file)
@@ -310,9 +310,12 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
 
 static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size)
 {
-       __be32 server_ip;
-       char *fo_path, c;
+       struct sockaddr_in sin = {
+               .sin_family     = AF_INET,
+       };
        int b1, b2, b3, b4;
+       char c;
+       char *fo_path;
 
        /* sanity check */
        if (size == 0)
@@ -326,11 +329,13 @@ static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size)
                return -EINVAL;
 
        /* get ipv4 address */
-       if (sscanf(fo_path, "%u.%u.%u.%u%c", &b1, &b2, &b3, &b4, &c) != 4)
+       if (sscanf(fo_path, NIPQUAD_FMT "%c", &b1, &b2, &b3, &b4, &c) != 4)
                return -EINVAL;
-       server_ip = htonl((((((b1<<8)|b2)<<8)|b3)<<8)|b4);
+       if (b1 > 255 || b2 > 255 || b3 > 255 || b4 > 255)
+               return -EINVAL;
+       sin.sin_addr.s_addr = htonl((b1 << 24) | (b2 << 16) | (b3 << 8) | b4);
 
-       return nlmsvc_unlock_all_by_ip(server_ip);
+       return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin);
 }
 
 static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size)
@@ -450,22 +455,26 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
        int i;
        int rv;
        int len;
-       int npools = nfsd_nrpools();
+       int npools;
        int *nthreads;
 
+       mutex_lock(&nfsd_mutex);
+       npools = nfsd_nrpools();
        if (npools == 0) {
                /*
                 * NFS is shut down.  The admin can start it by
                 * writing to the threads file but NOT the pool_threads
                 * file, sorry.  Report zero threads.
                 */
+               mutex_unlock(&nfsd_mutex);
                strcpy(buf, "0\n");
                return strlen(buf);
        }
 
        nthreads = kcalloc(npools, sizeof(int), GFP_KERNEL);
+       rv = -ENOMEM;
        if (nthreads == NULL)
-               return -ENOMEM;
+               goto out_free;
 
        if (size > 0) {
                for (i = 0; i < npools; i++) {
@@ -496,14 +505,16 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
                mesg += len;
        }
 
+       mutex_unlock(&nfsd_mutex);
        return (mesg-buf);
 
 out_free:
        kfree(nthreads);
+       mutex_unlock(&nfsd_mutex);
        return rv;
 }
 
-static ssize_t write_versions(struct file *file, char *buf, size_t size)
+static ssize_t __write_versions(struct file *file, char *buf, size_t size)
 {
        /*
         * Format:
@@ -566,14 +577,23 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size)
        return len;
 }
 
-static ssize_t write_ports(struct file *file, char *buf, size_t size)
+static ssize_t write_versions(struct file *file, char *buf, size_t size)
+{
+       ssize_t rv;
+
+       mutex_lock(&nfsd_mutex);
+       rv = __write_versions(file, buf, size);
+       mutex_unlock(&nfsd_mutex);
+       return rv;
+}
+
+static ssize_t __write_ports(struct file *file, char *buf, size_t size)
 {
        if (size == 0) {
                int len = 0;
-               lock_kernel();
+
                if (nfsd_serv)
                        len = svc_xprt_names(nfsd_serv, buf, 0);
-               unlock_kernel();
                return len;
        }
        /* Either a single 'fd' number is written, in which
@@ -603,9 +623,7 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
                        /* Decrease the count, but don't shutdown the
                         * the service
                         */
-                       lock_kernel();
                        nfsd_serv->sv_nrthreads--;
-                       unlock_kernel();
                }
                return err < 0 ? err : 0;
        }
@@ -614,10 +632,8 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
                int len = 0;
                if (!toclose)
                        return -ENOMEM;
-               lock_kernel();
                if (nfsd_serv)
                        len = svc_sock_names(buf, nfsd_serv, toclose);
-               unlock_kernel();
                if (len >= 0)
                        lockd_down();
                kfree(toclose);
@@ -655,7 +671,6 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
                if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) {
                        if (port == 0)
                                return -EINVAL;
-                       lock_kernel();
                        if (nfsd_serv) {
                                xprt = svc_find_xprt(nfsd_serv, transport,
                                                     AF_UNSPEC, port);
@@ -666,13 +681,23 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
                                } else
                                        err = -ENOTCONN;
                        }
-                       unlock_kernel();
                        return err < 0 ? err : 0;
                }
        }
        return -EINVAL;
 }
 
+static ssize_t write_ports(struct file *file, char *buf, size_t size)
+{
+       ssize_t rv;
+
+       mutex_lock(&nfsd_mutex);
+       rv = __write_ports(file, buf, size);
+       mutex_unlock(&nfsd_mutex);
+       return rv;
+}
+
+
 int nfsd_max_blksize;
 
 static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
@@ -691,13 +716,13 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
                if (bsize > NFSSVC_MAXBLKSIZE)
                        bsize = NFSSVC_MAXBLKSIZE;
                bsize &= ~(1024-1);
-               lock_kernel();
+               mutex_lock(&nfsd_mutex);
                if (nfsd_serv && nfsd_serv->sv_nrthreads) {
-                       unlock_kernel();
+                       mutex_unlock(&nfsd_mutex);
                        return -EBUSY;
                }
                nfsd_max_blksize = bsize;
-               unlock_kernel();
+               mutex_unlock(&nfsd_mutex);
        }
        return sprintf(buf, "%d\n", nfsd_max_blksize);
 }
@@ -705,16 +730,17 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
 #ifdef CONFIG_NFSD_V4
 extern time_t nfs4_leasetime(void);
 
-static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
+static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
 {
        /* if size > 10 seconds, call
         * nfs4_reset_lease() then write out the new lease (seconds) as reply
         */
        char *mesg = buf;
-       int rv;
+       int rv, lease;
 
        if (size > 0) {
-               int lease;
+               if (nfsd_serv)
+                       return -EBUSY;
                rv = get_int(&mesg, &lease);
                if (rv)
                        return rv;
@@ -726,24 +752,52 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
        return strlen(buf);
 }
 
-static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
+static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
+{
+       ssize_t rv;
+
+       mutex_lock(&nfsd_mutex);
+       rv = __write_leasetime(file, buf, size);
+       mutex_unlock(&nfsd_mutex);
+       return rv;
+}
+
+extern char *nfs4_recoverydir(void);
+
+static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
 {
        char *mesg = buf;
        char *recdir;
        int len, status;
 
-       if (size == 0 || size > PATH_MAX || buf[size-1] != '\n')
-               return -EINVAL;
-       buf[size-1] = 0;
+       if (size > 0) {
+               if (nfsd_serv)
+                       return -EBUSY;
+               if (size > PATH_MAX || buf[size-1] != '\n')
+                       return -EINVAL;
+               buf[size-1] = 0;
 
-       recdir = mesg;
-       len = qword_get(&mesg, recdir, size);
-       if (len <= 0)
-               return -EINVAL;
+               recdir = mesg;
+               len = qword_get(&mesg, recdir, size);
+               if (len <= 0)
+                       return -EINVAL;
 
-       status = nfs4_reset_recoverydir(recdir);
+               status = nfs4_reset_recoverydir(recdir);
+       }
+       sprintf(buf, "%s\n", nfs4_recoverydir());
        return strlen(buf);
 }
+
+static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
+{
+       ssize_t rv;
+
+       mutex_lock(&nfsd_mutex);
+       rv = __write_recoverydir(file, buf, size);
+       mutex_unlock(&nfsd_mutex);
+       return rv;
+}
+
 #endif
 
 /*----------------------------------------------------------------------------*/
index 100ae56..f45451e 100644 (file)
@@ -176,9 +176,24 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
        if (IS_ERR(exp))
                return nfserrno(PTR_ERR(exp));
 
-       error = nfsd_setuser_and_check_port(rqstp, exp);
-       if (error)
-               goto out;
+       if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) {
+               /* Elevate privileges so that the lack of 'r' or 'x'
+                * permission on some parent directory will
+                * not stop exportfs_decode_fh from being able
+                * to reconnect a directory into the dentry cache.
+                * The same problem can affect "SUBTREECHECK" exports,
+                * but as nfsd_acceptable depends on correct
+                * access control settings being in effect, we cannot
+                * fix that case easily.
+                */
+               current->cap_effective =
+                       cap_raise_nfsd_set(current->cap_effective,
+                                          current->cap_permitted);
+       } else {
+               error = nfsd_setuser_and_check_port(rqstp, exp);
+               if (error)
+                       goto out;
+       }
 
        /*
         * Look up the dentry using the NFS file handle.
@@ -215,6 +230,14 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
                goto out;
        }
 
+       if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) {
+               error = nfsd_setuser_and_check_port(rqstp, exp);
+               if (error) {
+                       dput(dentry);
+                       goto out;
+               }
+       }
+
        if (S_ISDIR(dentry->d_inode->i_mode) &&
                        (dentry->d_flags & DCACHE_DISCONNECTED)) {
                printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %s/%s\n",
@@ -279,7 +302,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
        if (error)
                goto out;
 
-       if (!(access & MAY_LOCK)) {
+       if (!(access & NFSD_MAY_LOCK)) {
                /*
                 * pseudoflavor restrictions are not enforced on NLM,
                 * which clients virtually always use auth_sys for,
index 6cfc96a..0766f95 100644 (file)
@@ -65,7 +65,7 @@ nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle  *argp,
        dprintk("nfsd: GETATTR  %s\n", SVCFH_fmt(&argp->fh));
 
        fh_copy(&resp->fh, &argp->fh);
-       nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP);
+       nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
        return nfsd_return_attrs(nfserr, resp);
 }
 
@@ -215,11 +215,11 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
                SVCFH_fmt(dirfhp), argp->len, argp->name);
 
        /* First verify the parent file handle */
-       nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, MAY_EXEC);
+       nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_EXEC);
        if (nfserr)
                goto done; /* must fh_put dirfhp even on error */
 
-       /* Check for MAY_WRITE in nfsd_create if necessary */
+       /* Check for NFSD_MAY_WRITE in nfsd_create if necessary */
 
        nfserr = nfserr_acces;
        if (!argp->len)
@@ -281,7 +281,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
                                        nfserr = nfsd_permission(rqstp,
                                                                 newfhp->fh_export,
                                                                 newfhp->fh_dentry,
-                                                                MAY_WRITE|MAY_LOCAL_ACCESS);
+                                                                NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS);
                                        if (nfserr && nfserr != nfserr_rofs)
                                                goto out_unlock;
                                }
@@ -614,6 +614,7 @@ nfserrno (int errno)
 #endif
                { nfserr_stale, -ESTALE },
                { nfserr_jukebox, -ETIMEDOUT },
+               { nfserr_jukebox, -ERESTARTSYS },
                { nfserr_dropit, -EAGAIN },
                { nfserr_dropit, -ENOMEM },
                { nfserr_badname, -ESRCH },
index 941041f..80292ff 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/smp_lock.h>
 #include <linux/freezer.h>
 #include <linux/fs_struct.h>
+#include <linux/kthread.h>
 
 #include <linux/sunrpc/types.h>
 #include <linux/sunrpc/stats.h>
 
 #define NFSDDBG_FACILITY       NFSDDBG_SVC
 
-/* these signals will be delivered to an nfsd thread 
- * when handling a request
- */
-#define ALLOWED_SIGS   (sigmask(SIGKILL))
-/* these signals will be delivered to an nfsd thread
- * when not handling a request. i.e. when waiting
- */
-#define SHUTDOWN_SIGS  (sigmask(SIGKILL) | sigmask(SIGHUP) | sigmask(SIGINT) | sigmask(SIGQUIT))
-/* if the last thread dies with SIGHUP, then the exports table is
- * left unchanged ( like 2.4-{0-9} ).  Any other signal will clear
- * the exports table (like 2.2).
- */
-#define        SIG_NOCLEAN     SIGHUP
-
 extern struct svc_program      nfsd_program;
-static void                    nfsd(struct svc_rqst *rqstp);
+static int                     nfsd(void *vrqstp);
 struct timeval                 nfssvc_boot;
-       struct svc_serv                 *nfsd_serv;
 static atomic_t                        nfsd_busy;
 static unsigned long           nfsd_last_call;
 static DEFINE_SPINLOCK(nfsd_call_lock);
 
+/*
+ * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members
+ * of the svc_serv struct. In particular, ->sv_nrthreads but also to some
+ * extent ->sv_temp_socks and ->sv_permsocks. It also protects nfsdstats.th_cnt.
+ *
+ * If (outside the lock) nfsd_serv is non-NULL, then it must point to a
+ * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0. That number
+ * of nfsd threads must exist and each must be listed in ->sp_all_threads in
+ * each entry of ->sv_pools[].
+ *
+ * Transitions of the thread count between zero and non-zero are of particular
+ * interest since the svc_serv needs to be created and initialized at that
+ * point, or freed.
+ *
+ * Finally, the nfsd_mutex also protects some of the global variables that are
+ * accessed when nfsd starts and that are settable via the write_* routines in
+ * nfsctl.c. In particular:
+ *
+ *     user_recovery_dirname
+ *     user_lease_time
+ *     nfsd_versions
+ */
+DEFINE_MUTEX(nfsd_mutex);
+struct svc_serv                *nfsd_serv;
+
 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
 static struct svc_stat nfsd_acl_svcstats;
 static struct svc_version *    nfsd_acl_version[] = {
@@ -145,13 +156,14 @@ int nfsd_vers(int vers, enum vers_op change)
 
 int nfsd_nrthreads(void)
 {
-       if (nfsd_serv == NULL)
-               return 0;
-       else
-               return nfsd_serv->sv_nrthreads;
+       int rv = 0;
+       mutex_lock(&nfsd_mutex);
+       if (nfsd_serv)
+               rv = nfsd_serv->sv_nrthreads;
+       mutex_unlock(&nfsd_mutex);
+       return rv;
 }
 
-static int killsig;    /* signal that was used to kill last nfsd */
 static void nfsd_last_thread(struct svc_serv *serv)
 {
        /* When last nfsd thread exits we need to do some clean-up */
@@ -162,11 +174,9 @@ static void nfsd_last_thread(struct svc_serv *serv)
        nfsd_racache_shutdown();
        nfs4_state_shutdown();
 
-       printk(KERN_WARNING "nfsd: last server has exited\n");
-       if (killsig != SIG_NOCLEAN) {
-               printk(KERN_WARNING "nfsd: unexporting all filesystems\n");
-               nfsd_export_flush();
-       }
+       printk(KERN_WARNING "nfsd: last server has exited, flushing export "
+                           "cache\n");
+       nfsd_export_flush();
 }
 
 void nfsd_reset_versions(void)
@@ -190,13 +200,14 @@ void nfsd_reset_versions(void)
        }
 }
 
+
 int nfsd_create_serv(void)
 {
        int err = 0;
-       lock_kernel();
+
+       WARN_ON(!mutex_is_locked(&nfsd_mutex));
        if (nfsd_serv) {
                svc_get(nfsd_serv);
-               unlock_kernel();
                return 0;
        }
        if (nfsd_max_blksize == 0) {
@@ -217,13 +228,11 @@ int nfsd_create_serv(void)
        }
 
        atomic_set(&nfsd_busy, 0);
-       nfsd_serv = svc_create_pooled(&nfsd_program,
-                                     nfsd_max_blksize,
-                                     nfsd_last_thread,
-                                     nfsd, SIG_NOCLEAN, THIS_MODULE);
+       nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
+                                     nfsd_last_thread, nfsd, THIS_MODULE);
        if (nfsd_serv == NULL)
                err = -ENOMEM;
-       unlock_kernel();
+
        do_gettimeofday(&nfssvc_boot);          /* record boot time */
        return err;
 }
@@ -282,6 +291,8 @@ int nfsd_set_nrthreads(int n, int *nthreads)
        int tot = 0;
        int err = 0;
 
+       WARN_ON(!mutex_is_locked(&nfsd_mutex));
+
        if (nfsd_serv == NULL || n <= 0)
                return 0;
 
@@ -316,7 +327,6 @@ int nfsd_set_nrthreads(int n, int *nthreads)
                nthreads[0] = 1;
 
        /* apply the new numbers */
-       lock_kernel();
        svc_get(nfsd_serv);
        for (i = 0; i < n; i++) {
                err = svc_set_num_threads(nfsd_serv, &nfsd_serv->sv_pools[i],
@@ -325,7 +335,6 @@ int nfsd_set_nrthreads(int n, int *nthreads)
                        break;
        }
        svc_destroy(nfsd_serv);
-       unlock_kernel();
 
        return err;
 }
@@ -334,8 +343,8 @@ int
 nfsd_svc(unsigned short port, int nrservs)
 {
        int     error;
-       
-       lock_kernel();
+
+       mutex_lock(&nfsd_mutex);
        dprintk("nfsd: creating service\n");
        error = -EINVAL;
        if (nrservs <= 0)
@@ -363,7 +372,7 @@ nfsd_svc(unsigned short port, int nrservs)
  failure:
        svc_destroy(nfsd_serv);         /* Release server */
  out:
-       unlock_kernel();
+       mutex_unlock(&nfsd_mutex);
        return error;
 }
 
@@ -391,18 +400,17 @@ update_thread_usage(int busy_threads)
 /*
  * This is the NFS server kernel thread
  */
-static void
-nfsd(struct svc_rqst *rqstp)
+static int
+nfsd(void *vrqstp)
 {
+       struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
        struct fs_struct *fsp;
-       int             err;
-       sigset_t shutdown_mask, allowed_mask;
+       int err, preverr = 0;
 
        /* Lock module and set up kernel thread */
-       lock_kernel();
-       daemonize("nfsd");
+       mutex_lock(&nfsd_mutex);
 
-       /* After daemonize() this kernel thread shares current->fs
+       /* At this point, the thread shares current->fs
         * with the init process. We need to create files with a
         * umask of 0 instead of init's umask. */
        fsp = copy_fs_struct(current->fs);
@@ -414,14 +422,17 @@ nfsd(struct svc_rqst *rqstp)
        current->fs = fsp;
        current->fs->umask = 0;
 
-       siginitsetinv(&shutdown_mask, SHUTDOWN_SIGS);
-       siginitsetinv(&allowed_mask, ALLOWED_SIGS);
+       /*
+        * thread is spawned with all signals set to SIG_IGN, re-enable
+        * the ones that will bring down the thread
+        */
+       allow_signal(SIGKILL);
+       allow_signal(SIGHUP);
+       allow_signal(SIGINT);
+       allow_signal(SIGQUIT);
 
        nfsdstats.th_cnt++;
-
-       rqstp->rq_task = current;
-
-       unlock_kernel();
+       mutex_unlock(&nfsd_mutex);
 
        /*
         * We want less throttling in balance_dirty_pages() so that nfs to
@@ -435,26 +446,30 @@ nfsd(struct svc_rqst *rqstp)
         * The main request loop
         */
        for (;;) {
-               /* Block all but the shutdown signals */
-               sigprocmask(SIG_SETMASK, &shutdown_mask, NULL);
-
                /*
                 * Find a socket with data available and call its
                 * recvfrom routine.
                 */
                while ((err = svc_recv(rqstp, 60*60*HZ)) == -EAGAIN)
                        ;
-               if (err < 0)
+               if (err == -EINTR)
                        break;
+               else if (err < 0) {
+                       if (err != preverr) {
+                               printk(KERN_WARNING "%s: unexpected error "
+                                       "from svc_recv (%d)\n", __func__, -err);
+                               preverr = err;
+                       }
+                       schedule_timeout_uninterruptible(HZ);
+                       continue;
+               }
+
                update_thread_usage(atomic_read(&nfsd_busy));
                atomic_inc(&nfsd_busy);
 
                /* Lock the export hash tables for reading. */
                exp_readlock();
 
-               /* Process request with signals blocked.  */
-               sigprocmask(SIG_SETMASK, &allowed_mask, NULL);
-
                svc_process(rqstp);
 
                /* Unlock export hash tables */
@@ -463,22 +478,10 @@ nfsd(struct svc_rqst *rqstp)
                atomic_dec(&nfsd_busy);
        }
 
-       if (err != -EINTR) {
-               printk(KERN_WARNING "nfsd: terminating on error %d\n", -err);
-       } else {
-               unsigned int    signo;
-
-               for (signo = 1; signo <= _NSIG; signo++)
-                       if (sigismember(&current->pending.signal, signo) &&
-                           !sigismember(&current->blocked, signo))
-                               break;
-               killsig = signo;
-       }
        /* Clear signals before calling svc_exit_thread() */
        flush_signals(current);
 
-       lock_kernel();
-
+       mutex_lock(&nfsd_mutex);
        nfsdstats.th_cnt --;
 
 out:
@@ -486,8 +489,9 @@ out:
        svc_exit_thread(rqstp);
 
        /* Release module */
-       unlock_kernel();
+       mutex_unlock(&nfsd_mutex);
        module_put_and_exit(0);
+       return 0;
 }
 
 static __be32 map_new_errors(u32 vers, __be32 nfserr)
index a3a291f..0f4481e 100644 (file)
@@ -144,7 +144,7 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
        dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name);
 
        /* Obtain dentry and export. */
-       err = fh_verify(rqstp, fhp, S_IFDIR, MAY_EXEC);
+       err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
        if (err)
                return err;
 
@@ -262,14 +262,14 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 {
        struct dentry   *dentry;
        struct inode    *inode;
-       int             accmode = MAY_SATTR;
+       int             accmode = NFSD_MAY_SATTR;
        int             ftype = 0;
        __be32          err;
        int             host_err;
        int             size_change = 0;
 
        if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
-               accmode |= MAY_WRITE|MAY_OWNER_OVERRIDE;
+               accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
        if (iap->ia_valid & ATTR_SIZE)
                ftype = S_IFREG;
 
@@ -331,7 +331,8 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
         */
        if (iap->ia_valid & ATTR_SIZE) {
                if (iap->ia_size < inode->i_size) {
-                       err = nfsd_permission(rqstp, fhp->fh_export, dentry, MAY_TRUNC|MAY_OWNER_OVERRIDE);
+                       err = nfsd_permission(rqstp, fhp->fh_export, dentry,
+                                       NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE);
                        if (err)
                                goto out;
                }
@@ -462,7 +463,7 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
        unsigned int flags = 0;
 
        /* Get inode */
-       error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, MAY_SATTR);
+       error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR);
        if (error)
                return error;
 
@@ -563,20 +564,20 @@ struct accessmap {
        int             how;
 };
 static struct accessmap        nfs3_regaccess[] = {
-    {  NFS3_ACCESS_READ,       MAY_READ                        },
-    {  NFS3_ACCESS_EXECUTE,    MAY_EXEC                        },
-    {  NFS3_ACCESS_MODIFY,     MAY_WRITE|MAY_TRUNC             },
-    {  NFS3_ACCESS_EXTEND,     MAY_WRITE                       },
+    {  NFS3_ACCESS_READ,       NFSD_MAY_READ                   },
+    {  NFS3_ACCESS_EXECUTE,    NFSD_MAY_EXEC                   },
+    {  NFS3_ACCESS_MODIFY,     NFSD_MAY_WRITE|NFSD_MAY_TRUNC   },
+    {  NFS3_ACCESS_EXTEND,     NFSD_MAY_WRITE                  },
 
     {  0,                      0                               }
 };
 
 static struct accessmap        nfs3_diraccess[] = {
-    {  NFS3_ACCESS_READ,       MAY_READ                        },
-    {  NFS3_ACCESS_LOOKUP,     MAY_EXEC                        },
-    {  NFS3_ACCESS_MODIFY,     MAY_EXEC|MAY_WRITE|MAY_TRUNC    },
-    {  NFS3_ACCESS_EXTEND,     MAY_EXEC|MAY_WRITE              },
-    {  NFS3_ACCESS_DELETE,     MAY_REMOVE                      },
+    {  NFS3_ACCESS_READ,       NFSD_MAY_READ                   },
+    {  NFS3_ACCESS_LOOKUP,     NFSD_MAY_EXEC                   },
+    {  NFS3_ACCESS_MODIFY,     NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC},
+    {  NFS3_ACCESS_EXTEND,     NFSD_MAY_EXEC|NFSD_MAY_WRITE    },
+    {  NFS3_ACCESS_DELETE,     NFSD_MAY_REMOVE                 },
 
     {  0,                      0                               }
 };
@@ -589,10 +590,10 @@ static struct accessmap   nfs3_anyaccess[] = {
         * mainly at mode bits, and we make sure to ignore read-only
         * filesystem checks
         */
-    {  NFS3_ACCESS_READ,       MAY_READ                        },
-    {  NFS3_ACCESS_EXECUTE,    MAY_EXEC                        },
-    {  NFS3_ACCESS_MODIFY,     MAY_WRITE|MAY_LOCAL_ACCESS      },
-    {  NFS3_ACCESS_EXTEND,     MAY_WRITE|MAY_LOCAL_ACCESS      },
+    {  NFS3_ACCESS_READ,       NFSD_MAY_READ                   },
+    {  NFS3_ACCESS_EXECUTE,    NFSD_MAY_EXEC                   },
+    {  NFS3_ACCESS_MODIFY,     NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS    },
+    {  NFS3_ACCESS_EXTEND,     NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS    },
 
     {  0,                      0                               }
 };
@@ -606,7 +607,7 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor
        u32                     query, result = 0, sresult = 0;
        __be32                  error;
 
-       error = fh_verify(rqstp, fhp, 0, MAY_NOP);
+       error = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP);
        if (error)
                goto out;
 
@@ -678,7 +679,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
         * and (hopefully) checked permission - so allow OWNER_OVERRIDE
         * in case a chmod has now revoked permission.
         */
-       err = fh_verify(rqstp, fhp, type, access | MAY_OWNER_OVERRIDE);
+       err = fh_verify(rqstp, fhp, type, access | NFSD_MAY_OWNER_OVERRIDE);
        if (err)
                goto out;
 
@@ -689,7 +690,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
         * or any access when mandatory locking enabled
         */
        err = nfserr_perm;
-       if (IS_APPEND(inode) && (access & MAY_WRITE))
+       if (IS_APPEND(inode) && (access & NFSD_MAY_WRITE))
                goto out;
        /*
         * We must ignore files (but only files) which might have mandatory
@@ -706,14 +707,14 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
         * Check to see if there are any leases on this file.
         * This may block while leases are broken.
         */
-       host_err = break_lease(inode, O_NONBLOCK | ((access & MAY_WRITE) ? FMODE_WRITE : 0));
+       host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? FMODE_WRITE : 0));
        if (host_err == -EWOULDBLOCK)
                host_err = -ETIMEDOUT;
        if (host_err) /* NOMEM or WOULDBLOCK */
                goto out_nfserr;
 
-       if (access & MAY_WRITE) {
-               if (access & MAY_READ)
+       if (access & NFSD_MAY_WRITE) {
+               if (access & NFSD_MAY_READ)
                        flags = O_RDWR|O_LARGEFILE;
                else
                        flags = O_WRONLY|O_LARGEFILE;
@@ -1069,12 +1070,12 @@ nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 
        if (file) {
                err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
-                               MAY_READ|MAY_OWNER_OVERRIDE);
+                               NFSD_MAY_READ|NFSD_MAY_OWNER_OVERRIDE);
                if (err)
                        goto out;
                err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
        } else {
-               err = nfsd_open(rqstp, fhp, S_IFREG, MAY_READ, &file);
+               err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
                if (err)
                        goto out;
                err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
@@ -1098,13 +1099,13 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 
        if (file) {
                err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
-                               MAY_WRITE|MAY_OWNER_OVERRIDE);
+                               NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE);
                if (err)
                        goto out;
                err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt,
                                stablep);
        } else {
-               err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file);
+               err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
                if (err)
                        goto out;
 
@@ -1136,7 +1137,8 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
        if ((u64)count > ~(u64)offset)
                return nfserr_inval;
 
-       if ((err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file)) != 0)
+       err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
+       if (err)
                return err;
        if (EX_ISSYNC(fhp->fh_export)) {
                if (file->f_op && file->f_op->fsync) {
@@ -1197,7 +1199,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
        if (isdotent(fname, flen))
                goto out;
 
-       err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE);
+       err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
        if (err)
                goto out;
 
@@ -1248,36 +1250,34 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
                iap->ia_mode = 0;
        iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type;
 
+       err = nfserr_inval;
+       if (!S_ISREG(type) && !S_ISDIR(type) && !special_file(type)) {
+               printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n",
+                      type);
+               goto out;
+       }
+
+       host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+       if (host_err)
+               goto out_nfserr;
+
        /*
         * Get the dir op function pointer.
         */
        err = 0;
        switch (type) {
        case S_IFREG:
-               host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
-               if (host_err)
-                       goto out_nfserr;
                host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
                break;
        case S_IFDIR:
-               host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
-               if (host_err)
-                       goto out_nfserr;
                host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
                break;
        case S_IFCHR:
        case S_IFBLK:
        case S_IFIFO:
        case S_IFSOCK:
-               host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
-               if (host_err)
-                       goto out_nfserr;
                host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
                break;
-       default:
-               printk("nfsd: bad file type %o in nfsd_create\n", type);
-               host_err = -EINVAL;
-               goto out_nfserr;
        }
        if (host_err < 0) {
                mnt_drop_write(fhp->fh_export->ex_path.mnt);
@@ -1289,7 +1289,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
                write_inode_now(dchild->d_inode, 1);
        }
 
-
        err2 = nfsd_create_setattr(rqstp, resfhp, iap);
        if (err2)
                err = err2;
@@ -1334,7 +1333,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
                goto out;
        if (!(iap->ia_valid & ATTR_MODE))
                iap->ia_mode = 0;
-       err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE);
+       err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
        if (err)
                goto out;
 
@@ -1471,7 +1470,7 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
        __be32          err;
        int             host_err;
 
-       err = fh_verify(rqstp, fhp, S_IFLNK, MAY_NOP);
+       err = fh_verify(rqstp, fhp, S_IFLNK, NFSD_MAY_NOP);
        if (err)
                goto out;
 
@@ -1526,7 +1525,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
        if (isdotent(fname, flen))
                goto out;
 
-       err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE);
+       err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
        if (err)
                goto out;
        fh_lock(fhp);
@@ -1591,10 +1590,10 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
        __be32          err;
        int             host_err;
 
-       err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_CREATE);
+       err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE);
        if (err)
                goto out;
-       err = fh_verify(rqstp, tfhp, -S_IFDIR, MAY_NOP);
+       err = fh_verify(rqstp, tfhp, -S_IFDIR, NFSD_MAY_NOP);
        if (err)
                goto out;
 
@@ -1661,10 +1660,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
        __be32          err;
        int             host_err;
 
-       err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_REMOVE);
+       err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE);
        if (err)
                goto out;
-       err = fh_verify(rqstp, tfhp, S_IFDIR, MAY_CREATE);
+       err = fh_verify(rqstp, tfhp, S_IFDIR, NFSD_MAY_CREATE);
        if (err)
                goto out;
 
@@ -1768,7 +1767,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
        err = nfserr_acces;
        if (!flen || isdotent(fname, flen))
                goto out;
-       err = fh_verify(rqstp, fhp, S_IFDIR, MAY_REMOVE);
+       err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_REMOVE);
        if (err)
                goto out;
 
@@ -1834,7 +1833,7 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
        struct file     *file;
        loff_t          offset = *offsetp;
 
-       err = nfsd_open(rqstp, fhp, S_IFDIR, MAY_READ, &file);
+       err = nfsd_open(rqstp, fhp, S_IFDIR, NFSD_MAY_READ, &file);
        if (err)
                goto out;
 
@@ -1875,7 +1874,7 @@ out:
 __be32
 nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
 {
-       __be32 err = fh_verify(rqstp, fhp, 0, MAY_NOP);
+       __be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP);
        if (!err && vfs_statfs(fhp->fh_dentry,stat))
                err = nfserr_io;
        return err;
@@ -1896,18 +1895,18 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
        struct inode    *inode = dentry->d_inode;
        int             err;
 
-       if (acc == MAY_NOP)
+       if (acc == NFSD_MAY_NOP)
                return 0;
 #if 0
        dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n",
                acc,
-               (acc & MAY_READ)?       " read"  : "",
-               (acc & MAY_WRITE)?      " write" : "",
-               (acc & MAY_EXEC)?       " exec"  : "",
-               (acc & MAY_SATTR)?      " sattr" : "",
-               (acc & MAY_TRUNC)?      " trunc" : "",
-               (acc & MAY_LOCK)?       " lock"  : "",
-               (acc & MAY_OWNER_OVERRIDE)? " owneroverride" : "",
+               (acc & NFSD_MAY_READ)?  " read"  : "",
+               (acc & NFSD_MAY_WRITE)? " write" : "",
+               (acc & NFSD_MAY_EXEC)?  " exec"  : "",
+               (acc & NFSD_MAY_SATTR)? " sattr" : "",
+               (acc & NFSD_MAY_TRUNC)? " trunc" : "",
+               (acc & NFSD_MAY_LOCK)?  " lock"  : "",
+               (acc & NFSD_MAY_OWNER_OVERRIDE)? " owneroverride" : "",
                inode->i_mode,
                IS_IMMUTABLE(inode)?    " immut" : "",
                IS_APPEND(inode)?       " append" : "",
@@ -1920,18 +1919,18 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
         * system.  But if it is IRIX doing check on write-access for a 
         * device special file, we ignore rofs.
         */
-       if (!(acc & MAY_LOCAL_ACCESS))
-               if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) {
+       if (!(acc & NFSD_MAY_LOCAL_ACCESS))
+               if (acc & (NFSD_MAY_WRITE | NFSD_MAY_SATTR | NFSD_MAY_TRUNC)) {
                        if (exp_rdonly(rqstp, exp) ||
                            __mnt_is_readonly(exp->ex_path.mnt))
                                return nfserr_rofs;
-                       if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode))
+                       if (/* (acc & NFSD_MAY_WRITE) && */ IS_IMMUTABLE(inode))
                                return nfserr_perm;
                }
-       if ((acc & MAY_TRUNC) && IS_APPEND(inode))
+       if ((acc & NFSD_MAY_TRUNC) && IS_APPEND(inode))
                return nfserr_perm;
 
-       if (acc & MAY_LOCK) {
+       if (acc & NFSD_MAY_LOCK) {
                /* If we cannot rely on authentication in NLM requests,
                 * just allow locks, otherwise require read permission, or
                 * ownership
@@ -1939,7 +1938,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
                if (exp->ex_flags & NFSEXP_NOAUTHNLM)
                        return 0;
                else
-                       acc = MAY_READ | MAY_OWNER_OVERRIDE;
+                       acc = NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE;
        }
        /*
         * The file owner always gets access permission for accesses that
@@ -1955,15 +1954,16 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
         * We must trust the client to do permission checking - using "ACCESS"
         * with NFSv3.
         */
-       if ((acc & MAY_OWNER_OVERRIDE) &&
+       if ((acc & NFSD_MAY_OWNER_OVERRIDE) &&
            inode->i_uid == current->fsuid)
                return 0;
 
+       /* This assumes  NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */
        err = permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC), NULL);
 
        /* Allow read access to binaries even when mode 111 */
        if (err == -EACCES && S_ISREG(inode->i_mode) &&
-           acc == (MAY_READ | MAY_OWNER_OVERRIDE))
+           acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE))
                err = permission(inode, MAY_EXEC, NULL);
 
        return err? nfserrno(err) : 0;
index c6455da..9c2ac5c 100644 (file)
@@ -918,12 +918,12 @@ struct file_lock {
        struct list_head fl_link;       /* doubly linked list of all locks */
        struct list_head fl_block;      /* circular list of blocked processes */
        fl_owner_t fl_owner;
+       unsigned char fl_flags;
+       unsigned char fl_type;
        unsigned int fl_pid;
        struct pid *fl_nspid;
        wait_queue_head_t fl_wait;
        struct file *fl_file;
-       unsigned char fl_flags;
-       unsigned char fl_type;
        loff_t fl_start;
        loff_t fl_end;
 
index 102d928..dbb87ab 100644 (file)
@@ -200,10 +200,12 @@ typedef int         (*nlm_host_match_fn_t)(void *cur, struct nlm_host *ref);
  * Server-side lock handling
  */
 __be32           nlmsvc_lock(struct svc_rqst *, struct nlm_file *,
-                                       struct nlm_lock *, int, struct nlm_cookie *);
+                             struct nlm_host *, struct nlm_lock *, int,
+                             struct nlm_cookie *);
 __be32           nlmsvc_unlock(struct nlm_file *, struct nlm_lock *);
 __be32           nlmsvc_testlock(struct svc_rqst *, struct nlm_file *,
-                       struct nlm_lock *, struct nlm_lock *, struct nlm_cookie *);
+                       struct nlm_host *, struct nlm_lock *,
+                       struct nlm_lock *, struct nlm_cookie *);
 __be32           nlmsvc_cancel_blocked(struct nlm_file *, struct nlm_lock *);
 unsigned long    nlmsvc_retry_blocked(void);
 void             nlmsvc_traverse_blocks(struct nlm_host *, struct nlm_file *,
@@ -224,7 +226,7 @@ void                  nlmsvc_invalidate_all(void);
  * Cluster failover support
  */
 int           nlmsvc_unlock_all_by_sb(struct super_block *sb);
-int           nlmsvc_unlock_all_by_ip(__be32 server_addr);
+int           nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr);
 
 static inline struct inode *nlmsvc_file_inode(struct nlm_file *file)
 {
index 8726491..ea03667 100644 (file)
@@ -65,9 +65,6 @@
 #define NFS4_ACE_SUCCESSFUL_ACCESS_ACE_FLAG   0x00000010
 #define NFS4_ACE_FAILED_ACCESS_ACE_FLAG       0x00000020
 #define NFS4_ACE_IDENTIFIER_GROUP             0x00000040
-#define NFS4_ACE_OWNER                        0x00000080
-#define NFS4_ACE_GROUP                        0x00000100
-#define NFS4_ACE_EVERYONE                     0x00000200
 
 #define NFS4_ACE_READ_DATA                    0x00000001
 #define NFS4_ACE_LIST_DIRECTORY               0x00000001
index 41d30c9..a2861d9 100644 (file)
 #define NFSD_SUPPORTED_MINOR_VERSION   0
 
 /*
- * Special flags for nfsd_permission. These must be different from MAY_READ,
- * MAY_WRITE, and MAY_EXEC.
+ * Flags for nfsd_permission
  */
-#define MAY_NOP                        0
-#define MAY_SATTR              8
-#define MAY_TRUNC              16
-#define MAY_LOCK               32
-#define MAY_OWNER_OVERRIDE     64
-#define        MAY_LOCAL_ACCESS        128 /* IRIX doing local access check on device special file*/
-#if (MAY_SATTR | MAY_TRUNC | MAY_LOCK | MAY_OWNER_OVERRIDE | MAY_LOCAL_ACCESS) & (MAY_READ | MAY_WRITE | MAY_EXEC)
-# error "please use a different value for MAY_SATTR or MAY_TRUNC or MAY_LOCK or MAY_LOCAL_ACCESS or MAY_OWNER_OVERRIDE."
-#endif
-#define MAY_CREATE             (MAY_EXEC|MAY_WRITE)
-#define MAY_REMOVE             (MAY_EXEC|MAY_WRITE|MAY_TRUNC)
+#define NFSD_MAY_NOP           0
+#define NFSD_MAY_EXEC          1 /* == MAY_EXEC */
+#define NFSD_MAY_WRITE         2 /* == MAY_WRITE */
+#define NFSD_MAY_READ          4 /* == MAY_READ */
+#define NFSD_MAY_SATTR         8
+#define NFSD_MAY_TRUNC         16
+#define NFSD_MAY_LOCK          32
+#define NFSD_MAY_OWNER_OVERRIDE        64
+#define NFSD_MAY_LOCAL_ACCESS  128 /* IRIX doing local access check on device special file*/
+
+#define NFSD_MAY_CREATE                (NFSD_MAY_EXEC|NFSD_MAY_WRITE)
+#define NFSD_MAY_REMOVE                (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
 
 /*
  * Callback function for readdir
@@ -54,6 +54,7 @@ typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int);
 extern struct svc_program      nfsd_program;
 extern struct svc_version      nfsd_version2, nfsd_version3,
                                nfsd_version4;
+extern struct mutex            nfsd_mutex;
 extern struct svc_serv         *nfsd_serv;
 
 extern struct seq_operations nfs_exports_op;
index db348f7..d0fe2e3 100644 (file)
@@ -98,8 +98,6 @@ struct nfs4_callback {
        u32                     cb_ident;
        /* RPC client info */
        atomic_t                cb_set;     /* successful CB_NULL call */
-       struct rpc_program      cb_program;
-       struct rpc_stat         cb_stat;
        struct rpc_clnt *       cb_client;
 };
 
index a10f1fb..e7bbdba 100644 (file)
@@ -51,6 +51,9 @@ struct krb5_ctx {
 
 extern spinlock_t krb5_seq_lock;
 
+/* The length of the Kerberos GSS token header */
+#define GSS_KRB5_TOK_HDR_LEN   (16)
+
 #define KG_TOK_MIC_MSG    0x0101
 #define KG_TOK_WRAP_MSG   0x0201
 
index 4b54c5f..dc69068 100644 (file)
@@ -22,7 +22,7 @@
 /*
  * This is the RPC server thread function prototype
  */
-typedef void           (*svc_thread_fn)(struct svc_rqst *);
+typedef int            (*svc_thread_fn)(void *);
 
 /*
  *
@@ -80,7 +80,6 @@ struct svc_serv {
        struct module *         sv_module;      /* optional module to count when
                                                 * adding threads */
        svc_thread_fn           sv_function;    /* main function for threads */
-       int                     sv_kill_signal; /* signal to kill threads */
 };
 
 /*
@@ -388,8 +387,8 @@ struct svc_rqst *svc_prepare_thread(struct svc_serv *serv,
                                        struct svc_pool *pool);
 void              svc_exit_thread(struct svc_rqst *);
 struct svc_serv *  svc_create_pooled(struct svc_program *, unsigned int,
-                       void (*shutdown)(struct svc_serv*),
-                       svc_thread_fn, int sig, struct module *);
+                       void (*shutdown)(struct svc_serv*), svc_thread_fn,
+                       struct module *);
 int               svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
 void              svc_destroy(struct svc_serv *);
 int               svc_process(struct svc_rqst *);
index 05eb466..ef2e3a2 100644 (file)
@@ -72,7 +72,7 @@ extern atomic_t rdma_stat_sq_prod;
  */
 struct svc_rdma_op_ctxt {
        struct svc_rdma_op_ctxt *read_hdr;
-       struct list_head free_list;
+       int hdr_count;
        struct xdr_buf arg;
        struct list_head dto_q;
        enum ib_wr_opcode wr_op;
@@ -86,6 +86,31 @@ struct svc_rdma_op_ctxt {
        struct page *pages[RPCSVC_MAXPAGES];
 };
 
+/*
+ * NFS requests are mapped on the client side by the chunk lists in
+ * the RPCRDMA header. During the fetching of the RPC from the client
+ * and the writing of the reply to the client, the memory in the
+ * client and the memory in the server must be mapped as contiguous
+ * vaddr/len for access by the hardware. These data structures keep
+ * these mappings.
+ *
+ * For an RDMA_WRITE, the 'sge' maps the RPC REPLY. For RDMA_READ, the
+ * 'sge' in the svc_rdma_req_map maps the server side RPC reply and the
+ * 'ch' field maps the read-list of the RPCRDMA header to the 'sge'
+ * mapping of the reply.
+ */
+struct svc_rdma_chunk_sge {
+       int start;              /* sge no for this chunk */
+       int count;              /* sge count for this chunk */
+};
+struct svc_rdma_req_map {
+       unsigned long count;
+       union {
+               struct kvec sge[RPCSVC_MAXPAGES];
+               struct svc_rdma_chunk_sge ch[RPCSVC_MAXPAGES];
+       };
+};
+
 #define RDMACTXT_F_LAST_CTXT   2
 
 struct svcxprt_rdma {
@@ -93,7 +118,6 @@ struct svcxprt_rdma {
        struct rdma_cm_id    *sc_cm_id;         /* RDMA connection id */
        struct list_head     sc_accept_q;       /* Conn. waiting accept */
        int                  sc_ord;            /* RDMA read limit */
-       wait_queue_head_t    sc_read_wait;
        int                  sc_max_sge;
 
        int                  sc_sq_depth;       /* Depth of SQ */
@@ -104,12 +128,8 @@ struct svcxprt_rdma {
 
        struct ib_pd         *sc_pd;
 
+       atomic_t             sc_dma_used;
        atomic_t             sc_ctxt_used;
-       struct list_head     sc_ctxt_free;
-       int                  sc_ctxt_cnt;
-       int                  sc_ctxt_bump;
-       int                  sc_ctxt_max;
-       spinlock_t           sc_ctxt_lock;
        struct list_head     sc_rq_dto_q;
        spinlock_t           sc_rq_dto_lock;
        struct ib_qp         *sc_qp;
@@ -173,6 +193,8 @@ extern int svc_rdma_post_recv(struct svcxprt_rdma *);
 extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *);
 extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *);
 extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int);
+extern struct svc_rdma_req_map *svc_rdma_get_req_map(void);
+extern void svc_rdma_put_req_map(struct svc_rdma_req_map *);
 extern void svc_sq_reap(struct svcxprt_rdma *);
 extern void svc_rq_reap(struct svcxprt_rdma *);
 extern struct svc_xprt_class svc_rdma_class;
index f3431a7..4de8bcf 100644 (file)
@@ -5,12 +5,12 @@
 obj-$(CONFIG_SUNRPC_GSS) += auth_rpcgss.o
 
 auth_rpcgss-objs := auth_gss.o gss_generic_token.o \
-       gss_mech_switch.o svcauth_gss.o gss_krb5_crypto.o
+       gss_mech_switch.o svcauth_gss.o
 
 obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o
 
 rpcsec_gss_krb5-objs := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \
-       gss_krb5_seqnum.o gss_krb5_wrap.o
+       gss_krb5_seqnum.o gss_krb5_wrap.o gss_krb5_crypto.o
 
 obj-$(CONFIG_RPCSEC_GSS_SPKM3) += rpcsec_gss_spkm3.o
 
index 1d52308..c93fca2 100644 (file)
@@ -83,8 +83,6 @@ out:
        return ret;
 }
 
-EXPORT_SYMBOL(krb5_encrypt);
-
 u32
 krb5_decrypt(
      struct crypto_blkcipher *tfm,
@@ -118,8 +116,6 @@ out:
        return ret;
 }
 
-EXPORT_SYMBOL(krb5_decrypt);
-
 static int
 checksummer(struct scatterlist *sg, void *data)
 {
@@ -161,8 +157,6 @@ out:
        return err ? GSS_S_FAILURE : 0;
 }
 
-EXPORT_SYMBOL(make_checksum);
-
 struct encryptor_desc {
        u8 iv[8]; /* XXX hard-coded blocksize */
        struct blkcipher_desc desc;
@@ -262,8 +256,6 @@ gss_encrypt_xdr_buf(struct crypto_blkcipher *tfm, struct xdr_buf *buf,
        return ret;
 }
 
-EXPORT_SYMBOL(gss_encrypt_xdr_buf);
-
 struct decryptor_desc {
        u8 iv[8]; /* XXX hard-coded blocksize */
        struct blkcipher_desc desc;
@@ -334,5 +326,3 @@ gss_decrypt_xdr_buf(struct crypto_blkcipher *tfm, struct xdr_buf *buf,
 
        return xdr_process_buf(buf, offset, buf->len - offset, decryptor, &desc);
 }
-
-EXPORT_SYMBOL(gss_decrypt_xdr_buf);
index 5f1d36d..b8f42ef 100644 (file)
@@ -78,7 +78,7 @@ gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text,
        struct krb5_ctx         *ctx = gss_ctx->internal_ctx_id;
        char                    cksumdata[16];
        struct xdr_netobj       md5cksum = {.len = 0, .data = cksumdata};
-       unsigned char           *ptr, *krb5_hdr, *msg_start;
+       unsigned char           *ptr, *msg_start;
        s32                     now;
        u32                     seq_send;
 
@@ -87,36 +87,36 @@ gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text,
 
        now = get_seconds();
 
-       token->len = g_token_size(&ctx->mech_used, 24);
+       token->len = g_token_size(&ctx->mech_used, GSS_KRB5_TOK_HDR_LEN + 8);
 
        ptr = token->data;
-       g_make_token_header(&ctx->mech_used, 24, &ptr);
+       g_make_token_header(&ctx->mech_used, GSS_KRB5_TOK_HDR_LEN + 8, &ptr);
 
-       *ptr++ = (unsigned char) ((KG_TOK_MIC_MSG>>8)&0xff);
-       *ptr++ = (unsigned char) (KG_TOK_MIC_MSG&0xff);
+       /* ptr now at header described in rfc 1964, section 1.2.1: */
+       ptr[0] = (unsigned char) ((KG_TOK_MIC_MSG >> 8) & 0xff);
+       ptr[1] = (unsigned char) (KG_TOK_MIC_MSG & 0xff);
 
-       /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */
-       krb5_hdr = ptr - 2;
-       msg_start = krb5_hdr + 24;
+       msg_start = ptr + GSS_KRB5_TOK_HDR_LEN + 8;
 
-       *(__be16 *)(krb5_hdr + 2) = htons(SGN_ALG_DES_MAC_MD5);
-       memset(krb5_hdr + 4, 0xff, 4);
+       *(__be16 *)(ptr + 2) = htons(SGN_ALG_DES_MAC_MD5);
+       memset(ptr + 4, 0xff, 4);
 
-       if (make_checksum("md5", krb5_hdr, 8, text, 0, &md5cksum))
+       if (make_checksum("md5", ptr, 8, text, 0, &md5cksum))
                return GSS_S_FAILURE;
 
        if (krb5_encrypt(ctx->seq, NULL, md5cksum.data,
                          md5cksum.data, md5cksum.len))
                return GSS_S_FAILURE;
 
-       memcpy(krb5_hdr + 16, md5cksum.data + md5cksum.len - 8, 8);
+       memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data + md5cksum.len - 8, 8);
 
        spin_lock(&krb5_seq_lock);
        seq_send = ctx->seq_send++;
        spin_unlock(&krb5_seq_lock);
 
        if (krb5_make_seq_num(ctx->seq, ctx->initiate ? 0 : 0xff,
-                             seq_send, krb5_hdr + 16, krb5_hdr + 8))
+                             seq_send, ptr + GSS_KRB5_TOK_HDR_LEN,
+                             ptr + 8))
                return GSS_S_FAILURE;
 
        return (ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE;
index d91a5d0..066ec73 100644 (file)
@@ -92,30 +92,30 @@ gss_verify_mic_kerberos(struct gss_ctx *gss_ctx,
                                        read_token->len))
                return GSS_S_DEFECTIVE_TOKEN;
 
-       if ((*ptr++ != ((KG_TOK_MIC_MSG>>8)&0xff)) ||
-           (*ptr++ != ( KG_TOK_MIC_MSG    &0xff))   )
+       if ((ptr[0] != ((KG_TOK_MIC_MSG >> 8) & 0xff)) ||
+           (ptr[1] !=  (KG_TOK_MIC_MSG & 0xff)))
                return GSS_S_DEFECTIVE_TOKEN;
 
        /* XXX sanity-check bodysize?? */
 
-       signalg = ptr[0] + (ptr[1] << 8);
+       signalg = ptr[2] + (ptr[3] << 8);
        if (signalg != SGN_ALG_DES_MAC_MD5)
                return GSS_S_DEFECTIVE_TOKEN;
 
-       sealalg = ptr[2] + (ptr[3] << 8);
+       sealalg = ptr[4] + (ptr[5] << 8);
        if (sealalg != SEAL_ALG_NONE)
                return GSS_S_DEFECTIVE_TOKEN;
 
-       if ((ptr[4] != 0xff) || (ptr[5] != 0xff))
+       if ((ptr[6] != 0xff) || (ptr[7] != 0xff))
                return GSS_S_DEFECTIVE_TOKEN;
 
-       if (make_checksum("md5", ptr - 2, 8, message_buffer, 0, &md5cksum))
+       if (make_checksum("md5", ptr, 8, message_buffer, 0, &md5cksum))
                return GSS_S_FAILURE;
 
        if (krb5_encrypt(ctx->seq, NULL, md5cksum.data, md5cksum.data, 16))
                return GSS_S_FAILURE;
 
-       if (memcmp(md5cksum.data + 8, ptr + 14, 8))
+       if (memcmp(md5cksum.data + 8, ptr + GSS_KRB5_TOK_HDR_LEN, 8))
                return GSS_S_BAD_SIG;
 
        /* it got through unscathed.  Make sure the context is unexpired */
@@ -127,7 +127,7 @@ gss_verify_mic_kerberos(struct gss_ctx *gss_ctx,
 
        /* do sequencing checks */
 
-       if (krb5_get_seq_num(ctx->seq, ptr + 14, ptr + 6, &direction, &seqnum))
+       if (krb5_get_seq_num(ctx->seq, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8, &direction, &seqnum))
                return GSS_S_FAILURE;
 
        if ((ctx->initiate && direction != 0xff) ||
index b00b1b4..ae8e69b 100644 (file)
@@ -87,8 +87,8 @@ out:
        return 0;
 }
 
-static inline void
-make_confounder(char *p, int blocksize)
+static void
+make_confounder(char *p, u32 conflen)
 {
        static u64 i = 0;
        u64 *q = (u64 *)p;
@@ -102,8 +102,22 @@ make_confounder(char *p, int blocksize)
         * uniqueness would mean worrying about atomicity and rollover, and I
         * don't care enough. */
 
-       BUG_ON(blocksize != 8);
-       *q = i++;
+       /* initialize to random value */
+       if (i == 0) {
+               i = random32();
+               i = (i << 32) | random32();
+       }
+
+       switch (conflen) {
+       case 16:
+               *q++ = i++;
+               /* fall through */
+       case 8:
+               *q++ = i++;
+               break;
+       default:
+               BUG();
+       }
 }
 
 /* Assumptions: the head and tail of inbuf are ours to play with.
@@ -122,7 +136,7 @@ gss_wrap_kerberos(struct gss_ctx *ctx, int offset,
        char                    cksumdata[16];
        struct xdr_netobj       md5cksum = {.len = 0, .data = cksumdata};
        int                     blocksize = 0, plainlen;
-       unsigned char           *ptr, *krb5_hdr, *msg_start;
+       unsigned char           *ptr, *msg_start;
        s32                     now;
        int                     headlen;
        struct page             **tmp_pages;
@@ -149,26 +163,26 @@ gss_wrap_kerberos(struct gss_ctx *ctx, int offset,
        buf->len += headlen;
        BUG_ON((buf->len - offset - headlen) % blocksize);
 
-       g_make_token_header(&kctx->mech_used, 24 + plainlen, &ptr);
+       g_make_token_header(&kctx->mech_used,
+                               GSS_KRB5_TOK_HDR_LEN + 8 + plainlen, &ptr);
 
 
-       *ptr++ = (unsigned char) ((KG_TOK_WRAP_MSG>>8)&0xff);
-       *ptr++ = (unsigned char) (KG_TOK_WRAP_MSG&0xff);
+       /* ptr now at header described in rfc 1964, section 1.2.1: */
+       ptr[0] = (unsigned char) ((KG_TOK_WRAP_MSG >> 8) & 0xff);
+       ptr[1] = (unsigned char) (KG_TOK_WRAP_MSG & 0xff);
 
-       /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */
-       krb5_hdr = ptr - 2;
-       msg_start = krb5_hdr + 24;
+       msg_start = ptr + 24;
 
-       *(__be16 *)(krb5_hdr + 2) = htons(SGN_ALG_DES_MAC_MD5);
-       memset(krb5_hdr + 4, 0xff, 4);
-       *(__be16 *)(krb5_hdr + 4) = htons(SEAL_ALG_DES);
+       *(__be16 *)(ptr + 2) = htons(SGN_ALG_DES_MAC_MD5);
+       memset(ptr + 4, 0xff, 4);
+       *(__be16 *)(ptr + 4) = htons(SEAL_ALG_DES);
 
        make_confounder(msg_start, blocksize);
 
        /* XXXJBF: UGH!: */
        tmp_pages = buf->pages;
        buf->pages = pages;
-       if (make_checksum("md5", krb5_hdr, 8, buf,
+       if (make_checksum("md5", ptr, 8, buf,
                                offset + headlen - blocksize, &md5cksum))
                return GSS_S_FAILURE;
        buf->pages = tmp_pages;
@@ -176,7 +190,7 @@ gss_wrap_kerberos(struct gss_ctx *ctx, int offset,
        if (krb5_encrypt(kctx->seq, NULL, md5cksum.data,
                          md5cksum.data, md5cksum.len))
                return GSS_S_FAILURE;
-       memcpy(krb5_hdr + 16, md5cksum.data + md5cksum.len - 8, 8);
+       memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data + md5cksum.len - 8, 8);
 
        spin_lock(&krb5_seq_lock);
        seq_send = kctx->seq_send++;
@@ -185,7 +199,7 @@ gss_wrap_kerberos(struct gss_ctx *ctx, int offset,
        /* XXX would probably be more efficient to compute checksum
         * and encrypt at the same time: */
        if ((krb5_make_seq_num(kctx->seq, kctx->initiate ? 0 : 0xff,
-                              seq_send, krb5_hdr + 16, krb5_hdr + 8)))
+                              seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8)))
                return GSS_S_FAILURE;
 
        if (gss_encrypt_xdr_buf(kctx->enc, buf, offset + headlen - blocksize,
@@ -219,38 +233,38 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, int offset, struct xdr_buf *buf)
                                        buf->len - offset))
                return GSS_S_DEFECTIVE_TOKEN;
 
-       if ((*ptr++ != ((KG_TOK_WRAP_MSG>>8)&0xff)) ||
-           (*ptr++ !=  (KG_TOK_WRAP_MSG    &0xff))   )
+       if ((ptr[0] != ((KG_TOK_WRAP_MSG >> 8) & 0xff)) ||
+           (ptr[1] !=  (KG_TOK_WRAP_MSG & 0xff)))
                return GSS_S_DEFECTIVE_TOKEN;
 
        /* XXX sanity-check bodysize?? */
 
        /* get the sign and seal algorithms */
 
-       signalg = ptr[0] + (ptr[1] << 8);
+       signalg = ptr[2] + (ptr[3] << 8);
        if (signalg != SGN_ALG_DES_MAC_MD5)
                return GSS_S_DEFECTIVE_TOKEN;
 
-       sealalg = ptr[2] + (ptr[3] << 8);
+       sealalg = ptr[4] + (ptr[5] << 8);
        if (sealalg != SEAL_ALG_DES)
                return GSS_S_DEFECTIVE_TOKEN;
 
-       if ((ptr[4] != 0xff) || (ptr[5] != 0xff))
+       if ((ptr[6] != 0xff) || (ptr[7] != 0xff))
                return GSS_S_DEFECTIVE_TOKEN;
 
        if (gss_decrypt_xdr_buf(kctx->enc, buf,
-                       ptr + 22 - (unsigned char *)buf->head[0].iov_base))
+                       ptr + GSS_KRB5_TOK_HDR_LEN + 8 - (unsigned char *)buf->head[0].iov_base))
                return GSS_S_DEFECTIVE_TOKEN;
 
-       if (make_checksum("md5", ptr - 2, 8, buf,
-                ptr + 22 - (unsigned char *)buf->head[0].iov_base, &md5cksum))
+       if (make_checksum("md5", ptr, 8, buf,
+                ptr + GSS_KRB5_TOK_HDR_LEN + 8 - (unsigned char *)buf->head[0].iov_base, &md5cksum))
                return GSS_S_FAILURE;
 
        if (krb5_encrypt(kctx->seq, NULL, md5cksum.data,
                           md5cksum.data, md5cksum.len))
                return GSS_S_FAILURE;
 
-       if (memcmp(md5cksum.data + 8, ptr + 14, 8))
+       if (memcmp(md5cksum.data + 8, ptr + GSS_KRB5_TOK_HDR_LEN, 8))
                return GSS_S_BAD_SIG;
 
        /* it got through unscathed.  Make sure the context is unexpired */
@@ -262,8 +276,8 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, int offset, struct xdr_buf *buf)
 
        /* do sequencing checks */
 
-       if (krb5_get_seq_num(kctx->seq, ptr + 14, ptr + 6, &direction,
-                                   &seqnum))
+       if (krb5_get_seq_num(kctx->seq, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8,
+                                   &direction, &seqnum))
                return GSS_S_BAD_SIG;
 
        if ((kctx->initiate && direction != 0xff) ||
@@ -274,7 +288,7 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, int offset, struct xdr_buf *buf)
         * better to copy and encrypt at the same time. */
 
        blocksize = crypto_blkcipher_blocksize(kctx->enc);
-       data_start = ptr + 22 + blocksize;
+       data_start = ptr + GSS_KRB5_TOK_HDR_LEN + 8 + blocksize;
        orig_start = buf->head[0].iov_base + offset;
        data_len = (buf->head[0].iov_base + buf->head[0].iov_len) - data_start;
        memmove(orig_start, data_start, data_len);
index 01c7e31..5a32cb7 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/mm.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
+#include <linux/kthread.h>
 
 #include <linux/sunrpc/types.h>
 #include <linux/sunrpc/xdr.h>
@@ -291,15 +292,14 @@ svc_pool_map_put(void)
 
 
 /*
- * Set the current thread's cpus_allowed mask so that it
+ * Set the given thread's cpus_allowed mask so that it
  * will only run on cpus in the given pool.
- *
- * Returns 1 and fills in oldmask iff a cpumask was applied.
  */
-static inline int
-svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask)
+static inline void
+svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx)
 {
        struct svc_pool_map *m = &svc_pool_map;
+       unsigned int node = m->pool_to[pidx];
 
        /*
         * The caller checks for sv_nrpools > 1, which
@@ -307,26 +307,17 @@ svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask)
         */
        BUG_ON(m->count == 0);
 
-       switch (m->mode)
-       {
-       default:
-               return 0;
+       switch (m->mode) {
        case SVC_POOL_PERCPU:
        {
-               unsigned int cpu = m->pool_to[pidx];
-
-               *oldmask = current->cpus_allowed;
-               set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
-               return 1;
+               set_cpus_allowed_ptr(task, &cpumask_of_cpu(node));
+               break;
        }
        case SVC_POOL_PERNODE:
        {
-               unsigned int node = m->pool_to[pidx];
                node_to_cpumask_ptr(nodecpumask, node);
-
-               *oldmask = current->cpus_allowed;
-               set_cpus_allowed_ptr(current, nodecpumask);
-               return 1;
+               set_cpus_allowed_ptr(task, nodecpumask);
+               break;
        }
        }
 }
@@ -443,7 +434,7 @@ EXPORT_SYMBOL(svc_create);
 struct svc_serv *
 svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
                void (*shutdown)(struct svc_serv *serv),
-                 svc_thread_fn func, int sig, struct module *mod)
+                 svc_thread_fn func, struct module *mod)
 {
        struct svc_serv *serv;
        unsigned int npools = svc_pool_map_get();
@@ -452,7 +443,6 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
 
        if (serv != NULL) {
                serv->sv_function = func;
-               serv->sv_kill_signal = sig;
                serv->sv_module = mod;
        }
 
@@ -461,7 +451,8 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
 EXPORT_SYMBOL(svc_create_pooled);
 
 /*
- * Destroy an RPC service.  Should be called with the BKL held
+ * Destroy an RPC service. Should be called with appropriate locking to
+ * protect the sv_nrthreads, sv_permsocks and sv_tempsocks.
  */
 void
 svc_destroy(struct svc_serv *serv)
@@ -577,46 +568,6 @@ out_enomem:
 }
 EXPORT_SYMBOL(svc_prepare_thread);
 
-/*
- * Create a thread in the given pool.  Caller must hold BKL.
- * On a NUMA or SMP machine, with a multi-pool serv, the thread
- * will be restricted to run on the cpus belonging to the pool.
- */
-static int
-__svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
-                   struct svc_pool *pool)
-{
-       struct svc_rqst *rqstp;
-       int             error = -ENOMEM;
-       int             have_oldmask = 0;
-       cpumask_t       uninitialized_var(oldmask);
-
-       rqstp = svc_prepare_thread(serv, pool);
-       if (IS_ERR(rqstp)) {
-               error = PTR_ERR(rqstp);
-               goto out;
-       }
-
-       if (serv->sv_nrpools > 1)
-               have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask);
-
-       error = kernel_thread((int (*)(void *)) func, rqstp, 0);
-
-       if (have_oldmask)
-               set_cpus_allowed(current, oldmask);
-
-       if (error < 0)
-               goto out_thread;
-       svc_sock_update_bufs(serv);
-       error = 0;
-out:
-       return error;
-
-out_thread:
-       svc_exit_thread(rqstp);
-       goto out;
-}
-
 /*
  * Choose a pool in which to create a new thread, for svc_set_num_threads
  */
@@ -674,7 +625,7 @@ found_pool:
  * of threads the given number.  If `pool' is non-NULL, applies
  * only to threads in that pool, otherwise round-robins between
  * all pools.  Must be called with a svc_get() reference and
- * the BKL held.
+ * the BKL or another lock to protect access to svc_serv fields.
  *
  * Destroying threads relies on the service threads filling in
  * rqstp->rq_task, which only the nfs ones do.  Assumes the serv
@@ -686,7 +637,9 @@ found_pool:
 int
 svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
 {
-       struct task_struct *victim;
+       struct svc_rqst *rqstp;
+       struct task_struct *task;
+       struct svc_pool *chosen_pool;
        int error = 0;
        unsigned int state = serv->sv_nrthreads-1;
 
@@ -702,18 +655,34 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
-       /* create new threads */
+       /* create new threads; sv_name is passed as data, never as a format */
        while (nrservs > 0) {
                nrservs--;
+               chosen_pool = choose_pool(serv, pool, &state);
+
+               rqstp = svc_prepare_thread(serv, chosen_pool);
+               if (IS_ERR(rqstp)) {
+                       error = PTR_ERR(rqstp);
+                       break;
+               }
+
                __module_get(serv->sv_module);
-               error = __svc_create_thread(serv->sv_function, serv,
-                                           choose_pool(serv, pool, &state));
-               if (error < 0) {
+               task = kthread_create(serv->sv_function, rqstp, "%s", serv->sv_name);
+               if (IS_ERR(task)) {
+                       error = PTR_ERR(task);
                        module_put(serv->sv_module);
+                       svc_exit_thread(rqstp);
                        break;
                }
+
+               rqstp->rq_task = task;
+               if (serv->sv_nrpools > 1)
+                       svc_pool_map_set_cpumask(task, chosen_pool->sp_id);
+
+               svc_sock_update_bufs(serv);
+               wake_up_process(task);
        }
        /* destroy old threads */
        while (nrservs < 0 &&
-              (victim = choose_victim(serv, pool, &state)) != NULL) {
-               send_sig(serv->sv_kill_signal, victim, 1);
+              (task = choose_victim(serv, pool, &state)) != NULL) {
+               send_sig(SIGINT, task, 1);
                nrservs++;
        }
 
@@ -722,7 +691,8 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
 EXPORT_SYMBOL(svc_set_num_threads);
 
 /*
- * Called from a server thread as it's exiting.  Caller must hold BKL.
+ * Called from a server thread as it's exiting. Caller must hold the BKL or
+ * the "service mutex", whichever is appropriate for the service.
  */
 void
 svc_exit_thread(struct svc_rqst *rqstp)
index 88c0ca2..8710117 100644 (file)
@@ -69,6 +69,10 @@ atomic_t rdma_stat_rq_prod;
 atomic_t rdma_stat_sq_poll;
 atomic_t rdma_stat_sq_prod;
 
+/* Temporary NFS request map and context caches */
+struct kmem_cache *svc_rdma_map_cachep;
+struct kmem_cache *svc_rdma_ctxt_cachep;
+
 /*
  * This function implements reading and resetting an atomic_t stat
  * variable through read/write to a proc file. Any write to the file
@@ -236,11 +240,14 @@ static ctl_table svcrdma_root_table[] = {
 void svc_rdma_cleanup(void)
 {
        dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
+       flush_scheduled_work();
        if (svcrdma_table_header) {
                unregister_sysctl_table(svcrdma_table_header);
                svcrdma_table_header = NULL;
        }
        svc_unreg_xprt_class(&svc_rdma_class);
+       kmem_cache_destroy(svc_rdma_map_cachep);
+       kmem_cache_destroy(svc_rdma_ctxt_cachep);
 }
 
 int svc_rdma_init(void)
@@ -255,9 +262,37 @@ int svc_rdma_init(void)
                svcrdma_table_header =
                        register_sysctl_table(svcrdma_root_table);
 
+       /* Create the temporary map cache */
+       svc_rdma_map_cachep = kmem_cache_create("svc_rdma_map_cache",
+                                               sizeof(struct svc_rdma_req_map),
+                                               0,
+                                               SLAB_HWCACHE_ALIGN,
+                                               NULL);
+       if (!svc_rdma_map_cachep) {
+               printk(KERN_INFO "Could not allocate map cache.\n");
+               goto err0;
+       }
+
+       /* Create the temporary context cache */
+       svc_rdma_ctxt_cachep =
+               kmem_cache_create("svc_rdma_ctxt_cache",
+                                 sizeof(struct svc_rdma_op_ctxt),
+                                 0,
+                                 SLAB_HWCACHE_ALIGN,
+                                 NULL);
+       if (!svc_rdma_ctxt_cachep) {
+               printk(KERN_INFO "Could not allocate WR ctxt cache.\n");
+               goto err1;
+       }
+
        /* Register RDMA with the SVC transport switch */
        svc_reg_xprt_class(&svc_rdma_class);
        return 0;
+ err1:
+       kmem_cache_destroy(svc_rdma_map_cachep);
+ err0:
+       unregister_sysctl_table(svcrdma_table_header);
+       return -ENOMEM;
 }
 MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
 MODULE_DESCRIPTION("SVC RDMA Transport");
index 06ab484..b4b17f4 100644 (file)
@@ -112,11 +112,6 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
        rqstp->rq_arg.tail[0].iov_len = 0;
 }
 
-struct chunk_sge {
-       int start;              /* sge no for this chunk */
-       int count;              /* sge count for this chunk */
-};
-
 /* Encode a read-chunk-list as an array of IB SGE
  *
  * Assumptions:
@@ -134,8 +129,8 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
                           struct svc_rqst *rqstp,
                           struct svc_rdma_op_ctxt *head,
                           struct rpcrdma_msg *rmsgp,
-                          struct ib_sge *sge,
-                          struct chunk_sge *ch_sge_ary,
+                          struct svc_rdma_req_map *rpl_map,
+                          struct svc_rdma_req_map *chl_map,
                           int ch_count,
                           int byte_count)
 {
@@ -156,22 +151,18 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
        head->arg.head[0] = rqstp->rq_arg.head[0];
        head->arg.tail[0] = rqstp->rq_arg.tail[0];
        head->arg.pages = &head->pages[head->count];
-       head->sge[0].length = head->count; /* save count of hdr pages */
+       head->hdr_count = head->count; /* save count of hdr pages */
        head->arg.page_base = 0;
        head->arg.page_len = ch_bytes;
        head->arg.len = rqstp->rq_arg.len + ch_bytes;
        head->arg.buflen = rqstp->rq_arg.buflen + ch_bytes;
        head->count++;
-       ch_sge_ary[0].start = 0;
+       chl_map->ch[0].start = 0;
        while (byte_count) {
+               rpl_map->sge[sge_no].iov_base =
+                       page_address(rqstp->rq_arg.pages[page_no]) + page_off;
                sge_bytes = min_t(int, PAGE_SIZE-page_off, ch_bytes);
-               sge[sge_no].addr =
-                       ib_dma_map_page(xprt->sc_cm_id->device,
-                                       rqstp->rq_arg.pages[page_no],
-                                       page_off, sge_bytes,
-                                       DMA_FROM_DEVICE);
-               sge[sge_no].length = sge_bytes;
-               sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
+               rpl_map->sge[sge_no].iov_len = sge_bytes;
                /*
                 * Don't bump head->count here because the same page
                 * may be used by multiple SGE.
@@ -187,11 +178,11 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
                 * SGE, move to the next SGE
                 */
                if (ch_bytes == 0) {
-                       ch_sge_ary[ch_no].count =
-                               sge_no - ch_sge_ary[ch_no].start;
+                       chl_map->ch[ch_no].count =
+                               sge_no - chl_map->ch[ch_no].start;
                        ch_no++;
                        ch++;
-                       ch_sge_ary[ch_no].start = sge_no;
+                       chl_map->ch[ch_no].start = sge_no;
                        ch_bytes = ch->rc_target.rs_length;
                        /* If bytes remaining account for next chunk */
                        if (byte_count) {
@@ -220,18 +211,25 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
        return sge_no;
 }
 
-static void rdma_set_ctxt_sge(struct svc_rdma_op_ctxt *ctxt,
-                             struct ib_sge *sge,
+static void rdma_set_ctxt_sge(struct svcxprt_rdma *xprt,
+                             struct svc_rdma_op_ctxt *ctxt,
+                             struct kvec *vec,
                              u64 *sgl_offset,
                              int count)
 {
        int i;
 
        ctxt->count = count;
+       ctxt->direction = DMA_FROM_DEVICE;
        for (i = 0; i < count; i++) {
-               ctxt->sge[i].addr = sge[i].addr;
-               ctxt->sge[i].length = sge[i].length;
-               *sgl_offset = *sgl_offset + sge[i].length;
+               atomic_inc(&xprt->sc_dma_used);
+               ctxt->sge[i].addr =
+                       ib_dma_map_single(xprt->sc_cm_id->device,
+                                         vec[i].iov_base, vec[i].iov_len,
+                                         DMA_FROM_DEVICE);
+               ctxt->sge[i].length = vec[i].iov_len;
+               ctxt->sge[i].lkey = xprt->sc_phys_mr->lkey;
+               *sgl_offset = *sgl_offset + vec[i].iov_len;
        }
 }
 
@@ -282,34 +280,29 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
        struct ib_send_wr read_wr;
        int err = 0;
        int ch_no;
-       struct ib_sge *sge;
        int ch_count;
        int byte_count;
        int sge_count;
        u64 sgl_offset;
        struct rpcrdma_read_chunk *ch;
        struct svc_rdma_op_ctxt *ctxt = NULL;
-       struct svc_rdma_op_ctxt *tmp_sge_ctxt;
-       struct svc_rdma_op_ctxt *tmp_ch_ctxt;
-       struct chunk_sge *ch_sge_ary;
+       struct svc_rdma_req_map *rpl_map;
+       struct svc_rdma_req_map *chl_map;
 
        /* If no read list is present, return 0 */
        ch = svc_rdma_get_read_chunk(rmsgp);
        if (!ch)
                return 0;
 
-       /* Allocate temporary contexts to keep SGE */
-       BUG_ON(sizeof(struct ib_sge) < sizeof(struct chunk_sge));
-       tmp_sge_ctxt = svc_rdma_get_context(xprt);
-       sge = tmp_sge_ctxt->sge;
-       tmp_ch_ctxt = svc_rdma_get_context(xprt);
-       ch_sge_ary = (struct chunk_sge *)tmp_ch_ctxt->sge;
-
        svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count);
        if (ch_count > RPCSVC_MAXPAGES)
                return -EINVAL;
+
+       /* Allocate maps only after validation so -EINVAL cannot leak them */
+       rpl_map = svc_rdma_get_req_map();
+       chl_map = svc_rdma_get_req_map();
        sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp,
-                                   sge, ch_sge_ary,
+                                   rpl_map, chl_map,
                                    ch_count, byte_count);
        sgl_offset = 0;
        ch_no = 0;
@@ -331,14 +324,15 @@ next_sge:
                read_wr.wr.rdma.remote_addr =
                        get_unaligned(&(ch->rc_target.rs_offset)) +
                        sgl_offset;
-               read_wr.sg_list = &sge[ch_sge_ary[ch_no].start];
+               read_wr.sg_list = ctxt->sge;
                read_wr.num_sge =
-                       rdma_read_max_sge(xprt, ch_sge_ary[ch_no].count);
-               rdma_set_ctxt_sge(ctxt, &sge[ch_sge_ary[ch_no].start],
+                       rdma_read_max_sge(xprt, chl_map->ch[ch_no].count);
+               rdma_set_ctxt_sge(xprt, ctxt,
+                                 &rpl_map->sge[chl_map->ch[ch_no].start],
                                  &sgl_offset,
                                  read_wr.num_sge);
                if (((ch+1)->rc_discrim == 0) &&
-                   (read_wr.num_sge == ch_sge_ary[ch_no].count)) {
+                   (read_wr.num_sge == chl_map->ch[ch_no].count)) {
                        /*
                         * Mark the last RDMA_READ with a bit to
                         * indicate all RPC data has been fetched from
@@ -358,9 +352,9 @@ next_sge:
                }
                atomic_inc(&rdma_stat_read);
 
-               if (read_wr.num_sge < ch_sge_ary[ch_no].count) {
-                       ch_sge_ary[ch_no].count -= read_wr.num_sge;
-                       ch_sge_ary[ch_no].start += read_wr.num_sge;
+               if (read_wr.num_sge < chl_map->ch[ch_no].count) {
+                       chl_map->ch[ch_no].count -= read_wr.num_sge;
+                       chl_map->ch[ch_no].start += read_wr.num_sge;
                        goto next_sge;
                }
                sgl_offset = 0;
@@ -368,8 +362,8 @@ next_sge:
        }
 
  out:
-       svc_rdma_put_context(tmp_sge_ctxt, 0);
-       svc_rdma_put_context(tmp_ch_ctxt, 0);
+       svc_rdma_put_req_map(rpl_map);
+       svc_rdma_put_req_map(chl_map);
 
        /* Detach arg pages. svc_recv will replenish them */
        for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++)
@@ -399,7 +393,7 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
                rqstp->rq_pages[page_no] = head->pages[page_no];
        }
        /* Point rq_arg.pages past header */
-       rqstp->rq_arg.pages = &rqstp->rq_pages[head->sge[0].length];
+       rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count];
        rqstp->rq_arg.page_len = head->arg.page_len;
        rqstp->rq_arg.page_base = head->arg.page_base;
 
index fb82b1b..a19b22b 100644 (file)
  * SGE[2..sge_count-2] data from xdr->pages[]
  * SGE[sge_count-1]    data from xdr->tail.
  *
+ * The max SGE we need is the length of the XDR / pagesize + one for
+ * head + one for tail + one for RPCRDMA header. Since RPCSVC_MAXPAGES
+ * reserves a page for both the request and the reply header, and this
+ * array is only concerned with the reply we are assured that we have
+ * one extra page for the RPCRDMA header.
  */
-static struct ib_sge *xdr_to_sge(struct svcxprt_rdma *xprt,
-                                struct xdr_buf *xdr,
-                                struct ib_sge *sge,
-                                int *sge_count)
+static void xdr_to_sge(struct svcxprt_rdma *xprt,
+                      struct xdr_buf *xdr,
+                      struct svc_rdma_req_map *vec)
 {
-       /* Max we need is the length of the XDR / pagesize + one for
-        * head + one for tail + one for RPCRDMA header
-        */
        int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3;
        int sge_no;
-       u32 byte_count = xdr->len;
        u32 sge_bytes;
        u32 page_bytes;
-       int page_off;
+       u32 page_off;
        int page_no;
 
+       BUG_ON(xdr->len !=
+              (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len));
+
        /* Skip the first sge, this is for the RPCRDMA header */
        sge_no = 1;
 
        /* Head SGE */
-       sge[sge_no].addr = ib_dma_map_single(xprt->sc_cm_id->device,
-                                            xdr->head[0].iov_base,
-                                            xdr->head[0].iov_len,
-                                            DMA_TO_DEVICE);
-       sge_bytes = min_t(u32, byte_count, xdr->head[0].iov_len);
-       byte_count -= sge_bytes;
-       sge[sge_no].length = sge_bytes;
-       sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
+       vec->sge[sge_no].iov_base = xdr->head[0].iov_base;
+       vec->sge[sge_no].iov_len = xdr->head[0].iov_len;
        sge_no++;
 
        /* pages SGE */
        page_no = 0;
        page_bytes = xdr->page_len;
        page_off = xdr->page_base;
-       while (byte_count && page_bytes) {
-               sge_bytes = min_t(u32, byte_count, (PAGE_SIZE-page_off));
-               sge[sge_no].addr =
-                       ib_dma_map_page(xprt->sc_cm_id->device,
-                                       xdr->pages[page_no], page_off,
-                                       sge_bytes, DMA_TO_DEVICE);
-               sge_bytes = min(sge_bytes, page_bytes);
-               byte_count -= sge_bytes;
+       while (page_bytes) {
+               vec->sge[sge_no].iov_base =
+                       page_address(xdr->pages[page_no]) + page_off;
+               sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off));
                page_bytes -= sge_bytes;
-               sge[sge_no].length = sge_bytes;
-               sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
+               vec->sge[sge_no].iov_len = sge_bytes;
 
                sge_no++;
                page_no++;
@@ -116,36 +108,24 @@ static struct ib_sge *xdr_to_sge(struct svcxprt_rdma *xprt,
        }
 
        /* Tail SGE */
-       if (byte_count && xdr->tail[0].iov_len) {
-               sge[sge_no].addr =
-                       ib_dma_map_single(xprt->sc_cm_id->device,
-                                         xdr->tail[0].iov_base,
-                                         xdr->tail[0].iov_len,
-                                         DMA_TO_DEVICE);
-               sge_bytes = min_t(u32, byte_count, xdr->tail[0].iov_len);
-               byte_count -= sge_bytes;
-               sge[sge_no].length = sge_bytes;
-               sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
+       if (xdr->tail[0].iov_len) {
+               vec->sge[sge_no].iov_base = xdr->tail[0].iov_base;
+               vec->sge[sge_no].iov_len = xdr->tail[0].iov_len;
                sge_no++;
        }
 
        BUG_ON(sge_no > sge_max);
-       BUG_ON(byte_count != 0);
-
-       *sge_count = sge_no;
-       return sge;
+       vec->count = sge_no;
 }
 
-
 /* Assumptions:
  * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE
  */
 static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
                      u32 rmr, u64 to,
                      u32 xdr_off, int write_len,
-                     struct ib_sge *xdr_sge, int sge_count)
+                     struct svc_rdma_req_map *vec)
 {
-       struct svc_rdma_op_ctxt *tmp_sge_ctxt;
        struct ib_send_wr write_wr;
        struct ib_sge *sge;
        int xdr_sge_no;
@@ -154,25 +134,23 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
        int sge_off;
        int bc;
        struct svc_rdma_op_ctxt *ctxt;
-       int ret = 0;
 
-       BUG_ON(sge_count > RPCSVC_MAXPAGES);
+       BUG_ON(vec->count > RPCSVC_MAXPAGES);
        dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, "
-               "write_len=%d, xdr_sge=%p, sge_count=%d\n",
+               "write_len=%d, vec->sge=%p, vec->count=%lu\n",
                rmr, (unsigned long long)to, xdr_off,
-               write_len, xdr_sge, sge_count);
+               write_len, vec->sge, vec->count);
 
        ctxt = svc_rdma_get_context(xprt);
-       ctxt->count = 0;
-       tmp_sge_ctxt = svc_rdma_get_context(xprt);
-       sge = tmp_sge_ctxt->sge;
+       ctxt->direction = DMA_TO_DEVICE;
+       sge = ctxt->sge;
 
        /* Find the SGE associated with xdr_off */
-       for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < sge_count;
+       for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < vec->count;
             xdr_sge_no++) {
-               if (xdr_sge[xdr_sge_no].length > bc)
+               if (vec->sge[xdr_sge_no].iov_len > bc)
                        break;
-               bc -= xdr_sge[xdr_sge_no].length;
+               bc -= vec->sge[xdr_sge_no].iov_len;
        }
 
        sge_off = bc;
@@ -180,21 +158,28 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
        sge_no = 0;
 
-       /* Copy the remaining SGE */
-       while (bc != 0 && xdr_sge_no < sge_count) {
-               sge[sge_no].addr = xdr_sge[xdr_sge_no].addr + sge_off;
-               sge[sge_no].lkey = xdr_sge[xdr_sge_no].lkey;
+       /* Map the remaining SGE; count a mapping only after it succeeds */
+       while (bc != 0 && xdr_sge_no < vec->count) {
+               sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
                sge_bytes = min((size_t)bc,
-                               (size_t)(xdr_sge[xdr_sge_no].length-sge_off));
+                               (size_t)(vec->sge[xdr_sge_no].iov_len-sge_off));
                sge[sge_no].length = sge_bytes;
-
+               sge[sge_no].addr =
+                       ib_dma_map_single(xprt->sc_cm_id->device,
+                                         (void *)
+                                         vec->sge[xdr_sge_no].iov_base + sge_off,
+                                         sge_bytes, DMA_TO_DEVICE);
+               if (dma_mapping_error(sge[sge_no].addr))
+                       goto err;
+               atomic_inc(&xprt->sc_dma_used);
                sge_off = 0;
                sge_no++;
+               ctxt->count++;
                xdr_sge_no++;
                bc -= sge_bytes;
        }
 
        BUG_ON(bc != 0);
-       BUG_ON(xdr_sge_no > sge_count);
+       BUG_ON(xdr_sge_no > vec->count);
 
        /* Prepare WRITE WR */
        memset(&write_wr, 0, sizeof write_wr);
@@ -209,21 +194,20 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
 
        /* Post It */
        atomic_inc(&rdma_stat_write);
-       if (svc_rdma_send(xprt, &write_wr)) {
-               svc_rdma_put_context(ctxt, 1);
-               /* Fatal error, close transport */
-               ret = -EIO;
-       }
-       svc_rdma_put_context(tmp_sge_ctxt, 0);
-       return ret;
+       if (svc_rdma_send(xprt, &write_wr))
+               goto err;
+       return 0;
+ err:
+       svc_rdma_put_context(ctxt, 0);
+       /* Fatal error, close transport */
+       return -EIO;
 }
 
 static int send_write_chunks(struct svcxprt_rdma *xprt,
                             struct rpcrdma_msg *rdma_argp,
                             struct rpcrdma_msg *rdma_resp,
                             struct svc_rqst *rqstp,
-                            struct ib_sge *sge,
-                            int sge_count)
+                            struct svc_rdma_req_map *vec)
 {
        u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len;
        int write_len;
@@ -269,8 +253,7 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
                                         rs_offset + chunk_off,
                                         xdr_off,
                                         this_write,
-                                        sge,
-                                        sge_count);
+                                        vec);
                        if (ret) {
                                dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
                                        ret);
@@ -292,8 +275,7 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
                             struct rpcrdma_msg *rdma_argp,
                             struct rpcrdma_msg *rdma_resp,
                             struct svc_rqst *rqstp,
-                            struct ib_sge *sge,
-                            int sge_count)
+                            struct svc_rdma_req_map *vec)
 {
        u32 xfer_len = rqstp->rq_res.len;
        int write_len;
@@ -341,8 +323,7 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
                                         rs_offset + chunk_off,
                                         xdr_off,
                                         this_write,
-                                        sge,
-                                        sge_count);
+                                        vec);
                        if (ret) {
                                dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
                                        ret);
@@ -380,7 +361,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
                      struct page *page,
                      struct rpcrdma_msg *rdma_resp,
                      struct svc_rdma_op_ctxt *ctxt,
-                     int sge_count,
+                     struct svc_rdma_req_map *vec,
                      int byte_count)
 {
        struct ib_send_wr send_wr;
@@ -405,6 +386,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
        ctxt->count = 1;
 
        /* Prepare the SGE for the RPCRDMA Header */
+       atomic_inc(&rdma->sc_dma_used);
        ctxt->sge[0].addr =
                ib_dma_map_page(rdma->sc_cm_id->device,
                                page, 0, PAGE_SIZE, DMA_TO_DEVICE);
@@ -413,10 +395,16 @@ static int send_reply(struct svcxprt_rdma *rdma,
        ctxt->sge[0].lkey = rdma->sc_phys_mr->lkey;
 
        /* Determine how many of our SGE are to be transmitted */
-       for (sge_no = 1; byte_count && sge_no < sge_count; sge_no++) {
-               sge_bytes = min((size_t)ctxt->sge[sge_no].length,
-                               (size_t)byte_count);
+       for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) {
+               sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
                byte_count -= sge_bytes;
+               atomic_inc(&rdma->sc_dma_used);
+               ctxt->sge[sge_no].addr =
+                       ib_dma_map_single(rdma->sc_cm_id->device,
+                                         vec->sge[sge_no].iov_base,
+                                         sge_bytes, DMA_TO_DEVICE);
+               ctxt->sge[sge_no].length = sge_bytes;
+               ctxt->sge[sge_no].lkey = rdma->sc_phys_mr->lkey;
        }
        BUG_ON(byte_count != 0);
 
@@ -428,8 +416,10 @@ static int send_reply(struct svcxprt_rdma *rdma,
                ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
                ctxt->count++;
                rqstp->rq_respages[page_no] = NULL;
+               /* If there are more pages than SGE, terminate SGE list */
+               if (page_no+1 >= sge_no)
+                       ctxt->sge[page_no+1].length = 0;
        }
-
        BUG_ON(sge_no > rdma->sc_max_sge);
        memset(&send_wr, 0, sizeof send_wr);
        ctxt->wr_op = IB_WR_SEND;
@@ -473,20 +463,20 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
        enum rpcrdma_proc reply_type;
        int ret;
        int inline_bytes;
-       struct ib_sge *sge;
-       int sge_count = 0;
        struct page *res_page;
        struct svc_rdma_op_ctxt *ctxt;
+       struct svc_rdma_req_map *vec;
 
        dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
 
        /* Get the RDMA request header. */
        rdma_argp = xdr_start(&rqstp->rq_arg);
 
-       /* Build an SGE for the XDR */
+       /* Build a req vec for the XDR */
        ctxt = svc_rdma_get_context(rdma);
        ctxt->direction = DMA_TO_DEVICE;
-       sge = xdr_to_sge(rdma, &rqstp->rq_res, ctxt->sge, &sge_count);
+       vec = svc_rdma_get_req_map();
+       xdr_to_sge(rdma, &rqstp->rq_res, vec);
 
        inline_bytes = rqstp->rq_res.len;
 
@@ -503,7 +493,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 
        /* Send any write-chunk data and build resp write-list */
        ret = send_write_chunks(rdma, rdma_argp, rdma_resp,
-                               rqstp, sge, sge_count);
+                               rqstp, vec);
        if (ret < 0) {
                printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n",
                       ret);
@@ -513,7 +503,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 
        /* Send any reply-list data and update resp reply-list */
        ret = send_reply_chunks(rdma, rdma_argp, rdma_resp,
-                               rqstp, sge, sge_count);
+                               rqstp, vec);
        if (ret < 0) {
                printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n",
                       ret);
@@ -521,11 +511,13 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
        }
        inline_bytes -= ret;
 
-       ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, sge_count,
+       ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec,
                         inline_bytes);
+       svc_rdma_put_req_map(vec);
        dprintk("svcrdma: send_reply returns %d\n", ret);
        return ret;
  error:
+       svc_rdma_put_req_map(vec);
        svc_rdma_put_context(ctxt, 0);
        put_page(res_page);
        return ret;
index e132509..19ddc38 100644 (file)
@@ -84,70 +84,37 @@ struct svc_xprt_class svc_rdma_class = {
        .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,
 };
 
-static int rdma_bump_context_cache(struct svcxprt_rdma *xprt)
+/* WR context cache. Created in svc_rdma.c  */
+extern struct kmem_cache *svc_rdma_ctxt_cachep;
+
+struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
 {
-       int target;
-       int at_least_one = 0;
        struct svc_rdma_op_ctxt *ctxt;
 
-       target = min(xprt->sc_ctxt_cnt + xprt->sc_ctxt_bump,
-                    xprt->sc_ctxt_max);
-
-       spin_lock_bh(&xprt->sc_ctxt_lock);
-       while (xprt->sc_ctxt_cnt < target) {
-               xprt->sc_ctxt_cnt++;
-               spin_unlock_bh(&xprt->sc_ctxt_lock);
-
-               ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
-
-               spin_lock_bh(&xprt->sc_ctxt_lock);
-               if (ctxt) {
-                       at_least_one = 1;
-                       INIT_LIST_HEAD(&ctxt->free_list);
-                       list_add(&ctxt->free_list, &xprt->sc_ctxt_free);
-               } else {
-                       /* kmalloc failed...give up for now */
-                       xprt->sc_ctxt_cnt--;
+       while (1) {
+               ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep, GFP_KERNEL);
+               if (ctxt)
                        break;
-               }
+               schedule_timeout_uninterruptible(msecs_to_jiffies(500));
        }
-       spin_unlock_bh(&xprt->sc_ctxt_lock);
-       dprintk("svcrdma: sc_ctxt_max=%d, sc_ctxt_cnt=%d\n",
-               xprt->sc_ctxt_max, xprt->sc_ctxt_cnt);
-       return at_least_one;
+       ctxt->xprt = xprt;
+       INIT_LIST_HEAD(&ctxt->dto_q);
+       ctxt->count = 0;
+       atomic_inc(&xprt->sc_ctxt_used);
+       return ctxt;
 }
 
-struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
+static void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
 {
-       struct svc_rdma_op_ctxt *ctxt;
-
-       while (1) {
-               spin_lock_bh(&xprt->sc_ctxt_lock);
-               if (unlikely(list_empty(&xprt->sc_ctxt_free))) {
-                       /* Try to bump my cache. */
-                       spin_unlock_bh(&xprt->sc_ctxt_lock);
-
-                       if (rdma_bump_context_cache(xprt))
-                               continue;
-
-                       printk(KERN_INFO "svcrdma: sleeping waiting for "
-                              "context memory on xprt=%p\n",
-                              xprt);
-                       schedule_timeout_uninterruptible(msecs_to_jiffies(500));
-                       continue;
-               }
-               ctxt = list_entry(xprt->sc_ctxt_free.next,
-                                 struct svc_rdma_op_ctxt,
-                                 free_list);
-               list_del_init(&ctxt->free_list);
-               spin_unlock_bh(&xprt->sc_ctxt_lock);
-               ctxt->xprt = xprt;
-               INIT_LIST_HEAD(&ctxt->dto_q);
-               ctxt->count = 0;
-               atomic_inc(&xprt->sc_ctxt_used);
-               break;
+       struct svcxprt_rdma *xprt = ctxt->xprt;
+       int i;
+       for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) {
+               atomic_dec(&xprt->sc_dma_used);
+               ib_dma_unmap_single(xprt->sc_cm_id->device,
+                                   ctxt->sge[i].addr,
+                                   ctxt->sge[i].length,
+                                   ctxt->direction);
        }
-       return ctxt;
 }
 
 void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
@@ -161,18 +128,36 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
                for (i = 0; i < ctxt->count; i++)
                        put_page(ctxt->pages[i]);
 
-       for (i = 0; i < ctxt->count; i++)
-               ib_dma_unmap_single(xprt->sc_cm_id->device,
-                                   ctxt->sge[i].addr,
-                                   ctxt->sge[i].length,
-                                   ctxt->direction);
-
-       spin_lock_bh(&xprt->sc_ctxt_lock);
-       list_add(&ctxt->free_list, &xprt->sc_ctxt_free);
-       spin_unlock_bh(&xprt->sc_ctxt_lock);
+       kmem_cache_free(svc_rdma_ctxt_cachep, ctxt);
        atomic_dec(&xprt->sc_ctxt_used);
 }
 
+/* Temporary NFS request map cache. Created in svc_rdma.c  */
+extern struct kmem_cache *svc_rdma_map_cachep;
+
+/*
+ * Temporary NFS req mappings are shared across all transport
+ * instances. These are short lived and should be bounded by the number
+ * of concurrent server threads * depth of the SQ.
+ */
+struct svc_rdma_req_map *svc_rdma_get_req_map(void)
+{
+       struct svc_rdma_req_map *map;
+       while (1) {
+               map = kmem_cache_alloc(svc_rdma_map_cachep, GFP_KERNEL);
+               if (map)
+                       break;
+               schedule_timeout_uninterruptible(msecs_to_jiffies(500));
+       }
+       map->count = 0;
+       return map;
+}
+
+void svc_rdma_put_req_map(struct svc_rdma_req_map *map)
+{
+       kmem_cache_free(svc_rdma_map_cachep, map);
+}
+
 /* ib_cq event handler */
 static void cq_event_handler(struct ib_event *event, void *context)
 {
@@ -302,6 +287,7 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt)
                ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
                ctxt->wc_status = wc.status;
                ctxt->byte_len = wc.byte_len;
+               svc_rdma_unmap_dma(ctxt);
                if (wc.status != IB_WC_SUCCESS) {
                        /* Close the transport */
                        dprintk("svcrdma: transport closing putting ctxt %p\n", ctxt);
@@ -351,6 +337,7 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt)
                ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
                xprt = ctxt->xprt;
 
+               svc_rdma_unmap_dma(ctxt);
                if (wc.status != IB_WC_SUCCESS)
                        /* Close the transport */
                        set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
@@ -361,10 +348,13 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt)
 
                switch (ctxt->wr_op) {
                case IB_WR_SEND:
-               case IB_WR_RDMA_WRITE:
                        svc_rdma_put_context(ctxt, 1);
                        break;
 
+               case IB_WR_RDMA_WRITE:
+                       svc_rdma_put_context(ctxt, 0);
+                       break;
+
                case IB_WR_RDMA_READ:
                        if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
                                struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
@@ -423,40 +413,6 @@ static void sq_comp_handler(struct ib_cq *cq, void *cq_context)
        tasklet_schedule(&dto_tasklet);
 }
 
-static void create_context_cache(struct svcxprt_rdma *xprt,
-                                int ctxt_count, int ctxt_bump, int ctxt_max)
-{
-       struct svc_rdma_op_ctxt *ctxt;
-       int i;
-
-       xprt->sc_ctxt_max = ctxt_max;
-       xprt->sc_ctxt_bump = ctxt_bump;
-       xprt->sc_ctxt_cnt = 0;
-       atomic_set(&xprt->sc_ctxt_used, 0);
-
-       INIT_LIST_HEAD(&xprt->sc_ctxt_free);
-       for (i = 0; i < ctxt_count; i++) {
-               ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
-               if (ctxt) {
-                       INIT_LIST_HEAD(&ctxt->free_list);
-                       list_add(&ctxt->free_list, &xprt->sc_ctxt_free);
-                       xprt->sc_ctxt_cnt++;
-               }
-       }
-}
-
-static void destroy_context_cache(struct svcxprt_rdma *xprt)
-{
-       while (!list_empty(&xprt->sc_ctxt_free)) {
-               struct svc_rdma_op_ctxt *ctxt;
-               ctxt = list_entry(xprt->sc_ctxt_free.next,
-                                 struct svc_rdma_op_ctxt,
-                                 free_list);
-               list_del_init(&ctxt->free_list);
-               kfree(ctxt);
-       }
-}
-
 static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
                                             int listener)
 {
@@ -473,7 +429,6 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
 
        spin_lock_init(&cma_xprt->sc_lock);
        spin_lock_init(&cma_xprt->sc_read_complete_lock);
-       spin_lock_init(&cma_xprt->sc_ctxt_lock);
        spin_lock_init(&cma_xprt->sc_rq_dto_lock);
 
        cma_xprt->sc_ord = svcrdma_ord;
@@ -482,21 +437,9 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
        cma_xprt->sc_max_requests = svcrdma_max_requests;
        cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT;
        atomic_set(&cma_xprt->sc_sq_count, 0);
+       atomic_set(&cma_xprt->sc_ctxt_used, 0);
 
-       if (!listener) {
-               int reqs = cma_xprt->sc_max_requests;
-               create_context_cache(cma_xprt,
-                                    reqs << 1, /* starting size */
-                                    reqs,      /* bump amount */
-                                    reqs +
-                                    cma_xprt->sc_sq_depth +
-                                    RPCRDMA_MAX_THREADS + 1); /* max */
-               if (list_empty(&cma_xprt->sc_ctxt_free)) {
-                       kfree(cma_xprt);
-                       return NULL;
-               }
-               clear_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
-       } else
+       if (listener)
                set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
 
        return cma_xprt;
@@ -532,6 +475,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
                BUG_ON(sge_no >= xprt->sc_max_sge);
                page = svc_rdma_get_page();
                ctxt->pages[sge_no] = page;
+               atomic_inc(&xprt->sc_dma_used);
                pa = ib_dma_map_page(xprt->sc_cm_id->device,
                                     page, 0, PAGE_SIZE,
                                     DMA_FROM_DEVICE);
@@ -566,7 +510,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
  * will call the recvfrom method on the listen xprt which will accept the new
  * connection.
  */
-static void handle_connect_req(struct rdma_cm_id *new_cma_id)
+static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird)
 {
        struct svcxprt_rdma *listen_xprt = new_cma_id->context;
        struct svcxprt_rdma *newxprt;
@@ -583,6 +527,9 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id)
        dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n",
                newxprt, newxprt->sc_cm_id, listen_xprt);
 
+       /* Save client advertised inbound read limit for use later in accept. */
+       newxprt->sc_ord = client_ird;
+
        /* Set the local and remote addresses in the transport */
        sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
        svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa));
@@ -619,7 +566,8 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id,
        case RDMA_CM_EVENT_CONNECT_REQUEST:
                dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, "
                        "event=%d\n", cma_id, cma_id->context, event->event);
-               handle_connect_req(cma_id);
+               handle_connect_req(cma_id,
+                                  event->param.conn.responder_resources);
                break;
 
        case RDMA_CM_EVENT_ESTABLISHED:
@@ -793,8 +741,12 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
                                   (size_t)svcrdma_max_requests);
        newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests;
 
-       newxprt->sc_ord =  min((size_t)devattr.max_qp_rd_atom,
-                              (size_t)svcrdma_ord);
+       /*
+        * Limit ORD based on client limit, local device limit, and
+        * configured svcrdma limit.
+        */
+       newxprt->sc_ord = min_t(size_t, devattr.max_qp_rd_atom, newxprt->sc_ord);
+       newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord);
 
        newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device);
        if (IS_ERR(newxprt->sc_pd)) {
@@ -987,7 +939,6 @@ static void __svc_rdma_free(struct work_struct *work)
         * cm_id because the device ptr is needed to unmap the dma in
         * svc_rdma_put_context.
         */
-       spin_lock_bh(&rdma->sc_read_complete_lock);
        while (!list_empty(&rdma->sc_read_complete_q)) {
                struct svc_rdma_op_ctxt *ctxt;
                ctxt = list_entry(rdma->sc_read_complete_q.next,
@@ -996,10 +947,8 @@ static void __svc_rdma_free(struct work_struct *work)
                list_del_init(&ctxt->dto_q);
                svc_rdma_put_context(ctxt, 1);
        }
-       spin_unlock_bh(&rdma->sc_read_complete_lock);
 
        /* Destroy queued, but not processed recv completions */
-       spin_lock_bh(&rdma->sc_rq_dto_lock);
        while (!list_empty(&rdma->sc_rq_dto_q)) {
                struct svc_rdma_op_ctxt *ctxt;
                ctxt = list_entry(rdma->sc_rq_dto_q.next,
@@ -1008,10 +957,10 @@ static void __svc_rdma_free(struct work_struct *work)
                list_del_init(&ctxt->dto_q);
                svc_rdma_put_context(ctxt, 1);
        }
-       spin_unlock_bh(&rdma->sc_rq_dto_lock);
 
        /* Warn if we leaked a resource or under-referenced */
        WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0);
+       WARN_ON(atomic_read(&rdma->sc_dma_used) != 0);
 
        /* Destroy the QP if present (not a listener) */
        if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
@@ -1032,7 +981,6 @@ static void __svc_rdma_free(struct work_struct *work)
        /* Destroy the CM ID */
        rdma_destroy_id(rdma->sc_cm_id);
 
-       destroy_context_cache(rdma);
        kfree(rdma);
 }
 
@@ -1132,6 +1080,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
        length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va);
 
        /* Prepare SGE for local address */
+       atomic_inc(&xprt->sc_dma_used);
        sge.addr = ib_dma_map_page(xprt->sc_cm_id->device,
                                   p, 0, PAGE_SIZE, DMA_FROM_DEVICE);
        sge.lkey = xprt->sc_phys_mr->lkey;