Merge branch 'master' of /home/trondmy/kernel/linux-2.6/ into merge_linus
author Trond Myklebust <Trond.Myklebust@netapp.com>
Thu, 7 Dec 2006 21:35:17 +0000 (16:35 -0500)
committer Trond Myklebust <Trond.Myklebust@netapp.com>
Thu, 7 Dec 2006 21:35:17 +0000 (16:35 -0500)
fs/lockd/clntproc.c
fs/nfs/direct.c
fs/nfs/inode.c
fs/nfs/pagelist.c
fs/nfs/read.c
fs/nfs/write.c
include/linux/sunrpc/sched.h
net/sunrpc/sched.c
net/sunrpc/xprtsock.c

diff --combined fs/lockd/clntproc.c
@@@ -13,6 -13,7 +13,7 @@@
  #include <linux/nfs_fs.h>
  #include <linux/utsname.h>
  #include <linux/smp_lock.h>
+ #include <linux/freezer.h>
  #include <linux/sunrpc/clnt.h>
  #include <linux/sunrpc/svc.h>
  #include <linux/lockd/lockd.h>
@@@ -729,7 -730,7 +730,7 @@@ static void nlmclnt_cancel_callback(str
                goto retry_cancel;
        }
  
 -      dprintk("lockd: cancel status %d (task %d)\n",
 +      dprintk("lockd: cancel status %u (task %u)\n",
                        req->a_res.status, task->tk_pid);
  
        switch (req->a_res.status) {
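
Note: the one substantive change in this hunk is the format-specifier fix; both
a_res.status and tk_pid are unsigned, so %u is the matching conversion. A
minimal illustration of the idiom (hypothetical values; assumes the sunrpc
dprintk from <linux/sunrpc/debug.h>):

    static void example_log_cancel(unsigned int status, unsigned int pid)
    {
            /* %u matches the unsigned arguments; %d would misrender
             * values above INT_MAX and trips printf-format checkers. */
            dprintk("lockd: cancel status %u (task %u)\n", status, pid);
    }
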
diff --combined fs/nfs/direct.c
@@@ -58,7 -58,7 +58,7 @@@
  
  #define NFSDBG_FACILITY               NFSDBG_VFS
  
- static kmem_cache_t *nfs_direct_cachep;
+ static struct kmem_cache *nfs_direct_cachep;
  
  /*
   * This represents a set of asynchronous requests that we're waiting on
@@@ -143,7 -143,7 +143,7 @@@ static inline struct nfs_direct_req *nf
  {
        struct nfs_direct_req *dreq;
  
-       dreq = kmem_cache_alloc(nfs_direct_cachep, SLAB_KERNEL);
+       dreq = kmem_cache_alloc(nfs_direct_cachep, GFP_KERNEL);
        if (!dreq)
                return NULL;
  
@@@ -307,7 -307,9 +307,7 @@@ static ssize_t nfs_direct_read_schedule
  
                data->task.tk_cookie = (unsigned long) inode;
  
 -              lock_kernel();
                rpc_execute(&data->task);
 -              unlock_kernel();
  
                dfprintk(VFS, "NFS: %5u initiated direct read call (req %s/%Ld, %zu bytes @ offset %Lu)\n",
                                data->task.tk_pid,
@@@ -473,7 -475,9 +473,7 @@@ static void nfs_direct_commit_schedule(
  
        dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
  
 -      lock_kernel();
        rpc_execute(&data->task);
 -      unlock_kernel();
  }
  
  static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
@@@ -637,7 -641,9 +637,7 @@@ static ssize_t nfs_direct_write_schedul
                data->task.tk_priority = RPC_PRIORITY_NORMAL;
                data->task.tk_cookie = (unsigned long) inode;
  
 -              lock_kernel();
                rpc_execute(&data->task);
 -              unlock_kernel();
  
                dfprintk(VFS, "NFS: %5u initiated direct write call (req %s/%Ld, %zu bytes @ offset %Lu)\n",
                                data->task.tk_pid,
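
Note: the direct-I/O hunks above all drop the Big Kernel Lock around
rpc_execute(); the RPC scheduler now handles its own locking. A schematic of
the pattern with a hypothetical wrapper (the real call sites inline the call):

    #include <linux/sunrpc/sched.h>

    /* Before: lock_kernel(); rpc_execute(&data->task); unlock_kernel();
     * After: the task is dispatched directly. */
    static void example_execute(struct rpc_task *task)
    {
            rpc_execute(task);
    }
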
diff --combined fs/nfs/inode.c
@@@ -55,7 -55,7 +55,7 @@@ static int nfs_update_inode(struct inod
  
  static void nfs_zap_acl_cache(struct inode *);
  
- static kmem_cache_t * nfs_inode_cachep;
+ static struct kmem_cache * nfs_inode_cachep;
  
  static inline unsigned long
  nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
@@@ -422,7 -422,7 +422,7 @@@ int nfs_getattr(struct vfsmount *mnt, s
        int err;
  
        /* Flush out writes to the server in order to update c/mtime */
 -      nfs_sync_inode_wait(inode, 0, 0, FLUSH_NOCOMMIT);
 +      nfs_sync_mapping_range(inode->i_mapping, 0, 0, FLUSH_NOCOMMIT);
  
        /*
         * We may force a getattr if the user cares about atime.
@@@ -1080,7 -1080,7 +1080,7 @@@ void nfs4_clear_inode(struct inode *ino
  struct inode *nfs_alloc_inode(struct super_block *sb)
  {
        struct nfs_inode *nfsi;
-       nfsi = (struct nfs_inode *)kmem_cache_alloc(nfs_inode_cachep, SLAB_KERNEL);
+       nfsi = (struct nfs_inode *)kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL);
        if (!nfsi)
                return NULL;
        nfsi->flags = 0UL;
@@@ -1111,7 -1111,7 +1111,7 @@@ static inline void nfs4_init_once(struc
  #endif
  }
  
- static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
  {
        struct nfs_inode *nfsi = (struct nfs_inode *) foo;
  
diff --combined fs/nfs/pagelist.c
  #include <linux/nfs_page.h>
  #include <linux/nfs_fs.h>
  #include <linux/nfs_mount.h>
 +#include <linux/writeback.h>
  
  #define NFS_PARANOIA 1
  
- static kmem_cache_t *nfs_page_cachep;
+ static struct kmem_cache *nfs_page_cachep;
  
  static inline struct nfs_page *
  nfs_page_alloc(void)
  {
        struct nfs_page *p;
-       p = kmem_cache_alloc(nfs_page_cachep, SLAB_KERNEL);
+       p = kmem_cache_alloc(nfs_page_cachep, GFP_KERNEL);
        if (p) {
                memset(p, 0, sizeof(*p));
                INIT_LIST_HEAD(&p->wb_list);
@@@ -269,10 -268,11 +269,10 @@@ nfs_coalesce_requests(struct list_head 
  
  #define NFS_SCAN_MAXENTRIES 16
  /**
 - * nfs_scan_lock_dirty - Scan the radix tree for dirty requests
 - * @nfsi: NFS inode
 + * nfs_scan_dirty - Scan the radix tree for dirty requests
 + * @mapping: pointer to address space
 + * @wbc: writeback_control structure
   * @dst: Destination list
 - * @idx_start: lower bound of page->index to scan
 - * @npages: idx_start + npages sets the upper bound to scan.
   *
   * Moves elements from one of the inode request lists.
   * If the number of requests is set to 0, the entire address_space
   * starting at index idx_start, is scanned.
   * The requests are *not* checked to ensure that they form a contiguous set.
   * You must be holding the inode's req_lock when calling this function
   */
 -int
 -nfs_scan_lock_dirty(struct nfs_inode *nfsi, struct list_head *dst,
 -            unsigned long idx_start, unsigned int npages)
 +long nfs_scan_dirty(struct address_space *mapping,
 +                      struct writeback_control *wbc,
 +                      struct list_head *dst)
  {
 +      struct nfs_inode *nfsi = NFS_I(mapping->host);
        struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
        struct nfs_page *req;
 -      unsigned long idx_end;
 +      pgoff_t idx_start, idx_end;
 +      long res = 0;
        int found, i;
 -      int res;
  
 -      res = 0;
 -      if (npages == 0)
 -              idx_end = ~0;
 -      else
 -              idx_end = idx_start + npages - 1;
 +      if (nfsi->ndirty == 0)
 +              return 0;
 +      if (wbc->range_cyclic) {
 +              idx_start = 0;
 +              idx_end = ULONG_MAX;
 +      } else if (wbc->range_end == 0) {
 +              idx_start = wbc->range_start >> PAGE_CACHE_SHIFT;
 +              idx_end = ULONG_MAX;
 +      } else {
 +              idx_start = wbc->range_start >> PAGE_CACHE_SHIFT;
 +              idx_end = wbc->range_end >> PAGE_CACHE_SHIFT;
 +      }
  
        for (;;) {
 +              unsigned int toscan = NFS_SCAN_MAXENTRIES;
 +
                found = radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree,
 -                              (void **)&pgvec[0], idx_start, NFS_SCAN_MAXENTRIES,
 +                              (void **)&pgvec[0], idx_start, toscan,
                                NFS_PAGE_TAG_DIRTY);
 +
 +              /* Did we make progress? */
                if (found <= 0)
                        break;
 +
                for (i = 0; i < found; i++) {
                        req = pgvec[i];
 -                      if (req->wb_index > idx_end)
 +                      if (!wbc->range_cyclic && req->wb_index > idx_end)
                                goto out;
  
 +                      /* Try to lock request and mark it for writeback */
 +                      if (!nfs_set_page_writeback_locked(req))
 +                              goto next;
 +                      radix_tree_tag_clear(&nfsi->nfs_page_tree,
 +                                      req->wb_index, NFS_PAGE_TAG_DIRTY);
 +                      nfsi->ndirty--;
 +                      nfs_list_remove_request(req);
 +                      nfs_list_add_request(req, dst);
 +                      res++;
 +                      if (res == LONG_MAX)
 +                              goto out;
 +next:
                        idx_start = req->wb_index + 1;
 -
 -                      if (nfs_set_page_writeback_locked(req)) {
 -                              radix_tree_tag_clear(&nfsi->nfs_page_tree,
 -                                              req->wb_index, NFS_PAGE_TAG_DIRTY);
 -                              nfs_list_remove_request(req);
 -                              nfs_list_add_request(req, dst);
 -                              dec_zone_page_state(req->wb_page, NR_FILE_DIRTY);
 -                              res++;
 -                      }
                }
        }
  out:
 +      WARN_ON ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty));
        return res;
  }
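
Note: nfs_scan_dirty() now derives its page-index window from the
writeback_control instead of taking idx_start/npages arguments. The range logic
above, restated on its own (assuming the 2.6.19-era writeback_control fields):

    #include <linux/writeback.h>
    #include <linux/pagemap.h>

    static void example_wbc_range(struct writeback_control *wbc,
                                  pgoff_t *idx_start, pgoff_t *idx_end)
    {
            if (wbc->range_cyclic) {                /* scan everything */
                    *idx_start = 0;
                    *idx_end = ULONG_MAX;
            } else if (wbc->range_end == 0) {       /* open-ended range */
                    *idx_start = wbc->range_start >> PAGE_CACHE_SHIFT;
                    *idx_end = ULONG_MAX;
            } else {                                /* bounded byte range */
                    *idx_start = wbc->range_start >> PAGE_CACHE_SHIFT;
                    *idx_end = wbc->range_end >> PAGE_CACHE_SHIFT;
            }
    }
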
  
diff --combined fs/nfs/read.c
@@@ -30,7 -30,6 +30,7 @@@
  
  #include <asm/system.h>
  
 +#include "internal.h"
  #include "iostat.h"
  
  #define NFSDBG_FACILITY               NFSDBG_PAGECACHE
@@@ -39,7 -38,7 +39,7 @@@ static int nfs_pagein_one(struct list_h
  static const struct rpc_call_ops nfs_read_partial_ops;
  static const struct rpc_call_ops nfs_read_full_ops;
  
- static kmem_cache_t *nfs_rdata_cachep;
+ static struct kmem_cache *nfs_rdata_cachep;
  static mempool_t *nfs_rdata_mempool;
  
  #define MIN_POOL_READ (32)
@@@ -47,7 -46,7 +47,7 @@@
  struct nfs_read_data *nfs_readdata_alloc(size_t len)
  {
        unsigned int pagecount = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-       struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, SLAB_NOFS);
+       struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_NOFS);
  
        if (p) {
                memset(p, 0, sizeof(*p));
        return p;
  }
  
 -static void nfs_readdata_free(struct nfs_read_data *p)
 +static void nfs_readdata_rcu_free(struct rcu_head *head)
  {
 +      struct nfs_read_data *p = container_of(head, struct nfs_read_data, task.u.tk_rcu);
        if (p && (p->pagevec != &p->page_array[0]))
                kfree(p->pagevec);
        mempool_free(p, nfs_rdata_mempool);
  }
  
 -void nfs_readdata_release(void *data)
 +static void nfs_readdata_free(struct nfs_read_data *rdata)
  {
 -        nfs_readdata_free(data);
 +      call_rcu_bh(&rdata->task.u.tk_rcu, nfs_readdata_rcu_free);
  }
  
 -static
 -unsigned int nfs_page_length(struct inode *inode, struct page *page)
 +void nfs_readdata_release(void *data)
  {
 -      loff_t i_size = i_size_read(inode);
 -      unsigned long idx;
 -
 -      if (i_size <= 0)
 -              return 0;
 -      idx = (i_size - 1) >> PAGE_CACHE_SHIFT;
 -      if (page->index > idx)
 -              return 0;
 -      if (page->index != idx)
 -              return PAGE_CACHE_SIZE;
 -      return 1 + ((i_size - 1) & (PAGE_CACHE_SIZE - 1));
 +        nfs_readdata_free(data);
  }
  
  static
@@@ -130,12 -139,12 +130,12 @@@ static int nfs_readpage_sync(struct nfs
  {
        unsigned int    rsize = NFS_SERVER(inode)->rsize;
        unsigned int    count = PAGE_CACHE_SIZE;
 -      int             result;
 +      int result = -ENOMEM;
        struct nfs_read_data *rdata;
  
        rdata = nfs_readdata_alloc(count);
        if (!rdata)
 -              return -ENOMEM;
 +              goto out_unlock;
  
        memset(rdata, 0, sizeof(*rdata));
        rdata->flags = (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0);
        result = 0;
  
  io_error:
 -      unlock_page(page);
        nfs_readdata_free(rdata);
 +out_unlock:
 +      unlock_page(page);
        return result;
  }
  
@@@ -216,7 -224,7 +216,7 @@@ static int nfs_readpage_async(struct nf
        struct nfs_page *new;
        unsigned int len;
  
 -      len = nfs_page_length(inode, page);
 +      len = nfs_page_length(page);
        if (len == 0)
                return nfs_return_empty_page(page);
        new = nfs_create_request(ctx, inode, page, 0, len);
@@@ -308,7 -316,9 +308,7 @@@ static void nfs_execute_read(struct nfs
        sigset_t oldset;
  
        rpc_clnt_sigmask(clnt, &oldset);
 -      lock_kernel();
        rpc_execute(&data->task);
 -      unlock_kernel();
        rpc_clnt_sigunmask(clnt, &oldset);
  }
  
@@@ -444,55 -454,6 +444,55 @@@ nfs_pagein_list(struct list_head *head
        return error;
  }
  
 +/*
 + * This is the callback from RPC telling us whether a reply was
 + * received or some error occurred (timeout or socket shutdown).
 + */
 +int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data)
 +{
 +      int status;
 +
 +      dprintk("%s: %4d, (status %d)\n", __FUNCTION__, task->tk_pid,
 +                      task->tk_status);
 +
 +      status = NFS_PROTO(data->inode)->read_done(task, data);
 +      if (status != 0)
 +              return status;
 +
 +      nfs_add_stats(data->inode, NFSIOS_SERVERREADBYTES, data->res.count);
 +
 +      if (task->tk_status == -ESTALE) {
 +              set_bit(NFS_INO_STALE, &NFS_FLAGS(data->inode));
 +              nfs_mark_for_revalidate(data->inode);
 +      }
 +      spin_lock(&data->inode->i_lock);
 +      NFS_I(data->inode)->cache_validity |= NFS_INO_INVALID_ATIME;
 +      spin_unlock(&data->inode->i_lock);
 +      return 0;
 +}
 +
 +static int nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data)
 +{
 +      struct nfs_readargs *argp = &data->args;
 +      struct nfs_readres *resp = &data->res;
 +
 +      if (resp->eof || resp->count == argp->count)
 +              return 0;
 +
 +      /* This is a short read! */
 +      nfs_inc_stats(data->inode, NFSIOS_SHORTREAD);
 +      /* Has the server at least made some progress? */
 +      if (resp->count == 0)
 +              return 0;
 +
 +      /* Yes, so retry the read at the end of the data */
 +      argp->offset += resp->count;
 +      argp->pgbase += resp->count;
 +      argp->count -= resp->count;
 +      rpc_restart_call(task);
 +      return -EAGAIN;
 +}
 +
  /*
   * Handle a read reply that fills part of a page.
   */
@@@ -502,16 -463,12 +502,16 @@@ static void nfs_readpage_result_partial
        struct nfs_page *req = data->req;
        struct page *page = req->wb_page;
   
 -      if (likely(task->tk_status >= 0))
 -              nfs_readpage_truncate_uninitialised_page(data);
 -      else
 -              SetPageError(page);
        if (nfs_readpage_result(task, data) != 0)
                return;
 +
 +      if (likely(task->tk_status >= 0)) {
 +              nfs_readpage_truncate_uninitialised_page(data);
 +              if (nfs_readpage_retry(task, data) != 0)
 +                      return;
 +      }
 +      if (unlikely(task->tk_status < 0))
 +              SetPageError(page);
        if (atomic_dec_and_test(&req->wb_complete)) {
                if (!PageError(page))
                        SetPageUptodate(page);
@@@ -539,13 -496,25 +539,13 @@@ static void nfs_readpage_set_pages_upto
        count += base;
        for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++)
                SetPageUptodate(*pages);
 -      if (count != 0)
 +      if (count == 0)
 +              return;
 +      /* Was this a short read? */
 +      if (data->res.eof || data->res.count == data->args.count)
                SetPageUptodate(*pages);
  }
  
 -static void nfs_readpage_set_pages_error(struct nfs_read_data *data)
 -{
 -      unsigned int count = data->args.count;
 -      unsigned int base = data->args.pgbase;
 -      struct page **pages;
 -
 -      pages = &data->args.pages[base >> PAGE_CACHE_SHIFT];
 -      base &= ~PAGE_CACHE_MASK;
 -      count += base;
 -      for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++)
 -              SetPageError(*pages);
 -      if (count != 0)
 -              SetPageError(*pages);
 -}
 -
  /*
   * This is the callback from RPC telling us whether a reply was
   * received or some error occurred (timeout or socket shutdown).
@@@ -554,20 -523,19 +554,20 @@@ static void nfs_readpage_result_full(st
  {
        struct nfs_read_data *data = calldata;
  
 +      if (nfs_readpage_result(task, data) != 0)
 +              return;
        /*
 -       * Note: nfs_readpage_result may change the values of
 +       * Note: nfs_readpage_retry may change the values of
         * data->args. In the multi-page case, we therefore need
 -       * to ensure that we call the next nfs_readpage_set_page_uptodate()
 -       * first in the multi-page case.
 +       * to ensure that we call nfs_readpage_set_pages_uptodate()
 +       * first.
         */
        if (likely(task->tk_status >= 0)) {
                nfs_readpage_truncate_uninitialised_page(data);
                nfs_readpage_set_pages_uptodate(data);
 -      } else
 -              nfs_readpage_set_pages_error(data);
 -      if (nfs_readpage_result(task, data) != 0)
 -              return;
 +              if (nfs_readpage_retry(task, data) != 0)
 +                      return;
 +      }
        while (!list_empty(&data->pages)) {
                struct nfs_page *req = nfs_list_entry(data->pages.next);
  
@@@ -581,6 -549,50 +581,6 @@@ static const struct rpc_call_ops nfs_re
        .rpc_release = nfs_readdata_release,
  };
  
 -/*
 - * This is the callback from RPC telling us whether a reply was
 - * received or some error occurred (timeout or socket shutdown).
 - */
 -int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data)
 -{
 -      struct nfs_readargs *argp = &data->args;
 -      struct nfs_readres *resp = &data->res;
 -      int status;
 -
 -      dprintk("NFS: %4d nfs_readpage_result, (status %d)\n",
 -              task->tk_pid, task->tk_status);
 -
 -      status = NFS_PROTO(data->inode)->read_done(task, data);
 -      if (status != 0)
 -              return status;
 -
 -      nfs_add_stats(data->inode, NFSIOS_SERVERREADBYTES, resp->count);
 -
 -      if (task->tk_status < 0) {
 -              if (task->tk_status == -ESTALE) {
 -                      set_bit(NFS_INO_STALE, &NFS_FLAGS(data->inode));
 -                      nfs_mark_for_revalidate(data->inode);
 -              }
 -      } else if (resp->count < argp->count && !resp->eof) {
 -              /* This is a short read! */
 -              nfs_inc_stats(data->inode, NFSIOS_SHORTREAD);
 -              /* Has the server at least made some progress? */
 -              if (resp->count != 0) {
 -                      /* Yes, so retry the read at the end of the data */
 -                      argp->offset += resp->count;
 -                      argp->pgbase += resp->count;
 -                      argp->count -= resp->count;
 -                      rpc_restart_call(task);
 -                      return -EAGAIN;
 -              }
 -              task->tk_status = -EIO;
 -      }
 -      spin_lock(&data->inode->i_lock);
 -      NFS_I(data->inode)->cache_validity |= NFS_INO_INVALID_ATIME;
 -      spin_unlock(&data->inode->i_lock);
 -      return 0;
 -}
 -
  /*
   * Read a page over NFS.
   * We read the page synchronously in the following case:
@@@ -614,10 -626,9 +614,10 @@@ int nfs_readpage(struct file *file, str
                goto out_error;
  
        if (file == NULL) {
 +              error = -EBADF;
                ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
                if (ctx == NULL)
 -                      return -EBADF;
 +                      goto out_error;
        } else
                ctx = get_nfs_open_context((struct nfs_open_context *)
                                file->private_data);
@@@ -652,7 -663,7 +652,7 @@@ readpage_async_filler(void *data, struc
        unsigned int len;
  
        nfs_wb_page(inode, page);
 -      len = nfs_page_length(inode, page);
 +      len = nfs_page_length(page);
        if (len == 0)
                return nfs_return_empty_page(page);
        new = nfs_create_request(desc->ctx, inode, page, 0, len);
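
Note: the short-read handling that used to live inside nfs_readpage_result() is
now the separate nfs_readpage_retry() helper shown above. Its control flow,
condensed (hypothetical name, same arithmetic; the stats increment is omitted):

    static int example_retry_short_read(struct rpc_task *task,
                                        struct nfs_readargs *argp,
                                        struct nfs_readres *resp)
    {
            if (resp->eof || resp->count == argp->count || resp->count == 0)
                    return 0;               /* EOF, complete, or no progress */
            argp->offset += resp->count;    /* resume where the reply ended */
            argp->pgbase += resp->count;
            argp->count  -= resp->count;
            rpc_restart_call(task);         /* re-issue the shortened read */
            return -EAGAIN;
    }
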
diff --combined fs/nfs/write.c
@@@ -63,7 -63,6 +63,7 @@@
  #include <linux/smp_lock.h>
  
  #include "delegation.h"
 +#include "internal.h"
  #include "iostat.h"
  
  #define NFSDBG_FACILITY               NFSDBG_PAGECACHE
   * Local function declarations
   */
  static struct nfs_page * nfs_update_request(struct nfs_open_context*,
 -                                          struct inode *,
                                            struct page *,
                                            unsigned int, unsigned int);
 +static void nfs_mark_request_dirty(struct nfs_page *req);
  static int nfs_wait_on_write_congestion(struct address_space *, int);
  static int nfs_wait_on_requests(struct inode *, unsigned long, unsigned int);
 -static int nfs_flush_inode(struct inode *inode, unsigned long idx_start,
 -                         unsigned int npages, int how);
 +static long nfs_flush_mapping(struct address_space *mapping, struct writeback_control *wbc, int how);
  static const struct rpc_call_ops nfs_write_partial_ops;
  static const struct rpc_call_ops nfs_write_full_ops;
  static const struct rpc_call_ops nfs_commit_ops;
  
- static kmem_cache_t *nfs_wdata_cachep;
+ static struct kmem_cache *nfs_wdata_cachep;
  static mempool_t *nfs_wdata_mempool;
  static mempool_t *nfs_commit_mempool;
  
@@@ -93,7 -93,7 +93,7 @@@ static DECLARE_WAIT_QUEUE_HEAD(nfs_writ
  
  struct nfs_write_data *nfs_commit_alloc(void)
  {
-       struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, SLAB_NOFS);
+       struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS);
  
        if (p) {
                memset(p, 0, sizeof(*p));
        return p;
  }
  
 -void nfs_commit_free(struct nfs_write_data *p)
 +void nfs_commit_rcu_free(struct rcu_head *head)
  {
 +      struct nfs_write_data *p = container_of(head, struct nfs_write_data, task.u.tk_rcu);
        if (p && (p->pagevec != &p->page_array[0]))
                kfree(p->pagevec);
        mempool_free(p, nfs_commit_mempool);
  }
  
 +void nfs_commit_free(struct nfs_write_data *wdata)
 +{
 +      call_rcu_bh(&wdata->task.u.tk_rcu, nfs_commit_rcu_free);
 +}
 +
  struct nfs_write_data *nfs_writedata_alloc(size_t len)
  {
        unsigned int pagecount = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-       struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, SLAB_NOFS);
+       struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS);
  
        if (p) {
                memset(p, 0, sizeof(*p));
        return p;
  }
  
 -static void nfs_writedata_free(struct nfs_write_data *p)
 +static void nfs_writedata_rcu_free(struct rcu_head *head)
  {
 +      struct nfs_write_data *p = container_of(head, struct nfs_write_data, task.u.tk_rcu);
        if (p && (p->pagevec != &p->page_array[0]))
                kfree(p->pagevec);
        mempool_free(p, nfs_wdata_mempool);
  }
  
 +static void nfs_writedata_free(struct nfs_write_data *wdata)
 +{
 +      call_rcu_bh(&wdata->task.u.tk_rcu, nfs_writedata_rcu_free);
 +}
 +
  void nfs_writedata_release(void *wdata)
  {
        nfs_writedata_free(wdata);
  }
  
 +static struct nfs_page *nfs_page_find_request_locked(struct page *page)
 +{
 +      struct nfs_page *req = NULL;
 +
 +      if (PagePrivate(page)) {
 +              req = (struct nfs_page *)page_private(page);
 +              if (req != NULL)
 +                      atomic_inc(&req->wb_count);
 +      }
 +      return req;
 +}
 +
 +static struct nfs_page *nfs_page_find_request(struct page *page)
 +{
 +      struct nfs_page *req = NULL;
 +      spinlock_t *req_lock = &NFS_I(page->mapping->host)->req_lock;
 +
 +      spin_lock(req_lock);
 +      req = nfs_page_find_request_locked(page);
 +      spin_unlock(req_lock);
 +      return req;
 +}
 +
  /* Adjust the file length if we're writing beyond the end */
  static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count)
  {
   */
  static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int count)
  {
 -      loff_t end_offs;
 -
        if (PageUptodate(page))
                return;
        if (base != 0)
                return;
 -      if (count == PAGE_CACHE_SIZE) {
 -              SetPageUptodate(page);
 -              return;
 -      }
 -
 -      end_offs = i_size_read(page->mapping->host) - 1;
 -      if (end_offs < 0)
 +      if (count != nfs_page_length(page))
                return;
 -      /* Is this the last page? */
 -      if (page->index != (unsigned long)(end_offs >> PAGE_CACHE_SHIFT))
 -              return;
 -      /* This is the last page: set PG_uptodate if we cover the entire
 -       * extent of the data, then zero the rest of the page.
 -       */
 -      if (count == (unsigned int)(end_offs & (PAGE_CACHE_SIZE - 1)) + 1) {
 +      if (count != PAGE_CACHE_SIZE)
                memclear_highpage_flush(page, count, PAGE_CACHE_SIZE - count);
 -              SetPageUptodate(page);
 -      }
 +      SetPageUptodate(page);
  }
  
 -/*
 - * Write a page synchronously.
 - * Offset is the data offset within the page.
 - */
 -static int nfs_writepage_sync(struct nfs_open_context *ctx, struct inode *inode,
 -              struct page *page, unsigned int offset, unsigned int count,
 -              int how)
 -{
 -      unsigned int    wsize = NFS_SERVER(inode)->wsize;
 -      int             result, written = 0;
 -      struct nfs_write_data *wdata;
 -
 -      wdata = nfs_writedata_alloc(wsize);
 -      if (!wdata)
 -              return -ENOMEM;
 -
 -      wdata->flags = how;
 -      wdata->cred = ctx->cred;
 -      wdata->inode = inode;
 -      wdata->args.fh = NFS_FH(inode);
 -      wdata->args.context = ctx;
 -      wdata->args.pages = &page;
 -      wdata->args.stable = NFS_FILE_SYNC;
 -      wdata->args.pgbase = offset;
 -      wdata->args.count = wsize;
 -      wdata->res.fattr = &wdata->fattr;
 -      wdata->res.verf = &wdata->verf;
 -
 -      dprintk("NFS:      nfs_writepage_sync(%s/%Ld %d@%Ld)\n",
 -              inode->i_sb->s_id,
 -              (long long)NFS_FILEID(inode),
 -              count, (long long)(page_offset(page) + offset));
 -
 -      set_page_writeback(page);
 -      nfs_begin_data_update(inode);
 -      do {
 -              if (count < wsize)
 -                      wdata->args.count = count;
 -              wdata->args.offset = page_offset(page) + wdata->args.pgbase;
 -
 -              result = NFS_PROTO(inode)->write(wdata);
 -
 -              if (result < 0) {
 -                      /* Must mark the page invalid after I/O error */
 -                      ClearPageUptodate(page);
 -                      goto io_error;
 -              }
 -              if (result < wdata->args.count)
 -                      printk(KERN_WARNING "NFS: short write, count=%u, result=%d\n",
 -                                      wdata->args.count, result);
 -
 -              wdata->args.offset += result;
 -              wdata->args.pgbase += result;
 -              written += result;
 -              count -= result;
 -              nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, result);
 -      } while (count);
 -      /* Update file length */
 -      nfs_grow_file(page, offset, written);
 -      /* Set the PG_uptodate flag? */
 -      nfs_mark_uptodate(page, offset, written);
 -
 -      if (PageError(page))
 -              ClearPageError(page);
 -
 -io_error:
 -      nfs_end_data_update(inode);
 -      end_page_writeback(page);
 -      nfs_writedata_free(wdata);
 -      return written ? written : result;
 -}
 -
 -static int nfs_writepage_async(struct nfs_open_context *ctx,
 -              struct inode *inode, struct page *page,
 +static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
                unsigned int offset, unsigned int count)
  {
        struct nfs_page *req;
 +      int ret;
  
 -      req = nfs_update_request(ctx, inode, page, offset, count);
 -      if (IS_ERR(req))
 -              return PTR_ERR(req);
 +      for (;;) {
 +              req = nfs_update_request(ctx, page, offset, count);
 +              if (!IS_ERR(req))
 +                      break;
 +              ret = PTR_ERR(req);
 +              if (ret != -EBUSY)
 +                      return ret;
 +              ret = nfs_wb_page(page->mapping->host, page);
 +              if (ret != 0)
 +                      return ret;
 +      }
        /* Update file length */
        nfs_grow_file(page, offset, count);
        /* Set the PG_uptodate flag? */
@@@ -244,95 -288,74 +244,95 @@@ static int wb_priority(struct writeback
        return 0;
  }
  
 +/*
 + * Find an associated nfs write request, and prepare to flush it out
 + * Returns 1 if there was no write request, or if the request was
 + * already tagged by nfs_set_page_dirty. Returns 0 if the request
 + * was not tagged.
 + * May also return an error if the user signalled nfs_wait_on_request().
 + */
 +static int nfs_page_mark_flush(struct page *page)
 +{
 +      struct nfs_page *req;
 +      spinlock_t *req_lock = &NFS_I(page->mapping->host)->req_lock;
 +      int ret;
 +
 +      spin_lock(req_lock);
 +      for(;;) {
 +              req = nfs_page_find_request_locked(page);
 +              if (req == NULL) {
 +                      spin_unlock(req_lock);
 +                      return 1;
 +              }
 +              if (nfs_lock_request_dontget(req))
 +                      break;
 +              /* Note: If we hold the page lock, as is the case in nfs_writepage,
 +               *       then the call to nfs_lock_request_dontget() will always
 +               *       succeed provided that someone hasn't already marked the
 +               *       request as dirty (in which case we don't care).
 +               */
 +              spin_unlock(req_lock);
 +              ret = nfs_wait_on_request(req);
 +              nfs_release_request(req);
 +              if (ret != 0)
 +                      return ret;
 +              spin_lock(req_lock);
 +      }
 +      spin_unlock(req_lock);
 +      if (test_and_set_bit(PG_FLUSHING, &req->wb_flags) == 0) {
 +              nfs_mark_request_dirty(req);
 +              set_page_writeback(page);
 +      }
 +      ret = test_bit(PG_NEED_FLUSH, &req->wb_flags);
 +      nfs_unlock_request(req);
 +      return ret;
 +}
 +
  /*
   * Write an mmapped page to the server.
   */
 -int nfs_writepage(struct page *page, struct writeback_control *wbc)
 +static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc)
  {
        struct nfs_open_context *ctx;
        struct inode *inode = page->mapping->host;
 -      unsigned long end_index;
 -      unsigned offset = PAGE_CACHE_SIZE;
 -      loff_t i_size = i_size_read(inode);
 -      int inode_referenced = 0;
 -      int priority = wb_priority(wbc);
 +      unsigned offset;
        int err;
  
        nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
        nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
  
 -      /*
 -       * Note: We need to ensure that we have a reference to the inode
 -       *       if we are to do asynchronous writes. If not, waiting
 -       *       in nfs_wait_on_request() may deadlock with clear_inode().
 -       *
 -       *       If igrab() fails here, then it is in any case safe to
 -       *       call nfs_wb_page(), since there will be no pending writes.
 -       */
 -      if (igrab(inode) != 0)
 -              inode_referenced = 1;
 -      end_index = i_size >> PAGE_CACHE_SHIFT;
 -
 -      /* Ensure we've flushed out any previous writes */
 -      nfs_wb_page_priority(inode, page, priority);
 -
 -      /* easy case */
 -      if (page->index < end_index)
 -              goto do_it;
 -      /* things got complicated... */
 -      offset = i_size & (PAGE_CACHE_SIZE-1);
 -
 -      /* OK, are we completely out? */
 -      err = 0; /* potential race with truncate - ignore */
 -      if (page->index >= end_index+1 || !offset)
 +      err = nfs_page_mark_flush(page);
 +      if (err <= 0)
 +              goto out;
 +      err = 0;
 +      offset = nfs_page_length(page);
 +      if (!offset)
                goto out;
 -do_it:
 +
        ctx = nfs_find_open_context(inode, NULL, FMODE_WRITE);
        if (ctx == NULL) {
                err = -EBADF;
                goto out;
        }
 -      lock_kernel();
 -      if (!IS_SYNC(inode) && inode_referenced) {
 -              err = nfs_writepage_async(ctx, inode, page, 0, offset);
 -              if (!wbc->for_writepages)
 -                      nfs_flush_inode(inode, 0, 0, wb_priority(wbc));
 -      } else {
 -              err = nfs_writepage_sync(ctx, inode, page, 0,
 -                                              offset, priority);
 -              if (err >= 0) {
 -                      if (err != offset)
 -                              redirty_page_for_writepage(wbc, page);
 -                      err = 0;
 -              }
 -      }
 -      unlock_kernel();
 +      err = nfs_writepage_setup(ctx, page, 0, offset);
        put_nfs_open_context(ctx);
 +      if (err != 0)
 +              goto out;
 +      err = nfs_page_mark_flush(page);
 +      if (err > 0)
 +              err = 0;
  out:
 +      if (!wbc->for_writepages)
 +              nfs_flush_mapping(page->mapping, wbc, wb_priority(wbc));
 +      return err;
 +}
 +
 +int nfs_writepage(struct page *page, struct writeback_control *wbc)
 +{
 +      int err;
 +
 +      err = nfs_writepage_locked(page, wbc);
        unlock_page(page);
 -      if (inode_referenced)
 -              iput(inode);
        return err; 
  }
  
@@@ -356,18 -379,21 +356,18 @@@ int nfs_writepages(struct address_spac
                        return 0;
                nfs_wait_on_write_congestion(mapping, 0);
        }
 -      err = nfs_flush_inode(inode, 0, 0, wb_priority(wbc));
 +      err = nfs_flush_mapping(mapping, wbc, wb_priority(wbc));
        if (err < 0)
                goto out;
        nfs_add_stats(inode, NFSIOS_WRITEPAGES, err);
 -      wbc->nr_to_write -= err;
        if (!wbc->nonblocking && wbc->sync_mode == WB_SYNC_ALL) {
                err = nfs_wait_on_requests(inode, 0, 0);
                if (err < 0)
                        goto out;
        }
        err = nfs_commit_inode(inode, wb_priority(wbc));
 -      if (err > 0) {
 -              wbc->nr_to_write -= err;
 +      if (err > 0)
                err = 0;
 -      }
  out:
        clear_bit(BDI_write_congested, &bdi->state);
        wake_up_all(&nfs_write_congestion);
@@@ -394,7 -420,6 +394,7 @@@ static int nfs_inode_add_request(struc
                        nfsi->change_attr++;
        }
        SetPagePrivate(req->wb_page);
 +      set_page_private(req->wb_page, (unsigned long)req);
        nfsi->npages++;
        atomic_inc(&req->wb_count);
        return 0;
@@@ -411,7 -436,6 +411,7 @@@ static void nfs_inode_remove_request(st
        BUG_ON (!NFS_WBACK_BUSY(req));
  
        spin_lock(&nfsi->req_lock);
 +      set_page_private(req->wb_page, 0);
        ClearPagePrivate(req->wb_page);
        radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
        nfsi->npages--;
        nfs_release_request(req);
  }
  
 -/*
 - * Find a request
 - */
 -static inline struct nfs_page *
 -_nfs_find_request(struct inode *inode, unsigned long index)
 -{
 -      struct nfs_inode *nfsi = NFS_I(inode);
 -      struct nfs_page *req;
 -
 -      req = (struct nfs_page*)radix_tree_lookup(&nfsi->nfs_page_tree, index);
 -      if (req)
 -              atomic_inc(&req->wb_count);
 -      return req;
 -}
 -
 -static struct nfs_page *
 -nfs_find_request(struct inode *inode, unsigned long index)
 -{
 -      struct nfs_page         *req;
 -      struct nfs_inode        *nfsi = NFS_I(inode);
 -
 -      spin_lock(&nfsi->req_lock);
 -      req = _nfs_find_request(inode, index);
 -      spin_unlock(&nfsi->req_lock);
 -      return req;
 -}
 -
  /*
   * Add a request to the inode's dirty list.
   */
@@@ -440,14 -491,8 +440,14 @@@ nfs_mark_request_dirty(struct nfs_page 
        nfs_list_add_request(req, &nfsi->dirty);
        nfsi->ndirty++;
        spin_unlock(&nfsi->req_lock);
 -      inc_zone_page_state(req->wb_page, NR_FILE_DIRTY);
 -      mark_inode_dirty(inode);
 +      __mark_inode_dirty(inode, I_DIRTY_PAGES);
 +}
 +
 +static void
 +nfs_redirty_request(struct nfs_page *req)
 +{
 +      clear_bit(PG_FLUSHING, &req->wb_flags);
 +      __set_page_dirty_nobuffers(req->wb_page);
  }
  
  /*
  static inline int
  nfs_dirty_request(struct nfs_page *req)
  {
 -      struct nfs_inode *nfsi = NFS_I(req->wb_context->dentry->d_inode);
 -      return !list_empty(&req->wb_list) && req->wb_list_head == &nfsi->dirty;
 +      return test_bit(PG_FLUSHING, &req->wb_flags) == 0;
  }
  
  #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@@ -474,7 -520,7 +474,7 @@@ nfs_mark_request_commit(struct nfs_pag
        nfsi->ncommit++;
        spin_unlock(&nfsi->req_lock);
        inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
 -      mark_inode_dirty(inode);
 +      __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
  }
  #endif
  
@@@ -551,6 -597,31 +551,6 @@@ static void nfs_cancel_commit_list(stru
        }
  }
  
 -/*
 - * nfs_scan_dirty - Scan an inode for dirty requests
 - * @inode: NFS inode to scan
 - * @dst: destination list
 - * @idx_start: lower bound of page->index to scan.
 - * @npages: idx_start + npages sets the upper bound to scan.
 - *
 - * Moves requests from the inode's dirty page list.
 - * The requests are *not* checked to ensure that they form a contiguous set.
 - */
 -static int
 -nfs_scan_dirty(struct inode *inode, struct list_head *dst, unsigned long idx_start, unsigned int npages)
 -{
 -      struct nfs_inode *nfsi = NFS_I(inode);
 -      int res = 0;
 -
 -      if (nfsi->ndirty != 0) {
 -              res = nfs_scan_lock_dirty(nfsi, dst, idx_start, npages);
 -              nfsi->ndirty -= res;
 -              if ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty))
 -                      printk(KERN_ERR "NFS: desynchronized value of nfs_i.ndirty.\n");
 -      }
 -      return res;
 -}
 -
  #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
  /*
   * nfs_scan_commit - Scan an inode for commit requests
@@@ -627,27 -698,27 +627,27 @@@ static int nfs_wait_on_write_congestion
   * Note: Should always be called with the Page Lock held!
   */
  static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
 -              struct inode *inode, struct page *page,
 -              unsigned int offset, unsigned int bytes)
 +              struct page *page, unsigned int offset, unsigned int bytes)
  {
 -      struct nfs_server *server = NFS_SERVER(inode);
 +      struct inode *inode = page->mapping->host;
        struct nfs_inode *nfsi = NFS_I(inode);
        struct nfs_page         *req, *new = NULL;
        unsigned long           rqend, end;
  
        end = offset + bytes;
  
 -      if (nfs_wait_on_write_congestion(page->mapping, server->flags & NFS_MOUNT_INTR))
 +      if (nfs_wait_on_write_congestion(page->mapping, NFS_SERVER(inode)->flags & NFS_MOUNT_INTR))
                return ERR_PTR(-ERESTARTSYS);
        for (;;) {
                /* Loop over all inode entries and see if we find
                 * a request for the page we wish to update
                 */
                spin_lock(&nfsi->req_lock);
 -              req = _nfs_find_request(inode, page->index);
 +              req = nfs_page_find_request_locked(page);
                if (req) {
                        if (!nfs_lock_request_dontget(req)) {
                                int error;
 +
                                spin_unlock(&nfsi->req_lock);
                                error = nfs_wait_on_request(req);
                                nfs_release_request(req);
                                return ERR_PTR(error);
                        }
                        spin_unlock(&nfsi->req_lock);
 -                      nfs_mark_request_dirty(new);
                        return new;
                }
                spin_unlock(&nfsi->req_lock);
  int nfs_flush_incompatible(struct file *file, struct page *page)
  {
        struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
 -      struct inode    *inode = page->mapping->host;
        struct nfs_page *req;
 -      int             status = 0;
 +      int do_flush, status;
        /*
         * Look for a request corresponding to this page. If there
         * is one, and it belongs to another file, we flush it out
         * Also do the same if we find a request from an existing
         * dropped page.
         */
 -      req = nfs_find_request(inode, page->index);
 -      if (req) {
 -              if (req->wb_page != page || ctx != req->wb_context)
 -                      status = nfs_wb_page(inode, page);
 +      do {
 +              req = nfs_page_find_request(page);
 +              if (req == NULL)
 +                      return 0;
 +              do_flush = req->wb_page != page || req->wb_context != ctx
 +                      || !nfs_dirty_request(req);
                nfs_release_request(req);
 -      }
 -      return (status < 0) ? status : 0;
 +              if (!do_flush)
 +                      return 0;
 +              status = nfs_wb_page(page->mapping->host, page);
 +      } while (status == 0);
 +      return status;
  }
  
  /*
@@@ -749,6 -817,7 +749,6 @@@ int nfs_updatepage(struct file *file, s
  {
        struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
        struct inode    *inode = page->mapping->host;
 -      struct nfs_page *req;
        int             status = 0;
  
        nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
                file->f_dentry->d_name.name, count,
                (long long)(page_offset(page) +offset));
  
 -      if (IS_SYNC(inode)) {
 -              status = nfs_writepage_sync(ctx, inode, page, offset, count, 0);
 -              if (status > 0) {
 -                      if (offset == 0 && status == PAGE_CACHE_SIZE)
 -                              SetPageUptodate(page);
 -                      return 0;
 -              }
 -              return status;
 -      }
 -
        /* If we're not using byte range locks, and we know the page
         * is entirely in cache, it may be more efficient to avoid
         * fragmenting write requests.
         */
        if (PageUptodate(page) && inode->i_flock == NULL && !(file->f_mode & O_SYNC)) {
 -              loff_t end_offs = i_size_read(inode) - 1;
 -              unsigned long end_index = end_offs >> PAGE_CACHE_SHIFT;
 -
 -              count += offset;
 +              count = max(count + offset, nfs_page_length(page));
                offset = 0;
 -              if (unlikely(end_offs < 0)) {
 -                      /* Do nothing */
 -              } else if (page->index == end_index) {
 -                      unsigned int pglen;
 -                      pglen = (unsigned int)(end_offs & (PAGE_CACHE_SIZE-1)) + 1;
 -                      if (count < pglen)
 -                              count = pglen;
 -              } else if (page->index < end_index)
 -                      count = PAGE_CACHE_SIZE;
        }
  
 -      /*
 -       * Try to find an NFS request corresponding to this page
 -       * and update it.
 -       * If the existing request cannot be updated, we must flush
 -       * it out now.
 -       */
 -      do {
 -              req = nfs_update_request(ctx, inode, page, offset, count);
 -              status = (IS_ERR(req)) ? PTR_ERR(req) : 0;
 -              if (status != -EBUSY)
 -                      break;
 -              /* Request could not be updated. Flush it out and try again */
 -              status = nfs_wb_page(inode, page);
 -      } while (status >= 0);
 -      if (status < 0)
 -              goto done;
 -
 -      status = 0;
 +      status = nfs_writepage_setup(ctx, page, offset, count);
 +      __set_page_dirty_nobuffers(page);
  
 -      /* Update file length */
 -      nfs_grow_file(page, offset, count);
 -      /* Set the PG_uptodate flag? */
 -      nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
 -      nfs_unlock_request(req);
 -done:
          dprintk("NFS:      nfs_updatepage returns %d (isize %Ld)\n",
                        status, (long long)i_size_read(inode));
        if (status < 0)
@@@ -784,7 -897,7 +784,7 @@@ static void nfs_writepage_release(struc
  #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
        if (!PageError(req->wb_page)) {
                if (NFS_NEED_RESCHED(req)) {
 -                      nfs_mark_request_dirty(req);
 +                      nfs_redirty_request(req);
                        goto out;
                } else if (NFS_NEED_COMMIT(req)) {
                        nfs_mark_request_commit(req);
@@@ -866,7 -979,9 +866,7 @@@ static void nfs_execute_write(struct nf
        sigset_t oldset;
  
        rpc_clnt_sigmask(clnt, &oldset);
 -      lock_kernel();
        rpc_execute(&data->task);
 -      unlock_kernel();
        rpc_clnt_sigunmask(clnt, &oldset);
  }
  
@@@ -900,6 -1015,7 +900,6 @@@ static int nfs_flush_multi(struct inod
        atomic_set(&req->wb_complete, requests);
  
        ClearPageError(page);
 -      set_page_writeback(page);
        offset = 0;
        nbytes = req->wb_bytes;
        do {
@@@ -927,9 -1043,9 +927,9 @@@ out_bad
        while (!list_empty(&list)) {
                data = list_entry(list.next, struct nfs_write_data, pages);
                list_del(&data->pages);
 -              nfs_writedata_free(data);
 +              nfs_writedata_release(data);
        }
 -      nfs_mark_request_dirty(req);
 +      nfs_redirty_request(req);
        nfs_clear_page_writeback(req);
        return -ENOMEM;
  }
@@@ -960,6 -1076,7 +960,6 @@@ static int nfs_flush_one(struct inode *
                nfs_list_remove_request(req);
                nfs_list_add_request(req, &data->pages);
                ClearPageError(req->wb_page);
 -              set_page_writeback(req->wb_page);
                *pages++ = req->wb_page;
                count += req->wb_bytes;
        }
        while (!list_empty(head)) {
                struct nfs_page *req = nfs_list_entry(head->next);
                nfs_list_remove_request(req);
 -              nfs_mark_request_dirty(req);
 +              nfs_redirty_request(req);
                nfs_clear_page_writeback(req);
        }
        return -ENOMEM;
@@@ -1009,7 -1126,7 +1009,7 @@@ out_err
        while (!list_empty(head)) {
                req = nfs_list_entry(head->next);
                nfs_list_remove_request(req);
 -              nfs_mark_request_dirty(req);
 +              nfs_redirty_request(req);
                nfs_clear_page_writeback(req);
        }
        return error;
@@@ -1325,7 -1442,7 +1325,7 @@@ static void nfs_commit_done(struct rpc_
                }
                /* We have a mismatch. Write the page again */
                dprintk(" mismatch\n");
 -              nfs_mark_request_dirty(req);
 +              nfs_redirty_request(req);
        next:
                nfs_clear_page_writeback(req);
        }
@@@ -1342,17 -1459,18 +1342,17 @@@ static inline int nfs_commit_list(struc
  }
  #endif
  
 -static int nfs_flush_inode(struct inode *inode, unsigned long idx_start,
 -                         unsigned int npages, int how)
 +static long nfs_flush_mapping(struct address_space *mapping, struct writeback_control *wbc, int how)
  {
 -      struct nfs_inode *nfsi = NFS_I(inode);
 +      struct nfs_inode *nfsi = NFS_I(mapping->host);
        LIST_HEAD(head);
 -      int res;
 +      long res;
  
        spin_lock(&nfsi->req_lock);
 -      res = nfs_scan_dirty(inode, &head, idx_start, npages);
 +      res = nfs_scan_dirty(mapping, wbc, &head);
        spin_unlock(&nfsi->req_lock);
        if (res) {
 -              int error = nfs_flush_list(inode, &head, res, how);
 +              int error = nfs_flush_list(mapping->host, &head, res, how);
                if (error < 0)
                        return error;
        }
@@@ -1378,62 -1496,38 +1378,62 @@@ int nfs_commit_inode(struct inode *inod
  }
  #endif
  
 -int nfs_sync_inode_wait(struct inode *inode, unsigned long idx_start,
 -              unsigned int npages, int how)
 +long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how)
  {
 +      struct inode *inode = mapping->host;
        struct nfs_inode *nfsi = NFS_I(inode);
 +      unsigned long idx_start, idx_end;
 +      unsigned int npages = 0;
        LIST_HEAD(head);
        int nocommit = how & FLUSH_NOCOMMIT;
 -      int pages, ret;
 -
 +      long pages, ret;
 +
 +      /* FIXME */
 +      if (wbc->range_cyclic)
 +              idx_start = 0;
 +      else {
 +              idx_start = wbc->range_start >> PAGE_CACHE_SHIFT;
 +              idx_end = wbc->range_end >> PAGE_CACHE_SHIFT;
 +              if (idx_end > idx_start) {
 +                      unsigned long l_npages = 1 + idx_end - idx_start;
 +                      npages = l_npages;
 +                      if (sizeof(npages) != sizeof(l_npages) &&
 +                                      (unsigned long)npages != l_npages)
 +                              npages = 0;
 +              }
 +      }
        how &= ~FLUSH_NOCOMMIT;
        spin_lock(&nfsi->req_lock);
        do {
 +              wbc->pages_skipped = 0;
                ret = nfs_wait_on_requests_locked(inode, idx_start, npages);
                if (ret != 0)
                        continue;
 -              pages = nfs_scan_dirty(inode, &head, idx_start, npages);
 +              pages = nfs_scan_dirty(mapping, wbc, &head);
                if (pages != 0) {
                        spin_unlock(&nfsi->req_lock);
 -                      if (how & FLUSH_INVALIDATE)
 +                      if (how & FLUSH_INVALIDATE) {
                                nfs_cancel_dirty_list(&head);
 -                      else
 +                              ret = pages;
 +                      } else
                                ret = nfs_flush_list(inode, &head, pages, how);
                        spin_lock(&nfsi->req_lock);
                        continue;
                }
 +              if (wbc->pages_skipped != 0)
 +                      continue;
                if (nocommit)
                        break;
                pages = nfs_scan_commit(inode, &head, idx_start, npages);
 -              if (pages == 0)
 +              if (pages == 0) {
 +                      if (wbc->pages_skipped != 0)
 +                              continue;
                        break;
 +              }
                if (how & FLUSH_INVALIDATE) {
                        spin_unlock(&nfsi->req_lock);
                        nfs_cancel_commit_list(&head);
 +                      ret = pages;
                        spin_lock(&nfsi->req_lock);
                        continue;
                }
        return ret;
  }
  
 +/*
 + * flush the inode to disk.
 + */
 +int nfs_wb_all(struct inode *inode)
 +{
 +      struct address_space *mapping = inode->i_mapping;
 +      struct writeback_control wbc = {
 +              .bdi = mapping->backing_dev_info,
 +              .sync_mode = WB_SYNC_ALL,
 +              .nr_to_write = LONG_MAX,
 +              .for_writepages = 1,
 +              .range_cyclic = 1,
 +      };
 +      int ret;
 +
 +      ret = generic_writepages(mapping, &wbc);
 +      if (ret < 0)
 +              goto out;
 +      ret = nfs_sync_mapping_wait(mapping, &wbc, 0);
 +      if (ret >= 0)
 +              return 0;
 +out:
 +      __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 +      return ret;
 +}
 +
 +int nfs_sync_mapping_range(struct address_space *mapping, loff_t range_start, loff_t range_end, int how)
 +{
 +      struct writeback_control wbc = {
 +              .bdi = mapping->backing_dev_info,
 +              .sync_mode = WB_SYNC_ALL,
 +              .nr_to_write = LONG_MAX,
 +              .range_start = range_start,
 +              .range_end = range_end,
 +              .for_writepages = 1,
 +      };
 +      int ret;
 +
 +      if (!(how & FLUSH_NOWRITEPAGE)) {
 +              ret = generic_writepages(mapping, &wbc);
 +              if (ret < 0)
 +                      goto out;
 +      }
 +      ret = nfs_sync_mapping_wait(mapping, &wbc, how);
 +      if (ret >= 0)
 +              return 0;
 +out:
 +      __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 +      return ret;
 +}
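
Note: a usage sketch for nfs_sync_mapping_range() above, mirroring the
nfs_getattr() call site in fs/nfs/inode.c. With range_cyclic unset, a range_end
of zero is treated as "to end of file" by nfs_scan_dirty(), and FLUSH_NOCOMMIT
skips the COMMIT phase (the wrapper name is hypothetical):

    static int example_flush_for_getattr(struct inode *inode)
    {
            /* flush writes so c/mtime are current, but don't COMMIT */
            return nfs_sync_mapping_range(inode->i_mapping, 0, 0,
                                          FLUSH_NOCOMMIT);
    }
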
 +
 +int nfs_wb_page_priority(struct inode *inode, struct page *page, int how)
 +{
 +      loff_t range_start = page_offset(page);
 +      loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
 +      struct writeback_control wbc = {
 +              .bdi = page->mapping->backing_dev_info,
 +              .sync_mode = WB_SYNC_ALL,
 +              .nr_to_write = LONG_MAX,
 +              .range_start = range_start,
 +              .range_end = range_end,
 +      };
 +      int ret;
 +
 +      BUG_ON(!PageLocked(page));
 +      if (!(how & FLUSH_NOWRITEPAGE) && clear_page_dirty_for_io(page)) {
 +              ret = nfs_writepage_locked(page, &wbc);
 +              if (ret < 0)
 +                      goto out;
 +      }
 +      ret = nfs_sync_mapping_wait(page->mapping, &wbc, how);
 +      if (ret >= 0)
 +              return 0;
 +out:
 +      __mark_inode_dirty(inode, I_DIRTY_PAGES);
 +      return ret;
 +}
 +
 +/*
 + * Write back all requests on one page - we do this before reading it.
 + */
 +int nfs_wb_page(struct inode *inode, struct page* page)
 +{
 +      return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
 +}
 +
 +int nfs_set_page_dirty(struct page *page)
 +{
 +      struct nfs_page *req;
 +
 +      req = nfs_page_find_request(page);
 +      if (req != NULL) {
 +              /* Mark any existing write requests for flushing */
 +              set_bit(PG_NEED_FLUSH, &req->wb_flags);
 +              nfs_release_request(req);
 +      }
 +      return __set_page_dirty_nobuffers(page);
 +}
 +
 +
  int __init nfs_init_writepagecache(void)
  {
        nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
diff --combined include/linux/sunrpc/sched.h
@@@ -11,7 -11,6 +11,7 @@@
  
  #include <linux/timer.h>
  #include <linux/sunrpc/types.h>
 +#include <linux/rcupdate.h>
  #include <linux/spinlock.h>
  #include <linux/wait.h>
  #include <linux/workqueue.h>
@@@ -86,7 -85,6 +86,7 @@@ struct rpc_task 
        union {
                struct work_struct      tk_work;        /* Async task work queue */
                struct rpc_wait         tk_wait;        /* RPC wait */
 +              struct rcu_head         tk_rcu;         /* for task deletion */
        } u;
  
        unsigned short          tk_timeouts;    /* maj timeouts */
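
Note: tk_rcu shares the union with tk_work/tk_wait, which are only live while
the task runs; once the task has finished, the same storage carries the RCU
callback that frees the enclosing structure. The free-side pattern used by the
fs/nfs/read.c and fs/nfs/write.c hunks above, with an illustrative container
type:

    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct example_data {
            struct rpc_task task;           /* embeds u.tk_rcu */
            /* ... per-request state ... */
    };

    static void example_rcu_free(struct rcu_head *head)
    {
            struct example_data *p =
                    container_of(head, struct example_data, task.u.tk_rcu);
            kfree(p);
    }

    static void example_release(struct example_data *p)
    {
            /* defer the free past a bh-RCU grace period so soft-irq
             * readers can still dereference the task safely */
            call_rcu_bh(&p->task.u.tk_rcu, example_rcu_free);
    }
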
@@@ -180,6 -178,13 +180,6 @@@ struct rpc_call_ops 
        } while (0)
  
  #define RPC_IS_ACTIVATED(t)   (test_bit(RPC_TASK_ACTIVE, &(t)->tk_runstate))
 -#define rpc_set_active(t)     (set_bit(RPC_TASK_ACTIVE, &(t)->tk_runstate))
 -#define rpc_clear_active(t)   \
 -      do { \
 -              smp_mb__before_clear_bit(); \
 -              clear_bit(RPC_TASK_ACTIVE, &(t)->tk_runstate); \
 -              smp_mb__after_clear_bit(); \
 -      } while(0)
  
  /*
   * Task priorities.
@@@ -217,7 -222,7 +217,7 @@@ struct rpc_wait_queue 
  
  #ifndef RPC_DEBUG
  # define RPC_WAITQ_INIT(var,qname) { \
-               .lock = SPIN_LOCK_UNLOCKED, \
+               .lock = __SPIN_LOCK_UNLOCKED(var.lock), \
                .tasks = { \
                        [0] = LIST_HEAD_INIT(var.tasks[0]), \
                        [1] = LIST_HEAD_INIT(var.tasks[1]), \
        }
  #else
  # define RPC_WAITQ_INIT(var,qname) { \
-               .lock = SPIN_LOCK_UNLOCKED, \
+               .lock = __SPIN_LOCK_UNLOCKED(var.lock), \
                .tasks = { \
                        [0] = LIST_HEAD_INIT(var.tasks[0]), \
                        [1] = LIST_HEAD_INIT(var.tasks[1]), \
@@@ -249,10 -254,8 +249,10 @@@ struct rpc_task *rpc_run_task(struct rp
  void          rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt,
                                int flags, const struct rpc_call_ops *ops,
                                void *data);
 +void          rpc_put_task(struct rpc_task *);
  void          rpc_release_task(struct rpc_task *);
  void          rpc_exit_task(struct rpc_task *);
 +void          rpc_release_calldata(const struct rpc_call_ops *, void *);
  void          rpc_killall_tasks(struct rpc_clnt *);
  int           rpc_execute(struct rpc_task *);
  void          rpc_init_priority_wait_queue(struct rpc_wait_queue *, const char *);
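
Note: the RPC_WAITQ_INIT() hunks above swap the retired SPIN_LOCK_UNLOCKED
initializer for the lockdep-aware __SPIN_LOCK_UNLOCKED(), which takes the lock
variable's name so each statically initialized lock gets its own lock class.
Illustration with a hypothetical struct:

    #include <linux/spinlock.h>

    struct example_queue {
            spinlock_t lock;
    };

    /* was: .lock = SPIN_LOCK_UNLOCKED (one lock class shared by all users) */
    static struct example_queue eq = {
            .lock = __SPIN_LOCK_UNLOCKED(eq.lock),
    };
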
diff --combined net/sunrpc/sched.c
@@@ -34,8 -34,8 +34,8 @@@ static int                    rpc_task_id
  #define RPC_BUFFER_MAXSIZE    (2048)
  #define RPC_BUFFER_POOLSIZE   (8)
  #define RPC_TASK_POOLSIZE     (8)
- static kmem_cache_t   *rpc_task_slabp __read_mostly;
- static kmem_cache_t   *rpc_buffer_slabp __read_mostly;
+ static struct kmem_cache      *rpc_task_slabp __read_mostly;
+ static struct kmem_cache      *rpc_buffer_slabp __read_mostly;
  static mempool_t      *rpc_task_mempool __read_mostly;
  static mempool_t      *rpc_buffer_mempool __read_mostly;
  
@@@ -266,28 -266,12 +266,28 @@@ static int rpc_wait_bit_interruptible(v
        return 0;
  }
  
 +static void rpc_set_active(struct rpc_task *task)
 +{
 +      if (test_and_set_bit(RPC_TASK_ACTIVE, &task->tk_runstate) != 0)
 +              return;
 +      spin_lock(&rpc_sched_lock);
 +#ifdef RPC_DEBUG
 +      task->tk_magic = RPC_TASK_MAGIC_ID;
 +      task->tk_pid = rpc_task_id++;
 +#endif
 +      /* Add to global list of all tasks */
 +      list_add_tail(&task->tk_task, &all_tasks);
 +      spin_unlock(&rpc_sched_lock);
 +}
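
This replaces the old rpc_set_active() macro from sched.h: test_and_set_bit() guarantees that the one-time work (debug id assignment, insertion on all_tasks) is performed by exactly one caller. A minimal sketch of the same activate-once idiom, with hypothetical names throughout:

	#include <linux/bitops.h>
	#include <linux/list.h>
	#include <linux/spinlock.h>

	#define MY_OBJ_ACTIVE 0			/* hypothetical flag bit */

	struct my_obj {
		unsigned long state;
		struct list_head link;
	};

	static LIST_HEAD(all_objs);
	static DEFINE_SPINLOCK(all_objs_lock);

	static void my_obj_activate(struct my_obj *obj)
	{
		/* test_and_set_bit() is atomic: only one caller sees 0 */
		if (test_and_set_bit(MY_OBJ_ACTIVE, &obj->state) != 0)
			return;
		spin_lock(&all_objs_lock);
		list_add_tail(&obj->link, &all_objs);
		spin_unlock(&all_objs_lock);
	}
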
 +
  /*
   * Mark an RPC call as having completed by clearing the 'active' bit
   */
 -static inline void rpc_mark_complete_task(struct rpc_task *task)
 +static void rpc_mark_complete_task(struct rpc_task *task)
  {
 -      rpc_clear_active(task);
 +      smp_mb__before_clear_bit();
 +      clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate);
 +      smp_mb__after_clear_bit();
        wake_up_bit(&task->tk_runstate, RPC_TASK_ACTIVE);
  }
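
The waker above pairs with __rpc_wait_for_completion_task(): the full barriers around clear_bit() make the cleared bit visible before wake_up_bit() scans the bit-waitqueue hash, while the waiter sleeps until the bit is clear. A hedged sketch of both halves, names hypothetical:

	#include <linux/bitops.h>
	#include <linux/sched.h>
	#include <linux/wait.h>

	#define MY_ACTIVE 0			/* hypothetical state bit */

	static int my_wait_bit_interruptible(void *word)
	{
		if (signal_pending(current))
			return -ERESTARTSYS;
		schedule();
		return 0;
	}

	static void my_mark_complete(unsigned long *state)
	{
		smp_mb__before_clear_bit();
		clear_bit(MY_ACTIVE, state);
		smp_mb__after_clear_bit();	/* bit visible before waiters are woken */
		wake_up_bit(state, MY_ACTIVE);
	}

	static int my_wait_for_complete(unsigned long *state)
	{
		/* returns 0 once MY_ACTIVE is clear, -ERESTARTSYS on a signal */
		return out_of_line_wait_on_bit(state, MY_ACTIVE,
				my_wait_bit_interruptible, TASK_INTERRUPTIBLE);
	}
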
  
@@@ -311,15 -295,13 +311,15 @@@ EXPORT_SYMBOL(__rpc_wait_for_completion
   */
  static void rpc_make_runnable(struct rpc_task *task)
  {
 -      int do_ret;
 -
        BUG_ON(task->tk_timeout_fn);
 -      do_ret = rpc_test_and_set_running(task);
        rpc_clear_queued(task);
 -      if (do_ret)
 +      if (rpc_test_and_set_running(task))
                return;
 +      /* We might have raced */
 +      if (RPC_IS_QUEUED(task)) {
 +              rpc_clear_running(task);
 +              return;
 +      }
        if (RPC_IS_ASYNC(task)) {
                int status;
  
@@@ -351,6 -333,9 +351,6 @@@ static void __rpc_sleep_on(struct rpc_w
                return;
        }
  
 -      /* Mark the task as being activated if so needed */
 -      rpc_set_active(task);
 -
        __rpc_add_wait_queue(q, task);
  
        BUG_ON(task->tk_callback != NULL);
        task->tk_callback = action;
        __rpc_add_timer(task, timer);
  }
  
  void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
                                rpc_action action, rpc_action timer)
  {
 +      /* Mark the task as being activated if so needed */
 +      rpc_set_active(task);
 +
        /*
         * Protect the queue operations.
         */
@@@ -427,19 -409,16 +427,19 @@@ __rpc_default_timer(struct rpc_task *ta
   */
  void rpc_wake_up_task(struct rpc_task *task)
  {
 +      rcu_read_lock_bh();
        if (rpc_start_wakeup(task)) {
                if (RPC_IS_QUEUED(task)) {
                        struct rpc_wait_queue *queue = task->u.tk_wait.rpc_waitq;
  
 -                      spin_lock_bh(&queue->lock);
 +                      /* Note: we're already in a bh-safe context */
 +                      spin_lock(&queue->lock);
                        __rpc_do_wake_up_task(task);
 -                      spin_unlock_bh(&queue->lock);
 +                      spin_unlock(&queue->lock);
                }
                rpc_finish_wakeup(task);
        }
 +      rcu_read_unlock_bh();
  }
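
rcu_read_lock_bh() is what makes the lockless task dereference here safe against rpc_put_task() below, which defers the actual free through call_rcu_bh(): the task memory cannot disappear while a bottom-half reader is inside its critical section. The scheme in isolation, with hypothetical names:

	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct my_task {
		struct rcu_head rcu;
		int state;
	};

	static void my_task_free(struct rcu_head *rcu)
	{
		kfree(container_of(rcu, struct my_task, rcu));
	}

	static void my_task_drop(struct my_task *t)
	{
		/* the kfree() is deferred until a bh grace period elapses */
		call_rcu_bh(&t->rcu, my_task_free);
	}

	static void my_task_wake(struct my_task *t)
	{
		rcu_read_lock_bh();
		/* t stays valid here even if my_task_drop() ran concurrently,
		 * provided the pointer was obtained before the drop */
		t->state = 1;
		rcu_read_unlock_bh();
	}
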
  
  /*
@@@ -502,16 -481,14 +502,16 @@@ struct rpc_task * rpc_wake_up_next(stru
        struct rpc_task *task = NULL;
  
        dprintk("RPC:      wake_up_next(%p \"%s\")\n", queue, rpc_qname(queue));
 -      spin_lock_bh(&queue->lock);
 +      rcu_read_lock_bh();
 +      spin_lock(&queue->lock);
        if (RPC_IS_PRIORITY(queue))
                task = __rpc_wake_up_next_priority(queue);
        else {
                task_for_first(task, &queue->tasks[0])
                        __rpc_wake_up_task(task);
        }
 -      spin_unlock_bh(&queue->lock);
 +      spin_unlock(&queue->lock);
 +      rcu_read_unlock_bh();
  
        return task;
  }
@@@ -527,8 -504,7 +527,8 @@@ void rpc_wake_up(struct rpc_wait_queue 
        struct rpc_task *task, *next;
        struct list_head *head;
  
 -      spin_lock_bh(&queue->lock);
 +      rcu_read_lock_bh();
 +      spin_lock(&queue->lock);
        head = &queue->tasks[queue->maxpriority];
        for (;;) {
                list_for_each_entry_safe(task, next, head, u.tk_wait.list)
                        __rpc_wake_up_task(task);
                if (head == &queue->tasks[0])
                        break;
                head--;
        }
 -      spin_unlock_bh(&queue->lock);
 +      spin_unlock(&queue->lock);
 +      rcu_read_unlock_bh();
  }
  
  /**
@@@ -553,8 -528,7 +553,8 @@@ void rpc_wake_up_status(struct rpc_wait
        struct rpc_task *task, *next;
        struct list_head *head;
  
 -      spin_lock_bh(&queue->lock);
 +      rcu_read_lock_bh();
 +      spin_lock(&queue->lock);
        head = &queue->tasks[queue->maxpriority];
        for (;;) {
                list_for_each_entry_safe(task, next, head, u.tk_wait.list) {
                        task->tk_status = status;
                        __rpc_wake_up_task(task);
                }
                if (head == &queue->tasks[0])
                        break;
                head--;
        }
 -      spin_unlock_bh(&queue->lock);
 +      spin_unlock(&queue->lock);
 +      rcu_read_unlock_bh();
  }
  
  static void __rpc_atrun(struct rpc_task *task)
@@@ -588,9 -561,7 +588,9 @@@ void rpc_delay(struct rpc_task *task, u
   */
  static void rpc_prepare_task(struct rpc_task *task)
  {
 +      lock_kernel();
        task->tk_ops->rpc_call_prepare(task, task->tk_calldata);
 +      unlock_kernel();
  }
  
  /*
@@@ -600,9 -571,7 +600,9 @@@ void rpc_exit_task(struct rpc_task *tas
  {
        task->tk_action = NULL;
        if (task->tk_ops->rpc_call_done != NULL) {
 +              lock_kernel();
                task->tk_ops->rpc_call_done(task, task->tk_calldata);
 +              unlock_kernel();
                if (task->tk_action != NULL) {
                        WARN_ON(RPC_ASSASSINATED(task));
                        /* Always release the RPC slot and buffer memory */
                        xprt_release(task);
                }
        }
  }
  EXPORT_SYMBOL(rpc_exit_task);
  
 +void rpc_release_calldata(const struct rpc_call_ops *ops, void *calldata)
 +{
 +      if (ops->rpc_release != NULL) {
 +              lock_kernel();
 +              ops->rpc_release(calldata);
 +              unlock_kernel();
 +      }
 +}
 +
  /*
   * This is the RPC `scheduler' (or rather, the finite state machine).
   */
@@@ -655,7 -615,9 +655,7 @@@ static int __rpc_execute(struct rpc_tas
                         */
                        save_callback=task->tk_callback;
                        task->tk_callback=NULL;
 -                      lock_kernel();
                        save_callback(task);
 -                      unlock_kernel();
                }
  
                /*
                 * Perform the next FSM step.
                 * tk_action may be NULL when the task has been killed.
                 * In particular, note that rpc_killall_tasks may
                 * do this at any time, so beware when dereferencing.
                 */
                if (!RPC_IS_QUEUED(task)) {
                        if (task->tk_action == NULL)
                                break;
 -                      lock_kernel();
                        task->tk_action(task);
 -                      unlock_kernel();
                }
  
                /*
                 * Lockless check for whether task is sleeping or not.
                 */
        }
  
        dprintk("RPC: %4d, return %d, status %d\n", task->tk_pid, status, task->tk_status);
 -      /* Wake up anyone who is waiting for task completion */
 -      rpc_mark_complete_task(task);
        /* Release all resources associated with the task */
        rpc_release_task(task);
        return status;
@@@ -820,6 -786,15 +820,6 @@@ void rpc_init_task(struct rpc_task *tas
                        task->tk_flags |= RPC_TASK_NOINTR;
        }
  
 -#ifdef RPC_DEBUG
 -      task->tk_magic = RPC_TASK_MAGIC_ID;
 -      task->tk_pid = rpc_task_id++;
 -#endif
 -      /* Add to global list of all tasks */
 -      spin_lock(&rpc_sched_lock);
 -      list_add_tail(&task->tk_task, &all_tasks);
 -      spin_unlock(&rpc_sched_lock);
 -
        BUG_ON(task->tk_ops == NULL);
  
        /* starting timestamp */
@@@ -835,9 -810,8 +835,9 @@@ rpc_alloc_task(void
        return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS);
  }
  
 -static void rpc_free_task(struct rpc_task *task)
 +static void rpc_free_task(struct rcu_head *rcu)
  {
 +      struct rpc_task *task = container_of(rcu, struct rpc_task, u.tk_rcu);
        dprintk("RPC: %4d freeing task\n", task->tk_pid);
        mempool_free(task, rpc_task_mempool);
  }
@@@ -873,34 -847,16 +873,34 @@@ cleanup
        goto out;
  }
  
 -void rpc_release_task(struct rpc_task *task)
 +
 +void rpc_put_task(struct rpc_task *task)
  {
        const struct rpc_call_ops *tk_ops = task->tk_ops;
        void *calldata = task->tk_calldata;
  
 +      if (!atomic_dec_and_test(&task->tk_count))
 +              return;
 +      /* Release resources */
 +      if (task->tk_rqstp)
 +              xprt_release(task);
 +      if (task->tk_msg.rpc_cred)
 +              rpcauth_unbindcred(task);
 +      if (task->tk_client) {
 +              rpc_release_client(task->tk_client);
 +              task->tk_client = NULL;
 +      }
 +      if (task->tk_flags & RPC_TASK_DYNAMIC)
 +              call_rcu_bh(&task->u.tk_rcu, rpc_free_task);
 +      rpc_release_calldata(tk_ops, calldata);
 +}
 +EXPORT_SYMBOL(rpc_put_task);
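
rpc_release_task() now becomes the scheduler's half of teardown while any other holder simply drops its reference; only the final put releases the slot, cred, and client and schedules the RCU free. The underlying get/put pattern, sketched with hypothetical names:

	#include <asm/atomic.h>		/* atomic_t lived here in this era */

	struct my_ref {
		atomic_t count;		/* initialized to 1 by the creator */
	};

	static void my_get(struct my_ref *r)
	{
		atomic_inc(&r->count);
	}

	static void my_put(struct my_ref *r)
	{
		if (!atomic_dec_and_test(&r->count))
			return;
		/* last reference gone: release resources, then free */
	}
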
 +
 +void rpc_release_task(struct rpc_task *task)
 +{
  #ifdef RPC_DEBUG
        BUG_ON(task->tk_magic != RPC_TASK_MAGIC_ID);
  #endif
 -      if (!atomic_dec_and_test(&task->tk_count))
 -              return;
        dprintk("RPC: %4d release task\n", task->tk_pid);
  
        /* Remove from global task list */
        spin_lock(&rpc_sched_lock);
        list_del(&task->tk_task);
        spin_unlock(&rpc_sched_lock);
  
        BUG_ON (RPC_IS_QUEUED(task));
  
        /* Synchronously delete any running timer */
        rpc_delete_timer(task);
  
 -      /* Release resources */
 -      if (task->tk_rqstp)
 -              xprt_release(task);
 -      if (task->tk_msg.rpc_cred)
 -              rpcauth_unbindcred(task);
 -      if (task->tk_client) {
 -              rpc_release_client(task->tk_client);
 -              task->tk_client = NULL;
 -      }
 -
  #ifdef RPC_DEBUG
        task->tk_magic = 0;
  #endif
 -      if (task->tk_flags & RPC_TASK_DYNAMIC)
 -              rpc_free_task(task);
 -      if (tk_ops->rpc_release)
 -              tk_ops->rpc_release(calldata);
 +      /* Wake up anyone who is waiting for task completion */
 +      rpc_mark_complete_task(task);
 +
 +      rpc_put_task(task);
  }
  
  /**
@@@ -936,7 -902,8 +936,7 @@@ struct rpc_task *rpc_run_task(struct rp
        struct rpc_task *task;
        task = rpc_new_task(clnt, flags, ops, data);
        if (task == NULL) {
 -              if (ops->rpc_release != NULL)
 -                      ops->rpc_release(data);
 +              rpc_release_calldata(ops, data);
                return ERR_PTR(-ENOMEM);
        }
        atomic_inc(&task->tk_count);
diff --combined net/sunrpc/xprtsock.c
@@@ -45,92 -45,6 +45,92 @@@ unsigned int xprt_tcp_slot_table_entrie
  unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT;
  unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT;
  
 +/*
 + * We can register our own files under /proc/sys/sunrpc by
 + * calling register_sysctl_table() again.  The files in that
 + * directory become the union of all files registered there.
 + *
 + * We simply need to make sure that we don't collide with
 + * someone else's file names!
 + */
 +
 +#ifdef RPC_DEBUG
 +
 +static unsigned int min_slot_table_size = RPC_MIN_SLOT_TABLE;
 +static unsigned int max_slot_table_size = RPC_MAX_SLOT_TABLE;
 +static unsigned int xprt_min_resvport_limit = RPC_MIN_RESVPORT;
 +static unsigned int xprt_max_resvport_limit = RPC_MAX_RESVPORT;
 +
 +static struct ctl_table_header *sunrpc_table_header;
 +
 +/*
 + * FIXME: changing the UDP slot table size should also resize the UDP
 + *        socket buffers for existing UDP transports
 + */
 +static ctl_table xs_tunables_table[] = {
 +      {
 +              .ctl_name       = CTL_SLOTTABLE_UDP,
 +              .procname       = "udp_slot_table_entries",
 +              .data           = &xprt_udp_slot_table_entries,
 +              .maxlen         = sizeof(unsigned int),
 +              .mode           = 0644,
 +              .proc_handler   = &proc_dointvec_minmax,
 +              .strategy       = &sysctl_intvec,
 +              .extra1         = &min_slot_table_size,
 +              .extra2         = &max_slot_table_size
 +      },
 +      {
 +              .ctl_name       = CTL_SLOTTABLE_TCP,
 +              .procname       = "tcp_slot_table_entries",
 +              .data           = &xprt_tcp_slot_table_entries,
 +              .maxlen         = sizeof(unsigned int),
 +              .mode           = 0644,
 +              .proc_handler   = &proc_dointvec_minmax,
 +              .strategy       = &sysctl_intvec,
 +              .extra1         = &min_slot_table_size,
 +              .extra2         = &max_slot_table_size
 +      },
 +      {
 +              .ctl_name       = CTL_MIN_RESVPORT,
 +              .procname       = "min_resvport",
 +              .data           = &xprt_min_resvport,
 +              .maxlen         = sizeof(unsigned int),
 +              .mode           = 0644,
 +              .proc_handler   = &proc_dointvec_minmax,
 +              .strategy       = &sysctl_intvec,
 +              .extra1         = &xprt_min_resvport_limit,
 +              .extra2         = &xprt_max_resvport_limit
 +      },
 +      {
 +              .ctl_name       = CTL_MAX_RESVPORT,
 +              .procname       = "max_resvport",
 +              .data           = &xprt_max_resvport,
 +              .maxlen         = sizeof(unsigned int),
 +              .mode           = 0644,
 +              .proc_handler   = &proc_dointvec_minmax,
 +              .strategy       = &sysctl_intvec,
 +              .extra1         = &xprt_min_resvport_limit,
 +              .extra2         = &xprt_max_resvport_limit
 +      },
 +      {
 +              .ctl_name = 0,
 +      },
 +};
 +
 +static ctl_table sunrpc_table[] = {
 +      {
 +              .ctl_name       = CTL_SUNRPC,
 +              .procname       = "sunrpc",
 +              .mode           = 0555,
 +              .child          = xs_tunables_table
 +      },
 +      {
 +              .ctl_name = 0,
 +      },
 +};
 +
 +#endif
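
Each tunable above is clamped by proc_dointvec_minmax() to its [extra1, extra2] range, so a write to, say, /proc/sys/sunrpc/udp_slot_table_entries can never escape [RPC_MIN_SLOT_TABLE, RPC_MAX_SLOT_TABLE]. A one-entry sketch of the same table shape; the names and the binary ctl_name number are made up:

	#include <linux/sysctl.h>

	static unsigned int my_tunable = 16;
	static unsigned int my_tunable_min = 4;
	static unsigned int my_tunable_max = 128;

	static ctl_table my_tunables[] = {
		{
			.ctl_name	= 9999,	/* hypothetical binary sysctl number */
			.procname	= "my_tunable",
			.data		= &my_tunable,
			.maxlen		= sizeof(unsigned int),
			.mode		= 0644,
			.proc_handler	= &proc_dointvec_minmax,
			.strategy	= &sysctl_intvec,
			.extra1		= &my_tunable_min,	/* lower clamp */
			.extra2		= &my_tunable_max,	/* upper clamp */
		},
		{ .ctl_name = 0 },
	};
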
 +
  /*
   * How many times to try sending a request on a socket before waiting
   * for the socket buffer to clear.
@@@ -211,55 -125,6 +211,55 @@@ static inline void xs_pktdump(char *msg
  }
  #endif
  
 +struct sock_xprt {
 +      struct rpc_xprt         xprt;
 +
 +      /*
 +       * Network layer
 +       */
 +      struct socket *         sock;
 +      struct sock *           inet;
 +
 +      /*
 +       * State of TCP reply receive
 +       */
 +      __be32                  tcp_fraghdr,
 +                              tcp_xid;
 +
 +      u32                     tcp_offset,
 +                              tcp_reclen;
 +
 +      unsigned long           tcp_copied,
 +                              tcp_flags;
 +
 +      /*
 +       * Connection of transports
 +       */
 +      struct delayed_work     connect_worker;
 +      unsigned short          port;
 +
 +      /*
 +       * UDP socket buffer size parameters
 +       */
 +      size_t                  rcvsize,
 +                              sndsize;
 +
 +      /*
 +       * Saved socket callback addresses
 +       */
 +      void                    (*old_data_ready)(struct sock *, int);
 +      void                    (*old_state_change)(struct sock *);
 +      void                    (*old_write_space)(struct sock *);
 +};
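
struct sock_xprt embeds the generic struct rpc_xprt as its first member, and the transport methods below recover their private state with container_of(xprt, struct sock_xprt, xprt). The embedding idiom in isolation, with hypothetical types:

	#include <linux/kernel.h>	/* container_of() */

	struct generic_obj {
		int shared_state;
	};

	struct private_obj {
		struct generic_obj base;	/* embedded, not a pointer */
		int private_state;
	};

	static struct private_obj *to_private(struct generic_obj *g)
	{
		/* subtracts offsetof(struct private_obj, base) from g */
		return container_of(g, struct private_obj, base);
	}
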
 +
 +/*
 + * TCP receive state flags
 + */
 +#define TCP_RCV_LAST_FRAG     (1UL << 0)
 +#define TCP_RCV_COPY_FRAGHDR  (1UL << 1)
 +#define TCP_RCV_COPY_XID      (1UL << 2)
 +#define TCP_RCV_COPY_DATA     (1UL << 3)
 +
  static void xs_format_peer_addresses(struct rpc_xprt *xprt)
  {
        struct sockaddr_in *addr = (struct sockaddr_in *) &xprt->addr;
@@@ -303,52 -168,37 +303,52 @@@ static void xs_free_peer_addresses(stru
  
  #define XS_SENDMSG_FLAGS      (MSG_DONTWAIT | MSG_NOSIGNAL)
  
 -static inline int xs_send_head(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, unsigned int len)
 +static int xs_send_kvec(struct socket *sock, struct sockaddr *addr, int addrlen, struct kvec *vec, unsigned int base, int more)
  {
 -      struct kvec iov = {
 -              .iov_base       = xdr->head[0].iov_base + base,
 -              .iov_len        = len - base,
 -      };
        struct msghdr msg = {
                .msg_name       = addr,
                .msg_namelen    = addrlen,
 -              .msg_flags      = XS_SENDMSG_FLAGS,
 +              .msg_flags      = XS_SENDMSG_FLAGS | (more ? MSG_MORE : 0),
 +      };
 +      struct kvec iov = {
 +              .iov_base       = vec->iov_base + base,
 +              .iov_len        = vec->iov_len - base,
        };
  
 -      if (xdr->len > len)
 -              msg.msg_flags |= MSG_MORE;
 -
 -      if (likely(iov.iov_len))
 +      if (iov.iov_len != 0)
                return kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len);
        return kernel_sendmsg(sock, &msg, NULL, 0, 0);
  }
  
 -static int xs_send_tail(struct socket *sock, struct xdr_buf *xdr, unsigned int base, unsigned int len)
 +static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more)
  {
 -      struct kvec iov = {
 -              .iov_base       = xdr->tail[0].iov_base + base,
 -              .iov_len        = len - base,
 -      };
 -      struct msghdr msg = {
 -              .msg_flags      = XS_SENDMSG_FLAGS,
 -      };
 +      struct page **ppage;
 +      unsigned int remainder;
 +      int err, sent = 0;
 +
 +      remainder = xdr->page_len - base;
 +      base += xdr->page_base;
 +      ppage = xdr->pages + (base >> PAGE_SHIFT);
 +      base &= ~PAGE_MASK;
 +      for(;;) {
 +              unsigned int len = min_t(unsigned int, PAGE_SIZE - base, remainder);
 +              int flags = XS_SENDMSG_FLAGS;
  
 -      return kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len);
 +              remainder -= len;
 +              if (remainder != 0 || more)
 +                      flags |= MSG_MORE;
 +              err = sock->ops->sendpage(sock, *ppage, base, len, flags);
 +              if (remainder == 0 || err != len)
 +                      break;
 +              sent += err;
 +              ppage++;
 +              base = 0;
 +      }
 +      if (sent == 0)
 +              return err;
 +      if (err > 0)
 +              sent += err;
 +      return sent;
  }
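
xs_send_pagedata() and xs_sendpages() below share one return convention: report the number of bytes actually queued when anything went out, folding in a trailing short write, and surface the error code only when nothing was sent. A condensed sketch of that epilogue, helper name hypothetical:

	/* hypothetical helper condensing the 'out:' epilogues above */
	static int my_send_result(int sent, int err)
	{
		if (sent == 0)
			return err;	/* nothing sent: propagate the error */
		if (err > 0)
			sent += err;	/* count the final partial write too */
		return sent;		/* caller resumes from rq_bytes_sent + sent */
	}
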
  
  /**
   * xs_sendpages - write pages directly to a socket
   * @sock: socket to send on
   * @addr: UDP only -- address of destination
   * @addrlen: UDP only -- length of destination address
   * @xdr: buffer containing this request
   * @base: starting position in the buffer
   *
   */
 -static inline int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base)
 +static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base)
  {
 -      struct page **ppage = xdr->pages;
 -      unsigned int len, pglen = xdr->page_len;
 -      int err, ret = 0;
 +      unsigned int remainder = xdr->len - base;
 +      int err, sent = 0;
  
        if (unlikely(!sock))
                return -ENOTCONN;
  
        clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
 +      if (base != 0) {
 +              addr = NULL;
 +              addrlen = 0;
 +      }
  
 -      len = xdr->head[0].iov_len;
 -      if (base < len || (addr != NULL && base == 0)) {
 -              err = xs_send_head(sock, addr, addrlen, xdr, base, len);
 -              if (ret == 0)
 -                      ret = err;
 -              else if (err > 0)
 -                      ret += err;
 -              if (err != (len - base))
 +      if (base < xdr->head[0].iov_len || addr != NULL) {
 +              unsigned int len = xdr->head[0].iov_len - base;
 +              remainder -= len;
 +              err = xs_send_kvec(sock, addr, addrlen, &xdr->head[0], base, remainder != 0);
 +              if (remainder == 0 || err != len)
                        goto out;
 +              sent += err;
                base = 0;
        } else
 -              base -= len;
 -
 -      if (unlikely(pglen == 0))
 -              goto copy_tail;
 -      if (unlikely(base >= pglen)) {
 -              base -= pglen;
 -              goto copy_tail;
 -      }
 -      if (base || xdr->page_base) {
 -              pglen -= base;
 -              base += xdr->page_base;
 -              ppage += base >> PAGE_CACHE_SHIFT;
 -              base &= ~PAGE_CACHE_MASK;
 -      }
 -
 -      do {
 -              int flags = XS_SENDMSG_FLAGS;
 -
 -              len = PAGE_CACHE_SIZE;
 -              if (base)
 -                      len -= base;
 -              if (pglen < len)
 -                      len = pglen;
 -
 -              if (pglen != len || xdr->tail[0].iov_len != 0)
 -                      flags |= MSG_MORE;
 +              base -= xdr->head[0].iov_len;
  
 -              err = kernel_sendpage(sock, *ppage, base, len, flags);
 -              if (ret == 0)
 -                      ret = err;
 -              else if (err > 0)
 -                      ret += err;
 -              if (err != len)
 +      if (base < xdr->page_len) {
 +              unsigned int len = xdr->page_len - base;
 +              remainder -= len;
 +              err = xs_send_pagedata(sock, xdr, base, remainder != 0);
 +              if (remainder == 0 || err != len)
                        goto out;
 +              sent += err;
                base = 0;
 -              ppage++;
 -      } while ((pglen -= len) != 0);
 -copy_tail:
 -      len = xdr->tail[0].iov_len;
 -      if (base < len) {
 -              err = xs_send_tail(sock, xdr, base, len);
 -              if (ret == 0)
 -                      ret = err;
 -              else if (err > 0)
 -                      ret += err;
 -      }
 +      } else
 +              base -= xdr->page_len;
 +
 +      if (base >= xdr->tail[0].iov_len)
 +              return sent;
 +      err = xs_send_kvec(sock, NULL, 0, &xdr->tail[0], base, 0);
  out:
 -      return ret;
 +      if (sent == 0)
 +              return err;
 +      if (err > 0)
 +              sent += err;
 +      return sent;
  }
  
  /**
@@@ -416,20 -291,19 +416,20 @@@ static void xs_nospace(struct rpc_task 
  {
        struct rpc_rqst *req = task->tk_rqstp;
        struct rpc_xprt *xprt = req->rq_xprt;
 +      struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
  
        dprintk("RPC: %4d xmit incomplete (%u left of %u)\n",
                        task->tk_pid, req->rq_slen - req->rq_bytes_sent,
                        req->rq_slen);
  
 -      if (test_bit(SOCK_ASYNC_NOSPACE, &xprt->sock->flags)) {
 +      if (test_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags)) {
                /* Protect against races with write_space */
                spin_lock_bh(&xprt->transport_lock);
  
                /* Don't race with disconnect */
                if (!xprt_connected(xprt))
                        task->tk_status = -ENOTCONN;
 -              else if (test_bit(SOCK_NOSPACE, &xprt->sock->flags))
 +              else if (test_bit(SOCK_NOSPACE, &transport->sock->flags))
                        xprt_wait_for_buffer_space(task);
  
                spin_unlock_bh(&xprt->transport_lock);
@@@ -453,7 -327,6 +453,7 @@@ static int xs_udp_send_request(struct r
  {
        struct rpc_rqst *req = task->tk_rqstp;
        struct rpc_xprt *xprt = req->rq_xprt;
 +      struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
        struct xdr_buf *xdr = &req->rq_snd_buf;
        int status;
  
        xs_pktdump("packet data:",
                                req->rq_svec->iov_base,
                                req->rq_svec->iov_len);
  
        req->rq_xtime = jiffies;
 -      status = xs_sendpages(xprt->sock, (struct sockaddr *) &xprt->addr,
 -                              xprt->addrlen, xdr, req->rq_bytes_sent);
 +      status = xs_sendpages(transport->sock,
 +                            (struct sockaddr *) &xprt->addr,
 +                            xprt->addrlen, xdr,
 +                            req->rq_bytes_sent);
  
        dprintk("RPC:      xs_udp_send_request(%u) = %d\n",
                        xdr->len - req->rq_bytes_sent, status);
@@@ -521,7 -392,6 +521,7 @@@ static int xs_tcp_send_request(struct r
  {
        struct rpc_rqst *req = task->tk_rqstp;
        struct rpc_xprt *xprt = req->rq_xprt;
 +      struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
        struct xdr_buf *xdr = &req->rq_snd_buf;
        int status, retry = 0;
  
        xs_encode_tcp_record_marker(&req->rq_snd_buf);
  
        xs_pktdump("packet data:",
                                req->rq_svec->iov_base,
                                req->rq_svec->iov_len);
  
        /* Continue transmitting the packet/record. We must be careful
         * to cope with writespace callbacks arriving _after_ we have
         * called sendmsg(). */
        while (1) {
                req->rq_xtime = jiffies;
 -              status = xs_sendpages(xprt->sock, NULL, 0, xdr,
 -                                              req->rq_bytes_sent);
 +              status = xs_sendpages(transport->sock,
 +                                      NULL, 0, xdr, req->rq_bytes_sent);
  
                dprintk("RPC:      xs_tcp_send_request(%u) = %d\n",
                                xdr->len - req->rq_bytes_sent, status);
@@@ -615,9 -485,8 +615,9 @@@ out_release
   */
  static void xs_close(struct rpc_xprt *xprt)
  {
 -      struct socket *sock = xprt->sock;
 -      struct sock *sk = xprt->inet;
 +      struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
 +      struct socket *sock = transport->sock;
 +      struct sock *sk = transport->inet;
  
        if (!sk)
                goto clear_close_wait;
        dprintk("RPC:      xs_close xprt %p\n", xprt);
  
        write_lock_bh(&sk->sk_callback_lock);
 -      xprt->inet = NULL;
 -      xprt->sock = NULL;
 +      transport->inet = NULL;
 +      transport->sock = NULL;
  
        sk->sk_user_data = NULL;
 -      sk->sk_data_ready = xprt->old_data_ready;
 -      sk->sk_state_change = xprt->old_state_change;
 -      sk->sk_write_space = xprt->old_write_space;
 +      sk->sk_data_ready = transport->old_data_ready;
 +      sk->sk_state_change = transport->old_state_change;
 +      sk->sk_write_space = transport->old_write_space;
        write_unlock_bh(&sk->sk_callback_lock);
  
        sk->sk_no_check = 0;
@@@ -650,18 -519,15 +650,18 @@@ clear_close_wait
   */
  static void xs_destroy(struct rpc_xprt *xprt)
  {
 +      struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
 +
        dprintk("RPC:      xs_destroy xprt %p\n", xprt);
  
 -      cancel_delayed_work(&xprt->connect_worker);
 +      cancel_delayed_work(&transport->connect_worker);
        flush_scheduled_work();
  
        xprt_disconnect(xprt);
        xs_close(xprt);
        xs_free_peer_addresses(xprt);
        kfree(xprt->slot);
 +      kfree(xprt);
  }
  
  static inline struct rpc_xprt *xprt_from_sock(struct sock *sk)
@@@ -737,75 -603,91 +737,75 @@@ static void xs_udp_data_ready(struct so
        read_unlock(&sk->sk_callback_lock);
  }
  
 -static inline size_t xs_tcp_copy_data(skb_reader_t *desc, void *p, size_t len)
 -{
 -      if (len > desc->count)
 -              len = desc->count;
 -      if (skb_copy_bits(desc->skb, desc->offset, p, len)) {
 -              dprintk("RPC:      failed to copy %zu bytes from skb. %zu bytes remain\n",
 -                              len, desc->count);
 -              return 0;
 -      }
 -      desc->offset += len;
 -      desc->count -= len;
 -      dprintk("RPC:      copied %zu bytes from skb. %zu bytes remain\n",
 -                      len, desc->count);
 -      return len;
 -}
 -
 -static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc)
 +static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_reader *desc)
  {
 +      struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
        size_t len, used;
        char *p;
  
 -      p = ((char *) &xprt->tcp_recm) + xprt->tcp_offset;
 -      len = sizeof(xprt->tcp_recm) - xprt->tcp_offset;
 -      used = xs_tcp_copy_data(desc, p, len);
 -      xprt->tcp_offset += used;
 +      p = ((char *) &transport->tcp_fraghdr) + transport->tcp_offset;
 +      len = sizeof(transport->tcp_fraghdr) - transport->tcp_offset;
 +      used = xdr_skb_read_bits(desc, p, len);
 +      transport->tcp_offset += used;
        if (used != len)
                return;
  
 -      xprt->tcp_reclen = ntohl(xprt->tcp_recm);
 -      if (xprt->tcp_reclen & RPC_LAST_STREAM_FRAGMENT)
 -              xprt->tcp_flags |= XPRT_LAST_FRAG;
 +      transport->tcp_reclen = ntohl(transport->tcp_fraghdr);
 +      if (transport->tcp_reclen & RPC_LAST_STREAM_FRAGMENT)
 +              transport->tcp_flags |= TCP_RCV_LAST_FRAG;
        else
 -              xprt->tcp_flags &= ~XPRT_LAST_FRAG;
 -      xprt->tcp_reclen &= RPC_FRAGMENT_SIZE_MASK;
 +              transport->tcp_flags &= ~TCP_RCV_LAST_FRAG;
 +      transport->tcp_reclen &= RPC_FRAGMENT_SIZE_MASK;
  
 -      xprt->tcp_flags &= ~XPRT_COPY_RECM;
 -      xprt->tcp_offset = 0;
 +      transport->tcp_flags &= ~TCP_RCV_COPY_FRAGHDR;
 +      transport->tcp_offset = 0;
  
        /* Sanity check of the record length */
 -      if (unlikely(xprt->tcp_reclen < 4)) {
 +      if (unlikely(transport->tcp_reclen < 4)) {
                dprintk("RPC:      invalid TCP record fragment length\n");
                xprt_disconnect(xprt);
                return;
        }
        dprintk("RPC:      reading TCP record fragment of length %d\n",
 -                      xprt->tcp_reclen);
 +                      transport->tcp_reclen);
  }
  
 -static void xs_tcp_check_recm(struct rpc_xprt *xprt)
 +static void xs_tcp_check_fraghdr(struct sock_xprt *transport)
  {
 -      dprintk("RPC:      xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u, tcp_flags = %lx\n",
 -                      xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen, xprt->tcp_flags);
 -      if (xprt->tcp_offset == xprt->tcp_reclen) {
 -              xprt->tcp_flags |= XPRT_COPY_RECM;
 -              xprt->tcp_offset = 0;
 -              if (xprt->tcp_flags & XPRT_LAST_FRAG) {
 -                      xprt->tcp_flags &= ~XPRT_COPY_DATA;
 -                      xprt->tcp_flags |= XPRT_COPY_XID;
 -                      xprt->tcp_copied = 0;
 +      if (transport->tcp_offset == transport->tcp_reclen) {
 +              transport->tcp_flags |= TCP_RCV_COPY_FRAGHDR;
 +              transport->tcp_offset = 0;
 +              if (transport->tcp_flags & TCP_RCV_LAST_FRAG) {
 +                      transport->tcp_flags &= ~TCP_RCV_COPY_DATA;
 +                      transport->tcp_flags |= TCP_RCV_COPY_XID;
 +                      transport->tcp_copied = 0;
                }
        }
  }
  
 -static inline void xs_tcp_read_xid(struct rpc_xprt *xprt, skb_reader_t *desc)
 +static inline void xs_tcp_read_xid(struct sock_xprt *transport, struct xdr_skb_reader *desc)
  {
        size_t len, used;
        char *p;
  
 -      len = sizeof(xprt->tcp_xid) - xprt->tcp_offset;
 +      len = sizeof(transport->tcp_xid) - transport->tcp_offset;
        dprintk("RPC:      reading XID (%Zu bytes)\n", len);
 -      p = ((char *) &xprt->tcp_xid) + xprt->tcp_offset;
 -      used = xs_tcp_copy_data(desc, p, len);
 -      xprt->tcp_offset += used;
 +      p = ((char *) &transport->tcp_xid) + transport->tcp_offset;
 +      used = xdr_skb_read_bits(desc, p, len);
 +      transport->tcp_offset += used;
        if (used != len)
                return;
 -      xprt->tcp_flags &= ~XPRT_COPY_XID;
 -      xprt->tcp_flags |= XPRT_COPY_DATA;
 -      xprt->tcp_copied = 4;
 +      transport->tcp_flags &= ~TCP_RCV_COPY_XID;
 +      transport->tcp_flags |= TCP_RCV_COPY_DATA;
 +      transport->tcp_copied = 4;
        dprintk("RPC:      reading reply for XID %08x\n",
 -                                              ntohl(xprt->tcp_xid));
 -      xs_tcp_check_recm(xprt);
 +                      ntohl(transport->tcp_xid));
 +      xs_tcp_check_fraghdr(transport);
  }
  
 -static inline void xs_tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
 +static inline void xs_tcp_read_request(struct rpc_xprt *xprt, struct xdr_skb_reader *desc)
  {
 +      struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
        struct rpc_rqst *req;
        struct xdr_buf *rcvbuf;
        size_t len;
  
        /* Find and lock the request corresponding to this xid */
        spin_lock(&xprt->transport_lock);
 -      req = xprt_lookup_rqst(xprt, xprt->tcp_xid);
 +      req = xprt_lookup_rqst(xprt, transport->tcp_xid);
        if (!req) {
 -              xprt->tcp_flags &= ~XPRT_COPY_DATA;
 +              transport->tcp_flags &= ~TCP_RCV_COPY_DATA;
                dprintk("RPC:      XID %08x request not found!\n",
 -                              ntohl(xprt->tcp_xid));
 +                              ntohl(transport->tcp_xid));
                spin_unlock(&xprt->transport_lock);
                return;
        }
  
        rcvbuf = &req->rq_private_buf;
        len = desc->count;
 -      if (len > xprt->tcp_reclen - xprt->tcp_offset) {
 -              skb_reader_t my_desc;
 +      if (len > transport->tcp_reclen - transport->tcp_offset) {
 +              struct xdr_skb_reader my_desc;
  
 -              len = xprt->tcp_reclen - xprt->tcp_offset;
 +              len = transport->tcp_reclen - transport->tcp_offset;
                memcpy(&my_desc, desc, sizeof(my_desc));
                my_desc.count = len;
 -              r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
 -                                        &my_desc, xs_tcp_copy_data);
 +              r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied,
 +                                        &my_desc, xdr_skb_read_bits);
                desc->count -= r;
                desc->offset += r;
        } else
 -              r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
 -                                        desc, xs_tcp_copy_data);
 +              r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied,
 +                                        desc, xdr_skb_read_bits);
  
        if (r > 0) {
 -              xprt->tcp_copied += r;
 -              xprt->tcp_offset += r;
 +              transport->tcp_copied += r;
 +              transport->tcp_offset += r;
        }
        if (r != len) {
                /* Error when copying to the receive buffer,
                 * usually because we weren't able to allocate
                 * additional buffer pages. All we can do now
 -               * is turn off XPRT_COPY_DATA, so the request
 +               * is turn off TCP_RCV_COPY_DATA, so the request
                 * will not receive any additional updates,
                 * and time out.
                 * Any remaining data from this record will
                 * be discarded.
                 */
 -              xprt->tcp_flags &= ~XPRT_COPY_DATA;
 +              transport->tcp_flags &= ~TCP_RCV_COPY_DATA;
                dprintk("RPC:      XID %08x truncated request\n",
 -                              ntohl(xprt->tcp_xid));
 +                              ntohl(transport->tcp_xid));
                dprintk("RPC:      xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n",
 -                              xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen);
 +                              xprt, transport->tcp_copied, transport->tcp_offset,
 +                                      transport->tcp_reclen);
                goto out;
        }
  
        dprintk("RPC:      XID %08x read %Zd bytes\n",
 -                      ntohl(xprt->tcp_xid), r);
 +                      ntohl(transport->tcp_xid), r);
        dprintk("RPC:      xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n",
 -                      xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen);
 -
 -      if (xprt->tcp_copied == req->rq_private_buf.buflen)
 -              xprt->tcp_flags &= ~XPRT_COPY_DATA;
 -      else if (xprt->tcp_offset == xprt->tcp_reclen) {
 -              if (xprt->tcp_flags & XPRT_LAST_FRAG)
 -                      xprt->tcp_flags &= ~XPRT_COPY_DATA;
 +                      xprt, transport->tcp_copied, transport->tcp_offset,
 +                              transport->tcp_reclen);
 +
 +      if (transport->tcp_copied == req->rq_private_buf.buflen)
 +              transport->tcp_flags &= ~TCP_RCV_COPY_DATA;
 +      else if (transport->tcp_offset == transport->tcp_reclen) {
 +              if (transport->tcp_flags & TCP_RCV_LAST_FRAG)
 +                      transport->tcp_flags &= ~TCP_RCV_COPY_DATA;
        }
  
  out:
 -      if (!(xprt->tcp_flags & XPRT_COPY_DATA))
 -              xprt_complete_rqst(req->rq_task, xprt->tcp_copied);
 +      if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
 +              xprt_complete_rqst(req->rq_task, transport->tcp_copied);
        spin_unlock(&xprt->transport_lock);
 -      xs_tcp_check_recm(xprt);
 +      xs_tcp_check_fraghdr(transport);
  }
  
 -static inline void xs_tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc)
 +static inline void xs_tcp_read_discard(struct sock_xprt *transport, struct xdr_skb_reader *desc)
  {
        size_t len;
  
 -      len = xprt->tcp_reclen - xprt->tcp_offset;
 +      len = transport->tcp_reclen - transport->tcp_offset;
        if (len > desc->count)
                len = desc->count;
        desc->count -= len;
        desc->offset += len;
 -      xprt->tcp_offset += len;
 +      transport->tcp_offset += len;
        dprintk("RPC:      discarded %Zu bytes\n", len);
 -      xs_tcp_check_recm(xprt);
 +      xs_tcp_check_fraghdr(transport);
  }
  
  static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, unsigned int offset, size_t len)
  {
        struct rpc_xprt *xprt = rd_desc->arg.data;
 -      skb_reader_t desc = {
 +      struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
 +      struct xdr_skb_reader desc = {
                .skb    = skb,
                .offset = offset,
                .count  = len,
 -              .csum   = 0
        };
  
        dprintk("RPC:      xs_tcp_data_recv started\n");
        do {
                /* Read in a new fragment marker if necessary */
                /* Can we ever really expect to get completely empty fragments? */
 -              if (xprt->tcp_flags & XPRT_COPY_RECM) {
 +              if (transport->tcp_flags & TCP_RCV_COPY_FRAGHDR) {
                        xs_tcp_read_fraghdr(xprt, &desc);
                        continue;
                }
                /* Read in the xid if necessary */
 -              if (xprt->tcp_flags & XPRT_COPY_XID) {
 -                      xs_tcp_read_xid(xprt, &desc);
 +              if (transport->tcp_flags & TCP_RCV_COPY_XID) {
 +                      xs_tcp_read_xid(transport, &desc);
                        continue;
                }
                /* Read in the request data */
 -              if (xprt->tcp_flags & XPRT_COPY_DATA) {
 +              if (transport->tcp_flags & TCP_RCV_COPY_DATA) {
                        xs_tcp_read_request(xprt, &desc);
                        continue;
                }
                /* Skip over any trailing bytes on short reads */
 -              xs_tcp_read_discard(xprt, &desc);
 +              xs_tcp_read_discard(transport, &desc);
        } while (desc.count);
        dprintk("RPC:      xs_tcp_data_recv done\n");
        return len - desc.count;
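
For background on this state machine: RPC over TCP uses RFC 1831 record marking, a 4-byte big-endian word preceding each fragment whose top bit flags the last fragment of a record and whose low 31 bits carry the fragment length; TCP_RCV_COPY_FRAGHDR, TCP_RCV_COPY_XID and TCP_RCV_COPY_DATA track how far the parser has progressed. A sketch of decoding the marker, with made-up names mirroring RPC_LAST_STREAM_FRAGMENT and RPC_FRAGMENT_SIZE_MASK:

	#include <linux/types.h>
	#include <asm/byteorder.h>

	#define MY_LAST_FRAG	(1UL << 31)	/* mirrors RPC_LAST_STREAM_FRAGMENT */
	#define MY_SIZE_MASK	0x7fffffffUL	/* mirrors RPC_FRAGMENT_SIZE_MASK */

	static void my_decode_marker(__be32 marker, int *is_last, u32 *frag_len)
	{
		u32 host = ntohl(marker);

		*is_last = (host & MY_LAST_FRAG) != 0;
		*frag_len = host & MY_SIZE_MASK;
	}
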
@@@ -978,16 -858,11 +978,16 @@@ static void xs_tcp_state_change(struct 
        case TCP_ESTABLISHED:
                spin_lock_bh(&xprt->transport_lock);
                if (!xprt_test_and_set_connected(xprt)) {
 +                      struct sock_xprt *transport = container_of(xprt,
 +                                      struct sock_xprt, xprt);
 +
                        /* Reset TCP record info */
 -                      xprt->tcp_offset = 0;
 -                      xprt->tcp_reclen = 0;
 -                      xprt->tcp_copied = 0;
 -                      xprt->tcp_flags = XPRT_COPY_RECM | XPRT_COPY_XID;
 +                      transport->tcp_offset = 0;
 +                      transport->tcp_reclen = 0;
 +                      transport->tcp_copied = 0;
 +                      transport->tcp_flags =
 +                              TCP_RCV_COPY_FRAGHDR | TCP_RCV_COPY_XID;
 +
                        xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
                        xprt_wake_pending_tasks(xprt, 0);
                }
@@@ -1076,16 -951,15 +1076,16 @@@ static void xs_tcp_write_space(struct s
  
  static void xs_udp_do_set_buffer_size(struct rpc_xprt *xprt)
  {
 -      struct sock *sk = xprt->inet;
 +      struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
 +      struct sock *sk = transport->inet;
  
 -      if (xprt->rcvsize) {
 +      if (transport->rcvsize) {
                sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 -              sk->sk_rcvbuf = xprt->rcvsize * xprt->max_reqs *  2;
 +              sk->sk_rcvbuf = transport->rcvsize * xprt->max_reqs * 2;
        }
 -      if (xprt->sndsize) {
 +      if (transport->sndsize) {
                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 -              sk->sk_sndbuf = xprt->sndsize * xprt->max_reqs * 2;
 +              sk->sk_sndbuf = transport->sndsize * xprt->max_reqs * 2;
                sk->sk_write_space(sk);
        }
  }
  
  /**
   * xs_udp_set_buffer_size - set send and receive limits
   * @xprt: generic transport
   * @sndsize: requested size of send buffer, in bytes
   * @rcvsize: requested size of receive buffer, in bytes
   *
   * Set socket send and receive buffer size limits.
   */
  static void xs_udp_set_buffer_size(struct rpc_xprt *xprt, size_t sndsize, size_t rcvsize)
  {
 -      xprt->sndsize = 0;
 +      struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
 +
 +      transport->sndsize = 0;
        if (sndsize)
 -              xprt->sndsize = sndsize + 1024;
 -      xprt->rcvsize = 0;
 +              transport->sndsize = sndsize + 1024;
 +      transport->rcvsize = 0;
        if (rcvsize)
 -              xprt->rcvsize = rcvsize + 1024;
 +              transport->rcvsize = rcvsize + 1024;
  
        xs_udp_do_set_buffer_size(xprt);
  }
@@@ -1130,6 -1002,19 +1130,6 @@@ static unsigned short xs_get_random_por
        return rand + xprt_min_resvport;
  }
  
 -/**
 - * xs_print_peer_address - format an IPv4 address for printing
 - * @xprt: generic transport
 - * @format: flags field indicating which parts of the address to render
 - */
 -static char *xs_print_peer_address(struct rpc_xprt *xprt, enum rpc_display_format_t format)
 -{
 -      if (xprt->address_strings[format] != NULL)
 -              return xprt->address_strings[format];
 -      else
 -              return "unprintable";
 -}
 -
  /**
   * xs_set_port - reset the port number in the remote endpoint address
   * @xprt: generic transport
@@@ -1145,20 -1030,20 +1145,20 @@@ static void xs_set_port(struct rpc_xpr
        sap->sin_port = htons(port);
  }
  
 -static int xs_bindresvport(struct rpc_xprt *xprt, struct socket *sock)
 +static int xs_bindresvport(struct sock_xprt *transport, struct socket *sock)
  {
        struct sockaddr_in myaddr = {
                .sin_family = AF_INET,
        };
        int err;
 -      unsigned short port = xprt->port;
 +      unsigned short port = transport->port;
  
        do {
                myaddr.sin_port = htons(port);
                err = kernel_bind(sock, (struct sockaddr *) &myaddr,
                                                sizeof(myaddr));
                if (err == 0) {
 -                      xprt->port = port;
 +                      transport->port = port;
                        dprintk("RPC:      xs_bindresvport bound to port %u\n",
                                        port);
                        return 0;
                        port = xprt_max_resvport;
                else
                        port--;
 -      } while (err == -EADDRINUSE && port != xprt->port);
 +      } while (err == -EADDRINUSE && port != transport->port);
  
        dprintk("RPC:      can't bind to reserved port (%d).\n", -err);
        return err;
  }
  
+ #ifdef CONFIG_DEBUG_LOCK_ALLOC
+ static struct lock_class_key xs_key[2];
+ static struct lock_class_key xs_slock_key[2];
+ static inline void xs_reclassify_socket(struct socket *sock)
+ {
+       struct sock *sk = sock->sk;
+       BUG_ON(sk->sk_lock.owner != NULL);
+       switch (sk->sk_family) {
+       case AF_INET:
+               sock_lock_init_class_and_name(sk, "slock-AF_INET-NFS",
+                       &xs_slock_key[0], "sk_lock-AF_INET-NFS", &xs_key[0]);
+               break;
+       case AF_INET6:
+               sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFS",
+                       &xs_slock_key[1], "sk_lock-AF_INET6-NFS", &xs_key[1]);
+               break;
+       default:
+               BUG();
+       }
+ }
+ #else
+ static inline void xs_reclassify_socket(struct socket *sock)
+ {
+ }
+ #endif
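
xs_reclassify_socket() exists because lockdep keys lock classes to the static call site that initialized the lock; sockets created in-kernel for NFS would otherwise share a class with ordinary userspace sockets and produce false deadlock reports. Reusing the idiom for another in-kernel socket user might look like this (names hypothetical):

	#include <net/sock.h>

	static struct lock_class_key my_sk_key;
	static struct lock_class_key my_slock_key;

	static void my_reclassify_socket(struct socket *sock)
	{
		struct sock *sk = sock->sk;

		/* give this socket's locks their own lockdep class and names */
		sock_lock_init_class_and_name(sk, "slock-AF_INET-MYPROTO",
				&my_slock_key, "sk_lock-AF_INET-MYPROTO", &my_sk_key);
	}
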
  /**
   * xs_udp_connect_worker - set up a UDP socket
   * @work: RPC transport to connect
   */
  static void xs_udp_connect_worker(struct work_struct *work)
  {
 -      struct rpc_xprt *xprt =
 -              container_of(work, struct rpc_xprt, connect_worker.work);
 -      struct socket *sock = xprt->sock;
 +      struct sock_xprt *transport =
 +              container_of(work, struct sock_xprt, connect_worker.work);
 +      struct rpc_xprt *xprt = &transport->xprt;
 +      struct socket *sock = transport->sock;
        int err, status = -EIO;
  
        if (xprt->shutdown || !xprt_bound(xprt))
                goto out;
  
        /* Start by resetting any existing state */
        xs_close(xprt);
  
        if ((err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) {
                dprintk("RPC:      can't create UDP transport socket (%d).\n", -err);
                goto out;
        }
+       xs_reclassify_socket(sock);
  
 -      if (xprt->resvport && xs_bindresvport(xprt, sock) < 0) {
 +      if (xprt->resvport && xs_bindresvport(transport, sock) < 0) {
                sock_release(sock);
                goto out;
        }
  
        dprintk("RPC:      worker connecting xprt %p to address: %s\n",
 -                      xprt, xs_print_peer_address(xprt, RPC_DISPLAY_ALL));
 +                      xprt, xprt->address_strings[RPC_DISPLAY_ALL]);
  
 -      if (!xprt->inet) {
 +      if (!transport->inet) {
                struct sock *sk = sock->sk;
  
                write_lock_bh(&sk->sk_callback_lock);
  
                sk->sk_user_data = xprt;
 -              xprt->old_data_ready = sk->sk_data_ready;
 -              xprt->old_state_change = sk->sk_state_change;
 -              xprt->old_write_space = sk->sk_write_space;
 +              transport->old_data_ready = sk->sk_data_ready;
 +              transport->old_state_change = sk->sk_state_change;
 +              transport->old_write_space = sk->sk_write_space;
                sk->sk_data_ready = xs_udp_data_ready;
                sk->sk_write_space = xs_udp_write_space;
                sk->sk_no_check = UDP_CSUM_NORCV;
                xprt_set_connected(xprt);
  
                /* Reset to new socket */
 -              xprt->sock = sock;
 -              xprt->inet = sk;
 +              transport->sock = sock;
 +              transport->inet = sk;
  
                write_unlock_bh(&sk->sk_callback_lock);
        }
@@@ -1242,7 -1156,7 +1272,7 @@@ out
  static void xs_tcp_reuse_connection(struct rpc_xprt *xprt)
  {
        int result;
 -      struct socket *sock = xprt->sock;
 +      struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
        struct sockaddr any;
  
        dprintk("RPC:      disconnecting xprt %p to reuse port\n", xprt);
         */
        memset(&any, 0, sizeof(any));
        any.sa_family = AF_UNSPEC;
 -      result = kernel_connect(sock, &any, sizeof(any), 0);
 +      result = kernel_connect(transport->sock, &any, sizeof(any), 0);
        if (result)
                dprintk("RPC:      AF_UNSPEC connect return code %d\n",
                                result);
   */
  static void xs_tcp_connect_worker(struct work_struct *work)
  {
 -      struct rpc_xprt *xprt =
 -              container_of(work, struct rpc_xprt, connect_worker.work);
 -      struct socket *sock = xprt->sock;
 +      struct sock_xprt *transport =
 +              container_of(work, struct sock_xprt, connect_worker.work);
 +      struct rpc_xprt *xprt = &transport->xprt;
 +      struct socket *sock = transport->sock;
        int err, status = -EIO;
  
        if (xprt->shutdown || !xprt_bound(xprt))
                goto out;
  
 -      if (!xprt->sock) {
 +      if (!sock) {
                /* start from scratch */
                if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) {
                        dprintk("RPC:      can't create TCP transport socket (%d).\n", -err);
                        goto out;
                }
+               xs_reclassify_socket(sock);
  
 -              if (xprt->resvport && xs_bindresvport(xprt, sock) < 0) {
 +              if (xprt->resvport && xs_bindresvport(transport, sock) < 0) {
                        sock_release(sock);
                        goto out;
                }
        } else
                /* "close" the socket, preserving the local port */
                xs_tcp_reuse_connection(xprt);
  
        dprintk("RPC:      worker connecting xprt %p to address: %s\n",
 -                      xprt, xs_print_peer_address(xprt, RPC_DISPLAY_ALL));
 +                      xprt, xprt->address_strings[RPC_DISPLAY_ALL]);
  
 -      if (!xprt->inet) {
 +      if (!transport->inet) {
                struct sock *sk = sock->sk;
  
                write_lock_bh(&sk->sk_callback_lock);
  
                sk->sk_user_data = xprt;
 -              xprt->old_data_ready = sk->sk_data_ready;
 -              xprt->old_state_change = sk->sk_state_change;
 -              xprt->old_write_space = sk->sk_write_space;
 +              transport->old_data_ready = sk->sk_data_ready;
 +              transport->old_state_change = sk->sk_state_change;
 +              transport->old_write_space = sk->sk_write_space;
                sk->sk_data_ready = xs_tcp_data_ready;
                sk->sk_state_change = xs_tcp_state_change;
                sk->sk_write_space = xs_tcp_write_space;
                xprt_clear_connected(xprt);
  
                /* Reset to new socket */
 -              xprt->sock = sock;
 -              xprt->inet = sk;
 +              transport->sock = sock;
 +              transport->inet = sk;
  
                write_unlock_bh(&sk->sk_callback_lock);
        }
@@@ -1367,22 -1281,21 +1398,22 @@@ out_clear
  static void xs_connect(struct rpc_task *task)
  {
        struct rpc_xprt *xprt = task->tk_xprt;
 +      struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
  
        if (xprt_test_and_set_connecting(xprt))
                return;
  
 -      if (xprt->sock != NULL) {
 +      if (transport->sock != NULL) {
                dprintk("RPC:      xs_connect delayed xprt %p for %lu seconds\n",
                                xprt, xprt->reestablish_timeout / HZ);
 -              schedule_delayed_work(&xprt->connect_worker,
 +              schedule_delayed_work(&transport->connect_worker,
                                        xprt->reestablish_timeout);
                xprt->reestablish_timeout <<= 1;
                if (xprt->reestablish_timeout > XS_TCP_MAX_REEST_TO)
                        xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO;
        } else {
                dprintk("RPC:      xs_connect scheduled xprt %p\n", xprt);
 -              schedule_delayed_work(&xprt->connect_worker, 0);
 +              schedule_delayed_work(&transport->connect_worker, 0);
  
                /* flush_scheduled_work can sleep... */
                if (!RPC_IS_ASYNC(task))
                        flush_scheduled_work();
        }
  }
  
  /**
   * xs_udp_print_stats - display UDP socket-specific stats
   * @xprt: rpc_xprt struct containing statistics
   * @seq: output file
   *
   */
  static void xs_udp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
  {
 +      struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
 +
        seq_printf(seq, "\txprt:\tudp %u %lu %lu %lu %lu %Lu %Lu\n",
 -                      xprt->port,
 +                      transport->port,
                        xprt->stat.bind_count,
                        xprt->stat.sends,
                        xprt->stat.recvs,
                        xprt->stat.bad_xids,
                        xprt->stat.req_u,
                        xprt->stat.bklog_u);
  }
  
  /**
   * xs_tcp_print_stats - display TCP socket-specific stats
   * @xprt: rpc_xprt struct containing statistics
   * @seq: output file
   *
   */
  static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
  {
 +      struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
        long idle_time = 0;
  
        if (xprt_connected(xprt))
                idle_time = (long)(jiffies - xprt->last_used) / HZ;
  
        seq_printf(seq, "\txprt:\ttcp %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu\n",
 -                      xprt->port,
 +                      transport->port,
                        xprt->stat.bind_count,
                        xprt->stat.connect_count,
                        xprt->stat.connect_time,
                        idle_time,
                        xprt->stat.sends,
                        xprt->stat.recvs,
                        xprt->stat.bad_xids,
                        xprt->stat.req_u,
                        xprt->stat.bklog_u);
  }
  
  static struct rpc_xprt_ops xs_udp_ops = {
        .set_buffer_size        = xs_udp_set_buffer_size,
 -      .print_addr             = xs_print_peer_address,
        .reserve_xprt           = xprt_reserve_xprt_cong,
        .release_xprt           = xprt_release_xprt_cong,
        .rpcbind                = rpc_getport,
  };
  
  static struct rpc_xprt_ops xs_tcp_ops = {
 -      .print_addr             = xs_print_peer_address,
        .reserve_xprt           = xprt_reserve_xprt,
        .release_xprt           = xs_tcp_release_xprt,
        .rpcbind                = rpc_getport,
        .print_stats            = xs_tcp_print_stats,
  };
  
 +static struct rpc_xprt *xs_setup_xprt(struct sockaddr *addr, size_t addrlen, unsigned int slot_table_size)
 +{
 +      struct rpc_xprt *xprt;
 +      struct sock_xprt *new;
 +
 +      if (addrlen > sizeof(xprt->addr)) {
 +              dprintk("RPC:      xs_setup_xprt: address too large\n");
 +              return ERR_PTR(-EBADF);
 +      }
 +
 +      new = kzalloc(sizeof(*new), GFP_KERNEL);
 +      if (new == NULL) {
 +              dprintk("RPC:      xs_setup_xprt: couldn't allocate rpc_xprt\n");
 +              return ERR_PTR(-ENOMEM);
 +      }
 +      xprt = &new->xprt;
 +
 +      xprt->max_reqs = slot_table_size;
 +      xprt->slot = kcalloc(xprt->max_reqs, sizeof(struct rpc_rqst), GFP_KERNEL);
 +      if (xprt->slot == NULL) {
 +              kfree(xprt);
 +              dprintk("RPC:      xs_setup_xprt: couldn't allocate slot table\n");
 +              return ERR_PTR(-ENOMEM);
 +      }
 +
 +      memcpy(&xprt->addr, addr, addrlen);
 +      xprt->addrlen = addrlen;
 +      new->port = xs_get_random_port();
 +
 +      return xprt;
 +}
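
Callers of the new allocation-based setup path test the result with the ERR_PTR()/IS_ERR()/PTR_ERR() convention instead of passing in a preallocated xprt and checking a status code. A hedged usage sketch; everything except xs_setup_udp() is hypothetical:

	#include <linux/err.h>
	#include <linux/sunrpc/xprt.h>

	static int my_create_udp_transport(struct sockaddr *addr, size_t addrlen,
					   struct rpc_timeout *to)
	{
		struct rpc_xprt *xprt = xs_setup_udp(addr, addrlen, to);

		if (IS_ERR(xprt))
			return PTR_ERR(xprt);	/* -EBADF or -ENOMEM from xs_setup_xprt() */
		/* ... hand xprt over to the RPC client ... */
		return 0;
	}
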
 +
  /**
   * xs_setup_udp - Set up transport to use a UDP socket
 - * @xprt: transport to set up
 + * @addr: address of remote server
 + * @addrlen: length of address in bytes
   * @to:   timeout parameters
   *
   */
 -int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to)
 +struct rpc_xprt *xs_setup_udp(struct sockaddr *addr, size_t addrlen, struct rpc_timeout *to)
  {
 -      size_t slot_table_size;
 -      struct sockaddr_in *addr = (struct sockaddr_in *) &xprt->addr;
 +      struct rpc_xprt *xprt;
 +      struct sock_xprt *transport;
  
 -      xprt->max_reqs = xprt_udp_slot_table_entries;
 -      slot_table_size = xprt->max_reqs * sizeof(xprt->slot[0]);
 -      xprt->slot = kzalloc(slot_table_size, GFP_KERNEL);
 -      if (xprt->slot == NULL)
 -              return -ENOMEM;
 +      xprt = xs_setup_xprt(addr, addrlen, xprt_udp_slot_table_entries);
 +      if (IS_ERR(xprt))
 +              return xprt;
 +      transport = container_of(xprt, struct sock_xprt, xprt);
  
 -      if (ntohs(addr->sin_port) != 0)
 +      if (ntohs(((struct sockaddr_in *)addr)->sin_port) != 0)
                xprt_set_bound(xprt);
 -      xprt->port = xs_get_random_port();
  
        xprt->prot = IPPROTO_UDP;
        xprt->tsh_size = 0;
        /* XXX: header size can vary due to auth type, IPv6, etc. */
        xprt->max_payload = (1U << 16) - (MAX_HEADER << 3);
  
 -      INIT_DELAYED_WORK(&xprt->connect_worker, xs_udp_connect_worker);
 +      INIT_DELAYED_WORK(&transport->connect_worker, xs_udp_connect_worker);
        xprt->bind_timeout = XS_BIND_TO;
        xprt->connect_timeout = XS_UDP_CONN_TO;
        xprt->reestablish_timeout = XS_UDP_REEST_TO;
  
        xs_format_peer_addresses(xprt);
        dprintk("RPC:      set up transport to address %s\n",
 -                      xs_print_peer_address(xprt, RPC_DISPLAY_ALL));
 +                      xprt->address_strings[RPC_DISPLAY_ALL]);
  
 -      return 0;
 +      return xprt;
  }
  
  /**
   * xs_setup_tcp - Set up transport to use a TCP socket
 - * @xprt: transport to set up
 + * @addr: address of remote server
 + * @addrlen: length of address in bytes
   * @to: timeout parameters
   *
   */
 -int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to)
 +struct rpc_xprt *xs_setup_tcp(struct sockaddr *addr, size_t addrlen, struct rpc_timeout *to)
  {
 -      size_t slot_table_size;
 -      struct sockaddr_in *addr = (struct sockaddr_in *) &xprt->addr;
 +      struct rpc_xprt *xprt;
 +      struct sock_xprt *transport;
  
 -      xprt->max_reqs = xprt_tcp_slot_table_entries;
 -      slot_table_size = xprt->max_reqs * sizeof(xprt->slot[0]);
 -      xprt->slot = kzalloc(slot_table_size, GFP_KERNEL);
 -      if (xprt->slot == NULL)
 -              return -ENOMEM;
 +      xprt = xs_setup_xprt(addr, addrlen, xprt_tcp_slot_table_entries);
 +      if (IS_ERR(xprt))
 +              return xprt;
 +      transport = container_of(xprt, struct sock_xprt, xprt);
  
 -      if (ntohs(addr->sin_port) != 0)
 +      if (ntohs(((struct sockaddr_in *)addr)->sin_port) != 0)
                xprt_set_bound(xprt);
 -      xprt->port = xs_get_random_port();
  
        xprt->prot = IPPROTO_TCP;
        xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
        xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
  
 -      INIT_DELAYED_WORK(&xprt->connect_worker, xs_tcp_connect_worker);
 +      INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker);
        xprt->bind_timeout = XS_BIND_TO;
        xprt->connect_timeout = XS_TCP_CONN_TO;
        xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
  
        xs_format_peer_addresses(xprt);
        dprintk("RPC:      set up transport to address %s\n",
 -                      xs_print_peer_address(xprt, RPC_DISPLAY_ALL));
 +                      xprt->address_strings[RPC_DISPLAY_ALL]);
 +
 +      return xprt;
 +}
 +
 +/**
 + * init_socket_xprt - set up xprtsock's sysctls
 + *
 + */
 +int init_socket_xprt(void)
 +{
 +#ifdef RPC_DEBUG
 +      if (!sunrpc_table_header) {
 +              sunrpc_table_header = register_sysctl_table(sunrpc_table, 1);
 +#ifdef CONFIG_PROC_FS
 +              if (sunrpc_table[0].de)
 +                      sunrpc_table[0].de->owner = THIS_MODULE;
 +#endif
 +      }
 +#endif
  
        return 0;
  }
 +
 +/**
 + * cleanup_socket_xprt - remove xprtsock's sysctls
 + *
 + */
 +void cleanup_socket_xprt(void)
 +{
 +#ifdef RPC_DEBUG
 +      if (sunrpc_table_header) {
 +              unregister_sysctl_table(sunrpc_table_header);
 +              sunrpc_table_header = NULL;
 +      }
 +#endif
 +}