Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
author		Linus Torvalds <torvalds@linux-foundation.org>
		Mon, 24 May 2010 14:37:52 +0000 (07:37 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
		Mon, 24 May 2010 14:37:52 +0000 (07:37 -0700)
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (59 commits)
  ceph: reuse mon subscribe message instead of allocating anew
  ceph: avoid resending queued message to monitor
  ceph: Storage class should be before const qualifier
  ceph: all allocation functions should get gfp_mask
  ceph: specify max_bytes on readdir replies
  ceph: cleanup pool op strings
  ceph: Use kzalloc
  ceph: use common helper for aborted dir request invalidation
  ceph: cope with out of order (unsafe after safe) mds reply
  ceph: save peer feature bits in connection structure
  ceph: resync headers with userland
  ceph: use ceph. prefix for virtual xattrs
  ceph: throw out dirty caps metadata, data on session teardown
  ceph: attempt mds reconnect if mds closes our session
  ceph: clean up send_mds_reconnect interface
  ceph: wait for mds OPEN reply to indicate reconnect success
  ceph: only send cap releases when mds is OPEN|HUNG
  ceph: discard cap releases on mds restart
  ceph: make mon client statfs handling more generic
  ceph: drop src address(es) from message header [new protocol feature]
  ...

30 files changed:
fs/ceph/addr.c
fs/ceph/auth.c
fs/ceph/auth.h
fs/ceph/auth_none.c
fs/ceph/auth_x.c
fs/ceph/caps.c
fs/ceph/ceph_fs.h
fs/ceph/ceph_strings.c
fs/ceph/debugfs.c
fs/ceph/dir.c
fs/ceph/export.c
fs/ceph/file.c
fs/ceph/inode.c
fs/ceph/ioctl.c
fs/ceph/mds_client.c
fs/ceph/mds_client.h
fs/ceph/messenger.c
fs/ceph/messenger.h
fs/ceph/mon_client.c
fs/ceph/mon_client.h
fs/ceph/msgpool.c
fs/ceph/msgpool.h
fs/ceph/msgr.h
fs/ceph/osd_client.c
fs/ceph/pagelist.c
fs/ceph/rados.h
fs/ceph/snap.c
fs/ceph/super.c
fs/ceph/super.h
fs/ceph/xattr.c

index a9005d8..d9c60b8 100644
@@ -274,7 +274,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
        struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
        int rc = 0;
        struct page **pages;
-       struct pagevec pvec;
        loff_t offset;
        u64 len;
 
@@ -297,8 +296,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
        if (rc < 0)
                goto out;
 
-       /* set uptodate and add to lru in pagevec-sized chunks */
-       pagevec_init(&pvec, 0);
        for (; !list_empty(page_list) && len > 0;
             rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
                struct page *page =
@@ -312,7 +309,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
                        zero_user_segment(page, s, PAGE_CACHE_SIZE);
                }
 
-               if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) {
+               if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) {
                        page_cache_release(page);
                        dout("readpages %p add_to_page_cache failed %p\n",
                             inode, page);
@@ -323,10 +320,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
                flush_dcache_page(page);
                SetPageUptodate(page);
                unlock_page(page);
-               if (pagevec_add(&pvec, page) == 0)
-                       pagevec_lru_add_file(&pvec);   /* add to lru */
+               page_cache_release(page);
        }
-       pagevec_lru_add_file(&pvec);
        rc = 0;
 
 out:
@@ -568,7 +563,7 @@ static void writepages_finish(struct ceph_osd_request *req,
        ceph_release_pages(req->r_pages, req->r_num_pages);
        if (req->r_pages_from_pool)
                mempool_free(req->r_pages,
-                            ceph_client(inode->i_sb)->wb_pagevec_pool);
+                            ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
        else
                kfree(req->r_pages);
        ceph_osdc_put_request(req);
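
The readpages hunks above replace the open-coded pagevec batching with
add_to_page_cache_lru(), which inserts a page into the page cache and the
zone LRU in one call. A minimal sketch of the resulting per-page pattern
(against the 2.6.34-era API; not part of the patch itself):

	#include <linux/pagemap.h>
	#include <linux/highmem.h>

	/* Insert one freshly read page, mark it ready, drop our reference. */
	static int cache_one_page(struct address_space *mapping,
				  struct page *page)
	{
		int err;

		/* Adds to the page cache *and* the LRU; no pagevec needed. */
		err = add_to_page_cache_lru(page, mapping, page->index,
					    GFP_NOFS);
		if (err) {
			page_cache_release(page);	/* lost the race */
			return err;
		}
		flush_dcache_page(page);
		SetPageUptodate(page);
		unlock_page(page);
		page_cache_release(page);	/* cache holds its own ref now */
		return 0;
	}
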
index 818afe7..9f46de2 100644
@@ -150,7 +150,8 @@ int ceph_build_auth_request(struct ceph_auth_client *ac,
 
        ret = ac->ops->build_request(ac, p + sizeof(u32), end);
        if (ret < 0) {
-               pr_err("error %d building request\n", ret);
+               pr_err("error %d building auth method %s request\n", ret,
+                      ac->ops->name);
                return ret;
        }
        dout(" built request %d bytes\n", ret);
@@ -216,8 +217,8 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
                if (ac->protocol != protocol) {
                        ret = ceph_auth_init_protocol(ac, protocol);
                        if (ret) {
-                               pr_err("error %d on auth protocol %d init\n",
-                                      ret, protocol);
+                               pr_err("error %d on auth method %s init\n",
+                                      ret, ac->ops->name);
                                goto out;
                        }
                }
@@ -229,7 +230,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
        if (ret == -EAGAIN) {
                return ceph_build_auth_request(ac, reply_buf, reply_len);
        } else if (ret) {
-               pr_err("authentication error %d\n", ret);
+               pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
                return ret;
        }
        return 0;
index ca4f57c..4429a70 100644
@@ -15,6 +15,8 @@ struct ceph_auth_client;
 struct ceph_authorizer;
 
 struct ceph_auth_client_ops {
+       const char *name;
+
        /*
         * true if we are authenticated and can connect to
         * services.
index 8cd9e3a..24407c1 100644
@@ -94,6 +94,7 @@ static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
 }
 
 static const struct ceph_auth_client_ops ceph_auth_none_ops = {
+       .name = "none",
        .reset = reset,
        .destroy = destroy,
        .is_authenticated = is_authenticated,
index fee5a08..7b20623 100644
@@ -127,7 +127,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
        int ret;
        char *dbuf;
        char *ticket_buf;
-       u8 struct_v;
+       u8 reply_struct_v;
 
        dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
        if (!dbuf)
@@ -139,14 +139,14 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
                goto out_dbuf;
 
        ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
-       struct_v = ceph_decode_8(&p);
-       if (struct_v != 1)
+       reply_struct_v = ceph_decode_8(&p);
+       if (reply_struct_v != 1)
                goto bad;
        num = ceph_decode_32(&p);
        dout("%d tickets\n", num);
        while (num--) {
                int type;
-               u8 struct_v;
+               u8 tkt_struct_v, blob_struct_v;
                struct ceph_x_ticket_handler *th;
                void *dp, *dend;
                int dlen;
@@ -165,8 +165,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
                type = ceph_decode_32(&p);
                dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
 
-               struct_v = ceph_decode_8(&p);
-               if (struct_v != 1)
+               tkt_struct_v = ceph_decode_8(&p);
+               if (tkt_struct_v != 1)
                        goto bad;
 
                th = get_ticket_handler(ac, type);
@@ -186,8 +186,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
                dend = dbuf + dlen;
                dp = dbuf;
 
-               struct_v = ceph_decode_8(&dp);
-               if (struct_v != 1)
+               tkt_struct_v = ceph_decode_8(&dp);
+               if (tkt_struct_v != 1)
                        goto bad;
 
                memcpy(&old_key, &th->session_key, sizeof(old_key));
@@ -224,7 +224,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
                tpend = tp + dlen;
                dout(" ticket blob is %d bytes\n", dlen);
                ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
-               struct_v = ceph_decode_8(&tp);
+               blob_struct_v = ceph_decode_8(&tp);
                new_secret_id = ceph_decode_64(&tp);
                ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
                if (ret)
@@ -618,6 +618,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
 
 
 static const struct ceph_auth_client_ops ceph_x_ops = {
+       .name = "x",
        .is_authenticated = ceph_x_is_authenticated,
        .build_request = ceph_x_build_request,
        .handle_reply = ceph_x_handle_reply,
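
With both backends naming themselves ("none" above, "x" here), the shared
auth code can say which method failed. A reduced sketch of the pattern (the
real ceph_auth_client_ops carries many more hooks than shown):

	#include <linux/kernel.h>

	struct auth_ops {
		const char *name;	/* "none", "x", ... */
		int (*build_request)(void *ac, void *buf, void *end);
	};

	static const struct auth_ops none_ops = {
		.name = "none",
	};

	/* The generic caller can now name the failing method in its errors. */
	static void report_build_error(const struct auth_ops *ops, int ret)
	{
		pr_err("error %d building auth method %s request\n",
		       ret, ops->name);
	}
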
index d940053..0dd0b81 100644
@@ -867,7 +867,8 @@ void __ceph_remove_cap(struct ceph_cap *cap)
 {
        struct ceph_mds_session *session = cap->session;
        struct ceph_inode_info *ci = cap->ci;
-       struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+       struct ceph_mds_client *mdsc =
+               &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
        int removed = 0;
 
        dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
@@ -937,9 +938,9 @@ static int send_cap_msg(struct ceph_mds_session *session,
             seq, issue_seq, mseq, follows, size, max_size,
             xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
 
-       msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
-       if (IS_ERR(msg))
-               return PTR_ERR(msg);
+       msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS);
+       if (!msg)
+               return -ENOMEM;
 
        msg->hdr.tid = cpu_to_le64(flush_tid);
 
@@ -1298,7 +1299,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
  */
 void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 {
-       struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+       struct ceph_mds_client *mdsc =
+               &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
        struct inode *inode = &ci->vfs_inode;
        int was = ci->i_dirty_caps;
        int dirty = 0;
@@ -1336,7 +1338,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 static int __mark_caps_flushing(struct inode *inode,
                                 struct ceph_mds_session *session)
 {
-       struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+       struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
        struct ceph_inode_info *ci = ceph_inode(inode);
        int flushing;
 
@@ -1663,7 +1665,7 @@ ack:
 static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
                          unsigned *flush_tid)
 {
-       struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+       struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
        struct ceph_inode_info *ci = ceph_inode(inode);
        int unlock_session = session ? 0 : 1;
        int flushing = 0;
@@ -1716,10 +1718,9 @@ out_unlocked:
 static int caps_are_flushed(struct inode *inode, unsigned tid)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
-       int dirty, i, ret = 1;
+       int i, ret = 1;
 
        spin_lock(&inode->i_lock);
-       dirty = __ceph_caps_dirty(ci);
        for (i = 0; i < CEPH_CAP_BITS; i++)
                if ((ci->i_flushing_caps & (1 << i)) &&
                    ci->i_cap_flush_tid[i] <= tid) {
@@ -1829,7 +1830,8 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
                        err = wait_event_interruptible(ci->i_cap_wq,
                                       caps_are_flushed(inode, flush_tid));
        } else {
-               struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+               struct ceph_mds_client *mdsc =
+                       &ceph_sb_to_client(inode->i_sb)->mdsc;
 
                spin_lock(&inode->i_lock);
                if (__ceph_caps_dirty(ci))
@@ -2411,7 +2413,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
        __releases(inode->i_lock)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+       struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
        unsigned seq = le32_to_cpu(m->seq);
        int dirty = le32_to_cpu(m->dirty);
        int cleaned = 0;
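
A convention change recurs throughout this series: ceph_msg_new() now
returns NULL on allocation failure (and takes a gfp_t) instead of an
ERR_PTR() value. Condensed from the hunks above, old versus new:

	/* Old: allocation failure encoded as an ERR_PTR() value. */
	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
	if (IS_ERR(msg))
		return PTR_ERR(msg);

	/* New: NULL on failure, gfp mask chosen by the caller. */
	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS);
	if (!msg)
		return -ENOMEM;
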
index 0c2241e..3b9eeed 100644
@@ -19,7 +19,7 @@
  * Ceph release version
  */
 #define CEPH_VERSION_MAJOR 0
-#define CEPH_VERSION_MINOR 19
+#define CEPH_VERSION_MINOR 20
 #define CEPH_VERSION_PATCH 0
 
 #define _CEPH_STRINGIFY(x) #x
@@ -36,7 +36,7 @@
  * client-facing protocol.
  */
 #define CEPH_OSD_PROTOCOL     8 /* cluster internal */
-#define CEPH_MDS_PROTOCOL     9 /* cluster internal */
+#define CEPH_MDS_PROTOCOL    12 /* cluster internal */
 #define CEPH_MON_PROTOCOL     5 /* cluster internal */
 #define CEPH_OSDC_PROTOCOL   24 /* server/client */
 #define CEPH_MDSC_PROTOCOL   32 /* server/client */
 /*
  * feature bits
  */
-#define CEPH_FEATURE_SUPPORTED  0
-#define CEPH_FEATURE_REQUIRED   0
+#define CEPH_FEATURE_UID        1
+#define CEPH_FEATURE_NOSRCADDR  2
+#define CEPH_FEATURE_FLOCK      4
+
+#define CEPH_FEATURE_SUPPORTED_MON  CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
+#define CEPH_FEATURE_REQUIRED_MON   CEPH_FEATURE_UID
+#define CEPH_FEATURE_SUPPORTED_MDS  CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR|CEPH_FEATURE_FLOCK
+#define CEPH_FEATURE_REQUIRED_MDS   CEPH_FEATURE_UID
+#define CEPH_FEATURE_SUPPORTED_OSD  CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
+#define CEPH_FEATURE_REQUIRED_OSD   CEPH_FEATURE_UID
+#define CEPH_FEATURE_SUPPORTED_CLIENT CEPH_FEATURE_NOSRCADDR
+#define CEPH_FEATURE_REQUIRED_CLIENT CEPH_FEATURE_NOSRCADDR
 
 
 /*
@@ -91,6 +101,8 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
 #define CEPH_AUTH_NONE         0x1
 #define CEPH_AUTH_CEPHX                0x2
 
+#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
+
 
 /*********************************************
  * message layer
@@ -128,11 +140,27 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
 #define CEPH_MSG_CLIENT_SNAP            0x312
 #define CEPH_MSG_CLIENT_CAPRELEASE      0x313
 
+/* pool ops */
+#define CEPH_MSG_POOLOP_REPLY           48
+#define CEPH_MSG_POOLOP                 49
+
+
 /* osd */
 #define CEPH_MSG_OSD_MAP          41
 #define CEPH_MSG_OSD_OP           42
 #define CEPH_MSG_OSD_OPREPLY      43
 
+/* pool operations */
+enum {
+  POOL_OP_CREATE                       = 0x01,
+  POOL_OP_DELETE                       = 0x02,
+  POOL_OP_AUID_CHANGE                  = 0x03,
+  POOL_OP_CREATE_SNAP                  = 0x11,
+  POOL_OP_DELETE_SNAP                  = 0x12,
+  POOL_OP_CREATE_UNMANAGED_SNAP                = 0x21,
+  POOL_OP_DELETE_UNMANAGED_SNAP                = 0x22,
+};
+
 struct ceph_mon_request_header {
        __le64 have_version;
        __le16 session_mon;
@@ -155,6 +183,31 @@ struct ceph_mon_statfs_reply {
        struct ceph_statfs st;
 } __attribute__ ((packed));
 
+const char *ceph_pool_op_name(int op);
+
+struct ceph_mon_poolop {
+       struct ceph_mon_request_header monhdr;
+       struct ceph_fsid fsid;
+       __le32 pool;
+       __le32 op;
+       __le64 auid;
+       __le64 snapid;
+       __le32 name_len;
+} __attribute__ ((packed));
+
+struct ceph_mon_poolop_reply {
+       struct ceph_mon_request_header monhdr;
+       struct ceph_fsid fsid;
+       __le32 reply_code;
+       __le32 epoch;
+       char has_data;
+       char data[0];
+} __attribute__ ((packed));
+
+struct ceph_mon_unmanaged_snap {
+       __le64 snapid;
+} __attribute__ ((packed));
+
 struct ceph_osd_getmap {
        struct ceph_mon_request_header monhdr;
        struct ceph_fsid fsid;
@@ -308,6 +361,7 @@ union ceph_mds_request_args {
        struct {
                __le32 frag;                 /* which dir fragment */
                __le32 max_entries;          /* how many dentries to grab */
+               __le32 max_bytes;
        } __attribute__ ((packed)) readdir;
        struct {
                __le32 mode;
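
The feature bits introduced above are single-bit flags OR-ed into per-entity
SUPPORTED/REQUIRED masks, so a handshake can verify that a peer advertises
everything the local side insists on. A hedged sketch of such a check (the
helper name is hypothetical, not from this patch):

	#include <linux/types.h>

	/* As defined above. */
	#define CEPH_FEATURE_UID        1
	#define CEPH_FEATURE_NOSRCADDR  2
	#define CEPH_FEATURE_FLOCK      4

	/* Hypothetical helper: does the peer advertise all required bits? */
	static inline bool peer_has_required(u64 peer_features, u64 required)
	{
		return (peer_features & required) == required;
	}

	/* e.g. a client requiring CEPH_FEATURE_NOSRCADDR:
	 *	if (!peer_has_required(peer, CEPH_FEATURE_REQUIRED_CLIENT))
	 *		reject the connection;
	 */
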
index 8e4be6a..7503aee 100644
@@ -10,7 +10,6 @@ const char *ceph_entity_type_name(int type)
        case CEPH_ENTITY_TYPE_OSD: return "osd";
        case CEPH_ENTITY_TYPE_MON: return "mon";
        case CEPH_ENTITY_TYPE_CLIENT: return "client";
-       case CEPH_ENTITY_TYPE_ADMIN: return "admin";
        case CEPH_ENTITY_TYPE_AUTH: return "auth";
        default: return "unknown";
        }
@@ -45,6 +44,7 @@ const char *ceph_osd_op_name(int op)
        case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
        case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
        case CEPH_OSD_OP_RMXATTR: return "rmxattr";
+       case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
 
        case CEPH_OSD_OP_PULL: return "pull";
        case CEPH_OSD_OP_PUSH: return "push";
@@ -174,3 +174,17 @@ const char *ceph_snap_op_name(int o)
        }
        return "???";
 }
+
+const char *ceph_pool_op_name(int op)
+{
+       switch (op) {
+       case POOL_OP_CREATE: return "create";
+       case POOL_OP_DELETE: return "delete";
+       case POOL_OP_AUID_CHANGE: return "auid change";
+       case POOL_OP_CREATE_SNAP: return "create snap";
+       case POOL_OP_DELETE_SNAP: return "delete snap";
+       case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
+       case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
+       }
+       return "???";
+}
index f7048da..3be33fb 100644
@@ -113,7 +113,7 @@ static int osdmap_show(struct seq_file *s, void *p)
 static int monc_show(struct seq_file *s, void *p)
 {
        struct ceph_client *client = s->private;
-       struct ceph_mon_statfs_request *req;
+       struct ceph_mon_generic_request *req;
        struct ceph_mon_client *monc = &client->monc;
        struct rb_node *rp;
 
@@ -126,9 +126,14 @@ static int monc_show(struct seq_file *s, void *p)
        if (monc->want_next_osdmap)
                seq_printf(s, "want next osdmap\n");
 
-       for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) {
-               req = rb_entry(rp, struct ceph_mon_statfs_request, node);
-               seq_printf(s, "%lld statfs\n", req->tid);
+       for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
+               __u16 op;
+               req = rb_entry(rp, struct ceph_mon_generic_request, node);
+               op = le16_to_cpu(req->request->hdr.type);
+               if (op == CEPH_MSG_STATFS)
+                       seq_printf(s, "%lld statfs\n", req->tid);
+               else
+                       seq_printf(s, "%lld unknown\n", req->tid);
        }
 
        mutex_unlock(&monc->mutex);
index 650d2db..4fd3090 100644
@@ -51,8 +51,11 @@ int ceph_init_dentry(struct dentry *dentry)
                return -ENOMEM;          /* oh well */
 
        spin_lock(&dentry->d_lock);
-       if (dentry->d_fsdata) /* lost a race */
+       if (dentry->d_fsdata) {
+               /* lost a race */
+               kmem_cache_free(ceph_dentry_cachep, di);
                goto out_unlock;
+       }
        di->dentry = dentry;
        di->lease_session = NULL;
        dentry->d_fsdata = di;
@@ -125,7 +128,8 @@ more:
        dentry = list_entry(p, struct dentry, d_u.d_child);
        di = ceph_dentry(dentry);
        while (1) {
-               dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next,
+               dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
+                    d_unhashed(dentry) ? "!hashed" : "hashed",
                     parent->d_subdirs.prev, parent->d_subdirs.next);
                if (p == &parent->d_subdirs) {
                        fi->at_end = 1;
@@ -229,6 +233,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
        u32 ftype;
        struct ceph_mds_reply_info_parsed *rinfo;
        const int max_entries = client->mount_args->max_readdir;
+       const int max_bytes = client->mount_args->max_readdir_bytes;
 
        dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
        if (fi->at_end)
@@ -312,6 +317,7 @@ more:
                req->r_readdir_offset = fi->next_offset;
                req->r_args.readdir.frag = cpu_to_le32(frag);
                req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
+               req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
                req->r_num_caps = max_entries + 1;
                err = ceph_mdsc_do_request(mdsc, NULL, req);
                if (err < 0) {
@@ -335,7 +341,7 @@ more:
                if (req->r_reply_info.dir_end) {
                        kfree(fi->last_name);
                        fi->last_name = NULL;
-                       fi->next_offset = 0;
+                       fi->next_offset = 2;
                } else {
                        rinfo = &req->r_reply_info;
                        err = note_last_dentry(fi,
@@ -478,7 +484,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
 struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
                                  struct dentry *dentry, int err)
 {
-       struct ceph_client *client = ceph_client(dentry->d_sb);
+       struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
        struct inode *parent = dentry->d_parent->d_inode;
 
        /* .snap dir? */
@@ -568,7 +574,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
                    !is_root_ceph_dentry(dir, dentry) &&
                    (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
                    (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
-                       di->offset = ci->i_max_offset++;
                        spin_unlock(&dir->i_lock);
                        dout(" dir %p complete, -ENOENT\n", dir);
                        d_add(dentry, NULL);
@@ -888,13 +893,22 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
 
                /* ensure target dentry is invalidated, despite
                   rehashing bug in vfs_rename_dir */
-               new_dentry->d_time = jiffies;
-               ceph_dentry(new_dentry)->lease_shared_gen = 0;
+               ceph_invalidate_dentry_lease(new_dentry);
        }
        ceph_mdsc_put_request(req);
        return err;
 }
 
+/*
+ * Ensure a dentry lease will no longer revalidate.
+ */
+void ceph_invalidate_dentry_lease(struct dentry *dentry)
+{
+       spin_lock(&dentry->d_lock);
+       dentry->d_time = jiffies;
+       ceph_dentry(dentry)->lease_shared_gen = 0;
+       spin_unlock(&dentry->d_lock);
+}
 
 /*
  * Check if dentry lease is valid.  If not, delete the lease.  Try to
@@ -972,8 +986,9 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
        struct inode *dir = dentry->d_parent->d_inode;
 
-       dout("d_revalidate %p '%.*s' inode %p\n", dentry,
-            dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
+       dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
+            dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
+            ceph_dentry(dentry)->offset);
 
        /* always trust cached snapped dentries, snapdir dentry */
        if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -1050,7 +1065,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
        struct ceph_inode_info *ci = ceph_inode(inode);
        int left;
 
-       if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT))
+       if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
                return -EISDIR;
 
        if (!cf->dir_info) {
@@ -1152,7 +1167,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
        dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
             dn->d_name.len, dn->d_name.name);
        if (di) {
-               mdsc = &ceph_client(dn->d_sb)->mdsc;
+               mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
                spin_lock(&mdsc->dentry_lru_lock);
                list_add_tail(&di->lru, &mdsc->dentry_lru);
                mdsc->num_dentry++;
@@ -1165,10 +1180,10 @@ void ceph_dentry_lru_touch(struct dentry *dn)
        struct ceph_dentry_info *di = ceph_dentry(dn);
        struct ceph_mds_client *mdsc;
 
-       dout("dentry_lru_touch %p %p '%.*s'\n", di, dn,
-            dn->d_name.len, dn->d_name.name);
+       dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
+            dn->d_name.len, dn->d_name.name, di->offset);
        if (di) {
-               mdsc = &ceph_client(dn->d_sb)->mdsc;
+               mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
                spin_lock(&mdsc->dentry_lru_lock);
                list_move_tail(&di->lru, &mdsc->dentry_lru);
                spin_unlock(&mdsc->dentry_lru_lock);
@@ -1183,7 +1198,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
        dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
             dn->d_name.len, dn->d_name.name);
        if (di) {
-               mdsc = &ceph_client(dn->d_sb)->mdsc;
+               mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
                spin_lock(&mdsc->dentry_lru_lock);
                list_del_init(&di->lru);
                mdsc->num_dentry--;
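
The ceph_init_dentry() fix at the top of this file illustrates a common
init-race pattern: allocate optimistically, re-check under the lock, and
free the loser's copy rather than leak it. A standalone sketch with reduced
types (not the patch itself):

	#include <linux/slab.h>
	#include <linux/dcache.h>

	static int init_private(struct dentry *dentry,
				struct kmem_cache *cachep)
	{
		void *di = kmem_cache_alloc(cachep, GFP_NOFS);

		if (!di)
			return -ENOMEM;
		spin_lock(&dentry->d_lock);
		if (dentry->d_fsdata)
			kmem_cache_free(cachep, di);	/* lost the race */
		else
			dentry->d_fsdata = di;
		spin_unlock(&dentry->d_lock);
		return 0;
	}
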
index 9d67572..1744764 100644
@@ -93,11 +93,11 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
                return ERR_PTR(-ESTALE);
 
        dentry = d_obtain_alias(inode);
-       if (!dentry) {
+       if (IS_ERR(dentry)) {
                pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
                       fh->ino, inode);
                iput(inode);
-               return ERR_PTR(-ENOMEM);
+               return dentry;
        }
        err = ceph_init_dentry(dentry);
 
@@ -115,7 +115,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
 static struct dentry *__cfh_to_dentry(struct super_block *sb,
                                      struct ceph_nfs_confh *cfh)
 {
-       struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc;
+       struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc;
        struct inode *inode;
        struct dentry *dentry;
        struct ceph_vino vino;
@@ -149,11 +149,11 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
        }
 
        dentry = d_obtain_alias(inode);
-       if (!dentry) {
+       if (IS_ERR(dentry)) {
                pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
                       cfh->ino, inode);
                iput(inode);
-               return ERR_PTR(-ENOMEM);
+               return dentry;
        }
        err = ceph_init_dentry(dentry);
        if (err < 0) {
@@ -202,11 +202,11 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
                return ERR_PTR(-ESTALE);
 
        dentry = d_obtain_alias(inode);
-       if (!dentry) {
+       if (IS_ERR(dentry)) {
                pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
                       cfh->ino, inode);
                iput(inode);
-               return ERR_PTR(-ENOMEM);
+               return dentry;
        }
        err = ceph_init_dentry(dentry);
        if (err < 0) {
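
All three conversions above correct the same misuse: d_obtain_alias()
reports failure as an ERR_PTR() value and never returns NULL, so the old
!dentry tests could never fire. The corrected pattern, condensed:

	dentry = d_obtain_alias(inode);
	if (IS_ERR(dentry)) {		/* ERR_PTR(), never NULL */
		iput(inode);
		return dentry;		/* propagate the error */
	}
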
index 7d63493..6512b67 100644
@@ -317,16 +317,16 @@ void ceph_release_page_vector(struct page **pages, int num_pages)
 /*
  * allocate a vector of new pages
  */
-static struct page **alloc_page_vector(int num_pages)
+struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
 {
        struct page **pages;
        int i;
 
-       pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
+       pages = kmalloc(sizeof(*pages) * num_pages, flags);
        if (!pages)
                return ERR_PTR(-ENOMEM);
        for (i = 0; i < num_pages; i++) {
-               pages[i] = alloc_page(GFP_NOFS);
+               pages[i] = __page_cache_alloc(flags);
                if (pages[i] == NULL) {
                        ceph_release_page_vector(pages, i);
                        return ERR_PTR(-ENOMEM);
@@ -540,7 +540,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
                 * in sequence.
                 */
        } else {
-               pages = alloc_page_vector(num_pages);
+               pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
        }
        if (IS_ERR(pages))
                return PTR_ERR(pages);
@@ -649,8 +649,8 @@ more:
                                    do_sync,
                                    ci->i_truncate_seq, ci->i_truncate_size,
                                    &mtime, false, 2);
-       if (IS_ERR(req))
-               return PTR_ERR(req);
+       if (!req)
+               return -ENOMEM;
 
        num_pages = calc_pages_for(pos, len);
 
@@ -668,7 +668,7 @@ more:
                truncate_inode_pages_range(inode->i_mapping, pos, 
                                           (pos+len) | (PAGE_CACHE_SIZE-1));
        } else {
-               pages = alloc_page_vector(num_pages);
+               pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
                if (IS_ERR(pages)) {
                        ret = PTR_ERR(pages);
                        goto out;
@@ -809,7 +809,7 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_dentry->d_inode;
        struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
+       struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
        loff_t endoff = pos + iov->iov_len;
        int got = 0;
        int ret, err;
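
alloc_page_vector() is also promoted to an exported helper,
ceph_alloc_page_vector(), with the gfp mask supplied by the caller; it keeps
the ERR_PTR() return convention. Usage, condensed from the call sites above:

	struct page **pages;
	int num_pages = calc_pages_for(pos, len);

	pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
	if (IS_ERR(pages))
		return PTR_ERR(pages);
	/* ... perform the sync read/write into pages ... */
	ceph_release_page_vector(pages, num_pages);
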
index 85b4d2f..a81b8b6 100644
@@ -384,7 +384,7 @@ void ceph_destroy_inode(struct inode *inode)
         */
        if (ci->i_snap_realm) {
                struct ceph_mds_client *mdsc =
-                       &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+                       &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
                struct ceph_snap_realm *realm = ci->i_snap_realm;
 
                dout(" dropping residual ref to snap realm %p\n", realm);
@@ -619,11 +619,12 @@ static int fill_inode(struct inode *inode,
                        memcpy(ci->i_xattrs.blob->vec.iov_base,
                               iinfo->xattr_data, iinfo->xattr_len);
                ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
+               xattr_blob = NULL;
        }
 
        inode->i_mapping->a_ops = &ceph_aops;
        inode->i_mapping->backing_dev_info =
-               &ceph_client(inode->i_sb)->backing_dev_info;
+               &ceph_sb_to_client(inode->i_sb)->backing_dev_info;
 
        switch (inode->i_mode & S_IFMT) {
        case S_IFIFO:
@@ -674,14 +675,15 @@ static int fill_inode(struct inode *inode,
                /* set dir completion flag? */
                if (ci->i_files == 0 && ci->i_subdirs == 0 &&
                    ceph_snap(inode) == CEPH_NOSNAP &&
-                   (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) {
+                   (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
+                   (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
                        dout(" marking %p complete (empty)\n", inode);
                        ci->i_ceph_flags |= CEPH_I_COMPLETE;
                        ci->i_max_offset = 2;
                }
 
                /* it may be better to set st_size in getattr instead? */
-               if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES))
+               if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
                        inode->i_size = ci->i_rbytes;
                break;
        default:
@@ -801,6 +803,37 @@ out_unlock:
        return;
 }
 
+/*
+ * Set dentry's directory position based on the current dir's max, and
+ * order it in d_subdirs, so that dcache_readdir behaves.
+ */
+static void ceph_set_dentry_offset(struct dentry *dn)
+{
+       struct dentry *dir = dn->d_parent;
+       struct inode *inode = dn->d_parent->d_inode;
+       struct ceph_dentry_info *di;
+
+       BUG_ON(!inode);
+
+       di = ceph_dentry(dn);
+
+       spin_lock(&inode->i_lock);
+       if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
+               spin_unlock(&inode->i_lock);
+               return;
+       }
+       di->offset = ceph_inode(inode)->i_max_offset++;
+       spin_unlock(&inode->i_lock);
+
+       spin_lock(&dcache_lock);
+       spin_lock(&dn->d_lock);
+       list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
+       dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
+            dn->d_u.d_child.prev, dn->d_u.d_child.next);
+       spin_unlock(&dn->d_lock);
+       spin_unlock(&dcache_lock);
+}
+
 /*
  * splice a dentry to an inode.
  * caller must hold directory i_mutex for this to be safe.
@@ -814,6 +847,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
 {
        struct dentry *realdn;
 
+       BUG_ON(dn->d_inode);
+
        /* dn must be unhashed */
        if (!d_unhashed(dn))
                d_drop(dn);
@@ -835,43 +870,16 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
                dn = realdn;
        } else {
                BUG_ON(!ceph_dentry(dn));
-
                dout("dn %p attached to %p ino %llx.%llx\n",
                     dn, dn->d_inode, ceph_vinop(dn->d_inode));
        }
        if ((!prehash || *prehash) && d_unhashed(dn))
                d_rehash(dn);
+       ceph_set_dentry_offset(dn);
 out:
        return dn;
 }
 
-/*
- * Set dentry's directory position based on the current dir's max, and
- * order it in d_subdirs, so that dcache_readdir behaves.
- */
-static void ceph_set_dentry_offset(struct dentry *dn)
-{
-       struct dentry *dir = dn->d_parent;
-       struct inode *inode = dn->d_parent->d_inode;
-       struct ceph_dentry_info *di;
-
-       BUG_ON(!inode);
-
-       di = ceph_dentry(dn);
-
-       spin_lock(&inode->i_lock);
-       di->offset = ceph_inode(inode)->i_max_offset++;
-       spin_unlock(&inode->i_lock);
-
-       spin_lock(&dcache_lock);
-       spin_lock(&dn->d_lock);
-       list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
-       dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
-            dn->d_u.d_child.prev, dn->d_u.d_child.next);
-       spin_unlock(&dn->d_lock);
-       spin_unlock(&dcache_lock);
-}
-
 /*
  * Incorporate results into the local cache.  This is either just
  * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
@@ -933,14 +941,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 
        if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
                dout("fill_trace reply is empty!\n");
-               if (rinfo->head->result == 0 && req->r_locked_dir) {
-                       struct ceph_inode_info *ci =
-                               ceph_inode(req->r_locked_dir);
-                       dout(" clearing %p complete (empty trace)\n",
-                            req->r_locked_dir);
-                       ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
-                       ci->i_release_count++;
-               }
+               if (rinfo->head->result == 0 && req->r_locked_dir)
+                       ceph_invalidate_dir_request(req);
                return 0;
        }
 
@@ -1011,13 +1013,18 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
                             req->r_old_dentry->d_name.len,
                             req->r_old_dentry->d_name.name,
                             dn, dn->d_name.len, dn->d_name.name);
+
                        /* ensure target dentry is invalidated, despite
                           rehashing bug in vfs_rename_dir */
-                       dn->d_time = jiffies;
-                       ceph_dentry(dn)->lease_shared_gen = 0;
+                       ceph_invalidate_dentry_lease(dn);
+
                        /* take overwritten dentry's readdir offset */
+                       dout("dn %p gets %p offset %lld (old offset %lld)\n",
+                            req->r_old_dentry, dn, ceph_dentry(dn)->offset,
+                            ceph_dentry(req->r_old_dentry)->offset);
                        ceph_dentry(req->r_old_dentry)->offset =
                                ceph_dentry(dn)->offset;
+
                        dn = req->r_old_dentry;  /* use old_dentry */
                        in = dn->d_inode;
                }
@@ -1059,7 +1066,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
                                goto done;
                        }
                        req->r_dentry = dn;  /* may have spliced */
-                       ceph_set_dentry_offset(dn);
                        igrab(in);
                } else if (ceph_ino(in) == vino.ino &&
                           ceph_snap(in) == vino.snap) {
@@ -1102,7 +1108,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
                        err = PTR_ERR(dn);
                        goto done;
                }
-               ceph_set_dentry_offset(dn);
                req->r_dentry = dn;  /* may have spliced */
                igrab(in);
                rinfo->head->is_dentry = 1;  /* fool notrace handlers */
@@ -1429,7 +1434,7 @@ void ceph_queue_vmtruncate(struct inode *inode)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
 
-       if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
+       if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
                       &ci->i_vmtruncate_work)) {
                dout("ceph_queue_vmtruncate %p\n", inode);
                igrab(inode);
@@ -1518,7 +1523,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
        struct inode *parent_inode = dentry->d_parent->d_inode;
        const unsigned int ia_valid = attr->ia_valid;
        struct ceph_mds_request *req;
-       struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc;
+       struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc;
        int issued;
        int release = 0, dirtied = 0;
        int mask = 0;
index 8a5bcae..d085f07 100644
@@ -98,7 +98,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
        struct ceph_ioctl_dataloc dl;
        struct inode *inode = file->f_dentry->d_inode;
        struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
+       struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
        u64 len = 1, olen;
        u64 tmp;
        struct ceph_object_layout ol;
index 24561a5..885aa57 100644
@@ -40,7 +40,7 @@
 static void __wake_requests(struct ceph_mds_client *mdsc,
                            struct list_head *head);
 
-const static struct ceph_connection_operations mds_con_ops;
+static const struct ceph_connection_operations mds_con_ops;
 
 
 /*
@@ -665,10 +665,10 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
        struct ceph_msg *msg;
        struct ceph_mds_session_head *h;
 
-       msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL);
-       if (IS_ERR(msg)) {
+       msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS);
+       if (!msg) {
                pr_err("create_session_msg ENOMEM creating msg\n");
-               return ERR_PTR(PTR_ERR(msg));
+               return NULL;
        }
        h = msg->front.iov_base;
        h->op = cpu_to_le32(op);
@@ -687,7 +687,6 @@ static int __open_session(struct ceph_mds_client *mdsc,
        struct ceph_msg *msg;
        int mstate;
        int mds = session->s_mds;
-       int err = 0;
 
        /* wait for mds to go active? */
        mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
@@ -698,13 +697,9 @@ static int __open_session(struct ceph_mds_client *mdsc,
 
        /* send connect message */
        msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
-       if (IS_ERR(msg)) {
-               err = PTR_ERR(msg);
-               goto out;
-       }
+       if (!msg)
+               return -ENOMEM;
        ceph_con_send(&session->s_con, msg);
-
-out:
        return 0;
 }
 
@@ -804,12 +799,49 @@ out:
 }
 
 static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
-                                  void *arg)
+                                 void *arg)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
+       int drop = 0;
+
        dout("removing cap %p, ci is %p, inode is %p\n",
             cap, ci, &ci->vfs_inode);
-       ceph_remove_cap(cap);
+       spin_lock(&inode->i_lock);
+       __ceph_remove_cap(cap);
+       if (!__ceph_is_any_real_caps(ci)) {
+               struct ceph_mds_client *mdsc =
+                       &ceph_sb_to_client(inode->i_sb)->mdsc;
+
+               spin_lock(&mdsc->cap_dirty_lock);
+               if (!list_empty(&ci->i_dirty_item)) {
+                       pr_info(" dropping dirty %s state for %p %lld\n",
+                               ceph_cap_string(ci->i_dirty_caps),
+                               inode, ceph_ino(inode));
+                       ci->i_dirty_caps = 0;
+                       list_del_init(&ci->i_dirty_item);
+                       drop = 1;
+               }
+               if (!list_empty(&ci->i_flushing_item)) {
+                       pr_info(" dropping dirty+flushing %s state for %p %lld\n",
+                               ceph_cap_string(ci->i_flushing_caps),
+                               inode, ceph_ino(inode));
+                       ci->i_flushing_caps = 0;
+                       list_del_init(&ci->i_flushing_item);
+                       mdsc->num_cap_flushing--;
+                       drop = 1;
+               }
+               if (drop && ci->i_wrbuffer_ref) {
+                       pr_info(" dropping dirty data for %p %lld\n",
+                               inode, ceph_ino(inode));
+                       ci->i_wrbuffer_ref = 0;
+                       ci->i_wrbuffer_ref_head = 0;
+                       drop++;
+               }
+               spin_unlock(&mdsc->cap_dirty_lock);
+       }
+       spin_unlock(&inode->i_lock);
+       while (drop--)
+               iput(inode);
        return 0;
 }
 
@@ -821,6 +853,7 @@ static void remove_session_caps(struct ceph_mds_session *session)
        dout("remove_session_caps on %p\n", session);
        iterate_session_caps(session, remove_session_caps_cb, NULL);
        BUG_ON(session->s_nr_caps > 0);
+       BUG_ON(!list_empty(&session->s_cap_flushing));
        cleanup_cap_releases(session);
 }
 
@@ -883,8 +916,8 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
                ceph_mds_state_name(state));
        msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
                                 ++session->s_renew_seq);
-       if (IS_ERR(msg))
-               return PTR_ERR(msg);
+       if (!msg)
+               return -ENOMEM;
        ceph_con_send(&session->s_con, msg);
        return 0;
 }
@@ -931,17 +964,15 @@ static int request_close_session(struct ceph_mds_client *mdsc,
                                 struct ceph_mds_session *session)
 {
        struct ceph_msg *msg;
-       int err = 0;
 
        dout("request_close_session mds%d state %s seq %lld\n",
             session->s_mds, session_state_name(session->s_state),
             session->s_seq);
        msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
-       if (IS_ERR(msg))
-               err = PTR_ERR(msg);
-       else
-               ceph_con_send(&session->s_con, msg);
-       return err;
+       if (!msg)
+               return -ENOMEM;
+       ceph_con_send(&session->s_con, msg);
+       return 0;
 }
 
 /*
@@ -1059,7 +1090,7 @@ static int add_cap_releases(struct ceph_mds_client *mdsc,
        while (session->s_num_cap_releases < session->s_nr_caps + extra) {
                spin_unlock(&session->s_cap_lock);
                msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
-                                  0, 0, NULL);
+                                  GFP_NOFS);
                if (!msg)
                        goto out_unlocked;
                dout("add_cap_releases %p msg %p now %d\n", session, msg,
@@ -1151,10 +1182,8 @@ static void send_cap_releases(struct ceph_mds_client *mdsc,
        struct ceph_msg *msg;
 
        dout("send_cap_releases mds%d\n", session->s_mds);
-       while (1) {
-               spin_lock(&session->s_cap_lock);
-               if (list_empty(&session->s_cap_releases_done))
-                       break;
+       spin_lock(&session->s_cap_lock);
+       while (!list_empty(&session->s_cap_releases_done)) {
                msg = list_first_entry(&session->s_cap_releases_done,
                                 struct ceph_msg, list_head);
                list_del_init(&msg->list_head);
@@ -1162,10 +1191,49 @@ static void send_cap_releases(struct ceph_mds_client *mdsc,
                msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
                dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
                ceph_con_send(&session->s_con, msg);
+               spin_lock(&session->s_cap_lock);
        }
        spin_unlock(&session->s_cap_lock);
 }
 
+static void discard_cap_releases(struct ceph_mds_client *mdsc,
+                                struct ceph_mds_session *session)
+{
+       struct ceph_msg *msg;
+       struct ceph_mds_cap_release *head;
+       unsigned num;
+
+       dout("discard_cap_releases mds%d\n", session->s_mds);
+       spin_lock(&session->s_cap_lock);
+
+       /* zero out the in-progress message */
+       msg = list_first_entry(&session->s_cap_releases,
+                              struct ceph_msg, list_head);
+       head = msg->front.iov_base;
+       num = le32_to_cpu(head->num);
+       dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
+       head->num = cpu_to_le32(0);
+       session->s_num_cap_releases += num;
+
+       /* requeue completed messages */
+       while (!list_empty(&session->s_cap_releases_done)) {
+               msg = list_first_entry(&session->s_cap_releases_done,
+                                struct ceph_msg, list_head);
+               list_del_init(&msg->list_head);
+
+               head = msg->front.iov_base;
+               num = le32_to_cpu(head->num);
+               dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg,
+                    num);
+               session->s_num_cap_releases += num;
+               head->num = cpu_to_le32(0);
+               msg->front.iov_len = sizeof(*head);
+               list_add(&msg->list_head, &session->s_cap_releases);
+       }
+
+       spin_unlock(&session->s_cap_lock);
+}
+
 /*
  * requests
  */
@@ -1181,6 +1249,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
        if (!req)
                return ERR_PTR(-ENOMEM);
 
+       mutex_init(&req->r_fill_mutex);
        req->r_started = jiffies;
        req->r_resend_mds = -1;
        INIT_LIST_HEAD(&req->r_unsafe_dir_item);
@@ -1251,7 +1320,7 @@ retry:
                        len += 1 + temp->d_name.len;
                temp = temp->d_parent;
                if (temp == NULL) {
-                       pr_err("build_path_dentry corrupt dentry %p\n", dentry);
+                       pr_err("build_path corrupt dentry %p\n", dentry);
                        return ERR_PTR(-EINVAL);
                }
        }
@@ -1267,7 +1336,7 @@ retry:
                struct inode *inode = temp->d_inode;
 
                if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
-                       dout("build_path_dentry path+%d: %p SNAPDIR\n",
+                       dout("build_path path+%d: %p SNAPDIR\n",
                             pos, temp);
                } else if (stop_on_nosnap && inode &&
                           ceph_snap(inode) == CEPH_NOSNAP) {
@@ -1278,20 +1347,18 @@ retry:
                                break;
                        strncpy(path + pos, temp->d_name.name,
                                temp->d_name.len);
-                       dout("build_path_dentry path+%d: %p '%.*s'\n",
-                            pos, temp, temp->d_name.len, path + pos);
                }
                if (pos)
                        path[--pos] = '/';
                temp = temp->d_parent;
                if (temp == NULL) {
-                       pr_err("build_path_dentry corrupt dentry\n");
+                       pr_err("build_path corrupt dentry\n");
                        kfree(path);
                        return ERR_PTR(-EINVAL);
                }
        }
        if (pos != 0) {
-               pr_err("build_path_dentry did not end path lookup where "
+               pr_err("build_path did not end path lookup where "
                       "expected, namelen is %d, pos is %d\n", len, pos);
                /* presumably this is only possible if racing with a
                   rename of one of the parent directories (we can not
@@ -1303,7 +1370,7 @@ retry:
 
        *base = ceph_ino(temp->d_inode);
        *plen = len;
-       dout("build_path_dentry on %p %d built %llx '%.*s'\n",
+       dout("build_path on %p %d built %llx '%.*s'\n",
             dentry, atomic_read(&dentry->d_count), *base, len, path);
        return path;
 }
@@ -1426,9 +1493,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
        if (req->r_old_dentry_drop)
                len += req->r_old_dentry->d_name.len;
 
-       msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL);
-       if (IS_ERR(msg))
+       msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS);
+       if (!msg) {
+               msg = ERR_PTR(-ENOMEM);
                goto out_free2;
+       }
 
        msg->hdr.tid = cpu_to_le64(req->r_tid);
 
@@ -1517,9 +1586,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
        }
        msg = create_request_message(mdsc, req, mds);
        if (IS_ERR(msg)) {
-               req->r_reply = ERR_PTR(PTR_ERR(msg));
+               req->r_err = PTR_ERR(msg);
                complete_request(mdsc, req);
-               return -PTR_ERR(msg);
+               return PTR_ERR(msg);
        }
        req->r_request = msg;
 
@@ -1552,7 +1621,7 @@ static int __do_request(struct ceph_mds_client *mdsc,
        int mds = -1;
        int err = -EAGAIN;
 
-       if (req->r_reply)
+       if (req->r_err || req->r_got_result)
                goto out;
 
        if (req->r_timeout &&
@@ -1609,7 +1678,7 @@ out:
        return err;
 
 finish:
-       req->r_reply = ERR_PTR(err);
+       req->r_err = err;
        complete_request(mdsc, req);
        goto out;
 }
@@ -1630,10 +1699,9 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
 
 /*
  * Wake up threads with requests pending for @mds, so that they can
- * resubmit their requests to a possibly different mds.  If @all is set,
- * wake up if their requests has been forwarded to @mds, too.
+ * resubmit their requests to a possibly different mds.
  */
-static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all)
+static void kick_requests(struct ceph_mds_client *mdsc, int mds)
 {
        struct ceph_mds_request *req;
        struct rb_node *p;
@@ -1689,63 +1757,77 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
        __register_request(mdsc, req, dir);
        __do_request(mdsc, req);
 
-       /* wait */
-       if (!req->r_reply) {
-               mutex_unlock(&mdsc->mutex);
-               if (req->r_timeout) {
-                       err = (long)wait_for_completion_interruptible_timeout(
-                               &req->r_completion, req->r_timeout);
-                       if (err == 0)
-                               req->r_reply = ERR_PTR(-EIO);
-                       else if (err < 0)
-                               req->r_reply = ERR_PTR(err);
-               } else {
-                        err = wait_for_completion_interruptible(
-                                &req->r_completion);
-                        if (err)
-                                req->r_reply = ERR_PTR(err);
-               }
-               mutex_lock(&mdsc->mutex);
+       if (req->r_err) {
+               err = req->r_err;
+               __unregister_request(mdsc, req);
+               dout("do_request early error %d\n", err);
+               goto out;
        }
 
-       if (IS_ERR(req->r_reply)) {
-               err = PTR_ERR(req->r_reply);
-               req->r_reply = NULL;
+       /* wait */
+       mutex_unlock(&mdsc->mutex);
+       dout("do_request waiting\n");
+       if (req->r_timeout) {
+               err = (long)wait_for_completion_interruptible_timeout(
+                       &req->r_completion, req->r_timeout);
+               if (err == 0)
+                       err = -EIO;
+       } else {
+               err = wait_for_completion_interruptible(&req->r_completion);
+       }
+       dout("do_request waited, got %d\n", err);
+       mutex_lock(&mdsc->mutex);
 
-               if (err == -ERESTARTSYS) {
-                       /* aborted */
-                       req->r_aborted = true;
+       /* only abort if we didn't race with a real reply */
+       if (req->r_got_result) {
+               err = le32_to_cpu(req->r_reply_info.head->result);
+       } else if (err < 0) {
+               dout("aborted request %lld with %d\n", req->r_tid, err);
 
-                       if (req->r_locked_dir &&
-                           (req->r_op & CEPH_MDS_OP_WRITE)) {
-                               struct ceph_inode_info *ci =
-                                       ceph_inode(req->r_locked_dir);
+               /*
+                * ensure we aren't running concurrently with
+                * ceph_fill_trace or ceph_readdir_prepopulate, which
+                * rely on locks (dir mutex) held by our caller.
+                */
+               mutex_lock(&req->r_fill_mutex);
+               req->r_err = err;
+               req->r_aborted = true;
+               mutex_unlock(&req->r_fill_mutex);
 
-                               dout("aborted, clearing I_COMPLETE on %p\n", 
-                                    req->r_locked_dir);
-                               spin_lock(&req->r_locked_dir->i_lock);
-                               ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
-                               ci->i_release_count++;
-                               spin_unlock(&req->r_locked_dir->i_lock);
-                       }
-               } else {
-                       /* clean up this request */
-                       __unregister_request(mdsc, req);
-                       if (!list_empty(&req->r_unsafe_item))
-                               list_del_init(&req->r_unsafe_item);
-                       complete(&req->r_safe_completion);
-               }
-       } else if (req->r_err) {
-               err = req->r_err;
+               if (req->r_locked_dir &&
+                   (req->r_op & CEPH_MDS_OP_WRITE))
+                       ceph_invalidate_dir_request(req);
        } else {
-               err = le32_to_cpu(req->r_reply_info.head->result);
+               err = req->r_err;
        }
-       mutex_unlock(&mdsc->mutex);
 
+out:
+       mutex_unlock(&mdsc->mutex);
        dout("do_request %p done, result %d\n", req, err);
        return err;
 }
 
+/*
+ * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS
+ * namespace request.
+ */
+void ceph_invalidate_dir_request(struct ceph_mds_request *req)
+{
+       struct inode *inode = req->r_locked_dir;
+       struct ceph_inode_info *ci = ceph_inode(inode);
+
+       dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode);
+       spin_lock(&inode->i_lock);
+       ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
+       ci->i_release_count++;
+       spin_unlock(&inode->i_lock);
+
+       if (req->r_dentry)
+               ceph_invalidate_dentry_lease(req->r_dentry);
+       if (req->r_old_dentry)
+               ceph_invalidate_dentry_lease(req->r_old_dentry);
+}
+
 /*
  * Handle mds reply.
  *
@@ -1797,6 +1879,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
                mutex_unlock(&mdsc->mutex);
                goto out;
        }
+       if (req->r_got_safe && !head->safe) {
+               pr_warning("got unsafe after safe on %llu from mds%d\n",
+                          tid, mds);
+               mutex_unlock(&mdsc->mutex);
+               goto out;
+       }
 
        result = le32_to_cpu(head->result);
 
@@ -1838,11 +1926,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
                        mutex_unlock(&mdsc->mutex);
                        goto out;
                }
-       }
-
-       BUG_ON(req->r_reply);
-
-       if (!head->safe) {
+       } else {
                req->r_got_unsafe = true;
                list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
        }
@@ -1871,21 +1955,30 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        }
 
        /* insert trace into our cache */
+       mutex_lock(&req->r_fill_mutex);
        err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
        if (err == 0) {
                if (result == 0 && rinfo->dir_nr)
                        ceph_readdir_prepopulate(req, req->r_session);
                ceph_unreserve_caps(&req->r_caps_reservation);
        }
+       mutex_unlock(&req->r_fill_mutex);
 
        up_read(&mdsc->snap_rwsem);
 out_err:
-       if (err) {
-               req->r_err = err;
+       mutex_lock(&mdsc->mutex);
+       if (!req->r_aborted) {
+               if (err) {
+                       req->r_err = err;
+               } else {
+                       req->r_reply = msg;
+                       ceph_msg_get(msg);
+                       req->r_got_result = true;
+               }
        } else {
-               req->r_reply = msg;
-               ceph_msg_get(msg);
+               dout("reply arrived after request %lld was aborted\n", tid);
        }
+       mutex_unlock(&mdsc->mutex);
 
        add_cap_releases(mdsc, req->r_session, -1);
        mutex_unlock(&session->s_mutex);
@@ -1984,6 +2077,8 @@ static void handle_session(struct ceph_mds_session *session,
 
        switch (op) {
        case CEPH_SESSION_OPEN:
+               if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
+                       pr_info("mds%d reconnect success\n", session->s_mds);
                session->s_state = CEPH_MDS_SESSION_OPEN;
                renewed_caps(mdsc, session, 0);
                wake = 1;
@@ -1997,10 +2092,12 @@ static void handle_session(struct ceph_mds_session *session,
                break;
 
        case CEPH_SESSION_CLOSE:
+               if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
+                       pr_info("mds%d reconnect denied\n", session->s_mds);
                remove_session_caps(session);
                wake = 1; /* for good measure */
                complete(&mdsc->session_close_waiters);
-               kick_requests(mdsc, mds, 0);      /* cur only */
+               kick_requests(mdsc, mds);
                break;
 
        case CEPH_SESSION_STALE:
@@ -2132,54 +2229,44 @@ out:
  *
  * called with mdsc->mutex held.
  */
-static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
+static void send_mds_reconnect(struct ceph_mds_client *mdsc,
+                              struct ceph_mds_session *session)
 {
-       struct ceph_mds_session *session = NULL;
        struct ceph_msg *reply;
        struct rb_node *p;
+       int mds = session->s_mds;
        int err = -ENOMEM;
        struct ceph_pagelist *pagelist;
 
-       pr_info("reconnect to recovering mds%d\n", mds);
+       pr_info("mds%d reconnect start\n", mds);
 
        pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
        if (!pagelist)
                goto fail_nopagelist;
        ceph_pagelist_init(pagelist);
 
-       reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL);
-       if (IS_ERR(reply)) {
-               err = PTR_ERR(reply);
+       reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS);
+       if (!reply)
                goto fail_nomsg;
-       }
-
-       /* find session */
-       session = __ceph_lookup_mds_session(mdsc, mds);
-       mutex_unlock(&mdsc->mutex);    /* drop lock for duration */
 
-       if (session) {
-               mutex_lock(&session->s_mutex);
+       mutex_lock(&session->s_mutex);
+       session->s_state = CEPH_MDS_SESSION_RECONNECTING;
+       session->s_seq = 0;
 
-               session->s_state = CEPH_MDS_SESSION_RECONNECTING;
-               session->s_seq = 0;
+       ceph_con_open(&session->s_con,
+                     ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
 
-               ceph_con_open(&session->s_con,
-                             ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
-
-               /* replay unsafe requests */
-               replay_unsafe_requests(mdsc, session);
-       } else {
-               dout("no session for mds%d, will send short reconnect\n",
-                    mds);
-       }
+       /* replay unsafe requests */
+       replay_unsafe_requests(mdsc, session);
 
        down_read(&mdsc->snap_rwsem);
 
-       if (!session)
-               goto send;
        dout("session %p state %s\n", session,
             session_state_name(session->s_state));
 
+       /* drop old cap expires; we're about to reestablish that state */
+       discard_cap_releases(mdsc, session);
+
        /* traverse this session's caps */
        err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
        if (err)
@@ -2208,36 +2295,29 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
                        goto fail;
        }
 
-send:
        reply->pagelist = pagelist;
        reply->hdr.data_len = cpu_to_le32(pagelist->length);
        reply->nr_pages = calc_pages_for(0, pagelist->length);
        ceph_con_send(&session->s_con, reply);
 
-       session->s_state = CEPH_MDS_SESSION_OPEN;
        mutex_unlock(&session->s_mutex);
 
        mutex_lock(&mdsc->mutex);
        __wake_requests(mdsc, &session->s_waiting);
        mutex_unlock(&mdsc->mutex);
 
-       ceph_put_mds_session(session);
-
        up_read(&mdsc->snap_rwsem);
-       mutex_lock(&mdsc->mutex);
        return;
 
 fail:
        ceph_msg_put(reply);
        up_read(&mdsc->snap_rwsem);
        mutex_unlock(&session->s_mutex);
-       ceph_put_mds_session(session);
 fail_nomsg:
        ceph_pagelist_release(pagelist);
        kfree(pagelist);
 fail_nopagelist:
        pr_err("error %d preparing reconnect for mds%d\n", err, mds);
-       mutex_lock(&mdsc->mutex);
        return;
 }
 
@@ -2290,7 +2370,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
                        }
 
                        /* kick any requests waiting on the recovering mds */
-                       kick_requests(mdsc, i, 1);
+                       kick_requests(mdsc, i);
                } else if (oldstate == newstate) {
                        continue;  /* nothing new with this mds */
                }
@@ -2299,22 +2379,21 @@ static void check_new_map(struct ceph_mds_client *mdsc,
                 * send reconnect?
                 */
                if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
-                   newstate >= CEPH_MDS_STATE_RECONNECT)
-                       send_mds_reconnect(mdsc, i);
+                   newstate >= CEPH_MDS_STATE_RECONNECT) {
+                       mutex_unlock(&mdsc->mutex);
+                       send_mds_reconnect(mdsc, s);
+                       mutex_lock(&mdsc->mutex);
+               }
 
                /*
-                * kick requests on any mds that has gone active.
-                *
-                * kick requests on cur or forwarder: we may have sent
-                * the request to mds1, mds1 told us it forwarded it
-                * to mds2, but then we learn mds1 failed and can't be
-                * sure it successfully forwarded our request before
-                * it died.
+                * kick requests on any mds that has gone active.
                 */
                if (oldstate < CEPH_MDS_STATE_ACTIVE &&
                    newstate >= CEPH_MDS_STATE_ACTIVE) {
-                       pr_info("mds%d reconnect completed\n", s->s_mds);
-                       kick_requests(mdsc, i, 1);
+                       if (oldstate != CEPH_MDS_STATE_CREATING &&
+                           oldstate != CEPH_MDS_STATE_STARTING)
+                               pr_info("mds%d recovery completed\n", s->s_mds);
+                       kick_requests(mdsc, i);
                        ceph_kick_flushing_caps(mdsc, s);
                        wake_up_session_caps(s, 1);
                }
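
check_new_map above now drops mdsc->mutex around send_mds_reconnect, since the reconnect path can sleep and takes the per-session mutex itself, and retakes the map lock before resuming the scan. A toy sketch of that drop-and-retake shape (hypothetical locks, not the kernel's):

#include <pthread.h>

static pthread_mutex_t map_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t session_lock = PTHREAD_MUTEX_INITIALIZER;

static void reconnect_session(void)
{
        pthread_mutex_lock(&session_lock);      /* may block a while */
        /* ... rebuild and resend session state ... */
        pthread_mutex_unlock(&session_lock);
}

static void scan_sessions(void)
{
        pthread_mutex_lock(&map_lock);
        /* ... decide this session needs a reconnect ... */
        pthread_mutex_unlock(&map_lock);        /* drop before blocking */
        reconnect_session();
        pthread_mutex_lock(&map_lock);          /* retake, resume scan */
        pthread_mutex_unlock(&map_lock);
}

int main(void)
{
        scan_sessions();
        return 0;
}
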
@@ -2457,8 +2536,8 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
        dnamelen = dentry->d_name.len;
        len += dnamelen;
 
-       msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL);
-       if (IS_ERR(msg))
+       msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS);
+       if (!msg)
                return;
        lease = msg->front.iov_base;
        lease->action = action;
@@ -2603,7 +2682,9 @@ static void delayed_work(struct work_struct *work)
                else
                        ceph_con_keepalive(&s->s_con);
                add_cap_releases(mdsc, s, -1);
-               send_cap_releases(mdsc, s);
+               if (s->s_state == CEPH_MDS_SESSION_OPEN ||
+                   s->s_state == CEPH_MDS_SESSION_HUNG)
+                       send_cap_releases(mdsc, s);
                mutex_unlock(&s->s_mutex);
                ceph_put_mds_session(s);
 
@@ -2620,6 +2701,9 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
        mdsc->client = client;
        mutex_init(&mdsc->mutex);
        mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
+       if (mdsc->mdsmap == NULL)
+               return -ENOMEM;
+
        init_completion(&mdsc->safe_umount_waiters);
        init_completion(&mdsc->session_close_waiters);
        INIT_LIST_HEAD(&mdsc->waiting_for_map);
@@ -2645,6 +2729,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
        init_waitqueue_head(&mdsc->cap_flushing_wq);
        spin_lock_init(&mdsc->dentry_lru_lock);
        INIT_LIST_HEAD(&mdsc->dentry_lru);
+
        return 0;
 }
 
@@ -2740,6 +2825,9 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 {
        u64 want_tid, want_flush;
 
+       if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
+               return;
+
        dout("sync\n");
        mutex_lock(&mdsc->mutex);
        want_tid = mdsc->last_tid;
@@ -2922,9 +3010,10 @@ static void con_put(struct ceph_connection *con)
 static void peer_reset(struct ceph_connection *con)
 {
        struct ceph_mds_session *s = con->private;
+       struct ceph_mds_client *mdsc = s->s_mdsc;
 
-       pr_err("mds%d gave us the boot.  IMPLEMENT RECONNECT.\n",
-              s->s_mds);
+       pr_warning("mds%d closed our session\n", s->s_mds);
+       send_mds_reconnect(mdsc, s);
 }
 
 static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
@@ -3031,7 +3120,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
        return ceph_monc_validate_auth(&mdsc->client->monc);
 }
 
-const static struct ceph_connection_operations mds_con_ops = {
+static const struct ceph_connection_operations mds_con_ops = {
        .get = con_get,
        .put = con_put,
        .dispatch = dispatch,
index 961cc6f..d9936c4 100644 (file)
@@ -165,6 +165,8 @@ struct ceph_mds_request {
        struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
        struct inode *r_target_inode;       /* resulting inode */
 
+       struct mutex r_fill_mutex;
+
        union ceph_mds_request_args r_args;
        int r_fmode;        /* file mode, if expecting cap */
 
@@ -213,7 +215,7 @@ struct ceph_mds_request {
        struct completion r_safe_completion;
        ceph_mds_request_callback_t r_callback;
        struct list_head  r_unsafe_item;  /* per-session unsafe list item */
-       bool              r_got_unsafe, r_got_safe;
+       bool              r_got_unsafe, r_got_safe, r_got_result;
 
        bool              r_did_prepopulate;
        u32               r_readdir_offset;
@@ -301,6 +303,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
                                    struct inode *inode,
                                    struct dentry *dn, int mask);
 
+extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
+
 extern struct ceph_mds_request *
 ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
 extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
index cd4fadb..60b7483 100644 (file)
@@ -39,18 +39,6 @@ static void queue_con(struct ceph_connection *con);
 static void con_work(struct work_struct *);
 static void ceph_fault(struct ceph_connection *con);
 
-const char *ceph_name_type_str(int t)
-{
-       switch (t) {
-       case CEPH_ENTITY_TYPE_MON: return "mon";
-       case CEPH_ENTITY_TYPE_MDS: return "mds";
-       case CEPH_ENTITY_TYPE_OSD: return "osd";
-       case CEPH_ENTITY_TYPE_CLIENT: return "client";
-       case CEPH_ENTITY_TYPE_ADMIN: return "admin";
-       default: return "???";
-       }
-}
-
 /*
  * nicely render a sockaddr as a string.
  */
@@ -340,6 +328,7 @@ static void reset_connection(struct ceph_connection *con)
                ceph_msg_put(con->out_msg);
                con->out_msg = NULL;
        }
+       con->out_keepalive_pending = false;
        con->in_seq = 0;
        con->in_seq_acked = 0;
 }
@@ -357,6 +346,7 @@ void ceph_con_close(struct ceph_connection *con)
        clear_bit(WRITE_PENDING, &con->state);
        mutex_lock(&con->mutex);
        reset_connection(con);
+       con->peer_global_seq = 0;
        cancel_delayed_work(&con->work);
        mutex_unlock(&con->mutex);
        queue_con(con);
@@ -661,7 +651,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
        dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
             con->connect_seq, global_seq, proto);
 
-       con->out_connect.features = CEPH_FEATURE_SUPPORTED;
+       con->out_connect.features = CEPH_FEATURE_SUPPORTED_CLIENT;
        con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
        con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
        con->out_connect.global_seq = cpu_to_le32(global_seq);
@@ -1124,8 +1114,8 @@ static void fail_protocol(struct ceph_connection *con)
 
 static int process_connect(struct ceph_connection *con)
 {
-       u64 sup_feat = CEPH_FEATURE_SUPPORTED;
-       u64 req_feat = CEPH_FEATURE_REQUIRED;
+       u64 sup_feat = CEPH_FEATURE_SUPPORTED_CLIENT;
+       u64 req_feat = CEPH_FEATURE_REQUIRED_CLIENT;
        u64 server_feat = le64_to_cpu(con->in_reply.features);
 
        dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
@@ -1233,6 +1223,7 @@ static int process_connect(struct ceph_connection *con)
                clear_bit(CONNECTING, &con->state);
                con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
                con->connect_seq++;
+               con->peer_features = server_feat;
                dout("process_connect got READY gseq %d cseq %d (%d)\n",
                     con->peer_global_seq,
                     le32_to_cpu(con->in_reply.connect_seq),
@@ -1402,19 +1393,17 @@ static int read_partial_message(struct ceph_connection *con)
                con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
                if (skip) {
                        /* skip this message */
-                       dout("alloc_msg returned NULL, skipping message\n");
+                       dout("alloc_msg said skip message\n");
                        con->in_base_pos = -front_len - middle_len - data_len -
                                sizeof(m->footer);
                        con->in_tag = CEPH_MSGR_TAG_READY;
                        con->in_seq++;
                        return 0;
                }
-               if (IS_ERR(con->in_msg)) {
-                       ret = PTR_ERR(con->in_msg);
-                       con->in_msg = NULL;
+               if (!con->in_msg) {
                        con->error_msg =
                                "error allocating memory for incoming message";
-                       return ret;
+                       return -ENOMEM;
                }
                m = con->in_msg;
                m->front.iov_len = 0;    /* haven't read it yet */
@@ -1514,14 +1503,14 @@ static void process_message(struct ceph_connection *con)
 
        /* if first message, set peer_name */
        if (con->peer_name.type == 0)
-               con->peer_name = msg->hdr.src.name;
+               con->peer_name = msg->hdr.src;
 
        con->in_seq++;
        mutex_unlock(&con->mutex);
 
        dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
             msg, le64_to_cpu(msg->hdr.seq),
-            ENTITY_NAME(msg->hdr.src.name),
+            ENTITY_NAME(msg->hdr.src),
             le16_to_cpu(msg->hdr.type),
             ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
             le32_to_cpu(msg->hdr.front_len),
@@ -1546,7 +1535,6 @@ static int try_write(struct ceph_connection *con)
        dout("try_write start %p state %lu nref %d\n", con, con->state,
             atomic_read(&con->nref));
 
-       mutex_lock(&con->mutex);
 more:
        dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
 
@@ -1639,7 +1627,6 @@ do_next:
 done:
        ret = 0;
 out:
-       mutex_unlock(&con->mutex);
        dout("try_write done on %p\n", con);
        return ret;
 }
@@ -1651,7 +1638,6 @@ out:
  */
 static int try_read(struct ceph_connection *con)
 {
-       struct ceph_messenger *msgr;
        int ret = -1;
 
        if (!con->sock)
@@ -1661,9 +1647,6 @@ static int try_read(struct ceph_connection *con)
                return 0;
 
        dout("try_read start on %p\n", con);
-       msgr = con->msgr;
-
-       mutex_lock(&con->mutex);
 
 more:
        dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
@@ -1758,7 +1741,6 @@ more:
 done:
        ret = 0;
 out:
-       mutex_unlock(&con->mutex);
        dout("try_read done on %p\n", con);
        return ret;
 
@@ -1830,6 +1812,8 @@ more:
        dout("con_work %p start, clearing QUEUED\n", con);
        clear_bit(QUEUED, &con->state);
 
+       mutex_lock(&con->mutex);
+
        if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
                dout("con_work CLOSED\n");
                con_close_socket(con);
@@ -1844,11 +1828,16 @@ more:
        if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
            try_read(con) < 0 ||
            try_write(con) < 0) {
+               mutex_unlock(&con->mutex);
                backoff = 1;
                ceph_fault(con);     /* error/fault path */
+               goto done_unlocked;
        }
 
 done:
+       mutex_unlock(&con->mutex);
+
+done_unlocked:
        clear_bit(BUSY, &con->state);
        dout("con->state=%lu\n", con->state);
        if (test_bit(QUEUED, &con->state)) {
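
With the hunks above, try_read and try_write no longer take con->mutex themselves; con_work takes it once around both passes and only drops it on the fault path, so reading and writing see a consistent connection state. In miniature (hypothetical helpers, not the messenger API):

#include <pthread.h>

struct conn {
        pthread_mutex_t mutex;
        int broken;
};

/* both run with c->mutex held by the caller */
static int try_read(struct conn *c)  { return c->broken ? -1 : 0; }
static int try_write(struct conn *c) { return c->broken ? -1 : 0; }

/* stub: the real fault path retakes the lock itself */
static void fault(struct conn *c)    { (void)c; }

static void con_work(struct conn *c)
{
        pthread_mutex_lock(&c->mutex);
        if (try_read(c) < 0 || try_write(c) < 0) {
                pthread_mutex_unlock(&c->mutex);  /* unlock first */
                fault(c);                         /* error path */
                return;
        }
        pthread_mutex_unlock(&c->mutex);
}

int main(void)
{
        struct conn c = { PTHREAD_MUTEX_INITIALIZER, 0 };
        con_work(&c);
        return 0;
}
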
@@ -1947,7 +1936,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
 
        /* the zero page is needed if a request is "canceled" while the message
         * is being written over the socket */
-       msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+       msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
        if (!msgr->zero_page) {
                kfree(msgr);
                return ERR_PTR(-ENOMEM);
@@ -1987,9 +1976,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
        }
 
        /* set src+dst */
-       msg->hdr.src.name = con->msgr->inst.name;
-       msg->hdr.src.addr = con->msgr->my_enc_addr;
-       msg->hdr.orig_src = msg->hdr.src;
+       msg->hdr.src = con->msgr->inst.name;
 
        BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
 
@@ -2083,12 +2070,11 @@ void ceph_con_keepalive(struct ceph_connection *con)
  * construct a new message with given type, size
  * the new msg has a ref count of 1.
  */
-struct ceph_msg *ceph_msg_new(int type, int front_len,
-                             int page_len, int page_off, struct page **pages)
+struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
 {
        struct ceph_msg *m;
 
-       m = kmalloc(sizeof(*m), GFP_NOFS);
+       m = kmalloc(sizeof(*m), flags);
        if (m == NULL)
                goto out;
        kref_init(&m->kref);
@@ -2100,8 +2086,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
        m->hdr.version = 0;
        m->hdr.front_len = cpu_to_le32(front_len);
        m->hdr.middle_len = 0;
-       m->hdr.data_len = cpu_to_le32(page_len);
-       m->hdr.data_off = cpu_to_le16(page_off);
+       m->hdr.data_len = 0;
+       m->hdr.data_off = 0;
        m->hdr.reserved = 0;
        m->footer.front_crc = 0;
        m->footer.middle_crc = 0;
@@ -2115,11 +2101,11 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
        /* front */
        if (front_len) {
                if (front_len > PAGE_CACHE_SIZE) {
-                       m->front.iov_base = __vmalloc(front_len, GFP_NOFS,
+                       m->front.iov_base = __vmalloc(front_len, flags,
                                                      PAGE_KERNEL);
                        m->front_is_vmalloc = true;
                } else {
-                       m->front.iov_base = kmalloc(front_len, GFP_NOFS);
+                       m->front.iov_base = kmalloc(front_len, flags);
                }
                if (m->front.iov_base == NULL) {
                        pr_err("msg_new can't allocate %d bytes\n",
@@ -2135,19 +2121,18 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
        m->middle = NULL;
 
        /* data */
-       m->nr_pages = calc_pages_for(page_off, page_len);
-       m->pages = pages;
+       m->nr_pages = 0;
+       m->pages = NULL;
        m->pagelist = NULL;
 
-       dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len,
-            m->nr_pages);
+       dout("ceph_msg_new %p front %d\n", m, front_len);
        return m;
 
 out2:
        ceph_msg_put(m);
 out:
-       pr_err("msg_new can't create type %d len %d\n", type, front_len);
-       return ERR_PTR(-ENOMEM);
+       pr_err("msg_new can't create type %d front %d\n", type, front_len);
+       return NULL;
 }
 
 /*
@@ -2190,29 +2175,25 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
                mutex_unlock(&con->mutex);
                msg = con->ops->alloc_msg(con, hdr, skip);
                mutex_lock(&con->mutex);
-               if (IS_ERR(msg))
-                       return msg;
-
-               if (*skip)
+               if (!msg || *skip)
                        return NULL;
        }
        if (!msg) {
                *skip = 0;
-               msg = ceph_msg_new(type, front_len, 0, 0, NULL);
+               msg = ceph_msg_new(type, front_len, GFP_NOFS);
                if (!msg) {
                        pr_err("unable to allocate msg type %d len %d\n",
                               type, front_len);
-                       return ERR_PTR(-ENOMEM);
+                       return NULL;
                }
        }
        memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
 
-       if (middle_len) {
+       if (middle_len && !msg->middle) {
                ret = ceph_alloc_middle(con, msg);
-
                if (ret < 0) {
                        ceph_msg_put(msg);
-                       return msg;
+                       return NULL;
                }
        }
 
index a5caf91..00a9430 100644 (file)
@@ -49,10 +49,8 @@ struct ceph_connection_operations {
                                        int *skip);
 };
 
-extern const char *ceph_name_type_str(int t);
-
 /* use format string %s%d */
-#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num)
+#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
 
 struct ceph_messenger {
        struct ceph_entity_inst inst;    /* my name+address */
@@ -144,6 +142,7 @@ struct ceph_connection {
        struct ceph_entity_addr peer_addr; /* peer address */
        struct ceph_entity_name peer_name; /* peer name */
        struct ceph_entity_addr peer_addr_for_me;
+       unsigned peer_features;
        u32 connect_seq;      /* identify the most recent connection
                                 attempt for this connection, client */
        u32 peer_global_seq;  /* peer's global seq for this connection */
@@ -158,7 +157,6 @@ struct ceph_connection {
        struct list_head out_queue;
        struct list_head out_sent;   /* sending or sent but unacked */
        u64 out_seq;                 /* last message queued for send */
-       u64 out_seq_sent;            /* last message sent */
        bool out_keepalive_pending;
 
        u64 in_seq, in_seq_acked;  /* last message received, acked */
@@ -234,9 +232,7 @@ extern void ceph_con_keepalive(struct ceph_connection *con);
 extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
 extern void ceph_con_put(struct ceph_connection *con);
 
-extern struct ceph_msg *ceph_msg_new(int type, int front_len,
-                                    int page_len, int page_off,
-                                    struct page **pages);
+extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
 extern void ceph_msg_kfree(struct ceph_msg *m);
 
 
index 8fdc011..f6510a4 100644 (file)
@@ -28,7 +28,7 @@
  * resend any outstanding requests.
  */
 
-const static struct ceph_connection_operations mon_con_ops;
+static const struct ceph_connection_operations mon_con_ops;
 
 static int __validate_auth(struct ceph_mon_client *monc);
 
@@ -104,6 +104,7 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
        monc->pending_auth = 1;
        monc->m_auth->front.iov_len = len;
        monc->m_auth->hdr.front_len = cpu_to_le32(len);
+       ceph_con_revoke(monc->con, monc->m_auth);
        ceph_msg_get(monc->m_auth);  /* keep our ref */
        ceph_con_send(monc->con, monc->m_auth);
 }
@@ -187,16 +188,12 @@ static void __send_subscribe(struct ceph_mon_client *monc)
             monc->want_next_osdmap);
        if ((__sub_expired(monc) && !monc->sub_sent) ||
            monc->want_next_osdmap == 1) {
-               struct ceph_msg *msg;
+               struct ceph_msg *msg = monc->m_subscribe;
                struct ceph_mon_subscribe_item *i;
                void *p, *end;
 
-               msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
-               if (!msg)
-                       return;
-
                p = msg->front.iov_base;
-               end = p + msg->front.iov_len;
+               end = p + msg->front_max;
 
                dout("__send_subscribe to 'mdsmap' %u+\n",
                     (unsigned)monc->have_mdsmap);
@@ -226,7 +223,8 @@ static void __send_subscribe(struct ceph_mon_client *monc)
 
                msg->front.iov_len = p - msg->front.iov_base;
                msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
-               ceph_con_send(monc->con, msg);
+               ceph_con_revoke(monc->con, msg);
+               ceph_con_send(monc->con, ceph_msg_get(msg));
 
                monc->sub_sent = jiffies | 1;  /* never 0 */
        }
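
__send_subscribe above now reuses one preallocated message instead of allocating per send: it revokes any copy still queued on the connection, rewrites the payload in place, and queues a fresh reference. The revoke-then-send shape, as a toy refcounted model (the helpers are stubs that just trace, not the kernel messenger):

#include <stdio.h>

struct msg { int refs; };

static struct msg *msg_get(struct msg *m) { m->refs++; return m; }

static void con_revoke(struct msg *m)
{
        printf("revoke stale copy (refs=%d)\n", m->refs);
}

static void con_send(struct msg *m)
{
        printf("send (refs=%d)\n", m->refs);
        m->refs--;                      /* transmit done: drop the ref */
}

static void resend_subscribe(struct msg *m)
{
        con_revoke(m);                  /* pull back any queued copy */
        /* ... rewrite the payload in place ... */
        con_send(msg_get(m));           /* connection consumes one ref */
}

int main(void)
{
        struct msg subscribe = { 1 };   /* client's own reference */
        resend_subscribe(&subscribe);
        resend_subscribe(&subscribe);   /* safe to resend any time */
        return 0;
}
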
@@ -353,14 +351,14 @@ out:
 /*
  * statfs
  */
-static struct ceph_mon_statfs_request *__lookup_statfs(
+static struct ceph_mon_generic_request *__lookup_generic_req(
        struct ceph_mon_client *monc, u64 tid)
 {
-       struct ceph_mon_statfs_request *req;
-       struct rb_node *n = monc->statfs_request_tree.rb_node;
+       struct ceph_mon_generic_request *req;
+       struct rb_node *n = monc->generic_request_tree.rb_node;
 
        while (n) {
-               req = rb_entry(n, struct ceph_mon_statfs_request, node);
+               req = rb_entry(n, struct ceph_mon_generic_request, node);
                if (tid < req->tid)
                        n = n->rb_left;
                else if (tid > req->tid)
@@ -371,16 +369,16 @@ static struct ceph_mon_statfs_request *__lookup_statfs(
        return NULL;
 }
 
-static void __insert_statfs(struct ceph_mon_client *monc,
-                           struct ceph_mon_statfs_request *new)
+static void __insert_generic_request(struct ceph_mon_client *monc,
+                           struct ceph_mon_generic_request *new)
 {
-       struct rb_node **p = &monc->statfs_request_tree.rb_node;
+       struct rb_node **p = &monc->generic_request_tree.rb_node;
        struct rb_node *parent = NULL;
-       struct ceph_mon_statfs_request *req = NULL;
+       struct ceph_mon_generic_request *req = NULL;
 
        while (*p) {
                parent = *p;
-               req = rb_entry(parent, struct ceph_mon_statfs_request, node);
+               req = rb_entry(parent, struct ceph_mon_generic_request, node);
                if (new->tid < req->tid)
                        p = &(*p)->rb_left;
                else if (new->tid > req->tid)
@@ -390,113 +388,157 @@ static void __insert_statfs(struct ceph_mon_client *monc,
        }
 
        rb_link_node(&new->node, parent, p);
-       rb_insert_color(&new->node, &monc->statfs_request_tree);
+       rb_insert_color(&new->node, &monc->generic_request_tree);
+}
+
+static void release_generic_request(struct kref *kref)
+{
+       struct ceph_mon_generic_request *req =
+               container_of(kref, struct ceph_mon_generic_request, kref);
+
+       if (req->reply)
+               ceph_msg_put(req->reply);
+       if (req->request)
+               ceph_msg_put(req->request);
+}
+
+static void put_generic_request(struct ceph_mon_generic_request *req)
+{
+       kref_put(&req->kref, release_generic_request);
+}
+
+static void get_generic_request(struct ceph_mon_generic_request *req)
+{
+       kref_get(&req->kref);
+}
+
+static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
+                                        struct ceph_msg_header *hdr,
+                                        int *skip)
+{
+       struct ceph_mon_client *monc = con->private;
+       struct ceph_mon_generic_request *req;
+       u64 tid = le64_to_cpu(hdr->tid);
+       struct ceph_msg *m;
+
+       mutex_lock(&monc->mutex);
+       req = __lookup_generic_req(monc, tid);
+       if (!req) {
+               dout("get_generic_reply %lld dne\n", tid);
+               *skip = 1;
+               m = NULL;
+       } else {
+               dout("get_generic_reply %lld got %p\n", tid, req->reply);
+               m = ceph_msg_get(req->reply);
+               /*
+                * we don't need to track the connection reading into
+                * this reply because we only have one open connection
+                * at a time, ever.
+                */
+       }
+       mutex_unlock(&monc->mutex);
+       return m;
 }
 
 static void handle_statfs_reply(struct ceph_mon_client *monc,
                                struct ceph_msg *msg)
 {
-       struct ceph_mon_statfs_request *req;
+       struct ceph_mon_generic_request *req;
        struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
-       u64 tid;
+       u64 tid = le64_to_cpu(msg->hdr.tid);
 
        if (msg->front.iov_len != sizeof(*reply))
                goto bad;
-       tid = le64_to_cpu(msg->hdr.tid);
        dout("handle_statfs_reply %p tid %llu\n", msg, tid);
 
        mutex_lock(&monc->mutex);
-       req = __lookup_statfs(monc, tid);
+       req = __lookup_generic_req(monc, tid);
        if (req) {
-               *req->buf = reply->st;
+               *(struct ceph_statfs *)req->buf = reply->st;
                req->result = 0;
+               get_generic_request(req);
        }
        mutex_unlock(&monc->mutex);
-       if (req)
+       if (req) {
                complete(&req->completion);
+               put_generic_request(req);
+       }
        return;
 
 bad:
-       pr_err("corrupt statfs reply, no tid\n");
+       pr_err("corrupt generic reply, no tid\n");
        ceph_msg_dump(msg);
 }
 
 /*
- * (re)send a statfs request
+ * Do a synchronous statfs().
  */
-static int send_statfs(struct ceph_mon_client *monc,
-                      struct ceph_mon_statfs_request *req)
+int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
 {
-       struct ceph_msg *msg;
+       struct ceph_mon_generic_request *req;
        struct ceph_mon_statfs *h;
+       int err;
 
-       dout("send_statfs tid %llu\n", req->tid);
-       msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL);
-       if (IS_ERR(msg))
-               return PTR_ERR(msg);
-       req->request = msg;
-       msg->hdr.tid = cpu_to_le64(req->tid);
-       h = msg->front.iov_base;
+       req = kzalloc(sizeof(*req), GFP_NOFS);
+       if (!req)
+               return -ENOMEM;
+
+       kref_init(&req->kref);
+       req->buf = buf;
+       init_completion(&req->completion);
+
+       err = -ENOMEM;
+       req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
+       if (!req->request)
+               goto out;
+       req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
+       if (!req->reply)
+               goto out;
+
+       /* fill out request */
+       h = req->request->front.iov_base;
        h->monhdr.have_version = 0;
        h->monhdr.session_mon = cpu_to_le16(-1);
        h->monhdr.session_mon_tid = 0;
        h->fsid = monc->monmap->fsid;
-       ceph_con_send(monc->con, msg);
-       return 0;
-}
-
-/*
- * Do a synchronous statfs().
- */
-int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
-{
-       struct ceph_mon_statfs_request req;
-       int err;
-
-       req.buf = buf;
-       init_completion(&req.completion);
-
-       /* allocate memory for reply */
-       err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
-       if (err)
-               return err;
 
        /* register request */
        mutex_lock(&monc->mutex);
-       req.tid = ++monc->last_tid;
-       req.last_attempt = jiffies;
-       req.delay = BASE_DELAY_INTERVAL;
-       __insert_statfs(monc, &req);
-       monc->num_statfs_requests++;
+       req->tid = ++monc->last_tid;
+       req->request->hdr.tid = cpu_to_le64(req->tid);
+       __insert_generic_request(monc, req);
+       monc->num_generic_requests++;
        mutex_unlock(&monc->mutex);
 
        /* send request and wait */
-       err = send_statfs(monc, &req);
-       if (!err)
-               err = wait_for_completion_interruptible(&req.completion);
+       ceph_con_send(monc->con, ceph_msg_get(req->request));
+       err = wait_for_completion_interruptible(&req->completion);
 
        mutex_lock(&monc->mutex);
-       rb_erase(&req.node, &monc->statfs_request_tree);
-       monc->num_statfs_requests--;
-       ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
+       rb_erase(&req->node, &monc->generic_request_tree);
+       monc->num_generic_requests--;
        mutex_unlock(&monc->mutex);
 
        if (!err)
-               err = req.result;
+               err = req->result;
+
+out:
+       kref_put(&req->kref, release_generic_request);
        return err;
 }
 
 /*
  * Resend pending statfs requests.
  */
-static void __resend_statfs(struct ceph_mon_client *monc)
+static void __resend_generic_request(struct ceph_mon_client *monc)
 {
-       struct ceph_mon_statfs_request *req;
+       struct ceph_mon_generic_request *req;
        struct rb_node *p;
 
-       for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) {
-               req = rb_entry(p, struct ceph_mon_statfs_request, node);
-               send_statfs(monc, req);
+       for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
+               req = rb_entry(p, struct ceph_mon_generic_request, node);
+               ceph_con_revoke(monc->con, req->request);
+               ceph_con_send(monc->con, ceph_msg_get(req->request));
        }
 }
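
ceph_monc_do_statfs above is the template for any synchronous monitor request: register by tid, preallocate the reply, send, and sleep on a completion that the reply handler fires, with the kref keeping the request alive if the waiter bails out first. The wait/complete core, modelled in userspace with a condition variable (names hypothetical; the rbtree and kref parts are omitted):

#include <pthread.h>

struct generic_req {
        unsigned long tid;              /* like req->tid */
        int result;
        int done;
        pthread_mutex_t lock;
        pthread_cond_t cond;            /* like the completion */
};

/* reply handler: record the result and wake the waiter */
static void req_complete(struct generic_req *req, int result)
{
        pthread_mutex_lock(&req->lock);
        req->result = result;
        req->done = 1;
        pthread_cond_signal(&req->cond);
        pthread_mutex_unlock(&req->lock);
}

/* caller: block until the reply handler has run */
static int req_wait(struct generic_req *req)
{
        pthread_mutex_lock(&req->lock);
        while (!req->done)
                pthread_cond_wait(&req->cond, &req->lock);
        pthread_mutex_unlock(&req->lock);
        return req->result;
}

int main(void)
{
        struct generic_req req = {
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .cond = PTHREAD_COND_INITIALIZER,
        };
        req_complete(&req, 0);          /* as if the reply arrived */
        return req_wait(&req);          /* returns immediately */
}
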
 
@@ -586,26 +628,26 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
                CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
                CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
 
-       /* msg pools */
-       err = ceph_msgpool_init(&monc->msgpool_subscribe_ack,
-                              sizeof(struct ceph_mon_subscribe_ack), 1, false);
-       if (err < 0)
+       /* msgs */
+       err = -ENOMEM;
+       monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
+                                    sizeof(struct ceph_mon_subscribe_ack),
+                                    GFP_NOFS);
+       if (!monc->m_subscribe_ack)
                goto out_monmap;
-       err = ceph_msgpool_init(&monc->msgpool_statfs_reply,
-                               sizeof(struct ceph_mon_statfs_reply), 0, false);
-       if (err < 0)
-               goto out_pool1;
-       err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false);
-       if (err < 0)
-               goto out_pool2;
-
-       monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL);
+
+       monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
+       if (!monc->m_subscribe)
+               goto out_subscribe_ack;
+
+       monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
+       if (!monc->m_auth_reply)
+               goto out_subscribe;
+
+       monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
        monc->pending_auth = 0;
-       if (IS_ERR(monc->m_auth)) {
-               err = PTR_ERR(monc->m_auth);
-               monc->m_auth = NULL;
-               goto out_pool3;
-       }
+       if (!monc->m_auth)
+               goto out_auth_reply;
 
        monc->cur_mon = -1;
        monc->hunting = true;
@@ -613,8 +655,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
        monc->sub_sent = 0;
 
        INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
-       monc->statfs_request_tree = RB_ROOT;
-       monc->num_statfs_requests = 0;
+       monc->generic_request_tree = RB_ROOT;
+       monc->num_generic_requests = 0;
        monc->last_tid = 0;
 
        monc->have_mdsmap = 0;
@@ -622,12 +664,12 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
        monc->want_next_osdmap = 1;
        return 0;
 
-out_pool3:
-       ceph_msgpool_destroy(&monc->msgpool_auth_reply);
-out_pool2:
-       ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
-out_pool1:
-       ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
+out_auth_reply:
+       ceph_msg_put(monc->m_auth_reply);
+out_subscribe:
+       ceph_msg_put(monc->m_subscribe);
+out_subscribe_ack:
+       ceph_msg_put(monc->m_subscribe_ack);
 out_monmap:
        kfree(monc->monmap);
 out:
@@ -651,9 +693,9 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
        ceph_auth_destroy(monc->auth);
 
        ceph_msg_put(monc->m_auth);
-       ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
-       ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
-       ceph_msgpool_destroy(&monc->msgpool_auth_reply);
+       ceph_msg_put(monc->m_auth_reply);
+       ceph_msg_put(monc->m_subscribe);
+       ceph_msg_put(monc->m_subscribe_ack);
 
        kfree(monc->monmap);
 }
@@ -681,7 +723,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
                monc->client->msgr->inst.name.num = monc->auth->global_id;
 
                __send_subscribe(monc);
-               __resend_statfs(monc);
+               __resend_generic_request(monc);
        }
        mutex_unlock(&monc->mutex);
 }
@@ -770,18 +812,17 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
 
        switch (type) {
        case CEPH_MSG_MON_SUBSCRIBE_ACK:
-               m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len);
+               m = ceph_msg_get(monc->m_subscribe_ack);
                break;
        case CEPH_MSG_STATFS_REPLY:
-               m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len);
-               break;
+               return get_generic_reply(con, hdr, skip);
        case CEPH_MSG_AUTH_REPLY:
-               m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len);
+               m = ceph_msg_get(monc->m_auth_reply);
                break;
        case CEPH_MSG_MON_MAP:
        case CEPH_MSG_MDS_MAP:
        case CEPH_MSG_OSD_MAP:
-               m = ceph_msg_new(type, front_len, 0, 0, NULL);
+               m = ceph_msg_new(type, front_len, GFP_NOFS);
                break;
        }
 
@@ -826,7 +867,7 @@ out:
        mutex_unlock(&monc->mutex);
 }
 
-const static struct ceph_connection_operations mon_con_ops = {
+static const struct ceph_connection_operations mon_con_ops = {
        .get = ceph_con_get,
        .put = ceph_con_put,
        .dispatch = dispatch,
index b958ad5..174d794 100644 (file)
@@ -2,10 +2,10 @@
 #define _FS_CEPH_MON_CLIENT_H
 
 #include <linux/completion.h>
+#include <linux/kref.h>
 #include <linux/rbtree.h>
 
 #include "messenger.h"
-#include "msgpool.h"
 
 struct ceph_client;
 struct ceph_mount_args;
@@ -22,7 +22,7 @@ struct ceph_monmap {
 };
 
 struct ceph_mon_client;
-struct ceph_mon_statfs_request;
+struct ceph_mon_generic_request;
 
 
 /*
@@ -40,17 +40,19 @@ struct ceph_mon_request {
 };
 
 /*
- * statfs() is done a bit differently because we need to get data back
+ * ceph_mon_generic_request is being used for the statfs and poolop requests
+ * which are being done a bit differently because we need to get data back
  * to the caller
  */
-struct ceph_mon_statfs_request {
+struct ceph_mon_generic_request {
+       struct kref kref;
        u64 tid;
        struct rb_node node;
        int result;
-       struct ceph_statfs *buf;
+       void *buf;
        struct completion completion;
-       unsigned long last_attempt, delay; /* jiffies */
        struct ceph_msg *request;  /* original request */
+       struct ceph_msg *reply;    /* and reply */
 };
 
 struct ceph_mon_client {
@@ -61,7 +63,7 @@ struct ceph_mon_client {
        struct delayed_work delayed_work;
 
        struct ceph_auth_client *auth;
-       struct ceph_msg *m_auth;
+       struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
        int pending_auth;
 
        bool hunting;
@@ -70,14 +72,9 @@ struct ceph_mon_client {
        struct ceph_connection *con;
        bool have_fsid;
 
-       /* msg pools */
-       struct ceph_msgpool msgpool_subscribe_ack;
-       struct ceph_msgpool msgpool_statfs_reply;
-       struct ceph_msgpool msgpool_auth_reply;
-
-       /* pending statfs requests */
-       struct rb_root statfs_request_tree;
-       int num_statfs_requests;
+       /* pending generic requests */
+       struct rb_root generic_request_tree;
+       int num_generic_requests;
        u64 last_tid;
 
        /* mds/osd map */
index ca3b44a..dd65a64 100644 (file)
 
 #include "msgpool.h"
 
-/*
- * We use msg pools to preallocate memory for messages we expect to
- * receive over the wire, to avoid getting ourselves into OOM
- * conditions at unexpected times.  We take use a few different
- * strategies:
- *
- *  - for request/response type interactions, we preallocate the
- * memory needed for the response when we generate the request.
- *
- *  - for messages we can receive at any time from the MDS, we preallocate
- * a pool of messages we can re-use.
- *
- *  - for writeback, we preallocate some number of messages to use for
- * requests and their replies, so that we always make forward
- * progress.
- *
- * The msgpool behaves like a mempool_t, but keeps preallocated
- * ceph_msgs strung together on a list_head instead of using a pointer
- * vector.  This avoids vector reallocation when we adjust the number
- * of preallocated items (which happens frequently).
- */
+static void *alloc_fn(gfp_t gfp_mask, void *arg)
+{
+       struct ceph_msgpool *pool = arg;
+       void *p;
 
+       p = ceph_msg_new(0, pool->front_len, gfp_mask);
+       if (!p)
+               pr_err("msgpool %s alloc failed\n", pool->name);
+       return p;
+}
 
-/*
- * Allocate or release as necessary to meet our target pool size.
- */
-static int __fill_msgpool(struct ceph_msgpool *pool)
+static void free_fn(void *element, void *arg)
 {
-       struct ceph_msg *msg;
-
-       while (pool->num < pool->min) {
-               dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
-                    pool->min);
-               spin_unlock(&pool->lock);
-               msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
-               spin_lock(&pool->lock);
-               if (IS_ERR(msg))
-                       return PTR_ERR(msg);
-               msg->pool = pool;
-               list_add(&msg->list_head, &pool->msgs);
-               pool->num++;
-       }
-       while (pool->num > pool->min) {
-               msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
-               dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
-                    pool->min, msg);
-               list_del_init(&msg->list_head);
-               pool->num--;
-               ceph_msg_kfree(msg);
-       }
-       return 0;
+       ceph_msg_put(element);
 }
 
 int ceph_msgpool_init(struct ceph_msgpool *pool,
-                     int front_len, int min, bool blocking)
+                     int front_len, int size, bool blocking, const char *name)
 {
-       int ret;
-
-       dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
-       spin_lock_init(&pool->lock);
        pool->front_len = front_len;
-       INIT_LIST_HEAD(&pool->msgs);
-       pool->num = 0;
-       pool->min = min;
-       pool->blocking = blocking;
-       init_waitqueue_head(&pool->wait);
-
-       spin_lock(&pool->lock);
-       ret = __fill_msgpool(pool);
-       spin_unlock(&pool->lock);
-       return ret;
+       pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
+       if (!pool->pool)
+               return -ENOMEM;
+       pool->name = name;
+       return 0;
 }
 
 void ceph_msgpool_destroy(struct ceph_msgpool *pool)
 {
-       dout("msgpool_destroy %p\n", pool);
-       spin_lock(&pool->lock);
-       pool->min = 0;
-       __fill_msgpool(pool);
-       spin_unlock(&pool->lock);
+       mempool_destroy(pool->pool);
 }
 
-int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta)
+struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
+                                 int front_len)
 {
-       int ret;
-
-       spin_lock(&pool->lock);
-       dout("msgpool_resv %p delta %d\n", pool, delta);
-       pool->min += delta;
-       ret = __fill_msgpool(pool);
-       spin_unlock(&pool->lock);
-       return ret;
-}
-
-struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
-{
-       wait_queue_t wait;
-       struct ceph_msg *msg;
-
-       if (front_len && front_len > pool->front_len) {
-               pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
-                      pool, front_len, pool->front_len);
+       if (front_len > pool->front_len) {
+               pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
+                      pool->name, front_len, pool->front_len);
                WARN_ON(1);
 
                /* try to alloc a fresh message */
-               msg = ceph_msg_new(0, front_len, 0, 0, NULL);
-               if (!IS_ERR(msg))
-                       return msg;
-       }
-
-       if (!front_len)
-               front_len = pool->front_len;
-
-       if (pool->blocking) {
-               /* mempool_t behavior; first try to alloc */
-               msg = ceph_msg_new(0, front_len, 0, 0, NULL);
-               if (!IS_ERR(msg))
-                       return msg;
+               return ceph_msg_new(0, front_len, GFP_NOFS);
        }
 
-       while (1) {
-               spin_lock(&pool->lock);
-               if (likely(pool->num)) {
-                       msg = list_entry(pool->msgs.next, struct ceph_msg,
-                                        list_head);
-                       list_del_init(&msg->list_head);
-                       pool->num--;
-                       dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
-                            pool->num, pool->min);
-                       spin_unlock(&pool->lock);
-                       return msg;
-               }
-               pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
-                      pool->min, pool->blocking ? "waiting" : "may fail");
-               spin_unlock(&pool->lock);
-
-               if (!pool->blocking) {
-                       WARN_ON(1);
-
-                       /* maybe we can allocate it now? */
-                       msg = ceph_msg_new(0, front_len, 0, 0, NULL);
-                       if (!IS_ERR(msg))
-                               return msg;
-
-                       pr_err("msgpool_get %p empty + alloc failed\n", pool);
-                       return ERR_PTR(-ENOMEM);
-               }
-
-               init_wait(&wait);
-               prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
-               schedule();
-               finish_wait(&pool->wait, &wait);
-       }
+       return mempool_alloc(pool->pool, GFP_NOFS);
 }
 
 void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
 {
-       spin_lock(&pool->lock);
-       if (pool->num < pool->min) {
-               /* reset msg front_len; user may have changed it */
-               msg->front.iov_len = pool->front_len;
-               msg->hdr.front_len = cpu_to_le32(pool->front_len);
+       /* reset msg front_len; user may have changed it */
+       msg->front.iov_len = pool->front_len;
+       msg->hdr.front_len = cpu_to_le32(pool->front_len);
 
-               kref_set(&msg->kref, 1);  /* retake a single ref */
-               list_add(&msg->list_head, &pool->msgs);
-               pool->num++;
-               dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
-                    pool->num, pool->min);
-               spin_unlock(&pool->lock);
-               wake_up(&pool->wait);
-       } else {
-               dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
-                    pool->num, pool->min);
-               spin_unlock(&pool->lock);
-               ceph_msg_kfree(msg);
-       }
+       kref_init(&msg->kref);  /* retake single ref */
 }
index bc834bf..a362605 100644 (file)
@@ -1,6 +1,7 @@
 #ifndef _FS_CEPH_MSGPOOL
 #define _FS_CEPH_MSGPOOL
 
+#include <linux/mempool.h>
 #include "messenger.h"
 
 /*
@@ -8,18 +9,15 @@
  * avoid unexpected OOM conditions.
  */
 struct ceph_msgpool {
-       spinlock_t lock;
+       const char *name;
+       mempool_t *pool;
        int front_len;          /* preallocated payload size */
-       struct list_head msgs;  /* msgs in the pool; each has 1 ref */
-       int num, min;           /* cur, min # msgs in the pool */
-       bool blocking;
-       wait_queue_head_t wait;
 };
 
 extern int ceph_msgpool_init(struct ceph_msgpool *pool,
-                            int front_len, int size, bool blocking);
+                            int front_len, int size, bool blocking,
+                            const char *name);
 extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
-extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
 extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
                                         int front_len);
 extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
index 8aaab41..892a029 100644 (file)
@@ -50,7 +50,6 @@ struct ceph_entity_name {
 #define CEPH_ENTITY_TYPE_MDS    0x02
 #define CEPH_ENTITY_TYPE_OSD    0x04
 #define CEPH_ENTITY_TYPE_CLIENT 0x08
-#define CEPH_ENTITY_TYPE_ADMIN  0x10
 #define CEPH_ENTITY_TYPE_AUTH   0x20
 
 #define CEPH_ENTITY_TYPE_ANY    0xFF
@@ -120,7 +119,7 @@ struct ceph_msg_connect_reply {
 /*
  * message header
  */
-struct ceph_msg_header {
+struct ceph_msg_header_old {
        __le64 seq;       /* message seq# for this session */
        __le64 tid;       /* transaction id */
        __le16 type;      /* message type */
@@ -138,6 +137,24 @@ struct ceph_msg_header {
        __le32 crc;       /* header crc32c */
 } __attribute__ ((packed));
 
+struct ceph_msg_header {
+       __le64 seq;       /* message seq# for this session */
+       __le64 tid;       /* transaction id */
+       __le16 type;      /* message type */
+       __le16 priority;  /* priority.  higher value == higher priority */
+       __le16 version;   /* version of message encoding */
+
+       __le32 front_len; /* bytes in main payload */
+       __le32 middle_len;/* bytes in middle payload */
+       __le32 data_len;  /* bytes of data payload */
+       __le16 data_off;  /* sender: include full offset;
+                            receiver: mask against ~PAGE_MASK */
+
+       struct ceph_entity_name src;
+       __le32 reserved;
+       __le32 crc;       /* header crc32c */
+} __attribute__ ((packed));
+
 #define CEPH_MSG_PRIO_LOW     64
 #define CEPH_MSG_PRIO_DEFAULT 127
 #define CEPH_MSG_PRIO_HIGH    196
index 3514f71..afa7bb3 100644 (file)
@@ -16,7 +16,7 @@
 #define OSD_OP_FRONT_LEN       4096
 #define OSD_OPREPLY_FRONT_LEN  512
 
-const static struct ceph_connection_operations osd_con_ops;
+static const struct ceph_connection_operations osd_con_ops;
 static int __kick_requests(struct ceph_osd_client *osdc,
                          struct ceph_osd *kickosd);
 
@@ -147,7 +147,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
                req = kzalloc(sizeof(*req), GFP_NOFS);
        }
        if (req == NULL)
-               return ERR_PTR(-ENOMEM);
+               return NULL;
 
        req->r_osdc = osdc;
        req->r_mempool = use_mempool;
@@ -164,10 +164,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
                msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
        else
                msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
-                                  OSD_OPREPLY_FRONT_LEN, 0, 0, NULL);
-       if (IS_ERR(msg)) {
+                                  OSD_OPREPLY_FRONT_LEN, GFP_NOFS);
+       if (!msg) {
                ceph_osdc_put_request(req);
-               return ERR_PTR(PTR_ERR(msg));
+               return NULL;
        }
        req->r_reply = msg;
 
@@ -178,10 +178,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
        if (use_mempool)
                msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
        else
-               msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL);
-       if (IS_ERR(msg)) {
+               msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS);
+       if (!msg) {
                ceph_osdc_put_request(req);
-               return ERR_PTR(PTR_ERR(msg));
+               return NULL;
        }
        msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
        memset(msg->front.iov_base, 0, msg->front.iov_len);
@@ -715,7 +715,7 @@ static void handle_timeout(struct work_struct *work)
         * should mark the osd as failed and we should find out about
         * it from an updated osd map.
         */
-       while (!list_empty(&osdc->req_lru)) {
+       while (timeout && !list_empty(&osdc->req_lru)) {
                req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
                                 r_req_lru_item);
 
@@ -1078,6 +1078,7 @@ done:
        if (newmap)
                kick_requests(osdc, NULL);
        up_read(&osdc->map_sem);
+       wake_up(&osdc->client->auth_wq);
        return;
 
 bad:
@@ -1087,45 +1088,6 @@ bad:
        return;
 }
 
-
-/*
- * A read request prepares specific pages that data is to be read into.
- * When a message is being read off the wire, we call prepare_pages to
- * find those pages.
- *  0 = success, -1 failure.
- */
-static int __prepare_pages(struct ceph_connection *con,
-                        struct ceph_msg_header *hdr,
-                        struct ceph_osd_request *req,
-                        u64 tid,
-                        struct ceph_msg *m)
-{
-       struct ceph_osd *osd = con->private;
-       struct ceph_osd_client *osdc;
-       int ret = -1;
-       int data_len = le32_to_cpu(hdr->data_len);
-       unsigned data_off = le16_to_cpu(hdr->data_off);
-
-       int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
-
-       if (!osd)
-               return -1;
-
-       osdc = osd->o_osdc;
-
-       dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
-            tid, req->r_num_pages, want);
-       if (unlikely(req->r_num_pages < want))
-               goto out;
-       m->pages = req->r_pages;
-       m->nr_pages = req->r_num_pages;
-       ret = 0; /* success */
-out:
-       BUG_ON(ret < 0 || m->nr_pages < want);
-
-       return ret;
-}
-
 /*
  * Register request, send initial attempt.
  */
@@ -1252,11 +1214,13 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
        if (!osdc->req_mempool)
                goto out;
 
-       err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true);
+       err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
+                               "osd_op");
        if (err < 0)
                goto out_mempool;
        err = ceph_msgpool_init(&osdc->msgpool_op_reply,
-                               OSD_OPREPLY_FRONT_LEN, 10, true);
+                               OSD_OPREPLY_FRONT_LEN, 10, true,
+                               "osd_op_reply");
        if (err < 0)
                goto out_msgpool;
        return 0;
@@ -1302,8 +1266,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
                                    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
                                    NULL, 0, truncate_seq, truncate_size, NULL,
                                    false, 1);
-       if (IS_ERR(req))
-               return PTR_ERR(req);
+       if (!req)
+               return -ENOMEM;
 
        /* it may be a short read due to an object boundary */
        req->r_pages = pages;
@@ -1345,8 +1309,8 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
                                    snapc, do_sync,
                                    truncate_seq, truncate_size, mtime,
                                    nofail, 1);
-       if (IS_ERR(req))
-               return PTR_ERR(req);
+       if (!req)
+               return -ENOMEM;
 
        /* it may be a short write due to an object boundary */
        req->r_pages = pages;
@@ -1394,7 +1358,8 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
 }
 
 /*
- * lookup and return message for incoming reply
+ * lookup and return message for incoming reply.  set up reply message
+ * pages.
  */
 static struct ceph_msg *get_reply(struct ceph_connection *con,
                                  struct ceph_msg_header *hdr,
@@ -1407,7 +1372,6 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
        int front = le32_to_cpu(hdr->front_len);
        int data_len = le32_to_cpu(hdr->data_len);
        u64 tid;
-       int err;
 
        tid = le64_to_cpu(hdr->tid);
        mutex_lock(&osdc->request_mutex);
@@ -1425,13 +1389,14 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
                     req->r_reply, req->r_con_filling_msg);
                ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
                ceph_con_put(req->r_con_filling_msg);
+               req->r_con_filling_msg = NULL;
        }
 
        if (front > req->r_reply->front.iov_len) {
                pr_warning("get_reply front %d > preallocated %d\n",
                           front, (int)req->r_reply->front.iov_len);
-               m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL);
-               if (IS_ERR(m))
+               m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
+               if (!m)
                        goto out;
                ceph_msg_put(req->r_reply);
                req->r_reply = m;
@@ -1439,12 +1404,19 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
        m = ceph_msg_get(req->r_reply);
 
        if (data_len > 0) {
-               err = __prepare_pages(con, hdr, req, tid, m);
-               if (err < 0) {
+               unsigned data_off = le16_to_cpu(hdr->data_off);
+               int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
+
+               if (unlikely(req->r_num_pages < want)) {
+                       pr_warning("tid %lld reply %d > expected %d pages\n",
+                                  tid, want, req->r_num_pages);
                        *skip = 1;
                        ceph_msg_put(m);
-                       m = ERR_PTR(err);
+                       m = NULL;
+                       goto out;
                }
+               m->pages = req->r_pages;
+               m->nr_pages = req->r_num_pages;
        }
        *skip = 0;
        req->r_con_filling_msg = ceph_con_get(con);
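The page arithmetic here deserves a note: data_off & ~PAGE_MASK reduces the wire offset to an offset within a page, and calc_pages_for() rounds the (offset, length) pair up to whole pages. A worked example, assuming 4k pages and an offset already reduced below PAGE_SIZE:

        /*
         * For off < PAGE_SIZE, calc_pages_for(off, len) is effectively
         *     (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT
         * e.g. a 10000-byte reply landing 100 bytes into its first page:
         *     (100 + 10000 + 4095) >> 12 == 3 pages wanted
         * If the request preallocated fewer pages than that, the reply
         * cannot be read into place, so it is skipped rather than overrun.
         */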
@@ -1466,7 +1438,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
 
        switch (type) {
        case CEPH_MSG_OSD_MAP:
-               return ceph_msg_new(type, front, 0, 0, NULL);
+               return ceph_msg_new(type, front, GFP_NOFS);
        case CEPH_MSG_OSD_OPREPLY:
                return get_reply(con, hdr, skip);
        default:
@@ -1552,7 +1524,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
        return ceph_monc_validate_auth(&osdc->client->monc);
 }
 
-const static struct ceph_connection_operations osd_con_ops = {
+static const struct ceph_connection_operations osd_con_ops = {
        .get = get_osd_con,
        .put = put_osd_con,
        .dispatch = dispatch,
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
index 5f8dbf7..b6859f4 100644
@@ -20,7 +20,7 @@ int ceph_pagelist_release(struct ceph_pagelist *pl)
 
 static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
 {
-       struct page *page = alloc_page(GFP_NOFS);
+       struct page *page = __page_cache_alloc(GFP_NOFS);
        if (!page)
                return -ENOMEM;
        pl->room += PAGE_SIZE;
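For context on the alloc_page() -> __page_cache_alloc() swap (here and in the xattr code further down): on non-NUMA builds the two are identical, while NUMA builds let __page_cache_alloc() participate in cpuset page-cache memory spreading, so pages destined for cache-like use should come from it. A sketch of the definition from include/linux/pagemap.h of this era:

        #ifdef CONFIG_NUMA
        extern struct page *__page_cache_alloc(gfp_t gfp);
        #else
        static inline struct page *__page_cache_alloc(gfp_t gfp)
        {
                return alloc_page(gfp);
        }
        #endif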
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index fd56451..8fcc023 100644
@@ -101,8 +101,8 @@ struct ceph_pg_pool {
        __le64 snap_seq;          /* seq for per-pool snapshot */
        __le32 snap_epoch;        /* epoch of last snap */
        __le32 num_snaps;
-       __le32 num_removed_snap_intervals;
-       __le64 uid;
+       __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
+       __le64 auid;               /* who owns the pg */
 } __attribute__ ((packed));
 
 /*
@@ -208,6 +208,7 @@ enum {
        /* read */
        CEPH_OSD_OP_GETXATTR  = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
        CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
+       CEPH_OSD_OP_CMPXATTR  = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
 
        /* write */
        CEPH_OSD_OP_SETXATTR  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
@@ -305,6 +306,22 @@ enum {
 #define EOLDSNAPC    ERESTART  /* ORDERSNAP flag set; writer has old snapc */
 #define EBLACKLISTED ESHUTDOWN /* blacklisted */
 
+/* xattr comparison */
+enum {
+       CEPH_OSD_CMPXATTR_OP_NOP = 0,
+       CEPH_OSD_CMPXATTR_OP_EQ  = 1,
+       CEPH_OSD_CMPXATTR_OP_NE  = 2,
+       CEPH_OSD_CMPXATTR_OP_GT  = 3,
+       CEPH_OSD_CMPXATTR_OP_GTE = 4,
+       CEPH_OSD_CMPXATTR_OP_LT  = 5,
+       CEPH_OSD_CMPXATTR_OP_LTE = 6
+};
+
+enum {
+       CEPH_OSD_CMPXATTR_MODE_STRING = 1,
+       CEPH_OSD_CMPXATTR_MODE_U64    = 2
+};
+
 /*
  * an individual object operation.  each may be accompanied by some data
  * payload
@@ -321,6 +338,8 @@ struct ceph_osd_op {
                struct {
                        __le32 name_len;
                        __le32 value_len;
+                       __u8 cmp_op;       /* CEPH_OSD_CMPXATTR_OP_* */
+                       __u8 cmp_mode;     /* CEPH_OSD_CMPXATTR_MODE_* */
                } __attribute__ ((packed)) xattr;
                struct {
                        __u8 class_len;
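A hedged sketch of how the new comparison fields might be filled in for a compare-xattr operation; the 'op' member and exact payload layout are assumptions based on the surrounding union (the attribute name and value travel in the data payload, as with the other xattr ops):

        struct ceph_osd_op op;

        memset(&op, 0, sizeof(op));
        op.op              = cpu_to_le16(CEPH_OSD_OP_CMPXATTR);
        op.xattr.name_len  = cpu_to_le32(strlen("user.version"));
        op.xattr.value_len = cpu_to_le32(strlen("1"));
        op.xattr.cmp_op    = CEPH_OSD_CMPXATTR_OP_EQ;      /* succeed iff equal */
        op.xattr.cmp_mode  = CEPH_OSD_CMPXATTR_MODE_STRING;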
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index d5114db..c0b26b6 100644
@@ -512,7 +512,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
                            struct ceph_cap_snap *capsnap)
 {
        struct inode *inode = &ci->vfs_inode;
-       struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+       struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
 
        BUG_ON(capsnap->writing);
        capsnap->size = inode->i_size;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 9307bbe..7c663d9 100644
@@ -8,14 +8,11 @@
 #include <linux/module.h>
 #include <linux/mount.h>
 #include <linux/parser.h>
-#include <linux/rwsem.h>
 #include <linux/sched.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/statfs.h>
 #include <linux/string.h>
-#include <linux/version.h>
-#include <linux/vmalloc.h>
 
 #include "decode.h"
 #include "super.h"
@@ -107,12 +104,40 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 static int ceph_syncfs(struct super_block *sb, int wait)
 {
        dout("sync_fs %d\n", wait);
-       ceph_osdc_sync(&ceph_client(sb)->osdc);
-       ceph_mdsc_sync(&ceph_client(sb)->mdsc);
+       ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
+       ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
        dout("sync_fs %d done\n", wait);
        return 0;
 }
 
+static int default_congestion_kb(void)
+{
+       int congestion_kb;
+
+       /*
+        * Copied from NFS
+        *
+        * congestion size, scale with available memory.
+        *
+        *  64MB:    8192k
+        * 128MB:   11585k
+        * 256MB:   16384k
+        * 512MB:   23170k
+        *   1GB:   32768k
+        *   2GB:   46340k
+        *   4GB:   65536k
+        *   8GB:   92681k
+        *  16GB:  131072k
+        *
+        * This allows larger machines to have larger/more transfers.
+        * Limit the default to 256M
+        */
+       congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+       if (congestion_kb > 256*1024)
+               congestion_kb = 256*1024;
+
+       return congestion_kb;
+}
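Checking the formula against the table: with 4k pages (PAGE_SHIFT == 12), a 1GB machine has totalram_pages == 262144, so:

        /*
         * int_sqrt(262144) == 512
         * 16 * 512 == 8192
         * 8192 << (PAGE_SHIFT - 10) == 8192 << 2 == 32768k  -- the 1GB row
         * The 256M cap only engages somewhere past 64GB of RAM.
         */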
 
 /**
  * ceph_show_options - Show mount options in /proc/mounts
@@ -138,6 +163,35 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
                seq_puts(m, ",nocrc");
        if (args->flags & CEPH_OPT_NOASYNCREADDIR)
                seq_puts(m, ",noasyncreaddir");
+
+       if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
+               seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
+       if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
+               seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
+       if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
+               seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
+       if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
+               seq_printf(m, ",osdkeepalivetimeout=%d",
+                        args->osd_keepalive_timeout);
+       if (args->wsize)
+               seq_printf(m, ",wsize=%d", args->wsize);
+       if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
+               seq_printf(m, ",rsize=%d", args->rsize);
+       if (args->congestion_kb != default_congestion_kb())
+               seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
+       if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
+               seq_printf(m, ",caps_wanted_delay_min=%d",
+                        args->caps_wanted_delay_min);
+       if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
+               seq_printf(m, ",caps_wanted_delay_max=%d",
+                          args->caps_wanted_delay_max);
+       if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
+               seq_printf(m, ",cap_release_safety=%d",
+                          args->cap_release_safety);
+       if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
+               seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
+       if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
+               seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
        if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
                seq_printf(m, ",snapdirname=%s", args->snapdir_name);
        if (args->name)
@@ -161,35 +215,6 @@ static void ceph_inode_init_once(void *foo)
        inode_init_once(&ci->vfs_inode);
 }
 
-static int default_congestion_kb(void)
-{
-       int congestion_kb;
-
-       /*
-        * Copied from NFS
-        *
-        * congestion size, scale with available memory.
-        *
-        *  64MB:    8192k
-        * 128MB:   11585k
-        * 256MB:   16384k
-        * 512MB:   23170k
-        *   1GB:   32768k
-        *   2GB:   46340k
-        *   4GB:   65536k
-        *   8GB:   92681k
-        *  16GB:  131072k
-        *
-        * This allows larger machines to have larger/more transfers.
-        * Limit the default to 256M
-        */
-       congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
-       if (congestion_kb > 256*1024)
-               congestion_kb = 256*1024;
-
-       return congestion_kb;
-}
-
 static int __init init_caches(void)
 {
        ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
@@ -308,7 +333,9 @@ enum {
        Opt_osd_idle_ttl,
        Opt_caps_wanted_delay_min,
        Opt_caps_wanted_delay_max,
+       Opt_cap_release_safety,
        Opt_readdir_max_entries,
+       Opt_readdir_max_bytes,
        Opt_congestion_kb,
        Opt_last_int,
        /* int args above */
@@ -339,7 +366,9 @@ static match_table_t arg_tokens = {
        {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
        {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
        {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
+       {Opt_cap_release_safety, "cap_release_safety=%d"},
        {Opt_readdir_max_entries, "readdir_max_entries=%d"},
+       {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
        {Opt_congestion_kb, "write_congestion_kb=%d"},
        /* int args above */
        {Opt_snapdirname, "snapdirname=%s"},
@@ -388,8 +417,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
        args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
        args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
        args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
-       args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4;
-       args->max_readdir = 1024;
+       args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
+       args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
+       args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
        args->congestion_kb = default_congestion_kb();
 
        /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
@@ -497,6 +527,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
                case Opt_readdir_max_entries:
                        args->max_readdir = intval;
                        break;
+               case Opt_readdir_max_bytes:
+                       args->max_readdir_bytes = intval;
+                       break;
                case Opt_congestion_kb:
                        args->congestion_kb = intval;
                        break;
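For illustration, the new knob is supplied like any other integer mount option; the address, path, and value below are hypothetical:

        /*
         *   mount -t ceph 192.168.0.1:/ /mnt/ceph -o readdir_max_bytes=65536
         *
         * parse_mount_args() matches "readdir_max_bytes=%d" in arg_tokens
         * above and stores 65536 in args->max_readdir_bytes.
         */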
@@ -682,9 +715,10 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
 /*
  * true if we have the mon map (and have thus joined the cluster)
  */
-static int have_mon_map(struct ceph_client *client)
+static int have_mon_and_osd_map(struct ceph_client *client)
 {
-       return client->monc.monmap && client->monc.monmap->epoch;
+       return client->monc.monmap && client->monc.monmap->epoch &&
+              client->osdc.osdmap && client->osdc.osdmap->epoch;
 }
 
 /*
@@ -762,7 +796,7 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
        if (err < 0)
                goto out;
 
-       while (!have_mon_map(client)) {
+       while (!have_mon_and_osd_map(client)) {
                err = -EIO;
                if (timeout && time_after_eq(jiffies, started + timeout))
                        goto out;
@@ -770,8 +804,8 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
                /* wait */
                dout("mount waiting for mon_map\n");
                err = wait_event_interruptible_timeout(client->auth_wq,
-                              have_mon_map(client) || (client->auth_err < 0),
-                              timeout);
+                      have_mon_and_osd_map(client) || (client->auth_err < 0),
+                      timeout);
                if (err == -EINTR || err == -ERESTARTSYS)
                        goto out;
                if (client->auth_err < 0) {
@@ -884,6 +918,8 @@ static int ceph_compare_super(struct super_block *sb, void *data)
 /*
  * construct our own bdi so we can control readahead, etc.
  */
+static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
+
 static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
 {
        int err;
@@ -893,7 +929,8 @@ static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
                client->backing_dev_info.ra_pages =
                        (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
                        >> PAGE_SHIFT;
-       err = bdi_register_dev(&client->backing_dev_info, sb->s_dev);
+       err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d",
+                          atomic_long_inc_return(&bdi_seq));
        if (!err)
                sb->s_bdi = &client->backing_dev_info;
        return err;
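The effect of the registration change, sketched with illustrative names: each client draws a unique id from the shared counter, so concurrent mounts get distinct BDIs. The rationale given here for dropping bdi_register_dev() is an assumption:

        /*
         *   first mount:  bdi_register(..., "ceph-%d", 1)  ->  "ceph-1"
         *   second mount: bdi_register(..., "ceph-%d", 2)  ->  "ceph-2"
         *
         * bdi_register_dev() derives the name from sb->s_dev, which for a
         * network filesystem is an anonymous device number.
         */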
@@ -932,9 +969,9 @@ static int ceph_get_sb(struct file_system_type *fs_type,
                goto out;
        }
 
-       if (ceph_client(sb) != client) {
+       if (ceph_sb_to_client(sb) != client) {
                ceph_destroy_client(client);
-               client = ceph_client(sb);
+               client = ceph_sb_to_client(sb);
                dout("get_sb got existing client %p\n", client);
        } else {
                dout("get_sb using new client %p\n", client);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 13513b8..3725c9e 100644
 
 struct ceph_mount_args {
        int sb_flags;
+       int flags;
+       struct ceph_fsid fsid;
+       struct ceph_entity_addr my_addr;
        int num_mon;
        struct ceph_entity_addr *mon_addr;
-       int flags;
        int mount_timeout;
        int osd_idle_ttl;
-       int caps_wanted_delay_min, caps_wanted_delay_max;
-       struct ceph_fsid fsid;
-       struct ceph_entity_addr my_addr;
-       int wsize;
-       int rsize;            /* max readahead */
-       int max_readdir;      /* max readdir size */
-       int congestion_kb;      /* max readdir size */
        int osd_timeout;
        int osd_keepalive_timeout;
+       int wsize;
+       int rsize;            /* max readahead */
+       int congestion_kb;    /* max writeback in flight */
+       int caps_wanted_delay_min, caps_wanted_delay_max;
+       int cap_release_safety;
+       int max_readdir;       /* max readdir result (entries) */
+       int max_readdir_bytes; /* max readdir result (bytes) */
        char *snapdir_name;   /* default ".snap" */
        char *name;
        char *secret;
-       int cap_release_safety;
 };
 
 /*
@@ -80,13 +81,14 @@ struct ceph_mount_args {
 #define CEPH_OSD_KEEPALIVE_DEFAULT  5
 #define CEPH_OSD_IDLE_TTL_DEFAULT    60
 #define CEPH_MOUNT_RSIZE_DEFAULT    (512*1024) /* readahead */
+#define CEPH_MAX_READDIR_DEFAULT    1024
+#define CEPH_MAX_READDIR_BYTES_DEFAULT    (512*1024)
 
 #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
 #define CEPH_MSG_MAX_DATA_LEN  (16*1024*1024)
 
 #define CEPH_SNAPDIRNAME_DEFAULT ".snap"
 #define CEPH_AUTH_NAME_DEFAULT   "guest"
-
 /*
  * Delay telling the MDS we no longer want caps, in case we reopen
  * the file.  Delay a minimum amount of time, even if we send a cap
@@ -96,6 +98,7 @@ struct ceph_mount_args {
 #define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT      5  /* cap release delay */
 #define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT     60  /* cap release delay */
 
+#define CEPH_CAP_RELEASE_SAFETY_DEFAULT        (CEPH_CAPS_PER_RELEASE * 4)
 
 /* mount state */
 enum {
@@ -160,12 +163,6 @@ struct ceph_client {
 #endif
 };
 
-static inline struct ceph_client *ceph_client(struct super_block *sb)
-{
-       return sb->s_fs_info;
-}
-
-
 /*
  * File i/o capability.  This tracks shared state with the metadata
  * server that allows us to cache or writeback attributes or to read
@@ -871,6 +868,7 @@ extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 extern void ceph_dentry_lru_add(struct dentry *dn);
 extern void ceph_dentry_lru_touch(struct dentry *dn);
 extern void ceph_dentry_lru_del(struct dentry *dn);
+extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
 
 /*
  * our d_ops vary depending on whether the inode is live,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 2845422..68aeebc 100644
@@ -7,7 +7,8 @@
 
 static bool ceph_is_valid_xattr(const char *name)
 {
-       return !strncmp(name, XATTR_SECURITY_PREFIX,
+       return !strncmp(name, "ceph.", 5) ||
+              !strncmp(name, XATTR_SECURITY_PREFIX,
                        XATTR_SECURITY_PREFIX_LEN) ||
               !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
               !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
@@ -76,14 +77,14 @@ static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
 }
 
 static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
-       { true, "user.ceph.dir.entries", ceph_vxattrcb_entries},
-       { true, "user.ceph.dir.files", ceph_vxattrcb_files},
-       { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs},
-       { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries},
-       { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles},
-       { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
-       { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes},
-       { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime},
+       { true, "ceph.dir.entries", ceph_vxattrcb_entries},
+       { true, "ceph.dir.files", ceph_vxattrcb_files},
+       { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs},
+       { true, "ceph.dir.rentries", ceph_vxattrcb_rentries},
+       { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles},
+       { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
+       { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes},
+       { true, "ceph.dir.rctime", ceph_vxattrcb_rctime},
        { true, NULL, NULL }
 };
 
@@ -107,7 +108,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
 }
 
 static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
-       { true, "user.ceph.layout", ceph_vxattrcb_layout},
+       { true, "ceph.layout", ceph_vxattrcb_layout},
        { NULL, NULL }
 };
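With the prefix change, the virtual attributes read as "ceph.*" rather than "user.ceph.*". A runnable userland sketch; the mount path is hypothetical, getxattr(2) is the standard syscall:

        #include <stdio.h>
        #include <sys/xattr.h>

        int main(void)
        {
                char buf[64];
                ssize_t n = getxattr("/mnt/ceph/somedir", "ceph.dir.rbytes",
                                     buf, sizeof(buf) - 1);
                if (n < 0) {
                        perror("getxattr");
                        return 1;
                }
                buf[n] = '\0';
                printf("recursive bytes: %s\n", buf);  /* was user.ceph.dir.rbytes */
                return 0;
        }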
 
@@ -186,12 +187,6 @@ static int __set_xattr(struct ceph_inode_info *ci,
                ci->i_xattrs.names_size -= xattr->name_len;
                ci->i_xattrs.vals_size -= xattr->val_len;
        }
-       if (!xattr) {
-               pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s=%s\n",
-                      &ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name,
-                      xattr->val);
-               return -ENOMEM;
-       }
        ci->i_xattrs.names_size += name_len;
        ci->i_xattrs.vals_size += val_len;
        if (val)
@@ -574,7 +569,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
             ci->i_xattrs.version, ci->i_xattrs.index_version);
 
        if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
-           (ci->i_xattrs.index_version > ci->i_xattrs.version)) {
+           (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
                goto list_xattr;
        } else {
                spin_unlock(&inode->i_lock);
@@ -622,7 +617,7 @@ out:
 static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
                              const char *value, size_t size, int flags)
 {
-       struct ceph_client *client = ceph_client(dentry->d_sb);
+       struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
        struct inode *inode = dentry->d_inode;
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct inode *parent_inode = dentry->d_parent->d_inode;
@@ -641,7 +636,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
                        return -ENOMEM;
                err = -ENOMEM;
                for (i = 0; i < nr_pages; i++) {
-                       pages[i] = alloc_page(GFP_NOFS);
+                       pages[i] = __page_cache_alloc(GFP_NOFS);
                        if (!pages[i]) {
                                nr_pages = i;
                                goto out;
@@ -779,7 +774,7 @@ out:
 
 static int ceph_send_removexattr(struct dentry *dentry, const char *name)
 {
-       struct ceph_client *client = ceph_client(dentry->d_sb);
+       struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
        struct ceph_mds_client *mdsc = &client->mdsc;
        struct inode *inode = dentry->d_inode;
        struct inode *parent_inode = dentry->d_parent->d_inode;