fs/ceph/super.c

   1
   2 #include <linux/ceph/ceph_debug.h>
   3
   4 #include <linux/backing-dev.h>
   5 #include <linux/ctype.h>
   6 #include <linux/fs.h>
   7 #include <linux/inet.h>
   8 #include <linux/in6.h>
   9 #include <linux/module.h>
  10 #include <linux/mount.h>
  11 #include <linux/parser.h>
  12 #include <linux/sched.h>
  13 #include <linux/seq_file.h>
  14 #include <linux/slab.h>
  15 #include <linux/statfs.h>
  16 #include <linux/string.h>
  17
  18 #include "super.h"
  19 #include "mds_client.h"
  20
  21 #include <linux/ceph/decode.h>
  22 #include <linux/ceph/mon_client.h>
  23 #include <linux/ceph/auth.h>
  24 #include <linux/ceph/debugfs.h>
  25
  26 /*
  27  * Ceph superblock operations
  28  *
  29  * Handle the basics of mounting, unmounting.
  30  */
  31
  32 /*
  33  * super ops
  34  */
  35 static void ceph_put_super(struct super_block *s)
  36 {
  37         struct ceph_fs_client *fsc = ceph_sb_to_client(s);
  38
  39         dout("put_super\n");
  40         ceph_mdsc_close_sessions(fsc->mdsc);
  41
  42         /*
  43          * ensure we release the bdi before put_anon_super releases
  44          * the device name.
  45          */
  46         if (s->s_bdi == &fsc->backing_dev_info) {
  47                 bdi_unregister(&fsc->backing_dev_info);
  48                 s->s_bdi = NULL;
  49         }
  50
  51         return;
  52 }
  53
  54 static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
  55 {
  56         struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode);
  57         struct ceph_monmap *monmap = fsc->client->monc.monmap;
  58         struct ceph_statfs st;
  59         u64 fsid;
  60         int err;
  61
  62         dout("statfs\n");
  63         err = ceph_monc_do_statfs(&fsc->client->monc, &st);
  64         if (err < 0)
  65                 return err;
  66
  67         /* fill in kstatfs */
  68         buf->f_type = CEPH_SUPER_MAGIC;  /* ?? */
  69
  70         /*
  71          * express utilization in terms of large blocks to avoid
  72          * overflow on 32-bit machines.
  73          */
  74         buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
  75         buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
  76         buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
  77         buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
  78
  79         buf->f_files = le64_to_cpu(st.num_objects);
  80         buf->f_ffree = -1;
  81         buf->f_namelen = NAME_MAX;
  82         buf->f_frsize = PAGE_CACHE_SIZE;
  83
  84         /* leave fsid little-endian, regardless of host endianness */
  85         fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
  86         buf->f_fsid.val[0] = fsid & 0xffffffff;
  87         buf->f_fsid.val[1] = fsid >> 32;
  88
  89         return 0;
  90 }
  91
  92
  93 static int ceph_sync_fs(struct super_block *sb, int wait)
  94 {
  95         struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
  96
  97         if (!wait) {
  98                 dout("sync_fs (non-blocking)\n");
  99                 ceph_flush_dirty_caps(fsc->mdsc);
 100                 dout("sync_fs (non-blocking) done\n");
 101                 return 0;
 102         }
 103
 104         dout("sync_fs (blocking)\n");
 105         ceph_osdc_sync(&fsc->client->osdc);
 106         ceph_mdsc_sync(fsc->mdsc);
 107         dout("sync_fs (blocking) done\n");
 108         return 0;
 109 }
 110
 111 /*
 112  * mount options
 113  */
 114 enum {
 115         Opt_wsize,
 116         Opt_rsize,
 117         Opt_rasize,
 118         Opt_caps_wanted_delay_min,
 119         Opt_caps_wanted_delay_max,
 120         Opt_cap_release_safety,
 121         Opt_readdir_max_entries,
 122         Opt_readdir_max_bytes,
 123         Opt_congestion_kb,
 124         Opt_last_int,
 125         /* int args above */
 126         Opt_snapdirname,
 127         Opt_last_string,
 128         /* string args above */
 129         Opt_dirstat,
 130         Opt_nodirstat,
 131         Opt_rbytes,
 132         Opt_norbytes,
 133         Opt_noasyncreaddir,
 134         Opt_dcache,
 135         Opt_nodcache,
 136         Opt_ino32,
 137 };
 138
 139 static match_table_t fsopt_tokens = {
 140         {Opt_wsize, "wsize=%d"},
 141         {Opt_rsize, "rsize=%d"},
 142         {Opt_rasize, "rasize=%d"},
 143         {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
 144         {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
 145         {Opt_cap_release_safety, "cap_release_safety=%d"},
 146         {Opt_readdir_max_entries, "readdir_max_entries=%d"},
 147         {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
 148         {Opt_congestion_kb, "write_congestion_kb=%d"},
 149         /* int args above */
 150         {Opt_snapdirname, "snapdirname=%s"},
 151         /* string args above */
 152         {Opt_dirstat, "dirstat"},
 153         {Opt_nodirstat, "nodirstat"},
 154         {Opt_rbytes, "rbytes"},
 155         {Opt_norbytes, "norbytes"},
 156         {Opt_noasyncreaddir, "noasyncreaddir"},
 157         {Opt_dcache, "dcache"},
 158         {Opt_nodcache, "nodcache"},
 159         {Opt_ino32, "ino32"},
 160         {-1, NULL}
 161 };
 162
 163 static int parse_fsopt_token(char *c, void *private)
 164 {
 165         struct ceph_mount_options *fsopt = private;
 166         substring_t argstr[MAX_OPT_ARGS];
 167         int token, intval, ret;
 168
 169         token = match_token((char *)c, fsopt_tokens, argstr);
 170         if (token < 0)
 171                 return -EINVAL;
 172
 173         if (token < Opt_last_int) {
 174                 ret = match_int(&argstr[0], &intval);
 175                 if (ret < 0) {
 176                         pr_err("bad mount option arg (not int) "
 177                                "at '%s'\n", c);
 178                         return ret;
 179                 }
 180                 dout("got int token %d val %d\n", token, intval);
 181         } else if (token > Opt_last_int && token < Opt_last_string) {
 182                 dout("got string token %d val %s\n", token,
 183                      argstr[0].from);
 184         } else {
 185                 dout("got token %d\n", token);
 186         }
 187
 188         switch (token) {
 189         case Opt_snapdirname:
 190                 kfree(fsopt->snapdir_name);
 191                 fsopt->snapdir_name = kstrndup(argstr[0].from,
 192                                                argstr[0].to-argstr[0].from,
 193                                                GFP_KERNEL);
 194                 if (!fsopt->snapdir_name)
 195                         return -ENOMEM;
 196                 break;
 197
 198                 /* misc */
 199         case Opt_wsize:
 200                 fsopt->wsize = intval;
 201                 break;
 202         case Opt_rsize:
 203                 fsopt->rsize = intval;
 204                 break;
 205         case Opt_rasize:
 206                 fsopt->rasize = intval;
 207                 break;
 208         case Opt_caps_wanted_delay_min:
 209                 fsopt->caps_wanted_delay_min = intval;
 210                 break;
 211         case Opt_caps_wanted_delay_max:
 212                 fsopt->caps_wanted_delay_max = intval;
 213                 break;
 214         case Opt_readdir_max_entries:
 215                 fsopt->max_readdir = intval;
 216                 break;
 217         case Opt_readdir_max_bytes:
 218                 fsopt->max_readdir_bytes = intval;
 219                 break;
 220         case Opt_congestion_kb:
 221                 fsopt->congestion_kb = intval;
 222                 break;
 223         case Opt_dirstat:
 224                 fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT;
 225                 break;
 226         case Opt_nodirstat:
 227                 fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT;
 228                 break;
 229         case Opt_rbytes:
 230                 fsopt->flags |= CEPH_MOUNT_OPT_RBYTES;
 231                 break;
 232         case Opt_norbytes:
 233                 fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
 234                 break;
 235         case Opt_noasyncreaddir:
 236                 fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
 237                 break;
 238         case Opt_dcache:
 239                 fsopt->flags |= CEPH_MOUNT_OPT_DCACHE;
 240                 break;
 241         case Opt_nodcache:
 242                 fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE;
 243                 break;
 244         case Opt_ino32:
 245                 fsopt->flags |= CEPH_MOUNT_OPT_INO32;
 246                 break;
 247         default:
 248                 BUG_ON(token);
 249         }
 250         return 0;
 251 }
 252
 253 static void destroy_mount_options(struct ceph_mount_options *args)
 254 {
 255         dout("destroy_mount_options %p\n", args);
 256         kfree(args->snapdir_name);
 257         kfree(args);
 258 }
 259
 260 static int strcmp_null(const char *s1, const char *s2)
 261 {
 262         if (!s1 && !s2)
 263                 return 0;
 264         if (s1 && !s2)
 265                 return -1;
 266         if (!s1 && s2)
 267                 return 1;
 268         return strcmp(s1, s2);
 269 }
 270
 271 static int compare_mount_options(struct ceph_mount_options *new_fsopt,
 272                                  struct ceph_options *new_opt,
 273                                  struct ceph_fs_client *fsc)
 274 {
 275         struct ceph_mount_options *fsopt1 = new_fsopt;
 276         struct ceph_mount_options *fsopt2 = fsc->mount_options;
 277         int ofs = offsetof(struct ceph_mount_options, snapdir_name);
 278         int ret;
 279
 280         ret = memcmp(fsopt1, fsopt2, ofs);
 281         if (ret)
 282                 return ret;
 283
 284         ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
 285         if (ret)
 286                 return ret;
 287
 288         return ceph_compare_options(new_opt, fsc->client);
 289 }
 290
 291 static int parse_mount_options(struct ceph_mount_options **pfsopt,
 292                                struct ceph_options **popt,
 293                                int flags, char *options,
 294                                const char *dev_name,
 295                                const char **path)
 296 {
 297         struct ceph_mount_options *fsopt;
 298         const char *dev_name_end;
 299         int err = -ENOMEM;
 300
 301         fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
 302         if (!fsopt)
 303                 return -ENOMEM;
 304
 305         dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
 306
 307         fsopt->sb_flags = flags;
 308         fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
 309
 310         fsopt->rsize = CEPH_RSIZE_DEFAULT;
 311         fsopt->rasize = CEPH_RASIZE_DEFAULT;
 312         fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
 313         fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
 314         fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
 315         fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
 316         fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
 317         fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
 318         fsopt->congestion_kb = default_congestion_kb();
 319
 320         /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
 321         err = -EINVAL;
 322         if (!dev_name)
 323                 goto out;
 324         *path = strstr(dev_name, ":/");
 325         if (*path == NULL) {
 326                 pr_err("device name is missing path (no :/ in %s)\n",
 327                                 dev_name);
 328                 goto out;
 329         }
 330         dev_name_end = *path;
 331         dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
 332
 333         /* path on server */
 334         *path += 2;
 335         dout("server path '%s'\n", *path);
 336
 337         err = ceph_parse_options(popt, options, dev_name, dev_name_end,
 338                                  parse_fsopt_token, (void *)fsopt);
 339         if (err)
 340                 goto out;
 341
 342         /* success */
 343         *pfsopt = fsopt;
 344         return 0;
 345
 346 out:
 347         destroy_mount_options(fsopt);
 348         return err;
 349 }
 350
 351 /**
 352  * ceph_show_options - Show mount options in /proc/mounts
 353  * @m: seq_file to write to
 354  * @root: root of that (sub)tree
 355  */
 356 static int ceph_show_options(struct seq_file *m, struct dentry *root)
 357 {
 358         struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb);
 359         struct ceph_mount_options *fsopt = fsc->mount_options;
 360         struct ceph_options *opt = fsc->client->options;
 361
 362         if (opt->flags & CEPH_OPT_FSID)
 363                 seq_printf(m, ",fsid=%pU", &opt->fsid);
 364         if (opt->flags & CEPH_OPT_NOSHARE)
 365                 seq_puts(m, ",noshare");
 366         if (opt->flags & CEPH_OPT_NOCRC)
 367                 seq_puts(m, ",nocrc");
 368
 369         if (opt->name)
 370                 seq_printf(m, ",name=%s", opt->name);
 371         if (opt->key)
 372                 seq_puts(m, ",secret=<hidden>");
 373
 374         if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
 375                 seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
 376         if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
 377                 seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
 378         if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
 379                 seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);
 380         if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
 381                 seq_printf(m, ",osdkeepalivetimeout=%d",
 382                            opt->osd_keepalive_timeout);
 383
 384         if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
 385                 seq_puts(m, ",dirstat");
 386         if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0)
 387                 seq_puts(m, ",norbytes");
 388         if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
 389                 seq_puts(m, ",noasyncreaddir");
 390         if (fsopt->flags & CEPH_MOUNT_OPT_DCACHE)
 391                 seq_puts(m, ",dcache");
 392         else
 393                 seq_puts(m, ",nodcache");
 394
 395         if (fsopt->wsize)
 396                 seq_printf(m, ",wsize=%d", fsopt->wsize);
 397         if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
 398                 seq_printf(m, ",rsize=%d", fsopt->rsize);
 399         if (fsopt->rasize != CEPH_RASIZE_DEFAULT)
 400                 seq_printf(m, ",rasize=%d", fsopt->rasize);
 401         if (fsopt->congestion_kb != default_congestion_kb())
 402                 seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
 403         if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
 404                 seq_printf(m, ",caps_wanted_delay_min=%d",
 405                          fsopt->caps_wanted_delay_min);
 406         if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
 407                 seq_printf(m, ",caps_wanted_delay_max=%d",
 408                            fsopt->caps_wanted_delay_max);
 409         if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
 410                 seq_printf(m, ",cap_release_safety=%d",
 411                            fsopt->cap_release_safety);
 412         if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
 413                 seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
 414         if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
 415                 seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
 416         if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
 417                 seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
 418         return 0;
 419 }
 420
 421 /*
 422  * handle any mon messages the standard library doesn't understand.
 423  * return error if we don't either.
 424  */
 425 static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
 426 {
 427         struct ceph_fs_client *fsc = client->private;
 428         int type = le16_to_cpu(msg->hdr.type);
 429
 430         switch (type) {
 431         case CEPH_MSG_MDS_MAP:
 432                 ceph_mdsc_handle_map(fsc->mdsc, msg);
 433                 return 0;
 434
 435         default:
 436                 return -1;
 437         }
 438 }
 439
 440 /*
 441  * create a new fs client
 442  */
 443 static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
 444                                         struct ceph_options *opt)
 445 {
 446         struct ceph_fs_client *fsc;
 447         const unsigned supported_features =
 448                 CEPH_FEATURE_FLOCK |
 449                 CEPH_FEATURE_DIRLAYOUTHASH;
 450         const unsigned required_features = 0;
 451         int err = -ENOMEM;
 452
 453         fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
 454         if (!fsc)
 455                 return ERR_PTR(-ENOMEM);
 456
 457         fsc->client = ceph_create_client(opt, fsc, supported_features,
 458                                          required_features);
 459         if (IS_ERR(fsc->client)) {
 460                 err = PTR_ERR(fsc->client);
 461                 goto fail;
 462         }
 463         fsc->client->extra_mon_dispatch = extra_mon_dispatch;
 464         fsc->client->monc.want_mdsmap = 1;
 465
 466         fsc->mount_options = fsopt;
 467
 468         fsc->sb = NULL;
 469         fsc->mount_state = CEPH_MOUNT_MOUNTING;
 470
 471         atomic_long_set(&fsc->writeback_count, 0);
 472
 473         err = bdi_init(&fsc->backing_dev_info);
 474         if (err < 0)
 475                 goto fail_client;
 476
 477         err = -ENOMEM;
 478         /*
 479          * The number of concurrent works can be high but they don't need
 480          * to be processed in parallel, limit concurrency.
 481          */
 482         fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);
 483         if (fsc->wb_wq == NULL)
 484                 goto fail_bdi;
 485         fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);
 486         if (fsc->pg_inv_wq == NULL)
 487                 goto fail_wb_wq;
 488         fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1);
 489         if (fsc->trunc_wq == NULL)
 490                 goto fail_pg_inv_wq;
 491
 492         /* set up mempools */
 493         err = -ENOMEM;
 494         fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
 495                               fsc->mount_options->wsize >> PAGE_CACHE_SHIFT);
 496         if (!fsc->wb_pagevec_pool)
 497                 goto fail_trunc_wq;
 498
 499         /* caps */
 500         fsc->min_caps = fsopt->max_readdir;
 501
 502         return fsc;
 503
 504 fail_trunc_wq:
 505         destroy_workqueue(fsc->trunc_wq);
 506 fail_pg_inv_wq:
 507         destroy_workqueue(fsc->pg_inv_wq);
 508 fail_wb_wq:
 509         destroy_workqueue(fsc->wb_wq);
 510 fail_bdi:
 511         bdi_destroy(&fsc->backing_dev_info);
 512 fail_client:
 513         ceph_destroy_client(fsc->client);
 514 fail:
 515         kfree(fsc);
 516         return ERR_PTR(err);
 517 }
 518
 519 static void destroy_fs_client(struct ceph_fs_client *fsc)
 520 {
 521         dout("destroy_fs_client %p\n", fsc);
 522
 523         destroy_workqueue(fsc->wb_wq);
 524         destroy_workqueue(fsc->pg_inv_wq);
 525         destroy_workqueue(fsc->trunc_wq);
 526
 527         bdi_destroy(&fsc->backing_dev_info);
 528
 529         mempool_destroy(fsc->wb_pagevec_pool);
 530
 531         destroy_mount_options(fsc->mount_options);
 532
 533         ceph_fs_debugfs_cleanup(fsc);
 534
 535         ceph_destroy_client(fsc->client);
 536
 537         kfree(fsc);
 538         dout("destroy_fs_client %p done\n", fsc);
 539 }
 540
 541 /*
 542  * caches
 543  */
 544 struct kmem_cache *ceph_inode_cachep;
 545 struct kmem_cache *ceph_cap_cachep;
 546 struct kmem_cache *ceph_dentry_cachep;
 547 struct kmem_cache *ceph_file_cachep;
 548
 549 static void ceph_inode_init_once(void *foo)
 550 {
 551         struct ceph_inode_info *ci = foo;
 552         inode_init_once(&ci->vfs_inode);
 553 }
 554
 555 static int __init init_caches(void)
 556 {
 557         ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
 558                                       sizeof(struct ceph_inode_info),
 559                                       __alignof__(struct ceph_inode_info),
 560                                       (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
 561                                       ceph_inode_init_once);
 562         if (ceph_inode_cachep == NULL)
 563                 return -ENOMEM;
 564
 565         ceph_cap_cachep = KMEM_CACHE(ceph_cap,
 566                                      SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
 567         if (ceph_cap_cachep == NULL)
 568                 goto bad_cap;
 569
 570         ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
 571                                         SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
 572         if (ceph_dentry_cachep == NULL)
 573                 goto bad_dentry;
 574
 575         ceph_file_cachep = KMEM_CACHE(ceph_file_info,
 576                                       SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
 577         if (ceph_file_cachep == NULL)
 578                 goto bad_file;
 579
 580         return 0;
 581
 582 bad_file:
 583         kmem_cache_destroy(ceph_dentry_cachep);
 584 bad_dentry:
 585         kmem_cache_destroy(ceph_cap_cachep);
 586 bad_cap:
 587         kmem_cache_destroy(ceph_inode_cachep);
 588         return -ENOMEM;
 589 }
 590
 591 static void destroy_caches(void)
 592 {
 593         kmem_cache_destroy(ceph_inode_cachep);
 594         kmem_cache_destroy(ceph_cap_cachep);
 595         kmem_cache_destroy(ceph_dentry_cachep);
 596         kmem_cache_destroy(ceph_file_cachep);
 597 }
 598
 599
 600 /*
 601  * ceph_umount_begin - initiate forced umount.  Tear down down the
 602  * mount, skipping steps that may hang while waiting for server(s).
 603  */
 604 static void ceph_umount_begin(struct super_block *sb)
 605 {
 606         struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
 607
 608         dout("ceph_umount_begin - starting forced umount\n");
 609         if (!fsc)
 610                 return;
 611         fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
 612         return;
 613 }
 614
 615 static const struct super_operations ceph_super_ops = {
 616         .alloc_inode    = ceph_alloc_inode,
 617         .destroy_inode  = ceph_destroy_inode,
 618         .write_inode    = ceph_write_inode,
 619         .sync_fs        = ceph_sync_fs,
 620         .put_super      = ceph_put_super,
 621         .show_options   = ceph_show_options,
 622         .statfs         = ceph_statfs,
 623         .umount_begin   = ceph_umount_begin,
 624 };
 625
 626 /*
 627  * Bootstrap mount by opening the root directory.  Note the mount
 628  * @started time from caller, and time out if this takes too long.
 629  */
 630 static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
 631                                        const char *path,
 632                                        unsigned long started)
 633 {
 634         struct ceph_mds_client *mdsc = fsc->mdsc;
 635         struct ceph_mds_request *req = NULL;
 636         int err;
 637         struct dentry *root;
 638
 639         /* open dir */
 640         dout("open_root_inode opening '%s'\n", path);
 641         req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
 642         if (IS_ERR(req))
 643                 return ERR_CAST(req);
 644         req->r_path1 = kstrdup(path, GFP_NOFS);
 645         req->r_ino1.ino = CEPH_INO_ROOT;
 646         req->r_ino1.snap = CEPH_NOSNAP;
 647         req->r_started = started;
 648         req->r_timeout = fsc->client->options->mount_timeout * HZ;
 649         req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
 650         req->r_num_caps = 2;
 651         err = ceph_mdsc_do_request(mdsc, NULL, req);
 652         if (err == 0) {
 653                 struct inode *inode = req->r_target_inode;
 654                 req->r_target_inode = NULL;
 655                 dout("open_root_inode success\n");
 656                 if (ceph_ino(inode) == CEPH_INO_ROOT &&
 657                     fsc->sb->s_root == NULL) {
 658                         root = d_make_root(inode);
 659                         if (!root) {
 660                                 root = ERR_PTR(-ENOMEM);
 661                                 goto out;
 662                         }
 663                 } else {
 664                         root = d_obtain_alias(inode);
 665                 }
 666                 ceph_init_dentry(root);
 667                 dout("open_root_inode success, root dentry is %p\n", root);
 668         } else {
 669                 root = ERR_PTR(err);
 670         }
 671 out:
 672         ceph_mdsc_put_request(req);
 673         return root;
 674 }
 675
 676
 677
 678
 679 /*
 680  * mount: join the ceph cluster, and open root directory.
 681  */
 682 static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
 683                       const char *path)
 684 {
 685         int err;
 686         unsigned long started = jiffies;  /* note the start time */
 687         struct dentry *root;
 688         int first = 0;   /* first vfsmount for this super_block */
 689
 690         dout("mount start\n");
 691         mutex_lock(&fsc->client->mount_mutex);
 692
 693         err = __ceph_open_session(fsc->client, started);
 694         if (err < 0)
 695                 goto out;
 696
 697         dout("mount opening root\n");
 698         root = open_root_dentry(fsc, "", started);
 699         if (IS_ERR(root)) {
 700                 err = PTR_ERR(root);
 701                 goto out;
 702         }
 703         if (fsc->sb->s_root) {
 704                 dput(root);
 705         } else {
 706                 fsc->sb->s_root = root;
 707                 first = 1;
 708
 709                 err = ceph_fs_debugfs_init(fsc);
 710                 if (err < 0)
 711                         goto fail;
 712         }
 713
 714         if (path[0] == 0) {
 715                 dget(root);
 716         } else {
 717                 dout("mount opening base mountpoint\n");
 718                 root = open_root_dentry(fsc, path, started);
 719                 if (IS_ERR(root)) {
 720                         err = PTR_ERR(root);
 721                         goto fail;
 722                 }
 723         }
 724
 725         fsc->mount_state = CEPH_MOUNT_MOUNTED;
 726         dout("mount success\n");
 727         mutex_unlock(&fsc->client->mount_mutex);
 728         return root;
 729
 730 out:
 731         mutex_unlock(&fsc->client->mount_mutex);
 732         return ERR_PTR(err);
 733
 734 fail:
 735         if (first) {
 736                 dput(fsc->sb->s_root);
 737                 fsc->sb->s_root = NULL;
 738         }
 739         goto out;
 740 }
 741
 742 static int ceph_set_super(struct super_block *s, void *data)
 743 {
 744         struct ceph_fs_client *fsc = data;
 745         int ret;
 746
 747         dout("set_super %p data %p\n", s, data);
 748
 749         s->s_flags = fsc->mount_options->sb_flags;
 750         s->s_maxbytes = 1ULL << 40;  /* temp value until we get mdsmap */
 751
 752         s->s_fs_info = fsc;
 753         fsc->sb = s;
 754
 755         s->s_op = &ceph_super_ops;
 756         s->s_export_op = &ceph_export_ops;
 757
 758         s->s_time_gran = 1000;  /* 1000 ns == 1 us */
 759
 760         ret = set_anon_super(s, NULL);  /* what is that second arg for? */
 761         if (ret != 0)
 762                 goto fail;
 763
 764         return ret;
 765
 766 fail:
 767         s->s_fs_info = NULL;
 768         fsc->sb = NULL;
 769         return ret;
 770 }
 771
 772 /*
 773  * share superblock if same fs AND options
 774  */
 775 static int ceph_compare_super(struct super_block *sb, void *data)
 776 {
 777         struct ceph_fs_client *new = data;
 778         struct ceph_mount_options *fsopt = new->mount_options;
 779         struct ceph_options *opt = new->client->options;
 780         struct ceph_fs_client *other = ceph_sb_to_client(sb);
 781
 782         dout("ceph_compare_super %p\n", sb);
 783
 784         if (compare_mount_options(fsopt, opt, other)) {
 785                 dout("monitor(s)/mount options don't match\n");
 786                 return 0;
 787         }
 788         if ((opt->flags & CEPH_OPT_FSID) &&
 789             ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
 790                 dout("fsid doesn't match\n");
 791                 return 0;
 792         }
 793         if (fsopt->sb_flags != other->mount_options->sb_flags) {
 794                 dout("flags differ\n");
 795                 return 0;
 796         }
 797         return 1;
 798 }
 799
 800 /*
 801  * construct our own bdi so we can control readahead, etc.
 802  */
 803 static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
 804
 805 static int ceph_register_bdi(struct super_block *sb,
 806                              struct ceph_fs_client *fsc)
 807 {
 808         int err;
 809
 810         /* set ra_pages based on rasize mount option? */
 811         if (fsc->mount_options->rasize >= PAGE_CACHE_SIZE)
 812                 fsc->backing_dev_info.ra_pages =
 813                         (fsc->mount_options->rasize + PAGE_CACHE_SIZE - 1)
 814                         >> PAGE_SHIFT;
 815         else
 816                 fsc->backing_dev_info.ra_pages =
 817                         default_backing_dev_info.ra_pages;
 818
 819         err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d",
 820                            atomic_long_inc_return(&bdi_seq));
 821         if (!err)
 822                 sb->s_bdi = &fsc->backing_dev_info;
 823         return err;
 824 }
 825
 826 static struct dentry *ceph_mount(struct file_system_type *fs_type,
 827                        int flags, const char *dev_name, void *data)
 828 {
 829         struct super_block *sb;
 830         struct ceph_fs_client *fsc;
 831         struct dentry *res;
 832         int err;
 833         int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
 834         const char *path = NULL;
 835         struct ceph_mount_options *fsopt = NULL;
 836         struct ceph_options *opt = NULL;
 837
 838         dout("ceph_mount\n");
 839         err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
 840         if (err < 0) {
 841                 res = ERR_PTR(err);
 842                 goto out_final;
 843         }
 844
 845         /* create client (which we may/may not use) */
 846         fsc = create_fs_client(fsopt, opt);
 847         if (IS_ERR(fsc)) {
 848                 res = ERR_CAST(fsc);
 849                 destroy_mount_options(fsopt);
 850                 ceph_destroy_options(opt);
 851                 goto out_final;
 852         }
 853
 854         err = ceph_mdsc_init(fsc);
 855         if (err < 0) {
 856                 res = ERR_PTR(err);
 857                 goto out;
 858         }
 859
 860         if (ceph_test_opt(fsc->client, NOSHARE))
 861                 compare_super = NULL;
 862         sb = sget(fs_type, compare_super, ceph_set_super, fsc);
 863         if (IS_ERR(sb)) {
 864                 res = ERR_CAST(sb);
 865                 goto out;
 866         }
 867
 868         if (ceph_sb_to_client(sb) != fsc) {
 869                 ceph_mdsc_destroy(fsc);
 870                 destroy_fs_client(fsc);
 871                 fsc = ceph_sb_to_client(sb);
 872                 dout("get_sb got existing client %p\n", fsc);
 873         } else {
 874                 dout("get_sb using new client %p\n", fsc);
 875                 err = ceph_register_bdi(sb, fsc);
 876                 if (err < 0) {
 877                         res = ERR_PTR(err);
 878                         goto out_splat;
 879                 }
 880         }
 881
 882         res = ceph_real_mount(fsc, path);
 883         if (IS_ERR(res))
 884                 goto out_splat;
 885         dout("root %p inode %p ino %llx.%llx\n", res,
 886              res->d_inode, ceph_vinop(res->d_inode));
 887         return res;
 888
 889 out_splat:
 890         ceph_mdsc_close_sessions(fsc->mdsc);
 891         deactivate_locked_super(sb);
 892         goto out_final;
 893
 894 out:
 895         ceph_mdsc_destroy(fsc);
 896         destroy_fs_client(fsc);
 897 out_final:
 898         dout("ceph_mount fail %ld\n", PTR_ERR(res));
 899         return res;
 900 }
 901
 902 static void ceph_kill_sb(struct super_block *s)
 903 {
 904         struct ceph_fs_client *fsc = ceph_sb_to_client(s);
 905         dout("kill_sb %p\n", s);
 906         ceph_mdsc_pre_umount(fsc->mdsc);
 907         kill_anon_super(s);    /* will call put_super after sb is r/o */
 908         ceph_mdsc_destroy(fsc);
 909         destroy_fs_client(fsc);
 910 }
 911
 912 static struct file_system_type ceph_fs_type = {
 913         .owner          = THIS_MODULE,
 914         .name           = "ceph",
 915         .mount          = ceph_mount,
 916         .kill_sb        = ceph_kill_sb,
 917         .fs_flags       = FS_RENAME_DOES_D_MOVE,
 918 };
 919
 920 #define _STRINGIFY(x) #x
 921 #define STRINGIFY(x) _STRINGIFY(x)
 922
 923 static int __init init_ceph(void)
 924 {
 925         int ret = init_caches();
 926         if (ret)
 927                 goto out;
 928
 929         ret = register_filesystem(&ceph_fs_type);
 930         if (ret)
 931                 goto out_icache;
 932
 933         pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
 934
 935         return 0;
 936
 937 out_icache:
 938         destroy_caches();
 939 out:
 940         return ret;
 941 }
 942
 943 static void __exit exit_ceph(void)
 944 {
 945         dout("exit_ceph\n");
 946         unregister_filesystem(&ceph_fs_type);
 947         destroy_caches();
 948 }
 949
 950 module_init(init_ceph);
 951 module_exit(exit_ceph);
 952
 953 MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
 954 MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
 955 MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
 956 MODULE_DESCRIPTION("Ceph filesystem for Linux");
 957 MODULE_LICENSE("GPL");