drivers/md/dm-snap-persistent.c

   1 /*
   2  * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
   3  * Copyright (C) 2006-2008 Red Hat GmbH
   4  *
   5  * This file is released under the GPL.
   6  */
   7
   8 #include "dm-exception-store.h"
   9
  10 #include <linux/mm.h>
  11 #include <linux/pagemap.h>
  12 #include <linux/vmalloc.h>
  13 #include <linux/slab.h>
  14 #include <linux/dm-io.h>
  15
  16 #define DM_MSG_PREFIX "persistent snapshot"
  17 #define DM_CHUNK_SIZE_DEFAULT_SECTORS 32        /* 16KB */
  18
  19 /*-----------------------------------------------------------------
  20  * Persistent snapshots, by persistent we mean that the snapshot
  21  * will survive a reboot.
  22  *---------------------------------------------------------------*/
  23
  24 /*
  25  * We need to store a record of which parts of the origin have
  26  * been copied to the snapshot device.  The snapshot code
  27  * requires that we copy exception chunks to chunk aligned areas
  28  * of the COW store.  It makes sense therefore, to store the
  29  * metadata in chunk size blocks.
  30  *
  31  * There is no backward or forward compatibility implemented,
  32  * snapshots with different disk versions than the kernel will
  33  * not be usable.  It is expected that "lvcreate" will blank out
  34  * the start of a fresh COW device before calling the snapshot
  35  * constructor.
  36  *
  37  * The first chunk of the COW device just contains the header.
  38  * After this there is a chunk filled with exception metadata,
  39  * followed by as many exception chunks as can fit in the
  40  * metadata areas.
  41  *
  42  * All on disk structures are in little-endian format.  The end
  43  * of the exceptions info is indicated by an exception with a
  44  * new_chunk of 0, which is invalid since it would point to the
  45  * header chunk.
  46  */
  47
  48 /*
  49  * Magic for persistent snapshots: "SnAp" - Feeble isn't it.
  50  */
  51 #define SNAP_MAGIC 0x70416e53
  52
  53 /*
  54  * The on-disk version of the metadata.
  55  */
  56 #define SNAPSHOT_DISK_VERSION 1
  57
  58 struct disk_header {
  59         uint32_t magic;
  60
  61         /*
  62          * Is this snapshot valid.  There is no way of recovering
  63          * an invalid snapshot.
  64          */
  65         uint32_t valid;
  66
  67         /*
  68          * Simple, incrementing version. no backward
  69          * compatibility.
  70          */
  71         uint32_t version;
  72
  73         /* In sectors */
  74         uint32_t chunk_size;
  75 };
  76
  77 struct disk_exception {
  78         uint64_t old_chunk;
  79         uint64_t new_chunk;
  80 };
  81
  82 struct commit_callback {
  83         void (*callback)(void *, int success);
  84         void *context;
  85 };
  86
  87 /*
  88  * The top level structure for a persistent exception store.
  89  */
  90 struct pstore {
  91         struct dm_exception_store *store;
  92         int version;
  93         int valid;
  94         uint32_t exceptions_per_area;
  95
  96         /*
  97          * Now that we have an asynchronous kcopyd there is no
  98          * need for large chunk sizes, so it wont hurt to have a
  99          * whole chunks worth of metadata in memory at once.
 100          */
 101         void *area;
 102
 103         /*
 104          * An area of zeros used to clear the next area.
 105          */
 106         void *zero_area;
 107
 108         /*
 109          * An area used for header. The header can be written
 110          * concurrently with metadata (when invalidating the snapshot),
 111          * so it needs a separate buffer.
 112          */
 113         void *header_area;
 114
 115         /*
 116          * Used to keep track of which metadata area the data in
 117          * 'chunk' refers to.
 118          */
 119         chunk_t current_area;
 120
 121         /*
 122          * The next free chunk for an exception.
 123          */
 124         chunk_t next_free;
 125
 126         /*
 127          * The index of next free exception in the current
 128          * metadata area.
 129          */
 130         uint32_t current_committed;
 131
 132         atomic_t pending_count;
 133         uint32_t callback_count;
 134         struct commit_callback *callbacks;
 135         struct dm_io_client *io_client;
 136
 137         struct workqueue_struct *metadata_wq;
 138 };
 139
 140 static unsigned sectors_to_pages(unsigned sectors)
 141 {
 142         return DIV_ROUND_UP(sectors, PAGE_SIZE >> 9);
 143 }
 144
 145 static int alloc_area(struct pstore *ps)
 146 {
 147         int r = -ENOMEM;
 148         size_t len;
 149
 150         len = ps->store->chunk_size << SECTOR_SHIFT;
 151
 152         /*
 153          * Allocate the chunk_size block of memory that will hold
 154          * a single metadata area.
 155          */
 156         ps->area = vmalloc(len);
 157         if (!ps->area)
 158                 goto err_area;
 159
 160         ps->zero_area = vmalloc(len);
 161         if (!ps->zero_area)
 162                 goto err_zero_area;
 163         memset(ps->zero_area, 0, len);
 164
 165         ps->header_area = vmalloc(len);
 166         if (!ps->header_area)
 167                 goto err_header_area;
 168
 169         return 0;
 170
 171 err_header_area:
 172         vfree(ps->zero_area);
 173
 174 err_zero_area:
 175         vfree(ps->area);
 176
 177 err_area:
 178         return r;
 179 }
 180
 181 static void free_area(struct pstore *ps)
 182 {
 183         if (ps->area)
 184                 vfree(ps->area);
 185         ps->area = NULL;
 186
 187         if (ps->zero_area)
 188                 vfree(ps->zero_area);
 189         ps->zero_area = NULL;
 190
 191         if (ps->header_area)
 192                 vfree(ps->header_area);
 193         ps->header_area = NULL;
 194 }
 195
 196 struct mdata_req {
 197         struct dm_io_region *where;
 198         struct dm_io_request *io_req;
 199         struct work_struct work;
 200         int result;
 201 };
 202
 203 static void do_metadata(struct work_struct *work)
 204 {
 205         struct mdata_req *req = container_of(work, struct mdata_req, work);
 206
 207         req->result = dm_io(req->io_req, 1, req->where, NULL);
 208 }
 209
 210 /*
 211  * Read or write a chunk aligned and sized block of data from a device.
 212  */
 213 static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
 214                     int metadata)
 215 {
 216         struct dm_io_region where = {
 217                 .bdev = ps->store->cow->bdev,
 218                 .sector = ps->store->chunk_size * chunk,
 219                 .count = ps->store->chunk_size,
 220         };
 221         struct dm_io_request io_req = {
 222                 .bi_rw = rw,
 223                 .mem.type = DM_IO_VMA,
 224                 .mem.ptr.vma = area,
 225                 .client = ps->io_client,
 226                 .notify.fn = NULL,
 227         };
 228         struct mdata_req req;
 229
 230         if (!metadata)
 231                 return dm_io(&io_req, 1, &where, NULL);
 232
 233         req.where = &where;
 234         req.io_req = &io_req;
 235
 236         /*
 237          * Issue the synchronous I/O from a different thread
 238          * to avoid generic_make_request recursion.
 239          */
 240         INIT_WORK(&req.work, do_metadata);
 241         queue_work(ps->metadata_wq, &req.work);
 242         flush_workqueue(ps->metadata_wq);
 243
 244         return req.result;
 245 }
 246
 247 /*
 248  * Convert a metadata area index to a chunk index.
 249  */
 250 static chunk_t area_location(struct pstore *ps, chunk_t area)
 251 {
 252         return 1 + ((ps->exceptions_per_area + 1) * area);
 253 }
 254
 255 /*
 256  * Read or write a metadata area.  Remembering to skip the first
 257  * chunk which holds the header.
 258  */
 259 static int area_io(struct pstore *ps, int rw)
 260 {
 261         int r;
 262         chunk_t chunk;
 263
 264         chunk = area_location(ps, ps->current_area);
 265
 266         r = chunk_io(ps, ps->area, chunk, rw, 0);
 267         if (r)
 268                 return r;
 269
 270         return 0;
 271 }
 272
 273 static void zero_memory_area(struct pstore *ps)
 274 {
 275         memset(ps->area, 0, ps->store->chunk_size << SECTOR_SHIFT);
 276 }
 277
 278 static int zero_disk_area(struct pstore *ps, chunk_t area)
 279 {
 280         return chunk_io(ps, ps->zero_area, area_location(ps, area), WRITE, 0);
 281 }
 282
 283 static int read_header(struct pstore *ps, int *new_snapshot)
 284 {
 285         int r;
 286         struct disk_header *dh;
 287         chunk_t chunk_size;
 288         int chunk_size_supplied = 1;
 289
 290         /*
 291          * Use default chunk size (or hardsect_size, if larger) if none supplied
 292          */
 293         if (!ps->store->chunk_size) {
 294                 ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
 295                     bdev_logical_block_size(ps->store->cow->bdev) >> 9);
 296                 ps->store->chunk_mask = ps->store->chunk_size - 1;
 297                 ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1;
 298                 chunk_size_supplied = 0;
 299         }
 300
 301         ps->io_client = dm_io_client_create(sectors_to_pages(ps->store->
 302                                                              chunk_size));
 303         if (IS_ERR(ps->io_client))
 304                 return PTR_ERR(ps->io_client);
 305
 306         r = alloc_area(ps);
 307         if (r)
 308                 return r;
 309
 310         r = chunk_io(ps, ps->header_area, 0, READ, 1);
 311         if (r)
 312                 goto bad;
 313
 314         dh = ps->header_area;
 315
 316         if (le32_to_cpu(dh->magic) == 0) {
 317                 *new_snapshot = 1;
 318                 return 0;
 319         }
 320
 321         if (le32_to_cpu(dh->magic) != SNAP_MAGIC) {
 322                 DMWARN("Invalid or corrupt snapshot");
 323                 r = -ENXIO;
 324                 goto bad;
 325         }
 326
 327         *new_snapshot = 0;
 328         ps->valid = le32_to_cpu(dh->valid);
 329         ps->version = le32_to_cpu(dh->version);
 330         chunk_size = le32_to_cpu(dh->chunk_size);
 331
 332         if (!chunk_size_supplied || ps->store->chunk_size == chunk_size)
 333                 return 0;
 334
 335         DMWARN("chunk size %llu in device metadata overrides "
 336                "table chunk size of %llu.",
 337                (unsigned long long)chunk_size,
 338                (unsigned long long)ps->store->chunk_size);
 339
 340         /* We had a bogus chunk_size. Fix stuff up. */
 341         free_area(ps);
 342
 343         ps->store->chunk_size = chunk_size;
 344         ps->store->chunk_mask = chunk_size - 1;
 345         ps->store->chunk_shift = ffs(chunk_size) - 1;
 346
 347         r = dm_io_client_resize(sectors_to_pages(ps->store->chunk_size),
 348                                 ps->io_client);
 349         if (r)
 350                 return r;
 351
 352         r = alloc_area(ps);
 353         return r;
 354
 355 bad:
 356         free_area(ps);
 357         return r;
 358 }
 359
 360 static int write_header(struct pstore *ps)
 361 {
 362         struct disk_header *dh;
 363
 364         memset(ps->header_area, 0, ps->store->chunk_size << SECTOR_SHIFT);
 365
 366         dh = ps->header_area;
 367         dh->magic = cpu_to_le32(SNAP_MAGIC);
 368         dh->valid = cpu_to_le32(ps->valid);
 369         dh->version = cpu_to_le32(ps->version);
 370         dh->chunk_size = cpu_to_le32(ps->store->chunk_size);
 371
 372         return chunk_io(ps, ps->header_area, 0, WRITE, 1);
 373 }
 374
 375 /*
 376  * Access functions for the disk exceptions, these do the endian conversions.
 377  */
 378 static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
 379 {
 380         BUG_ON(index >= ps->exceptions_per_area);
 381
 382         return ((struct disk_exception *) ps->area) + index;
 383 }
 384
 385 static void read_exception(struct pstore *ps,
 386                            uint32_t index, struct disk_exception *result)
 387 {
 388         struct disk_exception *e = get_exception(ps, index);
 389
 390         /* copy it */
 391         result->old_chunk = le64_to_cpu(e->old_chunk);
 392         result->new_chunk = le64_to_cpu(e->new_chunk);
 393 }
 394
 395 static void write_exception(struct pstore *ps,
 396                             uint32_t index, struct disk_exception *de)
 397 {
 398         struct disk_exception *e = get_exception(ps, index);
 399
 400         /* copy it */
 401         e->old_chunk = cpu_to_le64(de->old_chunk);
 402         e->new_chunk = cpu_to_le64(de->new_chunk);
 403 }
 404
 405 /*
 406  * Registers the exceptions that are present in the current area.
 407  * 'full' is filled in to indicate if the area has been
 408  * filled.
 409  */
 410 static int insert_exceptions(struct pstore *ps,
 411                              int (*callback)(void *callback_context,
 412                                              chunk_t old, chunk_t new),
 413                              void *callback_context,
 414                              int *full)
 415 {
 416         int r;
 417         unsigned int i;
 418         struct disk_exception de;
 419
 420         /* presume the area is full */
 421         *full = 1;
 422
 423         for (i = 0; i < ps->exceptions_per_area; i++) {
 424                 read_exception(ps, i, &de);
 425
 426                 /*
 427                  * If the new_chunk is pointing at the start of
 428                  * the COW device, where the first metadata area
 429                  * is we know that we've hit the end of the
 430                  * exceptions.  Therefore the area is not full.
 431                  */
 432                 if (de.new_chunk == 0LL) {
 433                         ps->current_committed = i;
 434                         *full = 0;
 435                         break;
 436                 }
 437
 438                 /*
 439                  * Keep track of the start of the free chunks.
 440                  */
 441                 if (ps->next_free <= de.new_chunk)
 442                         ps->next_free = de.new_chunk + 1;
 443
 444                 /*
 445                  * Otherwise we add the exception to the snapshot.
 446                  */
 447                 r = callback(callback_context, de.old_chunk, de.new_chunk);
 448                 if (r)
 449                         return r;
 450         }
 451
 452         return 0;
 453 }
 454
 455 static int read_exceptions(struct pstore *ps,
 456                            int (*callback)(void *callback_context, chunk_t old,
 457                                            chunk_t new),
 458                            void *callback_context)
 459 {
 460         int r, full = 1;
 461
 462         /*
 463          * Keeping reading chunks and inserting exceptions until
 464          * we find a partially full area.
 465          */
 466         for (ps->current_area = 0; full; ps->current_area++) {
 467                 r = area_io(ps, READ);
 468                 if (r)
 469                         return r;
 470
 471                 r = insert_exceptions(ps, callback, callback_context, &full);
 472                 if (r)
 473                         return r;
 474         }
 475
 476         ps->current_area--;
 477
 478         return 0;
 479 }
 480
 481 static struct pstore *get_info(struct dm_exception_store *store)
 482 {
 483         return (struct pstore *) store->context;
 484 }
 485
 486 static void persistent_fraction_full(struct dm_exception_store *store,
 487                                      sector_t *numerator, sector_t *denominator)
 488 {
 489         *numerator = get_info(store)->next_free * store->chunk_size;
 490         *denominator = get_dev_size(store->cow->bdev);
 491 }
 492
 493 static void persistent_dtr(struct dm_exception_store *store)
 494 {
 495         struct pstore *ps = get_info(store);
 496
 497         destroy_workqueue(ps->metadata_wq);
 498
 499         /* Created in read_header */
 500         if (ps->io_client)
 501                 dm_io_client_destroy(ps->io_client);
 502         free_area(ps);
 503
 504         /* Allocated in persistent_read_metadata */
 505         if (ps->callbacks)
 506                 vfree(ps->callbacks);
 507
 508         kfree(ps);
 509 }
 510
 511 static int persistent_read_metadata(struct dm_exception_store *store,
 512                                     int (*callback)(void *callback_context,
 513                                                     chunk_t old, chunk_t new),
 514                                     void *callback_context)
 515 {
 516         int r, uninitialized_var(new_snapshot);
 517         struct pstore *ps = get_info(store);
 518
 519         /*
 520          * Read the snapshot header.
 521          */
 522         r = read_header(ps, &new_snapshot);
 523         if (r)
 524                 return r;
 525
 526         /*
 527          * Now we know correct chunk_size, complete the initialisation.
 528          */
 529         ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) /
 530                                   sizeof(struct disk_exception);
 531         ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
 532                         sizeof(*ps->callbacks));
 533         if (!ps->callbacks)
 534                 return -ENOMEM;
 535
 536         /*
 537          * Do we need to setup a new snapshot ?
 538          */
 539         if (new_snapshot) {
 540                 r = write_header(ps);
 541                 if (r) {
 542                         DMWARN("write_header failed");
 543                         return r;
 544                 }
 545
 546                 ps->current_area = 0;
 547                 zero_memory_area(ps);
 548                 r = zero_disk_area(ps, 0);
 549                 if (r) {
 550                         DMWARN("zero_disk_area(0) failed");
 551                         return r;
 552                 }
 553         } else {
 554                 /*
 555                  * Sanity checks.
 556                  */
 557                 if (ps->version != SNAPSHOT_DISK_VERSION) {
 558                         DMWARN("unable to handle snapshot disk version %d",
 559                                ps->version);
 560                         return -EINVAL;
 561                 }
 562
 563                 /*
 564                  * Metadata are valid, but snapshot is invalidated
 565                  */
 566                 if (!ps->valid)
 567                         return 1;
 568
 569                 /*
 570                  * Read the metadata.
 571                  */
 572                 r = read_exceptions(ps, callback, callback_context);
 573                 if (r)
 574                         return r;
 575         }
 576
 577         return 0;
 578 }
 579
 580 static int persistent_prepare_exception(struct dm_exception_store *store,
 581                                         struct dm_snap_exception *e)
 582 {
 583         struct pstore *ps = get_info(store);
 584         uint32_t stride;
 585         chunk_t next_free;
 586         sector_t size = get_dev_size(store->cow->bdev);
 587
 588         /* Is there enough room ? */
 589         if (size < ((ps->next_free + 1) * store->chunk_size))
 590                 return -ENOSPC;
 591
 592         e->new_chunk = ps->next_free;
 593
 594         /*
 595          * Move onto the next free pending, making sure to take
 596          * into account the location of the metadata chunks.
 597          */
 598         stride = (ps->exceptions_per_area + 1);
 599         next_free = ++ps->next_free;
 600         if (sector_div(next_free, stride) == 1)
 601                 ps->next_free++;
 602
 603         atomic_inc(&ps->pending_count);
 604         return 0;
 605 }
 606
 607 static void persistent_commit_exception(struct dm_exception_store *store,
 608                                         struct dm_snap_exception *e,
 609                                         void (*callback) (void *, int success),
 610                                         void *callback_context)
 611 {
 612         unsigned int i;
 613         struct pstore *ps = get_info(store);
 614         struct disk_exception de;
 615         struct commit_callback *cb;
 616
 617         de.old_chunk = e->old_chunk;
 618         de.new_chunk = e->new_chunk;
 619         write_exception(ps, ps->current_committed++, &de);
 620
 621         /*
 622          * Add the callback to the back of the array.  This code
 623          * is the only place where the callback array is
 624          * manipulated, and we know that it will never be called
 625          * multiple times concurrently.
 626          */
 627         cb = ps->callbacks + ps->callback_count++;
 628         cb->callback = callback;
 629         cb->context = callback_context;
 630
 631         /*
 632          * If there are exceptions in flight and we have not yet
 633          * filled this metadata area there's nothing more to do.
 634          */
 635         if (!atomic_dec_and_test(&ps->pending_count) &&
 636             (ps->current_committed != ps->exceptions_per_area))
 637                 return;
 638
 639         /*
 640          * If we completely filled the current area, then wipe the next one.
 641          */
 642         if ((ps->current_committed == ps->exceptions_per_area) &&
 643              zero_disk_area(ps, ps->current_area + 1))
 644                 ps->valid = 0;
 645
 646         /*
 647          * Commit exceptions to disk.
 648          */
 649         if (ps->valid && area_io(ps, WRITE_BARRIER))
 650                 ps->valid = 0;
 651
 652         /*
 653          * Advance to the next area if this one is full.
 654          */
 655         if (ps->current_committed == ps->exceptions_per_area) {
 656                 ps->current_committed = 0;
 657                 ps->current_area++;
 658                 zero_memory_area(ps);
 659         }
 660
 661         for (i = 0; i < ps->callback_count; i++) {
 662                 cb = ps->callbacks + i;
 663                 cb->callback(cb->context, ps->valid);
 664         }
 665
 666         ps->callback_count = 0;
 667 }
 668
 669 static void persistent_drop_snapshot(struct dm_exception_store *store)
 670 {
 671         struct pstore *ps = get_info(store);
 672
 673         ps->valid = 0;
 674         if (write_header(ps))
 675                 DMWARN("write header failed");
 676 }
 677
 678 static int persistent_ctr(struct dm_exception_store *store,
 679                           unsigned argc, char **argv)
 680 {
 681         struct pstore *ps;
 682
 683         /* allocate the pstore */
 684         ps = kzalloc(sizeof(*ps), GFP_KERNEL);
 685         if (!ps)
 686                 return -ENOMEM;
 687
 688         ps->store = store;
 689         ps->valid = 1;
 690         ps->version = SNAPSHOT_DISK_VERSION;
 691         ps->area = NULL;
 692         ps->zero_area = NULL;
 693         ps->header_area = NULL;
 694         ps->next_free = 2;      /* skipping the header and first area */
 695         ps->current_committed = 0;
 696
 697         ps->callback_count = 0;
 698         atomic_set(&ps->pending_count, 0);
 699         ps->callbacks = NULL;
 700
 701         ps->metadata_wq = create_singlethread_workqueue("ksnaphd");
 702         if (!ps->metadata_wq) {
 703                 kfree(ps);
 704                 DMERR("couldn't start header metadata update thread");
 705                 return -ENOMEM;
 706         }
 707
 708         store->context = ps;
 709
 710         return 0;
 711 }
 712
 713 static unsigned persistent_status(struct dm_exception_store *store,
 714                                   status_type_t status, char *result,
 715                                   unsigned maxlen)
 716 {
 717         unsigned sz = 0;
 718
 719         switch (status) {
 720         case STATUSTYPE_INFO:
 721                 break;
 722         case STATUSTYPE_TABLE:
 723                 DMEMIT(" %s P %llu", store->cow->name,
 724                        (unsigned long long)store->chunk_size);
 725         }
 726
 727         return sz;
 728 }
 729
 730 static struct dm_exception_store_type _persistent_type = {
 731         .name = "persistent",
 732         .module = THIS_MODULE,
 733         .ctr = persistent_ctr,
 734         .dtr = persistent_dtr,
 735         .read_metadata = persistent_read_metadata,
 736         .prepare_exception = persistent_prepare_exception,
 737         .commit_exception = persistent_commit_exception,
 738         .drop_snapshot = persistent_drop_snapshot,
 739         .fraction_full = persistent_fraction_full,
 740         .status = persistent_status,
 741 };
 742
 743 static struct dm_exception_store_type _persistent_compat_type = {
 744         .name = "P",
 745         .module = THIS_MODULE,
 746         .ctr = persistent_ctr,
 747         .dtr = persistent_dtr,
 748         .read_metadata = persistent_read_metadata,
 749         .prepare_exception = persistent_prepare_exception,
 750         .commit_exception = persistent_commit_exception,
 751         .drop_snapshot = persistent_drop_snapshot,
 752         .fraction_full = persistent_fraction_full,
 753         .status = persistent_status,
 754 };
 755
 756 int dm_persistent_snapshot_init(void)
 757 {
 758         int r;
 759
 760         r = dm_exception_store_type_register(&_persistent_type);
 761         if (r) {
 762                 DMERR("Unable to register persistent exception store type");
 763                 return r;
 764         }
 765
 766         r = dm_exception_store_type_register(&_persistent_compat_type);
 767         if (r) {
 768                 DMERR("Unable to register old-style persistent exception "
 769                       "store type");
 770                 dm_exception_store_type_unregister(&_persistent_type);
 771                 return r;
 772         }
 773
 774         return r;
 775 }
 776
 777 void dm_persistent_snapshot_exit(void)
 778 {
 779         dm_exception_store_type_unregister(&_persistent_type);
 780         dm_exception_store_type_unregister(&_persistent_compat_type);
 781 }