/*
 * Copyright (C) 2011 Red Hat UK.
 *
 * This file is released under the GPL.
 */

#include "dm-thin-metadata.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>

#define DM_MSG_PREFIX   "thin"

/*
 * Tunable constants
 */
#define ENDIO_HOOK_POOL_SIZE 1024
#define DEFERRED_SET_SIZE 64
#define MAPPING_POOL_SIZE 1024
#define PRISON_CELLS 1024
#define COMMIT_PERIOD HZ

/*
 * The block size of the device holding pool data must be
 * between 64KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

/*
 * Device id is restricted to 24 bits.
 */
#define MAX_DEV_ID ((1 << 24) - 1)

/*
 * How do we handle breaking sharing of data blocks?
 * =================================================
 *
 * We use a standard copy-on-write btree to store the mappings for the
 * devices (note I'm talking about copy-on-write of the metadata here, not
 * the data).  When you take an internal snapshot you clone the root node
 * of the origin btree.  After this there is no concept of an origin or a
 * snapshot.  They are just two device trees that happen to point to the
 * same data blocks.
 *
 * When we get a write in we decide if it's to a shared data block using
 * some timestamp magic.  If it is, we have to break sharing.
 *
 * Let's say we write to a shared block in what was the origin.  The
 * steps are:
 *
 * i) plug io further to this physical block. (see bio_prison code).
 *
 * ii) quiesce any read io to that shared data block.  Obviously
 * including all devices that share this block.  (see deferred_set code)
 *
 * iii) copy the data block to a newly allocated block.  This step can be
 * skipped if the io covers the whole block. (schedule_copy).
 *
 * iv) insert the new mapping into the origin's btree
 * (process_prepared_mapping).  This act of inserting breaks some
 * sharing of btree nodes between the two devices.  Breaking sharing only
 * affects the btree of that specific device.  Btrees for the other
 * devices that share the block never change.  The btree for the origin
 * device as it was after the last commit is untouched, ie. we're using
 * persistent data structures in the functional programming sense.
 *
 * v) unplug io to this physical block, including the io that triggered
 * the breaking of sharing.
 *
 * Steps (ii) and (iii) occur in parallel.
 *
 * The metadata _doesn't_ need to be committed before the io continues.  We
 * get away with this because the io is always written to a _new_ block.
 * If there's a crash, then:
 *
 * - The origin mapping will point to the old origin block (the shared
 * one).  This will contain the data as it was before the io that triggered
 * the breaking of sharing came in.
 *
 * - The snap mapping still points to the old block.  As it would after
 * the commit.
 *
 * The downside of this scheme is that the timestamp magic isn't perfect:
 * it will continue to think the data block in the snapshot device is
 * shared even after the write to the origin has broken sharing.  I
 * suspect data blocks will typically be shared by many different
 * devices, so we're breaking sharing n + 1 times, rather than n, where
 * n is the number of devices that reference this data block.  At the
 * moment I think the benefits far, far outweigh the disadvantages.
 */

/*----------------------------------------------------------------*/

/*
 * Sometimes we can't deal with a bio straight away.  We put it in prison
 * where it can't cause any mischief.  Bios are put in a cell identified
 * by a key; multiple bios can be in the same cell.  When the cell is
 * subsequently unlocked the bios become available.
 */
struct bio_prison;

struct cell_key {
        int virtual;
        dm_thin_id dev;
        dm_block_t block;
};

struct dm_bio_prison_cell {
        struct hlist_node list;
        struct bio_prison *prison;
        struct cell_key key;
        struct bio *holder;
        struct bio_list bios;
};

struct bio_prison {
        spinlock_t lock;
        mempool_t *cell_pool;

        unsigned nr_buckets;
        unsigned hash_mask;
        struct hlist_head *cells;
};

static uint32_t calc_nr_buckets(unsigned nr_cells)
{
        uint32_t n = 128;

        nr_cells /= 4;
        nr_cells = min(nr_cells, 8192u);

        while (n < nr_cells)
                n <<= 1;

        return n;
}
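
/*
 * For example, the default PRISON_CELLS = 1024 gives
 * max(128, roundup_pow_of_two(1024 / 4)) = 256 buckets.
 */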

static struct kmem_cache *_cell_cache;

/*
 * @nr_cells should be the number of cells you want in use _concurrently_.
 * Don't confuse it with the number of distinct keys.
 */
static struct bio_prison *prison_create(unsigned nr_cells)
{
        unsigned i;
        uint32_t nr_buckets = calc_nr_buckets(nr_cells);
        size_t len = sizeof(struct bio_prison) +
                (sizeof(struct hlist_head) * nr_buckets);
        struct bio_prison *prison = kmalloc(len, GFP_KERNEL);

        if (!prison)
                return NULL;

        spin_lock_init(&prison->lock);
        prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache);
        if (!prison->cell_pool) {
                kfree(prison);
                return NULL;
        }

        prison->nr_buckets = nr_buckets;
        prison->hash_mask = nr_buckets - 1;
        prison->cells = (struct hlist_head *) (prison + 1);
        for (i = 0; i < nr_buckets; i++)
                INIT_HLIST_HEAD(prison->cells + i);

        return prison;
}

static void prison_destroy(struct bio_prison *prison)
{
        mempool_destroy(prison->cell_pool);
        kfree(prison);
}

static uint32_t hash_key(struct bio_prison *prison, struct cell_key *key)
{
        const unsigned long BIG_PRIME = 4294967291UL;
        uint64_t hash = key->block * BIG_PRIME;

        return (uint32_t) (hash & prison->hash_mask);
}

static int keys_equal(struct cell_key *lhs, struct cell_key *rhs)
{
        return (lhs->virtual == rhs->virtual) &&
                (lhs->dev == rhs->dev) &&
                (lhs->block == rhs->block);
}

static struct dm_bio_prison_cell *__search_bucket(struct hlist_head *bucket,
                                                  struct cell_key *key)
{
        struct dm_bio_prison_cell *cell;
        struct hlist_node *tmp;

        hlist_for_each_entry(cell, tmp, bucket, list)
                if (keys_equal(&cell->key, key))
                        return cell;

        return NULL;
}

/*
 * This may block if a new cell needs allocating.  You must ensure that
 * cells will be unlocked even if the calling thread is blocked.
 *
 * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
 */
static int bio_detain(struct bio_prison *prison, struct cell_key *key,
                      struct bio *inmate, struct dm_bio_prison_cell **ref)
{
        int r = 1;
        unsigned long flags;
        uint32_t hash = hash_key(prison, key);
        struct dm_bio_prison_cell *cell, *cell2;

        BUG_ON(hash >= prison->nr_buckets);

        spin_lock_irqsave(&prison->lock, flags);

        cell = __search_bucket(prison->cells + hash, key);
        if (cell) {
                bio_list_add(&cell->bios, inmate);
                goto out;
        }

        /*
         * Allocate a new cell
         */
        spin_unlock_irqrestore(&prison->lock, flags);
        cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
        spin_lock_irqsave(&prison->lock, flags);

        /*
         * We've been unlocked, so we have to double check that
         * nobody else has inserted this cell in the meantime.
         */
        cell = __search_bucket(prison->cells + hash, key);
        if (cell) {
                mempool_free(cell2, prison->cell_pool);
                bio_list_add(&cell->bios, inmate);
                goto out;
        }

        /*
         * Use new cell.
         */
        cell = cell2;

        cell->prison = prison;
        memcpy(&cell->key, key, sizeof(cell->key));
        cell->holder = inmate;
        bio_list_init(&cell->bios);
        hlist_add_head(&cell->list, prison->cells + hash);

        r = 0;

out:
        spin_unlock_irqrestore(&prison->lock, flags);

        *ref = cell;

        return r;
}
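
/*
 * Typical usage, as in process_bio() below: build a key for the region
 * the bio touches, then try to detain it.  A non-zero return means
 * another bio already holds the cell and this one has been queued
 * inside it, so the caller must stop processing it:
 *
 *        build_virtual_key(tc->td, block, &key);
 *        if (bio_detain(tc->pool->prison, &key, bio, &cell))
 *                return;
 */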

/*
 * @inmates must have been initialised prior to this call
 */
static void __cell_release(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
{
        struct bio_prison *prison = cell->prison;

        hlist_del(&cell->list);

        if (inmates) {
                bio_list_add(inmates, cell->holder);
                bio_list_merge(inmates, &cell->bios);
        }

        mempool_free(cell, prison->cell_pool);
}

static void cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios)
{
        unsigned long flags;
        struct bio_prison *prison = cell->prison;

        spin_lock_irqsave(&prison->lock, flags);
        __cell_release(cell, bios);
        spin_unlock_irqrestore(&prison->lock, flags);
}

/*
 * There are a couple of places where we put a bio into a cell briefly
 * before taking it out again.  In these situations we know that no other
 * bio may be in the cell.  This function releases the cell, and also does
 * a sanity check.
 */
static void __cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio)
{
        BUG_ON(cell->holder != bio);
        BUG_ON(!bio_list_empty(&cell->bios));

        __cell_release(cell, NULL);
}

static void cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio)
{
        unsigned long flags;
        struct bio_prison *prison = cell->prison;

        spin_lock_irqsave(&prison->lock, flags);
        __cell_release_singleton(cell, bio);
        spin_unlock_irqrestore(&prison->lock, flags);
}

/*
 * Sometimes we don't want the holder, just the additional bios.
 */
static void __cell_release_no_holder(struct dm_bio_prison_cell *cell,
                                     struct bio_list *inmates)
{
        struct bio_prison *prison = cell->prison;

        hlist_del(&cell->list);
        bio_list_merge(inmates, &cell->bios);

        mempool_free(cell, prison->cell_pool);
}

static void cell_release_no_holder(struct dm_bio_prison_cell *cell,
                                   struct bio_list *inmates)
{
        unsigned long flags;
        struct bio_prison *prison = cell->prison;

        spin_lock_irqsave(&prison->lock, flags);
        __cell_release_no_holder(cell, inmates);
        spin_unlock_irqrestore(&prison->lock, flags);
}

static void cell_error(struct dm_bio_prison_cell *cell)
{
        struct bio_prison *prison = cell->prison;
        struct bio_list bios;
        struct bio *bio;
        unsigned long flags;

        bio_list_init(&bios);

        spin_lock_irqsave(&prison->lock, flags);
        __cell_release(cell, &bios);
        spin_unlock_irqrestore(&prison->lock, flags);

        while ((bio = bio_list_pop(&bios)))
                bio_io_error(bio);
}

/*----------------------------------------------------------------*/

/*
 * We use the deferred set to keep track of pending reads to shared blocks.
 * We do this to ensure the new mapping caused by a write isn't performed
 * until these prior reads have completed.  Otherwise the insertion of the
 * new mapping could free the old block that the read bios are mapped to.
 */

struct deferred_set;
struct deferred_entry {
        struct deferred_set *ds;
        unsigned count;
        struct list_head work_items;
};

struct deferred_set {
        spinlock_t lock;
        unsigned current_entry;
        unsigned sweeper;
        struct deferred_entry entries[DEFERRED_SET_SIZE];
};

static void ds_init(struct deferred_set *ds)
{
        int i;

        spin_lock_init(&ds->lock);
        ds->current_entry = 0;
        ds->sweeper = 0;
        for (i = 0; i < DEFERRED_SET_SIZE; i++) {
                ds->entries[i].ds = ds;
                ds->entries[i].count = 0;
                INIT_LIST_HEAD(&ds->entries[i].work_items);
        }
}

static struct deferred_entry *ds_inc(struct deferred_set *ds)
{
        unsigned long flags;
        struct deferred_entry *entry;

        spin_lock_irqsave(&ds->lock, flags);
        entry = ds->entries + ds->current_entry;
        entry->count++;
        spin_unlock_irqrestore(&ds->lock, flags);

        return entry;
}

static unsigned ds_next(unsigned index)
{
        return (index + 1) % DEFERRED_SET_SIZE;
}

static void __sweep(struct deferred_set *ds, struct list_head *head)
{
        while ((ds->sweeper != ds->current_entry) &&
               !ds->entries[ds->sweeper].count) {
                list_splice_init(&ds->entries[ds->sweeper].work_items, head);
                ds->sweeper = ds_next(ds->sweeper);
        }

        if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count)
                list_splice_init(&ds->entries[ds->sweeper].work_items, head);
}

static void ds_dec(struct deferred_entry *entry, struct list_head *head)
{
        unsigned long flags;

        spin_lock_irqsave(&entry->ds->lock, flags);
        BUG_ON(!entry->count);
        --entry->count;
        __sweep(entry->ds, head);
        spin_unlock_irqrestore(&entry->ds->lock, flags);
}

/*
 * Returns 1 if the work was deferred, or 0 if there were no pending
 * items to delay the job.
 */
static int ds_add_work(struct deferred_set *ds, struct list_head *work)
{
        int r = 1;
        unsigned long flags;
        unsigned next_entry;

        spin_lock_irqsave(&ds->lock, flags);
        if ((ds->sweeper == ds->current_entry) &&
            !ds->entries[ds->current_entry].count)
                r = 0;
        else {
                list_add(work, &ds->entries[ds->current_entry].work_items);
                next_entry = ds_next(ds->current_entry);
                if (!ds->entries[next_entry].count)
                        ds->current_entry = next_entry;
        }
        spin_unlock_irqrestore(&ds->lock, flags);

        return r;
}
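
/*
 * Lifecycle sketch: a read of a shared block bumps the current entry
 * (ds_inc) and drops it again when the bio completes (ds_dec).  Work
 * that must wait for those earlier reads, such as inserting a new
 * mapping, is queued with ds_add_work() and is handed back to the
 * caller by __sweep() once all older entries have drained.
 */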

/*----------------------------------------------------------------*/

/*
 * Key building.
 */
static void build_data_key(struct dm_thin_device *td,
                           dm_block_t b, struct cell_key *key)
{
        key->virtual = 0;
        key->dev = dm_thin_dev_id(td);
        key->block = b;
}

static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
                              struct cell_key *key)
{
        key->virtual = 1;
        key->dev = dm_thin_dev_id(td);
        key->block = b;
}
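
/*
 * Virtual keys guard a block in a thin device's logical address space;
 * data keys guard a block on the pool's data device.  The virtual flag
 * stops the two key spaces from colliding in the prison.
 */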

/*----------------------------------------------------------------*/

/*
 * A pool device ties together a metadata device and a data device.  It
 * also provides the interface for creating and destroying internal
 * devices.
 */
struct dm_thin_new_mapping;

struct pool_features {
        unsigned zero_new_blocks:1;
        unsigned discard_enabled:1;
        unsigned discard_passdown:1;
};

struct pool {
        struct list_head list;
        struct dm_target *ti;   /* Only set if a pool target is bound */

        struct mapped_device *pool_md;
        struct block_device *md_dev;
        struct dm_pool_metadata *pmd;

        dm_block_t low_water_blocks;
        uint32_t sectors_per_block;
        int sectors_per_block_shift;

        struct pool_features pf;
        unsigned low_water_triggered:1; /* A dm event has been sent */
        unsigned no_free_space:1;       /* A -ENOSPC warning has been issued */

        struct bio_prison *prison;
        struct dm_kcopyd_client *copier;

        struct workqueue_struct *wq;
        struct work_struct worker;
        struct delayed_work waker;

        unsigned long last_commit_jiffies;
        unsigned ref_count;

        spinlock_t lock;
        struct bio_list deferred_bios;
        struct bio_list deferred_flush_bios;
        struct list_head prepared_mappings;
        struct list_head prepared_discards;

        struct bio_list retry_on_resume_list;

        struct deferred_set shared_read_ds;
        struct deferred_set all_io_ds;

        struct dm_thin_new_mapping *next_mapping;
        mempool_t *mapping_pool;
        mempool_t *endio_hook_pool;
};

/*
 * Target context for a pool.
 */
struct pool_c {
        struct dm_target *ti;
        struct pool *pool;
        struct dm_dev *data_dev;
        struct dm_dev *metadata_dev;
        struct dm_target_callbacks callbacks;

        dm_block_t low_water_blocks;
        struct pool_features pf;
};

/*
 * Target context for a thin.
 */
struct thin_c {
        struct dm_dev *pool_dev;
        struct dm_dev *origin_dev;
        dm_thin_id dev_id;

        struct pool *pool;
        struct dm_thin_device *td;
};

/*----------------------------------------------------------------*/

/*
 * A global list of pools that uses a struct mapped_device as a key.
 */
static struct dm_thin_pool_table {
        struct mutex mutex;
        struct list_head pools;
} dm_thin_pool_table;

static void pool_table_init(void)
{
        mutex_init(&dm_thin_pool_table.mutex);
        INIT_LIST_HEAD(&dm_thin_pool_table.pools);
}

static void __pool_table_insert(struct pool *pool)
{
        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
        list_add(&pool->list, &dm_thin_pool_table.pools);
}

static void __pool_table_remove(struct pool *pool)
{
        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
        list_del(&pool->list);
}

static struct pool *__pool_table_lookup(struct mapped_device *md)
{
        struct pool *pool = NULL, *tmp;

        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));

        list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
                if (tmp->pool_md == md) {
                        pool = tmp;
                        break;
                }
        }

        return pool;
}

static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
{
        struct pool *pool = NULL, *tmp;

        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));

        list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
                if (tmp->md_dev == md_dev) {
                        pool = tmp;
                        break;
                }
        }

        return pool;
}

/*----------------------------------------------------------------*/

struct dm_thin_endio_hook {
        struct thin_c *tc;
        struct deferred_entry *shared_read_entry;
        struct deferred_entry *all_io_entry;
        struct dm_thin_new_mapping *overwrite_mapping;
};

static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
{
        struct bio *bio;
        struct bio_list bios;

        bio_list_init(&bios);
        bio_list_merge(&bios, master);
        bio_list_init(master);

        while ((bio = bio_list_pop(&bios))) {
                struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;

                if (h->tc == tc)
                        bio_endio(bio, DM_ENDIO_REQUEUE);
                else
                        bio_list_add(master, bio);
        }
}

static void requeue_io(struct thin_c *tc)
{
        struct pool *pool = tc->pool;
        unsigned long flags;

        spin_lock_irqsave(&pool->lock, flags);
        __requeue_bio_list(tc, &pool->deferred_bios);
        __requeue_bio_list(tc, &pool->retry_on_resume_list);
        spin_unlock_irqrestore(&pool->lock, flags);
}

/*
 * This section of code contains the logic for processing a thin device's IO.
 * Much of the code depends on pool object resources (lists, workqueues, etc)
 * but most is exclusively called from the thin target rather than the thin-pool
 * target.
 */

static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
{
        sector_t block_nr = bio->bi_sector;

        if (tc->pool->sectors_per_block_shift < 0)
                (void) sector_div(block_nr, tc->pool->sectors_per_block);
        else
                block_nr >>= tc->pool->sectors_per_block_shift;

        return block_nr;
}

static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
{
        struct pool *pool = tc->pool;
        sector_t bi_sector = bio->bi_sector;

        bio->bi_bdev = tc->pool_dev->bdev;
        if (tc->pool->sectors_per_block_shift < 0)
                bio->bi_sector = (block * pool->sectors_per_block) +
                                 sector_div(bi_sector, pool->sectors_per_block);
        else
                bio->bi_sector = (block << pool->sectors_per_block_shift) |
                                (bi_sector & (pool->sectors_per_block - 1));
}
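
/*
 * For example, with 64KiB blocks sectors_per_block = 128 and
 * sectors_per_block_shift = 7, so a bio at sector 300 lands in virtual
 * block 2 (300 >> 7) and is remapped to data_block * 128 + (300 & 127),
 * i.e. 44 sectors into the data block.
 */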

static void remap_to_origin(struct thin_c *tc, struct bio *bio)
{
        bio->bi_bdev = tc->origin_dev->bdev;
}

static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
{
        return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
                dm_thin_changed_this_transaction(tc->td);
}

static void issue(struct thin_c *tc, struct bio *bio)
{
        struct pool *pool = tc->pool;
        unsigned long flags;

        /*
         * Batch together any FUA/FLUSH bios we find and then issue
         * a single commit for them in process_deferred_bios().
         */
        if (bio_triggers_commit(tc, bio)) {
                spin_lock_irqsave(&pool->lock, flags);
                bio_list_add(&pool->deferred_flush_bios, bio);
                spin_unlock_irqrestore(&pool->lock, flags);
        } else
                generic_make_request(bio);
}

static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
{
        remap_to_origin(tc, bio);
        issue(tc, bio);
}

static void remap_and_issue(struct thin_c *tc, struct bio *bio,
                            dm_block_t block)
{
        remap(tc, bio, block);
        issue(tc, bio);
}

/*
 * wake_worker() is used when new work is queued and when pool_resume is
 * ready to continue deferred IO processing.
 */
static void wake_worker(struct pool *pool)
{
        queue_work(pool->wq, &pool->worker);
}

/*----------------------------------------------------------------*/

/*
 * Bio endio functions.
 */
struct dm_thin_new_mapping {
        struct list_head list;

        unsigned quiesced:1;
        unsigned prepared:1;
        unsigned pass_discard:1;

        struct thin_c *tc;
        dm_block_t virt_block;
        dm_block_t data_block;
        struct dm_bio_prison_cell *cell, *cell2;
        int err;

        /*
         * If the bio covers the whole area of a block then we can avoid
         * zeroing or copying.  Instead this bio is hooked.  The bio will
         * still be in the cell, so care has to be taken to avoid issuing
         * the bio twice.
         */
        struct bio *bio;
        bio_end_io_t *saved_bi_end_io;
};

static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
{
        struct pool *pool = m->tc->pool;

        if (m->quiesced && m->prepared) {
                list_add(&m->list, &pool->prepared_mappings);
                wake_worker(pool);
        }
}
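
/*
 * A mapping is committed only once both flags are set: quiesced once
 * any prior reads of the old block have drained (see the deferred set
 * code), and prepared once the copy, zero or overwrite io has completed.
 */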

static void copy_complete(int read_err, unsigned long write_err, void *context)
{
        unsigned long flags;
        struct dm_thin_new_mapping *m = context;
        struct pool *pool = m->tc->pool;

        m->err = read_err || write_err ? -EIO : 0;

        spin_lock_irqsave(&pool->lock, flags);
        m->prepared = 1;
        __maybe_add_mapping(m);
        spin_unlock_irqrestore(&pool->lock, flags);
}

static void overwrite_endio(struct bio *bio, int err)
{
        unsigned long flags;
        struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
        struct dm_thin_new_mapping *m = h->overwrite_mapping;
        struct pool *pool = m->tc->pool;

        m->err = err;

        spin_lock_irqsave(&pool->lock, flags);
        m->prepared = 1;
        __maybe_add_mapping(m);
        spin_unlock_irqrestore(&pool->lock, flags);
}

/*----------------------------------------------------------------*/

/*
 * Workqueue.
 */

/*
 * Prepared mapping jobs.
 */

/*
 * This sends the bios in the cell back to the deferred_bios list.
 */
static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell,
                       dm_block_t data_block)
{
        struct pool *pool = tc->pool;
        unsigned long flags;

        spin_lock_irqsave(&pool->lock, flags);
        cell_release(cell, &pool->deferred_bios);
        spin_unlock_irqrestore(&pool->lock, flags);

        wake_worker(pool);
}

/*
 * Same as cell_defer above, except it omits one particular detainee:
 * a write bio that covers the block and has already been processed.
 */
static void cell_defer_except(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
        struct pool *pool = tc->pool;
        unsigned long flags;

        spin_lock_irqsave(&pool->lock, flags);
        cell_release_no_holder(cell, &pool->deferred_bios);
        spin_unlock_irqrestore(&pool->lock, flags);

        wake_worker(pool);
}

static void process_prepared_mapping(struct dm_thin_new_mapping *m)
{
        struct thin_c *tc = m->tc;
        struct bio *bio;
        int r;

        bio = m->bio;
        if (bio)
                bio->bi_end_io = m->saved_bi_end_io;

        if (m->err) {
                cell_error(m->cell);
                goto out;
        }

        /*
         * Commit the prepared block into the mapping btree.
         * Any I/O for this block arriving after this point will get
         * remapped to it directly.
         */
        r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
        if (r) {
                DMERR("dm_thin_insert_block() failed");
                cell_error(m->cell);
                goto out;
        }

        /*
         * Release any bios held while the block was being provisioned.
         * If we are processing a write bio that completely covers the block,
         * we already processed it so can ignore it now when processing
         * the bios in the cell.
         */
        if (bio) {
                cell_defer_except(tc, m->cell);
                bio_endio(bio, 0);
        } else
                cell_defer(tc, m->cell, m->data_block);

out:
        list_del(&m->list);
        mempool_free(m, tc->pool->mapping_pool);
}

static void process_prepared_discard(struct dm_thin_new_mapping *m)
{
        int r;
        struct thin_c *tc = m->tc;

        r = dm_thin_remove_block(tc->td, m->virt_block);
        if (r)
                DMERR("dm_thin_remove_block() failed");

        /*
         * Pass the discard down to the underlying device?
         */
        if (m->pass_discard)
                remap_and_issue(tc, m->bio, m->data_block);
        else
                bio_endio(m->bio, 0);

        cell_defer_except(tc, m->cell);
        cell_defer_except(tc, m->cell2);
        mempool_free(m, tc->pool->mapping_pool);
}

static void process_prepared(struct pool *pool, struct list_head *head,
                             void (*fn)(struct dm_thin_new_mapping *))
{
        unsigned long flags;
        struct list_head maps;
        struct dm_thin_new_mapping *m, *tmp;

        INIT_LIST_HEAD(&maps);
        spin_lock_irqsave(&pool->lock, flags);
        list_splice_init(head, &maps);
        spin_unlock_irqrestore(&pool->lock, flags);

        list_for_each_entry_safe(m, tmp, &maps, list)
                fn(m);
}

/*
 * Deferred bio jobs.
 */
static int io_overlaps_block(struct pool *pool, struct bio *bio)
{
        return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT);
}

static int io_overwrites_block(struct pool *pool, struct bio *bio)
{
        return (bio_data_dir(bio) == WRITE) &&
                io_overlaps_block(pool, bio);
}

static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
                               bio_end_io_t *fn)
{
        *save = bio->bi_end_io;
        bio->bi_end_io = fn;
}

static int ensure_next_mapping(struct pool *pool)
{
        if (pool->next_mapping)
                return 0;

        pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);

        return pool->next_mapping ? 0 : -ENOMEM;
}
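
/*
 * GFP_ATOMIC because this runs from the worker and must not sleep.  If
 * the allocation fails, process_deferred_bios() pushes the bios back
 * onto the deferred list until prepared mappings have been processed,
 * each of which returns a struct to the mapping_pool.
 */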

static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
{
        struct dm_thin_new_mapping *r = pool->next_mapping;

        BUG_ON(!pool->next_mapping);

        pool->next_mapping = NULL;

        return r;
}

static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
                          struct dm_dev *origin, dm_block_t data_origin,
                          dm_block_t data_dest,
                          struct dm_bio_prison_cell *cell, struct bio *bio)
{
        int r;
        struct pool *pool = tc->pool;
        struct dm_thin_new_mapping *m = get_next_mapping(pool);

        INIT_LIST_HEAD(&m->list);
        m->quiesced = 0;
        m->prepared = 0;
        m->tc = tc;
        m->virt_block = virt_block;
        m->data_block = data_dest;
        m->cell = cell;
        m->err = 0;
        m->bio = NULL;

        if (!ds_add_work(&pool->shared_read_ds, &m->list))
                m->quiesced = 1;

        /*
         * IO to pool_dev remaps to the pool target's data_dev.
         *
         * If the whole block of data is being overwritten, we can issue the
         * bio immediately. Otherwise we use kcopyd to clone the data first.
         */
        if (io_overwrites_block(pool, bio)) {
                struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;

                h->overwrite_mapping = m;
                m->bio = bio;
                save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
                remap_and_issue(tc, bio, data_dest);
        } else {
                struct dm_io_region from, to;

                from.bdev = origin->bdev;
                from.sector = data_origin * pool->sectors_per_block;
                from.count = pool->sectors_per_block;

                to.bdev = tc->pool_dev->bdev;
                to.sector = data_dest * pool->sectors_per_block;
                to.count = pool->sectors_per_block;

                r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
                                   0, copy_complete, m);
                if (r < 0) {
                        mempool_free(m, pool->mapping_pool);
                        DMERR("dm_kcopyd_copy() failed");
                        cell_error(cell);
                }
        }
}

static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
                                   dm_block_t data_origin, dm_block_t data_dest,
                                   struct dm_bio_prison_cell *cell, struct bio *bio)
{
        schedule_copy(tc, virt_block, tc->pool_dev,
                      data_origin, data_dest, cell, bio);
}

static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
                                   dm_block_t data_dest,
                                   struct dm_bio_prison_cell *cell, struct bio *bio)
{
        schedule_copy(tc, virt_block, tc->origin_dev,
                      virt_block, data_dest, cell, bio);
}

static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
                          dm_block_t data_block, struct dm_bio_prison_cell *cell,
                          struct bio *bio)
{
        struct pool *pool = tc->pool;
        struct dm_thin_new_mapping *m = get_next_mapping(pool);

        INIT_LIST_HEAD(&m->list);
        m->quiesced = 1;
        m->prepared = 0;
        m->tc = tc;
        m->virt_block = virt_block;
        m->data_block = data_block;
        m->cell = cell;
        m->err = 0;
        m->bio = NULL;

        /*
         * If the whole block of data is being overwritten or we are not
         * zeroing pre-existing data, we can issue the bio immediately.
         * Otherwise we use kcopyd to zero the data first.
         */
        if (!pool->pf.zero_new_blocks)
                process_prepared_mapping(m);

        else if (io_overwrites_block(pool, bio)) {
                struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;

                h->overwrite_mapping = m;
                m->bio = bio;
                save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
                remap_and_issue(tc, bio, data_block);
        } else {
                int r;
                struct dm_io_region to;

                to.bdev = tc->pool_dev->bdev;
                to.sector = data_block * pool->sectors_per_block;
                to.count = pool->sectors_per_block;

                r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
                if (r < 0) {
                        mempool_free(m, pool->mapping_pool);
                        DMERR("dm_kcopyd_zero() failed");
                        cell_error(cell);
                }
        }
}

static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
{
        int r;
        dm_block_t free_blocks;
        unsigned long flags;
        struct pool *pool = tc->pool;

        r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
        if (r)
                return r;

        if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
                DMWARN("%s: reached low water mark, sending event.",
                       dm_device_name(pool->pool_md));
                spin_lock_irqsave(&pool->lock, flags);
                pool->low_water_triggered = 1;
                spin_unlock_irqrestore(&pool->lock, flags);
                dm_table_event(pool->ti->table);
        }

        if (!free_blocks) {
                if (pool->no_free_space)
                        return -ENOSPC;
                else {
                        /*
                         * Try to commit to see if that will free up some
                         * more space.
                         */
                        r = dm_pool_commit_metadata(pool->pmd);
                        if (r) {
                                DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
                                      __func__, r);
                                return r;
                        }

                        r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
                        if (r)
                                return r;

                        /*
                         * If we still have no space we set a flag to avoid
                         * doing all this checking and return -ENOSPC.
                         */
                        if (!free_blocks) {
                                DMWARN("%s: no free space available.",
                                       dm_device_name(pool->pool_md));
                                spin_lock_irqsave(&pool->lock, flags);
                                pool->no_free_space = 1;
                                spin_unlock_irqrestore(&pool->lock, flags);
                                return -ENOSPC;
                        }
                }
        }

        r = dm_pool_alloc_data_block(pool->pmd, result);
        if (r)
                return r;

        return 0;
}

/*
 * If we have run out of space, queue bios until the device is
 * resumed, presumably after having been reloaded with more space.
 */
static void retry_on_resume(struct bio *bio)
{
        struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
        struct thin_c *tc = h->tc;
        struct pool *pool = tc->pool;
        unsigned long flags;

        spin_lock_irqsave(&pool->lock, flags);
        bio_list_add(&pool->retry_on_resume_list, bio);
        spin_unlock_irqrestore(&pool->lock, flags);
}

static void no_space(struct dm_bio_prison_cell *cell)
{
        struct bio *bio;
        struct bio_list bios;

        bio_list_init(&bios);
        cell_release(cell, &bios);

        while ((bio = bio_list_pop(&bios)))
                retry_on_resume(bio);
}
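
/*
 * These bios sit on retry_on_resume_list until the pool is resumed and
 * __requeue_bios() (below) moves them back onto deferred_bios,
 * typically after the pool has been reloaded with a larger data device.
 */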

static void process_discard(struct thin_c *tc, struct bio *bio)
{
        int r;
        unsigned long flags;
        struct pool *pool = tc->pool;
        struct dm_bio_prison_cell *cell, *cell2;
        struct cell_key key, key2;
        dm_block_t block = get_bio_block(tc, bio);
        struct dm_thin_lookup_result lookup_result;
        struct dm_thin_new_mapping *m;

        build_virtual_key(tc->td, block, &key);
        if (bio_detain(tc->pool->prison, &key, bio, &cell))
                return;

        r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
        switch (r) {
        case 0:
                /*
                 * Check nobody is fiddling with this pool block.  This can
                 * happen if someone's in the process of breaking sharing
                 * on this block.
                 */
                build_data_key(tc->td, lookup_result.block, &key2);
                if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
                        cell_release_singleton(cell, bio);
                        break;
                }

                if (io_overlaps_block(pool, bio)) {
                        /*
                         * IO may still be going to the destination block.  We must
                         * quiesce before we can do the removal.
                         */
                        m = get_next_mapping(pool);
                        m->tc = tc;
                        m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown;
                        m->virt_block = block;
                        m->data_block = lookup_result.block;
                        m->cell = cell;
                        m->cell2 = cell2;
                        m->err = 0;
                        m->bio = bio;

                        if (!ds_add_work(&pool->all_io_ds, &m->list)) {
                                spin_lock_irqsave(&pool->lock, flags);
                                list_add(&m->list, &pool->prepared_discards);
                                spin_unlock_irqrestore(&pool->lock, flags);
                                wake_worker(pool);
                        }
                } else {
                        /*
                         * The DM core makes sure that the discard doesn't span
                         * a block boundary.  So we submit the discard of a
                         * partial block appropriately.
                         */
                        cell_release_singleton(cell, bio);
                        cell_release_singleton(cell2, bio);
                        if ((!lookup_result.shared) && pool->pf.discard_passdown)
                                remap_and_issue(tc, bio, lookup_result.block);
                        else
                                bio_endio(bio, 0);
                }
                break;

        case -ENODATA:
                /*
                 * It isn't provisioned, just forget it.
                 */
                cell_release_singleton(cell, bio);
                bio_endio(bio, 0);
                break;

        default:
                DMERR("discard: find block unexpectedly returned %d", r);
                cell_release_singleton(cell, bio);
                bio_io_error(bio);
                break;
        }
}

static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
                          struct cell_key *key,
                          struct dm_thin_lookup_result *lookup_result,
                          struct dm_bio_prison_cell *cell)
{
        int r;
        dm_block_t data_block;

        r = alloc_data_block(tc, &data_block);
        switch (r) {
        case 0:
                schedule_internal_copy(tc, block, lookup_result->block,
                                       data_block, cell, bio);
                break;

        case -ENOSPC:
                no_space(cell);
                break;

        default:
                DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
                cell_error(cell);
                break;
        }
}

static void process_shared_bio(struct thin_c *tc, struct bio *bio,
                               dm_block_t block,
                               struct dm_thin_lookup_result *lookup_result)
{
        struct dm_bio_prison_cell *cell;
        struct pool *pool = tc->pool;
        struct cell_key key;

        /*
         * If cell is already occupied, then sharing is already in the process
         * of being broken so we have nothing further to do here.
         */
        build_data_key(tc->td, lookup_result->block, &key);
        if (bio_detain(pool->prison, &key, bio, &cell))
                return;

        if (bio_data_dir(bio) == WRITE && bio->bi_size)
                break_sharing(tc, bio, block, &key, lookup_result, cell);
        else {
                struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;

                h->shared_read_entry = ds_inc(&pool->shared_read_ds);

                cell_release_singleton(cell, bio);
                remap_and_issue(tc, bio, lookup_result->block);
        }
}

static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
                            struct dm_bio_prison_cell *cell)
{
        int r;
        dm_block_t data_block;

        /*
         * Remap empty bios (flushes) immediately, without provisioning.
         */
        if (!bio->bi_size) {
                cell_release_singleton(cell, bio);
                remap_and_issue(tc, bio, 0);
                return;
        }

        /*
         * Fill read bios with zeroes and complete them immediately.
         */
        if (bio_data_dir(bio) == READ) {
                zero_fill_bio(bio);
                cell_release_singleton(cell, bio);
                bio_endio(bio, 0);
                return;
        }

        r = alloc_data_block(tc, &data_block);
        switch (r) {
        case 0:
                if (tc->origin_dev)
                        schedule_external_copy(tc, block, data_block, cell, bio);
                else
                        schedule_zero(tc, block, data_block, cell, bio);
                break;

        case -ENOSPC:
                no_space(cell);
                break;

        default:
                DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
                cell_error(cell);
                break;
        }
}

static void process_bio(struct thin_c *tc, struct bio *bio)
{
        int r;
        dm_block_t block = get_bio_block(tc, bio);
        struct dm_bio_prison_cell *cell;
        struct cell_key key;
        struct dm_thin_lookup_result lookup_result;

        /*
         * If cell is already occupied, then the block is already
         * being provisioned so we have nothing further to do here.
         */
        build_virtual_key(tc->td, block, &key);
        if (bio_detain(tc->pool->prison, &key, bio, &cell))
                return;

        r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
        switch (r) {
        case 0:
                /*
                 * We can release this cell now.  This thread is the only
                 * one that puts bios into a cell, and we know there were
                 * no preceding bios.
                 */
                /*
                 * TODO: this will probably have to change when discard goes
                 * back in.
                 */
                cell_release_singleton(cell, bio);

                if (lookup_result.shared)
                        process_shared_bio(tc, bio, block, &lookup_result);
                else
                        remap_and_issue(tc, bio, lookup_result.block);
                break;

        case -ENODATA:
                if (bio_data_dir(bio) == READ && tc->origin_dev) {
                        cell_release_singleton(cell, bio);
                        remap_to_origin_and_issue(tc, bio);
                } else
                        provision_block(tc, bio, block, cell);
                break;

        default:
                DMERR("dm_thin_find_block() failed, error = %d", r);
                cell_release_singleton(cell, bio);
                bio_io_error(bio);
                break;
        }
}

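/*
 * The first comparison catches jiffies wrap-around; the second is the
 * normal "COMMIT_PERIOD has elapsed since the last commit" test.
 */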
static int need_commit_due_to_time(struct pool *pool)
{
        return jiffies < pool->last_commit_jiffies ||
               jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
}

static void process_deferred_bios(struct pool *pool)
{
        unsigned long flags;
        struct bio *bio;
        struct bio_list bios;
        int r;

        bio_list_init(&bios);

        spin_lock_irqsave(&pool->lock, flags);
        bio_list_merge(&bios, &pool->deferred_bios);
        bio_list_init(&pool->deferred_bios);
        spin_unlock_irqrestore(&pool->lock, flags);

        while ((bio = bio_list_pop(&bios))) {
                struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
                struct thin_c *tc = h->tc;

                /*
                 * If we've got no free new_mapping structs, and processing
                 * this bio might require one, we pause until there are some
                 * prepared mappings to process.
                 */
                if (ensure_next_mapping(pool)) {
                        spin_lock_irqsave(&pool->lock, flags);
                        bio_list_merge(&pool->deferred_bios, &bios);
                        spin_unlock_irqrestore(&pool->lock, flags);

                        break;
                }

                if (bio->bi_rw & REQ_DISCARD)
                        process_discard(tc, bio);
                else
                        process_bio(tc, bio);
        }

        /*
         * If there are any deferred flush bios, we must commit
         * the metadata before issuing them.
         */
        bio_list_init(&bios);
        spin_lock_irqsave(&pool->lock, flags);
        bio_list_merge(&bios, &pool->deferred_flush_bios);
        bio_list_init(&pool->deferred_flush_bios);
        spin_unlock_irqrestore(&pool->lock, flags);

        if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
                return;

        r = dm_pool_commit_metadata(pool->pmd);
        if (r) {
                DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
                      __func__, r);
                while ((bio = bio_list_pop(&bios)))
                        bio_io_error(bio);
                return;
        }
        pool->last_commit_jiffies = jiffies;

        while ((bio = bio_list_pop(&bios)))
                generic_make_request(bio);
}

static void do_worker(struct work_struct *ws)
{
        struct pool *pool = container_of(ws, struct pool, worker);

        process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping);
        process_prepared(pool, &pool->prepared_discards, process_prepared_discard);
        process_deferred_bios(pool);
}

/*
 * We want to commit periodically so that not too much
 * unwritten data builds up.
 */
static void do_waker(struct work_struct *ws)
{
        struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
        wake_worker(pool);
        queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
}

/*----------------------------------------------------------------*/

/*
 * Mapping functions.
 */

/*
 * Called only while mapping a thin bio to hand it over to the workqueue.
 */
static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
{
        unsigned long flags;
        struct pool *pool = tc->pool;

        spin_lock_irqsave(&pool->lock, flags);
        bio_list_add(&pool->deferred_bios, bio);
        spin_unlock_irqrestore(&pool->lock, flags);

        wake_worker(pool);
}

static struct dm_thin_endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio)
{
        struct pool *pool = tc->pool;
        struct dm_thin_endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);

        h->tc = tc;
        h->shared_read_entry = NULL;
        h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds);
        h->overwrite_mapping = NULL;

        return h;
}

/*
 * Non-blocking function called from the thin target's map function.
 */
static int thin_bio_map(struct dm_target *ti, struct bio *bio,
                        union map_info *map_context)
{
        int r;
        struct thin_c *tc = ti->private;
        dm_block_t block = get_bio_block(tc, bio);
        struct dm_thin_device *td = tc->td;
        struct dm_thin_lookup_result result;

        map_context->ptr = thin_hook_bio(tc, bio);
        if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
                thin_defer_bio(tc, bio);
                return DM_MAPIO_SUBMITTED;
        }

        r = dm_thin_find_block(td, block, 0, &result);

        /*
         * Note that we defer readahead too.
         */
        switch (r) {
        case 0:
                if (unlikely(result.shared)) {
                        /*
                         * We have a race condition here between the
                         * result.shared value returned by the lookup and
                         * snapshot creation, which may cause new
                         * sharing.
                         *
                         * To avoid this always quiesce the origin before
                         * taking the snap.  You want to do this anyway to
                         * ensure a consistent application view
                         * (i.e. lockfs).
                         *
                         * More distant ancestors are irrelevant. The
                         * shared flag will be set in their case.
                         */
                        thin_defer_bio(tc, bio);
                        r = DM_MAPIO_SUBMITTED;
                } else {
                        remap(tc, bio, result.block);
                        r = DM_MAPIO_REMAPPED;
                }
                break;

        case -ENODATA:
                /*
                 * In future, the failed dm_thin_find_block above could
                 * provide the hint to load the metadata into cache.
                 */
        case -EWOULDBLOCK:
                thin_defer_bio(tc, bio);
                r = DM_MAPIO_SUBMITTED;
                break;
        }

        return r;
}
1618
1619 static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1620 {
1621         int r;
1622         unsigned long flags;
1623         struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
1624
1625         spin_lock_irqsave(&pt->pool->lock, flags);
1626         r = !bio_list_empty(&pt->pool->retry_on_resume_list);
1627         spin_unlock_irqrestore(&pt->pool->lock, flags);
1628
1629         if (!r) {
1630                 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1631                 r = bdi_congested(&q->backing_dev_info, bdi_bits);
1632         }
1633
1634         return r;
1635 }
1636
1637 static void __requeue_bios(struct pool *pool)
1638 {
1639         bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list);
1640         bio_list_init(&pool->retry_on_resume_list);
1641 }
1642
1643 /*----------------------------------------------------------------
1644  * Binding of control targets to a pool object
1645  *--------------------------------------------------------------*/
1646 static int bind_control_target(struct pool *pool, struct dm_target *ti)
1647 {
1648         struct pool_c *pt = ti->private;
1649
1650         pool->ti = ti;
1651         pool->low_water_blocks = pt->low_water_blocks;
1652         pool->pf = pt->pf;
1653
1654         /*
1655          * If discard_passdown was enabled, verify that the data device
1656          * supports discards.  Disable discard_passdown if not; otherwise
1657          * discards passed down to the data device would fail with -EOPNOTSUPP.
1658          */
1659         if (pt->pf.discard_passdown) {
1660                 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1661                 if (!q || !blk_queue_discard(q)) {
1662                         char buf[BDEVNAME_SIZE];
1663                         DMWARN("Discard unsupported by data device (%s): Disabling discard passdown.",
1664                                bdevname(pt->data_dev->bdev, buf));
1665                         pool->pf.discard_passdown = 0;
1666                 }
1667         }
1668
1669         return 0;
1670 }
1671
1672 static void unbind_control_target(struct pool *pool, struct dm_target *ti)
1673 {
1674         if (pool->ti == ti)
1675                 pool->ti = NULL;
1676 }
1677
1678 /*----------------------------------------------------------------
1679  * Pool creation
1680  *--------------------------------------------------------------*/
1681 /* Initialize pool features. */
1682 static void pool_features_init(struct pool_features *pf)
1683 {
1684         pf->zero_new_blocks = 1;
1685         pf->discard_enabled = 1;
1686         pf->discard_passdown = 1;
1687 }
1688
1689 static void __pool_destroy(struct pool *pool)
1690 {
1691         __pool_table_remove(pool);
1692
1693         if (dm_pool_metadata_close(pool->pmd) < 0)
1694                 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1695
1696         prison_destroy(pool->prison);
1697         dm_kcopyd_client_destroy(pool->copier);
1698
1699         if (pool->wq)
1700                 destroy_workqueue(pool->wq);
1701
1702         if (pool->next_mapping)
1703                 mempool_free(pool->next_mapping, pool->mapping_pool);
1704         mempool_destroy(pool->mapping_pool);
1705         mempool_destroy(pool->endio_hook_pool);
1706         kfree(pool);
1707 }
1708
1709 static struct kmem_cache *_new_mapping_cache;
1710 static struct kmem_cache *_endio_hook_cache;
1711
1712 static struct pool *pool_create(struct mapped_device *pool_md,
1713                                 struct block_device *metadata_dev,
1714                                 unsigned long block_size, char **error)
1715 {
1716         int r;
1717         void *err_p;
1718         struct pool *pool;
1719         struct dm_pool_metadata *pmd;
1720
1721         pmd = dm_pool_metadata_open(metadata_dev, block_size, true);
1722         if (IS_ERR(pmd)) {
1723                 *error = "Error creating metadata object";
1724                 return (struct pool *)pmd;
1725         }
1726
1727         pool = kmalloc(sizeof(*pool), GFP_KERNEL);
1728         if (!pool) {
1729                 *error = "Error allocating memory for pool";
1730                 err_p = ERR_PTR(-ENOMEM);
1731                 goto bad_pool;
1732         }
1733
1734         pool->pmd = pmd;
1735         pool->sectors_per_block = block_size;
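        /*
         * Cache log2(block_size) when the block size is a power of two so
         * that block arithmetic (e.g. get_bio_block()) can use a shift;
         * -1 selects the slower sector_div() path for other block sizes.
         */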
1736         if (block_size & (block_size - 1))
1737                 pool->sectors_per_block_shift = -1;
1738         else
1739                 pool->sectors_per_block_shift = __ffs(block_size);
1740         pool->low_water_blocks = 0;
1741         pool_features_init(&pool->pf);
1742         pool->prison = prison_create(PRISON_CELLS);
1743         if (!pool->prison) {
1744                 *error = "Error creating pool's bio prison";
1745                 err_p = ERR_PTR(-ENOMEM);
1746                 goto bad_prison;
1747         }
1748
1749         pool->copier = dm_kcopyd_client_create();
1750         if (IS_ERR(pool->copier)) {
1751                 r = PTR_ERR(pool->copier);
1752                 *error = "Error creating pool's kcopyd client";
1753                 err_p = ERR_PTR(r);
1754                 goto bad_kcopyd_client;
1755         }
1756
1757         /*
1758          * Create a single-threaded workqueue that will service all devices
1759          * that use this metadata.
1760          */
1761         pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
1762         if (!pool->wq) {
1763                 *error = "Error creating pool's workqueue";
1764                 err_p = ERR_PTR(-ENOMEM);
1765                 goto bad_wq;
1766         }
1767
1768         INIT_WORK(&pool->worker, do_worker);
1769         INIT_DELAYED_WORK(&pool->waker, do_waker);
1770         spin_lock_init(&pool->lock);
1771         bio_list_init(&pool->deferred_bios);
1772         bio_list_init(&pool->deferred_flush_bios);
1773         INIT_LIST_HEAD(&pool->prepared_mappings);
1774         INIT_LIST_HEAD(&pool->prepared_discards);
1775         pool->low_water_triggered = 0;
1776         pool->no_free_space = 0;
1777         bio_list_init(&pool->retry_on_resume_list);
1778         ds_init(&pool->shared_read_ds);
1779         ds_init(&pool->all_io_ds);
1780
1781         pool->next_mapping = NULL;
1782         pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE,
1783                                                       _new_mapping_cache);
1784         if (!pool->mapping_pool) {
1785                 *error = "Error creating pool's mapping mempool";
1786                 err_p = ERR_PTR(-ENOMEM);
1787                 goto bad_mapping_pool;
1788         }
1789
1790         pool->endio_hook_pool = mempool_create_slab_pool(ENDIO_HOOK_POOL_SIZE,
1791                                                          _endio_hook_cache);
1792         if (!pool->endio_hook_pool) {
1793                 *error = "Error creating pool's endio_hook mempool";
1794                 err_p = ERR_PTR(-ENOMEM);
1795                 goto bad_endio_hook_pool;
1796         }
1797         pool->ref_count = 1;
1798         pool->last_commit_jiffies = jiffies;
1799         pool->pool_md = pool_md;
1800         pool->md_dev = metadata_dev;
1801         __pool_table_insert(pool);
1802
1803         return pool;
1804
1805 bad_endio_hook_pool:
1806         mempool_destroy(pool->mapping_pool);
1807 bad_mapping_pool:
1808         destroy_workqueue(pool->wq);
1809 bad_wq:
1810         dm_kcopyd_client_destroy(pool->copier);
1811 bad_kcopyd_client:
1812         prison_destroy(pool->prison);
1813 bad_prison:
1814         kfree(pool);
1815 bad_pool:
1816         if (dm_pool_metadata_close(pmd))
1817                 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1818
1819         return err_p;
1820 }
1821
1822 static void __pool_inc(struct pool *pool)
1823 {
1824         BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
1825         pool->ref_count++;
1826 }
1827
1828 static void __pool_dec(struct pool *pool)
1829 {
1830         BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
1831         BUG_ON(!pool->ref_count);
1832         if (!--pool->ref_count)
1833                 __pool_destroy(pool);
1834 }
1835
1836 static struct pool *__pool_find(struct mapped_device *pool_md,
1837                                 struct block_device *metadata_dev,
1838                                 unsigned long block_size, char **error,
1839                                 int *created)
1840 {
1841         struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
1842
1843         if (pool) {
1844                 if (pool->pool_md != pool_md) {
1845                         *error = "metadata device already in use by a pool";
1846                         return ERR_PTR(-EBUSY);
1847                 }
1848                 __pool_inc(pool);
1849
1850         } else {
1851                 pool = __pool_table_lookup(pool_md);
1852                 if (pool) {
1853                         if (pool->md_dev != metadata_dev) {
1854                                 *error = "different pool cannot replace a pool";
1855                                 return ERR_PTR(-EINVAL);
1856                         }
1857                         __pool_inc(pool);
1858
1859                 } else {
1860                         pool = pool_create(pool_md, metadata_dev, block_size, error);
1861                         *created = 1;
1862                 }
1863         }
1864
1865         return pool;
1866 }
1867
1868 /*----------------------------------------------------------------
1869  * Pool target methods
1870  *--------------------------------------------------------------*/
1871 static void pool_dtr(struct dm_target *ti)
1872 {
1873         struct pool_c *pt = ti->private;
1874
1875         mutex_lock(&dm_thin_pool_table.mutex);
1876
1877         unbind_control_target(pt->pool, ti);
1878         __pool_dec(pt->pool);
1879         dm_put_device(ti, pt->metadata_dev);
1880         dm_put_device(ti, pt->data_dev);
1881         kfree(pt);
1882
1883         mutex_unlock(&dm_thin_pool_table.mutex);
1884 }
1885
1886 static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1887                                struct dm_target *ti)
1888 {
1889         int r;
1890         unsigned argc;
1891         const char *arg_name;
1892
1893         static struct dm_arg _args[] = {
1894                 {0, 3, "Invalid number of pool feature arguments"},
1895         };
1896
1897         /*
1898          * No feature arguments supplied.
1899          */
1900         if (!as->argc)
1901                 return 0;
1902
1903         r = dm_read_arg_group(_args, as, &argc, &ti->error);
1904         if (r)
1905                 return -EINVAL;
1906
1907         while (argc && !r) {
1908                 arg_name = dm_shift_arg(as);
1909                 argc--;
1910
1911                 if (!strcasecmp(arg_name, "skip_block_zeroing")) {
1912                         pf->zero_new_blocks = 0;
1913                         continue;
1914                 } else if (!strcasecmp(arg_name, "ignore_discard")) {
1915                         pf->discard_enabled = 0;
1916                         continue;
1917                 } else if (!strcasecmp(arg_name, "no_discard_passdown")) {
1918                         pf->discard_passdown = 0;
1919                         continue;
1920                 }
1921
1922                 ti->error = "Unrecognised pool feature requested";
1923                 r = -EINVAL;
1924         }
1925
1926         return r;
1927 }
1928
1929 /*
1930  * thin-pool <metadata dev> <data dev>
1931  *           <data block size (sectors)>
1932  *           <low water mark (blocks)>
1933  *           [<#feature args> [<arg>]*]
1934  *
1935  * Optional feature arguments are:
1936  *           skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
1937  *           ignore_discard: disable discard
1938  *           no_discard_passdown: don't pass discards down to the data device
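 *
 * For example (an illustrative sketch; the device names are hypothetical),
 * a 10GiB (20971520-sector) pool with 64KiB (128-sector) blocks and a low
 * water mark of 32768 blocks could be loaded with:
 *
 *   dmsetup create pool \
 *     --table "0 20971520 thin-pool /dev/sdb /dev/sdc 128 32768 1 skip_block_zeroing"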
1939  */
1940 static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1941 {
1942         int r, pool_created = 0;
1943         struct pool_c *pt;
1944         struct pool *pool;
1945         struct pool_features pf;
1946         struct dm_arg_set as;
1947         struct dm_dev *data_dev;
1948         unsigned long block_size;
1949         dm_block_t low_water_blocks;
1950         struct dm_dev *metadata_dev;
1951         sector_t metadata_dev_size;
1952         char b[BDEVNAME_SIZE];
1953
1954         /*
1955          * FIXME Remove validation from scope of lock.
1956          */
1957         mutex_lock(&dm_thin_pool_table.mutex);
1958
1959         if (argc < 4) {
1960                 ti->error = "Invalid argument count";
1961                 r = -EINVAL;
1962                 goto out_unlock;
1963         }
1964         as.argc = argc;
1965         as.argv = argv;
1966
1967         r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev);
1968         if (r) {
1969                 ti->error = "Error opening metadata block device";
1970                 goto out_unlock;
1971         }
1972
1973         metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
1974         if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
1975                 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1976                        bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
1977
1978         r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
1979         if (r) {
1980                 ti->error = "Error getting data device";
1981                 goto out_metadata;
1982         }
1983
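        /*
         * The data block size is given in 512-byte sectors and must be a
         * multiple of 64KiB between 64KiB and 1GiB: e.g. 128 sectors
         * (64KiB) passes, while 192 sectors trips the alignment check.
         */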
1984         if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
1985             block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1986             block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
1987             block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
1988                 ti->error = "Invalid block size";
1989                 r = -EINVAL;
1990                 goto out;
1991         }
1992
1993         if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
1994                 ti->error = "Invalid low water mark";
1995                 r = -EINVAL;
1996                 goto out;
1997         }
1998
1999         /*
2000          * Set default pool features.
2001          */
2002         pool_features_init(&pf);
2003
2004         dm_consume_args(&as, 4);
2005         r = parse_pool_features(&as, &pf, ti);
2006         if (r)
2007                 goto out;
2008
2009         pt = kzalloc(sizeof(*pt), GFP_KERNEL);
2010         if (!pt) {
2011                 r = -ENOMEM;
2012                 goto out;
2013         }
2014
2015         pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
2016                            block_size, &ti->error, &pool_created);
2017         if (IS_ERR(pool)) {
2018                 r = PTR_ERR(pool);
2019                 goto out_free_pt;
2020         }
2021
2022         /*
2023          * 'pool_created' reflects whether this is the first table load.
2024          * Top-level discard support is not allowed to be changed after the
2025          * initial load: changing it would require a pool reload in order to
2026          * propagate the change to the thin devices.
2027          */
2028         if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
2029                 ti->error = "Discard support cannot be disabled once enabled";
2030                 r = -EINVAL;
2031                 goto out_flags_changed;
2032         }
2033
2034         /*
2035          * The block layer requires discard_granularity to be a power of 2.
2036          */
2037         if (pf.discard_enabled && !is_power_of_2(block_size)) {
2038                 ti->error = "Discard support must be disabled when the block size is not a power of 2";
2039                 r = -EINVAL;
2040                 goto out_flags_changed;
2041         }
2042
2043         pt->pool = pool;
2044         pt->ti = ti;
2045         pt->metadata_dev = metadata_dev;
2046         pt->data_dev = data_dev;
2047         pt->low_water_blocks = low_water_blocks;
2048         pt->pf = pf;
2049         ti->num_flush_requests = 1;
2050         /*
2051          * Only need to enable discards if the pool should pass
2052          * them down to the data device.  The thin device's discard
2053          * processing will cause mappings to be removed from the btree.
2054          */
2055         if (pf.discard_enabled && pf.discard_passdown) {
2056                 ti->num_discard_requests = 1;
2057                 /*
2058                  * Setting 'discards_supported' circumvents the normal
2059                  * stacking of discard limits (this keeps the pool and
2060                  * thin devices' discard limits consistent).
2061                  */
2062                 ti->discards_supported = true;
2063         }
2064         ti->private = pt;
2065
2066         pt->callbacks.congested_fn = pool_is_congested;
2067         dm_table_add_target_callbacks(ti->table, &pt->callbacks);
2068
2069         mutex_unlock(&dm_thin_pool_table.mutex);
2070
2071         return 0;
2072
2073 out_flags_changed:
2074         __pool_dec(pool);
2075 out_free_pt:
2076         kfree(pt);
2077 out:
2078         dm_put_device(ti, data_dev);
2079 out_metadata:
2080         dm_put_device(ti, metadata_dev);
2081 out_unlock:
2082         mutex_unlock(&dm_thin_pool_table.mutex);
2083
2084         return r;
2085 }
2086
2087 static int pool_map(struct dm_target *ti, struct bio *bio,
2088                     union map_info *map_context)
2089 {
2090         int r;
2091         struct pool_c *pt = ti->private;
2092         struct pool *pool = pt->pool;
2093         unsigned long flags;
2094
2095         /*
2096          * As this is a singleton target, ti->begin is always zero.
2097          */
2098         spin_lock_irqsave(&pool->lock, flags);
2099         bio->bi_bdev = pt->data_dev->bdev;
2100         r = DM_MAPIO_REMAPPED;
2101         spin_unlock_irqrestore(&pool->lock, flags);
2102
2103         return r;
2104 }
2105
2106 /*
2107  * Retrieves the number of blocks of the data device from
2108  * the superblock and compares it to the actual device size,
2109  * resizing the data device if it has grown.
2110  *
2111  * This copes both with the ctr opening a preallocated data device
2112  * that is then followed by a resume,
2113  * -and-
2114  * with the resume method being called on its own after userspace
2115  * has grown the data device in reaction to a table event.
2116  */
2117 static int pool_preresume(struct dm_target *ti)
2118 {
2119         int r;
2120         struct pool_c *pt = ti->private;
2121         struct pool *pool = pt->pool;
2122         sector_t data_size = ti->len;
2123         dm_block_t sb_data_size;
2124
2125         /*
2126          * Take control of the pool object.
2127          */
2128         r = bind_control_target(pool, ti);
2129         if (r)
2130                 return r;
2131
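        /*
         * ti->len is in 512-byte sectors; convert it to whole pool blocks
         * (rounding down) before comparing with the block count recorded
         * in the superblock.
         */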
2132         (void) sector_div(data_size, pool->sectors_per_block);
2133
2134         r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
2135         if (r) {
2136                 DMERR("failed to retrieve data device size");
2137                 return r;
2138         }
2139
2140         if (data_size < sb_data_size) {
2141                 DMERR("pool target too small, is %llu blocks (expected %llu)",
2142                       (unsigned long long)data_size, (unsigned long long)sb_data_size);
2143                 return -EINVAL;
2144
2145         } else if (data_size > sb_data_size) {
2146                 r = dm_pool_resize_data_dev(pool->pmd, data_size);
2147                 if (r) {
2148                         DMERR("failed to resize data device");
2149                         return r;
2150                 }
2151
2152                 r = dm_pool_commit_metadata(pool->pmd);
2153                 if (r) {
2154                         DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
2155                               __func__, r);
2156                         return r;
2157                 }
2158         }
2159
2160         return 0;
2161 }
2162
2163 static void pool_resume(struct dm_target *ti)
2164 {
2165         struct pool_c *pt = ti->private;
2166         struct pool *pool = pt->pool;
2167         unsigned long flags;
2168
2169         spin_lock_irqsave(&pool->lock, flags);
2170         pool->low_water_triggered = 0;
2171         pool->no_free_space = 0;
2172         __requeue_bios(pool);
2173         spin_unlock_irqrestore(&pool->lock, flags);
2174
2175         do_waker(&pool->waker.work);
2176 }
2177
2178 static void pool_postsuspend(struct dm_target *ti)
2179 {
2180         int r;
2181         struct pool_c *pt = ti->private;
2182         struct pool *pool = pt->pool;
2183
2184         cancel_delayed_work(&pool->waker);
2185         flush_workqueue(pool->wq);
2186
2187         r = dm_pool_commit_metadata(pool->pmd);
2188         if (r < 0) {
2189                 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
2190                       __func__, r);
2191                 /* FIXME: invalidate device? error the next FUA or FLUSH bio? */
2192         }
2193 }
2194
2195 static int check_arg_count(unsigned argc, unsigned args_required)
2196 {
2197         if (argc != args_required) {
2198                 DMWARN("Message received with %u arguments instead of %u.",
2199                        argc, args_required);
2200                 return -EINVAL;
2201         }
2202
2203         return 0;
2204 }
2205
2206 static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
2207 {
2208         if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
2209             *dev_id <= MAX_DEV_ID)
2210                 return 0;
2211
2212         if (warning)
2213                 DMWARN("Message received with invalid device id: %s", arg);
2214
2215         return -EINVAL;
2216 }
2217
2218 static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
2219 {
2220         dm_thin_id dev_id;
2221         int r;
2222
2223         r = check_arg_count(argc, 2);
2224         if (r)
2225                 return r;
2226
2227         r = read_dev_id(argv[1], &dev_id, 1);
2228         if (r)
2229                 return r;
2230
2231         r = dm_pool_create_thin(pool->pmd, dev_id);
2232         if (r) {
2233                 DMWARN("Creation of new thinly-provisioned device with id %s failed.",
2234                        argv[1]);
2235                 return r;
2236         }
2237
2238         return 0;
2239 }
2240
2241 static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2242 {
2243         dm_thin_id dev_id;
2244         dm_thin_id origin_dev_id;
2245         int r;
2246
2247         r = check_arg_count(argc, 3);
2248         if (r)
2249                 return r;
2250
2251         r = read_dev_id(argv[1], &dev_id, 1);
2252         if (r)
2253                 return r;
2254
2255         r = read_dev_id(argv[2], &origin_dev_id, 1);
2256         if (r)
2257                 return r;
2258
2259         r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
2260         if (r) {
2261                 DMWARN("Creation of new snapshot %s of device %s failed.",
2262                        argv[1], argv[2]);
2263                 return r;
2264         }
2265
2266         return 0;
2267 }
2268
2269 static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
2270 {
2271         dm_thin_id dev_id;
2272         int r;
2273
2274         r = check_arg_count(argc, 2);
2275         if (r)
2276                 return r;
2277
2278         r = read_dev_id(argv[1], &dev_id, 1);
2279         if (r)
2280                 return r;
2281
2282         r = dm_pool_delete_thin_device(pool->pmd, dev_id);
2283         if (r)
2284                 DMWARN("Deletion of thin device %s failed.", argv[1]);
2285
2286         return r;
2287 }
2288
2289 static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
2290 {
2291         dm_thin_id old_id, new_id;
2292         int r;
2293
2294         r = check_arg_count(argc, 3);
2295         if (r)
2296                 return r;
2297
2298         if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
2299                 DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
2300                 return -EINVAL;
2301         }
2302
2303         if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
2304                 DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
2305                 return -EINVAL;
2306         }
2307
2308         r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
2309         if (r) {
2310                 DMWARN("Failed to change transaction id from %s to %s.",
2311                        argv[1], argv[2]);
2312                 return r;
2313         }
2314
2315         return 0;
2316 }
2317
2318 static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2319 {
2320         int r;
2321
2322         r = check_arg_count(argc, 1);
2323         if (r)
2324                 return r;
2325
2326         r = dm_pool_commit_metadata(pool->pmd);
2327         if (r) {
2328                 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
2329                       __func__, r);
2330                 return r;
2331         }
2332
2333         r = dm_pool_reserve_metadata_snap(pool->pmd);
2334         if (r)
2335                 DMWARN("reserve_metadata_snap message failed.");
2336
2337         return r;
2338 }
2339
2340 static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2341 {
2342         int r;
2343
2344         r = check_arg_count(argc, 1);
2345         if (r)
2346                 return r;
2347
2348         r = dm_pool_release_metadata_snap(pool->pmd);
2349         if (r)
2350                 DMWARN("release_metadata_snap message failed.");
2351
2352         return r;
2353 }
2354
2355 /*
2356  * Messages supported:
2357  *   create_thin        <dev_id>
2358  *   create_snap        <dev_id> <origin_id>
2359  *   delete             <dev_id>
2361  *   set_transaction_id <current_trans_id> <new_trans_id>
2362  *   reserve_metadata_snap
2363  *   release_metadata_snap
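 *
 * For example (a sketch; the pool device name is hypothetical), a first
 * thin volume with device id 0 could be created with:
 *
 *   dmsetup message /dev/mapper/pool 0 "create_thin 0"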
2364  */
2365 static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
2366 {
2367         int r = -EINVAL;
2368         struct pool_c *pt = ti->private;
2369         struct pool *pool = pt->pool;
2370
2371         if (!strcasecmp(argv[0], "create_thin"))
2372                 r = process_create_thin_mesg(argc, argv, pool);
2373
2374         else if (!strcasecmp(argv[0], "create_snap"))
2375                 r = process_create_snap_mesg(argc, argv, pool);
2376
2377         else if (!strcasecmp(argv[0], "delete"))
2378                 r = process_delete_mesg(argc, argv, pool);
2379
2380         else if (!strcasecmp(argv[0], "set_transaction_id"))
2381                 r = process_set_transaction_id_mesg(argc, argv, pool);
2382
2383         else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
2384                 r = process_reserve_metadata_snap_mesg(argc, argv, pool);
2385
2386         else if (!strcasecmp(argv[0], "release_metadata_snap"))
2387                 r = process_release_metadata_snap_mesg(argc, argv, pool);
2388
2389         else
2390                 DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
2391
2392         if (!r) {
2393                 r = dm_pool_commit_metadata(pool->pmd);
2394                 if (r)
2395                         DMERR("%s message: dm_pool_commit_metadata() failed, error = %d",
2396                               argv[0], r);
2397         }
2398
2399         return r;
2400 }
2401
2402 /*
2403  * Status line is:
2404  *    <transaction id> <used metadata blocks>/<total metadata blocks>
2405  *    <used data blocks>/<total data blocks> <held metadata root>
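 *
 * A hypothetical INFO line for a pool holding no metadata snapshot:
 *
 *   "0 126/4096 10240/2621440 -"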
2406  */
2407 static int pool_status(struct dm_target *ti, status_type_t type,
2408                        char *result, unsigned maxlen)
2409 {
2410         int r, count;
2411         unsigned sz = 0;
2412         uint64_t transaction_id;
2413         dm_block_t nr_free_blocks_data;
2414         dm_block_t nr_free_blocks_metadata;
2415         dm_block_t nr_blocks_data;
2416         dm_block_t nr_blocks_metadata;
2417         dm_block_t held_root;
2418         char buf[BDEVNAME_SIZE];
2419         char buf2[BDEVNAME_SIZE];
2420         struct pool_c *pt = ti->private;
2421         struct pool *pool = pt->pool;
2422
2423         switch (type) {
2424         case STATUSTYPE_INFO:
2425                 r = dm_pool_get_metadata_transaction_id(pool->pmd,
2426                                                         &transaction_id);
2427                 if (r)
2428                         return r;
2429
2430                 r = dm_pool_get_free_metadata_block_count(pool->pmd,
2431                                                           &nr_free_blocks_metadata);
2432                 if (r)
2433                         return r;
2434
2435                 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
2436                 if (r)
2437                         return r;
2438
2439                 r = dm_pool_get_free_block_count(pool->pmd,
2440                                                  &nr_free_blocks_data);
2441                 if (r)
2442                         return r;
2443
2444                 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
2445                 if (r)
2446                         return r;
2447
2448                 r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
2449                 if (r)
2450                         return r;
2451
2452                 DMEMIT("%llu %llu/%llu %llu/%llu ",
2453                        (unsigned long long)transaction_id,
2454                        (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2455                        (unsigned long long)nr_blocks_metadata,
2456                        (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
2457                        (unsigned long long)nr_blocks_data);
2458
2459                 if (held_root)
2460                         DMEMIT("%llu", (unsigned long long)held_root);
2461                 else
2462                         DMEMIT("-");
2463
2464                 break;
2465
2466         case STATUSTYPE_TABLE:
2467                 DMEMIT("%s %s %lu %llu ",
2468                        format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
2469                        format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
2470                        (unsigned long)pool->sectors_per_block,
2471                        (unsigned long long)pt->low_water_blocks);
2472
2473                 count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled +
2474                         !pt->pf.discard_passdown;
2475                 DMEMIT("%u ", count);
2476
2477                 if (!pool->pf.zero_new_blocks)
2478                         DMEMIT("skip_block_zeroing ");
2479
2480                 if (!pool->pf.discard_enabled)
2481                         DMEMIT("ignore_discard ");
2482
2483                 if (!pt->pf.discard_passdown)
2484                         DMEMIT("no_discard_passdown ");
2485
2486                 break;
2487         }
2488
2489         return 0;
2490 }
2491
2492 static int pool_iterate_devices(struct dm_target *ti,
2493                                 iterate_devices_callout_fn fn, void *data)
2494 {
2495         struct pool_c *pt = ti->private;
2496
2497         return fn(ti, pt->data_dev, 0, ti->len, data);
2498 }
2499
2500 static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
2501                       struct bio_vec *biovec, int max_size)
2502 {
2503         struct pool_c *pt = ti->private;
2504         struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
2505
2506         if (!q->merge_bvec_fn)
2507                 return max_size;
2508
2509         bvm->bi_bdev = pt->data_dev->bdev;
2510
2511         return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2512 }
2513
2514 static void set_discard_limits(struct pool *pool, struct queue_limits *limits)
2515 {
2516         /*
2517          * FIXME: these limits may be incompatible with the pool's data device
2518          */
2519         limits->max_discard_sectors = pool->sectors_per_block;
2520
2521         /*
2522          * This is just a hint, and not enforced.  We have to cope with
2523          * bios that cover a block partially.  A discard that spans a block
2524          * boundary is not sent to this target.
2525          */
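        /*
         * discard_granularity is expressed in bytes, hence the SECTOR_SHIFT
         * conversion: e.g. a 128-sector block size yields a 64KiB granularity.
         */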
2526         limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
2527         limits->discard_zeroes_data = pool->pf.zero_new_blocks;
2528 }
2529
2530 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
2531 {
2532         struct pool_c *pt = ti->private;
2533         struct pool *pool = pt->pool;
2534
2535         blk_limits_io_min(limits, 0);
2536         blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
2537         if (pool->pf.discard_enabled)
2538                 set_discard_limits(pool, limits);
2539 }
2540
2541 static struct target_type pool_target = {
2542         .name = "thin-pool",
2543         .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2544                     DM_TARGET_IMMUTABLE,
2545         .version = {1, 2, 0},
2546         .module = THIS_MODULE,
2547         .ctr = pool_ctr,
2548         .dtr = pool_dtr,
2549         .map = pool_map,
2550         .postsuspend = pool_postsuspend,
2551         .preresume = pool_preresume,
2552         .resume = pool_resume,
2553         .message = pool_message,
2554         .status = pool_status,
2555         .merge = pool_merge,
2556         .iterate_devices = pool_iterate_devices,
2557         .io_hints = pool_io_hints,
2558 };
2559
2560 /*----------------------------------------------------------------
2561  * Thin target methods
2562  *--------------------------------------------------------------*/
2563 static void thin_dtr(struct dm_target *ti)
2564 {
2565         struct thin_c *tc = ti->private;
2566
2567         mutex_lock(&dm_thin_pool_table.mutex);
2568
2569         __pool_dec(tc->pool);
2570         dm_pool_close_thin_device(tc->td);
2571         dm_put_device(ti, tc->pool_dev);
2572         if (tc->origin_dev)
2573                 dm_put_device(ti, tc->origin_dev);
2574         kfree(tc);
2575
2576         mutex_unlock(&dm_thin_pool_table.mutex);
2577 }
2578
2579 /*
2580  * Thin target parameters:
2581  *
2582  * <pool_dev> <dev_id> [origin_dev]
2583  *
2584  * pool_dev: the path to the pool (e.g. /dev/mapper/my_pool)
2585  * dev_id: the internal device identifier
2586  * origin_dev: a device external to the pool that should act as the origin
2587  *
2588  * If the pool device has discards disabled, they get disabled for the thin
2589  * device as well.
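 *
 * For example (a sketch; names and sizes are illustrative), a 1GiB
 * (2097152-sector) thin volume using device id 0 of an active pool:
 *
 *   dmsetup create thin --table "0 2097152 thin /dev/mapper/pool 0"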
2590  */
2591 static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2592 {
2593         int r;
2594         struct thin_c *tc;
2595         struct dm_dev *pool_dev, *origin_dev;
2596         struct mapped_device *pool_md;
2597
2598         mutex_lock(&dm_thin_pool_table.mutex);
2599
2600         if (argc != 2 && argc != 3) {
2601                 ti->error = "Invalid argument count";
2602                 r = -EINVAL;
2603                 goto out_unlock;
2604         }
2605
2606         tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
2607         if (!tc) {
2608                 ti->error = "Out of memory";
2609                 r = -ENOMEM;
2610                 goto out_unlock;
2611         }
2612
2613         if (argc == 3) {
2614                 r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
2615                 if (r) {
2616                         ti->error = "Error opening origin device";
2617                         goto bad_origin_dev;
2618                 }
2619                 tc->origin_dev = origin_dev;
2620         }
2621
2622         r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
2623         if (r) {
2624                 ti->error = "Error opening pool device";
2625                 goto bad_pool_dev;
2626         }
2627         tc->pool_dev = pool_dev;
2628
2629         if (read_dev_id(argv[1], &tc->dev_id, 0)) {
2630                 ti->error = "Invalid device id";
2631                 r = -EINVAL;
2632                 goto bad_common;
2633         }
2634
2635         pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
2636         if (!pool_md) {
2637                 ti->error = "Couldn't get pool mapped device";
2638                 r = -EINVAL;
2639                 goto bad_common;
2640         }
2641
2642         tc->pool = __pool_table_lookup(pool_md);
2643         if (!tc->pool) {
2644                 ti->error = "Couldn't find pool object";
2645                 r = -EINVAL;
2646                 goto bad_pool_lookup;
2647         }
2648         __pool_inc(tc->pool);
2649
2650         r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
2651         if (r) {
2652                 ti->error = "Couldn't open thin internal device";
2653                 goto bad_thin_open;
2654         }
2655
2656         r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
2657         if (r)
2658                 goto bad_thin_open;
2659
2660         ti->num_flush_requests = 1;
2661         ti->flush_supported = true;
2662
2663         /* If the pool supports discards, pass them on. */
2664         if (tc->pool->pf.discard_enabled) {
2665                 ti->discards_supported = true;
2666                 ti->num_discard_requests = 1;
2667                 ti->discard_zeroes_data_unsupported = true;
2668                 /* Discard requests must be split on a block boundary */
2669                 ti->split_discard_requests = true;
2670         }
2671
2672         dm_put(pool_md);
2673
2674         mutex_unlock(&dm_thin_pool_table.mutex);
2675
2676         return 0;
2677
2678 bad_thin_open:
2679         __pool_dec(tc->pool);
2680 bad_pool_lookup:
2681         dm_put(pool_md);
2682 bad_common:
2683         dm_put_device(ti, tc->pool_dev);
2684 bad_pool_dev:
2685         if (tc->origin_dev)
2686                 dm_put_device(ti, tc->origin_dev);
2687 bad_origin_dev:
2688         kfree(tc);
2689 out_unlock:
2690         mutex_unlock(&dm_thin_pool_table.mutex);
2691
2692         return r;
2693 }
2694
2695 static int thin_map(struct dm_target *ti, struct bio *bio,
2696                     union map_info *map_context)
2697 {
2698         bio->bi_sector = dm_target_offset(ti, bio->bi_sector);
2699
2700         return thin_bio_map(ti, bio, map_context);
2701 }
2702
2703 static int thin_endio(struct dm_target *ti,
2704                       struct bio *bio, int err,
2705                       union map_info *map_context)
2706 {
2707         unsigned long flags;
2708         struct dm_thin_endio_hook *h = map_context->ptr;
2709         struct list_head work;
2710         struct dm_thin_new_mapping *m, *tmp;
2711         struct pool *pool = h->tc->pool;
2712
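        /*
         * If this bio was counted in the shared-read deferred set, its
         * completion may be the event that quiesces copy mappings waiting
         * for reads of a shared block to drain; hand any such mappings
         * back to the worker.
         */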
2713         if (h->shared_read_entry) {
2714                 INIT_LIST_HEAD(&work);
2715                 ds_dec(h->shared_read_entry, &work);
2716
2717                 spin_lock_irqsave(&pool->lock, flags);
2718                 list_for_each_entry_safe(m, tmp, &work, list) {
2719                         list_del(&m->list);
2720                         m->quiesced = 1;
2721                         __maybe_add_mapping(m);
2722                 }
2723                 spin_unlock_irqrestore(&pool->lock, flags);
2724         }
2725
2726         if (h->all_io_entry) {
2727                 INIT_LIST_HEAD(&work);
2728                 ds_dec(h->all_io_entry, &work);
2729                 spin_lock_irqsave(&pool->lock, flags);
2730                 list_for_each_entry_safe(m, tmp, &work, list)
2731                         list_add(&m->list, &pool->prepared_discards);
2732                 spin_unlock_irqrestore(&pool->lock, flags);
2733         }
2734
2735         mempool_free(h, pool->endio_hook_pool);
2736
2737         return 0;
2738 }
2739
2740 static void thin_postsuspend(struct dm_target *ti)
2741 {
2742         if (dm_noflush_suspending(ti))
2743                 requeue_io((struct thin_c *)ti->private);
2744 }
2745
2746 /*
2747  * <nr mapped sectors> <highest mapped sector>
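 *
 * e.g. (hypothetical values) a device with eight 64KiB (128-sector)
 * blocks mapped, the highest ending at sector 1023: "1024 1023"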
2748  */
2749 static int thin_status(struct dm_target *ti, status_type_t type,
2750                        char *result, unsigned maxlen)
2751 {
2752         int r;
2753         ssize_t sz = 0;
2754         dm_block_t mapped, highest;
2755         char buf[BDEVNAME_SIZE];
2756         struct thin_c *tc = ti->private;
2757
2758         if (!tc->td)
2759                 DMEMIT("-");
2760         else {
2761                 switch (type) {
2762                 case STATUSTYPE_INFO:
2763                         r = dm_thin_get_mapped_count(tc->td, &mapped);
2764                         if (r)
2765                                 return r;
2766
2767                         r = dm_thin_get_highest_mapped_block(tc->td, &highest);
2768                         if (r < 0)
2769                                 return r;
2770
2771                         DMEMIT("%llu ", (unsigned long long)(mapped * tc->pool->sectors_per_block));
2772                         if (r)
2773                                 DMEMIT("%llu", (unsigned long long)(((highest + 1) *
2774                                                 tc->pool->sectors_per_block) - 1));
2775                         else
2776                                 DMEMIT("-");
2777                         break;
2778
2779                 case STATUSTYPE_TABLE:
2780                         DMEMIT("%s %lu",
2781                                format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
2782                                (unsigned long) tc->dev_id);
2783                         if (tc->origin_dev)
2784                                 DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
2785                         break;
2786                 }
2787         }
2788
2789         return 0;
2790 }
2791
2792 static int thin_iterate_devices(struct dm_target *ti,
2793                                 iterate_devices_callout_fn fn, void *data)
2794 {
2795         sector_t blocks;
2796         struct thin_c *tc = ti->private;
2797         struct pool *pool = tc->pool;
2798
2799         /*
2800          * We can't call dm_pool_get_data_dev_size() since it blocks.  So
2801          * we follow a more convoluted path through to the pool's target.
2802          */
2803         if (!pool->ti)
2804                 return 0;       /* nothing is bound */
2805
2806         blocks = pool->ti->len;
2807         (void) sector_div(blocks, pool->sectors_per_block);
2808         if (blocks)
2809                 return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
2810
2811         return 0;
2812 }
2813
2814 static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
2815 {
2816         struct thin_c *tc = ti->private;
2817         struct pool *pool = tc->pool;
2818
2819         blk_limits_io_min(limits, 0);
2820         blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
2821         set_discard_limits(pool, limits);
2822 }
2823
2824 static struct target_type thin_target = {
2825         .name = "thin",
2826         .version = {1, 2, 0},
2827         .module = THIS_MODULE,
2828         .ctr = thin_ctr,
2829         .dtr = thin_dtr,
2830         .map = thin_map,
2831         .end_io = thin_endio,
2832         .postsuspend = thin_postsuspend,
2833         .status = thin_status,
2834         .iterate_devices = thin_iterate_devices,
2835         .io_hints = thin_io_hints,
2836 };
2837
2838 /*----------------------------------------------------------------*/
2839
2840 static int __init dm_thin_init(void)
2841 {
2842         int r;
2843
2844         pool_table_init();
2845
2846         r = dm_register_target(&thin_target);
2847         if (r)
2848                 return r;
2849
2850         r = dm_register_target(&pool_target);
2851         if (r)
2852                 goto bad_pool_target;
2853
2854         r = -ENOMEM;
2855
2856         _cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0);
2857         if (!_cell_cache)
2858                 goto bad_cell_cache;
2859
2860         _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
2861         if (!_new_mapping_cache)
2862                 goto bad_new_mapping_cache;
2863
2864         _endio_hook_cache = KMEM_CACHE(dm_thin_endio_hook, 0);
2865         if (!_endio_hook_cache)
2866                 goto bad_endio_hook_cache;
2867
2868         return 0;
2869
2870 bad_endio_hook_cache:
2871         kmem_cache_destroy(_new_mapping_cache);
2872 bad_new_mapping_cache:
2873         kmem_cache_destroy(_cell_cache);
2874 bad_cell_cache:
2875         dm_unregister_target(&pool_target);
2876 bad_pool_target:
2877         dm_unregister_target(&thin_target);
2878
2879         return r;
2880 }
2881
2882 static void dm_thin_exit(void)
2883 {
2884         dm_unregister_target(&thin_target);
2885         dm_unregister_target(&pool_target);
2886
2887         kmem_cache_destroy(_cell_cache);
2888         kmem_cache_destroy(_new_mapping_cache);
2889         kmem_cache_destroy(_endio_hook_cache);
2890 }
2891
2892 module_init(dm_thin_init);
2893 module_exit(dm_thin_exit);
2894
2895 MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
2896 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
2897 MODULE_LICENSE("GPL");