drivers/md/dm.c

   1 /*
   2  * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
   3  * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
   4  *
   5  * This file is released under the GPL.
   6  */
   7
   8 #include "dm.h"
   9 #include "dm-uevent.h"
  10
  11 #include <linux/init.h>
  12 #include <linux/module.h>
  13 #include <linux/mutex.h>
  14 #include <linux/moduleparam.h>
  15 #include <linux/blkpg.h>
  16 #include <linux/bio.h>
  17 #include <linux/buffer_head.h>
  18 #include <linux/mempool.h>
  19 #include <linux/slab.h>
  20 #include <linux/idr.h>
  21 #include <linux/hdreg.h>
  22 #include <linux/delay.h>
  23
  24 #include <trace/events/block.h>
  25
  26 #define DM_MSG_PREFIX "core"
  27
  28 #ifdef CONFIG_PRINTK
  29 /*
  30  * Shared ratelimit state to be used in DMXXX_LIMIT(); only built with CONFIG_PRINTK.
  31  */
  32 DEFINE_RATELIMIT_STATE(dm_ratelimit_state,
  33                        DEFAULT_RATELIMIT_INTERVAL,
  34                        DEFAULT_RATELIMIT_BURST);
  35 EXPORT_SYMBOL(dm_ratelimit_state);
  36 #endif
  37
  38 /*
  39  * Cookies are numeric values sent with CHANGE and REMOVE
  40  * uevents while resuming, removing or renaming the device.
  41  */
  42 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
  43 #define DM_COOKIE_LENGTH 24
  44
  45 static const char *_name = DM_NAME; /* name given to register_blkdev()/unregister_blkdev() */
  46
  47 static unsigned int major = 0; /* requested block major; copied into _major by local_init() */
  48 static unsigned int _major = 0; /* major in use: set by local_init(), reset to 0 by local_exit() */
  49
  50 static DEFINE_IDR(_minor_idr); /* emptied and destroyed in dm_exit() */
  51
  52 static DEFINE_SPINLOCK(_minor_lock); /* taken by dm_blk_open(), dm_blk_close(), dm_lock_for_deletion() */
  53 /*
  54  * For bio-based dm.
  55  * One of these is allocated per bio.
  56  */
  57 struct dm_io {
  58         struct mapped_device *md; /* device this io belongs to */
  59         int error; /* status recorded by dec_pending() and passed to bio_endio() */
  60         atomic_t io_count; /* raised in __map_bio(), dropped in dec_pending() */
  61         struct bio *bio; /* the original bio, finished when io_count reaches zero */
  62         unsigned long start_time; /* jiffies value taken in start_io_acct() */
  63         spinlock_t endio_lock; /* held while dec_pending() updates ->error */
  64 };
  65
  66 /*
  67  * For bio-based dm.
  68  * One of these is allocated per target within a bio.  Hopefully
  69  * this will be simplified out one day.
  70  */
  71 struct dm_target_io {
  72         struct dm_io *io; /* per-bio state this clone counts against */
  73         struct dm_target *ti; /* target whose end_io hook clone_endio() calls */
  74         union map_info info; /* handed to the target's map and end_io hooks */
  75 };
  76
  77 /*
  78  * For request-based dm.
  79  * One of these is allocated per request.
  80  */
  81 struct dm_rq_target_io {
  82         struct mapped_device *md; /* owning device; also locates tio_pool in free_rq_tio() */
  83         struct dm_target *ti; /* may be NULL: dm_done() checks before using it */
  84         struct request *orig, clone; /* orig: pointer to the original request; clone: embedded by value */
  85         int error; /* set by dm_complete_request() and end_clone_bio() */
  86         union map_info info; /* handed to the target's rq_end_io hook */
  87 };
  88
  89 /*
  90  * For request-based dm.
  91  * One of these is allocated per bio.
  92  */
  93 struct dm_rq_clone_bio_info {
  94         struct bio *orig; /* bio of the original request that the clone bio mirrors */
  95         struct dm_rq_target_io *tio; /* per-request state; end_clone_bio() records errors there */
  96 };
  97
  98 union map_info *dm_get_mapinfo(struct bio *bio)
  99 {
 100         if (bio && bio->bi_private)
 101                 return &((struct dm_target_io *)bio->bi_private)->info;
 102         return NULL;
 103 }
 104
 105 union map_info *dm_get_rq_mapinfo(struct request *rq)
 106 {
 107         if (rq && rq->end_io_data)
 108                 return &((struct dm_rq_target_io *)rq->end_io_data)->info;
 109         return NULL;
 110 }
 111 EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
 112
 113 #define MINOR_ALLOCED ((void *)-1) /* NOTE(review): presumably a placeholder stored in _minor_idr - confirm */
 114
 115 /*
 116  * Bits for the md->flags field (bit numbers for test_bit()/set_bit()).
 117  */
 118 #define DMF_BLOCK_IO_FOR_SUSPEND 0
 119 #define DMF_SUSPENDED 1
 120 #define DMF_FROZEN 2
 121 #define DMF_FREEING 3 /* dm_blk_open() refuses to open while set */
 122 #define DMF_DELETING 4 /* set by dm_lock_for_deletion(), read by dm_deleting_md() */
 123 #define DMF_NOFLUSH_SUSPENDING 5 /* read by __noflush_suspending() */
 124 #define DMF_MERGE_IS_OPTIONAL 6
 125
 126 /*
 127  * State of one mapped device; gendisk->private_data points here (see dm_blk_open()).
 128  */
 129 struct mapped_device {
 130         struct rw_semaphore io_lock;
 131         struct mutex suspend_lock;
 132         rwlock_t map_lock; /* read-held by dm_get_live_table() while it reads ->map */
 133         atomic_t holders;
 134         atomic_t open_count; /* raised in dm_blk_open(), read by dm_open_count() */
 135
 136         unsigned long flags; /* DMF_* bit numbers */
 137
 138         struct request_queue *queue;
 139         unsigned type;
 140         /* Protect queue and type against concurrent access. */
 141         struct mutex type_lock;
 142
 143         struct target_type *immutable_target_type;
 144
 145         struct gendisk *disk;
 146         char name[16];
 147
 148         void *interface_ptr;
 149
 150         /*
 151          * In-flight accounting plus the list of ios that arrived while we were suspended.
 152          */
 153         atomic_t pending[2]; /* indexed by data direction, summed by md_in_flight() */
 154         wait_queue_head_t wait; /* woken when nothing is in flight any more */
 155         struct work_struct work; /* queued on ->wq by queue_io() */
 156         struct bio_list deferred; /* filled by queue_io() and dec_pending() */
 157         spinlock_t deferred_lock; /* held around updates of ->deferred */
 158
 159         /*
 160          * Processing queue (flush)
 161          */
 162         struct workqueue_struct *wq;
 163
 164         /*
 165          * The current mapping; readers go through dm_get_live_table().
 166          */
 167         struct dm_table *map;
 168
 169         /*
 170          * io objects are allocated from here.
 171          */
 172         mempool_t *io_pool; /* used by alloc_io() and by alloc_bio_info() */
 173         mempool_t *tio_pool; /* used by free_tio() and by alloc_rq_tio() */
 174
 175         struct bio_set *bs; /* stored in a clone's bi_private so the clone can be freed */
 176
 177         /*
 178          * Event handling.
 179          */
 180         atomic_t event_nr;
 181         wait_queue_head_t eventq;
 182         atomic_t uevent_seq;
 183         struct list_head uevent_list;
 184         spinlock_t uevent_lock; /* Protect access to uevent_list */
 185
 186         /*
 187          * freeze/thaw support requires holding onto a super block
 188          */
 189         struct super_block *frozen_sb;
 190         struct block_device *bdev;
 191
 192         /* forced geometry settings */
 193         struct hd_geometry geometry;
 194
 195         /* sysfs handle */
 196         struct kobject kobj;
 197
 198         /* wait until the kobject is released */
 199         struct completion kobj_completion;
 200
 201         /* zero-length flush that will be cloned and submitted to targets */
 202         struct bio flush_bio;
 203 };
 204
 205 /*
 206  * For mempools pre-allocation at the table loading time.
 207  */
 208 struct dm_md_mempools {
 209         mempool_t *io_pool; /* NOTE(review): presumably ends up in mapped_device.io_pool - confirm */
 210         mempool_t *tio_pool; /* NOTE(review): presumably ends up in mapped_device.tio_pool - confirm */
 211         struct bio_set *bs; /* NOTE(review): presumably ends up in mapped_device.bs - confirm */
 212 };
 213
 214 #define MIN_IOS 256 /* NOTE(review): presumably the mempool reserve size - confirm against its users */
 215 static struct kmem_cache *_io_cache; /* slab for struct dm_io, created in local_init() */
 216 static struct kmem_cache *_tio_cache; /* slab for struct dm_target_io */
 217 static struct kmem_cache *_rq_tio_cache; /* slab for struct dm_rq_target_io */
 218 static struct kmem_cache *_rq_bio_info_cache; /* slab for struct dm_rq_clone_bio_info */
 219
 220 static int __init local_init(void)
 221 {
 222         int r = -ENOMEM;
 223
 224         /* allocate a slab for the dm_ios */
 225         _io_cache = KMEM_CACHE(dm_io, 0);
 226         if (!_io_cache)
 227                 return r;
 228
 229         /* allocate a slab for the target ios */
 230         _tio_cache = KMEM_CACHE(dm_target_io, 0);
 231         if (!_tio_cache)
 232                 goto out_free_io_cache;
 233
 234         _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
 235         if (!_rq_tio_cache)
 236                 goto out_free_tio_cache;
 237
 238         _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
 239         if (!_rq_bio_info_cache)
 240                 goto out_free_rq_tio_cache;
 241
 242         r = dm_uevent_init();
 243         if (r)
 244                 goto out_free_rq_bio_info_cache;
 245
 246         _major = major;
 247         r = register_blkdev(_major, _name);
 248         if (r < 0)
 249                 goto out_uevent_exit;
 250
 251         if (!_major)
 252                 _major = r;
 253
 254         return 0;
 255
 256 out_uevent_exit:
 257         dm_uevent_exit();
 258 out_free_rq_bio_info_cache:
 259         kmem_cache_destroy(_rq_bio_info_cache);
 260 out_free_rq_tio_cache:
 261         kmem_cache_destroy(_rq_tio_cache);
 262 out_free_tio_cache:
 263         kmem_cache_destroy(_tio_cache);
 264 out_free_io_cache:
 265         kmem_cache_destroy(_io_cache);
 266
 267         return r;
 268 }
 269
 270 static void local_exit(void)
 271 {
 272         kmem_cache_destroy(_rq_bio_info_cache);
 273         kmem_cache_destroy(_rq_tio_cache);
 274         kmem_cache_destroy(_tio_cache);
 275         kmem_cache_destroy(_io_cache);
 276         unregister_blkdev(_major, _name);
 277         dm_uevent_exit();
 278
 279         _major = 0;
 280
 281         DMINFO("cleaned up");
 282 }
 283
 284 static int (*_inits[])(void) __initdata = { /* run in this order by dm_init(); slot i pairs with _exits[i] */
 285         local_init,
 286         dm_target_init,
 287         dm_linear_init,
 288         dm_stripe_init,
 289         dm_io_init,
 290         dm_kcopyd_init,
 291         dm_interface_init,
 292 };
 293
 294 static void (*_exits[])(void) = { /* called last-to-first by dm_exit() and by dm_init()'s unwind */
 295         local_exit,
 296         dm_target_exit,
 297         dm_linear_exit,
 298         dm_stripe_exit,
 299         dm_io_exit,
 300         dm_kcopyd_exit,
 301         dm_interface_exit,
 302 };
 303
 304 static int __init dm_init(void)
 305 {
 306         const int count = ARRAY_SIZE(_inits);
 307
 308         int r, i;
 309
 310         for (i = 0; i < count; i++) {
 311                 r = _inits[i]();
 312                 if (r)
 313                         goto bad;
 314         }
 315
 316         return 0;
 317
 318       bad:
 319         while (i--)
 320                 _exits[i]();
 321
 322         return r;
 323 }
 324
 325 static void __exit dm_exit(void)
 326 {
 327         int i = ARRAY_SIZE(_exits);
 328
 329         while (i--)
 330                 _exits[i]();
 331
 332         /*
 333          * Should be empty by this point.
 334          */
 335         idr_remove_all(&_minor_idr);
 336         idr_destroy(&_minor_idr);
 337 }
 338
 339 /*
 340  * Block device functions
 341  */
 342 int dm_deleting_md(struct mapped_device *md)
 343 {
 344         return test_bit(DMF_DELETING, &md->flags);
 345 }
 346
 347 static int dm_blk_open(struct block_device *bdev, fmode_t mode)
 348 {
 349         struct mapped_device *md;
 350
 351         spin_lock(&_minor_lock);
 352
 353         md = bdev->bd_disk->private_data;
 354         if (!md)
 355                 goto out;
 356
 357         if (test_bit(DMF_FREEING, &md->flags) ||
 358             dm_deleting_md(md)) {
 359                 md = NULL;
 360                 goto out;
 361         }
 362
 363         dm_get(md);
 364         atomic_inc(&md->open_count);
 365
 366 out:
 367         spin_unlock(&_minor_lock);
 368
 369         return md ? 0 : -ENXIO;
 370 }
 371
 372 static int dm_blk_close(struct gendisk *disk, fmode_t mode)
 373 {
 374         struct mapped_device *md = disk->private_data;
 375
 376         spin_lock(&_minor_lock);
 377
 378         atomic_dec(&md->open_count);
 379         dm_put(md);
 380
 381         spin_unlock(&_minor_lock);
 382
 383         return 0;
 384 }
 385
 386 int dm_open_count(struct mapped_device *md)
 387 {
 388         return atomic_read(&md->open_count);
 389 }
 390
 391 /*
 392  * Guarantees nothing is using the device before it's deleted.
 393  */
 394 int dm_lock_for_deletion(struct mapped_device *md)
 395 {
 396         int r = 0;
 397
 398         spin_lock(&_minor_lock);
 399
 400         if (dm_open_count(md))
 401                 r = -EBUSY;
 402         else
 403                 set_bit(DMF_DELETING, &md->flags);
 404
 405         spin_unlock(&_minor_lock);
 406
 407         return r;
 408 }
 409
 410 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 411 {
 412         struct mapped_device *md = bdev->bd_disk->private_data;
 413
 414         return dm_get_geometry(md, geo);
 415 }
 416
 417 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 418                         unsigned int cmd, unsigned long arg)
 419 {
 420         struct mapped_device *md = bdev->bd_disk->private_data;
 421         struct dm_table *map = dm_get_live_table(md);
 422         struct dm_target *tgt;
 423         int r = -ENOTTY;
 424
 425         if (!map || !dm_table_get_size(map))
 426                 goto out;
 427
 428         /* We only support devices that have a single target */
 429         if (dm_table_get_num_targets(map) != 1)
 430                 goto out;
 431
 432         tgt = dm_table_get_target(map, 0);
 433
 434         if (dm_suspended_md(md)) {
 435                 r = -EAGAIN;
 436                 goto out;
 437         }
 438
 439         if (tgt->type->ioctl)
 440                 r = tgt->type->ioctl(tgt, cmd, arg);
 441
 442 out:
 443         dm_table_put(map);
 444
 445         return r;
 446 }
 447
 448 static struct dm_io *alloc_io(struct mapped_device *md)
 449 {
 450         return mempool_alloc(md->io_pool, GFP_NOIO);
 451 }
 452
 453 static void free_io(struct mapped_device *md, struct dm_io *io)
 454 {
 455         mempool_free(io, md->io_pool);
 456 }
 457
 458 static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
 459 {
 460         mempool_free(tio, md->tio_pool);
 461 }
 462
 463 static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
 464                                             gfp_t gfp_mask)
 465 {
 466         return mempool_alloc(md->tio_pool, gfp_mask);
 467 }
 468
 469 static void free_rq_tio(struct dm_rq_target_io *tio)
 470 {
 471         mempool_free(tio, tio->md->tio_pool);
 472 }
 473
 474 static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md)
 475 {
 476         return mempool_alloc(md->io_pool, GFP_ATOMIC);
 477 }
 478
 479 static void free_bio_info(struct dm_rq_clone_bio_info *info)
 480 {
 481         mempool_free(info, info->tio->md->io_pool);
 482 }
 483
 484 static int md_in_flight(struct mapped_device *md)
 485 {
 486         return atomic_read(&md->pending[READ]) +
 487                atomic_read(&md->pending[WRITE]);
 488 }
 489
 490 static void start_io_acct(struct dm_io *io)
 491 {
 492         struct mapped_device *md = io->md;
 493         int cpu;
 494         int rw = bio_data_dir(io->bio);
 495
 496         io->start_time = jiffies;
 497
 498         cpu = part_stat_lock();
 499         part_round_stats(cpu, &dm_disk(md)->part0);
 500         part_stat_unlock();
 501         atomic_set(&dm_disk(md)->part0.in_flight[rw],
 502                 atomic_inc_return(&md->pending[rw]));
 503 }
 504
 505 static void end_io_acct(struct dm_io *io)
 506 {
 507         struct mapped_device *md = io->md;
 508         struct bio *bio = io->bio;
 509         unsigned long duration = jiffies - io->start_time;
 510         int pending, cpu;
 511         int rw = bio_data_dir(bio);
 512
 513         cpu = part_stat_lock();
 514         part_round_stats(cpu, &dm_disk(md)->part0);
 515         part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
 516         part_stat_unlock();
 517
 518         /*
 519          * After this is decremented the bio must not be touched if it is
 520          * a flush.
 521          */
 522         pending = atomic_dec_return(&md->pending[rw]);
 523         atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
 524         pending += atomic_read(&md->pending[rw^0x1]);
 525
 526         /* nudge anyone waiting on suspend queue */
 527         if (!pending)
 528                 wake_up(&md->wait);
 529 }
 530
 531 /*
 532  * Add the bio to the list of deferred io.
 533  */
 534 static void queue_io(struct mapped_device *md, struct bio *bio)
 535 {
 536         unsigned long flags;
 537
 538         spin_lock_irqsave(&md->deferred_lock, flags);
 539         bio_list_add(&md->deferred, bio);
 540         spin_unlock_irqrestore(&md->deferred_lock, flags);
 541         queue_work(md->wq, &md->work);
 542 }
 543
 544 /*
 545  * Everyone (including functions in this file), should use this
 546  * function to access the md->map field, and make sure they call
 547  * dm_table_put() when finished.
 548  */
 549 struct dm_table *dm_get_live_table(struct mapped_device *md)
 550 {
 551         struct dm_table *t;
 552         unsigned long flags;
 553
 554         read_lock_irqsave(&md->map_lock, flags);
 555         t = md->map;
 556         if (t)
 557                 dm_table_get(t);
 558         read_unlock_irqrestore(&md->map_lock, flags);
 559
 560         return t;
 561 }
 562
 563 /*
 564  * Get the geometry associated with a dm device
 565  */
 566 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
 567 {
 568         *geo = md->geometry;
 569
 570         return 0;
 571 }
 572
 573 /*
 574  * Set the geometry of a device.
 575  */
 576 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
 577 {
 578         sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
 579
 580         if (geo->start > sz) {
 581                 DMWARN("Start sector is beyond the geometry limits.");
 582                 return -EINVAL;
 583         }
 584
 585         md->geometry = *geo;
 586
 587         return 0;
 588 }
 589
 590 /*-----------------------------------------------------------------
 591  * CRUD START:
 592  *   A more elegant solution is in the works that uses the queue
 593  *   merge fn; unfortunately there are a couple of changes to
 594  *   the block layer that I want to make for this.  So in the
 595  *   interests of getting something for people to use I give
 596  *   you this clearly demarcated crap.
 597  *---------------------------------------------------------------*/
 598
 599 static int __noflush_suspending(struct mapped_device *md)
 600 {
 601         return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
 602 }
 603
 604 /*
 605  * Decrements the number of outstanding ios that a bio has been
 606  * cloned into, completing the original io if necessary.
 607  */
 608 static void dec_pending(struct dm_io *io, int error)
 609 {
 610         unsigned long flags;
 611         int io_error;
 612         struct bio *bio;
 613         struct mapped_device *md = io->md;
 614
 615         /* Push-back supersedes any I/O errors */
 616         if (unlikely(error)) {
 617                 spin_lock_irqsave(&io->endio_lock, flags);
 618                 if (!(io->error > 0 && __noflush_suspending(md))) /* keep a positive code already recorded during noflush suspend */
 619                         io->error = error;
 620                 spin_unlock_irqrestore(&io->endio_lock, flags);
 621         }
 622
 623         if (atomic_dec_and_test(&io->io_count)) { /* last reference: finish the original bio */
 624                 if (io->error == DM_ENDIO_REQUEUE) {
 625                         /*
 626                          * Target requested pushing back the I/O.
 627                          */
 628                         spin_lock_irqsave(&md->deferred_lock, flags);
 629                         if (__noflush_suspending(md))
 630                                 bio_list_add_head(&md->deferred, io->bio);
 631                         else
 632                                 /* noflush suspend was interrupted. */
 633                                 io->error = -EIO;
 634                         spin_unlock_irqrestore(&md->deferred_lock, flags);
 635                 }
 636
 637                 io_error = io->error; /* copy out what is still needed: io is freed below */
 638                 bio = io->bio;
 639                 end_io_acct(io);
 640                 free_io(md, io);
 641
 642                 if (io_error == DM_ENDIO_REQUEUE) /* bio was put on md->deferred above; do not complete it */
 643                         return;
 644
 645                 if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
 646                         /*
 647                          * Preflush done for flush with data, reissue
 648                          * without REQ_FLUSH.
 649                          */
 650                         bio->bi_rw &= ~REQ_FLUSH;
 651                         queue_io(md, bio);
 652                 } else {
 653                         /* done with normal IO or empty flush */
 654                         trace_block_bio_complete(md->queue, bio, io_error);
 655                         bio_endio(bio, io_error);
 656                 }
 657         }
 658 }
 659
     /*
      * bi_end_io handler installed on clones by __map_bio(): give the target's
      * end_io hook (if any) a chance to override the status, then free the
      * per-clone state and drop the clone's reference on the dm_io.
      */
 660 static void clone_endio(struct bio *bio, int error)
 661 {
 662         int r = 0;
 663         struct dm_target_io *tio = bio->bi_private;
 664         struct dm_io *io = tio->io;
 665         struct mapped_device *md = tio->io->md;
 666         dm_endio_fn endio = tio->ti->type->end_io;
 667
 668         if (!bio_flagged(bio, BIO_UPTODATE) && !error) /* not up to date but no error reported: treat as -EIO */
 669                 error = -EIO;
 670
 671         if (endio) {
 672                 r = endio(tio->ti, bio, error, &tio->info);
 673                 if (r < 0 || r == DM_ENDIO_REQUEUE)
 674                         /*
 675                          * error and requeue request are handled
 676                          * in dec_pending().
 677                          */
 678                         error = r;
 679                 else if (r == DM_ENDIO_INCOMPLETE)
 680                         /* The target will handle the io */
 681                         return;
 682                 else if (r) {
 683                         DMWARN("unimplemented target endio return value: %d", r);
 684                         BUG();
 685                 }
 686         }
 687
 688         /*
 689          * Store the bio_set (md->bs) for cleanup instead of tio which is about to get freed.
 690          */
 691         bio->bi_private = md->bs;
 692
 693         free_tio(md, tio);
 694         bio_put(bio);
 695         dec_pending(io, error);
 696 }
 697
 698 /*
 699  * Partial completion handling for request-based dm
 700  */
 701 static void end_clone_bio(struct bio *clone, int error)
 702 {
 703         struct dm_rq_clone_bio_info *info = clone->bi_private;
 704         struct dm_rq_target_io *tio = info->tio;
 705         struct bio *bio = info->orig;
 706         unsigned int nr_bytes = info->orig->bi_size;
 707
 708         bio_put(clone);
 709
 710         if (tio->error)
 711                 /*
 712                  * An error has already been detected on the request.
 713                  * Once error occurred, just let clone->end_io() handle
 714                  * the remainder.
 715                  */
 716                 return;
 717         else if (error) {
 718                 /*
 719                  * Don't notice the error to the upper layer yet.
 720                  * The error handling decision is made by the target driver,
 721                  * when the request is completed.
 722                  */
 723                 tio->error = error;
 724                 return;
 725         }
 726
 727         /*
 728          * I/O for the bio successfully completed.
 729          * Notice the data completion to the upper layer.
 730          */
 731
 732         /*
 733          * bios are processed from the head of the list.
 734          * So the completing bio should always be rq->bio.
 735          * If it's not, something wrong is happening.
 736          */
 737         if (tio->orig->bio != bio)
 738                 DMERR("bio completion is going in the middle of the request");
 739
 740         /*
 741          * Update the original request.
 742          * Do not use blk_end_request() here, because it may complete
 743          * the original request before the clone, and break the ordering.
 744          */
 745         blk_update_request(tio->orig, 0, nr_bytes);
 746 }
 747
 748 /*
 749  * Don't touch any member of the md after calling this function because
 750  * the md may be freed in dm_put() at the end of this function.
 751  * Or do dm_get() before calling this function and dm_put() later.
 752  */
 753 static void rq_completed(struct mapped_device *md, int rw, int run_queue)
 754 {
 755         atomic_dec(&md->pending[rw]);
 756
 757         /* nudge anyone waiting on suspend queue */
 758         if (!md_in_flight(md))
 759                 wake_up(&md->wait);
 760
 761         /*
 762          * Run this off this callpath, as drivers could invoke end_io while
 763          * inside their request_fn (and holding the queue lock). Calling
 764          * back into ->request_fn() could deadlock attempting to grab the
 765          * queue lock again.
 766          */
 767         if (run_queue)
 768                 blk_run_queue_async(md->queue);
 769
 770         /*
 771          * dm_put() must be at the end of this function. See the comment above
 772          */
 773         dm_put(md);
 774 }
 775
 776 static void free_rq_clone(struct request *clone)
 777 {
 778         struct dm_rq_target_io *tio = clone->end_io_data;
 779
 780         blk_rq_unprep_clone(clone);
 781         free_rq_tio(tio);
 782 }
 783
 784 /*
 785  * Complete the clone and the original request.
 786  * Must be called without queue lock.
 787  */
 788 static void dm_end_request(struct request *clone, int error)
 789 {
 790         int rw = rq_data_dir(clone);
 791         struct dm_rq_target_io *tio = clone->end_io_data;
 792         struct mapped_device *md = tio->md;
 793         struct request *rq = tio->orig;
 794
 795         if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
 796                 rq->errors = clone->errors;
 797                 rq->resid_len = clone->resid_len;
 798
 799                 if (rq->sense)
 800                         /*
 801                          * We are using the sense buffer of the original
 802                          * request.
 803                          * So setting the length of the sense data is enough.
 804                          */
 805                         rq->sense_len = clone->sense_len;
 806         }
 807
 808         free_rq_clone(clone);
 809         blk_end_request_all(rq, error);
 810         rq_completed(md, rw, true);
 811 }
 812
 813 static void dm_unprep_request(struct request *rq)
 814 {
 815         struct request *clone = rq->special;
 816
 817         rq->special = NULL;
 818         rq->cmd_flags &= ~REQ_DONTPREP;
 819
 820         free_rq_clone(clone);
 821 }
 822
 823 /*
 824  * Requeue the original request of a clone.
 825  */
 826 void dm_requeue_unmapped_request(struct request *clone)
 827 {
 828         int rw = rq_data_dir(clone);
 829         struct dm_rq_target_io *tio = clone->end_io_data;
 830         struct mapped_device *md = tio->md;
 831         struct request *rq = tio->orig;
 832         struct request_queue *q = rq->q;
 833         unsigned long flags;
 834
 835         dm_unprep_request(rq);
 836
 837         spin_lock_irqsave(q->queue_lock, flags);
 838         blk_requeue_request(q, rq);
 839         spin_unlock_irqrestore(q->queue_lock, flags);
 840
 841         rq_completed(md, rw, 0);
 842 }
 843 EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
 844
 845 static void __stop_queue(struct request_queue *q)
 846 {
 847         blk_stop_queue(q);
 848 }
 849
 850 static void stop_queue(struct request_queue *q)
 851 {
 852         unsigned long flags;
 853
 854         spin_lock_irqsave(q->queue_lock, flags);
 855         __stop_queue(q);
 856         spin_unlock_irqrestore(q->queue_lock, flags);
 857 }
 858
 859 static void __start_queue(struct request_queue *q)
 860 {
 861         if (blk_queue_stopped(q))
 862                 blk_start_queue(q);
 863 }
 864
 865 static void start_queue(struct request_queue *q)
 866 {
 867         unsigned long flags;
 868
 869         spin_lock_irqsave(q->queue_lock, flags);
 870         __start_queue(q);
 871         spin_unlock_irqrestore(q->queue_lock, flags);
 872 }
 873
 874 static void dm_done(struct request *clone, int error, bool mapped)
 875 {
 876         int r = error;
 877         struct dm_rq_target_io *tio = clone->end_io_data;
 878         dm_request_endio_fn rq_end_io = NULL;
 879
 880         if (tio->ti) {
 881                 rq_end_io = tio->ti->type->rq_end_io;
 882
 883                 if (mapped && rq_end_io)
 884                         r = rq_end_io(tio->ti, clone, error, &tio->info);
 885         }
 886
 887         if (r <= 0)
 888                 /* The target wants to complete the I/O */
 889                 dm_end_request(clone, r);
 890         else if (r == DM_ENDIO_INCOMPLETE)
 891                 /* The target will handle the I/O */
 892                 return;
 893         else if (r == DM_ENDIO_REQUEUE)
 894                 /* The target wants to requeue the I/O */
 895                 dm_requeue_unmapped_request(clone);
 896         else {
 897                 DMWARN("unimplemented target endio return value: %d", r);
 898                 BUG();
 899         }
 900 }
 901
 902 /*
 903  * Request completion handler for request-based dm
 904  */
 905 static void dm_softirq_done(struct request *rq)
 906 {
 907         bool mapped = true;
 908         struct request *clone = rq->completion_data;
 909         struct dm_rq_target_io *tio = clone->end_io_data;
 910
 911         if (rq->cmd_flags & REQ_FAILED)
 912                 mapped = false;
 913
 914         dm_done(clone, tio->error, mapped);
 915 }
 916
 917 /*
 918  * Complete the clone and the original request with the error status
 919  * through softirq context.
 920  */
 921 static void dm_complete_request(struct request *clone, int error)
 922 {
 923         struct dm_rq_target_io *tio = clone->end_io_data;
 924         struct request *rq = tio->orig;
 925
 926         tio->error = error;
 927         rq->completion_data = clone;
 928         blk_complete_request(rq);
 929 }
 930
 931 /*
 932  * Complete the not-mapped clone and the original request with the error status
 933  * through softirq context.
 934  * Target's rq_end_io() function isn't called.
 935  * This may be used when the target's map_rq() function fails.
 936  */
 937 void dm_kill_unmapped_request(struct request *clone, int error)
 938 {
 939         struct dm_rq_target_io *tio = clone->end_io_data;
 940         struct request *rq = tio->orig;
 941
 942         rq->cmd_flags |= REQ_FAILED;
 943         dm_complete_request(clone, error);
 944 }
 945 EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);
 946
 947 /*
 948  * Called with the queue lock held
 949  */
 950 static void end_clone_request(struct request *clone, int error)
 951 {
 952         /*
 953          * For just cleaning up the information of the queue in which
 954          * the clone was dispatched.
 955          * The clone is *NOT* freed actually here because it is alloced from
 956          * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
 957          */
 958         __blk_put_request(clone->q, clone);
 959
 960         /*
 961          * Actual request completion is done in a softirq context which doesn't
 962          * hold the queue lock.  Otherwise, deadlock could occur because:
 963          *     - another request may be submitted by the upper level driver
 964          *       of the stacking during the completion
 965          *     - the submission which requires queue lock may be done
 966          *       against this queue
 967          */
 968         dm_complete_request(clone, error);
 969 }
 970
 971 /*
 972  * Return maximum size of I/O possible at the supplied sector up to the current
 973  * target boundary.
 974  */
 975 static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
 976 {
 977         sector_t target_offset = dm_target_offset(ti, sector);
 978
 979         return ti->len - target_offset;
 980 }
 981
 982 static sector_t max_io_len(sector_t sector, struct dm_target *ti)
 983 {
 984         sector_t len = max_io_len_target_boundary(sector, ti);
 985
 986         /*
 987          * Does the target need to split even further ?
 988          */
 989         if (ti->split_io) {
 990                 sector_t boundary;
 991                 sector_t offset = dm_target_offset(ti, sector);
 992                 boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
 993                            - offset;
 994                 if (len > boundary)
 995                         len = boundary;
 996         }
 997
 998         return len;
 999 }
1000
     /*
      * Hand @clone to target @ti's map function and act on its verdict:
      * dispatch a remapped clone, or on error/requeue drop the reference
      * taken here and free the clone and @tio.
      */
1001 static void __map_bio(struct dm_target *ti, struct bio *clone,
1002                       struct dm_target_io *tio)
1003 {
1004         int r;
1005         sector_t sector;
1006         struct mapped_device *md;
1007
1008         clone->bi_end_io = clone_endio;
1009         clone->bi_private = tio;
1010
1011         /*
1012          * Map the clone.  If r == 0 we don't need to do
1013          * anything, the target has assumed ownership of
1014          * this io.
1015          */
1016         atomic_inc(&tio->io->io_count); /* dropped again by dec_pending() */
1017         sector = clone->bi_sector; /* sampled before ->map() runs; passed to the remap trace event */
1018         r = ti->type->map(ti, clone, &tio->info);
1019         if (r == DM_MAPIO_REMAPPED) {
1020                 /* the bio has been remapped so dispatch it */
1021
1022                 trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
1023                                       tio->io->bio->bi_bdev->bd_dev, sector);
1024
1025                 generic_make_request(clone);
1026         } else if (r < 0 || r == DM_MAPIO_REQUEUE) {
1027                 /* error the io and bail out, or requeue it if needed */
1028                 md = tio->io->md; /* fetch md first: dec_pending() may free tio->io */
1029                 dec_pending(tio->io, r);
1030                 /*
1031                  * Store bio_set for cleanup.
1032                  */
1033                 clone->bi_private = md->bs;
1034                 bio_put(clone);
1035                 free_tio(md, tio);
1036         } else if (r) {
1037                 DMWARN("unimplemented target map return value: %d", r);
1038                 BUG();
1039         }
1040 }
1041
1042 struct clone_info {                     /* cursor state while one bio is split into clones */
1043         struct mapped_device *md;       /* device the bio is being processed for */
1044         struct dm_table *map;           /* live table used to look up targets */
1045         struct bio *bio;                /* bio being cloned (md->flush_bio for REQ_FLUSH) */
1046         struct dm_io *io;               /* per-bio dm_io shared by all clones */
1047         sector_t sector;                /* next sector to map; advanced as clones are issued */
1048         sector_t sector_count;          /* sectors still to be mapped */
1049         unsigned short idx;             /* index into bio->bi_io_vec of the next bvec */
1050 };
1051
1052 static void dm_bio_destructor(struct bio *bio)
1053 {
1054         struct bio_set *bs = bio->bi_private;
1055
1056         bio_free(bio, bs);
1057 }
1058
1059 /*
1060  * Creates a little bio that just does part of a bvec.
1061  */
1062 static struct bio *split_bvec(struct bio *bio, sector_t sector,
1063                               unsigned short idx, unsigned int offset,
1064                               unsigned int len, struct bio_set *bs)
1065 {
1066         struct bio *clone;
1067         struct bio_vec *bv = bio->bi_io_vec + idx;
1068
1069         clone = bio_alloc_bioset(GFP_NOIO, 1, bs);
1070         clone->bi_destructor = dm_bio_destructor;
1071         *clone->bi_io_vec = *bv;
1072
1073         clone->bi_sector = sector;
1074         clone->bi_bdev = bio->bi_bdev;
1075         clone->bi_rw = bio->bi_rw;
1076         clone->bi_vcnt = 1;
1077         clone->bi_size = to_bytes(len);
1078         clone->bi_io_vec->bv_offset = offset;
1079         clone->bi_io_vec->bv_len = clone->bi_size;
1080         clone->bi_flags |= 1 << BIO_CLONED;
1081
1082         if (bio_integrity(bio)) {
1083                 bio_integrity_clone(clone, bio, GFP_NOIO, bs);
1084                 bio_integrity_trim(clone,
1085                                    bio_sector_offset(bio, idx, offset), len);
1086         }
1087
1088         return clone;
1089 }
1090
1091 /*
1092  * Creates a bio that consists of range of complete bvecs.
1093  */
1094 static struct bio *clone_bio(struct bio *bio, sector_t sector,
1095                              unsigned short idx, unsigned short bv_count,
1096                              unsigned int len, struct bio_set *bs)
1097 {
1098         struct bio *clone;
1099
1100         clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
1101         __bio_clone(clone, bio);
1102         clone->bi_destructor = dm_bio_destructor;
1103         clone->bi_sector = sector;
1104         clone->bi_idx = idx;
1105         clone->bi_vcnt = idx + bv_count;
1106         clone->bi_size = to_bytes(len);
1107         clone->bi_flags &= ~(1 << BIO_SEG_VALID);
1108
1109         if (bio_integrity(bio)) {
1110                 bio_integrity_clone(clone, bio, GFP_NOIO, bs);
1111
1112                 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
1113                         bio_integrity_trim(clone,
1114                                            bio_sector_offset(bio, idx, 0), len);
1115         }
1116
1117         return clone;
1118 }
1119
1120 static struct dm_target_io *alloc_tio(struct clone_info *ci,
1121                                       struct dm_target *ti)
1122 {
1123         struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO);
1124
1125         tio->io = ci->io;
1126         tio->ti = ti;
1127         memset(&tio->info, 0, sizeof(tio->info));
1128
1129         return tio;
1130 }
1131
1132 static void __issue_target_request(struct clone_info *ci, struct dm_target *ti,
1133                                    unsigned request_nr, sector_t len)
1134 {
1135         struct dm_target_io *tio = alloc_tio(ci, ti);
1136         struct bio *clone;
1137
1138         tio->info.target_request_nr = request_nr;
1139
1140         /*
1141          * Discard requests require the bio's inline iovecs be initialized.
1142          * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
1143          * and discard, so no need for concern about wasted bvec allocations.
1144          */
1145         clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs);
1146         __bio_clone(clone, ci->bio);
1147         clone->bi_destructor = dm_bio_destructor;
1148         if (len) {
1149                 clone->bi_sector = ci->sector;
1150                 clone->bi_size = to_bytes(len);
1151         }
1152
1153         __map_bio(ti, clone, tio);
1154 }
1155
1156 static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti,
1157                                     unsigned num_requests, sector_t len)
1158 {
1159         unsigned request_nr;
1160
1161         for (request_nr = 0; request_nr < num_requests; request_nr++)
1162                 __issue_target_request(ci, ti, request_nr, len);
1163 }
1164
1165 static int __clone_and_map_empty_flush(struct clone_info *ci)
1166 {
1167         unsigned target_nr = 0;
1168         struct dm_target *ti;
1169
1170         BUG_ON(bio_has_data(ci->bio));
1171         while ((ti = dm_table_get_target(ci->map, target_nr++)))
1172                 __issue_target_requests(ci, ti, ti->num_flush_requests, 0);
1173
1174         return 0;
1175 }
1176
1177 /*
1178  * Perform all io with a single clone.
1179  */
1180 static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti)
1181 {
1182         struct bio *clone, *bio = ci->bio;
1183         struct dm_target_io *tio;
1184
1185         tio = alloc_tio(ci, ti);
1186         clone = clone_bio(bio, ci->sector, ci->idx,
1187                           bio->bi_vcnt - ci->idx, ci->sector_count,
1188                           ci->md->bs);
1189         __map_bio(ti, clone, tio);
1190         ci->sector_count = 0;
1191 }
1192
1193 static int __clone_and_map_discard(struct clone_info *ci)
1194 {
1195         struct dm_target *ti;
1196         sector_t len;
1197
1198         do {
1199                 ti = dm_table_find_target(ci->map, ci->sector);
1200                 if (!dm_target_is_valid(ti))
1201                         return -EIO;
1202
1203                 /*
1204                  * Even though the device advertised discard support,
1205                  * that does not mean every target supports it, and
1206                  * reconfiguration might also have changed that since the
1207                  * check was performed.
1208                  */
1209                 if (!ti->num_discard_requests)
1210                         return -EOPNOTSUPP;
1211
1212                 len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1213
1214                 __issue_target_requests(ci, ti, ti->num_discard_requests, len);
1215
1216                 ci->sector += len;
1217         } while (ci->sector_count -= len);
1218
1219         return 0;
1220 }
1221
1222 static int __clone_and_map(struct clone_info *ci)
1223 {
1224         struct bio *clone, *bio = ci->bio;
1225         struct dm_target *ti;
1226         sector_t len = 0, max;
1227         struct dm_target_io *tio;
1228
1229         if (unlikely(bio->bi_rw & REQ_DISCARD))
1230                 return __clone_and_map_discard(ci);
1231
1232         ti = dm_table_find_target(ci->map, ci->sector);
1233         if (!dm_target_is_valid(ti))
1234                 return -EIO;
1235
1236         max = max_io_len(ci->sector, ti);
1237
1238         if (ci->sector_count <= max) {
1239                 /*
1240                  * Optimise for the simple case where we can do all of
1241                  * the remaining io with a single clone.
1242                  */
1243                 __clone_and_map_simple(ci, ti);
1244
1245         } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
1246                 /*
1247                  * There are some bvecs that don't span targets.
1248                  * Do as many of these as possible.
1249                  */
1250                 int i;
1251                 sector_t remaining = max;
1252                 sector_t bv_len;
1253
1254                 for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
1255                         bv_len = to_sector(bio->bi_io_vec[i].bv_len);
1256
1257                         if (bv_len > remaining)
1258                                 break;
1259
1260                         remaining -= bv_len;
1261                         len += bv_len;
1262                 }
1263
1264                 tio = alloc_tio(ci, ti);
1265                 clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
1266                                   ci->md->bs);
1267                 __map_bio(ti, clone, tio);
1268
1269                 ci->sector += len;
1270                 ci->sector_count -= len;
1271                 ci->idx = i;
1272
1273         } else {
1274                 /*
1275                  * Handle a bvec that must be split between two or more targets.
1276                  */
1277                 struct bio_vec *bv = bio->bi_io_vec + ci->idx;
1278                 sector_t remaining = to_sector(bv->bv_len);
1279                 unsigned int offset = 0;
1280
1281                 do {
1282                         if (offset) {
1283                                 ti = dm_table_find_target(ci->map, ci->sector);
1284                                 if (!dm_target_is_valid(ti))
1285                                         return -EIO;
1286
1287                                 max = max_io_len(ci->sector, ti);
1288                         }
1289
1290                         len = min(remaining, max);
1291
1292                         tio = alloc_tio(ci, ti);
1293                         clone = split_bvec(bio, ci->sector, ci->idx,
1294                                            bv->bv_offset + offset, len,
1295                                            ci->md->bs);
1296
1297                         __map_bio(ti, clone, tio);
1298
1299                         ci->sector += len;
1300                         ci->sector_count -= len;
1301                         offset += to_bytes(len);
1302                 } while (remaining -= len);
1303
1304                 ci->idx++;
1305         }
1306
1307         return 0;
1308 }
1309
1310 /*
1311  * Split the bio into several clones and submit it to targets.
1312  */
1313 static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
1314 {
1315         struct clone_info ci;
1316         int error = 0;
1317
1318         ci.map = dm_get_live_table(md);
1319         if (unlikely(!ci.map)) {
1320                 bio_io_error(bio);
1321                 return;
1322         }
1323
1324         ci.md = md;
1325         ci.io = alloc_io(md);
1326         ci.io->error = 0;
1327         atomic_set(&ci.io->io_count, 1);
1328         ci.io->bio = bio;
1329         ci.io->md = md;
1330         spin_lock_init(&ci.io->endio_lock);
1331         ci.sector = bio->bi_sector;
1332         ci.idx = bio->bi_idx;
1333
1334         start_io_acct(ci.io);
1335         if (bio->bi_rw & REQ_FLUSH) {
1336                 ci.bio = &ci.md->flush_bio;
1337                 ci.sector_count = 0;
1338                 error = __clone_and_map_empty_flush(&ci);
1339                 /* dec_pending submits any data associated with flush */
1340         } else {
1341                 ci.bio = bio;
1342                 ci.sector_count = bio_sectors(bio);
1343                 while (ci.sector_count && !error)
1344                         error = __clone_and_map(&ci);
1345         }
1346
1347         /* drop the extra reference count */
1348         dec_pending(ci.io, error);
1349         dm_table_put(ci.map);
1350 }
1351 /*-----------------------------------------------------------------
1352  * CRUD END
1353  *---------------------------------------------------------------*/
1354
1355 static int dm_merge_bvec(struct request_queue *q,
1356                          struct bvec_merge_data *bvm,
1357                          struct bio_vec *biovec)
1358 {
1359         struct mapped_device *md = q->queuedata;
1360         struct dm_table *map = dm_get_live_table(md);
1361         struct dm_target *ti;
1362         sector_t max_sectors;
1363         int max_size = 0;
1364
1365         if (unlikely(!map))
1366                 goto out;
1367
1368         ti = dm_table_find_target(map, bvm->bi_sector);
1369         if (!dm_target_is_valid(ti))
1370                 goto out_table;
1371
1372         /*
1373          * Find maximum amount of I/O that won't need splitting
1374          */
1375         max_sectors = min(max_io_len(bvm->bi_sector, ti),
1376                           (sector_t) BIO_MAX_SECTORS);
1377         max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
1378         if (max_size < 0)
1379                 max_size = 0;
1380
1381         /*
1382          * merge_bvec_fn() returns number of bytes
1383          * it can accept at this offset
1384          * max is precomputed maximal io size
1385          */
1386         if (max_size && ti->type->merge)
1387                 max_size = ti->type->merge(ti, bvm, biovec, max_size);
1388         /*
1389          * If the target doesn't support merge method and some of the devices
1390          * provided their merge_bvec method (we know this by looking at
1391          * queue_max_hw_sectors), then we can't allow bios with multiple vector
1392          * entries.  So always set max_size to 0, and the code below allows
1393          * just one page.
1394          */
1395         else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
1396
1397                 max_size = 0;
1398
1399 out_table:
1400         dm_table_put(map);
1401
1402 out:
1403         /*
1404          * Always allow an entire first page
1405          */
1406         if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
1407                 max_size = biovec->bv_len;
1408
1409         return max_size;
1410 }
1411
1412 /*
1413  * The request function that just remaps the bio built up by
1414  * dm_merge_bvec.
1415  */
1416 static void _dm_request(struct request_queue *q, struct bio *bio)
1417 {
1418         int rw = bio_data_dir(bio);
1419         struct mapped_device *md = q->queuedata;
1420         int cpu;
1421
1422         down_read(&md->io_lock);
1423
1424         cpu = part_stat_lock();
1425         part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
1426         part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
1427         part_stat_unlock();
1428
1429         /* if we're suspended, we have to queue this io for later */
1430         if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1431                 up_read(&md->io_lock);
1432
1433                 if (bio_rw(bio) != READA)
1434                         queue_io(md, bio);
1435                 else
1436                         bio_io_error(bio);
1437                 return;
1438         }
1439
1440         __split_and_process_bio(md, bio);
1441         up_read(&md->io_lock);
1442         return;
1443 }
1444
1445 static int dm_request_based(struct mapped_device *md)
1446 {
1447         return blk_queue_stackable(md->queue);
1448 }
1449
1450 static void dm_request(struct request_queue *q, struct bio *bio)
1451 {
1452         struct mapped_device *md = q->queuedata;
1453
1454         if (dm_request_based(md))
1455                 blk_queue_bio(q, bio);
1456         else
1457                 _dm_request(q, bio);
1458 }
1459
1460 void dm_dispatch_request(struct request *rq)
1461 {
1462         int r;
1463
1464         if (blk_queue_io_stat(rq->q))
1465                 rq->cmd_flags |= REQ_IO_STAT;
1466
1467         rq->start_time = jiffies;
1468         r = blk_insert_cloned_request(rq->q, rq);
1469         if (r)
1470                 dm_complete_request(rq, r);
1471 }
1472 EXPORT_SYMBOL_GPL(dm_dispatch_request);
1473
1474 static void dm_rq_bio_destructor(struct bio *bio)
1475 {
1476         struct dm_rq_clone_bio_info *info = bio->bi_private;
1477         struct mapped_device *md = info->tio->md;
1478
1479         free_bio_info(info);
1480         bio_free(bio, md->bs);
1481 }
1482
1483 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
1484                                  void *data)
1485 {
1486         struct dm_rq_target_io *tio = data;
1487         struct mapped_device *md = tio->md;
1488         struct dm_rq_clone_bio_info *info = alloc_bio_info(md);
1489
1490         if (!info)
1491                 return -ENOMEM;
1492
1493         info->orig = bio_orig;
1494         info->tio = tio;
1495         bio->bi_end_io = end_clone_bio;
1496         bio->bi_private = info;
1497         bio->bi_destructor = dm_rq_bio_destructor;
1498
1499         return 0;
1500 }
1501
1502 static int setup_clone(struct request *clone, struct request *rq,
1503                        struct dm_rq_target_io *tio)
1504 {
1505         int r;
1506
1507         r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
1508                               dm_rq_bio_constructor, tio);
1509         if (r)
1510                 return r;
1511
1512         clone->cmd = rq->cmd;
1513         clone->cmd_len = rq->cmd_len;
1514         clone->sense = rq->sense;
1515         clone->buffer = rq->buffer;
1516         clone->end_io = end_clone_request;
1517         clone->end_io_data = tio;
1518
1519         return 0;
1520 }
1521
1522 static struct request *clone_rq(struct request *rq, struct mapped_device *md,
1523                                 gfp_t gfp_mask)
1524 {
1525         struct request *clone;
1526         struct dm_rq_target_io *tio;
1527
1528         tio = alloc_rq_tio(md, gfp_mask);
1529         if (!tio)
1530                 return NULL;
1531
1532         tio->md = md;
1533         tio->ti = NULL;
1534         tio->orig = rq;
1535         tio->error = 0;
1536         memset(&tio->info, 0, sizeof(tio->info));
1537
1538         clone = &tio->clone;
1539         if (setup_clone(clone, rq, tio)) {
1540                 /* -ENOMEM */
1541                 free_rq_tio(tio);
1542                 return NULL;
1543         }
1544
1545         return clone;
1546 }
1547
1548 /*
1549  * Called with the queue lock held.
1550  */
1551 static int dm_prep_fn(struct request_queue *q, struct request *rq)
1552 {
1553         struct mapped_device *md = q->queuedata;
1554         struct request *clone;
1555
1556         if (unlikely(rq->special)) {
1557                 DMWARN("Already has something in rq->special.");
1558                 return BLKPREP_KILL;
1559         }
1560
1561         clone = clone_rq(rq, md, GFP_ATOMIC);
1562         if (!clone)
1563                 return BLKPREP_DEFER;
1564
1565         rq->special = clone;
1566         rq->cmd_flags |= REQ_DONTPREP;
1567
1568         return BLKPREP_OK;
1569 }
1570
1571 /*
1572  * Returns:
1573  * 0  : the request has been processed (not requeued)
1574  * !0 : the request has been requeued
1575  */
1576 static int map_request(struct dm_target *ti, struct request *clone,
1577                        struct mapped_device *md)
1578 {
1579         int r, requeued = 0;
1580         struct dm_rq_target_io *tio = clone->end_io_data;
1581
1582         tio->ti = ti;
1583         r = ti->type->map_rq(ti, clone, &tio->info);
1584         switch (r) {
1585         case DM_MAPIO_SUBMITTED:
1586                 /* The target has taken the I/O to submit by itself later */
1587                 break;
1588         case DM_MAPIO_REMAPPED:
1589                 /* The target has remapped the I/O so dispatch it */
1590                 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
1591                                      blk_rq_pos(tio->orig));
1592                 dm_dispatch_request(clone);
1593                 break;
1594         case DM_MAPIO_REQUEUE:
1595                 /* The target wants to requeue the I/O */
1596                 dm_requeue_unmapped_request(clone);
1597                 requeued = 1;
1598                 break;
1599         default:
1600                 if (r > 0) {
1601                         DMWARN("unimplemented target map return value: %d", r);
1602                         BUG();
1603                 }
1604
1605                 /* The target wants to complete the I/O */
1606                 dm_kill_unmapped_request(clone, r);
1607                 break;
1608         }
1609
1610         return requeued;
1611 }
1612
1613 static struct request *dm_start_request(struct mapped_device *md, struct request *orig)
1614 {
1615         struct request *clone;
1616
1617         blk_start_request(orig);
1618         clone = orig->special;
1619         atomic_inc(&md->pending[rq_data_dir(clone)]);
1620
1621         /*
1622          * Hold the md reference here for the in-flight I/O.
1623          * We can't rely on the reference count by device opener,
1624          * because the device may be closed during the request completion
1625          * when all bios are completed.
1626          * See the comment in rq_completed() too.
1627          */
1628         dm_get(md);
1629
1630         return clone;
1631 }
1632
1633 /*
1634  * q->request_fn for request-based dm.
1635  * Called with the queue lock held.
1636  */
1637 static void dm_request_fn(struct request_queue *q)
1638 {
1639         struct mapped_device *md = q->queuedata;
1640         struct dm_table *map = dm_get_live_table(md);
1641         struct dm_target *ti;
1642         struct request *rq, *clone;
1643         sector_t pos;
1644
1645         /*
1646          * For suspend, check blk_queue_stopped() and increment
1647          * ->pending within a single queue_lock not to increment the
1648          * number of in-flight I/Os after the queue is stopped in
1649          * dm_suspend().
1650          */
1651         while (!blk_queue_stopped(q)) {
1652                 rq = blk_peek_request(q);
1653                 if (!rq)
1654                         goto delay_and_out;
1655
1656                 /* always use block 0 to find the target for flushes for now */
1657                 pos = 0;
1658                 if (!(rq->cmd_flags & REQ_FLUSH))
1659                         pos = blk_rq_pos(rq);
1660
1661                 ti = dm_table_find_target(map, pos);
1662                 if (!dm_target_is_valid(ti)) {
1663                         /*
1664                          * Must perform setup, that dm_done() requires,
1665                          * before calling dm_kill_unmapped_request
1666                          */
1667                         DMERR_LIMIT("request attempted access beyond the end of device");
1668                         clone = dm_start_request(md, rq);
1669                         dm_kill_unmapped_request(clone, -EIO);
1670                         continue;
1671                 }
1672
1673                 if (ti->type->busy && ti->type->busy(ti))
1674                         goto delay_and_out;
1675
1676                 clone = dm_start_request(md, rq);
1677
1678                 spin_unlock(q->queue_lock);
1679                 if (map_request(ti, clone, md))
1680                         goto requeued;
1681
1682                 BUG_ON(!irqs_disabled());
1683                 spin_lock(q->queue_lock);
1684         }
1685
1686         goto out;
1687
1688 requeued:
1689         BUG_ON(!irqs_disabled());
1690         spin_lock(q->queue_lock);
1691
1692 delay_and_out:
1693         blk_delay_queue(q, HZ / 10);
1694 out:
1695         dm_table_put(map);
1696 }
1697
/*
 * Exported wrapper around blk_lld_busy() for the given queue.
 */
int dm_underlying_device_busy(struct request_queue *q)
{
        int busy = blk_lld_busy(q);

        return busy;
}
EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
1703
1704 static int dm_lld_busy(struct request_queue *q)
1705 {
1706         int r;
1707         struct mapped_device *md = q->queuedata;
1708         struct dm_table *map = dm_get_live_table(md);
1709
1710         if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
1711                 r = 1;
1712         else
1713                 r = dm_table_any_busy_target(map);
1714
1715         dm_table_put(map);
1716
1717         return r;
1718 }
1719
1720 static int dm_any_congested(void *congested_data, int bdi_bits)
1721 {
1722         int r = bdi_bits;
1723         struct mapped_device *md = congested_data;
1724         struct dm_table *map;
1725
1726         if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1727                 map = dm_get_live_table(md);
1728                 if (map) {
1729                         /*
1730                          * Request-based dm cares about only own queue for
1731                          * the query about congestion status of request_queue
1732                          */
1733                         if (dm_request_based(md))
1734                                 r = md->queue->backing_dev_info.state &
1735                                     bdi_bits;
1736                         else
1737                                 r = dm_table_any_congested(map, bdi_bits);
1738
1739                         dm_table_put(map);
1740                 }
1741         }
1742
1743         return r;
1744 }
1745
1746 /*-----------------------------------------------------------------
1747  * An IDR is used to keep track of allocated minor numbers.
1748  *---------------------------------------------------------------*/
1749 static void free_minor(int minor)
1750 {
1751         spin_lock(&_minor_lock);
1752         idr_remove(&_minor_idr, minor);
1753         spin_unlock(&_minor_lock);
1754 }
1755
1756 /*
1757  * See if the device with a specific minor # is free.
1758  */
1759 static int specific_minor(int minor)
1760 {
1761         int r, m;
1762
1763         if (minor >= (1 << MINORBITS))
1764                 return -EINVAL;
1765
1766         r = idr_pre_get(&_minor_idr, GFP_KERNEL);
1767         if (!r)
1768                 return -ENOMEM;
1769
1770         spin_lock(&_minor_lock);
1771
1772         if (idr_find(&_minor_idr, minor)) {
1773                 r = -EBUSY;
1774                 goto out;
1775         }
1776
1777         r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m);
1778         if (r)
1779                 goto out;
1780
1781         if (m != minor) {
1782                 idr_remove(&_minor_idr, m);
1783                 r = -EBUSY;
1784                 goto out;
1785         }
1786
1787 out:
1788         spin_unlock(&_minor_lock);
1789         return r;
1790 }
1791
1792 static int next_free_minor(int *minor)
1793 {
1794         int r, m;
1795
1796         r = idr_pre_get(&_minor_idr, GFP_KERNEL);
1797         if (!r)
1798                 return -ENOMEM;
1799
1800         spin_lock(&_minor_lock);
1801
1802         r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m);
1803         if (r)
1804                 goto out;
1805
1806         if (m >= (1 << MINORBITS)) {
1807                 idr_remove(&_minor_idr, m);
1808                 r = -ENOSPC;
1809                 goto out;
1810         }
1811
1812         *minor = m;
1813
1814 out:
1815         spin_unlock(&_minor_lock);
1816         return r;
1817 }
1818
1819 static const struct block_device_operations dm_blk_dops;
1820
1821 static void dm_wq_work(struct work_struct *work);
1822
1823 static void dm_init_md_queue(struct mapped_device *md)
1824 {
1825         /*
1826          * Request-based dm devices cannot be stacked on top of bio-based dm
1827          * devices.  The type of this dm device has not been decided yet.
1828          * The type is decided at the first table loading time.
1829          * To prevent problematic device stacking, clear the queue flag
1830          * for request stacking support until then.
1831          *
1832          * This queue is new, so no concurrency on the queue_flags.
1833          */
1834         queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
1835
1836         md->queue->queuedata = md;
1837         md->queue->backing_dev_info.congested_fn = dm_any_congested;
1838         md->queue->backing_dev_info.congested_data = md;
1839         blk_queue_make_request(md->queue, dm_request);
1840         blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
1841         blk_queue_merge_bvec(md->queue, dm_merge_bvec);
1842 }
1843
1844 /*
1845  * Allocate and initialise a blank device with a given minor.
1846  */
1847 static struct mapped_device *alloc_dev(int minor)
1848 {
1849         int r;
1850         struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
1851         void *old_md;
1852
1853         if (!md) {
1854                 DMWARN("unable to allocate device, out of memory.");
1855                 return NULL;
1856         }
1857
1858         if (!try_module_get(THIS_MODULE))
1859                 goto bad_module_get;
1860
1861         /* get a minor number for the dev */
1862         if (minor == DM_ANY_MINOR)
1863                 r = next_free_minor(&minor);
1864         else
1865                 r = specific_minor(minor);
1866         if (r < 0)
1867                 goto bad_minor;
1868
1869         md->type = DM_TYPE_NONE;
1870         init_rwsem(&md->io_lock);
1871         mutex_init(&md->suspend_lock);
1872         mutex_init(&md->type_lock);
1873         spin_lock_init(&md->deferred_lock);
1874         rwlock_init(&md->map_lock);
1875         atomic_set(&md->holders, 1);
1876         atomic_set(&md->open_count, 0);
1877         atomic_set(&md->event_nr, 0);
1878         atomic_set(&md->uevent_seq, 0);
1879         INIT_LIST_HEAD(&md->uevent_list);
1880         spin_lock_init(&md->uevent_lock);
1881
1882         md->queue = blk_alloc_queue(GFP_KERNEL);
1883         if (!md->queue)
1884                 goto bad_queue;
1885
1886         dm_init_md_queue(md);
1887
1888         md->disk = alloc_disk(1);
1889         if (!md->disk)
1890                 goto bad_disk;
1891
1892         atomic_set(&md->pending[0], 0);
1893         atomic_set(&md->pending[1], 0);
1894         init_waitqueue_head(&md->wait);
1895         INIT_WORK(&md->work, dm_wq_work);
1896         init_waitqueue_head(&md->eventq);
1897         init_completion(&md->kobj_completion);
1898
1899         md->disk->major = _major;
1900         md->disk->first_minor = minor;
1901         md->disk->fops = &dm_blk_dops;
1902         md->disk->queue = md->queue;
1903         md->disk->private_data = md;
1904         sprintf(md->disk->disk_name, "dm-%d", minor);
1905         add_disk(md->disk);
1906         format_dev_t(md->name, MKDEV(_major, minor));
1907
1908         md->wq = alloc_workqueue("kdmflush",
1909                                  WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
1910         if (!md->wq)
1911                 goto bad_thread;
1912
1913         md->bdev = bdget_disk(md->disk, 0);
1914         if (!md->bdev)
1915                 goto bad_bdev;
1916
1917         bio_init(&md->flush_bio);
1918         md->flush_bio.bi_bdev = md->bdev;
1919         md->flush_bio.bi_rw = WRITE_FLUSH;
1920
1921         /* Populate the mapping, nobody knows we exist yet */
1922         spin_lock(&_minor_lock);
1923         old_md = idr_replace(&_minor_idr, md, minor);
1924         spin_unlock(&_minor_lock);
1925
1926         BUG_ON(old_md != MINOR_ALLOCED);
1927
1928         return md;
1929
1930 bad_bdev:
1931         destroy_workqueue(md->wq);
1932 bad_thread:
1933         del_gendisk(md->disk);
1934         put_disk(md->disk);
1935 bad_disk:
1936         blk_cleanup_queue(md->queue);
1937 bad_queue:
1938         free_minor(minor);
1939 bad_minor:
1940         module_put(THIS_MODULE);
1941 bad_module_get:
1942         kfree(md);
1943         return NULL;
1944 }
1945
1946 static void unlock_fs(struct mapped_device *md);
1947
1948 static void free_dev(struct mapped_device *md)
1949 {
1950         int minor = MINOR(disk_devt(md->disk));
1951
1952         unlock_fs(md);
1953         bdput(md->bdev);
1954         destroy_workqueue(md->wq);
1955         if (md->tio_pool)
1956                 mempool_destroy(md->tio_pool);
1957         if (md->io_pool)
1958                 mempool_destroy(md->io_pool);
1959         if (md->bs)
1960                 bioset_free(md->bs);
1961         blk_integrity_unregister(md->disk);
1962         del_gendisk(md->disk);
1963         free_minor(minor);
1964
1965         spin_lock(&_minor_lock);
1966         md->disk->private_data = NULL;
1967         spin_unlock(&_minor_lock);
1968
1969         put_disk(md->disk);
1970         blk_cleanup_queue(md->queue);
1971         module_put(THIS_MODULE);
1972         kfree(md);
1973 }
1974
1975 static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
1976 {
1977         struct dm_md_mempools *p;
1978
1979         if (md->io_pool && md->tio_pool && md->bs)
1980                 /* the md already has necessary mempools */
1981                 goto out;
1982
1983         p = dm_table_get_md_mempools(t);
1984         BUG_ON(!p || md->io_pool || md->tio_pool || md->bs);
1985
1986         md->io_pool = p->io_pool;
1987         p->io_pool = NULL;
1988         md->tio_pool = p->tio_pool;
1989         p->tio_pool = NULL;
1990         md->bs = p->bs;
1991         p->bs = NULL;
1992
1993 out:
1994         /* mempool bind completed, now no need any mempools in the table */
1995         dm_table_free_md_mempools(t);
1996 }
1997
1998 /*
1999  * Bind a table to the device.
2000  */
2001 static void event_callback(void *context)
2002 {
2003         unsigned long flags;
2004         LIST_HEAD(uevents);
2005         struct mapped_device *md = (struct mapped_device *) context;
2006
2007         spin_lock_irqsave(&md->uevent_lock, flags);
2008         list_splice_init(&md->uevent_list, &uevents);
2009         spin_unlock_irqrestore(&md->uevent_lock, flags);
2010
2011         dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
2012
2013         atomic_inc(&md->event_nr);
2014         wake_up(&md->eventq);
2015 }
2016
2017 /*
2018  * Protected by md->suspend_lock obtained by dm_swap_table().
2019  */
2020 static void __set_size(struct mapped_device *md, sector_t size)
2021 {
2022         set_capacity(md->disk, size);
2023
2024         i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
2025 }
2026
2027 /*
2028  * Return 1 if the queue has a compulsory merge_bvec_fn function.
2029  *
2030  * If this function returns 0, then the device is either a non-dm
2031  * device without a merge_bvec_fn, or it is a dm device that is
2032  * able to split any bios it receives that are too big.
2033  */
2034 int dm_queue_merge_is_compulsory(struct request_queue *q)
2035 {
2036         struct mapped_device *dev_md;
2037
2038         if (!q->merge_bvec_fn)
2039                 return 0;
2040
2041         if (q->make_request_fn == dm_request) {
2042                 dev_md = q->queuedata;
2043                 if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
2044                         return 0;
2045         }
2046
2047         return 1;
2048 }
2049
2050 static int dm_device_merge_is_compulsory(struct dm_target *ti,
2051                                          struct dm_dev *dev, sector_t start,
2052                                          sector_t len, void *data)
2053 {
2054         struct block_device *bdev = dev->bdev;
2055         struct request_queue *q = bdev_get_queue(bdev);
2056
2057         return dm_queue_merge_is_compulsory(q);
2058 }
2059
2060 /*
2061  * Return 1 if it is acceptable to ignore merge_bvec_fn based
2062  * on the properties of the underlying devices.
2063  */
2064 static int dm_table_merge_is_optional(struct dm_table *table)
2065 {
2066         unsigned i = 0;
2067         struct dm_target *ti;
2068
2069         while (i < dm_table_get_num_targets(table)) {
2070                 ti = dm_table_get_target(table, i++);
2071
2072                 if (ti->type->iterate_devices &&
2073                     ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL))
2074                         return 0;
2075         }
2076
2077         return 1;
2078 }
2079
2080 /*
2081  * Returns old map, which caller must destroy.
2082  */
2083 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2084                                struct queue_limits *limits)
2085 {
2086         struct dm_table *old_map;
2087         struct request_queue *q = md->queue;
2088         sector_t size;
2089         unsigned long flags;
2090         int merge_is_optional;
2091
2092         size = dm_table_get_size(t);
2093
2094         /*
2095          * Wipe any geometry if the size of the table changed.
2096          */
2097         if (size != get_capacity(md->disk))
2098                 memset(&md->geometry, 0, sizeof(md->geometry));
2099
2100         __set_size(md, size);
2101
2102         dm_table_event_callback(t, event_callback, md);
2103
2104         /*
2105          * The queue hasn't been stopped yet, if the old table type wasn't
2106          * for request-based during suspension.  So stop it to prevent
2107          * I/O mapping before resume.
2108          * This must be done before setting the queue restrictions,
2109          * because request-based dm may be run just after the setting.
2110          */
2111         if (dm_table_request_based(t) && !blk_queue_stopped(q))
2112                 stop_queue(q);
2113
2114         __bind_mempools(md, t);
2115
2116         merge_is_optional = dm_table_merge_is_optional(t);
2117
2118         write_lock_irqsave(&md->map_lock, flags);
2119         old_map = md->map;
2120         md->map = t;
2121         md->immutable_target_type = dm_table_get_immutable_target_type(t);
2122
2123         dm_table_set_restrictions(t, q, limits);
2124         if (merge_is_optional)
2125                 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
2126         else
2127                 clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
2128         write_unlock_irqrestore(&md->map_lock, flags);
2129
2130         return old_map;
2131 }
2132
2133 /*
2134  * Returns unbound table for the caller to free.
2135  */
2136 static struct dm_table *__unbind(struct mapped_device *md)
2137 {
2138         struct dm_table *map = md->map;
2139         unsigned long flags;
2140
2141         if (!map)
2142                 return NULL;
2143
2144         dm_table_event_callback(map, NULL, NULL);
2145         write_lock_irqsave(&md->map_lock, flags);
2146         md->map = NULL;
2147         write_unlock_irqrestore(&md->map_lock, flags);
2148
2149         return map;
2150 }
2151
2152 /*
2153  * Constructor for a new device.
2154  */
2155 int dm_create(int minor, struct mapped_device **result)
2156 {
2157         struct mapped_device *md;
2158
2159         md = alloc_dev(minor);
2160         if (!md)
2161                 return -ENXIO;
2162
2163         dm_sysfs_init(md);
2164
2165         *result = md;
2166         return 0;
2167 }
2168
2169 /*
2170  * Functions to manage md->type.
2171  * All are required to hold md->type_lock.
2172  */
2173 void dm_lock_md_type(struct mapped_device *md)
2174 {
2175         mutex_lock(&md->type_lock);
2176 }
2177
2178 void dm_unlock_md_type(struct mapped_device *md)
2179 {
2180         mutex_unlock(&md->type_lock);
2181 }
2182
2183 void dm_set_md_type(struct mapped_device *md, unsigned type)
2184 {
2185         md->type = type;
2186 }
2187
2188 unsigned dm_get_md_type(struct mapped_device *md)
2189 {
2190         return md->type;
2191 }
2192
2193 struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2194 {
2195         return md->immutable_target_type;
2196 }
2197
2198 /*
2199  * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
2200  */
2201 static int dm_init_request_based_queue(struct mapped_device *md)
2202 {
2203         struct request_queue *q = NULL;
2204
2205         if (md->queue->elevator)
2206                 return 1;
2207
2208         /* Fully initialize the queue */
2209         q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
2210         if (!q)
2211                 return 0;
2212
2213         md->queue = q;
2214         dm_init_md_queue(md);
2215         blk_queue_softirq_done(md->queue, dm_softirq_done);
2216         blk_queue_prep_rq(md->queue, dm_prep_fn);
2217         blk_queue_lld_busy(md->queue, dm_lld_busy);
2218
2219         elv_register_queue(md->queue);
2220
2221         return 1;
2222 }
2223
2224 /*
2225  * Setup the DM device's queue based on md's type
2226  */
2227 int dm_setup_md_queue(struct mapped_device *md)
2228 {
2229         if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) &&
2230             !dm_init_request_based_queue(md)) {
2231                 DMWARN("Cannot initialize queue for request-based mapped device");
2232                 return -EINVAL;
2233         }
2234
2235         return 0;
2236 }
2237
2238 static struct mapped_device *dm_find_md(dev_t dev)
2239 {
2240         struct mapped_device *md;
2241         unsigned minor = MINOR(dev);
2242
2243         if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2244                 return NULL;
2245
2246         spin_lock(&_minor_lock);
2247
2248         md = idr_find(&_minor_idr, minor);
2249         if (md && (md == MINOR_ALLOCED ||
2250                    (MINOR(disk_devt(dm_disk(md))) != minor) ||
2251                    dm_deleting_md(md) ||
2252                    test_bit(DMF_FREEING, &md->flags))) {
2253                 md = NULL;
2254                 goto out;
2255         }
2256
2257 out:
2258         spin_unlock(&_minor_lock);
2259
2260         return md;
2261 }
2262
2263 struct mapped_device *dm_get_md(dev_t dev)
2264 {
2265         struct mapped_device *md = dm_find_md(dev);
2266
2267         if (md)
2268                 dm_get(md);
2269
2270         return md;
2271 }
2272 EXPORT_SYMBOL_GPL(dm_get_md);
2273
2274 void *dm_get_mdptr(struct mapped_device *md)
2275 {
2276         return md->interface_ptr;
2277 }
2278
2279 void dm_set_mdptr(struct mapped_device *md, void *ptr)
2280 {
2281         md->interface_ptr = ptr;
2282 }
2283
2284 void dm_get(struct mapped_device *md)
2285 {
2286         atomic_inc(&md->holders);
2287         BUG_ON(test_bit(DMF_FREEING, &md->flags));
2288 }
2289
2290 const char *dm_device_name(struct mapped_device *md)
2291 {
2292         return md->name;
2293 }
2294 EXPORT_SYMBOL_GPL(dm_device_name);
2295
2296 static void __dm_destroy(struct mapped_device *md, bool wait)
2297 {
2298         struct dm_table *map;
2299
2300         might_sleep();
2301
2302         spin_lock(&_minor_lock);
2303         map = dm_get_live_table(md);
2304         idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2305         set_bit(DMF_FREEING, &md->flags);
2306         spin_unlock(&_minor_lock);
2307
2308         if (!dm_suspended_md(md)) {
2309                 dm_table_presuspend_targets(map);
2310                 dm_table_postsuspend_targets(map);
2311         }
2312
2313         /*
2314          * Rare, but there may be I/O requests still going to complete,
2315          * for example.  Wait for all references to disappear.
2316          * No one should increment the reference count of the mapped_device,
2317          * after the mapped_device state becomes DMF_FREEING.
2318          */
2319         if (wait)
2320                 while (atomic_read(&md->holders))
2321                         msleep(1);
2322         else if (atomic_read(&md->holders))
2323                 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2324                        dm_device_name(md), atomic_read(&md->holders));
2325
2326         dm_sysfs_exit(md);
2327         dm_table_put(map);
2328         dm_table_destroy(__unbind(md));
2329         free_dev(md);
2330 }
2331
2332 void dm_destroy(struct mapped_device *md)
2333 {
2334         __dm_destroy(md, true);
2335 }
2336
2337 void dm_destroy_immediate(struct mapped_device *md)
2338 {
2339         __dm_destroy(md, false);
2340 }
2341
2342 void dm_put(struct mapped_device *md)
2343 {
2344         atomic_dec(&md->holders);
2345 }
2346 EXPORT_SYMBOL_GPL(dm_put);
2347
2348 static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
2349 {
2350         int r = 0;
2351         DECLARE_WAITQUEUE(wait, current);
2352
2353         add_wait_queue(&md->wait, &wait);
2354
2355         while (1) {
2356                 set_current_state(interruptible);
2357
2358                 if (!md_in_flight(md))
2359                         break;
2360
2361                 if (interruptible == TASK_INTERRUPTIBLE &&
2362                     signal_pending(current)) {
2363                         r = -EINTR;
2364                         break;
2365                 }
2366
2367                 io_schedule();
2368         }
2369         set_current_state(TASK_RUNNING);
2370
2371         remove_wait_queue(&md->wait, &wait);
2372
2373         return r;
2374 }
2375
2376 /*
2377  * Process the deferred bios
2378  */
2379 static void dm_wq_work(struct work_struct *work)
2380 {
2381         struct mapped_device *md = container_of(work, struct mapped_device,
2382                                                 work);
2383         struct bio *c;
2384
2385         down_read(&md->io_lock);
2386
2387         while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2388                 spin_lock_irq(&md->deferred_lock);
2389                 c = bio_list_pop(&md->deferred);
2390                 spin_unlock_irq(&md->deferred_lock);
2391
2392                 if (!c)
2393                         break;
2394
2395                 up_read(&md->io_lock);
2396
2397                 if (dm_request_based(md))
2398                         generic_make_request(c);
2399                 else
2400                         __split_and_process_bio(md, c);
2401
2402                 down_read(&md->io_lock);
2403         }
2404
2405         up_read(&md->io_lock);
2406 }
2407
2408 static void dm_queue_flush(struct mapped_device *md)
2409 {
2410         clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2411         smp_mb__after_clear_bit();
2412         queue_work(md->wq, &md->work);
2413 }
2414
2415 /*
2416  * Swap in a new table, returning the old one for the caller to destroy.
2417  */
2418 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2419 {
2420         struct dm_table *map = ERR_PTR(-EINVAL);
2421         struct queue_limits limits;
2422         int r;
2423
2424         mutex_lock(&md->suspend_lock);
2425
2426         /* device must be suspended */
2427         if (!dm_suspended_md(md))
2428                 goto out;
2429
2430         r = dm_calculate_queue_limits(table, &limits);
2431         if (r) {
2432                 map = ERR_PTR(r);
2433                 goto out;
2434         }
2435
2436         map = __bind(md, table, &limits);
2437
2438 out:
2439         mutex_unlock(&md->suspend_lock);
2440         return map;
2441 }
2442
2443 /*
2444  * Functions to lock and unlock any filesystem running on the
2445  * device.
2446  */
2447 static int lock_fs(struct mapped_device *md)
2448 {
2449         int r;
2450
2451         WARN_ON(md->frozen_sb);
2452
2453         md->frozen_sb = freeze_bdev(md->bdev);
2454         if (IS_ERR(md->frozen_sb)) {
2455                 r = PTR_ERR(md->frozen_sb);
2456                 md->frozen_sb = NULL;
2457                 return r;
2458         }
2459
2460         set_bit(DMF_FROZEN, &md->flags);
2461
2462         return 0;
2463 }
2464
2465 static void unlock_fs(struct mapped_device *md)
2466 {
2467         if (!test_bit(DMF_FROZEN, &md->flags))
2468                 return;
2469
2470         thaw_bdev(md->bdev, md->frozen_sb);
2471         md->frozen_sb = NULL;
2472         clear_bit(DMF_FROZEN, &md->flags);
2473 }
2474
/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_swap_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by stopping the request_queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 *
 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG asks for the filesystem to be
 *                 frozen with lock_fs(); DM_SUSPEND_NOFLUSH_FLAG sets
 *                 DMF_NOFLUSH_SUSPENDING for the duration of the call
 *                 and overrides the lockfs request.
 *
 * Runs entirely under md->suspend_lock.
 *
 * Returns 0 on success, -EINVAL if the device is already suspended,
 * the error from lock_fs(), or the negative value returned by
 * dm_wait_for_completion() (-EINTR when a signal interrupted the wait).
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
        struct dm_table *map = NULL;
        int r = 0;
        int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
        int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;

        mutex_lock(&md->suspend_lock);

        if (dm_suspended_md(md)) {
                r = -EINVAL;
                goto out_unlock;
        }

        map = dm_get_live_table(md);

        /*
         * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
         * This flag is cleared before dm_suspend returns.
         */
        if (noflush)
                set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);

        /* This does not get reverted if there's an error later. */
        dm_table_presuspend_targets(map);

        /*
         * Flush I/O to the device.
         * Any I/O submitted after lock_fs() may not be flushed.
         * noflush takes precedence over do_lockfs.
         * (lock_fs() flushes I/Os and waits for them to complete.)
         */
        if (!noflush && do_lockfs) {
                r = lock_fs(md);
                if (r)
                        goto out;
        }

        /*
         * Here we must make sure that no processes are submitting requests
         * to target drivers i.e. no one may be executing
         * __split_and_process_bio. This is called from dm_request and
         * dm_wq_work.
         *
         * To get all processes out of __split_and_process_bio in dm_request,
         * we take the write lock. To prevent any process from reentering
         * __split_and_process_bio from dm_request and quiesce the thread
         * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
         * flush_workqueue(md->wq).
         */
        down_write(&md->io_lock);
        set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
        up_write(&md->io_lock);

        /*
         * Stop md->queue before flushing md->wq in case request-based
         * dm defers requests to md->wq from md->queue.
         */
        if (dm_request_based(md))
                stop_queue(md->queue);

        flush_workqueue(md->wq);

        /*
         * At this point no more requests are entering target request routines.
         * We call dm_wait_for_completion to wait for all existing requests
         * to finish.
         */
        r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);

        /* The noflush flag is dropped under the io_lock write side. */
        down_write(&md->io_lock);
        if (noflush)
                clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
        up_write(&md->io_lock);

        /* were we interrupted ? */
        if (r < 0) {
                /* Back out: unblock I/O, restart the queue, thaw the fs. */
                dm_queue_flush(md);

                if (dm_request_based(md))
                        start_queue(md->queue);

                unlock_fs(md);
                goto out; /* pushback list is already flushed, so skip flush */
        }

        /*
         * If dm_wait_for_completion returned 0, the device is completely
         * quiescent now. There is no request-processing activity. All new
         * requests are being added to md->deferred list.
         */

        set_bit(DMF_SUSPENDED, &md->flags);

        dm_table_postsuspend_targets(map);

out:
        /* Drop the table reference taken by dm_get_live_table(). */
        dm_table_put(map);

out_unlock:
        mutex_unlock(&md->suspend_lock);
        return r;
}
2594
2595 int dm_resume(struct mapped_device *md)
2596 {
2597         int r = -EINVAL;
2598         struct dm_table *map = NULL;
2599
2600         mutex_lock(&md->suspend_lock);
2601         if (!dm_suspended_md(md))
2602                 goto out;
2603
2604         map = dm_get_live_table(md);
2605         if (!map || !dm_table_get_size(map))
2606                 goto out;
2607
2608         r = dm_table_resume_targets(map);
2609         if (r)
2610                 goto out;
2611
2612         dm_queue_flush(md);
2613
2614         /*
2615          * Flushing deferred I/Os must be done after targets are resumed
2616          * so that mapping of targets can work correctly.
2617          * Request-based dm is queueing the deferred I/Os in its request_queue.
2618          */
2619         if (dm_request_based(md))
2620                 start_queue(md->queue);
2621
2622         unlock_fs(md);
2623
2624         clear_bit(DMF_SUSPENDED, &md->flags);
2625
2626         r = 0;
2627 out:
2628         dm_table_put(map);
2629         mutex_unlock(&md->suspend_lock);
2630
2631         return r;
2632 }
2633
2634 /*-----------------------------------------------------------------
2635  * Event notification.
2636  *---------------------------------------------------------------*/
2637 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
2638                        unsigned cookie)
2639 {
2640         char udev_cookie[DM_COOKIE_LENGTH];
2641         char *envp[] = { udev_cookie, NULL };
2642
2643         if (!cookie)
2644                 return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
2645         else {
2646                 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2647                          DM_COOKIE_ENV_VAR_NAME, cookie);
2648                 return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
2649                                           action, envp);
2650         }
2651 }
2652
2653 uint32_t dm_next_uevent_seq(struct mapped_device *md)
2654 {
2655         return atomic_add_return(1, &md->uevent_seq);
2656 }
2657
2658 uint32_t dm_get_event_nr(struct mapped_device *md)
2659 {
2660         return atomic_read(&md->event_nr);
2661 }
2662
2663 int dm_wait_event(struct mapped_device *md, int event_nr)
2664 {
2665         return wait_event_interruptible(md->eventq,
2666                         (event_nr != atomic_read(&md->event_nr)));
2667 }
2668
2669 void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
2670 {
2671         unsigned long flags;
2672
2673         spin_lock_irqsave(&md->uevent_lock, flags);
2674         list_add(elist, &md->uevent_list);
2675         spin_unlock_irqrestore(&md->uevent_lock, flags);
2676 }
2677
2678 /*
2679  * The gendisk is only valid as long as you have a reference
2680  * count on 'md'.
2681  */
2682 struct gendisk *dm_disk(struct mapped_device *md)
2683 {
2684         return md->disk;
2685 }
2686
2687 struct kobject *dm_kobject(struct mapped_device *md)
2688 {
2689         return &md->kobj;
2690 }
2691
2692 /*
2693  * struct mapped_device should not be exported outside of dm.c
2694  * so use this check to verify that kobj is part of md structure
2695  */
2696 struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2697 {
2698         struct mapped_device *md;
2699
2700         md = container_of(kobj, struct mapped_device, kobj);
2701         if (&md->kobj != kobj)
2702                 return NULL;
2703
2704         if (test_bit(DMF_FREEING, &md->flags) ||
2705             dm_deleting_md(md))
2706                 return NULL;
2707
2708         dm_get(md);
2709         return md;
2710 }
2711
2712 struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
2713 {
2714         struct mapped_device *md = container_of(kobj, struct mapped_device, kobj);
2715
2716         return &md->kobj_completion;
2717 }
2718
2719 int dm_suspended_md(struct mapped_device *md)
2720 {
2721         return test_bit(DMF_SUSPENDED, &md->flags);
2722 }
2723
2724 int dm_suspended(struct dm_target *ti)
2725 {
2726         return dm_suspended_md(dm_table_get_md(ti->table));
2727 }
2728 EXPORT_SYMBOL_GPL(dm_suspended);
2729
2730 int dm_noflush_suspending(struct dm_target *ti)
2731 {
2732         return __noflush_suspending(dm_table_get_md(ti->table));
2733 }
2734 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
2735
2736 struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity)
2737 {
2738         struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
2739         unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS;
2740
2741         if (!pools)
2742                 return NULL;
2743
2744         pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
2745                          mempool_create_slab_pool(MIN_IOS, _io_cache) :
2746                          mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
2747         if (!pools->io_pool)
2748                 goto free_pools_and_out;
2749
2750         pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
2751                           mempool_create_slab_pool(MIN_IOS, _tio_cache) :
2752                           mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
2753         if (!pools->tio_pool)
2754                 goto free_io_pool_and_out;
2755
2756         pools->bs = bioset_create(pool_size, 0);
2757         if (!pools->bs)
2758                 goto free_tio_pool_and_out;
2759
2760         if (integrity && bioset_integrity_create(pools->bs, pool_size))
2761                 goto free_bioset_and_out;
2762
2763         return pools;
2764
2765 free_bioset_and_out:
2766         bioset_free(pools->bs);
2767
2768 free_tio_pool_and_out:
2769         mempool_destroy(pools->tio_pool);
2770
2771 free_io_pool_and_out:
2772         mempool_destroy(pools->io_pool);
2773
2774 free_pools_and_out:
2775         kfree(pools);
2776
2777         return NULL;
2778 }
2779
2780 void dm_free_md_mempools(struct dm_md_mempools *pools)
2781 {
2782         if (!pools)
2783                 return;
2784
2785         if (pools->io_pool)
2786                 mempool_destroy(pools->io_pool);
2787
2788         if (pools->tio_pool)
2789                 mempool_destroy(pools->tio_pool);
2790
2791         if (pools->bs)
2792                 bioset_free(pools->bs);
2793
2794         kfree(pools);
2795 }
2796
2797 static const struct block_device_operations dm_blk_dops = {
2798         .open = dm_blk_open,
2799         .release = dm_blk_close,
2800         .ioctl = dm_blk_ioctl,
2801         .getgeo = dm_blk_getgeo,
2802         .owner = THIS_MODULE
2803 };
2804
2805 EXPORT_SYMBOL(dm_get_mapinfo);
2806
2807 /*
2808  * module hooks
2809  */
/* dm_init() and dm_exit() are the module's entry and exit points. */
module_init(dm_init);
module_exit(dm_exit);

/* "major" is accepted as an unsigned integer module parameter. */
module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");
/* Module metadata. */
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");