/*
 * block/blk-barrier.c
 *
 * Functions related to barrier IO handling
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/gfp.h>

#include "blk.h"

/**
 * blk_queue_ordered - register ordered write support for this queue
 * @q:        the request queue
 * @ordered:  one of QUEUE_ORDERED_*
 * @prepare_flush_fn: rq setup helper for cache flush ordered writes
 *
 * Description:
 *   For journalled file systems, doing ordered writes on a commit
 *   block instead of explicitly doing wait_on_buffer (which is bad
 *   for performance) can be a big win. Block drivers supporting this
 *   feature should call this function and indicate so.
 *
 **/
int blk_queue_ordered(struct request_queue *q, unsigned ordered,
		      prepare_flush_fn *prepare_flush_fn)
{
	if (!prepare_flush_fn && (ordered & (QUEUE_ORDERED_DO_PREFLUSH |
					     QUEUE_ORDERED_DO_POSTFLUSH))) {
		printk(KERN_ERR "%s: prepare_flush_fn required\n", __func__);
		return -EINVAL;
	}

	if (ordered != QUEUE_ORDERED_NONE &&
	    ordered != QUEUE_ORDERED_DRAIN &&
	    ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
	    ordered != QUEUE_ORDERED_DRAIN_FUA &&
	    ordered != QUEUE_ORDERED_TAG &&
	    ordered != QUEUE_ORDERED_TAG_FLUSH &&
	    ordered != QUEUE_ORDERED_TAG_FUA) {
		printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered);
		return -EINVAL;
	}

	q->ordered = ordered;
	q->next_ordered = ordered;
	q->prepare_flush_fn = prepare_flush_fn;

	return 0;
}
EXPORT_SYMBOL(blk_queue_ordered);
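
/*
 * Example (illustrative sketch, not part of the upstream code): a driver
 * for a device with a volatile write-back cache would register ordered
 * write support at init time.  "mydev_prepare_flush" is a hypothetical
 * helper that fills in the device-specific cache flush command for the
 * pre/post flush requests.
 *
 *	static void mydev_prepare_flush(struct request_queue *q,
 *					struct request *rq)
 *	{
 *		// fill in rq with the device's cache flush command
 *	}
 *
 *	if (blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH,
 *			      mydev_prepare_flush))
 *		goto fail;	// invalid ordered mode
 *
 * A write-through device that needs no flushing would instead pass
 * QUEUE_ORDERED_DRAIN and a NULL prepare_flush_fn, which the check above
 * permits because no DO_PREFLUSH/DO_POSTFLUSH bit is set.
 */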

/*
 * Cache flushing for ordered writes handling
 */
unsigned blk_ordered_cur_seq(struct request_queue *q)
{
	if (!q->ordseq)
		return 0;
	return 1 << ffz(q->ordseq);
}
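
/*
 * Worked example (assuming the QUEUE_ORDSEQ_* flags are consecutive single
 * bits in stage order: STARTED, DRAIN, PREFLUSH, BAR, POSTFLUSH, DONE):
 * once STARTED, DRAIN and PREFLUSH have completed, q->ordseq has the low
 * three bits set, ffz() finds the first zero bit, and
 * blk_ordered_cur_seq() returns QUEUE_ORDSEQ_BAR - i.e. the earliest
 * stage that has not finished yet.
 */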

unsigned blk_ordered_req_seq(struct request *rq)
{
	struct request_queue *q = rq->q;

	BUG_ON(q->ordseq == 0);

	if (rq == &q->pre_flush_rq)
		return QUEUE_ORDSEQ_PREFLUSH;
	if (rq == &q->bar_rq)
		return QUEUE_ORDSEQ_BAR;
	if (rq == &q->post_flush_rq)
		return QUEUE_ORDSEQ_POSTFLUSH;

	/*
	 * !fs requests don't need to follow barrier ordering.  Always
	 * put them at the front.  This fixes the following deadlock.
	 *
	 * http://thread.gmane.org/gmane.linux.kernel/537473
	 */
	if (!blk_fs_request(rq))
		return QUEUE_ORDSEQ_DRAIN;

	if ((rq->cmd_flags & REQ_ORDERED_COLOR) ==
	    (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR))
		return QUEUE_ORDSEQ_DRAIN;
	else
		return QUEUE_ORDSEQ_DONE;
}

bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
{
	struct request *rq;

	if (error && !q->orderr)
		q->orderr = error;

	BUG_ON(q->ordseq & seq);
	q->ordseq |= seq;

	if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
		return false;

	/*
	 * Okay, sequence complete.
	 */
	q->ordseq = 0;
	rq = q->orig_bar_rq;
	__blk_end_request_all(rq, q->orderr);
	return true;
}

static void pre_flush_end_io(struct request *rq, int error)
{
	elv_completed_request(rq->q, rq);
	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error);
}

static void bar_end_io(struct request *rq, int error)
{
	elv_completed_request(rq->q, rq);
	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error);
}

static void post_flush_end_io(struct request *rq, int error)
{
	elv_completed_request(rq->q, rq);
	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
}

static void queue_flush(struct request_queue *q, unsigned which)
{
	struct request *rq;
	rq_end_io_fn *end_io;

	if (which == QUEUE_ORDERED_DO_PREFLUSH) {
		rq = &q->pre_flush_rq;
		end_io = pre_flush_end_io;
	} else {
		rq = &q->post_flush_rq;
		end_io = post_flush_end_io;
	}

	blk_rq_init(q, rq);
	rq->cmd_flags = REQ_HARDBARRIER;
	rq->rq_disk = q->bar_rq.rq_disk;
	rq->end_io = end_io;
	q->prepare_flush_fn(q, rq);

	elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
}

static inline bool start_ordered(struct request_queue *q, struct request **rqp)
{
	struct request *rq = *rqp;
	unsigned skip = 0;

	q->orderr = 0;
	q->ordered = q->next_ordered;
	q->ordseq |= QUEUE_ORDSEQ_STARTED;

	/*
	 * For an empty barrier, there's no actual BAR request, which
	 * in turn makes POSTFLUSH unnecessary.  Mask them off.
	 */
	if (!blk_rq_sectors(rq)) {
		q->ordered &= ~(QUEUE_ORDERED_DO_BAR |
				QUEUE_ORDERED_DO_POSTFLUSH);
		/*
		 * Empty barrier on a write-through device w/ ordered
		 * tag has no command to issue and without any command
		 * to issue, ordering by tag can't be used.  Drain
		 * instead.
		 */
		if ((q->ordered & QUEUE_ORDERED_BY_TAG) &&
		    !(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) {
			q->ordered &= ~QUEUE_ORDERED_BY_TAG;
			q->ordered |= QUEUE_ORDERED_BY_DRAIN;
		}
	}

	/* stash away the original request */
	blk_dequeue_request(rq);
	q->orig_bar_rq = rq;
	rq = NULL;

	/*
	 * Queue the ordered sequence.  As we stack the requests at the
	 * head, we need to queue them in reverse order.  Note that we
	 * rely on the fact that no fs request uses ELEVATOR_INSERT_FRONT
	 * and thus no fs request gets in between the ordered sequence.
	 */
	if (q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) {
		queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH);
		rq = &q->post_flush_rq;
	} else
		skip |= QUEUE_ORDSEQ_POSTFLUSH;

	if (q->ordered & QUEUE_ORDERED_DO_BAR) {
		rq = &q->bar_rq;

		/* initialize proxy request and queue it */
		blk_rq_init(q, rq);
		if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
			rq->cmd_flags |= REQ_RW;
		if (q->ordered & QUEUE_ORDERED_DO_FUA)
			rq->cmd_flags |= REQ_FUA;
		init_request_from_bio(rq, q->orig_bar_rq->bio);
		rq->end_io = bar_end_io;

		elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
	} else
		skip |= QUEUE_ORDSEQ_BAR;

	if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) {
		queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH);
		rq = &q->pre_flush_rq;
	} else
		skip |= QUEUE_ORDSEQ_PREFLUSH;

	if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && queue_in_flight(q))
		rq = NULL;
	else
		skip |= QUEUE_ORDSEQ_DRAIN;

	*rqp = rq;

	/*
	 * Complete skipped sequences.  If whole sequence is complete,
	 * return false to tell elevator that this request is gone.
	 */
	return !blk_ordered_complete_seq(q, skip, 0);
}
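
/*
 * Illustration (a sketch of what the code above produces, not extra
 * functionality): for QUEUE_ORDERED_DRAIN_FLUSH, start_ordered() front
 * inserts POSTFLUSH, then BAR, then PREFLUSH, so the head of the
 * dispatch queue ends up in issue order
 *
 *	pre_flush_rq -> bar_rq -> post_flush_rq -> rest of queue
 *
 * and *rqp is left pointing at pre_flush_rq, or at NULL while already
 * in-flight requests still have to drain.
 */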

bool blk_do_ordered(struct request_queue *q, struct request **rqp)
{
	struct request *rq = *rqp;
	const int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq);

	if (!q->ordseq) {
		if (!is_barrier)
			return true;

		if (q->next_ordered != QUEUE_ORDERED_NONE)
			return start_ordered(q, rqp);
		else {
			/*
			 * Queue ordering not supported.  Terminate
			 * with prejudice.
			 */
			blk_dequeue_request(rq);
			__blk_end_request_all(rq, -EOPNOTSUPP);
			*rqp = NULL;
			return false;
		}
	}

	/*
	 * Ordered sequence in progress
	 */

	/* Special requests are not subject to ordering rules. */
	if (!blk_fs_request(rq) &&
	    rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
		return true;

	if (q->ordered & QUEUE_ORDERED_BY_TAG) {
		/* Ordered by tag.  Blocking the next barrier is enough. */
		if (is_barrier && rq != &q->bar_rq)
			*rqp = NULL;
	} else {
		/* Ordered by draining.  Wait for turn. */
		WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
		if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
			*rqp = NULL;
	}

	return true;
}

static void bio_end_empty_barrier(struct bio *bio, int err)
{
	if (err) {
		if (err == -EOPNOTSUPP)
			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
		clear_bit(BIO_UPTODATE, &bio->bi_flags);
	}

	complete(bio->bi_private);
}

/**
 * blkdev_issue_flush - queue a flush
 * @bdev:	blockdev to issue flush for
 * @error_sector:	if non-NULL, where to store the sector of a flush error
 *
 * Description:
 *    Issue a flush for the block device in question.  The caller can supply
 *    room for storing the error offset in case of a flush error, if they
 *    wish to.
 */
int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
{
	DECLARE_COMPLETION_ONSTACK(wait);
	struct request_queue *q;
	struct bio *bio;
	int ret;

	if (bdev->bd_disk == NULL)
		return -ENXIO;

	q = bdev_get_queue(bdev);
	if (!q)
		return -ENXIO;

	bio = bio_alloc(GFP_KERNEL, 0);
	bio->bi_end_io = bio_end_empty_barrier;
	bio->bi_private = &wait;
	bio->bi_bdev = bdev;
	submit_bio(WRITE_BARRIER, bio);

	wait_for_completion(&wait);

	/*
	 * The driver must store the error location in ->bi_sector, if
	 * it supports it. For non-stacked drivers, this should be copied
	 * from blk_rq_pos(rq).
	 */
	if (error_sector)
		*error_sector = bio->bi_sector;

	ret = 0;
	if (bio_flagged(bio, BIO_EOPNOTSUPP))
		ret = -EOPNOTSUPP;
	else if (!bio_flagged(bio, BIO_UPTODATE))
		ret = -EIO;

	bio_put(bio);
	return ret;
}
EXPORT_SYMBOL(blkdev_issue_flush);
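
/*
 * Example (illustrative sketch): a filesystem that has finished writing
 * its metadata can force the device's volatile write cache to stable
 * storage before declaring a transaction durable.  "sb" is a hypothetical
 * superblock pointer.
 *
 *	int err = blkdev_issue_flush(sb->s_bdev, NULL);
 *
 *	if (err && err != -EOPNOTSUPP)
 *		return err;	// flush failed
 *
 * -EOPNOTSUPP is usually ignored: it means ordering/flushing is not
 * supported on this queue (the driver registered QUEUE_ORDERED_NONE),
 * typically because there is no volatile cache to flush.
 */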

static void blkdev_discard_end_io(struct bio *bio, int err)
{
	if (err) {
		if (err == -EOPNOTSUPP)
			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
		clear_bit(BIO_UPTODATE, &bio->bi_flags);
	}

	if (bio->bi_private)
		complete(bio->bi_private);
	__free_page(bio_page(bio));

	bio_put(bio);
}

/**
 * blkdev_issue_discard - queue a discard
 * @bdev:	blockdev to issue discard for
 * @sector:	start sector
 * @nr_sects:	number of sectors to discard
 * @gfp_mask:	memory allocation flags (for bio_alloc)
 * @flags:	DISCARD_FL_* flags to control behaviour
 *
 * Description:
 *    Issue a discard request for the sectors in question.
 */
int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
		sector_t nr_sects, gfp_t gfp_mask, int flags)
{
	DECLARE_COMPLETION_ONSTACK(wait);
	struct request_queue *q = bdev_get_queue(bdev);
	int type = flags & DISCARD_FL_BARRIER ?
		DISCARD_BARRIER : DISCARD_NOBARRIER;
	struct bio *bio;
	struct page *page;
	int ret = 0;

	if (!q)
		return -ENXIO;

	if (!blk_queue_discard(q))
		return -EOPNOTSUPP;

	while (nr_sects && !ret) {
		unsigned int sector_size = q->limits.logical_block_size;
		unsigned int max_discard_sectors =
			min(q->limits.max_discard_sectors, UINT_MAX >> 9);

		bio = bio_alloc(gfp_mask, 1);
		if (!bio)
			goto out;
		bio->bi_sector = sector;
		bio->bi_end_io = blkdev_discard_end_io;
		bio->bi_bdev = bdev;
		if (flags & DISCARD_FL_WAIT)
			bio->bi_private = &wait;

		/*
		 * Add a zeroed one-sector payload as that's what
		 * our current implementations need.  If we'll ever need
		 * more the interface will need revisiting.
		 */
		page = alloc_page(gfp_mask | __GFP_ZERO);
		if (!page)
			goto out_free_bio;
		if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size)
			goto out_free_page;

		/*
		 * And override the bio size - the way discard works we
		 * touch many more blocks on disk than the actual payload
		 * length.
		 */
		if (nr_sects > max_discard_sectors) {
			bio->bi_size = max_discard_sectors << 9;
			nr_sects -= max_discard_sectors;
			sector += max_discard_sectors;
		} else {
			bio->bi_size = nr_sects << 9;
			nr_sects = 0;
		}

		bio_get(bio);
		submit_bio(type, bio);

		if (flags & DISCARD_FL_WAIT)
			wait_for_completion(&wait);

		if (bio_flagged(bio, BIO_EOPNOTSUPP))
			ret = -EOPNOTSUPP;
		else if (!bio_flagged(bio, BIO_UPTODATE))
			ret = -EIO;
		bio_put(bio);
	}
	return ret;
out_free_page:
	__free_page(page);
out_free_bio:
	bio_put(bio);
out:
	return -ENOMEM;
}
EXPORT_SYMBOL(blkdev_issue_discard);
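
/*
 * Example (illustrative sketch): telling the device that a freed extent no
 * longer contains useful data, and waiting for the discard to complete.
 * The extent variables are hypothetical.
 *
 *	int err = blkdev_issue_discard(bdev, start_sector, nr_sectors,
 *				       GFP_KERNEL, DISCARD_FL_WAIT);
 *
 *	if (err == -EOPNOTSUPP)
 *		err = 0;	// device cannot discard, not an error
 *	else if (err)
 *		return err;
 *
 * OR-ing in DISCARD_FL_BARRIER additionally issues each discard as a
 * barrier request, ordering it against surrounding writes.
 */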