fs/nfs/blocklayout/blocklayout.c

   1 /*
   2  *  linux/fs/nfs/blocklayout/blocklayout.c
   3  *
   4  *  Module for the NFSv4.1 pNFS block layout driver.
   5  *
   6  *  Copyright (c) 2006 The Regents of the University of Michigan.
   7  *  All rights reserved.
   8  *
   9  *  Andy Adamson <andros@citi.umich.edu>
  10  *  Fred Isaman <iisaman@umich.edu>
  11  *
  12  * permission is granted to use, copy, create derivative works and
  13  * redistribute this software and such derivative works for any purpose,
  14  * so long as the name of the university of michigan is not used in
  15  * any advertising or publicity pertaining to the use or distribution
  16  * of this software without specific, written prior authorization.  if
  17  * the above copyright notice or any other identification of the
  18  * university of michigan is included in any copy of any portion of
  19  * this software, then the disclaimer below must also be included.
  20  *
  21  * this software is provided as is, without representation from the
  22  * university of michigan as to its fitness for any purpose, and without
  23  * warranty by the university of michigan of any kind, either express
  24  * or implied, including without limitation the implied warranties of
  25  * merchantability and fitness for a particular purpose.  the regents
  26  * of the university of michigan shall not be liable for any damages,
  27  * including special, indirect, incidental, or consequential damages,
  28  * with respect to any claim arising out or in connection with the use
  29  * of the software, even if it has been or is hereafter advised of the
  30  * possibility of such damages.
  31  */
  32
  33 #include <linux/module.h>
  34 #include <linux/init.h>
  35 #include <linux/mount.h>
  36 #include <linux/namei.h>
  37 #include <linux/bio.h>          /* struct bio */
  38 #include <linux/buffer_head.h>  /* various write calls */
  39
  40 #include "blocklayout.h"
  41
/* NOTE(review): presumably selects the debug facility that the dprintk()
 * calls in this file are filtered by -- confirm against dprintk's definition.
 */
#define NFSDBG_FACILITY NFSDBG_PNFS_LD

/* Module metadata */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");

/* Driver-wide objects with external linkage.
 * NOTE(review): judging by the names, a pipe dentry used for device handling
 * and the wait queue that goes with it -- their users are outside the part
 * of the file reviewed here, so confirm before relying on this.
 */
struct dentry *bl_device_pipe;
wait_queue_head_t bl_wq;
  50
/* Debug helper: report the page pointer and the state of a selection of its
 * page flags through dprintk(), one flag per line, followed by an empty line.
 */
static void print_page(struct page *page)
{
        dprintk("PRINTPAGE page %p\n", page);
        dprintk("       PagePrivate %d\n", PagePrivate(page));
        dprintk("       PageUptodate %d\n", PageUptodate(page));
        dprintk("       PageError %d\n", PageError(page));
        dprintk("       PageDirty %d\n", PageDirty(page));
        dprintk("       PageReferenced %d\n", PageReferenced(page));
        dprintk("       PageLocked %d\n", PageLocked(page));
        dprintk("       PageWriteback %d\n", PageWriteback(page));
        dprintk("       PageMappedToDisk %d\n", PageMappedToDisk(page));
        dprintk("\n");
}
  64
  65 /* Given the be associated with isect, determine if page data needs to be
  66  * initialized.
  67  */
  68 static int is_hole(struct pnfs_block_extent *be, sector_t isect)
  69 {
  70         if (be->be_state == PNFS_BLOCK_NONE_DATA)
  71                 return 1;
  72         else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
  73                 return 0;
  74         else
  75                 return !bl_is_sector_init(be->be_inval, isect);
  76 }
  77
  78 /* Given the be associated with isect, determine if page data can be
  79  * written to disk.
  80  */
  81 static int is_writable(struct pnfs_block_extent *be, sector_t isect)
  82 {
  83         return (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
  84                 be->be_state == PNFS_BLOCK_INVALID_DATA);
  85 }
  86
/* The data we are handed might be spread across several bios.  We need
 * to track when the last one is finished.
 *
 * One reference is held by the submitting function and one per bio in
 * flight (taken in bl_submit_bio()); when the last reference is dropped,
 * destroy_parallel() calls pnfs_callback(data) and frees the structure.
 */
struct parallel_io {
        /* Counts the submitter plus every bio still in flight */
        struct kref refcnt;
        /* Copy of the request's mds_ops with rpc_call_done stubbed out,
         * filled in by bl_read_pagelist()/bl_write_pagelist()
         */
        struct rpc_call_ops call_ops;
        /* Completion hook, run once with ->data when refcnt reaches zero */
        void (*pnfs_callback) (void *data);
        /* The nfs_read_data or nfs_write_data this I/O belongs to */
        void *data;
};
  96
  97 static inline struct parallel_io *alloc_parallel(void *data)
  98 {
  99         struct parallel_io *rv;
 100
 101         rv  = kmalloc(sizeof(*rv), GFP_NOFS);
 102         if (rv) {
 103                 rv->data = data;
 104                 kref_init(&rv->refcnt);
 105         }
 106         return rv;
 107 }
 108
/* Take an additional reference on the tracker p. */
static inline void get_parallel(struct parallel_io *p)
{
        kref_get(&p->refcnt);
}
 113
/* kref release function for struct parallel_io, passed to kref_put() by
 * put_parallel(): runs the completion callback on the tracked data and then
 * frees the tracker.
 */
static void destroy_parallel(struct kref *kref)
{
        struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);

        dprintk("%s enter\n", __func__);
        /* The callback only receives p->data, so freeing p afterwards is safe */
        p->pnfs_callback(p->data);
        kfree(p);
}
 122
/* Drop one reference on the tracker p; destroy_parallel() is the release
 * function handed to kref_put().
 */
static inline void put_parallel(struct parallel_io *p)
{
        kref_put(&p->refcnt, destroy_parallel);
}
 127
 128 static struct bio *
 129 bl_submit_bio(int rw, struct bio *bio)
 130 {
 131         if (bio) {
 132                 get_parallel(bio->bi_private);
 133                 dprintk("%s submitting %s bio %u@%llu\n", __func__,
 134                         rw == READ ? "read" : "write",
 135                         bio->bi_size, (unsigned long long)bio->bi_sector);
 136                 submit_bio(rw, bio);
 137         }
 138         return NULL;
 139 }
 140
 141 static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
 142                                      struct pnfs_block_extent *be,
 143                                      void (*end_io)(struct bio *, int err),
 144                                      struct parallel_io *par)
 145 {
 146         struct bio *bio;
 147
 148         bio = bio_alloc(GFP_NOIO, npg);
 149         if (!bio)
 150                 return NULL;
 151
 152         bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
 153         bio->bi_bdev = be->be_mdev;
 154         bio->bi_end_io = end_io;
 155         bio->bi_private = par;
 156         return bio;
 157 }
 158
 159 static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
 160                                       sector_t isect, struct page *page,
 161                                       struct pnfs_block_extent *be,
 162                                       void (*end_io)(struct bio *, int err),
 163                                       struct parallel_io *par)
 164 {
 165 retry:
 166         if (!bio) {
 167                 bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
 168                 if (!bio)
 169                         return ERR_PTR(-ENOMEM);
 170         }
 171         if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
 172                 bio = bl_submit_bio(rw, bio);
 173                 goto retry;
 174         }
 175         return bio;
 176 }
 177
 178 static void bl_set_lo_fail(struct pnfs_layout_segment *lseg)
 179 {
 180         if (lseg->pls_range.iomode == IOMODE_RW) {
 181                 dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
 182                 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
 183         } else {
 184                 dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
 185                 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
 186         }
 187 }
 188
 189 /* This is basically copied from mpage_end_io_read */
 190 static void bl_end_io_read(struct bio *bio, int err)
 191 {
 192         struct parallel_io *par = bio->bi_private;
 193         const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 194         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
 195         struct nfs_read_data *rdata = (struct nfs_read_data *)par->data;
 196
 197         do {
 198                 struct page *page = bvec->bv_page;
 199
 200                 if (--bvec >= bio->bi_io_vec)
 201                         prefetchw(&bvec->bv_page->flags);
 202                 if (uptodate)
 203                         SetPageUptodate(page);
 204         } while (bvec >= bio->bi_io_vec);
 205         if (!uptodate) {
 206                 if (!rdata->pnfs_error)
 207                         rdata->pnfs_error = -EIO;
 208                 bl_set_lo_fail(rdata->lseg);
 209         }
 210         bio_put(bio);
 211         put_parallel(par);
 212 }
 213
 214 static void bl_read_cleanup(struct work_struct *work)
 215 {
 216         struct rpc_task *task;
 217         struct nfs_read_data *rdata;
 218         dprintk("%s enter\n", __func__);
 219         task = container_of(work, struct rpc_task, u.tk_work);
 220         rdata = container_of(task, struct nfs_read_data, task);
 221         pnfs_ld_read_done(rdata);
 222 }
 223
/* parallel_io completion callback for reads, run by destroy_parallel() once
 * the last reference is gone.  The actual completion (bl_read_cleanup) is
 * deferred to a work item embedded in the request's rpc_task.
 */
static void
bl_end_par_io_read(void *data)
{
        struct nfs_read_data *rdata = data;

        INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
        schedule_work(&rdata->task.u.tk_work);
}
 232
 233 /* We don't want normal .rpc_call_done callback used, so we replace it
 234  * with this stub.
 235  */
 236 static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata)
 237 {
 238         return;
 239 }
 240
 241 static enum pnfs_try_status
 242 bl_read_pagelist(struct nfs_read_data *rdata)
 243 {
 244         int i, hole;
 245         struct bio *bio = NULL;
 246         struct pnfs_block_extent *be = NULL, *cow_read = NULL;
 247         sector_t isect, extent_length = 0;
 248         struct parallel_io *par;
 249         loff_t f_offset = rdata->args.offset;
 250         size_t count = rdata->args.count;
 251         struct page **pages = rdata->args.pages;
 252         int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
 253
 254         dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__,
 255                rdata->npages, f_offset, count);
 256
 257         par = alloc_parallel(rdata);
 258         if (!par)
 259                 goto use_mds;
 260         par->call_ops = *rdata->mds_ops;
 261         par->call_ops.rpc_call_done = bl_rpc_do_nothing;
 262         par->pnfs_callback = bl_end_par_io_read;
 263         /* At this point, we can no longer jump to use_mds */
 264
 265         isect = (sector_t) (f_offset >> SECTOR_SHIFT);
 266         /* Code assumes extents are page-aligned */
 267         for (i = pg_index; i < rdata->npages; i++) {
 268                 if (!extent_length) {
 269                         /* We've used up the previous extent */
 270                         bl_put_extent(be);
 271                         bl_put_extent(cow_read);
 272                         bio = bl_submit_bio(READ, bio);
 273                         /* Get the next one */
 274                         be = bl_find_get_extent(BLK_LSEG2EXT(rdata->lseg),
 275                                              isect, &cow_read);
 276                         if (!be) {
 277                                 rdata->pnfs_error = -EIO;
 278                                 goto out;
 279                         }
 280                         extent_length = be->be_length -
 281                                 (isect - be->be_f_offset);
 282                         if (cow_read) {
 283                                 sector_t cow_length = cow_read->be_length -
 284                                         (isect - cow_read->be_f_offset);
 285                                 extent_length = min(extent_length, cow_length);
 286                         }
 287                 }
 288                 hole = is_hole(be, isect);
 289                 if (hole && !cow_read) {
 290                         bio = bl_submit_bio(READ, bio);
 291                         /* Fill hole w/ zeroes w/o accessing device */
 292                         dprintk("%s Zeroing page for hole\n", __func__);
 293                         zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
 294                         print_page(pages[i]);
 295                         SetPageUptodate(pages[i]);
 296                 } else {
 297                         struct pnfs_block_extent *be_read;
 298
 299                         be_read = (hole && cow_read) ? cow_read : be;
 300                         bio = bl_add_page_to_bio(bio, rdata->npages - i, READ,
 301                                                  isect, pages[i], be_read,
 302                                                  bl_end_io_read, par);
 303                         if (IS_ERR(bio)) {
 304                                 rdata->pnfs_error = PTR_ERR(bio);
 305                                 goto out;
 306                         }
 307                 }
 308                 isect += PAGE_CACHE_SECTORS;
 309                 extent_length -= PAGE_CACHE_SECTORS;
 310         }
 311         if ((isect << SECTOR_SHIFT) >= rdata->inode->i_size) {
 312                 rdata->res.eof = 1;
 313                 rdata->res.count = rdata->inode->i_size - f_offset;
 314         } else {
 315                 rdata->res.count = (isect << SECTOR_SHIFT) - f_offset;
 316         }
 317 out:
 318         bl_put_extent(be);
 319         bl_put_extent(cow_read);
 320         bl_submit_bio(READ, bio);
 321         put_parallel(par);
 322         return PNFS_ATTEMPTED;
 323
 324  use_mds:
 325         dprintk("Giving up and using normal NFS\n");
 326         return PNFS_NOT_ATTEMPTED;
 327 }
 328
 329 static void mark_extents_written(struct pnfs_block_layout *bl,
 330                                  __u64 offset, __u32 count)
 331 {
 332         sector_t isect, end;
 333         struct pnfs_block_extent *be;
 334
 335         dprintk("%s(%llu, %u)\n", __func__, offset, count);
 336         if (count == 0)
 337                 return;
 338         isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT;
 339         end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);
 340         end >>= SECTOR_SHIFT;
 341         while (isect < end) {
 342                 sector_t len;
 343                 be = bl_find_get_extent(bl, isect, NULL);
 344                 BUG_ON(!be); /* FIXME */
 345                 len = min(end, be->be_f_offset + be->be_length) - isect;
 346                 if (be->be_state == PNFS_BLOCK_INVALID_DATA)
 347                         bl_mark_for_commit(be, isect, len); /* What if fails? */
 348                 isect += len;
 349                 bl_put_extent(be);
 350         }
 351 }
 352
 353 static void bl_end_io_write_zero(struct bio *bio, int err)
 354 {
 355         struct parallel_io *par = bio->bi_private;
 356         const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 357         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
 358         struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;
 359
 360         do {
 361                 struct page *page = bvec->bv_page;
 362
 363                 if (--bvec >= bio->bi_io_vec)
 364                         prefetchw(&bvec->bv_page->flags);
 365                 /* This is the zeroing page we added */
 366                 end_page_writeback(page);
 367                 page_cache_release(page);
 368         } while (bvec >= bio->bi_io_vec);
 369         if (!uptodate) {
 370                 if (!wdata->pnfs_error)
 371                         wdata->pnfs_error = -EIO;
 372                 bl_set_lo_fail(wdata->lseg);
 373         }
 374         bio_put(bio);
 375         put_parallel(par);
 376 }
 377
 378 /* This is basically copied from mpage_end_io_read */
 379 static void bl_end_io_write(struct bio *bio, int err)
 380 {
 381         struct parallel_io *par = bio->bi_private;
 382         const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 383         struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;
 384
 385         if (!uptodate) {
 386                 if (!wdata->pnfs_error)
 387                         wdata->pnfs_error = -EIO;
 388                 bl_set_lo_fail(wdata->lseg);
 389         }
 390         bio_put(bio);
 391         put_parallel(par);
 392 }
 393
 394 /* Function scheduled for call during bl_end_par_io_write,
 395  * it marks sectors as written and extends the commitlist.
 396  */
 397 static void bl_write_cleanup(struct work_struct *work)
 398 {
 399         struct rpc_task *task;
 400         struct nfs_write_data *wdata;
 401         dprintk("%s enter\n", __func__);
 402         task = container_of(work, struct rpc_task, u.tk_work);
 403         wdata = container_of(task, struct nfs_write_data, task);
 404         if (!wdata->pnfs_error) {
 405                 /* Marks for LAYOUTCOMMIT */
 406                 mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
 407                                      wdata->args.offset, wdata->args.count);
 408         }
 409         pnfs_ld_write_done(wdata);
 410 }
 411
/* Called when last of bios associated with a bl_write_pagelist call finishes.
 *
 * Runs as the parallel_io completion callback: it sets the task status to 0
 * and the verifier to NFS_FILE_SYNC, then defers the rest of the completion
 * (bl_write_cleanup) to a work item embedded in the request's rpc_task.
 */
static void bl_end_par_io_write(void *data)
{
        struct nfs_write_data *wdata = data;

        wdata->task.tk_status = 0;
        wdata->verf.committed = NFS_FILE_SYNC;
        INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
        schedule_work(&wdata->task.u.tk_work);
}
 422
 423 /* FIXME STUB - mark intersection of layout and page as bad, so is not
 424  * used again.
 425  */
 426 static void mark_bad_read(void)
 427 {
 428         return;
 429 }
 430
 431 /*
 432  * map_block:  map a requested I/0 block (isect) into an offset in the LVM
 433  * block_device
 434  */
 435 static void
 436 map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
 437 {
 438         dprintk("%s enter be=%p\n", __func__, be);
 439
 440         set_buffer_mapped(bh);
 441         bh->b_bdev = be->be_mdev;
 442         bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
 443             (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);
 444
 445         dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
 446                 __func__, (unsigned long long)isect, (long)bh->b_blocknr,
 447                 bh->b_size);
 448         return;
 449 }
 450
 451 /* Given an unmapped page, zero it or read in page for COW, page is locked
 452  * by caller.
 453  */
 454 static int
 455 init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
 456 {
 457         struct buffer_head *bh = NULL;
 458         int ret = 0;
 459         sector_t isect;
 460
 461         dprintk("%s enter, %p\n", __func__, page);
 462         BUG_ON(PageUptodate(page));
 463         if (!cow_read) {
 464                 zero_user_segment(page, 0, PAGE_SIZE);
 465                 SetPageUptodate(page);
 466                 goto cleanup;
 467         }
 468
 469         bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
 470         if (!bh) {
 471                 ret = -ENOMEM;
 472                 goto cleanup;
 473         }
 474
 475         isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT;
 476         map_block(bh, isect, cow_read);
 477         if (!bh_uptodate_or_lock(bh))
 478                 ret = bh_submit_read(bh);
 479         if (ret)
 480                 goto cleanup;
 481         SetPageUptodate(page);
 482
 483 cleanup:
 484         bl_put_extent(cow_read);
 485         if (bh)
 486                 free_buffer_head(bh);
 487         if (ret) {
 488                 /* Need to mark layout with bad read...should now
 489                  * just use nfs4 for reads and writes.
 490                  */
 491                 mark_bad_read();
 492         }
 493         return ret;
 494 }
 495
 496 static enum pnfs_try_status
 497 bl_write_pagelist(struct nfs_write_data *wdata, int sync)
 498 {
 499         int i, ret, npg_zero, pg_index, last = 0;
 500         struct bio *bio = NULL;
 501         struct pnfs_block_extent *be = NULL, *cow_read = NULL;
 502         sector_t isect, last_isect = 0, extent_length = 0;
 503         struct parallel_io *par;
 504         loff_t offset = wdata->args.offset;
 505         size_t count = wdata->args.count;
 506         struct page **pages = wdata->args.pages;
 507         struct page *page;
 508         pgoff_t index;
 509         u64 temp;
 510         int npg_per_block =
 511             NFS_SERVER(wdata->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
 512
 513         dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
 514         /* At this point, wdata->pages is a (sequential) list of nfs_pages.
 515          * We want to write each, and if there is an error set pnfs_error
 516          * to have it redone using nfs.
 517          */
 518         par = alloc_parallel(wdata);
 519         if (!par)
 520                 return PNFS_NOT_ATTEMPTED;
 521         par->call_ops = *wdata->mds_ops;
 522         par->call_ops.rpc_call_done = bl_rpc_do_nothing;
 523         par->pnfs_callback = bl_end_par_io_write;
 524         /* At this point, have to be more careful with error handling */
 525
 526         isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
 527         be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
 528         if (!be || !is_writable(be, isect)) {
 529                 dprintk("%s no matching extents!\n", __func__);
 530                 wdata->pnfs_error = -EINVAL;
 531                 goto out;
 532         }
 533
 534         /* First page inside INVALID extent */
 535         if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
 536                 temp = offset >> PAGE_CACHE_SHIFT;
 537                 npg_zero = do_div(temp, npg_per_block);
 538                 isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
 539                                      (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
 540                 extent_length = be->be_length - (isect - be->be_f_offset);
 541
 542 fill_invalid_ext:
 543                 dprintk("%s need to zero %d pages\n", __func__, npg_zero);
 544                 for (;npg_zero > 0; npg_zero--) {
 545                         /* page ref released in bl_end_io_write_zero */
 546                         index = isect >> PAGE_CACHE_SECTOR_SHIFT;
 547                         dprintk("%s zero %dth page: index %lu isect %llu\n",
 548                                 __func__, npg_zero, index,
 549                                 (unsigned long long)isect);
 550                         page =
 551                             find_or_create_page(wdata->inode->i_mapping, index,
 552                                                 GFP_NOFS);
 553                         if (!page) {
 554                                 dprintk("%s oom\n", __func__);
 555                                 wdata->pnfs_error = -ENOMEM;
 556                                 goto out;
 557                         }
 558
 559                         /* PageDirty: Other will write this out
 560                          * PageWriteback: Other is writing this out
 561                          * PageUptodate: It was read before
 562                          * sector_initialized: already written out
 563                          */
 564                         if (PageDirty(page) || PageWriteback(page) ||
 565                             bl_is_sector_init(be->be_inval, isect)) {
 566                                 print_page(page);
 567                                 unlock_page(page);
 568                                 page_cache_release(page);
 569                                 goto next_page;
 570                         }
 571                         if (!PageUptodate(page)) {
 572                                 /* New page, readin or zero it */
 573                                 init_page_for_write(page, cow_read);
 574                         }
 575                         set_page_writeback(page);
 576                         unlock_page(page);
 577
 578                         ret = bl_mark_sectors_init(be->be_inval, isect,
 579                                                        PAGE_CACHE_SECTORS,
 580                                                        NULL);
 581                         if (unlikely(ret)) {
 582                                 dprintk("%s bl_mark_sectors_init fail %d\n",
 583                                         __func__, ret);
 584                                 end_page_writeback(page);
 585                                 page_cache_release(page);
 586                                 wdata->pnfs_error = ret;
 587                                 goto out;
 588                         }
 589                         bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
 590                                                  isect, page, be,
 591                                                  bl_end_io_write_zero, par);
 592                         if (IS_ERR(bio)) {
 593                                 wdata->pnfs_error = PTR_ERR(bio);
 594                                 goto out;
 595                         }
 596                         /* FIXME: This should be done in bi_end_io */
 597                         mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
 598                                              page->index << PAGE_CACHE_SHIFT,
 599                                              PAGE_CACHE_SIZE);
 600 next_page:
 601                         isect += PAGE_CACHE_SECTORS;
 602                         extent_length -= PAGE_CACHE_SECTORS;
 603                 }
 604                 if (last)
 605                         goto write_done;
 606         }
 607         bio = bl_submit_bio(WRITE, bio);
 608
 609         /* Middle pages */
 610         pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
 611         for (i = pg_index; i < wdata->npages; i++) {
 612                 if (!extent_length) {
 613                         /* We've used up the previous extent */
 614                         bl_put_extent(be);
 615                         bio = bl_submit_bio(WRITE, bio);
 616                         /* Get the next one */
 617                         be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg),
 618                                              isect, NULL);
 619                         if (!be || !is_writable(be, isect)) {
 620                                 wdata->pnfs_error = -EINVAL;
 621                                 goto out;
 622                         }
 623                         extent_length = be->be_length -
 624                             (isect - be->be_f_offset);
 625                 }
 626                 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
 627                         ret = bl_mark_sectors_init(be->be_inval, isect,
 628                                                        PAGE_CACHE_SECTORS,
 629                                                        NULL);
 630                         if (unlikely(ret)) {
 631                                 dprintk("%s bl_mark_sectors_init fail %d\n",
 632                                         __func__, ret);
 633                                 wdata->pnfs_error = ret;
 634                                 goto out;
 635                         }
 636                 }
 637                 bio = bl_add_page_to_bio(bio, wdata->npages - i, WRITE,
 638                                          isect, pages[i], be,
 639                                          bl_end_io_write, par);
 640                 if (IS_ERR(bio)) {
 641                         wdata->pnfs_error = PTR_ERR(bio);
 642                         goto out;
 643                 }
 644                 isect += PAGE_CACHE_SECTORS;
 645                 last_isect = isect;
 646                 extent_length -= PAGE_CACHE_SECTORS;
 647         }
 648
 649         /* Last page inside INVALID extent */
 650         if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
 651                 bio = bl_submit_bio(WRITE, bio);
 652                 temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT;
 653                 npg_zero = npg_per_block - do_div(temp, npg_per_block);
 654                 if (npg_zero < npg_per_block) {
 655                         last = 1;
 656                         goto fill_invalid_ext;
 657                 }
 658         }
 659
 660 write_done:
 661         wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset);
 662         if (count < wdata->res.count) {
 663                 wdata->res.count = count;
 664         }
 665 out:
 666         bl_put_extent(be);
 667         bl_submit_bio(WRITE, bio);
 668         put_parallel(par);
 669         return PNFS_ATTEMPTED;
 670 }
 671
 672 /* FIXME - range ignored */
 673 static void
 674 release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range)
 675 {
 676         int i;
 677         struct pnfs_block_extent *be;
 678
 679         spin_lock(&bl->bl_ext_lock);
 680         for (i = 0; i < EXTENT_LISTS; i++) {
 681                 while (!list_empty(&bl->bl_extents[i])) {
 682                         be = list_first_entry(&bl->bl_extents[i],
 683                                               struct pnfs_block_extent,
 684                                               be_node);
 685                         list_del(&be->be_node);
 686                         bl_put_extent(be);
 687                 }
 688         }
 689         spin_unlock(&bl->bl_ext_lock);
 690 }
 691
 692 static void
 693 release_inval_marks(struct pnfs_inval_markings *marks)
 694 {
 695         struct pnfs_inval_tracking *pos, *temp;
 696
 697         list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
 698                 list_del(&pos->it_link);
 699                 kfree(pos);
 700         }
 701         return;
 702 }
 703
/* Tear down the block layout that embeds lo: release all of its extents and
 * invalid-sector tracking entries, then free the layout itself.
 */
static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct pnfs_block_layout *bl = BLK_LO2EXT(lo);

        dprintk("%s enter\n", __func__);
        /* release_extents() ignores its range argument */
        release_extents(bl, NULL);
        release_inval_marks(&bl->bl_inval);
        kfree(bl);
}
 713
 714 static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
 715                                                    gfp_t gfp_flags)
 716 {
 717         struct pnfs_block_layout *bl;
 718
 719         dprintk("%s enter\n", __func__);
 720         bl = kzalloc(sizeof(*bl), gfp_flags);
 721         if (!bl)
 722                 return NULL;
 723         spin_lock_init(&bl->bl_ext_lock);
 724         INIT_LIST_HEAD(&bl->bl_extents[0]);
 725         INIT_LIST_HEAD(&bl->bl_extents[1]);
 726         INIT_LIST_HEAD(&bl->bl_commit);
 727         INIT_LIST_HEAD(&bl->bl_committing);
 728         bl->bl_count = 0;
 729         bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT;
 730         BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
 731         return &bl->bl_layout;
 732 }
 733
/* Free a layout segment allocated by bl_alloc_lseg().  Only the segment
 * itself is freed here.
 */
static void bl_free_lseg(struct pnfs_layout_segment *lseg)
{
        dprintk("%s enter\n", __func__);
        kfree(lseg);
}
 739
 740 /* We pretty much ignore lseg, and store all data layout wide, so we
 741  * can correctly merge.
 742  */
 743 static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo,
 744                                                  struct nfs4_layoutget_res *lgr,
 745                                                  gfp_t gfp_flags)
 746 {
 747         struct pnfs_layout_segment *lseg;
 748         int status;
 749
 750         dprintk("%s enter\n", __func__);
 751         lseg = kzalloc(sizeof(*lseg), gfp_flags);
 752         if (!lseg)
 753                 return ERR_PTR(-ENOMEM);
 754         status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags);
 755         if (status) {
 756                 /* We don't want to call the full-blown bl_free_lseg,
 757                  * since on error extents were not touched.
 758                  */
 759                 kfree(lseg);
 760                 return ERR_PTR(status);
 761         }
 762         return lseg;
 763 }
 764
/* Thin wrapper: resolves the block layout that embeds lo and passes it,
 * with xdr and arg, to encode_pnfs_block_layoutupdate().
 */
static void
bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
                       const struct nfs4_layoutcommit_args *arg)
{
        dprintk("%s enter\n", __func__);
        encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg);
}
 772
 773 static void
 774 bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
 775 {
 776         struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout;
 777
 778         dprintk("%s enter\n", __func__);
 779         clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status);
 780 }
 781
 782 static void free_blk_mountid(struct block_mount_id *mid)
 783 {
 784         if (mid) {
 785                 struct pnfs_block_dev *dev;
 786                 spin_lock(&mid->bm_lock);
 787                 while (!list_empty(&mid->bm_devlist)) {
 788                         dev = list_first_entry(&mid->bm_devlist,
 789                                                struct pnfs_block_dev,
 790                                                bm_node);
 791                         list_del(&dev->bm_node);
 792                         bl_free_block_dev(dev);
 793                 }
 794                 spin_unlock(&mid->bm_lock);
 795                 kfree(mid);
 796         }
 797 }
 798
 799 /* This is mostly copied from the filelayout's get_device_info function.
 800  * It seems much of this should be at the generic pnfs level.
 801  */
 802 static struct pnfs_block_dev *
 803 nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
 804                         struct nfs4_deviceid *d_id)
 805 {
 806         struct pnfs_device *dev;
 807         struct pnfs_block_dev *rv = NULL;
 808         u32 max_resp_sz;
 809         int max_pages;
 810         struct page **pages = NULL;
 811         int i, rc;
 812
 813         /*
 814          * Use the session max response size as the basis for setting
 815          * GETDEVICEINFO's maxcount
 816          */
 817         max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
 818         max_pages = max_resp_sz >> PAGE_SHIFT;
 819         dprintk("%s max_resp_sz %u max_pages %d\n",
 820                 __func__, max_resp_sz, max_pages);
 821
 822         dev = kmalloc(sizeof(*dev), GFP_NOFS);
 823         if (!dev) {
 824                 dprintk("%s kmalloc failed\n", __func__);
 825                 return NULL;
 826         }
 827
 828         pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS);
 829         if (pages == NULL) {
 830                 kfree(dev);
 831                 return NULL;
 832         }
 833         for (i = 0; i < max_pages; i++) {
 834                 pages[i] = alloc_page(GFP_NOFS);
 835                 if (!pages[i])
 836                         goto out_free;
 837         }
 838
 839         memcpy(&dev->dev_id, d_id, sizeof(*d_id));
 840         dev->layout_type = LAYOUT_BLOCK_VOLUME;
 841         dev->pages = pages;
 842         dev->pgbase = 0;
 843         dev->pglen = PAGE_SIZE * max_pages;
 844         dev->mincount = 0;
 845
 846         dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
 847         rc = nfs4_proc_getdeviceinfo(server, dev);
 848         dprintk("%s getdevice info returns %d\n", __func__, rc);
 849         if (rc)
 850                 goto out_free;
 851
 852         rv = nfs4_blk_decode_device(server, dev);
 853  out_free:
 854         for (i = 0; i < max_pages; i++)
 855                 __free_page(pages[i]);
 856         kfree(pages);
 857         kfree(dev);
 858         return rv;
 859 }
 860
 861 static int
 862 bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
 863 {
 864         struct block_mount_id *b_mt_id = NULL;
 865         struct pnfs_devicelist *dlist = NULL;
 866         struct pnfs_block_dev *bdev;
 867         LIST_HEAD(block_disklist);
 868         int status = 0, i;
 869
 870         dprintk("%s enter\n", __func__);
 871
 872         if (server->pnfs_blksize == 0) {
 873                 dprintk("%s Server did not return blksize\n", __func__);
 874                 return -EINVAL;
 875         }
 876         b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS);
 877         if (!b_mt_id) {
 878                 status = -ENOMEM;
 879                 goto out_error;
 880         }
 881         /* Initialize nfs4 block layout mount id */
 882         spin_lock_init(&b_mt_id->bm_lock);
 883         INIT_LIST_HEAD(&b_mt_id->bm_devlist);
 884
 885         dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS);
 886         if (!dlist) {
 887                 status = -ENOMEM;
 888                 goto out_error;
 889         }
 890         dlist->eof = 0;
 891         while (!dlist->eof) {
 892                 status = nfs4_proc_getdevicelist(server, fh, dlist);
 893                 if (status)
 894                         goto out_error;
 895                 dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n",
 896                         __func__, dlist->num_devs, dlist->eof);
 897                 for (i = 0; i < dlist->num_devs; i++) {
 898                         bdev = nfs4_blk_get_deviceinfo(server, fh,
 899                                                        &dlist->dev_id[i]);
 900                         if (!bdev) {
 901                                 status = -ENODEV;
 902                                 goto out_error;
 903                         }
 904                         spin_lock(&b_mt_id->bm_lock);
 905                         list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
 906                         spin_unlock(&b_mt_id->bm_lock);
 907                 }
 908         }
 909         dprintk("%s SUCCESS\n", __func__);
 910         server->pnfs_ld_data = b_mt_id;
 911
 912  out_return:
 913         kfree(dlist);
 914         return status;
 915
 916  out_error:
 917         free_blk_mountid(b_mt_id);
 918         goto out_return;
 919 }
 920
 921 static int
 922 bl_clear_layoutdriver(struct nfs_server *server)
 923 {
 924         struct block_mount_id *b_mt_id = server->pnfs_ld_data;
 925
 926         dprintk("%s enter\n", __func__);
 927         free_blk_mountid(b_mt_id);
 928         dprintk("%s RETURNS\n", __func__);
 929         return 0;
 930 }
 931
 932 static const struct nfs_pageio_ops bl_pg_read_ops = {
 933         .pg_init = pnfs_generic_pg_init_read,
 934         .pg_test = pnfs_generic_pg_test,
 935         .pg_doio = pnfs_generic_pg_readpages,
 936 };
 937
 938 static const struct nfs_pageio_ops bl_pg_write_ops = {
 939         .pg_init = pnfs_generic_pg_init_write,
 940         .pg_test = pnfs_generic_pg_test,
 941         .pg_doio = pnfs_generic_pg_writepages,
 942 };
 943
 944 static struct pnfs_layoutdriver_type blocklayout_type = {
 945         .id                             = LAYOUT_BLOCK_VOLUME,
 946         .name                           = "LAYOUT_BLOCK_VOLUME",
 947         .read_pagelist                  = bl_read_pagelist,
 948         .write_pagelist                 = bl_write_pagelist,
 949         .alloc_layout_hdr               = bl_alloc_layout_hdr,
 950         .free_layout_hdr                = bl_free_layout_hdr,
 951         .alloc_lseg                     = bl_alloc_lseg,
 952         .free_lseg                      = bl_free_lseg,
 953         .encode_layoutcommit            = bl_encode_layoutcommit,
 954         .cleanup_layoutcommit           = bl_cleanup_layoutcommit,
 955         .set_layoutdriver               = bl_set_layoutdriver,
 956         .clear_layoutdriver             = bl_clear_layoutdriver,
 957         .pg_read_ops                    = &bl_pg_read_ops,
 958         .pg_write_ops                   = &bl_pg_write_ops,
 959 };
 960
 961 static const struct rpc_pipe_ops bl_upcall_ops = {
 962         .upcall         = bl_pipe_upcall,
 963         .downcall       = bl_pipe_downcall,
 964         .destroy_msg    = bl_pipe_destroy_msg,
 965 };
 966
 967 static int __init nfs4blocklayout_init(void)
 968 {
 969         struct vfsmount *mnt;
 970         struct path path;
 971         int ret;
 972
 973         dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
 974
 975         ret = pnfs_register_layoutdriver(&blocklayout_type);
 976         if (ret)
 977                 goto out;
 978
 979         init_waitqueue_head(&bl_wq);
 980
 981         mnt = rpc_get_mount();
 982         if (IS_ERR(mnt)) {
 983                 ret = PTR_ERR(mnt);
 984                 goto out_remove;
 985         }
 986
 987         ret = vfs_path_lookup(mnt->mnt_root,
 988                               mnt,
 989                               NFS_PIPE_DIRNAME, 0, &path);
 990         if (ret)
 991                 goto out_remove;
 992
 993         bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL,
 994                                     &bl_upcall_ops, 0);
 995         if (IS_ERR(bl_device_pipe)) {
 996                 ret = PTR_ERR(bl_device_pipe);
 997                 goto out_remove;
 998         }
 999 out:
1000         return ret;
1001
1002 out_remove:
1003         pnfs_unregister_layoutdriver(&blocklayout_type);
1004         return ret;
1005 }
1006
1007 static void __exit nfs4blocklayout_exit(void)
1008 {
1009         dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
1010                __func__);
1011
1012         pnfs_unregister_layoutdriver(&blocklayout_type);
1013         rpc_unlink(bl_device_pipe);
1014 }
1015
1016 MODULE_ALIAS("nfs-layouttype4-3");
1017
1018 module_init(nfs4blocklayout_init);
1019 module_exit(nfs4blocklayout_exit);