NFSv4.1: reset the inode MDS threshold counters on layout destruction
[pandora-kernel.git] fs/nfs/pnfs.c
/*
 *  pNFS functions to call and manage layout drivers.
 *
 *  Copyright (c) 2002 [year of first publication]
 *  The Regents of the University of Michigan
 *  All Rights Reserved
 *
 *  Dean Hildebrand <dhildebz@umich.edu>
 *
 *  Permission is granted to use, copy, create derivative works, and
 *  redistribute this software and such derivative works for any purpose,
 *  so long as the name of the University of Michigan is not used in
 *  any advertising or publicity pertaining to the use or distribution
 *  of this software without specific, written prior authorization. If
 *  the above copyright notice or any other identification of the
 *  University of Michigan is included in any copy of any portion of
 *  this software, then the disclaimer below must also be included.
 *
 *  This software is provided as is, without representation or warranty
 *  of any kind either express or implied, including without limitation
 *  the implied warranties of merchantability, fitness for a particular
 *  purpose, or noninfringement.  The Regents of the University of
 *  Michigan shall not be liable for any damages, including special,
 *  indirect, incidental, or consequential damages, with respect to any
 *  claim arising out of or in connection with the use of the software,
 *  even if it has been or is hereafter advised of the possibility of
 *  such damages.
 */

#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/module.h>
#include "internal.h"
#include "pnfs.h"
#include "iostat.h"

#define NFSDBG_FACILITY         NFSDBG_PNFS
#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)

/* Locking:
 *
 * pnfs_spinlock:
 *      protects pnfs_modules_tbl.
 */
static DEFINE_SPINLOCK(pnfs_spinlock);

/*
 * pnfs_modules_tbl holds all pnfs modules
 */
static LIST_HEAD(pnfs_modules_tbl);

/* Return the registered pnfs layout driver module matching given id */
static struct pnfs_layoutdriver_type *
find_pnfs_driver_locked(u32 id)
{
        struct pnfs_layoutdriver_type *local;

        list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid)
                if (local->id == id)
                        goto out;
        local = NULL;
out:
        dprintk("%s: Searching for id %u, found %p\n", __func__, id, local);
        return local;
}

static struct pnfs_layoutdriver_type *
find_pnfs_driver(u32 id)
{
        struct pnfs_layoutdriver_type *local;

        spin_lock(&pnfs_spinlock);
        local = find_pnfs_driver_locked(id);
        if (local != NULL && !try_module_get(local->owner)) {
                dprintk("%s: Could not grab reference on module\n", __func__);
                local = NULL;
        }
        spin_unlock(&pnfs_spinlock);
        return local;
}

void
unset_pnfs_layoutdriver(struct nfs_server *nfss)
{
        if (nfss->pnfs_curr_ld) {
                if (nfss->pnfs_curr_ld->clear_layoutdriver)
                        nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
                /* Decrement the MDS count. Purge the deviceid cache if zero */
                if (atomic_dec_and_test(&nfss->nfs_client->cl_mds_count))
                        nfs4_deviceid_purge_client(nfss->nfs_client);
                module_put(nfss->pnfs_curr_ld->owner);
        }
        nfss->pnfs_curr_ld = NULL;
}

/*
 * Try to set the server's pnfs module to the pnfs layout type specified by id.
 * Currently only one pNFS layout driver per filesystem is supported.
 *
 * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
 */
void
set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
                      u32 id)
{
        struct pnfs_layoutdriver_type *ld_type = NULL;

        if (id == 0)
                goto out_no_driver;
        if (!(server->nfs_client->cl_exchange_flags &
                 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
                printk(KERN_ERR "NFS: %s: id %u cl_exchange_flags 0x%x\n",
                        __func__, id, server->nfs_client->cl_exchange_flags);
                goto out_no_driver;
        }
        ld_type = find_pnfs_driver(id);
        if (!ld_type) {
                request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
                ld_type = find_pnfs_driver(id);
                if (!ld_type) {
                        dprintk("%s: No pNFS module found for %u.\n",
                                __func__, id);
                        goto out_no_driver;
                }
        }
        server->pnfs_curr_ld = ld_type;
        if (ld_type->set_layoutdriver
            && ld_type->set_layoutdriver(server, mntfh)) {
                printk(KERN_ERR "NFS: %s: Error initializing pNFS layout "
                        "driver %u.\n", __func__, id);
                module_put(ld_type->owner);
                goto out_no_driver;
        }
        /* Bump the MDS count */
        atomic_inc(&server->nfs_client->cl_mds_count);

        dprintk("%s: pNFS module for %u set\n", __func__, id);
        return;

out_no_driver:
        dprintk("%s: Using NFSv4 I/O\n", __func__);
        server->pnfs_curr_ld = NULL;
}

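/*
 * Note on autoloading (illustrative): the request_module() call above
 * turns a layout type number into a module alias of the form
 * LAYOUT_NFSV4_1_MODULE_PREFIX "-<id>", e.g. "nfs-layouttype4-1" for
 * the files layout (LAYOUT4_NFSV4_1_FILES == 1), assuming the layout
 * driver module declares a matching MODULE_ALIAS().
 */
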
int
pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
{
        int status = -EINVAL;
        struct pnfs_layoutdriver_type *tmp;

        if (ld_type->id == 0) {
                printk(KERN_ERR "NFS: %s id 0 is reserved\n", __func__);
                return status;
        }
        if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
                printk(KERN_ERR "NFS: %s Layout driver must provide "
                       "alloc_lseg and free_lseg.\n", __func__);
                return status;
        }

        spin_lock(&pnfs_spinlock);
        tmp = find_pnfs_driver_locked(ld_type->id);
        if (!tmp) {
                list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
                status = 0;
                dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
                        ld_type->name);
        } else {
                printk(KERN_ERR "NFS: %s Module with id %d already loaded!\n",
                        __func__, ld_type->id);
        }
        spin_unlock(&pnfs_spinlock);

        return status;
}
EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);

void
pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
{
        dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
        spin_lock(&pnfs_spinlock);
        list_del(&ld_type->pnfs_tblid);
        spin_unlock(&pnfs_spinlock);
}
EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);

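/*
 * For illustration only: a minimal layout driver registers itself from
 * its module init hook roughly as in the hypothetical sketch below.
 * The names mylayout_*, my_alloc_lseg and my_free_lseg are placeholders
 * and not part of this file; alloc_lseg and free_lseg are the only
 * mandatory ops (see the check in pnfs_register_layoutdriver above).
 *
 *	static struct pnfs_layoutdriver_type mylayout_type = {
 *		.id		= LAYOUT4_NFSV4_1_FILES,
 *		.name		= "LAYOUT_NFSV4_1_FILES",
 *		.owner		= THIS_MODULE,
 *		.alloc_lseg	= my_alloc_lseg,
 *		.free_lseg	= my_free_lseg,
 *	};
 *
 *	static int __init mylayout_init(void)
 *	{
 *		return pnfs_register_layoutdriver(&mylayout_type);
 *	}
 *
 *	static void __exit mylayout_exit(void)
 *	{
 *		pnfs_unregister_layoutdriver(&mylayout_type);
 *	}
 */
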
/*
 * pNFS client layout cache
 */

/* Need to hold i_lock if caller does not already hold reference */
void
pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo)
{
        atomic_inc(&lo->plh_refcount);
}

static struct pnfs_layout_hdr *
pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
{
        struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
        return ld->alloc_layout_hdr ? ld->alloc_layout_hdr(ino, gfp_flags) :
                kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags);
}

static void
pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld;
        put_rpccred(lo->plh_lc_cred);
        return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo);
}

static void
destroy_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct nfs_inode *nfsi = NFS_I(lo->plh_inode);
        dprintk("%s: freeing layout cache %p\n", __func__, lo);
        BUG_ON(!list_empty(&lo->plh_layouts));
        nfsi->layout = NULL;
        /* Reset MDS Threshold I/O counters */
        nfsi->write_io = 0;
        nfsi->read_io = 0;
        pnfs_free_layout_hdr(lo);
}

static void
pnfs_put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
{
        if (atomic_dec_and_test(&lo->plh_refcount))
                destroy_layout_hdr(lo);
}

void
pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct inode *inode = lo->plh_inode;

        if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
                destroy_layout_hdr(lo);
                spin_unlock(&inode->i_lock);
        }
}

static int
pnfs_iomode_to_fail_bit(u32 iomode)
{
        return iomode == IOMODE_RW ?
                NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
}

static void
pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
{
        lo->plh_retry_timestamp = jiffies;
        /* Take a reference on the 0->1 transition; dropped when the bit
         * is cleared in pnfs_layout_clear_fail_bit. */
        if (!test_and_set_bit(fail_bit, &lo->plh_flags))
                atomic_inc(&lo->plh_refcount);
}

static void
pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
{
        if (test_and_clear_bit(fail_bit, &lo->plh_flags))
                atomic_dec(&lo->plh_refcount);
}

static void
pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode)
{
        struct inode *inode = lo->plh_inode;
        struct pnfs_layout_range range = {
                .iomode = iomode,
                .offset = 0,
                .length = NFS4_MAX_UINT64,
        };
        LIST_HEAD(head);

        spin_lock(&inode->i_lock);
        pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
        pnfs_mark_matching_lsegs_invalid(lo, &head, &range);
        spin_unlock(&inode->i_lock);
        pnfs_free_lseg_list(&head);
        dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__,
                        iomode == IOMODE_RW ? "RW" : "READ");
}

static bool
pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode)
{
        unsigned long start, end;
        int fail_bit = pnfs_iomode_to_fail_bit(iomode);

        if (test_bit(fail_bit, &lo->plh_flags) == 0)
                return false;
        end = jiffies;
        start = end - PNFS_LAYOUTGET_RETRY_TIMEOUT;
        if (!time_in_range(lo->plh_retry_timestamp, start, end)) {
                /* It is time to retry the failed layoutgets */
                pnfs_layout_clear_fail_bit(lo, fail_bit);
                return false;
        }
        return true;
}

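/*
 * Worked example (illustrative): with the 120s
 * PNFS_LAYOUTGET_RETRY_TIMEOUT above, a LAYOUTGET failure stamped into
 * plh_retry_timestamp at time T keeps pnfs_layout_io_test_failed()
 * returning true for any call in [T, T + 120s]; the first test after
 * that window clears the fail bit (dropping its reference) and lets
 * LAYOUTGET be retried.
 */
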
static void
init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
{
        INIT_LIST_HEAD(&lseg->pls_list);
        INIT_LIST_HEAD(&lseg->pls_lc_list);
        atomic_set(&lseg->pls_refcount, 1);
        smp_mb();
        set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
        lseg->pls_layout = lo;
}

static void free_lseg(struct pnfs_layout_segment *lseg)
{
        struct inode *ino = lseg->pls_layout->plh_inode;

        NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
        /* Matched by pnfs_get_layout_hdr in pnfs_insert_layout */
        pnfs_put_layout_hdr(NFS_I(ino)->layout);
}

static void
pnfs_put_lseg_common(struct pnfs_layout_segment *lseg)
{
        struct inode *inode = lseg->pls_layout->plh_inode;

        WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
        list_del_init(&lseg->pls_list);
        if (list_empty(&lseg->pls_layout->plh_segs)) {
                set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
                /* Matched by initial refcount set in alloc_init_layout_hdr */
                pnfs_put_layout_hdr_locked(lseg->pls_layout);
        }
        rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
}

void
pnfs_put_lseg(struct pnfs_layout_segment *lseg)
{
        struct inode *inode;

        if (!lseg)
                return;

        dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
                atomic_read(&lseg->pls_refcount),
                test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
        inode = lseg->pls_layout->plh_inode;
        if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
                LIST_HEAD(free_me);

                pnfs_put_lseg_common(lseg);
                list_add(&lseg->pls_list, &free_me);
                spin_unlock(&inode->i_lock);
                pnfs_free_lseg_list(&free_me);
        }
}
EXPORT_SYMBOL_GPL(pnfs_put_lseg);

static inline u64
end_offset(u64 start, u64 len)
{
        u64 end;

        end = start + len;
        return end >= start ? end : NFS4_MAX_UINT64;
}

/* last octet in a range */
static inline u64
last_byte_offset(u64 start, u64 len)
{
        u64 end;

        BUG_ON(!len);
        end = start + len;
        return end > start ? end - 1 : NFS4_MAX_UINT64;
}

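/*
 * Example of the saturation above (illustrative): a range with
 * offset 4096 and length NFS4_MAX_UINT64 wraps on "start + len", so
 * end_offset() returns NFS4_MAX_UINT64, i.e. the range is treated as
 * extending to the end of the file; last_byte_offset() saturates the
 * same way.
 */
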
/*
 * is l2 fully contained in l1?
 *   start1                             end1
 *   [----------------------------------)
 *           start2           end2
 *           [----------------)
 */
static inline int
lo_seg_contained(struct pnfs_layout_range *l1,
                 struct pnfs_layout_range *l2)
{
        u64 start1 = l1->offset;
        u64 end1 = end_offset(start1, l1->length);
        u64 start2 = l2->offset;
        u64 end2 = end_offset(start2, l2->length);

        return (start1 <= start2) && (end1 >= end2);
}

/*
 * do l1 and l2 intersect?
 *   start1                             end1
 *   [----------------------------------)
 *                              start2           end2
 *                              [----------------)
 */
static inline int
lo_seg_intersecting(struct pnfs_layout_range *l1,
                    struct pnfs_layout_range *l2)
{
        u64 start1 = l1->offset;
        u64 end1 = end_offset(start1, l1->length);
        u64 start2 = l2->offset;
        u64 end2 = end_offset(start2, l2->length);

        return (end1 == NFS4_MAX_UINT64 || end1 > start2) &&
               (end2 == NFS4_MAX_UINT64 || end2 > start1);
}

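/*
 * Note the half-open interpretation above: with ranges expressed as
 * [start, end), two ranges that merely touch do not intersect. For
 * example, l1 = {offset 0, length 4096} and l2 = {offset 4096,
 * length 4096} give end1 == start2, so lo_seg_intersecting() returns
 * false.
 */
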
static bool
should_free_lseg(struct pnfs_layout_range *lseg_range,
                 struct pnfs_layout_range *recall_range)
{
        return (recall_range->iomode == IOMODE_ANY ||
                lseg_range->iomode == recall_range->iomode) &&
               lo_seg_intersecting(lseg_range, recall_range);
}

/* Returns 1 if lseg is removed from list, 0 otherwise */
static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
                             struct list_head *tmp_list)
{
        int rv = 0;

        if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
                /* Remove the reference keeping the lseg in the
                 * list.  It will now be removed when all
                 * outstanding I/O is finished.
                 */
                dprintk("%s: lseg %p ref %d\n", __func__, lseg,
                        atomic_read(&lseg->pls_refcount));
                if (atomic_dec_and_test(&lseg->pls_refcount)) {
                        pnfs_put_lseg_common(lseg);
                        list_add(&lseg->pls_list, tmp_list);
                        rv = 1;
                }
        }
        return rv;
}

/* Returns count of number of matching invalid lsegs remaining in list
 * after call.
 */
int
pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
                            struct list_head *tmp_list,
                            struct pnfs_layout_range *recall_range)
{
        struct pnfs_layout_segment *lseg, *next;
        int invalid = 0, removed = 0;

        dprintk("%s:Begin lo %p\n", __func__, lo);

        if (list_empty(&lo->plh_segs)) {
                if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
                        pnfs_put_layout_hdr_locked(lo);
                return 0;
        }
        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
                if (!recall_range ||
                    should_free_lseg(&lseg->pls_range, recall_range)) {
                        dprintk("%s: freeing lseg %p iomode %d "
                                "offset %llu length %llu\n", __func__,
                                lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
                                lseg->pls_range.length);
                        invalid++;
                        removed += mark_lseg_invalid(lseg, tmp_list);
                }
        dprintk("%s:Return %i\n", __func__, invalid - removed);
        return invalid - removed;
}

/* note free_me must contain lsegs from a single layout_hdr */
void
pnfs_free_lseg_list(struct list_head *free_me)
{
        struct pnfs_layout_segment *lseg, *tmp;
        struct pnfs_layout_hdr *lo;

        if (list_empty(free_me))
                return;

        lo = list_first_entry(free_me, struct pnfs_layout_segment,
                              pls_list)->pls_layout;

        if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) {
                struct nfs_client *clp;

                clp = NFS_SERVER(lo->plh_inode)->nfs_client;
                spin_lock(&clp->cl_lock);
                list_del_init(&lo->plh_layouts);
                spin_unlock(&clp->cl_lock);
        }
        list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
                list_del(&lseg->pls_list);
                free_lseg(lseg);
        }
}

void
pnfs_destroy_layout(struct nfs_inode *nfsi)
{
        struct pnfs_layout_hdr *lo;
        LIST_HEAD(tmp_list);

        spin_lock(&nfsi->vfs_inode.i_lock);
        lo = nfsi->layout;
        if (lo) {
                lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
                pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
                pnfs_get_layout_hdr(lo);
                pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
                pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
                spin_unlock(&nfsi->vfs_inode.i_lock);
                pnfs_free_lseg_list(&tmp_list);
                pnfs_put_layout_hdr(lo);
        } else
                spin_unlock(&nfsi->vfs_inode.i_lock);
}
EXPORT_SYMBOL_GPL(pnfs_destroy_layout);

/*
 * Called by the state manager to remove all layouts established under an
 * expired lease.
 */
void
pnfs_destroy_all_layouts(struct nfs_client *clp)
{
        struct nfs_server *server;
        struct pnfs_layout_hdr *lo;
        LIST_HEAD(tmp_list);

        nfs4_deviceid_mark_client_invalid(clp);
        nfs4_deviceid_purge_client(clp);

        spin_lock(&clp->cl_lock);
        rcu_read_lock();
        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
                if (!list_empty(&server->layouts))
                        list_splice_init(&server->layouts, &tmp_list);
        }
        rcu_read_unlock();
        spin_unlock(&clp->cl_lock);

        while (!list_empty(&tmp_list)) {
                lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
                                plh_layouts);
                dprintk("%s freeing layout for inode %lu\n", __func__,
                        lo->plh_inode->i_ino);
                list_del_init(&lo->plh_layouts);
                pnfs_destroy_layout(NFS_I(lo->plh_inode));
        }
}

/* update lo->plh_stateid with new if it is more recent */
void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
                        bool update_barrier)
{
        u32 oldseq, newseq;

        oldseq = be32_to_cpu(lo->plh_stateid.seqid);
        newseq = be32_to_cpu(new->seqid);
        if ((int)(newseq - oldseq) > 0) {
                nfs4_stateid_copy(&lo->plh_stateid, new);
                if (update_barrier) {
                        u32 new_barrier = be32_to_cpu(new->seqid);

                        if ((int)(new_barrier - lo->plh_barrier) > 0)
                                lo->plh_barrier = new_barrier;
                } else {
                        /* Because of wraparound, we want to keep the barrier
                         * "close" to the current seqids.  It needs to be
                         * within 2**31 to count as "behind", so if it
                         * gets too near that limit, give us a little leeway
                         * and bring it to within 2**30.
                         * NOTE - and yes, this is all unsigned arithmetic.
                         */
                        if (unlikely((newseq - lo->plh_barrier) > (3 << 29)))
                                lo->plh_barrier = newseq - (1 << 30);
                }
        }
}

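/*
 * Worked example of the wraparound arithmetic above (illustrative):
 * if lo->plh_stateid.seqid holds 0xffffffff and the server sends
 * seqid 0x00000001, then (int)(newseq - oldseq) == 2 > 0, so the new
 * stateid is accepted even though it is numerically smaller. The
 * barrier nudge keeps plh_barrier within 2**30 of newseq so the same
 * signed comparison stays meaningful across the wrap.
 */
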
/* lget is set to 1 if called from inside send_layoutget call chain */
static bool
pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
                        int lget)
{
        if ((stateid) &&
            (int)(lo->plh_barrier - be32_to_cpu(stateid->seqid)) >= 0)
                return true;
        return lo->plh_block_lgets ||
                test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) ||
                test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
                (list_empty(&lo->plh_segs) &&
                 (atomic_read(&lo->plh_outstanding) > lget));
}

int
pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
                              struct nfs4_state *open_state)
{
        int status = 0;

        dprintk("--> %s\n", __func__);
        spin_lock(&lo->plh_inode->i_lock);
        if (pnfs_layoutgets_blocked(lo, NULL, 1)) {
                status = -EAGAIN;
        } else if (list_empty(&lo->plh_segs)) {
                int seq;

                do {
                        seq = read_seqbegin(&open_state->seqlock);
                        nfs4_stateid_copy(dst, &open_state->stateid);
                } while (read_seqretry(&open_state->seqlock, seq));
        } else
                nfs4_stateid_copy(dst, &lo->plh_stateid);
        spin_unlock(&lo->plh_inode->i_lock);
        dprintk("<-- %s\n", __func__);
        return status;
}

/*
 * Get layout from server.
 *    for now, assume that whole file layouts are requested.
 *    arg->offset: 0
 *    arg->length: all ones
 */
static struct pnfs_layout_segment *
send_layoutget(struct pnfs_layout_hdr *lo,
           struct nfs_open_context *ctx,
           struct pnfs_layout_range *range,
           gfp_t gfp_flags)
{
        struct inode *ino = lo->plh_inode;
        struct nfs_server *server = NFS_SERVER(ino);
        struct nfs4_layoutget *lgp;
        struct pnfs_layout_segment *lseg;

        dprintk("--> %s\n", __func__);

        BUG_ON(ctx == NULL);
        lgp = kzalloc(sizeof(*lgp), gfp_flags);
        if (lgp == NULL)
                return NULL;

        lgp->args.minlength = PAGE_CACHE_SIZE;
        if (lgp->args.minlength > range->length)
                lgp->args.minlength = range->length;
        lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
        lgp->args.range = *range;
        lgp->args.type = server->pnfs_curr_ld->id;
        lgp->args.inode = ino;
        lgp->args.ctx = get_nfs_open_context(ctx);
        lgp->gfp_flags = gfp_flags;

        /* Synchronously retrieve layout information from server and
         * store in lseg.
         */
        lseg = nfs4_proc_layoutget(lgp, gfp_flags);
        if (IS_ERR(lseg)) {
                switch (PTR_ERR(lseg)) {
                case -ENOMEM:
                case -ERESTARTSYS:
                        break;
                default:
                        /* remember that LAYOUTGET failed and suspend trying */
                        pnfs_layout_io_set_failed(lo, range->iomode);
                }
                return NULL;
        }

        return lseg;
}

/*
 * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
 * when the layout segment list is empty.
 *
 * Note that a pnfs_layout_hdr can exist with an empty layout segment
 * list when LAYOUTGET has failed, or when LAYOUTGET succeeded, but the
 * deviceid is marked invalid.
 */
int
_pnfs_return_layout(struct inode *ino)
{
        struct pnfs_layout_hdr *lo = NULL;
        struct nfs_inode *nfsi = NFS_I(ino);
        LIST_HEAD(tmp_list);
        struct nfs4_layoutreturn *lrp;
        nfs4_stateid stateid;
        int status = 0, empty;

        dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino);

        spin_lock(&ino->i_lock);
        lo = nfsi->layout;
        if (!lo || pnfs_test_layout_returned(lo)) {
                spin_unlock(&ino->i_lock);
                dprintk("NFS: %s no layout to return\n", __func__);
                goto out;
        }
        stateid = nfsi->layout->plh_stateid;
        /* Reference matched in nfs4_layoutreturn_release */
        pnfs_get_layout_hdr(lo);
        empty = list_empty(&lo->plh_segs);
        pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
        /* Don't send a LAYOUTRETURN if list was initially empty */
        if (empty) {
                spin_unlock(&ino->i_lock);
                pnfs_put_layout_hdr(lo);
                dprintk("NFS: %s no layout segments to return\n", __func__);
                goto out;
        }
        lo->plh_block_lgets++;
        pnfs_mark_layout_returned(lo);
        spin_unlock(&ino->i_lock);
        pnfs_free_lseg_list(&tmp_list);

        WARN_ON(test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags));

        lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
        if (unlikely(lrp == NULL)) {
                status = -ENOMEM;
                pnfs_layout_io_set_failed(lo, IOMODE_RW);
                pnfs_layout_io_set_failed(lo, IOMODE_READ);
                pnfs_clear_layout_returned(lo);
                pnfs_put_layout_hdr(lo);
                goto out;
        }

        lrp->args.stateid = stateid;
        lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
        lrp->args.inode = ino;
        lrp->args.layout = lo;
        lrp->clp = NFS_SERVER(ino)->nfs_client;

        status = nfs4_proc_layoutreturn(lrp);
out:
        dprintk("<-- %s status: %d\n", __func__, status);
        return status;
}
EXPORT_SYMBOL_GPL(_pnfs_return_layout);

bool pnfs_roc(struct inode *ino)
{
        struct pnfs_layout_hdr *lo;
        struct pnfs_layout_segment *lseg, *tmp;
        LIST_HEAD(tmp_list);
        bool found = false;

        spin_lock(&ino->i_lock);
        lo = NFS_I(ino)->layout;
        if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
            test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
                goto out_nolayout;
        list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
                if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
                        mark_lseg_invalid(lseg, &tmp_list);
                        found = true;
                }
        if (!found)
                goto out_nolayout;
        lo->plh_block_lgets++;
        pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */
        spin_unlock(&ino->i_lock);
        pnfs_free_lseg_list(&tmp_list);
        return true;

out_nolayout:
        spin_unlock(&ino->i_lock);
        return false;
}

void pnfs_roc_release(struct inode *ino)
{
        struct pnfs_layout_hdr *lo;

        spin_lock(&ino->i_lock);
        lo = NFS_I(ino)->layout;
        lo->plh_block_lgets--;
        pnfs_put_layout_hdr_locked(lo);
        spin_unlock(&ino->i_lock);
}

void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
{
        struct pnfs_layout_hdr *lo;

        spin_lock(&ino->i_lock);
        lo = NFS_I(ino)->layout;
        if ((int)(barrier - lo->plh_barrier) > 0)
                lo->plh_barrier = barrier;
        spin_unlock(&ino->i_lock);
}

bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
{
        struct nfs_inode *nfsi = NFS_I(ino);
        struct pnfs_layout_hdr *lo;
        struct pnfs_layout_segment *lseg;
        u32 current_seqid;
        bool found = false;

        spin_lock(&ino->i_lock);
        list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
                if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
                        rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
                        found = true;
                        goto out;
                }
        lo = nfsi->layout;
        current_seqid = be32_to_cpu(lo->plh_stateid.seqid);

        /* Since close does not return a layout stateid for use as
         * a barrier, we choose the worst-case barrier.
         */
        *barrier = current_seqid + atomic_read(&lo->plh_outstanding);
out:
        spin_unlock(&ino->i_lock);
        return found;
}

/*
 * Compare two layout segments for sorting into layout cache.
 * We want to preferentially return RW over RO layouts, so ensure those
 * are seen first.
 */
static s64
cmp_layout(struct pnfs_layout_range *l1,
           struct pnfs_layout_range *l2)
{
        s64 d;

        /* high offset > low offset */
        d = l1->offset - l2->offset;
        if (d)
                return d;

        /* short length > long length */
        d = l2->length - l1->length;
        if (d)
                return d;

        /* read > read/write */
        return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
}

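/*
 * Example ordering (illustrative): with equal offsets, lsegs
 * A = {length 8192, IOMODE_RW}, B = {length 8192, IOMODE_READ} and
 * C = {length 4096, IOMODE_RW} sort as A, B, C: lower offsets first,
 * then longer segments before shorter ones, and at equal length RW
 * ahead of READ so RW layouts are found first.
 */
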
static void
pnfs_insert_layout(struct pnfs_layout_hdr *lo,
                   struct pnfs_layout_segment *lseg)
{
        struct pnfs_layout_segment *lp;

        dprintk("%s:Begin\n", __func__);

        assert_spin_locked(&lo->plh_inode->i_lock);
        list_for_each_entry(lp, &lo->plh_segs, pls_list) {
                if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0)
                        continue;
                list_add_tail(&lseg->pls_list, &lp->pls_list);
                dprintk("%s: inserted lseg %p "
                        "iomode %d offset %llu length %llu before "
                        "lp %p iomode %d offset %llu length %llu\n",
                        __func__, lseg, lseg->pls_range.iomode,
                        lseg->pls_range.offset, lseg->pls_range.length,
                        lp, lp->pls_range.iomode, lp->pls_range.offset,
                        lp->pls_range.length);
                goto out;
        }
        list_add_tail(&lseg->pls_list, &lo->plh_segs);
        dprintk("%s: inserted lseg %p "
                "iomode %d offset %llu length %llu at tail\n",
                __func__, lseg, lseg->pls_range.iomode,
                lseg->pls_range.offset, lseg->pls_range.length);
out:
        pnfs_get_layout_hdr(lo);

        dprintk("%s:Return\n", __func__);
}

static struct pnfs_layout_hdr *
alloc_init_layout_hdr(struct inode *ino,
                      struct nfs_open_context *ctx,
                      gfp_t gfp_flags)
{
        struct pnfs_layout_hdr *lo;

        lo = pnfs_alloc_layout_hdr(ino, gfp_flags);
        if (!lo)
                return NULL;
        atomic_set(&lo->plh_refcount, 1);
        INIT_LIST_HEAD(&lo->plh_layouts);
        INIT_LIST_HEAD(&lo->plh_segs);
        INIT_LIST_HEAD(&lo->plh_bulk_recall);
        lo->plh_inode = ino;
        lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred);
        return lo;
}

static struct pnfs_layout_hdr *
pnfs_find_alloc_layout(struct inode *ino,
                       struct nfs_open_context *ctx,
                       gfp_t gfp_flags)
{
        struct nfs_inode *nfsi = NFS_I(ino);
        struct pnfs_layout_hdr *new = NULL;

        dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);

        assert_spin_locked(&ino->i_lock);
        if (nfsi->layout) {
                if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags))
                        return NULL;
                else
                        return nfsi->layout;
        }
        spin_unlock(&ino->i_lock);
        new = alloc_init_layout_hdr(ino, ctx, gfp_flags);
        spin_lock(&ino->i_lock);

        if (likely(nfsi->layout == NULL))       /* Won the race? */
                nfsi->layout = new;
        else
                pnfs_free_layout_hdr(new);
        return nfsi->layout;
}

/*
 * iomode matching rules:
 * iomode       lseg    match
 * -----        -----   -----
 * ANY          READ    true
 * ANY          RW      true
 * RW           READ    false
 * RW           RW      true
 * READ         READ    true
 * READ         RW      true
 */
static int
is_matching_lseg(struct pnfs_layout_range *ls_range,
                 struct pnfs_layout_range *range)
{
        struct pnfs_layout_range range1;

        if ((range->iomode == IOMODE_RW &&
             ls_range->iomode != IOMODE_RW) ||
            !lo_seg_intersecting(ls_range, range))
                return 0;

        /* range1 covers only the first byte in the range */
        range1 = *range;
        range1.length = 1;
        return lo_seg_contained(ls_range, &range1);
}

/*
 * lookup range in layout
 */
static struct pnfs_layout_segment *
pnfs_find_lseg(struct pnfs_layout_hdr *lo,
                struct pnfs_layout_range *range)
{
        struct pnfs_layout_segment *lseg, *ret = NULL;

        dprintk("%s:Begin\n", __func__);

        assert_spin_locked(&lo->plh_inode->i_lock);
        list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
                if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
                    is_matching_lseg(&lseg->pls_range, range)) {
                        ret = pnfs_get_lseg(lseg);
                        break;
                }
                if (lseg->pls_range.offset > range->offset)
                        break;
        }

        dprintk("%s:Return lseg %p ref %d\n",
                __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0);
        return ret;
}

/*
 * Use mdsthreshold hints set at each OPEN to determine if I/O should go
 * to the MDS or over pNFS
 *
 * The nfs_inode read_io and write_io fields are cumulative counters reset
 * when there are no layout segments. Note that in pnfs_update_layout iomode
 * is set to IOMODE_READ for a READ request, and set to IOMODE_RW for a
 * WRITE request.
 *
 * A return of true means use MDS I/O.
 *
 * From rfc 5661:
 * If a file's size is smaller than the file size threshold, data accesses
 * SHOULD be sent to the metadata server.  If an I/O request has a length that
 * is below the I/O size threshold, the I/O SHOULD be sent to the metadata
 * server.  If both file size and I/O size are provided, the client SHOULD
 * reach or exceed both thresholds before sending its read or write
 * requests to the data server.
 */
static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
                                     struct inode *ino, int iomode)
{
        struct nfs4_threshold *t = ctx->mdsthreshold;
        struct nfs_inode *nfsi = NFS_I(ino);
        loff_t fsize = i_size_read(ino);
        bool size = false, size_set = false, io = false, io_set = false, ret = false;

        if (t == NULL)
                return ret;

        dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n",
                __func__, t->bm, t->rd_sz, t->wr_sz, t->rd_io_sz, t->wr_io_sz);

        switch (iomode) {
        case IOMODE_READ:
                if (t->bm & THRESHOLD_RD) {
                        dprintk("%s fsize %llu\n", __func__, fsize);
                        size_set = true;
                        if (fsize < t->rd_sz)
                                size = true;
                }
                if (t->bm & THRESHOLD_RD_IO) {
                        dprintk("%s nfsi->read_io %llu\n", __func__,
                                nfsi->read_io);
                        io_set = true;
                        if (nfsi->read_io < t->rd_io_sz)
                                io = true;
                }
                break;
        case IOMODE_RW:
                if (t->bm & THRESHOLD_WR) {
                        dprintk("%s fsize %llu\n", __func__, fsize);
                        size_set = true;
                        if (fsize < t->wr_sz)
                                size = true;
                }
                if (t->bm & THRESHOLD_WR_IO) {
                        dprintk("%s nfsi->write_io %llu\n", __func__,
                                nfsi->write_io);
                        io_set = true;
                        if (nfsi->write_io < t->wr_io_sz)
                                io = true;
                }
                break;
        }
        if (size_set && io_set) {
                if (size && io)
                        ret = true;
        } else if (size || io)
                ret = true;

        dprintk("<-- %s size %d io %d ret %d\n", __func__, size, io, ret);
        return ret;
}

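/*
 * Worked example (illustrative): suppose an OPEN returned thresholds
 * with bm = (THRESHOLD_RD | THRESHOLD_RD_IO), rd_sz = 1048576 and
 * rd_io_sz = 65536. A READ against a 512KB file that has accumulated
 * 16KB of read_io sees size_set && io_set with both size and io true
 * (512KB < 1MB and 16KB < 64KB), so pnfs_within_mdsthreshold() returns
 * true and the I/O is sent to the MDS rather than over pNFS.
 */
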
/*
 * Layout segment is retrieved from the server if not cached.
 * The appropriate layout segment is referenced and returned to the caller.
 */
struct pnfs_layout_segment *
pnfs_update_layout(struct inode *ino,
                   struct nfs_open_context *ctx,
                   loff_t pos,
                   u64 count,
                   enum pnfs_iomode iomode,
                   gfp_t gfp_flags)
{
        struct pnfs_layout_range arg = {
                .iomode = iomode,
                .offset = pos,
                .length = count,
        };
        unsigned pg_offset;
        struct nfs_server *server = NFS_SERVER(ino);
        struct nfs_client *clp = server->nfs_client;
        struct pnfs_layout_hdr *lo;
        struct pnfs_layout_segment *lseg = NULL;
        bool first = false;

        if (!pnfs_enabled_sb(NFS_SERVER(ino)))
                goto out;

        if (pnfs_within_mdsthreshold(ctx, ino, iomode))
                goto out;

        spin_lock(&ino->i_lock);
        lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
        if (lo == NULL) {
                spin_unlock(&ino->i_lock);
                goto out;
        }

        /* Do we even need to bother with this? */
        if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
                dprintk("%s matches recall, use MDS\n", __func__);
                goto out_unlock;
        }

        /* if LAYOUTGET already failed once we don't try again */
        if (pnfs_layout_io_test_failed(lo, iomode))
                goto out_unlock;

        /* Check to see if the layout for the given range already exists */
        lseg = pnfs_find_lseg(lo, &arg);
        if (lseg)
                goto out_unlock;

        if (pnfs_layoutgets_blocked(lo, NULL, 0))
                goto out_unlock;
        atomic_inc(&lo->plh_outstanding);

        pnfs_get_layout_hdr(lo);
        if (list_empty(&lo->plh_segs))
                first = true;

        /* Enable LAYOUTRETURNs */
        pnfs_clear_layout_returned(lo);

        spin_unlock(&ino->i_lock);
        if (first) {
                /* The lo must be on the clp list if there is any
                 * chance of a CB_LAYOUTRECALL(FILE) coming in.
                 */
                spin_lock(&clp->cl_lock);
                BUG_ON(!list_empty(&lo->plh_layouts));
                list_add_tail(&lo->plh_layouts, &server->layouts);
                spin_unlock(&clp->cl_lock);
        }

        pg_offset = arg.offset & ~PAGE_CACHE_MASK;
        if (pg_offset) {
                arg.offset -= pg_offset;
                arg.length += pg_offset;
        }
        if (arg.length != NFS4_MAX_UINT64)
                arg.length = PAGE_CACHE_ALIGN(arg.length);

        lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
        if (!lseg && first) {
                spin_lock(&clp->cl_lock);
                list_del_init(&lo->plh_layouts);
                spin_unlock(&clp->cl_lock);
        }
        atomic_dec(&lo->plh_outstanding);
out_put_layout_hdr:
        pnfs_put_layout_hdr(lo);
out:
        dprintk("%s: inode %s/%llu pNFS layout segment %s for "
                        "(%s, offset: %llu, length: %llu)\n",
                        __func__, ino->i_sb->s_id,
                        (unsigned long long)NFS_FILEID(ino),
                        lseg == NULL ? "not found" : "found",
                        iomode == IOMODE_RW ? "read/write" : "read-only",
                        (unsigned long long)pos,
                        (unsigned long long)count);
        return lseg;
out_unlock:
        spin_unlock(&ino->i_lock);
        goto out_put_layout_hdr;
}
EXPORT_SYMBOL_GPL(pnfs_update_layout);

struct pnfs_layout_segment *
pnfs_layout_process(struct nfs4_layoutget *lgp)
{
        struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
        struct nfs4_layoutget_res *res = &lgp->res;
        struct pnfs_layout_segment *lseg;
        struct inode *ino = lo->plh_inode;
        int status = 0;

        /* Inject layout blob into I/O device driver */
        lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
        if (!lseg || IS_ERR(lseg)) {
                if (!lseg)
                        status = -ENOMEM;
                else
                        status = PTR_ERR(lseg);
                dprintk("%s: Could not allocate layout: error %d\n",
                       __func__, status);
                goto out;
        }

        spin_lock(&ino->i_lock);
        if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
                dprintk("%s forget reply due to recall\n", __func__);
                goto out_forget_reply;
        }

        if (pnfs_layoutgets_blocked(lo, &res->stateid, 1)) {
                dprintk("%s forget reply due to state\n", __func__);
                goto out_forget_reply;
        }
        init_lseg(lo, lseg);
        lseg->pls_range = res->range;
        pnfs_get_lseg(lseg);
        pnfs_insert_layout(lo, lseg);

        if (res->return_on_close) {
                set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
                set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
        }

        /* Done processing layoutget. Set the layout stateid */
        pnfs_set_layout_stateid(lo, &res->stateid, false);
        spin_unlock(&ino->i_lock);
        return lseg;
out:
        return ERR_PTR(status);

out_forget_reply:
        spin_unlock(&ino->i_lock);
        lseg->pls_layout = lo;
        NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
        goto out;
}

void
pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
        BUG_ON(pgio->pg_lseg != NULL);

        if (req->wb_offset != req->wb_pgbase) {
                nfs_pageio_reset_read_mds(pgio);
                return;
        }
        pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
                                           req->wb_context,
                                           req_offset(req),
                                           req->wb_bytes,
                                           IOMODE_READ,
                                           GFP_KERNEL);
        /* If no lseg, fall back to read through mds */
        if (pgio->pg_lseg == NULL)
                nfs_pageio_reset_read_mds(pgio);
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read);

void
pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
        BUG_ON(pgio->pg_lseg != NULL);

        if (req->wb_offset != req->wb_pgbase) {
                nfs_pageio_reset_write_mds(pgio);
                return;
        }
        pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
                                           req->wb_context,
                                           req_offset(req),
                                           req->wb_bytes,
                                           IOMODE_RW,
                                           GFP_NOFS);
        /* If no lseg, fall back to write through mds */
        if (pgio->pg_lseg == NULL)
                nfs_pageio_reset_write_mds(pgio);
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);

void
pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode,
                      const struct nfs_pgio_completion_ops *compl_ops)
{
        struct nfs_server *server = NFS_SERVER(inode);
        struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;

        if (ld == NULL)
                nfs_pageio_init_read(pgio, inode, compl_ops);
        else
                nfs_pageio_init(pgio, inode, ld->pg_read_ops, compl_ops, server->rsize, 0);
}

void
pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode,
                       int ioflags,
                       const struct nfs_pgio_completion_ops *compl_ops)
{
        struct nfs_server *server = NFS_SERVER(inode);
        struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;

        if (ld == NULL)
                nfs_pageio_init_write(pgio, inode, ioflags, compl_ops);
        else
                nfs_pageio_init(pgio, inode, ld->pg_write_ops, compl_ops, server->wsize, ioflags);
}

bool
pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
                     struct nfs_page *req)
{
        if (pgio->pg_lseg == NULL)
                return nfs_generic_pg_test(pgio, prev, req);

        /*
         * Test if a nfs_page is fully contained in the pnfs_layout_range.
         * Note that this test makes several assumptions:
         * - that the previous nfs_page in the struct nfs_pageio_descriptor
         *   is known to lie within the range.
         * - that the nfs_page being tested is known to be contiguous with the
         *   previous nfs_page.
         * - Layout ranges are page aligned, so we only have to test the
         *   start offset of the request.
         *
         * Please also note that 'end_offset' is actually the offset of the
         * first byte that lies outside the pnfs_layout_range. FIXME?
         */
        return req_offset(req) < end_offset(pgio->pg_lseg->pls_range.offset,
                                         pgio->pg_lseg->pls_range.length);
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);

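/*
 * Example (illustrative): with pg_lseg covering {offset 0, length
 * 16384}, a request at offset 12288 satisfies
 * req_offset(req) < end_offset(0, 16384) == 16384 and may be
 * coalesced, while a request at offset 16384 fails the test and is
 * not added to the current descriptor.
 */
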
int pnfs_write_done_resend_to_mds(struct inode *inode,
                                struct list_head *head,
                                const struct nfs_pgio_completion_ops *compl_ops)
{
        struct nfs_pageio_descriptor pgio;
        LIST_HEAD(failed);

        /* Resend all requests through the MDS */
        nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, compl_ops);
        while (!list_empty(head)) {
                struct nfs_page *req = nfs_list_entry(head->next);

                nfs_list_remove_request(req);
                if (!nfs_pageio_add_request(&pgio, req))
                        nfs_list_add_request(req, &failed);
        }
        nfs_pageio_complete(&pgio);

        if (!list_empty(&failed)) {
                /* For some reason our attempt to resend pages has
                 * failed. Mark the overall send request as having failed,
                 * and let nfs_writeback_release_full deal with the error.
                 */
                list_move(&failed, head);
                return -EIO;
        }
        return 0;
}
EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);

static void pnfs_ld_handle_write_error(struct nfs_write_data *data)
{
        struct nfs_pgio_header *hdr = data->header;

        dprintk("pnfs write error = %d\n", hdr->pnfs_error);
        if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
            PNFS_LAYOUTRET_ON_ERROR) {
                clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags);
                pnfs_return_layout(hdr->inode);
        }
        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
                data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode,
                                                        &hdr->pages,
                                                        hdr->completion_ops);
}

/*
 * Called by non rpc-based layout drivers
 */
void pnfs_ld_write_done(struct nfs_write_data *data)
{
        struct nfs_pgio_header *hdr = data->header;

        if (!hdr->pnfs_error) {
                pnfs_set_layoutcommit(data);
                hdr->mds_ops->rpc_call_done(&data->task, data);
        } else
                pnfs_ld_handle_write_error(data);
        hdr->mds_ops->rpc_release(data);
}
EXPORT_SYMBOL_GPL(pnfs_ld_write_done);

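/*
 * For illustration only: a non rpc-based layout driver completes I/O
 * from its own callback and hands the result back to the generic code
 * roughly as in the hypothetical sketch below; my_write_complete is a
 * placeholder name, not part of this file.
 *
 *	static void my_write_complete(struct nfs_write_data *data, int err)
 *	{
 *		if (err)
 *			data->header->pnfs_error = err;
 *		pnfs_ld_write_done(data);
 *	}
 */
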
1393 static void
1394 pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
1395                 struct nfs_write_data *data)
1396 {
1397         struct nfs_pgio_header *hdr = data->header;
1398
1399         if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
1400                 list_splice_tail_init(&hdr->pages, &desc->pg_list);
1401                 nfs_pageio_reset_write_mds(desc);
1402                 desc->pg_recoalesce = 1;
1403         }
1404         nfs_writedata_release(data);
1405 }
1406
1407 static enum pnfs_try_status
1408 pnfs_try_to_write_data(struct nfs_write_data *wdata,
1409                         const struct rpc_call_ops *call_ops,
1410                         struct pnfs_layout_segment *lseg,
1411                         int how)
1412 {
1413         struct nfs_pgio_header *hdr = wdata->header;
1414         struct inode *inode = hdr->inode;
1415         enum pnfs_try_status trypnfs;
1416         struct nfs_server *nfss = NFS_SERVER(inode);
1417
1418         hdr->mds_ops = call_ops;
1419
1420         dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
1421                 inode->i_ino, wdata->args.count, wdata->args.offset, how);
1422         trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how);
1423         if (trypnfs != PNFS_NOT_ATTEMPTED)
1424                 nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
1425         dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
1426         return trypnfs;
1427 }
1428
1429 static void
1430 pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *head, int how)
1431 {
1432         struct nfs_write_data *data;
1433         const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
1434         struct pnfs_layout_segment *lseg = desc->pg_lseg;
1435
1436         desc->pg_lseg = NULL;
1437         while (!list_empty(head)) {
1438                 enum pnfs_try_status trypnfs;
1439
1440                 data = list_first_entry(head, struct nfs_write_data, list);
1441                 list_del_init(&data->list);
1442
1443                 trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
1444                 if (trypnfs == PNFS_NOT_ATTEMPTED)
1445                         pnfs_write_through_mds(desc, data);
1446         }
1447         pnfs_put_lseg(lseg);
1448 }
1449
1450 static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
1451 {
1452         pnfs_put_lseg(hdr->lseg);
1453         nfs_writehdr_free(hdr);
1454 }
1455 EXPORT_SYMBOL_GPL(pnfs_writehdr_free);
1456
1457 int
1458 pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
1459 {
1460         struct nfs_write_header *whdr;
1461         struct nfs_pgio_header *hdr;
1462         int ret;
1463
1464         whdr = nfs_writehdr_alloc();
1465         if (!whdr) {
1466                 desc->pg_completion_ops->error_cleanup(&desc->pg_list);
1467                 pnfs_put_lseg(desc->pg_lseg);
1468                 desc->pg_lseg = NULL;
1469                 return -ENOMEM;
1470         }
1471         hdr = &whdr->header;
1472         nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
1473         hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
1474         atomic_inc(&hdr->refcnt);
1475         ret = nfs_generic_flush(desc, hdr);
1476         if (ret != 0) {
1477                 pnfs_put_lseg(desc->pg_lseg);
1478                 desc->pg_lseg = NULL;
1479         } else
1480                 pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags);
1481         if (atomic_dec_and_test(&hdr->refcnt))
1482                 hdr->completion_ops->completion(hdr);
1483         return ret;
1484 }
1485 EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
1486
1487 int pnfs_read_done_resend_to_mds(struct inode *inode,
1488                                 struct list_head *head,
1489                                 const struct nfs_pgio_completion_ops *compl_ops)
1490 {
1491         struct nfs_pageio_descriptor pgio;
1492         LIST_HEAD(failed);
1493
1494         /* Resend all requests through the MDS */
1495         nfs_pageio_init_read(&pgio, inode, compl_ops);
1496         while (!list_empty(head)) {
1497                 struct nfs_page *req = nfs_list_entry(head->next);
1498
1499                 nfs_list_remove_request(req);
1500                 if (!nfs_pageio_add_request(&pgio, req))
1501                         nfs_list_add_request(req, &failed);
1502         }
1503         nfs_pageio_complete(&pgio);
1504
1505         if (!list_empty(&failed)) {
1506                 list_splice(&failed, head);	/* not list_move(): that moves a single entry */
1507                 return -EIO;
1508         }
1509         return 0;
1510 }
1511 EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds);
1512
1513 static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
1514 {
1515         struct nfs_pgio_header *hdr = data->header;
1516
1517         dprintk("pnfs read error = %d\n", hdr->pnfs_error);
1518         if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
1519             PNFS_LAYOUTRET_ON_ERROR) {
1520                 clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags);
1521                 pnfs_return_layout(hdr->inode);
1522         }
1523         if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
1524                 data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode,
1525                                                         &hdr->pages,
1526                                                         hdr->completion_ops);
1527 }
1528
1529 /*
1530  * Called by non-RPC-based layout drivers
1531  */
1532 void pnfs_ld_read_done(struct nfs_read_data *data)
1533 {
1534         struct nfs_pgio_header *hdr = data->header;
1535
1536         if (likely(!hdr->pnfs_error)) {
1537                 __nfs4_read_done_cb(data);
1538                 hdr->mds_ops->rpc_call_done(&data->task, data);
1539         } else
1540                 pnfs_ld_handle_read_error(data);
1541         hdr->mds_ops->rpc_release(data);
1542 }
1543 EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
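
/*
 * Illustrative sketch for the comment above: a non-RPC-based layout
 * driver (one submitting block or object I/O directly, say) completes
 * a read by recording any error in hdr->pnfs_error and then calling
 * pnfs_ld_read_done(), which either runs the MDS completion callbacks
 * or resends the I/O through the MDS.  The function below is
 * hypothetical; "error" is 0 or a negative errno.
 */
static void example_ld_read_complete(struct nfs_read_data *rdata, int error)
{
	if (error)
		rdata->header->pnfs_error = error;	/* forces MDS resend */
	pnfs_ld_read_done(rdata);
}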
1544
1545 static void
1546 pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
1547                 struct nfs_read_data *data)
1548 {
1549         struct nfs_pgio_header *hdr = data->header;
1550
1551         if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
1552                 list_splice_tail_init(&hdr->pages, &desc->pg_list);
1553                 nfs_pageio_reset_read_mds(desc);
1554                 desc->pg_recoalesce = 1;
1555         }
1556         nfs_readdata_release(data);
1557 }
1558
1559 /*
1560  * Call the appropriate parallel I/O subsystem read function.
1561  */
1562 static enum pnfs_try_status
1563 pnfs_try_to_read_data(struct nfs_read_data *rdata,
1564                        const struct rpc_call_ops *call_ops,
1565                        struct pnfs_layout_segment *lseg)
1566 {
1567         struct nfs_pgio_header *hdr = rdata->header;
1568         struct inode *inode = hdr->inode;
1569         struct nfs_server *nfss = NFS_SERVER(inode);
1570         enum pnfs_try_status trypnfs;
1571
1572         hdr->mds_ops = call_ops;
1573
1574         dprintk("%s: Reading ino:%lu %u@%llu\n",
1575                 __func__, inode->i_ino, rdata->args.count, rdata->args.offset);
1576
1577         trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata);
1578         if (trypnfs != PNFS_NOT_ATTEMPTED)
1579                 nfs_inc_stats(inode, NFSIOS_PNFS_READ);
1580         dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
1581         return trypnfs;
1582 }
1583
1584 static void
1585 pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *head)
1586 {
1587         struct nfs_read_data *data;
1588         const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
1589         struct pnfs_layout_segment *lseg = desc->pg_lseg;
1590
1591         desc->pg_lseg = NULL;
1592         while (!list_empty(head)) {
1593                 enum pnfs_try_status trypnfs;
1594
1595                 data = list_first_entry(head, struct nfs_read_data, list);
1596                 list_del_init(&data->list);
1597
1598                 trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
1599                 if (trypnfs == PNFS_NOT_ATTEMPTED)
1600                         pnfs_read_through_mds(desc, data);
1601         }
1602         pnfs_put_lseg(lseg);
1603 }
1604
1605 static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
1606 {
1607         pnfs_put_lseg(hdr->lseg);
1608         nfs_readhdr_free(hdr);
1609 }
1610 EXPORT_SYMBOL_GPL(pnfs_readhdr_free);
1611
1612 int
1613 pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
1614 {
1615         struct nfs_read_header *rhdr;
1616         struct nfs_pgio_header *hdr;
1617         int ret;
1618
1619         rhdr = nfs_readhdr_alloc();
1620         if (!rhdr) {
1621                 desc->pg_completion_ops->error_cleanup(&desc->pg_list);
1622                 pnfs_put_lseg(desc->pg_lseg);
1623                 desc->pg_lseg = NULL;
1624                 ret = -ENOMEM;
1625                 return ret;
1626         }
1627         hdr = &rhdr->header;
1628         nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
1629         hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
1630         atomic_inc(&hdr->refcnt);
1631         ret = nfs_generic_pagein(desc, hdr);
1632         if (ret != 0) {
1633                 pnfs_put_lseg(desc->pg_lseg);
1634                 desc->pg_lseg = NULL;
1635         } else
1636                 pnfs_do_multiple_reads(desc, &hdr->rpc_list);
1637         if (atomic_dec_and_test(&hdr->refcnt))
1638                 hdr->completion_ops->completion(hdr);
1639         return ret;
1640 }
1641 EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
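
/*
 * Illustrative sketch of how a layout driver plugs the two generic
 * ->pg_doio entry points exported above into its pageio operations
 * (modeled on the files layout driver; the example_pg_* callbacks are
 * hypothetical):
 */
static void example_pg_init_read(struct nfs_pageio_descriptor *pgio,
				 struct nfs_page *req);		/* picks an lseg */
static void example_pg_init_write(struct nfs_pageio_descriptor *pgio,
				  struct nfs_page *req);
static bool example_pg_test(struct nfs_pageio_descriptor *pgio,
			    struct nfs_page *prev,
			    struct nfs_page *req);		/* coalescing rules */

static const struct nfs_pageio_ops example_pg_read_ops = {
	.pg_init = example_pg_init_read,
	.pg_test = example_pg_test,
	.pg_doio = pnfs_generic_pg_readpages,
};

static const struct nfs_pageio_ops example_pg_write_ops = {
	.pg_init = example_pg_init_write,
	.pg_test = example_pg_test,
	.pg_doio = pnfs_generic_pg_writepages,
};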
1642
1643 /*
1644  * There can be multiple RW segments.
1645  */
1646 static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
1647 {
1648         struct pnfs_layout_segment *lseg;
1649
1650         list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
1651                 if (lseg->pls_range.iomode == IOMODE_RW &&
1652                     test_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
1653                         list_add(&lseg->pls_lc_list, listp);
1654         }
1655 }
1656
1657 void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
1658 {
1659         pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode);
1660 }
1661 EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
1662
1663 void
1664 pnfs_set_layoutcommit(struct nfs_write_data *wdata)
1665 {
1666         struct nfs_pgio_header *hdr = wdata->header;
1667         struct inode *inode = hdr->inode;
1668         struct nfs_inode *nfsi = NFS_I(inode);
1669         loff_t end_pos = wdata->mds_offset + wdata->res.count;
1670         bool mark_as_dirty = false;
1671
1672         spin_lock(&inode->i_lock);
1673         if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
1674                 mark_as_dirty = true;
1675                 dprintk("%s: Set layoutcommit for inode %lu\n",
1676                         __func__, inode->i_ino);
1677         }
1678         if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &hdr->lseg->pls_flags)) {
1679                 /* references matched in nfs4_layoutcommit_release */
1680                 pnfs_get_lseg(hdr->lseg);
1681         }
1682         if (end_pos > nfsi->layout->plh_lwb)
1683                 nfsi->layout->plh_lwb = end_pos;
1684         spin_unlock(&inode->i_lock);
1685         dprintk("%s: lseg %p end_pos %llu\n",
1686                 __func__, hdr->lseg, nfsi->layout->plh_lwb);
1687
1688         /* If pnfs_layoutcommit_inode() ran after i_lock was dropped above, the
1689          * layoutcommit driven by the flush below is a no-op: the bit is clear. */
1690         if (mark_as_dirty)
1691                 mark_inode_dirty_sync(inode);
1692 }
1693 EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
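
/*
 * Illustrative sketch of a typical pnfs_set_layoutcommit() caller: a
 * layout driver's write completion marks the inode for LAYOUTCOMMIT
 * unless the data server already committed the data to stable storage
 * (modeled on the files layout driver, which additionally skips this
 * when commits go through the MDS; the function below is hypothetical).
 */
static void example_ds_write_done(struct nfs_write_data *wdata)
{
	/* NFS_FILE_SYNC means the DS has already made the data stable */
	if (wdata->res.verf->committed == NFS_FILE_SYNC)
		return;
	pnfs_set_layoutcommit(wdata);
}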
1694
1695 void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
1696 {
1697         struct nfs_server *nfss = NFS_SERVER(data->args.inode);
1698
1699         if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
1700                 nfss->pnfs_curr_ld->cleanup_layoutcommit(data);
1701 }
1702
1703 /*
1704  * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
1705  * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
1706  * data to disk to allow the server to recover the data if it crashes.
1707  * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag
1708  * is off and either a COMMIT is sent to a data server or WRITEs to a
1709  * data server return NFS_DATA_SYNC.
1710  */
1711 int
1712 pnfs_layoutcommit_inode(struct inode *inode, bool sync)
1713 {
1714         struct nfs4_layoutcommit_data *data;
1715         struct nfs_inode *nfsi = NFS_I(inode);
1716         loff_t end_pos;
1717         int status = 0;
1718
1719         dprintk("--> %s inode %lu\n", __func__, inode->i_ino);
1720
1721         if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
1722                 return 0;
1723
1724         /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
1725         data = kzalloc(sizeof(*data), GFP_NOFS);
1726         if (!data) {
1727                 status = -ENOMEM;
1728                 goto out;
1729         }
1730
1731         if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
1732                 goto out_free;
1733
1734         if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
1735                 if (!sync) {
1736                         status = -EAGAIN;
1737                         goto out_free;
1738                 }
1739                 status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING,
1740                                         nfs_wait_bit_killable, TASK_KILLABLE);
1741                 if (status)
1742                         goto out_free;
1743         }
1744
1745         INIT_LIST_HEAD(&data->lseg_list);
1746         spin_lock(&inode->i_lock);
1747         if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
1748                 clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags);
1749                 spin_unlock(&inode->i_lock);
1750                 wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING);
1751                 goto out_free;
1752         }
1753
1754         pnfs_list_write_lseg(inode, &data->lseg_list);
1755
1756         end_pos = nfsi->layout->plh_lwb;
1757         nfsi->layout->plh_lwb = 0;
1758
1759         nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
1760         spin_unlock(&inode->i_lock);
1761
1762         data->args.inode = inode;
1763         data->cred = get_rpccred(nfsi->layout->plh_lc_cred);
1764         nfs_fattr_init(&data->fattr);
1765         data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
1766         data->res.fattr = &data->fattr;
1767         data->args.lastbytewritten = end_pos - 1;
1768         data->res.server = NFS_SERVER(inode);
1769
1770         status = nfs4_proc_layoutcommit(data, sync);
1771 out:
1772         if (status)
1773                 mark_inode_dirty_sync(inode);
1774         dprintk("<-- %s status %d\n", __func__, status);
1775         return status;
1776 out_free:
1777         kfree(data);
1778         goto out;
1779 }
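
/*
 * Illustrative sketch of a caller: writeback drives LAYOUTCOMMIT by
 * invoking pnfs_layoutcommit_inode() once dirty data has been flushed
 * and committed, with the sync mode deciding whether a commit already
 * in flight is waited for or fails with -EAGAIN.  This mirrors how the
 * NFSv4 ->write_inode() path is wired up; example_write_inode() itself
 * is hypothetical.
 */
static int example_write_inode(struct inode *inode,
			       struct writeback_control *wbc)
{
	int ret = nfs_write_inode(inode, wbc);	/* flush and commit data */

	if (ret == 0)
		ret = pnfs_layoutcommit_inode(inode,
				wbc->sync_mode == WB_SYNC_ALL);
	return ret;
}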
1780
1781 struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
1782 {
1783         struct nfs4_threshold *thp;
1784
1785         thp = kzalloc(sizeof(*thp), GFP_NOFS);
1786         if (!thp) {
1787                 dprintk("%s mdsthreshold allocation failed\n", __func__);
1788                 return NULL;
1789         }
1790         return thp;
1791 }
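
/*
 * Illustrative sketch of a caller: the NFSv4.1 OPEN path allocates the
 * threshold structure up front so that a server-supplied mdsthreshold
 * attribute can be decoded into it.  The helper below is hypothetical
 * and assumes the FATTR4_WORD2_MDSTHRESHOLD attribute bit.
 */
static struct nfs4_threshold *
example_prepare_mdsthreshold(struct nfs_server *server)
{
	/* Only allocate if the server can actually return the attribute */
	if (!(server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD))
		return NULL;
	return pnfs_mdsthreshold_alloc();
}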