ore/exofs: Define new ore_verify_layout
fs/exofs/ore.c
/*
 * Copyright (C) 2005, 2006
 * Avishay Traeger (avishay@gmail.com)
 * Copyright (C) 2008, 2009
 * Boaz Harrosh <bharrosh@panasas.com>
 *
 * This file is part of exofs.
 *
 * exofs is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation.  Since it is based on ext2, and the only
 * valid version of GPL for the Linux kernel is version 2, the only valid
 * version of GPL for exofs is version 2.
 *
 * exofs is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with exofs; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

#include <linux/slab.h>
#include <asm/div64.h>

#include <scsi/osd_ore.h>

#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)

#ifdef CONFIG_EXOFS_DEBUG
#define ORE_DBGMSG(fmt, a...) \
        printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
#else
#define ORE_DBGMSG(fmt, a...) \
        do { if (0) printk(fmt, ##a); } while (0)
#endif

/* u64 has problems with printk; this will cast it to unsigned long long */
#define _LLU(x) (unsigned long long)(x)

#define ORE_DBGMSG2(M...) do {} while (0)
/* #define ORE_DBGMSG2 ORE_DBGMSG */

MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>");
MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
MODULE_LICENSE("GPL");

/* ore_verify_layout does a couple of things:
 * 1. Given a minimal set of needed parameters, fixes up the rest of the
 *    members to be operational for the ore. The needed parameters are
 *    those defined by the pnfs-objects layout standard.
 * 2. Checks whether the current ore code actually supports these parameters,
 *    for example stripe_unit must be a multiple of the system PAGE_SIZE,
 *    etc.
 * 3. Caches some heavily used calculations that will be needed by users.
 */

static void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
                                 struct ore_striping_info *si);

enum { BIO_MAX_PAGES_KMALLOC =
                (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),};

int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
{
        u64 stripe_length;

/* FIXME: Only raid0 is supported for now. */
        if (layout->raid_algorithm != PNFS_OSD_RAID_0) {
                ORE_ERR("Only RAID_0 for now\n");
                return -EINVAL;
        }
        if (0 != (layout->stripe_unit & ~PAGE_MASK)) {
                ORE_ERR("Stripe Unit(0x%llx)"
                          " must be a multiple of PAGE_SIZE(0x%lx)\n",
                          _LLU(layout->stripe_unit), PAGE_SIZE);
                return -EINVAL;
        }
        if (layout->group_width) {
                if (!layout->group_depth) {
                        ORE_ERR("group_depth == 0 && group_width != 0\n");
                        return -EINVAL;
                }
                if (total_comps < (layout->group_width * layout->mirrors_p1)) {
                        ORE_ERR("Data Map wrong, "
                                "numdevs=%d < group_width=%d * mirrors=%d\n",
                                total_comps, layout->group_width,
                                layout->mirrors_p1);
                        return -EINVAL;
                }
                layout->group_count = total_comps / layout->mirrors_p1 /
                                                layout->group_width;
        } else {
                if (layout->group_depth) {
                        printk(KERN_NOTICE "Warning: group_depth ignored "
                                "group_width == 0 && group_depth == %lld\n",
                                _LLU(layout->group_depth));
                }
                layout->group_width = total_comps / layout->mirrors_p1;
                layout->group_depth = -1;
                layout->group_count = 1;
        }

        stripe_length = (u64)layout->group_width * layout->stripe_unit;
        if (stripe_length >= (1ULL << 32)) {
                ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n",
                        _LLU(stripe_length));
                return -EINVAL;
        }

        layout->max_io_length =
                (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) *
                                                        layout->group_width;
        return 0;
}
EXPORT_SYMBOL(ore_verify_layout);
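
/* Example usage (a minimal sketch, not part of the original file): a caller
 * fills only the STD-defined members and lets ore_verify_layout() derive
 * group_count, group_depth and max_io_length. The literal values below are
 * assumptions for illustration only:
 *
 *      struct ore_layout layout = {
 *              .stripe_unit    = PAGE_SIZE,
 *              .mirrors_p1     = 1,            // mirrors + 1; 1 => no mirrors
 *              .group_width    = 0,            // 0 => one group over all comps
 *              .raid_algorithm = PNFS_OSD_RAID_0,
 *      };
 *      int err = ore_verify_layout(total_comps, &layout);
 *      if (err)
 *              return err;     // layout not supported by this ore
 */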

static u8 *_ios_cred(struct ore_io_state *ios, unsigned index)
{
        return ios->oc->comps[index & ios->oc->single_comp].cred;
}

static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index)
{
        return &ios->oc->comps[index & ios->oc->single_comp].obj;
}

static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
{
        ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n",
                    ios->oc->first_dev, ios->oc->numdevs, index,
                    ios->oc->ods);

        return ore_comp_dev(ios->oc, index);
}

static int  _get_io_state(struct ore_layout *layout,
                          struct ore_components *oc, unsigned numdevs,
                          struct ore_io_state **pios)
{
        struct ore_io_state *ios;

        /* TODO: Maybe use a kmem_cache per sbi of size
         * exofs_io_state_size(layout->s_numdevs)
         */
        ios = kzalloc(ore_io_state_size(numdevs), GFP_KERNEL);
        if (unlikely(!ios)) {
                ORE_DBGMSG("Failed kzalloc bytes=%d\n",
                           ore_io_state_size(numdevs));
                *pios = NULL;
                return -ENOMEM;
        }

        ios->layout = layout;
        ios->oc = oc;
        *pios = ios;
        return 0;
}

/* Allocate an io_state for only a single group of devices
 *
 * If a user needs to call ore_read/write() this version must be used because
 * it allocates extra stuff for striping and raid.
 * The ore might decide to IO less than @length bytes due to alignments
 * and constraints as follows:
 * - The IO cannot cross a group boundary.
 * - In raid5/6 the end of the IO must align at the end of a stripe, e.g.
 *   (@offset + @length) % stripe_size == 0, or the complete range is within
 *   a single stripe.
 * - Memory conditions only permitted a shorter IO. (A user can use @length=~0
 *   and check the returned ios->length for max_io_size.)
 *
 * The caller must check the returned ios->length (and/or ios->nr_pages) and
 * re-issue the pages that fall outside of ios->length
 */
int  ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
                      bool is_reading, u64 offset, u64 length,
                      struct ore_io_state **pios)
{
        struct ore_io_state *ios;
        unsigned numdevs = layout->group_width * layout->mirrors_p1;
        int ret;

        ret = _get_io_state(layout, oc, numdevs, pios);
        if (unlikely(ret))
                return ret;

        ios = *pios;
        ios->reading = is_reading;
        ios->offset = offset;

        if (length) {
                ore_calc_stripe_info(layout, offset, &ios->si);
                ios->length = (length <= ios->si.group_length) ? length :
                                                        ios->si.group_length;
                ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
        }

        return 0;
}
EXPORT_SYMBOL(ore_get_rw_state);
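
/* A sketch of the re-issue loop described above (illustrative only; error
 * handling, the setup of ios->pages, and the hypothetical total_length are
 * the caller's responsibility and are elided here):
 *
 *      u64 off = offset, left = total_length;
 *      while (left) {
 *              struct ore_io_state *ios;
 *
 *              ret = ore_get_rw_state(layout, oc, true, off, left, &ios);
 *              // ... point ios->pages at the pages for this range
 *              ret = ore_read(ios);    // IOs at most ios->length bytes
 *              off += ios->length;
 *              left -= ios->length;
 *              ore_put_io_state(ios);
 *      }
 */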

/* Allocate an io_state for all the devices in the comps array
 *
 * This version of io_state allocation is used mostly by create/remove
 * and trunc where we currently need all the devices. The only wasteful
 * bit is the read/write_attributes with no IO. Those sites should
 * be converted to use ore_get_rw_state() with length=0
 */
int  ore_get_io_state(struct ore_layout *layout, struct ore_components *oc,
                      struct ore_io_state **pios)
{
        return _get_io_state(layout, oc, oc->numdevs, pios);
}
EXPORT_SYMBOL(ore_get_io_state);
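
/* A hedged usage sketch (not from the original file): an object-removal path
 * could pair this with ore_remove() below:
 *
 *      struct ore_io_state *ios;
 *      int ret = ore_get_io_state(layout, oc, &ios);
 *      if (likely(!ret)) {
 *              ret = ore_remove(ios);  // one REMOVE per component device
 *              ore_put_io_state(ios);
 *      }
 */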

void ore_put_io_state(struct ore_io_state *ios)
{
        if (ios) {
                unsigned i;

                for (i = 0; i < ios->numdevs; i++) {
                        struct ore_per_dev_state *per_dev = &ios->per_dev[i];

                        if (per_dev->or)
                                osd_end_request(per_dev->or);
                        if (per_dev->bio)
                                bio_put(per_dev->bio);
                }

                kfree(ios);
        }
}
EXPORT_SYMBOL(ore_put_io_state);

static void _sync_done(struct ore_io_state *ios, void *p)
{
        struct completion *waiting = p;

        complete(waiting);
}

static void _last_io(struct kref *kref)
{
        struct ore_io_state *ios = container_of(
                                        kref, struct ore_io_state, kref);

        ios->done(ios, ios->private);
}

static void _done_io(struct osd_request *or, void *p)
{
        struct ore_io_state *ios = p;

        kref_put(&ios->kref, _last_io);
}

static int ore_io_execute(struct ore_io_state *ios)
{
        DECLARE_COMPLETION_ONSTACK(wait);
        bool sync = (ios->done == NULL);
        int i, ret;

        if (sync) {
                ios->done = _sync_done;
                ios->private = &wait;
        }

        for (i = 0; i < ios->numdevs; i++) {
                struct osd_request *or = ios->per_dev[i].or;
                if (unlikely(!or))
                        continue;

                ret = osd_finalize_request(or, 0, _ios_cred(ios, i), NULL);
                if (unlikely(ret)) {
                        ORE_DBGMSG("Failed to osd_finalize_request() => %d\n",
                                     ret);
                        return ret;
                }
        }

        kref_init(&ios->kref);

        for (i = 0; i < ios->numdevs; i++) {
                struct osd_request *or = ios->per_dev[i].or;
                if (unlikely(!or))
                        continue;

                kref_get(&ios->kref);
                osd_execute_request_async(or, _done_io, ios);
        }

        kref_put(&ios->kref, _last_io);
        ret = 0;

        if (sync) {
                wait_for_completion(&wait);
                ret = ore_check_io(ios, NULL);
        }
        return ret;
}

static void _clear_bio(struct bio *bio)
{
        struct bio_vec *bv;
        unsigned i;

        __bio_for_each_segment(bv, bio, i, 0) {
                unsigned this_count = bv->bv_len;

                if (likely(PAGE_SIZE == this_count))
                        clear_highpage(bv->bv_page);
                else
                        zero_user(bv->bv_page, bv->bv_offset, this_count);
        }
}

int ore_check_io(struct ore_io_state *ios, u64 *resid)
{
        enum osd_err_priority accumulated_osd_err = 0;
        int accumulated_lin_err = 0;
        int i;

        for (i = 0; i < ios->numdevs; i++) {
                struct osd_sense_info osi;
                struct osd_request *or = ios->per_dev[i].or;
                int ret;

                if (unlikely(!or))
                        continue;

                ret = osd_req_decode_sense(or, &osi);
                if (likely(!ret))
                        continue;

                if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
                        /* start read offset past end of file */
                        _clear_bio(ios->per_dev[i].bio);
                        ORE_DBGMSG("start read offset past end of file "
                                "offset=0x%llx, length=0x%llx\n",
                                _LLU(ios->per_dev[i].offset),
                                _LLU(ios->per_dev[i].length));

                        continue; /* we recovered */
                }

                if (osi.osd_err_pri >= accumulated_osd_err) {
                        accumulated_osd_err = osi.osd_err_pri;
                        accumulated_lin_err = ret;
                }
        }

        /* TODO: raid specific residual calculations */
        if (resid) {
                if (likely(!accumulated_lin_err))
                        *resid = 0;
                else
                        *resid = ios->length;
        }

        return accumulated_lin_err;
}
EXPORT_SYMBOL(ore_check_io);

/*
 * L - logical offset into the file
 *
 * U - The number of bytes in a stripe within a group
 *
 *      U = stripe_unit * group_width
 *
 * T - The number of bytes striped within a group of component objects
 *     (before advancing to the next group)
 *
 *      T = stripe_unit * group_width * group_depth
 *
 * S - The number of bytes striped across all component objects
 *     before the pattern repeats
 *
 *      S = stripe_unit * group_width * group_depth * group_count
 *
 * M - The "major" (i.e., across all components) stripe number
 *
 *      M = L / S
 *
 * G - Counts the groups from the beginning of the major stripe
 *
 *      G = (L - (M * S)) / T   [or (L % S) / T]
 *
 * H - The byte offset within the group
 *
 *      H = (L - (M * S)) % T   [or (L % S) % T]
 *
 * N - The "minor" (i.e., across the group) stripe number
 *
 *      N = H / U
 *
 * C - The component index corresponding to L
 *
 *      C = (H - (N * U)) / stripe_unit + G * group_width
 *      [or (L % U) / stripe_unit + G * group_width]
 *
 * O - The component offset corresponding to L
 *
 *      O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
 */
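/* A worked example (illustrative numbers, assumed for this sketch only):
 * with stripe_unit = 64K, group_width = 4, group_depth = 2, group_count = 2:
 *
 *      U = 64K * 4  = 256K
 *      T = 256K * 2 = 512K
 *      S = 512K * 2 = 1M
 *
 * For a logical offset L = 1344K:
 *
 *      M = 1344K / 1M                    = 1
 *      L % S = 1344K - (1 * 1M)          = 320K
 *      G = 320K / 512K                   = 0
 *      H = 320K % 512K                   = 320K
 *      N = 320K / 256K                   = 1
 *      C = (320K - 256K) / 64K + 0 * 4   = 1    (the second component)
 *      O = 0 + (1 * 64K) + (1 * 2 * 64K) = 192K (offset within that object)
 */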
static void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
                                 struct ore_striping_info *si)
{
        u32     stripe_unit = layout->stripe_unit;
        u32     group_width = layout->group_width;
        u64     group_depth = layout->group_depth;

        u32     U = stripe_unit * group_width;
        u64     T = U * group_depth;
        u64     S = T * layout->group_count;
        u64     M = div64_u64(file_offset, S);

        /*
        G = (L - (M * S)) / T
        H = (L - (M * S)) % T
        */
        u64     LmodS = file_offset - M * S;
        u32     G = div64_u64(LmodS, T);
        u64     H = LmodS - G * T;

        u32     N = div_u64(H, U);

        /* "H - (N * U)" is just "H % U" so it's bound to u32 */
        si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
        si->dev *= layout->mirrors_p1;

        div_u64_rem(file_offset, stripe_unit, &si->unit_off);

        si->obj_offset = si->unit_off + (N * stripe_unit) +
                                  (M * group_depth * stripe_unit);

        si->group_length = T - H;
        si->M = M;
}

static int _add_stripe_unit(struct ore_io_state *ios,  unsigned *cur_pg,
                unsigned pgbase, struct ore_per_dev_state *per_dev,
                int cur_len)
{
        unsigned pg = *cur_pg;
        struct request_queue *q =
                        osd_request_queue(_ios_od(ios, per_dev->dev));
        unsigned len = cur_len;
        int ret;

        if (per_dev->bio == NULL) {
                unsigned pages_in_stripe = ios->layout->group_width *
                                        (ios->layout->stripe_unit / PAGE_SIZE);
                unsigned bio_size = (ios->nr_pages + pages_in_stripe) /
                                                ios->layout->group_width;

                per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
                if (unlikely(!per_dev->bio)) {
                        ORE_DBGMSG("Failed to allocate BIO size=%u\n",
                                     bio_size);
                        ret = -ENOMEM;
                        goto out;
                }
        }

        while (cur_len > 0) {
                unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
                unsigned added_len;

                BUG_ON(ios->nr_pages <= pg);
                cur_len -= pglen;

                added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg],
                                            pglen, pgbase);
                if (unlikely(pglen != added_len)) {
                        ret = -ENOMEM;
                        goto out;
                }
                pgbase = 0;
                ++pg;
        }
        BUG_ON(cur_len);

        per_dev->length += len;
        *cur_pg = pg;
        ret = 0;
out:    /* We fail the complete unit on an error, e.g. we don't advance
         * per_dev->length and cur_pg. This means that we might have a bigger
         * bio than the CDB requested length (per_dev->length). That's fine,
         * only the opposite is fatal.
         */
        return ret;
}

static int _prepare_for_striping(struct ore_io_state *ios)
{
        struct ore_striping_info *si = &ios->si;
        unsigned stripe_unit = ios->layout->stripe_unit;
        unsigned mirrors_p1 = ios->layout->mirrors_p1;
        unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
        unsigned dev = si->dev;
        unsigned first_dev = dev - (dev % devs_in_group);
        unsigned cur_pg = ios->pages_consumed;
        u64 length = ios->length;
        int ret = 0;

        if (!ios->pages) {
                ios->numdevs = ios->layout->mirrors_p1;
                return 0;
        }

        BUG_ON(length > si->group_length);

        while (length) {
                unsigned comp = dev - first_dev;
                struct ore_per_dev_state *per_dev = &ios->per_dev[comp];
                unsigned cur_len, page_off = 0;

                if (!per_dev->length) {
                        per_dev->dev = dev;
                        if (dev < si->dev) {
                                per_dev->offset = si->obj_offset + stripe_unit -
                                                                   si->unit_off;
                                cur_len = stripe_unit;
                        } else if (dev == si->dev) {
                                per_dev->offset = si->obj_offset;
                                cur_len = stripe_unit - si->unit_off;
                                page_off = si->unit_off & ~PAGE_MASK;
                                BUG_ON(page_off && (page_off != ios->pgbase));
                        } else { /* dev > si->dev */
                                per_dev->offset = si->obj_offset - si->unit_off;
                                cur_len = stripe_unit;
                        }
                } else {
                        cur_len = stripe_unit;
                }
                if (cur_len >= length)
                        cur_len = length;

                ret = _add_stripe_unit(ios, &cur_pg, page_off, per_dev,
                                       cur_len);
                if (unlikely(ret))
                        goto out;

                dev += mirrors_p1;
                dev = (dev % devs_in_group) + first_dev;

                length -= cur_len;
        }
out:
        ios->numdevs = devs_in_group;
        ios->pages_consumed = cur_pg;
        if (unlikely(ret)) {
                if (length == ios->length)
                        return ret;
                else
                        ios->length -= length;
        }
        return 0;
}

int ore_create(struct ore_io_state *ios)
{
        int i, ret;

        for (i = 0; i < ios->oc->numdevs; i++) {
                struct osd_request *or;

                or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
                if (unlikely(!or)) {
                        ORE_ERR("%s: osd_start_request failed\n", __func__);
                        ret = -ENOMEM;
                        goto out;
                }
                ios->per_dev[i].or = or;
                ios->numdevs++;

                osd_req_create_object(or, _ios_obj(ios, i));
        }
        ret = ore_io_execute(ios);

out:
        return ret;
}
EXPORT_SYMBOL(ore_create);

int ore_remove(struct ore_io_state *ios)
{
        int i, ret;

        for (i = 0; i < ios->oc->numdevs; i++) {
                struct osd_request *or;

                or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
                if (unlikely(!or)) {
                        ORE_ERR("%s: osd_start_request failed\n", __func__);
                        ret = -ENOMEM;
                        goto out;
                }
                ios->per_dev[i].or = or;
                ios->numdevs++;

                osd_req_remove_object(or, _ios_obj(ios, i));
        }
        ret = ore_io_execute(ios);

out:
        return ret;
}
EXPORT_SYMBOL(ore_remove);

static int _write_mirror(struct ore_io_state *ios, int cur_comp)
{
        struct ore_per_dev_state *master_dev = &ios->per_dev[cur_comp];
        unsigned dev = ios->per_dev[cur_comp].dev;
        unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
        int ret = 0;

        if (ios->pages && !master_dev->length)
                return 0; /* Just an empty slot */

        for (; cur_comp < last_comp; ++cur_comp, ++dev) {
                struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
                struct osd_request *or;

                or = osd_start_request(_ios_od(ios, dev), GFP_KERNEL);
                if (unlikely(!or)) {
                        ORE_ERR("%s: osd_start_request failed\n", __func__);
                        ret = -ENOMEM;
                        goto out;
                }
                per_dev->or = or;

                if (ios->pages) {
                        struct bio *bio;

                        if (per_dev != master_dev) {
                                bio = bio_kmalloc(GFP_KERNEL,
                                                  master_dev->bio->bi_max_vecs);
                                if (unlikely(!bio)) {
                                        ORE_DBGMSG(
                                              "Failed to allocate BIO size=%u\n",
                                              master_dev->bio->bi_max_vecs);
                                        ret = -ENOMEM;
                                        goto out;
                                }

                                __bio_clone(bio, master_dev->bio);
                                bio->bi_bdev = NULL;
                                bio->bi_next = NULL;
                                per_dev->offset = master_dev->offset;
                                per_dev->length = master_dev->length;
                                per_dev->bio =  bio;
                                per_dev->dev = dev;
                        } else {
                                bio = master_dev->bio;
                                /* FIXME: bio_set_dir() */
                                bio->bi_rw |= REQ_WRITE;
                        }

                        osd_req_write(or, _ios_obj(ios, dev), per_dev->offset,
                                      bio, per_dev->length);
                        ORE_DBGMSG("write(0x%llx) offset=0x%llx "
                                      "length=0x%llx dev=%d\n",
                                     _LLU(_ios_obj(ios, dev)->id),
                                     _LLU(per_dev->offset),
                                     _LLU(per_dev->length), dev);
                } else if (ios->kern_buff) {
                        per_dev->offset = ios->si.obj_offset;
                        per_dev->dev = ios->si.dev + dev;

                        /* no cross device without page array */
                        BUG_ON((ios->layout->group_width > 1) &&
                               (ios->si.unit_off + ios->length >
                                ios->layout->stripe_unit));

                        ret = osd_req_write_kern(or, _ios_obj(ios, per_dev->dev),
                                                 per_dev->offset,
                                                 ios->kern_buff, ios->length);
                        if (unlikely(ret))
                                goto out;
                        ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
                                      "length=0x%llx dev=%d\n",
                                     _LLU(_ios_obj(ios, dev)->id),
                                     _LLU(per_dev->offset),
                                     _LLU(ios->length), per_dev->dev);
                } else {
                        osd_req_set_attributes(or, _ios_obj(ios, dev));
                        ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
                                     _LLU(_ios_obj(ios, dev)->id),
                                     ios->out_attr_len, dev);
                }

                if (ios->out_attr)
                        osd_req_add_set_attr_list(or, ios->out_attr,
                                                  ios->out_attr_len);

                if (ios->in_attr)
                        osd_req_add_get_attr_list(or, ios->in_attr,
                                                  ios->in_attr_len);
        }

out:
        return ret;
}

int ore_write(struct ore_io_state *ios)
{
        int i;
        int ret;

        ret = _prepare_for_striping(ios);
        if (unlikely(ret))
                return ret;

        for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
                ret = _write_mirror(ios, i);
                if (unlikely(ret))
                        return ret;
        }

        ret = ore_io_execute(ios);
        return ret;
}
EXPORT_SYMBOL(ore_write);

static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp)
{
        struct osd_request *or;
        struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
        struct osd_obj_id *obj = _ios_obj(ios, cur_comp);
        unsigned first_dev = (unsigned)obj->id;

        if (ios->pages && !per_dev->length)
                return 0; /* Just an empty slot */

        first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1;
        or = osd_start_request(_ios_od(ios, first_dev), GFP_KERNEL);
        if (unlikely(!or)) {
                ORE_ERR("%s: osd_start_request failed\n", __func__);
                return -ENOMEM;
        }
        per_dev->or = or;

        if (ios->pages) {
                osd_req_read(or, obj, per_dev->offset,
                                per_dev->bio, per_dev->length);
                ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
                             " dev=%d\n", _LLU(obj->id),
                             _LLU(per_dev->offset), _LLU(per_dev->length),
                             first_dev);
        } else {
                BUG_ON(ios->kern_buff);

                osd_req_get_attributes(or, obj);
                ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
                              _LLU(obj->id),
                              ios->in_attr_len, first_dev);
        }
        if (ios->out_attr)
                osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len);

        if (ios->in_attr)
                osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len);

        return 0;
}

int ore_read(struct ore_io_state *ios)
{
        int i;
        int ret;

        ret = _prepare_for_striping(ios);
        if (unlikely(ret))
                return ret;

        for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
                ret = _read_mirror(ios, i);
                if (unlikely(ret))
                        return ret;
        }

        ret = ore_io_execute(ios);
        return ret;
}
EXPORT_SYMBOL(ore_read);

int extract_attr_from_ios(struct ore_io_state *ios, struct osd_attr *attr)
{
        struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
        void *iter = NULL;
        int nelem;

        do {
                nelem = 1;
                osd_req_decode_get_attr_list(ios->per_dev[0].or,
                                             &cur_attr, &nelem, &iter);
                if ((cur_attr.attr_page == attr->attr_page) &&
                    (cur_attr.attr_id == attr->attr_id)) {
                        attr->len = cur_attr.len;
                        attr->val_ptr = cur_attr.val_ptr;
                        return 0;
                }
        } while (iter);

        return -EIO;
}
EXPORT_SYMBOL(extract_attr_from_ios);

static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp,
                             struct osd_attr *attr)
{
        int last_comp = cur_comp + ios->layout->mirrors_p1;

        for (; cur_comp < last_comp; ++cur_comp) {
                struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
                struct osd_request *or;

                or = osd_start_request(_ios_od(ios, cur_comp), GFP_KERNEL);
                if (unlikely(!or)) {
                        ORE_ERR("%s: osd_start_request failed\n", __func__);
                        return -ENOMEM;
                }
                per_dev->or = or;

                osd_req_set_attributes(or, _ios_obj(ios, cur_comp));
                osd_req_add_set_attr_list(or, attr, 1);
        }

        return 0;
}

struct _trunc_info {
        struct ore_striping_info si;
        u64 prev_group_obj_off;
        u64 next_group_obj_off;

        unsigned first_group_dev;
        unsigned nex_group_dev;
};

static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
                             struct _trunc_info *ti)
{
        unsigned stripe_unit = layout->stripe_unit;

        ore_calc_stripe_info(layout, file_offset, &ti->si);

        ti->prev_group_obj_off = ti->si.M * stripe_unit;
        ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0;

        ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width);
        ti->nex_group_dev = ti->first_group_dev + layout->group_width;
}

int ore_truncate(struct ore_layout *layout, struct ore_components *oc,
                   u64 size)
{
        struct ore_io_state *ios;
        struct exofs_trunc_attr {
                struct osd_attr attr;
                __be64 newsize;
        } *size_attrs;
        struct _trunc_info ti;
        int i, ret;

        ret = ore_get_io_state(layout, oc, &ios);
        if (unlikely(ret))
                return ret;

        _calc_trunk_info(ios->layout, size, &ti);

        size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs),
                             GFP_KERNEL);
        if (unlikely(!size_attrs)) {
                ret = -ENOMEM;
                goto out;
        }

        ios->numdevs = ios->oc->numdevs;

        for (i = 0; i < ios->numdevs; ++i) {
                struct exofs_trunc_attr *size_attr = &size_attrs[i];
                u64 obj_size;

                if (i < ti.first_group_dev)
                        obj_size = ti.prev_group_obj_off;
                else if (i >= ti.nex_group_dev)
                        obj_size = ti.next_group_obj_off;
                else if (i < ti.si.dev) /* dev within this group */
                        obj_size = ti.si.obj_offset +
                                      ios->layout->stripe_unit - ti.si.unit_off;
                else if (i == ti.si.dev)
                        obj_size = ti.si.obj_offset;
                else /* i > ti.si.dev */
                        obj_size = ti.si.obj_offset - ti.si.unit_off;

                size_attr->newsize = cpu_to_be64(obj_size);
                size_attr->attr = g_attr_logical_length;
                size_attr->attr.val_ptr = &size_attr->newsize;

                ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
                             _LLU(oc->comps->obj.id), _LLU(obj_size), i);
                ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
                                        &size_attr->attr);
                if (unlikely(ret))
                        goto out;
        }
        ret = ore_io_execute(ios);

out:
        kfree(size_attrs);
        ore_put_io_state(ios);
        return ret;
}
EXPORT_SYMBOL(ore_truncate);

const struct osd_attr g_attr_logical_length = ATTR_DEF(
        OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
EXPORT_SYMBOL(g_attr_logical_length);