ore: Make ore_calc_stripe_info EXPORT_SYMBOL
/*
 * Copyright (C) 2005, 2006
 * Avishay Traeger (avishay@gmail.com)
 * Copyright (C) 2008, 2009
 * Boaz Harrosh <bharrosh@panasas.com>
 *
 * This file is part of exofs.
 *
 * exofs is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation.  Since it is based on ext2, and the only
 * valid version of GPL for the Linux kernel is version 2, the only valid
 * version of GPL for exofs is version 2.
 *
 * exofs is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with exofs; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

#include <linux/slab.h>
#include <asm/div64.h>

#include <scsi/osd_ore.h>

#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)

#ifdef CONFIG_EXOFS_DEBUG
#define ORE_DBGMSG(fmt, a...) \
        printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
#else
#define ORE_DBGMSG(fmt, a...) \
        do { if (0) printk(fmt, ##a); } while (0)
#endif

/* u64 has problems with printk; this will cast it to unsigned long long */
#define _LLU(x) (unsigned long long)(x)

#define ORE_DBGMSG2(M...) do {} while (0)
/* #define ORE_DBGMSG2 ORE_DBGMSG */

MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>");
MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
MODULE_LICENSE("GPL");

/* ore_verify_layout does a couple of things:
 * 1. Given a minimum number of needed parameters, fixes up the rest of the
 *    members to be operational for the ORE. The needed parameters are those
 *    that are defined by the pnfs-objects layout STD.
 * 2. Checks whether the current ORE code actually supports these
 *    parameters, for example stripe_unit must be a multiple of the system
 *    PAGE_SIZE, etc.
 * 3. Caches some heavily used calculations that will be needed by users.
 */

enum { BIO_MAX_PAGES_KMALLOC =
                (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),};

int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
{
        u64 stripe_length;

/* FIXME: Only raid0 is supported for now. */
        if (layout->raid_algorithm != PNFS_OSD_RAID_0) {
                ORE_ERR("Only RAID_0 for now\n");
                return -EINVAL;
        }
        if (0 != (layout->stripe_unit & ~PAGE_MASK)) {
                ORE_ERR("Stripe Unit(0x%llx)"
                          " must be a multiple of PAGE_SIZE(0x%lx)\n",
                          _LLU(layout->stripe_unit), PAGE_SIZE);
                return -EINVAL;
        }
        if (layout->group_width) {
                if (!layout->group_depth) {
                        ORE_ERR("group_depth == 0 && group_width != 0\n");
                        return -EINVAL;
                }
                if (total_comps < (layout->group_width * layout->mirrors_p1)) {
                        ORE_ERR("Data Map wrong, "
                                "numdevs=%d < group_width=%d * mirrors=%d\n",
                                total_comps, layout->group_width,
                                layout->mirrors_p1);
                        return -EINVAL;
                }
                layout->group_count = total_comps / layout->mirrors_p1 /
                                                layout->group_width;
        } else {
                if (layout->group_depth) {
                        printk(KERN_NOTICE "Warning: group_depth ignored "
                                "group_width == 0 && group_depth == %lld\n",
                                _LLU(layout->group_depth));
                }
                layout->group_width = total_comps / layout->mirrors_p1;
                layout->group_depth = -1;
                layout->group_count = 1;
        }

        stripe_length = (u64)layout->group_width * layout->stripe_unit;
        if (stripe_length >= (1ULL << 32)) {
                ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n",
                        _LLU(stripe_length));
                return -EINVAL;
        }

        layout->max_io_length =
                (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) *
                                                        layout->group_width;
        return 0;
}
EXPORT_SYMBOL(ore_verify_layout);
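
/* A worked example of the fixup above (hypothetical numbers, not from a
 * real deployment): with total_comps = 8, mirrors_p1 = 2, group_width = 0
 * and stripe_unit = 64K, the else branch computes
 * group_width = 8 / 2 = 4, group_depth = -1 and group_count = 1. Then
 * stripe_length = 4 * 64K = 256K < 1ULL << 32, so the layout is accepted
 * and max_io_length is cached for later IO sizing.
 */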

static u8 *_ios_cred(struct ore_io_state *ios, unsigned index)
{
        return ios->oc->comps[index & ios->oc->single_comp].cred;
}

static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index)
{
        return &ios->oc->comps[index & ios->oc->single_comp].obj;
}

static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
{
        ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n",
                    ios->oc->first_dev, ios->oc->numdevs, index,
                    ios->oc->ods);

        return ore_comp_dev(ios->oc, index);
}

static int  _get_io_state(struct ore_layout *layout,
                          struct ore_components *oc, unsigned numdevs,
                          struct ore_io_state **pios)
{
        struct ore_io_state *ios;

        /* TODO: Maybe use a kmem_cache per sbi of size
         * exofs_io_state_size(layout->s_numdevs)
         */
        ios = kzalloc(ore_io_state_size(numdevs), GFP_KERNEL);
        if (unlikely(!ios)) {
                ORE_DBGMSG("Failed kzalloc bytes=%d\n",
                           ore_io_state_size(numdevs));
                *pios = NULL;
                return -ENOMEM;
        }

        ios->layout = layout;
        ios->oc = oc;
        *pios = ios;
        return 0;
}

/* Allocate an io_state for only a single group of devices.
 *
 * If a user needs to call ore_read/write(), this version must be used
 * because it allocates the extra state needed for striping and raid.
 * The ORE might decide to IO less than @length bytes due to alignments
 * and constraints, as follows:
 * - The IO cannot cross a group boundary.
 * - In raid5/6 the end of the IO must align at the end of a stripe, e.g.
 *   (@offset + @length) % stripe_size == 0. Or the complete range is
 *   within a single stripe.
 * - Memory conditions may only permit a shorter IO. (A user can pass
 *   @length=~0 and check the returned ios->length for max_io_size.)
 *
 * The caller must check the returned ios->length (and/or ios->nr_pages)
 * and re-issue those pages that fall outside of ios->length.
 */
int  ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
                      bool is_reading, u64 offset, u64 length,
                      struct ore_io_state **pios)
{
        struct ore_io_state *ios;
        unsigned numdevs = layout->group_width * layout->mirrors_p1;
        int ret;

        ret = _get_io_state(layout, oc, numdevs, pios);
        if (unlikely(ret))
                return ret;

        ios = *pios;
        ios->reading = is_reading;
        ios->offset = offset;

        if (length) {
                ore_calc_stripe_info(layout, offset, &ios->si);
                ios->length = (length <= ios->si.group_length) ? length :
                                                        ios->si.group_length;
                ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
        }

        return 0;
}
EXPORT_SYMBOL(ore_get_rw_state);
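
/* A minimal caller sketch (hypothetical; error handling and page setup
 * elided) for the short-IO contract described above ore_get_rw_state():
 * loop until the full range has been issued, re-calling with the
 * advanced offset and remaining length.
 *
 *      u64 done = 0;
 *
 *      while (done < length) {
 *              struct ore_io_state *ios;
 *
 *              ore_get_rw_state(layout, oc, true, offset + done,
 *                               length - done, &ios);
 *              ... set ios->pages, call ore_read(ios) ...
 *              done += ios->length; // may be shorter than requested
 *              ore_put_io_state(ios);
 *      }
 */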

/* Allocate an io_state for all the devices in the comps array
 *
 * This version of io_state allocation is used mostly by create/remove
 * and trunc, where we currently need all the devices. The only wasteful
 * bit is the read/write_attributes with no IO. Those sites should
 * be converted to use ore_get_rw_state() with length=0.
 */
int  ore_get_io_state(struct ore_layout *layout, struct ore_components *oc,
                      struct ore_io_state **pios)
{
        return _get_io_state(layout, oc, oc->numdevs, pios);
}
EXPORT_SYMBOL(ore_get_io_state);

void ore_put_io_state(struct ore_io_state *ios)
{
        if (ios) {
                unsigned i;

                for (i = 0; i < ios->numdevs; i++) {
                        struct ore_per_dev_state *per_dev = &ios->per_dev[i];

                        if (per_dev->or)
                                osd_end_request(per_dev->or);
                        if (per_dev->bio)
                                bio_put(per_dev->bio);
                }

                kfree(ios);
        }
}
EXPORT_SYMBOL(ore_put_io_state);

static void _sync_done(struct ore_io_state *ios, void *p)
{
        struct completion *waiting = p;

        complete(waiting);
}

static void _last_io(struct kref *kref)
{
        struct ore_io_state *ios = container_of(
                                        kref, struct ore_io_state, kref);

        ios->done(ios, ios->private);
}

static void _done_io(struct osd_request *or, void *p)
{
        struct ore_io_state *ios = p;

        kref_put(&ios->kref, _last_io);
}

static int ore_io_execute(struct ore_io_state *ios)
{
        DECLARE_COMPLETION_ONSTACK(wait);
        bool sync = (ios->done == NULL);
        int i, ret;

        if (sync) {
                ios->done = _sync_done;
                ios->private = &wait;
        }

        for (i = 0; i < ios->numdevs; i++) {
                struct osd_request *or = ios->per_dev[i].or;
                if (unlikely(!or))
                        continue;

                ret = osd_finalize_request(or, 0, _ios_cred(ios, i), NULL);
                if (unlikely(ret)) {
                        ORE_DBGMSG("Failed to osd_finalize_request() => %d\n",
                                     ret);
                        return ret;
                }
        }

        kref_init(&ios->kref);

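        /* Each request submitted below holds a reference on ios->kref; the
         * initial reference taken by kref_init() is dropped after the loop,
         * so ios->done is called exactly once, when the last in-flight
         * request completes (or immediately, if none were submitted).
         */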
        for (i = 0; i < ios->numdevs; i++) {
                struct osd_request *or = ios->per_dev[i].or;
                if (unlikely(!or))
                        continue;

                kref_get(&ios->kref);
                osd_execute_request_async(or, _done_io, ios);
        }

        kref_put(&ios->kref, _last_io);
        ret = 0;

        if (sync) {
                wait_for_completion(&wait);
                ret = ore_check_io(ios, NULL);
        }
        return ret;
}

static void _clear_bio(struct bio *bio)
{
        struct bio_vec *bv;
        unsigned i;

        __bio_for_each_segment(bv, bio, i, 0) {
                unsigned this_count = bv->bv_len;

                if (likely(PAGE_SIZE == this_count))
                        clear_highpage(bv->bv_page);
                else
                        zero_user(bv->bv_page, bv->bv_offset, this_count);
        }
}

int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error)
{
        enum osd_err_priority accumulated_osd_err = 0;
        int accumulated_lin_err = 0;
        int i;

        for (i = 0; i < ios->numdevs; i++) {
                struct osd_sense_info osi;
                struct ore_per_dev_state *per_dev = &ios->per_dev[i];
                struct osd_request *or = per_dev->or;
                int ret;

                if (unlikely(!or))
                        continue;

                ret = osd_req_decode_sense(or, &osi);
                if (likely(!ret))
                        continue;

                if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
                        /* start read offset past end of file */
                        _clear_bio(per_dev->bio);
                        ORE_DBGMSG("start read offset past end of file "
                                "offset=0x%llx, length=0x%llx\n",
                                _LLU(per_dev->offset),
                                _LLU(per_dev->length));

                        continue; /* we recovered */
                }

                if (on_dev_error) {
                        u64 residual = ios->reading ?
                                        or->in.residual : or->out.residual;
                        u64 offset = (ios->offset + ios->length) - residual;
                        struct ore_dev *od = ios->oc->ods[
                                        per_dev->dev - ios->oc->first_dev];

                        on_dev_error(ios, od, per_dev->dev, osi.osd_err_pri,
                                     offset, residual);
                }
                if (osi.osd_err_pri >= accumulated_osd_err) {
                        accumulated_osd_err = osi.osd_err_pri;
                        accumulated_lin_err = ret;
                }
        }

        return accumulated_lin_err;
}
EXPORT_SYMBOL(ore_check_io);

/*
 * L - logical offset into the file
 *
 * U - The number of bytes in a stripe within a group
 *
 *      U = stripe_unit * group_width
 *
 * T - The number of bytes striped within a group of component objects
 *     (before advancing to the next group)
 *
 *      T = stripe_unit * group_width * group_depth
 *
 * S - The number of bytes striped across all component objects
 *     before the pattern repeats
 *
 *      S = stripe_unit * group_width * group_depth * group_count
 *
 * M - The "major" (i.e., across all components) stripe number
 *
 *      M = L / S
 *
 * G - Counts the groups from the beginning of the major stripe
 *
 *      G = (L - (M * S)) / T   [or (L % S) / T]
 *
 * H - The byte offset within the group
 *
 *      H = (L - (M * S)) % T   [or (L % S) % T]
 *
 * N - The "minor" (i.e., across the group) stripe number
 *
 *      N = H / U
 *
 * C - The component index corresponding to L
 *
 *      C = (H - (N * U)) / stripe_unit + G * group_width
 *      [or (L % U) / stripe_unit + G * group_width]
 *
 * O - The component offset corresponding to L
 *
 *      O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
 */
void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
                          struct ore_striping_info *si)
{
        u32     stripe_unit = layout->stripe_unit;
        u32     group_width = layout->group_width;
        u64     group_depth = layout->group_depth;

        u32     U = stripe_unit * group_width;
        u64     T = U * group_depth;
        u64     S = T * layout->group_count;
        u64     M = div64_u64(file_offset, S);

        /*
        G = (L - (M * S)) / T
        H = (L - (M * S)) % T
        */
        u64     LmodS = file_offset - M * S;
        u32     G = div64_u64(LmodS, T);
        u64     H = LmodS - G * T;

        u32     N = div_u64(H, U);

        /* "H - (N * U)" is just "H % U" so it's bound to u32 */
        si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
        si->dev *= layout->mirrors_p1;

        div_u64_rem(file_offset, stripe_unit, &si->unit_off);

        si->obj_offset = si->unit_off + (N * stripe_unit) +
                                  (M * group_depth * stripe_unit);

        si->group_length = T - H;
        si->M = M;
}
EXPORT_SYMBOL(ore_calc_stripe_info);
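
/* A worked instance of the formulas above (hypothetical layout,
 * mirrors_p1 = 1): stripe_unit = 64K, group_width = 4, group_depth = 2,
 * group_count = 3, so U = 256K, T = 512K, S = 1536K. For L = 1664K:
 *
 *      M = 1664K / 1536K = 1           L % S = 128K
 *      G = 128K / 512K   = 0           H = 128K
 *      N = 128K / 256K   = 0
 *      si->dev          = (128K - 0) / 64K + 0 * 4 = 2
 *      si->unit_off     = 1664K % 64K = 0
 *      si->obj_offset   = 0 + 0 * 64K + 1 * 2 * 64K = 128K
 *      si->group_length = T - H = 384K
 *
 * I.e. the IO starts at byte 128K of component object 2 and can span at
 * most 384K before crossing a group boundary.
 */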

static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
                unsigned pgbase, struct ore_per_dev_state *per_dev,
                int cur_len)
{
        unsigned pg = *cur_pg;
        struct request_queue *q =
                        osd_request_queue(_ios_od(ios, per_dev->dev));
        unsigned len = cur_len;
        int ret;

        if (per_dev->bio == NULL) {
                unsigned pages_in_stripe = ios->layout->group_width *
                                        (ios->layout->stripe_unit / PAGE_SIZE);
                unsigned bio_size = (ios->nr_pages + pages_in_stripe) /
                                                ios->layout->group_width;

                per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
                if (unlikely(!per_dev->bio)) {
                        ORE_DBGMSG("Failed to allocate BIO size=%u\n",
                                     bio_size);
                        ret = -ENOMEM;
                        goto out;
                }
        }

        while (cur_len > 0) {
                unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
                unsigned added_len;

                BUG_ON(ios->nr_pages <= pg);
                cur_len -= pglen;

                added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg],
                                            pglen, pgbase);
                if (unlikely(pglen != added_len)) {
                        ret = -ENOMEM;
                        goto out;
                }
                pgbase = 0;
                ++pg;
        }
        BUG_ON(cur_len);

        per_dev->length += len;
        *cur_pg = pg;
        ret = 0;
out:    /* We fail the complete unit on an error, e.g. don't advance
         * per_dev->length and cur_pg. This means that we might have a bigger
         * bio than the CDB requested length (per_dev->length). That's fine,
         * only the opposite is fatal.
         */
        return ret;
}

static int _prepare_for_striping(struct ore_io_state *ios)
{
        struct ore_striping_info *si = &ios->si;
        unsigned stripe_unit = ios->layout->stripe_unit;
        unsigned mirrors_p1 = ios->layout->mirrors_p1;
        unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
        unsigned dev = si->dev;
        unsigned first_dev = dev - (dev % devs_in_group);
        unsigned cur_pg = ios->pages_consumed;
        u64 length = ios->length;
        int ret = 0;

        if (!ios->pages) {
                ios->numdevs = ios->layout->mirrors_p1;
                return 0;
        }

        BUG_ON(length > si->group_length);

        while (length) {
                unsigned comp = dev - first_dev;
                struct ore_per_dev_state *per_dev = &ios->per_dev[comp];
                unsigned cur_len, page_off = 0;

                if (!per_dev->length) {
                        per_dev->dev = dev;
                        if (dev < si->dev) {
                                per_dev->offset = si->obj_offset + stripe_unit -
                                                                   si->unit_off;
                                cur_len = stripe_unit;
                        } else if (dev == si->dev) {
                                per_dev->offset = si->obj_offset;
                                cur_len = stripe_unit - si->unit_off;
                                page_off = si->unit_off & ~PAGE_MASK;
                                BUG_ON(page_off && (page_off != ios->pgbase));
                        } else { /* dev > si->dev */
                                per_dev->offset = si->obj_offset - si->unit_off;
                                cur_len = stripe_unit;
                        }
                } else {
                        cur_len = stripe_unit;
                }
                if (cur_len >= length)
                        cur_len = length;

                ret = _add_stripe_unit(ios, &cur_pg, page_off, per_dev,
                                       cur_len);
                if (unlikely(ret))
                        goto out;

                dev += mirrors_p1;
                dev = (dev % devs_in_group) + first_dev;

                length -= cur_len;
        }
out:
        ios->numdevs = devs_in_group;
        ios->pages_consumed = cur_pg;
        if (unlikely(ret)) {
                if (length == ios->length)
                        return ret;
                else
                        ios->length -= length;
        }
        return 0;
}
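
/* An illustration of the striping walk above (hypothetical numbers,
 * mirrors_p1 = 1, group_width = 4, stripe_unit = 64K): for an IO of 200K
 * with si->dev = 1 and si->unit_off = 16K, dev 1 gets the first 48K at
 * si->obj_offset, devs 2 and 3 get 64K each at si->obj_offset - 16K,
 * and dev 0 gets the final 24K at si->obj_offset + 64K - 16K (the
 * dev < si->dev branch). The per-device lengths sum back to 200K.
 */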

int ore_create(struct ore_io_state *ios)
{
        int i, ret;

        for (i = 0; i < ios->oc->numdevs; i++) {
                struct osd_request *or;

                or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
                if (unlikely(!or)) {
                        ORE_ERR("%s: osd_start_request failed\n", __func__);
                        ret = -ENOMEM;
                        goto out;
                }
                ios->per_dev[i].or = or;
                ios->numdevs++;

                osd_req_create_object(or, _ios_obj(ios, i));
        }
        ret = ore_io_execute(ios);

out:
        return ret;
}
EXPORT_SYMBOL(ore_create);

int ore_remove(struct ore_io_state *ios)
{
        int i, ret;

        for (i = 0; i < ios->oc->numdevs; i++) {
                struct osd_request *or;

                or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
                if (unlikely(!or)) {
                        ORE_ERR("%s: osd_start_request failed\n", __func__);
                        ret = -ENOMEM;
                        goto out;
                }
                ios->per_dev[i].or = or;
                ios->numdevs++;

                osd_req_remove_object(or, _ios_obj(ios, i));
        }
        ret = ore_io_execute(ios);

out:
        return ret;
}
EXPORT_SYMBOL(ore_remove);

static int _write_mirror(struct ore_io_state *ios, int cur_comp)
{
        struct ore_per_dev_state *master_dev = &ios->per_dev[cur_comp];
        unsigned dev = ios->per_dev[cur_comp].dev;
        unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
        int ret = 0;

        if (ios->pages && !master_dev->length)
                return 0; /* Just an empty slot */

        for (; cur_comp < last_comp; ++cur_comp, ++dev) {
                struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
                struct osd_request *or;

                or = osd_start_request(_ios_od(ios, dev), GFP_KERNEL);
                if (unlikely(!or)) {
                        ORE_ERR("%s: osd_start_request failed\n", __func__);
                        ret = -ENOMEM;
                        goto out;
                }
                per_dev->or = or;

                if (ios->pages) {
                        struct bio *bio;

                        if (per_dev != master_dev) {
                                bio = bio_kmalloc(GFP_KERNEL,
                                                  master_dev->bio->bi_max_vecs);
                                if (unlikely(!bio)) {
                                        ORE_DBGMSG(
                                              "Failed to allocate BIO size=%u\n",
                                              master_dev->bio->bi_max_vecs);
                                        ret = -ENOMEM;
                                        goto out;
                                }

                                __bio_clone(bio, master_dev->bio);
                                bio->bi_bdev = NULL;
                                bio->bi_next = NULL;
                                per_dev->offset = master_dev->offset;
                                per_dev->length = master_dev->length;
                                per_dev->bio = bio;
                                per_dev->dev = dev;
                        } else {
                                bio = master_dev->bio;
                                /* FIXME: bio_set_dir() */
                                bio->bi_rw |= REQ_WRITE;
                        }

                        osd_req_write(or, _ios_obj(ios, dev), per_dev->offset,
                                      bio, per_dev->length);
                        ORE_DBGMSG("write(0x%llx) offset=0x%llx "
                                      "length=0x%llx dev=%d\n",
                                     _LLU(_ios_obj(ios, dev)->id),
                                     _LLU(per_dev->offset),
                                     _LLU(per_dev->length), dev);
                } else if (ios->kern_buff) {
                        per_dev->offset = ios->si.obj_offset;
                        per_dev->dev = ios->si.dev + dev;

                        /* no cross device without page array */
                        BUG_ON((ios->layout->group_width > 1) &&
                               (ios->si.unit_off + ios->length >
                                ios->layout->stripe_unit));

                        ret = osd_req_write_kern(or, _ios_obj(ios, per_dev->dev),
                                                 per_dev->offset,
                                                 ios->kern_buff, ios->length);
                        if (unlikely(ret))
                                goto out;
                        ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
                                      "length=0x%llx dev=%d\n",
                                     _LLU(_ios_obj(ios, dev)->id),
                                     _LLU(per_dev->offset),
                                     _LLU(ios->length), per_dev->dev);
                } else {
                        osd_req_set_attributes(or, _ios_obj(ios, dev));
                        ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
                                     _LLU(_ios_obj(ios, dev)->id),
                                     ios->out_attr_len, dev);
                }

                if (ios->out_attr)
                        osd_req_add_set_attr_list(or, ios->out_attr,
                                                  ios->out_attr_len);

                if (ios->in_attr)
                        osd_req_add_get_attr_list(or, ios->in_attr,
                                                  ios->in_attr_len);
        }

out:
        return ret;
}

int ore_write(struct ore_io_state *ios)
{
        int i;
        int ret;

        ret = _prepare_for_striping(ios);
        if (unlikely(ret))
                return ret;

        for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
                ret = _write_mirror(ios, i);
                if (unlikely(ret))
                        return ret;
        }

        ret = ore_io_execute(ios);
        return ret;
}
EXPORT_SYMBOL(ore_write);
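
/* A minimal end-to-end write sketch (hypothetical; assumes a layout that
 * passed ore_verify_layout() and an IO that fits in one group; error
 * handling elided). With ios->done left NULL, execution is synchronous:
 *
 *      struct ore_io_state *ios;
 *
 *      ore_get_rw_state(layout, oc, false, offset, length, &ios);
 *      ios->pages = pages;
 *      ore_write(ios);
 *      ore_put_io_state(ios);
 */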

static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp)
{
        struct osd_request *or;
        struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
        struct osd_obj_id *obj = _ios_obj(ios, cur_comp);
        unsigned first_dev = (unsigned)obj->id;

        if (ios->pages && !per_dev->length)
                return 0; /* Just an empty slot */

        first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1;
        or = osd_start_request(_ios_od(ios, first_dev), GFP_KERNEL);
        if (unlikely(!or)) {
                ORE_ERR("%s: osd_start_request failed\n", __func__);
                return -ENOMEM;
        }
        per_dev->or = or;

        if (ios->pages) {
                osd_req_read(or, obj, per_dev->offset,
                                per_dev->bio, per_dev->length);
                ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
                             " dev=%d\n", _LLU(obj->id),
                             _LLU(per_dev->offset), _LLU(per_dev->length),
                             first_dev);
        } else {
                BUG_ON(ios->kern_buff);

                osd_req_get_attributes(or, obj);
                ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
                              _LLU(obj->id),
                              ios->in_attr_len, first_dev);
        }
        if (ios->out_attr)
                osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len);

        if (ios->in_attr)
                osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len);

        return 0;
}

int ore_read(struct ore_io_state *ios)
{
        int i;
        int ret;

        ret = _prepare_for_striping(ios);
        if (unlikely(ret))
                return ret;

        for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
                ret = _read_mirror(ios, i);
                if (unlikely(ret))
                        return ret;
        }

        ret = ore_io_execute(ios);
        return ret;
}
EXPORT_SYMBOL(ore_read);

int extract_attr_from_ios(struct ore_io_state *ios, struct osd_attr *attr)
{
        struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
        void *iter = NULL;
        int nelem;

        do {
                nelem = 1;
                osd_req_decode_get_attr_list(ios->per_dev[0].or,
                                             &cur_attr, &nelem, &iter);
                if ((cur_attr.attr_page == attr->attr_page) &&
                    (cur_attr.attr_id == attr->attr_id)) {
                        attr->len = cur_attr.len;
                        attr->val_ptr = cur_attr.val_ptr;
                        return 0;
                }
        } while (iter);

        return -EIO;
}
EXPORT_SYMBOL(extract_attr_from_ios);
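
/* A usage sketch (hypothetical): after an ore_read() issued with
 * ios->in_attr/in_attr_len requesting attributes, a single attribute can
 * be fished out of the decoded list, e.g. the object logical length:
 *
 *      struct osd_attr attr = g_attr_logical_length;
 *
 *      if (!extract_attr_from_ios(ios, &attr))
 *              obj_size = get_unaligned_be64(attr.val_ptr);
 */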

static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp,
                             struct osd_attr *attr)
{
        int last_comp = cur_comp + ios->layout->mirrors_p1;

        for (; cur_comp < last_comp; ++cur_comp) {
                struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
                struct osd_request *or;

                or = osd_start_request(_ios_od(ios, cur_comp), GFP_KERNEL);
                if (unlikely(!or)) {
                        ORE_ERR("%s: osd_start_request failed\n", __func__);
                        return -ENOMEM;
                }
                per_dev->or = or;

                osd_req_set_attributes(or, _ios_obj(ios, cur_comp));
                osd_req_add_set_attr_list(or, attr, 1);
        }

        return 0;
}

struct _trunc_info {
        struct ore_striping_info si;
        u64 prev_group_obj_off;
        u64 next_group_obj_off;

        unsigned first_group_dev;
        unsigned next_group_dev;
};

static void _calc_trunc_info(struct ore_layout *layout, u64 file_offset,
                             struct _trunc_info *ti)
{
        unsigned stripe_unit = layout->stripe_unit;

        ore_calc_stripe_info(layout, file_offset, &ti->si);

        ti->prev_group_obj_off = ti->si.M * stripe_unit;
        ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0;

        ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width);
        ti->next_group_dev = ti->first_group_dev + layout->group_width;
}

int ore_truncate(struct ore_layout *layout, struct ore_components *oc,
                   u64 size)
{
        struct ore_io_state *ios;
        struct exofs_trunc_attr {
                struct osd_attr attr;
                __be64 newsize;
        } *size_attrs;
        struct _trunc_info ti;
        int i, ret;

        ret = ore_get_io_state(layout, oc, &ios);
        if (unlikely(ret))
                return ret;

        _calc_trunc_info(ios->layout, size, &ti);

        size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs),
                             GFP_KERNEL);
        if (unlikely(!size_attrs)) {
                ret = -ENOMEM;
                goto out;
        }

        ios->numdevs = ios->oc->numdevs;

        for (i = 0; i < ios->numdevs; ++i) {
                struct exofs_trunc_attr *size_attr = &size_attrs[i];
                u64 obj_size;

                if (i < ti.first_group_dev)
                        obj_size = ti.prev_group_obj_off;
                else if (i >= ti.next_group_dev)
                        obj_size = ti.next_group_obj_off;
                else if (i < ti.si.dev) /* dev within this group */
                        obj_size = ti.si.obj_offset +
                                      ios->layout->stripe_unit - ti.si.unit_off;
                else if (i == ti.si.dev)
                        obj_size = ti.si.obj_offset;
                else /* i > ti.si.dev */
                        obj_size = ti.si.obj_offset - ti.si.unit_off;

                size_attr->newsize = cpu_to_be64(obj_size);
                size_attr->attr = g_attr_logical_length;
                size_attr->attr.val_ptr = &size_attr->newsize;

                ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
                             _LLU(oc->comps->obj.id), _LLU(obj_size), i);
                ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
                                        &size_attr->attr);
                if (unlikely(ret))
                        goto out;
        }
        ret = ore_io_execute(ios);

out:
        kfree(size_attrs);
        ore_put_io_state(ios);
        return ret;
}
EXPORT_SYMBOL(ore_truncate);
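
/* A worked truncate example (hypothetical single-group layout:
 * group_width = 4, group_depth = 4, mirrors_p1 = 1, stripe_unit = 64K):
 * truncating to size = 208K yields ti.si.dev = 3, ti.si.unit_off = 16K
 * and ti.si.obj_offset = 16K. Devices 0..2 are set to 64K (one full
 * stripe unit) and device 3 to 16K, which sums back to
 * 3 * 64K + 16K = 208K of logical file data.
 */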

const struct osd_attr g_attr_logical_length = ATTR_DEF(
        OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
EXPORT_SYMBOL(g_attr_logical_length);