btrfs: quasi-round-robin for chunk allocation

fs/btrfs/volumes.c (pandora-kernel.git)
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"

static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);

#define map_lookup_size(n) (sizeof(struct map_lookup) + \
                            (sizeof(struct btrfs_bio_stripe) * (n)))

static DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);

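/*
 * fs_uuids tracks every btrfs filesystem seen by a device scan, one
 * btrfs_fs_devices entry per filesystem UUID; uuid_mutex protects the
 * list.  The helpers below expose that lock to callers, and
 * lock_chunks()/unlock_chunks() guard a single fs's chunk allocation.
 */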
void btrfs_lock_volumes(void)
{
        mutex_lock(&uuid_mutex);
}

void btrfs_unlock_volumes(void)
{
        mutex_unlock(&uuid_mutex);
}

static void lock_chunks(struct btrfs_root *root)
{
        mutex_lock(&root->fs_info->chunk_mutex);
}

static void unlock_chunks(struct btrfs_root *root)
{
        mutex_unlock(&root->fs_info->chunk_mutex);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;
        WARN_ON(fs_devices->opened);
        while (!list_empty(&fs_devices->devices)) {
                device = list_entry(fs_devices->devices.next,
                                    struct btrfs_device, dev_list);
                list_del(&device->dev_list);
                kfree(device->name);
                kfree(device);
        }
        kfree(fs_devices);
}

int btrfs_cleanup_fs_uuids(void)
{
        struct btrfs_fs_devices *fs_devices;

        while (!list_empty(&fs_uuids)) {
                fs_devices = list_entry(fs_uuids.next,
                                        struct btrfs_fs_devices, list);
                list_del(&fs_devices->list);
                free_fs_devices(fs_devices);
        }
        return 0;
}

static noinline struct btrfs_device *__find_device(struct list_head *head,
                                                   u64 devid, u8 *uuid)
{
        struct btrfs_device *dev;

        list_for_each_entry(dev, head, dev_list) {
                if (dev->devid == devid &&
                    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
                        return dev;
                }
        }
        return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
        struct btrfs_fs_devices *fs_devices;

        list_for_each_entry(fs_devices, &fs_uuids, list) {
                if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
                        return fs_devices;
        }
        return NULL;
}

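/*
 * put a run of bios (head..tail) back on the front of the pending list;
 * any bios queued while we were working without the lock stay chained
 * behind our tail.
 */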
static void requeue_list(struct btrfs_pending_bios *pending_bios,
                         struct bio *head, struct bio *tail)
{
        struct bio *old_head;

        old_head = pending_bios->head;
        pending_bios->head = head;
        if (pending_bios->tail)
                tail->bi_next = old_head;
        else
                pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline int run_scheduled_bios(struct btrfs_device *device)
{
        struct bio *pending;
        struct backing_dev_info *bdi;
        struct btrfs_fs_info *fs_info;
        struct btrfs_pending_bios *pending_bios;
        struct bio *tail;
        struct bio *cur;
        int again = 0;
        unsigned long num_run;
        unsigned long num_sync_run;
        unsigned long batch_run = 0;
        unsigned long limit;
        unsigned long last_waited = 0;
        int force_reg = 0;

        bdi = blk_get_backing_dev_info(device->bdev);
        fs_info = device->dev_root->fs_info;
        limit = btrfs_async_submit_limit(fs_info);
        limit = limit * 2 / 3;

        /* we want to make sure that every time we switch from the sync
         * list to the normal list, we unplug
         */
        num_sync_run = 0;

loop:
        spin_lock(&device->io_lock);

loop_lock:
        num_run = 0;

        /* take all the bios off the list at once and process them
         * later on (without the lock held).  But, remember the
         * tail and other pointers so the bios can be properly reinserted
         * into the list if we hit congestion
         */
        if (!force_reg && device->pending_sync_bios.head) {
                pending_bios = &device->pending_sync_bios;
                force_reg = 1;
        } else {
                pending_bios = &device->pending_bios;
                force_reg = 0;
        }

        pending = pending_bios->head;
        tail = pending_bios->tail;
        WARN_ON(pending && !tail);

        /*
         * if pending was null this time around, no bios need processing
         * at all and we can stop.  Otherwise it'll loop back up again
         * and do an additional check so no bios are missed.
         *
         * device->running_pending is used to synchronize with the
         * schedule_bio code.
         */
        if (device->pending_sync_bios.head == NULL &&
            device->pending_bios.head == NULL) {
                again = 0;
                device->running_pending = 0;
        } else {
                again = 1;
                device->running_pending = 1;
        }

        pending_bios->head = NULL;
        pending_bios->tail = NULL;

        spin_unlock(&device->io_lock);

        /*
         * if we're doing the regular priority list, make sure we unplug
         * for any high prio bios we've sent down
         */
        if (pending_bios == &device->pending_bios && num_sync_run > 0) {
                num_sync_run = 0;
                blk_run_backing_dev(bdi, NULL);
        }

        while (pending) {

                rmb();
                /* we want to work on both lists, but do more bios on the
                 * sync list than the regular list
                 */
                if ((num_run > 32 &&
                    pending_bios != &device->pending_sync_bios &&
                    device->pending_sync_bios.head) ||
                   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
                    device->pending_bios.head)) {
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        goto loop_lock;
                }

                cur = pending;
                pending = pending->bi_next;
                cur->bi_next = NULL;
                atomic_dec(&fs_info->nr_async_bios);

                if (atomic_read(&fs_info->nr_async_bios) < limit &&
                    waitqueue_active(&fs_info->async_submit_wait))
                        wake_up(&fs_info->async_submit_wait);

                BUG_ON(atomic_read(&cur->bi_cnt) == 0);

                if (cur->bi_rw & REQ_SYNC)
                        num_sync_run++;

                submit_bio(cur->bi_rw, cur);
                num_run++;
                batch_run++;
                if (need_resched()) {
                        if (num_sync_run) {
                                blk_run_backing_dev(bdi, NULL);
                                num_sync_run = 0;
                        }
                        cond_resched();
                }

                /*
                 * we made progress, there is more work to do and the bdi
                 * is now congested.  Back off and let other work structs
                 * run instead
                 */
                if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
                    fs_info->fs_devices->open_devices > 1) {
                        struct io_context *ioc;

                        ioc = current->io_context;

                        /*
                         * the main goal here is that we don't want to
                         * block if we're going to be able to submit
                         * more requests without blocking.
                         *
                         * This code does two great things, it pokes into
                         * the elevator code from a filesystem _and_
                         * it makes assumptions about how batching works.
                         */
                        if (ioc && ioc->nr_batch_requests > 0 &&
                            time_before(jiffies, ioc->last_waited + HZ/50UL) &&
                            (last_waited == 0 ||
                             ioc->last_waited == last_waited)) {
                                /*
                                 * we want to go through our batch of
                                 * requests and stop.  So, we copy out
                                 * the ioc->last_waited time and test
                                 * against it before looping
                                 */
                                last_waited = ioc->last_waited;
                                if (need_resched()) {
                                        if (num_sync_run) {
                                                blk_run_backing_dev(bdi, NULL);
                                                num_sync_run = 0;
                                        }
                                        cond_resched();
                                }
                                continue;
                        }
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        device->running_pending = 1;

                        spin_unlock(&device->io_lock);
                        btrfs_requeue_work(&device->work);
                        goto done;
                }
        }

        if (num_sync_run) {
                num_sync_run = 0;
                blk_run_backing_dev(bdi, NULL);
        }
        /*
         * IO has already been through a long path to get here.  Checksumming,
         * async helper threads, perhaps compression.  We've done a pretty
         * good job of collecting a batch of IO and should just unplug
         * the device right away.
         *
         * This will help anyone who is waiting on the IO, they might have
         * already unplugged, but managed to do so before the bio they
         * cared about found its way down here.
         */
        blk_run_backing_dev(bdi, NULL);

        cond_resched();
        if (again)
                goto loop;

        spin_lock(&device->io_lock);
        if (device->pending_bios.head || device->pending_sync_bios.head)
                goto loop_lock;
        spin_unlock(&device->io_lock);

done:
        return 0;
}

static void pending_bios_fn(struct btrfs_work *work)
{
        struct btrfs_device *device;

        device = container_of(work, struct btrfs_device, work);
        run_scheduled_bios(device);
}

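/*
 * note a scanned device: first sight of an fsid allocates a new
 * btrfs_fs_devices, first sight of a devid within it allocates a new
 * btrfs_device, and a rescan of a known device just refreshes its path
 * and the latest devid/generation pair.
 */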
static noinline int device_list_add(const char *path,
                           struct btrfs_super_block *disk_super,
                           u64 devid, struct btrfs_fs_devices **fs_devices_ret)
{
        struct btrfs_device *device;
        struct btrfs_fs_devices *fs_devices;
        u64 found_transid = btrfs_super_generation(disk_super);
        char *name;

        fs_devices = find_fsid(disk_super->fsid);
        if (!fs_devices) {
                fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
                if (!fs_devices)
                        return -ENOMEM;
                INIT_LIST_HEAD(&fs_devices->devices);
                INIT_LIST_HEAD(&fs_devices->alloc_list);
                list_add(&fs_devices->list, &fs_uuids);
                memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
                fs_devices->latest_devid = devid;
                fs_devices->latest_trans = found_transid;
                mutex_init(&fs_devices->device_list_mutex);
                device = NULL;
        } else {
                device = __find_device(&fs_devices->devices, devid,
                                       disk_super->dev_item.uuid);
        }
        if (!device) {
                if (fs_devices->opened)
                        return -EBUSY;

                device = kzalloc(sizeof(*device), GFP_NOFS);
                if (!device) {
                        /* we can safely leave the fs_devices entry around */
                        return -ENOMEM;
                }
                device->devid = devid;
                device->work.func = pending_bios_fn;
                memcpy(device->uuid, disk_super->dev_item.uuid,
                       BTRFS_UUID_SIZE);
                spin_lock_init(&device->io_lock);
                device->name = kstrdup(path, GFP_NOFS);
                if (!device->name) {
                        kfree(device);
                        return -ENOMEM;
                }
                INIT_LIST_HEAD(&device->dev_alloc_list);

                mutex_lock(&fs_devices->device_list_mutex);
                list_add(&device->dev_list, &fs_devices->devices);
                mutex_unlock(&fs_devices->device_list_mutex);

                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        } else if (!device->name || strcmp(device->name, path)) {
                name = kstrdup(path, GFP_NOFS);
                if (!name)
                        return -ENOMEM;
                kfree(device->name);
                device->name = name;
                if (device->missing) {
                        fs_devices->missing_devices--;
                        device->missing = 0;
                }
        }

        if (found_transid > fs_devices->latest_trans) {
                fs_devices->latest_devid = devid;
                fs_devices->latest_trans = found_transid;
        }
        *fs_devices_ret = fs_devices;
        return 0;
}

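/*
 * deep-copy an fs_devices and its device list, names included.  The
 * clone holds no open block devices; btrfs_prepare_sprout() uses it to
 * keep the original scanned set intact while rewiring the mounted one.
 */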
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
        struct btrfs_fs_devices *fs_devices;
        struct btrfs_device *device;
        struct btrfs_device *orig_dev;

        fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
        if (!fs_devices)
                return ERR_PTR(-ENOMEM);

        INIT_LIST_HEAD(&fs_devices->devices);
        INIT_LIST_HEAD(&fs_devices->alloc_list);
        INIT_LIST_HEAD(&fs_devices->list);
        mutex_init(&fs_devices->device_list_mutex);
        fs_devices->latest_devid = orig->latest_devid;
        fs_devices->latest_trans = orig->latest_trans;
        memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));

        mutex_lock(&orig->device_list_mutex);
        list_for_each_entry(orig_dev, &orig->devices, dev_list) {
                device = kzalloc(sizeof(*device), GFP_NOFS);
                if (!device)
                        goto error;

                device->name = kstrdup(orig_dev->name, GFP_NOFS);
                if (!device->name) {
                        kfree(device);
                        goto error;
                }

                device->devid = orig_dev->devid;
                device->work.func = pending_bios_fn;
                memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
                spin_lock_init(&device->io_lock);
                INIT_LIST_HEAD(&device->dev_list);
                INIT_LIST_HEAD(&device->dev_alloc_list);

                list_add(&device->dev_list, &fs_devices->devices);
                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        }
        mutex_unlock(&orig->device_list_mutex);
        return fs_devices;
error:
        mutex_unlock(&orig->device_list_mutex);
        free_fs_devices(fs_devices);
        return ERR_PTR(-ENOMEM);
}

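/*
 * after the fs is mounted, drop any scanned device that the metadata
 * does not reference: close its bdev, unlink the entry and free it.
 * Seed device sets chained off fs_devices->seed get the same treatment.
 */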
int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device, *next;

        mutex_lock(&uuid_mutex);
again:
        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
                if (device->in_fs_metadata)
                        continue;

                if (device->bdev) {
                        blkdev_put(device->bdev, device->mode);
                        device->bdev = NULL;
                        fs_devices->open_devices--;
                }
                if (device->writeable) {
                        list_del_init(&device->dev_alloc_list);
                        device->writeable = 0;
                        fs_devices->rw_devices--;
                }
                list_del_init(&device->dev_list);
                fs_devices->num_devices--;
                kfree(device->name);
                kfree(device);
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        if (fs_devices->seed) {
                fs_devices = fs_devices->seed;
                goto again;
        }

        mutex_unlock(&uuid_mutex);
        return 0;
}

static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;

        if (--fs_devices->opened > 0)
                return 0;

        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                if (device->bdev) {
                        blkdev_put(device->bdev, device->mode);
                        fs_devices->open_devices--;
                }
                if (device->writeable) {
                        list_del_init(&device->dev_alloc_list);
                        fs_devices->rw_devices--;
                }

                device->bdev = NULL;
                device->writeable = 0;
                device->in_fs_metadata = 0;
        }
        WARN_ON(fs_devices->open_devices);
        WARN_ON(fs_devices->rw_devices);
        fs_devices->opened = 0;
        fs_devices->seeding = 0;

        return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_fs_devices *seed_devices = NULL;
        int ret;

        mutex_lock(&uuid_mutex);
        ret = __btrfs_close_devices(fs_devices);
        if (!fs_devices->opened) {
                seed_devices = fs_devices->seed;
                fs_devices->seed = NULL;
        }
        mutex_unlock(&uuid_mutex);

        while (seed_devices) {
                fs_devices = seed_devices;
                seed_devices = fs_devices->seed;
                __btrfs_close_devices(fs_devices);
                free_fs_devices(fs_devices);
        }
        return ret;
}

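/*
 * open every scanned device of the set, verify each one's superblock
 * still matches the devid/uuid we scanned, and remember the device with
 * the highest generation as latest_bdev.  A set where no device could
 * be opened fails with -EIO.
 */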
static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                                fmode_t flags, void *holder)
{
        struct block_device *bdev;
        struct list_head *head = &fs_devices->devices;
        struct btrfs_device *device;
        struct block_device *latest_bdev = NULL;
        struct buffer_head *bh;
        struct btrfs_super_block *disk_super;
        u64 latest_devid = 0;
        u64 latest_transid = 0;
        u64 devid;
        int seeding = 1;
        int ret = 0;

        flags |= FMODE_EXCL;

        list_for_each_entry(device, head, dev_list) {
                if (device->bdev)
                        continue;
                if (!device->name)
                        continue;

                bdev = blkdev_get_by_path(device->name, flags, holder);
                if (IS_ERR(bdev)) {
                        printk(KERN_INFO "open %s failed\n", device->name);
                        goto error;
                }
                set_blocksize(bdev, 4096);

                bh = btrfs_read_dev_super(bdev);
                if (!bh) {
                        ret = -EINVAL;
                        goto error_close;
                }

                disk_super = (struct btrfs_super_block *)bh->b_data;
                devid = btrfs_stack_device_id(&disk_super->dev_item);
                if (devid != device->devid)
                        goto error_brelse;

                if (memcmp(device->uuid, disk_super->dev_item.uuid,
                           BTRFS_UUID_SIZE))
                        goto error_brelse;

                device->generation = btrfs_super_generation(disk_super);
                if (!latest_transid || device->generation > latest_transid) {
                        latest_devid = devid;
                        latest_transid = device->generation;
                        latest_bdev = bdev;
                }

                if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
                        device->writeable = 0;
                } else {
                        device->writeable = !bdev_read_only(bdev);
                        seeding = 0;
                }

                device->bdev = bdev;
                device->in_fs_metadata = 0;
                device->mode = flags;

                if (!blk_queue_nonrot(bdev_get_queue(bdev)))
                        fs_devices->rotating = 1;

                fs_devices->open_devices++;
                if (device->writeable) {
                        fs_devices->rw_devices++;
                        list_add(&device->dev_alloc_list,
                                 &fs_devices->alloc_list);
                }
                continue;

error_brelse:
                brelse(bh);
error_close:
                blkdev_put(bdev, flags);
error:
                continue;
        }
        if (fs_devices->open_devices == 0) {
                ret = -EIO;
                goto out;
        }
        fs_devices->seeding = seeding;
        fs_devices->opened = 1;
        fs_devices->latest_bdev = latest_bdev;
        fs_devices->latest_devid = latest_devid;
        fs_devices->latest_trans = latest_transid;
        fs_devices->total_rw_bytes = 0;
out:
        return ret;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                       fmode_t flags, void *holder)
{
        int ret;

        mutex_lock(&uuid_mutex);
        if (fs_devices->opened) {
                fs_devices->opened++;
                ret = 0;
        } else {
                ret = __btrfs_open_devices(fs_devices, flags, holder);
        }
        mutex_unlock(&uuid_mutex);
        return ret;
}

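/*
 * probe a single path for a btrfs superblock and, when one is found,
 * register the device via device_list_add(); the block device is held
 * open only long enough to read the super.
 */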
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                          struct btrfs_fs_devices **fs_devices_ret)
{
        struct btrfs_super_block *disk_super;
        struct block_device *bdev;
        struct buffer_head *bh;
        int ret;
        u64 devid;
        u64 transid;

        mutex_lock(&uuid_mutex);

        flags |= FMODE_EXCL;
        bdev = blkdev_get_by_path(path, flags, holder);

        if (IS_ERR(bdev)) {
                ret = PTR_ERR(bdev);
                goto error;
        }

        ret = set_blocksize(bdev, 4096);
        if (ret)
                goto error_close;
        bh = btrfs_read_dev_super(bdev);
        if (!bh) {
                ret = -EINVAL;
                goto error_close;
        }
        disk_super = (struct btrfs_super_block *)bh->b_data;
        devid = btrfs_stack_device_id(&disk_super->dev_item);
        transid = btrfs_super_generation(disk_super);
        if (disk_super->label[0]) {
                printk(KERN_INFO "device label %s ", disk_super->label);
        } else {
                /* FIXME: make a real uuid parser */
                printk(KERN_INFO "device fsid %llx-%llx ",
                       *(unsigned long long *)disk_super->fsid,
                       *(unsigned long long *)(disk_super->fsid + 8));
        }
        printk(KERN_CONT "devid %llu transid %llu %s\n",
               (unsigned long long)devid, (unsigned long long)transid, path);
        ret = device_list_add(path, disk_super, devid, fs_devices_ret);

        brelse(bh);
error_close:
        blkdev_put(bdev, flags);
error:
        mutex_unlock(&uuid_mutex);
        return ret;
}

/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
                                   u64 end, u64 *length)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 extent_end;
        int ret;
        int slot;
        struct extent_buffer *l;

        *length = 0;

        if (start >= device->total_bytes)
                return 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->reada = 2;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

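        /*
         * walk the dev extents and add up how much of [start, end] each
         * one covers; the cases below clip an extent that straddles
         * either boundary, falls fully inside, or spans the whole range.
         */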
        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (key.offset <= start && extent_end > end) {
                        *length = end - start + 1;
                        break;
                } else if (key.offset <= start && extent_end > start) {
                        *length += extent_end - start;
                } else if (key.offset > start && extent_end <= end) {
                        *length += extent_end - key.offset;
                } else if (key.offset > start && key.offset <= end) {
                        *length += end - key.offset + 1;
                        break;
                } else if (key.offset > end) {
                        break;
                }

next:
                path->slots[0]++;
        }
        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

/*
 * find_free_dev_extent - find free space in the specified device
 * @trans:      transaction handle
 * @device:     the device which we search the free space in
 * @num_bytes:  the size of the free space that we need
 * @start:      store the start of the free space
 * @len:        the size of the free space that we find, or the size of
 *              the max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find it. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 */
int find_free_dev_extent(struct btrfs_trans_handle *trans,
                         struct btrfs_device *device, u64 num_bytes,
                         u64 *start, u64 *len)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 hole_size;
        u64 max_hole_start;
        u64 max_hole_size;
        u64 extent_end;
        u64 search_start;
        u64 search_end = device->total_bytes;
        int ret;
        int slot;
        struct extent_buffer *l;

        /* FIXME use last free of some kind */

        /* we don't want to overwrite the superblock on the drive,
         * so we make sure to start at an offset of at least 1MB
         */
        search_start = max(root->fs_info->alloc_start, 1024ull * 1024);

        max_hole_start = search_start;
        max_hole_size = 0;

        if (search_start >= search_end) {
                ret = -ENOSPC;
                goto error;
        }

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto error;
        }
        path->reada = 2;

        key.objectid = device->devid;
        key.offset = search_start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                if (key.offset > search_start) {
                        hole_size = key.offset - search_start;

                        if (hole_size > max_hole_size) {
                                max_hole_start = search_start;
                                max_hole_size = hole_size;
                        }

                        /*
                         * If this free space is greater than what we need,
                         * it must be the max free space that we have found
                         * until now, so max_hole_start must point to the start
                         * of this free space and the length of this free space
                         * is stored in max_hole_size. Thus, we return
                         * max_hole_start and max_hole_size and go back to the
                         * caller.
                         */
                        if (hole_size >= num_bytes) {
                                ret = 0;
                                goto out;
                        }
                }

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (extent_end > search_start)
                        search_start = extent_end;
next:
                path->slots[0]++;
                cond_resched();
        }

        hole_size = search_end - search_start;
        if (hole_size > max_hole_size) {
                max_hole_start = search_start;
                max_hole_size = hole_size;
        }

        /* See above. */
        if (hole_size < num_bytes)
                ret = -ENOSPC;
        else
                ret = 0;

out:
        btrfs_free_path(path);
error:
        *start = max_hole_start;
        if (len)
                *len = max_hole_size;
        return ret;
}

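/*
 * remove the dev extent item that covers @start on @device and subtract
 * its length from the device's bytes_used accounting.
 */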
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
                          struct btrfs_device *device,
                          u64 start)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct extent_buffer *leaf = NULL;
        struct btrfs_dev_extent *extent = NULL;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid,
                                          BTRFS_DEV_EXTENT_KEY);
                BUG_ON(ret);
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                extent = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_dev_extent);
                BUG_ON(found_key.offset > start || found_key.offset +
                       btrfs_dev_extent_length(leaf, extent) < start);
                ret = 0;
        } else if (ret == 0) {
                leaf = path->nodes[0];
                extent = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_dev_extent);
        }
        BUG_ON(ret);

        if (device->bytes_used > 0)
                device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
        ret = btrfs_del_item(trans, root, path);
        BUG_ON(ret);

        btrfs_free_path(path);
        return ret;
}

int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
                           struct btrfs_device *device,
                           u64 chunk_tree, u64 chunk_objectid,
                           u64 chunk_offset, u64 start, u64 num_bytes)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *extent;
        struct extent_buffer *leaf;
        struct btrfs_key key;

        WARN_ON(!device->in_fs_metadata);
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;
        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      sizeof(*extent));
        BUG_ON(ret);

        leaf = path->nodes[0];
        extent = btrfs_item_ptr(leaf, path->slots[0],
                                struct btrfs_dev_extent);
        btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
        btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
        btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

        write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
                    (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
                    BTRFS_UUID_SIZE);

        btrfs_set_dev_extent_length(leaf, extent, num_bytes);
        btrfs_mark_buffer_dirty(leaf);
        btrfs_free_path(path);
        return ret;
}

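/*
 * look up the last chunk item keyed under @objectid and return, in
 * @offset, the logical address just past it; that is where the next
 * chunk can be keyed.  With no chunks present, @offset is 0.
 */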
static noinline int find_next_chunk(struct btrfs_root *root,
                                    u64 objectid, u64 *offset)
{
        struct btrfs_path *path;
        int ret;
        struct btrfs_key key;
        struct btrfs_chunk *chunk;
        struct btrfs_key found_key;

        path = btrfs_alloc_path();
        BUG_ON(!path);

        key.objectid = objectid;
        key.offset = (u64)-1;
        key.type = BTRFS_CHUNK_ITEM_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto error;

        BUG_ON(ret == 0);

        ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
        if (ret) {
                *offset = 0;
        } else {
                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
                                      path->slots[0]);
                if (found_key.objectid != objectid)
                        *offset = 0;
                else {
                        chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                               struct btrfs_chunk);
                        *offset = found_key.offset +
                                btrfs_chunk_length(path->nodes[0], chunk);
                }
        }
        ret = 0;
error:
        btrfs_free_path(path);
        return ret;
}

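/* pick the next device id: one past the highest devid item, or 1. */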
static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
{
        int ret;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct btrfs_path *path;

        root = root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = (u64)-1;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto error;

        BUG_ON(ret == 0);

        ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
                                  BTRFS_DEV_ITEM_KEY);
        if (ret) {
                *objectid = 1;
        } else {
                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
                                      path->slots[0]);
                *objectid = found_key.offset + 1;
        }
        ret = 0;
error:
        btrfs_free_path(path);
        return ret;
}

/*
 * the device information is stored in the chunk root
 * the btrfs_device struct should be fully filled in
 */
int btrfs_add_device(struct btrfs_trans_handle *trans,
                     struct btrfs_root *root,
                     struct btrfs_device *device)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_dev_item *dev_item;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        unsigned long ptr;

        root = root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = device->devid;

        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      sizeof(*dev_item));
        if (ret)
                goto out;

        leaf = path->nodes[0];
        dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

        btrfs_set_device_id(leaf, dev_item, device->devid);
        btrfs_set_device_generation(leaf, dev_item, 0);
        btrfs_set_device_type(leaf, dev_item, device->type);
        btrfs_set_device_io_align(leaf, dev_item, device->io_align);
        btrfs_set_device_io_width(leaf, dev_item, device->io_width);
        btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
        btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
        btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
        btrfs_set_device_group(leaf, dev_item, 0);
        btrfs_set_device_seek_speed(leaf, dev_item, 0);
        btrfs_set_device_bandwidth(leaf, dev_item, 0);
        btrfs_set_device_start_offset(leaf, dev_item, 0);

        ptr = (unsigned long)btrfs_device_uuid(dev_item);
        write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
        ptr = (unsigned long)btrfs_device_fsid(dev_item);
        write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
        btrfs_mark_buffer_dirty(leaf);

        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

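/*
 * delete @device's dev item from the chunk root in a short transaction
 * of its own; callers are expected to have emptied the device first
 * (btrfs_rm_device() shrinks it to zero before calling in here).
 */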
static int btrfs_rm_dev_item(struct btrfs_root *root,
                             struct btrfs_device *device)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct btrfs_trans_handle *trans;

        root = root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                btrfs_free_path(path);
                return PTR_ERR(trans);
        }
        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = device->devid;
        lock_chunks(root);

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret < 0)
                goto out;

        if (ret > 0) {
                ret = -ENOENT;
                goto out;
        }

        ret = btrfs_del_item(trans, root, path);
        if (ret)
                goto out;
out:
        btrfs_free_path(path);
        unlock_chunks(root);
        btrfs_commit_transaction(trans, root);
        return ret;
}

int btrfs_rm_device(struct btrfs_root *root, char *device_path)
{
        struct btrfs_device *device;
        struct btrfs_device *next_device;
        struct block_device *bdev;
        struct buffer_head *bh = NULL;
        struct btrfs_super_block *disk_super;
        u64 all_avail;
        u64 devid;
        u64 num_devices;
        u8 *dev_uuid;
        int ret = 0;

        mutex_lock(&uuid_mutex);
        mutex_lock(&root->fs_info->volume_mutex);

        all_avail = root->fs_info->avail_data_alloc_bits |
                root->fs_info->avail_system_alloc_bits |
                root->fs_info->avail_metadata_alloc_bits;

        if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
            root->fs_info->fs_devices->num_devices <= 4) {
                printk(KERN_ERR "btrfs: unable to go below four devices "
                       "on raid10\n");
                ret = -EINVAL;
                goto out;
        }

        if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
            root->fs_info->fs_devices->num_devices <= 2) {
                printk(KERN_ERR "btrfs: unable to go below two "
                       "devices on raid1\n");
                ret = -EINVAL;
                goto out;
        }

        if (strcmp(device_path, "missing") == 0) {
                struct list_head *devices;
                struct btrfs_device *tmp;

                device = NULL;
                devices = &root->fs_info->fs_devices->devices;
                mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
                list_for_each_entry(tmp, devices, dev_list) {
                        if (tmp->in_fs_metadata && !tmp->bdev) {
                                device = tmp;
                                break;
                        }
                }
                mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
                bdev = NULL;
                bh = NULL;
                disk_super = NULL;
                if (!device) {
                        printk(KERN_ERR "btrfs: no missing devices found to "
                               "remove\n");
                        goto out;
                }
        } else {
                bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
                                          root->fs_info->bdev_holder);
                if (IS_ERR(bdev)) {
                        ret = PTR_ERR(bdev);
                        goto out;
                }

                set_blocksize(bdev, 4096);
                bh = btrfs_read_dev_super(bdev);
                if (!bh) {
                        ret = -EINVAL;
                        goto error_close;
                }
                disk_super = (struct btrfs_super_block *)bh->b_data;
                devid = btrfs_stack_device_id(&disk_super->dev_item);
                dev_uuid = disk_super->dev_item.uuid;
                device = btrfs_find_device(root, devid, dev_uuid,
                                           disk_super->fsid);
                if (!device) {
                        ret = -ENOENT;
                        goto error_brelse;
                }
        }

        if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
                printk(KERN_ERR "btrfs: unable to remove the only writeable "
                       "device\n");
                ret = -EINVAL;
                goto error_brelse;
        }

        if (device->writeable) {
                list_del_init(&device->dev_alloc_list);
                root->fs_info->fs_devices->rw_devices--;
        }

        ret = btrfs_shrink_device(device, 0);
        if (ret)
                goto error_undo;

        ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
        if (ret)
                goto error_undo;

        device->in_fs_metadata = 0;

        /*
         * the device list mutex makes sure that we don't change
         * the device list while someone else is writing out all
         * the device supers.
         */
        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
        list_del_init(&device->dev_list);
        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

        device->fs_devices->num_devices--;

        if (device->missing)
                root->fs_info->fs_devices->missing_devices--;

        next_device = list_entry(root->fs_info->fs_devices->devices.next,
                                 struct btrfs_device, dev_list);
        if (device->bdev == root->fs_info->sb->s_bdev)
                root->fs_info->sb->s_bdev = next_device->bdev;
        if (device->bdev == root->fs_info->fs_devices->latest_bdev)
                root->fs_info->fs_devices->latest_bdev = next_device->bdev;

        if (device->bdev) {
                blkdev_put(device->bdev, device->mode);
                device->bdev = NULL;
                device->fs_devices->open_devices--;
        }

        num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
        btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);

        if (device->fs_devices->open_devices == 0) {
                struct btrfs_fs_devices *fs_devices;
                fs_devices = root->fs_info->fs_devices;
                while (fs_devices) {
                        if (fs_devices->seed == device->fs_devices)
                                break;
                        fs_devices = fs_devices->seed;
                }
                fs_devices->seed = device->fs_devices->seed;
                device->fs_devices->seed = NULL;
                __btrfs_close_devices(device->fs_devices);
                free_fs_devices(device->fs_devices);
        }

        /*
         * at this point, the device is zero sized.  We want to
         * remove it from the devices list and zero out the old super
         */
        if (device->writeable) {
                /* make sure this device isn't detected as part of
                 * the FS anymore
                 */
                memset(&disk_super->magic, 0, sizeof(disk_super->magic));
                set_buffer_dirty(bh);
                sync_dirty_buffer(bh);
        }

        kfree(device->name);
        kfree(device);
        ret = 0;

error_brelse:
        brelse(bh);
error_close:
        if (bdev)
                blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
out:
        mutex_unlock(&root->fs_info->volume_mutex);
        mutex_unlock(&uuid_mutex);
        return ret;
error_undo:
        if (device->writeable) {
                list_add(&device->dev_alloc_list,
                         &root->fs_info->fs_devices->alloc_list);
                root->fs_info->fs_devices->rw_devices++;
        }
        goto error_brelse;
}

/*
 * does all the dirty work required for changing the filesystem's UUID.
 */
static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root)
{
        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
        struct btrfs_fs_devices *old_devices;
        struct btrfs_fs_devices *seed_devices;
        struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
        struct btrfs_device *device;
        u64 super_flags;

        BUG_ON(!mutex_is_locked(&uuid_mutex));
        if (!fs_devices->seeding)
                return -EINVAL;

        seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
        if (!seed_devices)
                return -ENOMEM;

        old_devices = clone_fs_devices(fs_devices);
        if (IS_ERR(old_devices)) {
                kfree(seed_devices);
                return PTR_ERR(old_devices);
        }

        list_add(&old_devices->list, &fs_uuids);

        memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
        seed_devices->opened = 1;
        INIT_LIST_HEAD(&seed_devices->devices);
        INIT_LIST_HEAD(&seed_devices->alloc_list);
        mutex_init(&seed_devices->device_list_mutex);
        list_splice_init(&fs_devices->devices, &seed_devices->devices);
        list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
        list_for_each_entry(device, &seed_devices->devices, dev_list) {
                device->fs_devices = seed_devices;
        }

        fs_devices->seeding = 0;
        fs_devices->num_devices = 0;
        fs_devices->open_devices = 0;
        fs_devices->seed = seed_devices;

        generate_random_uuid(fs_devices->fsid);
        memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
        memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
        super_flags = btrfs_super_flags(disk_super) &
                      ~BTRFS_SUPER_FLAG_SEEDING;
        btrfs_set_super_flags(disk_super, super_flags);

        return 0;
}

/*
 * store the expected generation for seed devices in device items.
 */
1472 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
1473                                struct btrfs_root *root)
1474 {
1475         struct btrfs_path *path;
1476         struct extent_buffer *leaf;
1477         struct btrfs_dev_item *dev_item;
1478         struct btrfs_device *device;
1479         struct btrfs_key key;
1480         u8 fs_uuid[BTRFS_UUID_SIZE];
1481         u8 dev_uuid[BTRFS_UUID_SIZE];
1482         u64 devid;
1483         int ret;
1484
1485         path = btrfs_alloc_path();
1486         if (!path)
1487                 return -ENOMEM;
1488
1489         root = root->fs_info->chunk_root;
1490         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1491         key.offset = 0;
1492         key.type = BTRFS_DEV_ITEM_KEY;
1493
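        /*
         * walk every DEV_ITEM in the chunk tree; for each device that
         * still belongs to a seeding fs_devices, write the device's
         * in-memory generation into the item so the expected seed
         * generation is recorded on disk.
         */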
1494         while (1) {
1495                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1496                 if (ret < 0)
1497                         goto error;
1498
1499                 leaf = path->nodes[0];
1500 next_slot:
1501                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1502                         ret = btrfs_next_leaf(root, path);
1503                         if (ret > 0)
1504                                 break;
1505                         if (ret < 0)
1506                                 goto error;
1507                         leaf = path->nodes[0];
1508                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1509                         btrfs_release_path(root, path);
1510                         continue;
1511                 }
1512
1513                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1514                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
1515                     key.type != BTRFS_DEV_ITEM_KEY)
1516                         break;
1517
1518                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
1519                                           struct btrfs_dev_item);
1520                 devid = btrfs_device_id(leaf, dev_item);
1521                 read_extent_buffer(leaf, dev_uuid,
1522                                    (unsigned long)btrfs_device_uuid(dev_item),
1523                                    BTRFS_UUID_SIZE);
1524                 read_extent_buffer(leaf, fs_uuid,
1525                                    (unsigned long)btrfs_device_fsid(dev_item),
1526                                    BTRFS_UUID_SIZE);
1527                 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
1528                 BUG_ON(!device);
1529
1530                 if (device->fs_devices->seeding) {
1531                         btrfs_set_device_generation(leaf, dev_item,
1532                                                     device->generation);
1533                         btrfs_mark_buffer_dirty(leaf);
1534                 }
1535
1536                 path->slots[0]++;
1537                 goto next_slot;
1538         }
1539         ret = 0;
1540 error:
1541         btrfs_free_path(path);
1542         return ret;
1543 }
1544
1545 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1546 {
1547         struct btrfs_trans_handle *trans;
1548         struct btrfs_device *device;
1549         struct block_device *bdev;
1550         struct list_head *devices;
1551         struct super_block *sb = root->fs_info->sb;
1552         u64 total_bytes;
1553         int seeding_dev = 0;
1554         int ret = 0;
1555
1556         if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1557                 return -EINVAL;
1558
1559         bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
1560                                   root->fs_info->bdev_holder);
1561         if (IS_ERR(bdev))
1562                 return PTR_ERR(bdev);
1563
1564         if (root->fs_info->fs_devices->seeding) {
1565                 seeding_dev = 1;
1566                 down_write(&sb->s_umount);
1567                 mutex_lock(&uuid_mutex);
1568         }
1569
1570         filemap_write_and_wait(bdev->bd_inode->i_mapping);
1571         mutex_lock(&root->fs_info->volume_mutex);
1572
1573         devices = &root->fs_info->fs_devices->devices;
1574         /*
1575          * we have the volume lock, so we don't need the extra
1576          * device list mutex while reading the list here.
1577          */
1578         list_for_each_entry(device, devices, dev_list) {
1579                 if (device->bdev == bdev) {
1580                         ret = -EEXIST;
1581                         goto error;
1582                 }
1583         }
1584
1585         device = kzalloc(sizeof(*device), GFP_NOFS);
1586         if (!device) {
1587                 /* we can safely leave the fs_devices entry around */
1588                 ret = -ENOMEM;
1589                 goto error;
1590         }
1591
1592         device->name = kstrdup(device_path, GFP_NOFS);
1593         if (!device->name) {
1594                 kfree(device);
1595                 ret = -ENOMEM;
1596                 goto error;
1597         }
1598
1599         ret = find_next_devid(root, &device->devid);
1600         if (ret) {
1601                 kfree(device->name);
1602                 kfree(device);
1603                 goto error;
1604         }
1605
1606         trans = btrfs_start_transaction(root, 0);
1607         if (IS_ERR(trans)) {
1608                 kfree(device->name);
1609                 kfree(device);
1610                 ret = PTR_ERR(trans);
1611                 goto error;
1612         }
1613
1614         lock_chunks(root);
1615
1616         device->writeable = 1;
1617         device->work.func = pending_bios_fn;
1618         generate_random_uuid(device->uuid);
1619         spin_lock_init(&device->io_lock);
1620         device->generation = trans->transid;
1621         device->io_width = root->sectorsize;
1622         device->io_align = root->sectorsize;
1623         device->sector_size = root->sectorsize;
1624         device->total_bytes = i_size_read(bdev->bd_inode);
1625         device->disk_total_bytes = device->total_bytes;
1626         device->dev_root = root->fs_info->dev_root;
1627         device->bdev = bdev;
1628         device->in_fs_metadata = 1;
1629         device->mode = FMODE_EXCL;
1630         set_blocksize(device->bdev, 4096);
1631
1632         if (seeding_dev) {
1633                 sb->s_flags &= ~MS_RDONLY;
1634                 ret = btrfs_prepare_sprout(trans, root);
1635                 BUG_ON(ret);
1636         }
1637
1638         device->fs_devices = root->fs_info->fs_devices;
1639
1640         /*
1641          * we don't want write_supers to jump in here with our device
1642          * half setup
1643          */
1644         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1645         list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
1646         list_add(&device->dev_alloc_list,
1647                  &root->fs_info->fs_devices->alloc_list);
1648         root->fs_info->fs_devices->num_devices++;
1649         root->fs_info->fs_devices->open_devices++;
1650         root->fs_info->fs_devices->rw_devices++;
1651         root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1652
1653         if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1654                 root->fs_info->fs_devices->rotating = 1;
1655
1656         total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
1657         btrfs_set_super_total_bytes(&root->fs_info->super_copy,
1658                                     total_bytes + device->total_bytes);
1659
1660         total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
1661         btrfs_set_super_num_devices(&root->fs_info->super_copy,
1662                                     total_bytes + 1);
1663         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1664
1665         if (seeding_dev) {
1666                 ret = init_first_rw_device(trans, root, device);
1667                 BUG_ON(ret);
1668                 ret = btrfs_finish_sprout(trans, root);
1669                 BUG_ON(ret);
1670         } else {
1671                 ret = btrfs_add_device(trans, root, device);
1672         }
1673
1674         /*
1675          * we've got more storage, clear any full flags on the space
1676          * infos
1677          */
1678         btrfs_clear_space_info_full(root->fs_info);
1679
1680         unlock_chunks(root);
1681         btrfs_commit_transaction(trans, root);
1682
1683         if (seeding_dev) {
1684                 mutex_unlock(&uuid_mutex);
1685                 up_write(&sb->s_umount);
1686
1687                 ret = btrfs_relocate_sys_chunks(root);
1688                 BUG_ON(ret);
1689         }
1690 out:
1691         mutex_unlock(&root->fs_info->volume_mutex);
1692         return ret;
1693 error:
1694         blkdev_put(bdev, FMODE_EXCL);
1695         if (seeding_dev) {
1696                 mutex_unlock(&uuid_mutex);
1697                 up_write(&sb->s_umount);
1698         }
1699         goto out;
1700 }
1701
1702 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1703                                         struct btrfs_device *device)
1704 {
1705         int ret;
1706         struct btrfs_path *path;
1707         struct btrfs_root *root;
1708         struct btrfs_dev_item *dev_item;
1709         struct extent_buffer *leaf;
1710         struct btrfs_key key;
1711
1712         root = device->dev_root->fs_info->chunk_root;
1713
1714         path = btrfs_alloc_path();
1715         if (!path)
1716                 return -ENOMEM;
1717
1718         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1719         key.type = BTRFS_DEV_ITEM_KEY;
1720         key.offset = device->devid;
1721
1722         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1723         if (ret < 0)
1724                 goto out;
1725
1726         if (ret > 0) {
1727                 ret = -ENOENT;
1728                 goto out;
1729         }
1730
1731         leaf = path->nodes[0];
1732         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1733
1734         btrfs_set_device_id(leaf, dev_item, device->devid);
1735         btrfs_set_device_type(leaf, dev_item, device->type);
1736         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1737         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1738         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1739         btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
1740         btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1741         btrfs_mark_buffer_dirty(leaf);
1742
1743 out:
1744         btrfs_free_path(path);
1745         return ret;
1746 }
1747
1748 static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1749                       struct btrfs_device *device, u64 new_size)
1750 {
1751         struct btrfs_super_block *super_copy =
1752                 &device->dev_root->fs_info->super_copy;
1753         u64 old_total = btrfs_super_total_bytes(super_copy);
1754         u64 diff = new_size - device->total_bytes;
1755
1756         if (!device->writeable)
1757                 return -EACCES;
1758         if (new_size <= device->total_bytes)
1759                 return -EINVAL;
1760
1761         btrfs_set_super_total_bytes(super_copy, old_total + diff);
1762         device->fs_devices->total_rw_bytes += diff;
1763
1764         device->total_bytes = new_size;
1765         device->disk_total_bytes = new_size;
1766         btrfs_clear_space_info_full(device->dev_root->fs_info);
1767
1768         return btrfs_update_device(trans, device);
1769 }
1770
1771 int btrfs_grow_device(struct btrfs_trans_handle *trans,
1772                       struct btrfs_device *device, u64 new_size)
1773 {
1774         int ret;
1775         lock_chunks(device->dev_root);
1776         ret = __btrfs_grow_device(trans, device, new_size);
1777         unlock_chunks(device->dev_root);
1778         return ret;
1779 }
1780
1781 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1782                             struct btrfs_root *root,
1783                             u64 chunk_tree, u64 chunk_objectid,
1784                             u64 chunk_offset)
1785 {
1786         int ret;
1787         struct btrfs_path *path;
1788         struct btrfs_key key;
1789
1790         root = root->fs_info->chunk_root;
1791         path = btrfs_alloc_path();
1792         if (!path)
1793                 return -ENOMEM;
1794
1795         key.objectid = chunk_objectid;
1796         key.offset = chunk_offset;
1797         key.type = BTRFS_CHUNK_ITEM_KEY;
1798
1799         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1800         BUG_ON(ret);
1801
1802         ret = btrfs_del_item(trans, root, path);
1803         BUG_ON(ret);
1804
1805         btrfs_free_path(path);
1806         return 0;
1807 }
1808
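/*
 * The superblock's sys_chunk_array is a packed sequence of
 * (struct btrfs_disk_key, struct btrfs_chunk) pairs, each chunk item
 * sized by its stripe count.  Deleting an entry memmove()s the tail of
 * the array over the matching pair, conceptually:
 *
 *   before: [key0|chunk0][key1|chunk1][key2|chunk2]
 *   after:  [key0|chunk0][key2|chunk2]        (chunk1 removed)
 */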
1809 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
1810                         chunk_offset)
1811 {
1812         struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1813         struct btrfs_disk_key *disk_key;
1814         struct btrfs_chunk *chunk;
1815         u8 *ptr;
1816         int ret = 0;
1817         u32 num_stripes;
1818         u32 array_size;
1819         u32 len = 0;
1820         u32 cur;
1821         struct btrfs_key key;
1822
1823         array_size = btrfs_super_sys_array_size(super_copy);
1824
1825         ptr = super_copy->sys_chunk_array;
1826         cur = 0;
1827
1828         while (cur < array_size) {
1829                 disk_key = (struct btrfs_disk_key *)ptr;
1830                 btrfs_disk_key_to_cpu(&key, disk_key);
1831
1832                 len = sizeof(*disk_key);
1833
1834                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1835                         chunk = (struct btrfs_chunk *)(ptr + len);
1836                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
1837                         len += btrfs_chunk_item_size(num_stripes);
1838                 } else {
1839                         ret = -EIO;
1840                         break;
1841                 }
1842                 if (key.objectid == chunk_objectid &&
1843                     key.offset == chunk_offset) {
1844                         memmove(ptr, ptr + len, array_size - (cur + len));
1845                         array_size -= len;
1846                         btrfs_set_super_sys_array_size(super_copy, array_size);
1847                 } else {
1848                         ptr += len;
1849                         cur += len;
1850                 }
1851         }
1852         return ret;
1853 }
1854
1855 static int btrfs_relocate_chunk(struct btrfs_root *root,
1856                          u64 chunk_tree, u64 chunk_objectid,
1857                          u64 chunk_offset)
1858 {
1859         struct extent_map_tree *em_tree;
1860         struct btrfs_root *extent_root;
1861         struct btrfs_trans_handle *trans;
1862         struct extent_map *em;
1863         struct map_lookup *map;
1864         int ret;
1865         int i;
1866
1867         root = root->fs_info->chunk_root;
1868         extent_root = root->fs_info->extent_root;
1869         em_tree = &root->fs_info->mapping_tree.map_tree;
1870
1871         ret = btrfs_can_relocate(extent_root, chunk_offset);
1872         if (ret)
1873                 return -ENOSPC;
1874
1875         /* step one, relocate all the extents inside this chunk */
1876         ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1877         if (ret)
1878                 return ret;
1879
1880         trans = btrfs_start_transaction(root, 0);
1881         BUG_ON(IS_ERR(trans));
1882
1883         lock_chunks(root);
1884
1885         /*
1886          * step two, delete the device extents and the
1887          * chunk tree entries
1888          */
1889         read_lock(&em_tree->lock);
1890         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1891         read_unlock(&em_tree->lock);
1892
1893         BUG_ON(em->start > chunk_offset ||
1894                em->start + em->len < chunk_offset);
1895         map = (struct map_lookup *)em->bdev;
1896
1897         for (i = 0; i < map->num_stripes; i++) {
1898                 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
1899                                             map->stripes[i].physical);
1900                 BUG_ON(ret);
1901
1902                 if (map->stripes[i].dev) {
1903                         ret = btrfs_update_device(trans, map->stripes[i].dev);
1904                         BUG_ON(ret);
1905                 }
1906         }
1907         ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
1908                                chunk_offset);
1909
1910         BUG_ON(ret);
1911
1912         trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
1913
1914         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
1915                 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
1916                 BUG_ON(ret);
1917         }
1918
1919         ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
1920         BUG_ON(ret);
1921
1922         write_lock(&em_tree->lock);
1923         remove_extent_mapping(em_tree, em);
1924         write_unlock(&em_tree->lock);
1925
1926         kfree(map);
1927         em->bdev = NULL;
1928
1929         /* once for the tree */
1930         free_extent_map(em);
1931         /* once for us */
1932         free_extent_map(em);
1933
1934         unlock_chunks(root);
1935         btrfs_end_transaction(trans, root);
1936         return 0;
1937 }
1938
1939 static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
1940 {
1941         struct btrfs_root *chunk_root = root->fs_info->chunk_root;
1942         struct btrfs_path *path;
1943         struct extent_buffer *leaf;
1944         struct btrfs_chunk *chunk;
1945         struct btrfs_key key;
1946         struct btrfs_key found_key;
1947         u64 chunk_tree = chunk_root->root_key.objectid;
1948         u64 chunk_type;
1949         bool retried = false;
1950         int failed = 0;
1951         int ret;
1952
1953         path = btrfs_alloc_path();
1954         if (!path)
1955                 return -ENOMEM;
1956
1957 again:
1958         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1959         key.offset = (u64)-1;
1960         key.type = BTRFS_CHUNK_ITEM_KEY;
1961
1962         while (1) {
1963                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
1964                 if (ret < 0)
1965                         goto error;
1966                 BUG_ON(ret == 0);
1967
1968                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
1969                                           key.type);
1970                 if (ret < 0)
1971                         goto error;
1972                 if (ret > 0)
1973                         break;
1974
1975                 leaf = path->nodes[0];
1976                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1977
1978                 chunk = btrfs_item_ptr(leaf, path->slots[0],
1979                                        struct btrfs_chunk);
1980                 chunk_type = btrfs_chunk_type(leaf, chunk);
1981                 btrfs_release_path(chunk_root, path);
1982
1983                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
1984                         ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
1985                                                    found_key.objectid,
1986                                                    found_key.offset);
1987                         if (ret == -ENOSPC)
1988                                 failed++;
1989                         else if (ret)
1990                                 BUG();
1991                 }
1992
1993                 if (found_key.offset == 0)
1994                         break;
1995                 key.offset = found_key.offset - 1;
1996         }
1997         ret = 0;
1998         if (failed && !retried) {
1999                 failed = 0;
2000                 retried = true;
2001                 goto again;
2002         } else if (failed && retried) {
2003                 WARN_ON(1);
2004                 ret = -ENOSPC;
2005         }
2006 error:
2007         btrfs_free_path(path);
2008         return ret;
2009 }
2010
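/*
 * scale @num by @factor tenths; factor == 10 returns @num unchanged,
 * e.g. div_factor(1000, 1) == 100 and div_factor(1000, 3) == 300.
 */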
2011 static u64 div_factor(u64 num, int factor)
2012 {
2013         if (factor == 10)
2014                 return num;
2015         num *= factor;
2016         do_div(num, 10);
2017         return num;
2018 }
2019
2020 int btrfs_balance(struct btrfs_root *dev_root)
2021 {
2022         int ret;
2023         struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
2024         struct btrfs_device *device;
2025         u64 old_size;
2026         u64 size_to_free;
2027         struct btrfs_path *path;
2028         struct btrfs_key key;
2029         struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
2030         struct btrfs_trans_handle *trans;
2031         struct btrfs_key found_key;
2032
2033         if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
2034                 return -EROFS;
2035
2036         if (!capable(CAP_SYS_ADMIN))
2037                 return -EPERM;
2038
2039         mutex_lock(&dev_root->fs_info->volume_mutex);
2040         dev_root = dev_root->fs_info->dev_root;
2041
2042         /* step one, make some room on all the devices */
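        /*
         * a device is only shrunk when its free space is at most
         * min(10% of its size, 1MB); it is shrunk by that amount and
         * immediately re-grown, and the chunk relocation triggered by
         * the shrink is what actually makes room.
         */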
2043         list_for_each_entry(device, devices, dev_list) {
2044                 old_size = device->total_bytes;
2045                 size_to_free = div_factor(old_size, 1);
2046                 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
2047                 if (!device->writeable ||
2048                     device->total_bytes - device->bytes_used > size_to_free)
2049                         continue;
2050
2051                 ret = btrfs_shrink_device(device, old_size - size_to_free);
2052                 if (ret == -ENOSPC)
2053                         break;
2054                 BUG_ON(ret);
2055
2056                 trans = btrfs_start_transaction(dev_root, 0);
2057                 BUG_ON(IS_ERR(trans));
2058
2059                 ret = btrfs_grow_device(trans, device, old_size);
2060                 BUG_ON(ret);
2061
2062                 btrfs_end_transaction(trans, dev_root);
2063         }
2064
2065         /* step two, relocate all the chunks */
2066         path = btrfs_alloc_path();
2067         BUG_ON(!path);
2068
2069         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2070         key.offset = (u64)-1;
2071         key.type = BTRFS_CHUNK_ITEM_KEY;
2072
2073         while (1) {
2074                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2075                 if (ret < 0)
2076                         goto error;
2077
2078                 /*
2079                  * this shouldn't happen, it means the last relocate
2080                  * failed
2081                  */
2082                 if (ret == 0)
2083                         break;
2084
2085                 ret = btrfs_previous_item(chunk_root, path, 0,
2086                                           BTRFS_CHUNK_ITEM_KEY);
2087                 if (ret)
2088                         break;
2089
2090                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2091                                       path->slots[0]);
2092                 if (found_key.objectid != key.objectid)
2093                         break;
2094
2095                 /* chunk zero is special */
2096                 if (found_key.offset == 0)
2097                         break;
2098
2099                 btrfs_release_path(chunk_root, path);
2100                 ret = btrfs_relocate_chunk(chunk_root,
2101                                            chunk_root->root_key.objectid,
2102                                            found_key.objectid,
2103                                            found_key.offset);
2104                 BUG_ON(ret && ret != -ENOSPC);
2105                 key.offset = found_key.offset - 1;
2106         }
2107         ret = 0;
2108 error:
2109         btrfs_free_path(path);
2110         mutex_unlock(&dev_root->fs_info->volume_mutex);
2111         return ret;
2112 }
2113
2114 /*
2115  * shrinking a device means finding all of the device extents past
2116  * the new size, and then following the back refs to the chunks.
2117  * The chunk relocation code actually frees the device extent.
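 * The in-memory size is lowered up front so the allocator stops handing
 * out the tail of the device; if some chunks can't be relocated the
 * scan is retried once, and a second -ENOSPC restores the old size.
 * Only when everything has moved does btrfs_update_device() shrink the
 * on-disk disk_total_bytes.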
2118  */
2119 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2120 {
2121         struct btrfs_trans_handle *trans;
2122         struct btrfs_root *root = device->dev_root;
2123         struct btrfs_dev_extent *dev_extent = NULL;
2124         struct btrfs_path *path;
2125         u64 length;
2126         u64 chunk_tree;
2127         u64 chunk_objectid;
2128         u64 chunk_offset;
2129         int ret;
2130         int slot;
2131         int failed = 0;
2132         bool retried = false;
2133         struct extent_buffer *l;
2134         struct btrfs_key key;
2135         struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
2136         u64 old_total = btrfs_super_total_bytes(super_copy);
2137         u64 old_size = device->total_bytes;
2138         u64 diff = device->total_bytes - new_size;
2139
2140         if (new_size >= device->total_bytes)
2141                 return -EINVAL;
2142
2143         path = btrfs_alloc_path();
2144         if (!path)
2145                 return -ENOMEM;
2146
2147         path->reada = 2;
2148
2149         lock_chunks(root);
2150
2151         device->total_bytes = new_size;
2152         if (device->writeable)
2153                 device->fs_devices->total_rw_bytes -= diff;
2154         unlock_chunks(root);
2155
2156 again:
2157         key.objectid = device->devid;
2158         key.offset = (u64)-1;
2159         key.type = BTRFS_DEV_EXTENT_KEY;
2160
2161         while (1) {
2162                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2163                 if (ret < 0)
2164                         goto done;
2165
2166                 ret = btrfs_previous_item(root, path, 0, key.type);
2167                 if (ret < 0)
2168                         goto done;
2169                 if (ret) {
2170                         ret = 0;
2171                         btrfs_release_path(root, path);
2172                         break;
2173                 }
2174
2175                 l = path->nodes[0];
2176                 slot = path->slots[0];
2177                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
2178
2179                 if (key.objectid != device->devid) {
2180                         btrfs_release_path(root, path);
2181                         break;
2182                 }
2183
2184                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2185                 length = btrfs_dev_extent_length(l, dev_extent);
2186
2187                 if (key.offset + length <= new_size) {
2188                         btrfs_release_path(root, path);
2189                         break;
2190                 }
2191
2192                 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
2193                 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
2194                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
2195                 btrfs_release_path(root, path);
2196
2197                 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
2198                                            chunk_offset);
2199                 if (ret && ret != -ENOSPC)
2200                         goto done;
2201                 if (ret == -ENOSPC)
2202                         failed++;
2203                 key.offset -= 1;
2204         }
2205
2206         if (failed && !retried) {
2207                 failed = 0;
2208                 retried = true;
2209                 goto again;
2210         } else if (failed && retried) {
2211                 ret = -ENOSPC;
2212                 lock_chunks(root);
2213
2214                 device->total_bytes = old_size;
2215                 if (device->writeable)
2216                         device->fs_devices->total_rw_bytes += diff;
2217                 unlock_chunks(root);
2218                 goto done;
2219         }
2220
2221         /* Shrinking succeeded, else we would be at "done". */
2222         trans = btrfs_start_transaction(root, 0);
2223         if (IS_ERR(trans)) {
2224                 ret = PTR_ERR(trans);
2225                 goto done;
2226         }
2227
2228         lock_chunks(root);
2229
2230         device->disk_total_bytes = new_size;
2231         /* Now btrfs_update_device() will change the on-disk size. */
2232         ret = btrfs_update_device(trans, device);
2233         if (ret) {
2234                 unlock_chunks(root);
2235                 btrfs_end_transaction(trans, root);
2236                 goto done;
2237         }
2238         WARN_ON(diff > old_total);
2239         btrfs_set_super_total_bytes(super_copy, old_total - diff);
2240         unlock_chunks(root);
2241         btrfs_end_transaction(trans, root);
2242 done:
2243         btrfs_free_path(path);
2244         return ret;
2245 }
2246
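/*
 * append a (disk key, chunk item) pair to the superblock's
 * sys_chunk_array so the new system chunk can be found at mount time,
 * before the chunk tree itself is readable; -EFBIG means the array
 * would outgrow BTRFS_SYSTEM_CHUNK_ARRAY_SIZE.
 */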
2247 static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
2248                            struct btrfs_root *root,
2249                            struct btrfs_key *key,
2250                            struct btrfs_chunk *chunk, int item_size)
2251 {
2252         struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
2253         struct btrfs_disk_key disk_key;
2254         u32 array_size;
2255         u8 *ptr;
2256
2257         array_size = btrfs_super_sys_array_size(super_copy);
2258         if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
2259                 return -EFBIG;
2260
2261         ptr = super_copy->sys_chunk_array + array_size;
2262         btrfs_cpu_key_to_disk(&disk_key, key);
2263         memcpy(ptr, &disk_key, sizeof(disk_key));
2264         ptr += sizeof(disk_key);
2265         memcpy(ptr, chunk, item_size);
2266         item_size += sizeof(disk_key);
2267         btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
2268         return 0;
2269 }
2270
2271 /*
2272  * sort the devices in descending order by max_avail, total_avail
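 * (so that after sort() runs, devices_info[ndevs - 1] is the candidate
 * with the smallest usable hole, which bounds the stripe size)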
2273  */
2274 static int btrfs_cmp_device_info(const void *a, const void *b)
2275 {
2276         const struct btrfs_device_info *di_a = a;
2277         const struct btrfs_device_info *di_b = b;
2278
2279         if (di_a->max_avail > di_b->max_avail)
2280                 return -1;
2281         if (di_a->max_avail < di_b->max_avail)
2282                 return 1;
2283         if (di_a->total_avail > di_b->total_avail)
2284                 return -1;
2285         if (di_a->total_avail < di_b->total_avail)
2286                 return 1;
2287         return 0;
2288 }
2289
2290 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2291                                struct btrfs_root *extent_root,
2292                                struct map_lookup **map_ret,
2293                                u64 *num_bytes_out, u64 *stripe_size_out,
2294                                u64 start, u64 type)
2295 {
2296         struct btrfs_fs_info *info = extent_root->fs_info;
2297         struct btrfs_fs_devices *fs_devices = info->fs_devices;
2298         struct list_head *cur;
2299         struct map_lookup *map = NULL;
2300         struct extent_map_tree *em_tree;
2301         struct extent_map *em;
2302         struct btrfs_device_info *devices_info = NULL;
2303         u64 total_avail;
2304         int num_stripes;        /* total number of stripes to allocate */
2305         int sub_stripes;        /* sub_stripes info for map */
2306         int dev_stripes;        /* stripes per dev */
2307         int devs_max;           /* max devs to use */
2308         int devs_min;           /* min devs needed */
2309         int devs_increment;     /* ndevs has to be a multiple of this */
2310         int ncopies;            /* how many copies the data has */
2311         int ret;
2312         u64 max_stripe_size;
2313         u64 max_chunk_size;
2314         u64 stripe_size;
2315         u64 num_bytes;
2316         int ndevs;
2317         int i;
2318         int j;
2319
2320         if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
2321             (type & BTRFS_BLOCK_GROUP_DUP)) {
2322                 WARN_ON(1);
2323                 type &= ~BTRFS_BLOCK_GROUP_DUP;
2324         }
2325
2326         if (list_empty(&fs_devices->alloc_list))
2327                 return -ENOSPC;
2328
2329         sub_stripes = 1;
2330         dev_stripes = 1;
2331         devs_increment = 1;
2332         ncopies = 1;
2333         devs_max = 0;   /* 0 == as many as possible */
2334         devs_min = 1;
2335
2336         /*
2337          * define the properties of each RAID type.
2338          * FIXME: move this to a global table and use it in all RAID
2339          * calculation code
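          *
          * the assignments below amount to this table (devs_max 0 ==
          * as many as possible):
          *
          *            sub_stripes  dev_stripes  devs_max  devs_min  devs_incr  ncopies
          *   DUP           1            2           1         1         1         2
          *   RAID0         1            1           0         2         1         1
          *   RAID1         1            1           2         2         2         2
          *   RAID10        2            1           0         4         2         2
          *   single        1            1           1         1         1         1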
2340          */
2341         if (type & (BTRFS_BLOCK_GROUP_DUP)) {
2342                 dev_stripes = 2;
2343                 ncopies = 2;
2344                 devs_max = 1;
2345         } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
2346                 devs_min = 2;
2347         } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
2348                 devs_increment = 2;
2349                 ncopies = 2;
2350                 devs_max = 2;
2351                 devs_min = 2;
2352         } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2353                 sub_stripes = 2;
2354                 devs_increment = 2;
2355                 ncopies = 2;
2356                 devs_min = 4;
2357         } else {
2358                 devs_max = 1;
2359         }
2360
2361         if (type & BTRFS_BLOCK_GROUP_DATA) {
2362                 max_stripe_size = 1024 * 1024 * 1024;
2363                 max_chunk_size = 10 * max_stripe_size;
2364         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
2365                 max_stripe_size = 256 * 1024 * 1024;
2366                 max_chunk_size = max_stripe_size;
2367         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
2368                 max_stripe_size = 8 * 1024 * 1024;
2369                 max_chunk_size = 2 * max_stripe_size;
2370         } else {
2371                 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
2372                        type);
2373                 BUG_ON(1);
2374         }
2375
2376         /* we don't want a chunk larger than 10% of writeable space */
2377         max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
2378                              max_chunk_size);
2379
2380         devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
2381                                GFP_NOFS);
2382         if (!devices_info)
2383                 return -ENOMEM;
2384
2385         cur = fs_devices->alloc_list.next;
2386
2387         /*
2388          * in the first pass through the devices list, we gather information
2389          * about the available holes on each device.
2390          */
2391         ndevs = 0;
2392         while (cur != &fs_devices->alloc_list) {
2393                 struct btrfs_device *device;
2394                 u64 max_avail;
2395                 u64 dev_offset;
2396
2397                 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
2398
2399                 cur = cur->next;
2400
2401                 if (!device->writeable) {
2402                         printk(KERN_ERR
2403                                "btrfs: read-only device in alloc_list\n");
2404                         WARN_ON(1);
2405                         continue;
2406                 }
2407
2408                 if (!device->in_fs_metadata)
2409                         continue;
2410
2411                 if (device->total_bytes > device->bytes_used)
2412                         total_avail = device->total_bytes - device->bytes_used;
2413                 else
2414                         total_avail = 0;
2415                 /* avail is off by max(alloc_start, 1MB), but that is the same
2416                  * for all devices, so it doesn't hurt the sorting later on
2417                  */
2418
2419                 ret = find_free_dev_extent(trans, device,
2420                                            max_stripe_size * dev_stripes,
2421                                            &dev_offset, &max_avail);
2422                 if (ret && ret != -ENOSPC)
2423                         goto error;
2424
2425                 if (ret == 0)
2426                         max_avail = max_stripe_size * dev_stripes;
2427
2428                 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
2429                         continue;
2430
2431                 devices_info[ndevs].dev_offset = dev_offset;
2432                 devices_info[ndevs].max_avail = max_avail;
2433                 devices_info[ndevs].total_avail = total_avail;
2434                 devices_info[ndevs].dev = device;
2435                 ++ndevs;
2436         }
2437
2438         /*
2439          * now sort the devices by hole size / available space
2440          */
2441         sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
2442              btrfs_cmp_device_info, NULL);
2443
2444         /* round down to number of usable stripes */
2445         ndevs -= ndevs % devs_increment;
2446
2447         if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
2448                 ret = -ENOSPC;
2449                 goto error;
2450         }
2451
2452         if (devs_max && ndevs > devs_max)
2453                 ndevs = devs_max;
2454         /*
2455          * the primary goal is to maximize the number of stripes, so use as many
2456          * devices as possible, even if the stripes are not maximum sized.
2457          */
2458         stripe_size = devices_info[ndevs-1].max_avail;
2459         num_stripes = ndevs * dev_stripes;
2460
2461         if (stripe_size * num_stripes > max_chunk_size * ncopies) {
2462                 stripe_size = max_chunk_size * ncopies;
2463                 do_div(stripe_size, num_stripes);
2464         }
2465
2466         do_div(stripe_size, dev_stripes);
2467         do_div(stripe_size, BTRFS_STRIPE_LEN);
2468         stripe_size *= BTRFS_STRIPE_LEN;
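        /*
         * worked example with hypothetical numbers: four usable devices,
         * RAID0 data (dev_stripes == 1, ncopies == 1) and a smallest
         * hole of 1GB give stripe_size == 1GB and num_stripes == 4; if
         * 4GB exceeded max_chunk_size * ncopies, stripe_size would be
         * cut to max_chunk_size / 4 and rounded down to a multiple of
         * BTRFS_STRIPE_LEN.
         */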
2469
2470         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2471         if (!map) {
2472                 ret = -ENOMEM;
2473                 goto error;
2474         }
2475         map->num_stripes = num_stripes;
2476
2477         for (i = 0; i < ndevs; ++i) {
2478                 for (j = 0; j < dev_stripes; ++j) {
2479                         int s = i * dev_stripes + j;
2480                         map->stripes[s].dev = devices_info[i].dev;
2481                         map->stripes[s].physical = devices_info[i].dev_offset +
2482                                                    j * stripe_size;
2483                 }
2484         }
2485         map->sector_size = extent_root->sectorsize;
2486         map->stripe_len = BTRFS_STRIPE_LEN;
2487         map->io_align = BTRFS_STRIPE_LEN;
2488         map->io_width = BTRFS_STRIPE_LEN;
2489         map->type = type;
2490         map->sub_stripes = sub_stripes;
2491
2492         *map_ret = map;
2493         num_bytes = stripe_size * (num_stripes / ncopies);
2494
2495         *stripe_size_out = stripe_size;
2496         *num_bytes_out = num_bytes;
2497
2498         trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
2499
2500         em = alloc_extent_map(GFP_NOFS);
2501         if (!em) {
2502                 ret = -ENOMEM;
2503                 goto error;
2504         }
2505         em->bdev = (struct block_device *)map;
2506         em->start = start;
2507         em->len = num_bytes;
2508         em->block_start = 0;
2509         em->block_len = em->len;
2510
2511         em_tree = &extent_root->fs_info->mapping_tree.map_tree;
2512         write_lock(&em_tree->lock);
2513         ret = add_extent_mapping(em_tree, em);
2514         write_unlock(&em_tree->lock);
2515         BUG_ON(ret);
2516         free_extent_map(em);
2517
2518         ret = btrfs_make_block_group(trans, extent_root, 0, type,
2519                                      BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2520                                      start, num_bytes);
2521         BUG_ON(ret);
2522
2523         for (i = 0; i < map->num_stripes; ++i) {
2524                 struct btrfs_device *device;
2525                 u64 dev_offset;
2526
2527                 device = map->stripes[i].dev;
2528                 dev_offset = map->stripes[i].physical;
2529
2530                 ret = btrfs_alloc_dev_extent(trans, device,
2531                                 info->chunk_root->root_key.objectid,
2532                                 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2533                                 start, dev_offset, stripe_size);
2534                 BUG_ON(ret);
2535         }
2536
2537         kfree(devices_info);
2538         return 0;
2539
2540 error:
2541         kfree(map);
2542         kfree(devices_info);
2543         return ret;
2544 }
2545
2546 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2547                                 struct btrfs_root *extent_root,
2548                                 struct map_lookup *map, u64 chunk_offset,
2549                                 u64 chunk_size, u64 stripe_size)
2550 {
2551         u64 dev_offset;
2552         struct btrfs_key key;
2553         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2554         struct btrfs_device *device;
2555         struct btrfs_chunk *chunk;
2556         struct btrfs_stripe *stripe;
2557         size_t item_size = btrfs_chunk_item_size(map->num_stripes);
2558         int index = 0;
2559         int ret;
2560
2561         chunk = kzalloc(item_size, GFP_NOFS);
2562         if (!chunk)
2563                 return -ENOMEM;
2564
2565         index = 0;
2566         while (index < map->num_stripes) {
2567                 device = map->stripes[index].dev;
2568                 device->bytes_used += stripe_size;
2569                 ret = btrfs_update_device(trans, device);
2570                 BUG_ON(ret);
2571                 index++;
2572         }
2573
2574         index = 0;
2575         stripe = &chunk->stripe;
2576         while (index < map->num_stripes) {
2577                 device = map->stripes[index].dev;
2578                 dev_offset = map->stripes[index].physical;
2579
2580                 btrfs_set_stack_stripe_devid(stripe, device->devid);
2581                 btrfs_set_stack_stripe_offset(stripe, dev_offset);
2582                 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
2583                 stripe++;
2584                 index++;
2585         }
2586
2587         btrfs_set_stack_chunk_length(chunk, chunk_size);
2588         btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
2589         btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
2590         btrfs_set_stack_chunk_type(chunk, map->type);
2591         btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
2592         btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
2593         btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
2594         btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
2595         btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
2596
2597         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2598         key.type = BTRFS_CHUNK_ITEM_KEY;
2599         key.offset = chunk_offset;
2600
2601         ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
2602         BUG_ON(ret);
2603
2604         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2605                 ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
2606                                              item_size);
2607                 BUG_ON(ret);
2608         }
2609
2610         kfree(chunk);
2611         return 0;
2612 }
2613
2614 /*
2615  * Chunk allocation falls into two parts. The first part does the work
2616  * that makes the newly allocated chunk usable, but does not do any
2617  * operation that modifies the chunk tree. The second part does the work
2618  * that requires modifying the chunk tree. This division is important for
2619  * the bootstrap process of adding storage to a seed btrfs.
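 * Here __btrfs_alloc_chunk() is part one (it sets up the extent map,
 * the block group and the device extents) and __finish_chunk_alloc()
 * is part two (it updates the device items and inserts the chunk item
 * into the chunk tree).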
2620  */
2621 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2622                       struct btrfs_root *extent_root, u64 type)
2623 {
2624         u64 chunk_offset;
2625         u64 chunk_size;
2626         u64 stripe_size;
2627         struct map_lookup *map;
2628         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2629         int ret;
2630
2631         ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2632                               &chunk_offset);
2633         if (ret)
2634                 return ret;
2635
2636         ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2637                                   &stripe_size, chunk_offset, type);
2638         if (ret)
2639                 return ret;
2640
2641         ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2642                                    chunk_size, stripe_size);
2643         BUG_ON(ret);
2644         return 0;
2645 }
2646
2647 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
2648                                          struct btrfs_root *root,
2649                                          struct btrfs_device *device)
2650 {
2651         u64 chunk_offset;
2652         u64 sys_chunk_offset;
2653         u64 chunk_size;
2654         u64 sys_chunk_size;
2655         u64 stripe_size;
2656         u64 sys_stripe_size;
2657         u64 alloc_profile;
2658         struct map_lookup *map;
2659         struct map_lookup *sys_map;
2660         struct btrfs_fs_info *fs_info = root->fs_info;
2661         struct btrfs_root *extent_root = fs_info->extent_root;
2662         int ret;
2663
2664         ret = find_next_chunk(fs_info->chunk_root,
2665                               BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
2666         BUG_ON(ret);
2667
2668         alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
2669                         (fs_info->metadata_alloc_profile &
2670                          fs_info->avail_metadata_alloc_bits);
2671         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2672
2673         ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2674                                   &stripe_size, chunk_offset, alloc_profile);
2675         BUG_ON(ret);
2676
2677         sys_chunk_offset = chunk_offset + chunk_size;
2678
2679         alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
2680                         (fs_info->system_alloc_profile &
2681                          fs_info->avail_system_alloc_bits);
2682         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2683
2684         ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
2685                                   &sys_chunk_size, &sys_stripe_size,
2686                                   sys_chunk_offset, alloc_profile);
2687         BUG_ON(ret);
2688
2689         ret = btrfs_add_device(trans, fs_info->chunk_root, device);
2690         BUG_ON(ret);
2691
2692         /*
2693          * Modifying the chunk tree requires allocating new blocks from
2694          * both the system block group and the metadata block group, so we
2695          * can only do operations that modify the chunk tree after both
2696          * block groups have been created.
2697          */
2698         ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2699                                    chunk_size, stripe_size);
2700         BUG_ON(ret);
2701
2702         ret = __finish_chunk_alloc(trans, extent_root, sys_map,
2703                                    sys_chunk_offset, sys_chunk_size,
2704                                    sys_stripe_size);
2705         BUG_ON(ret);
2706         return 0;
2707 }
2708
2709 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
2710 {
2711         struct extent_map *em;
2712         struct map_lookup *map;
2713         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
2714         int readonly = 0;
2715         int i;
2716
2717         read_lock(&map_tree->map_tree.lock);
2718         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2719         read_unlock(&map_tree->map_tree.lock);
2720         if (!em)
2721                 return 1;
2722
2723         if (btrfs_test_opt(root, DEGRADED)) {
2724                 free_extent_map(em);
2725                 return 0;
2726         }
2727
2728         map = (struct map_lookup *)em->bdev;
2729         for (i = 0; i < map->num_stripes; i++) {
2730                 if (!map->stripes[i].dev->writeable) {
2731                         readonly = 1;
2732                         break;
2733                 }
2734         }
2735         free_extent_map(em);
2736         return readonly;
2737 }
2738
2739 void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
2740 {
2741         extent_map_tree_init(&tree->map_tree, GFP_NOFS);
2742 }
2743
2744 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
2745 {
2746         struct extent_map *em;
2747
2748         while (1) {
2749                 write_lock(&tree->map_tree.lock);
2750                 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
2751                 if (em)
2752                         remove_extent_mapping(&tree->map_tree, em);
2753                 write_unlock(&tree->map_tree.lock);
2754                 if (!em)
2755                         break;
2756                 kfree(em->bdev);
2757                 /* once for us */
2758                 free_extent_map(em);
2759                 /* once for the tree */
2760                 free_extent_map(em);
2761         }
2762 }
2763
2764 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
2765 {
2766         struct extent_map *em;
2767         struct map_lookup *map;
2768         struct extent_map_tree *em_tree = &map_tree->map_tree;
2769         int ret;
2770
2771         read_lock(&em_tree->lock);
2772         em = lookup_extent_mapping(em_tree, logical, len);
2773         read_unlock(&em_tree->lock);
2774         BUG_ON(!em);
2775
2776         BUG_ON(em->start > logical || em->start + em->len < logical);
2777         map = (struct map_lookup *)em->bdev;
2778         if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
2779                 ret = map->num_stripes;
2780         else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
2781                 ret = map->sub_stripes;
2782         else
2783                 ret = 1;
2784         free_extent_map(em);
2785         return ret;
2786 }
2787
2788 static int find_live_mirror(struct map_lookup *map, int first, int num,
2789                             int optimal)
2790 {
2791         int i;
2792         if (map->stripes[optimal].dev->bdev)
2793                 return optimal;
2794         for (i = first; i < first + num; i++) {
2795                 if (map->stripes[i].dev->bdev)
2796                         return i;
2797         }
2798         /* we couldn't find one that doesn't fail.  Just return something
2799          * and the io error handling code will clean up eventually
2800          */
2801         return optimal;
2802 }
2803
2804 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2805                              u64 logical, u64 *length,
2806                              struct btrfs_multi_bio **multi_ret,
2807                              int mirror_num, struct page *unplug_page)
2808 {
2809         struct extent_map *em;
2810         struct map_lookup *map;
2811         struct extent_map_tree *em_tree = &map_tree->map_tree;
2812         u64 offset;
2813         u64 stripe_offset;
2814         u64 stripe_end_offset;
2815         u64 stripe_nr;
2816         u64 stripe_nr_orig;
2817         u64 stripe_nr_end;
2818         int stripes_allocated = 8;
2819         int stripes_required = 1;
2820         int stripe_index;
2821         int i;
2822         int num_stripes;
2823         int max_errors = 0;
2824         struct btrfs_multi_bio *multi = NULL;
2825
2826         if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
2827                 stripes_allocated = 1;
2828 again:
2829         if (multi_ret) {
2830                 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
2831                                 GFP_NOFS);
2832                 if (!multi)
2833                         return -ENOMEM;
2834
2835                 atomic_set(&multi->error, 0);
2836         }
2837
2838         read_lock(&em_tree->lock);
2839         em = lookup_extent_mapping(em_tree, logical, *length);
2840         read_unlock(&em_tree->lock);
2841
2842         if (!em && unplug_page) {
2843                 kfree(multi);
2844                 return 0;
2845         }
2846
2847         if (!em) {
2848                 printk(KERN_CRIT "unable to find logical %llu len %llu\n",
2849                        (unsigned long long)logical,
2850                        (unsigned long long)*length);
2851                 BUG();
2852         }
2853
2854         BUG_ON(em->start > logical || em->start + em->len < logical);
2855         map = (struct map_lookup *)em->bdev;
2856         offset = logical - em->start;
2857
2858         if (mirror_num > map->num_stripes)
2859                 mirror_num = 0;
2860
2861         /* if our multi bio struct is too small, back off and try again */
2862         if (rw & REQ_WRITE) {
2863                 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2864                                  BTRFS_BLOCK_GROUP_DUP)) {
2865                         stripes_required = map->num_stripes;
2866                         max_errors = 1;
2867                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2868                         stripes_required = map->sub_stripes;
2869                         max_errors = 1;
2870                 }
2871         }
2872         if (rw & REQ_DISCARD) {
2873                 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2874                                  BTRFS_BLOCK_GROUP_RAID1 |
2875                                  BTRFS_BLOCK_GROUP_DUP |
2876                                  BTRFS_BLOCK_GROUP_RAID10)) {
2877                         stripes_required = map->num_stripes;
2878                 }
2879         }
2880         if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
2881             stripes_allocated < stripes_required) {
2882                 stripes_allocated = map->num_stripes;
2883                 free_extent_map(em);
2884                 kfree(multi);
2885                 goto again;
2886         }
2887         stripe_nr = offset;
2888         /*
2889          * stripe_nr counts the total number of stripes we have to stride
2890          * to get to this block
2891          */
2892         do_div(stripe_nr, map->stripe_len);
2893
2894         stripe_offset = stripe_nr * map->stripe_len;
2895         BUG_ON(offset < stripe_offset);
2896
2897         /* stripe_offset is the offset of this block in its stripe */
2898         stripe_offset = offset - stripe_offset;
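        /*
         * e.g. with a 64K stripe_len and offset == 150K: stripe_nr == 2
         * and stripe_offset == 150K - 128K == 22K into the third stripe.
         */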
2899
2900         if (rw & REQ_DISCARD)
2901                 *length = min_t(u64, em->len - offset, *length);
2902         else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2903                               BTRFS_BLOCK_GROUP_RAID1 |
2904                               BTRFS_BLOCK_GROUP_RAID10 |
2905                               BTRFS_BLOCK_GROUP_DUP)) {
2906                 /* we limit the length of each bio to what fits in a stripe */
2907                 *length = min_t(u64, em->len - offset,
2908                                 map->stripe_len - stripe_offset);
2909         } else {
2910                 *length = em->len - offset;
2911         }
2912
2913         if (!multi_ret && !unplug_page)
2914                 goto out;
2915
2916         num_stripes = 1;
2917         stripe_index = 0;
2918         stripe_nr_orig = stripe_nr;
2919         stripe_nr_end = (offset + *length + map->stripe_len - 1) &
2920                         (~(map->stripe_len - 1));
2921         do_div(stripe_nr_end, map->stripe_len);
2922         stripe_end_offset = stripe_nr_end * map->stripe_len -
2923                             (offset + *length);
2924         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2925                 if (rw & REQ_DISCARD)
2926                         num_stripes = min_t(u64, map->num_stripes,
2927                                             stripe_nr_end - stripe_nr_orig);
2928                 stripe_index = do_div(stripe_nr, map->num_stripes);
2929         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2930                 if (unplug_page || (rw & (REQ_WRITE | REQ_DISCARD)))
2931                         num_stripes = map->num_stripes;
2932                 else if (mirror_num)
2933                         stripe_index = mirror_num - 1;
2934                 else {
2935                         stripe_index = find_live_mirror(map, 0,
2936                                             map->num_stripes,
2937                                             current->pid % map->num_stripes);
2938                 }
2939
2940         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2941                 if (rw & (REQ_WRITE | REQ_DISCARD))
2942                         num_stripes = map->num_stripes;
2943                 else if (mirror_num)
2944                         stripe_index = mirror_num - 1;
2945
2946         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2947                 int factor = map->num_stripes / map->sub_stripes;
2948
2949                 stripe_index = do_div(stripe_nr, factor);
2950                 stripe_index *= map->sub_stripes;
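                /*
                 * example with assumed numbers: num_stripes = 4 and
                 * sub_stripes = 2 give factor = 2 mirror pairs.  for
                 * logical stripe 5, do_div leaves stripe_nr = 2 and
                 * returns 5 % 2 = 1, so stripe_index = 1 * 2 = 2, the
                 * first device of the second pair
                 */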
2951
2952                 if (unplug_page || (rw & REQ_WRITE))
2953                         num_stripes = map->sub_stripes;
2954                 else if (rw & REQ_DISCARD)
2955                         num_stripes = min_t(u64, map->sub_stripes *
2956                                             (stripe_nr_end - stripe_nr_orig),
2957                                             map->num_stripes);
2958                 else if (mirror_num)
2959                         stripe_index += mirror_num - 1;
2960                 else {
2961                         stripe_index = find_live_mirror(map, stripe_index,
2962                                               map->sub_stripes, stripe_index +
2963                                               current->pid % map->sub_stripes);
2964                 }
2965         } else {
2966                 /*
2967                  * after this do_div call, stripe_nr is the number of stripes
2968                  * on this device we have to walk to find the data, and
2969                  * stripe_index is the number of our device in the stripe array
2970                  */
2971                 stripe_index = do_div(stripe_nr, map->num_stripes);
2972         }
2973         BUG_ON(stripe_index >= map->num_stripes);
2974
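        /*
         * for a discard we fill one stripe entry per device touched in
         * [stripe_nr, stripe_nr_end).  each entry's length is whatever
         * that device holds of the range: all of *length for mirrored
         * profiles, or the per-device share for RAID0/RAID10, trimmed
         * by stripe_offset at the head and stripe_end_offset at the
         * tail
         */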
2975         if (rw & REQ_DISCARD) {
2976                 for (i = 0; i < num_stripes; i++) {
2977                         multi->stripes[i].physical =
2978                                 map->stripes[stripe_index].physical +
2979                                 stripe_offset + stripe_nr * map->stripe_len;
2980                         multi->stripes[i].dev = map->stripes[stripe_index].dev;
2981
2982                         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2983                                 u64 stripes;
2984                                 u32 last_stripe = 0;
2985                                 int j;
2986
2987                                 div_u64_rem(stripe_nr_end - 1,
2988                                             map->num_stripes,
2989                                             &last_stripe);
2990
2991                                 for (j = 0; j < map->num_stripes; j++) {
2992                                         u32 test;
2993
2994                                         div_u64_rem(stripe_nr_end - 1 - j,
2995                                                     map->num_stripes, &test);
2996                                         if (test == stripe_index)
2997                                                 break;
2998                                 }
2999                                 stripes = stripe_nr_end - 1 - j;
3000                                 do_div(stripes, map->num_stripes);
3001                                 multi->stripes[i].length = map->stripe_len *
3002                                         (stripes - stripe_nr + 1);
3003
3004                                 if (i == 0) {
3005                                         multi->stripes[i].length -=
3006                                                 stripe_offset;
3007                                         stripe_offset = 0;
3008                                 }
3009                                 if (stripe_index == last_stripe)
3010                                         multi->stripes[i].length -=
3011                                                 stripe_end_offset;
3012                         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3013                                 u64 stripes;
3014                                 int j;
3015                                 int factor = map->num_stripes /
3016                                              map->sub_stripes;
3017                                 u32 last_stripe = 0;
3018
3019                                 div_u64_rem(stripe_nr_end - 1,
3020                                             factor, &last_stripe);
3021                                 last_stripe *= map->sub_stripes;
3022
3023                                 for (j = 0; j < factor; j++) {
3024                                         u32 test;
3025
3026                                         div_u64_rem(stripe_nr_end - 1 - j,
3027                                                     factor, &test);
3028
3029                                         if (test ==
3030                                             stripe_index / map->sub_stripes)
3031                                                 break;
3032                                 }
3033                                 stripes = stripe_nr_end - 1 - j;
3034                                 do_div(stripes, factor);
3035                                 multi->stripes[i].length = map->stripe_len *
3036                                         (stripes - stripe_nr + 1);
3037
3038                                 if (i < map->sub_stripes) {
3039                                         multi->stripes[i].length -=
3040                                                 stripe_offset;
3041                                         if (i == map->sub_stripes - 1)
3042                                                 stripe_offset = 0;
3043                                 }
3044                                 if (stripe_index >= last_stripe &&
3045                                     stripe_index <= (last_stripe +
3046                                                      map->sub_stripes - 1)) {
3047                                         multi->stripes[i].length -=
3048                                                 stripe_end_offset;
3049                                 }
3050                         } else
3051                                 multi->stripes[i].length = *length;
3052
3053                         stripe_index++;
3054                         if (stripe_index == map->num_stripes) {
3055                                 /* This could only happen for RAID0/10 */
3056                                 stripe_index = 0;
3057                                 stripe_nr++;
3058                         }
3059                 }
3060         } else {
3061                 for (i = 0; i < num_stripes; i++) {
3062                         if (unplug_page) {
3063                                 struct btrfs_device *device;
3064                                 struct backing_dev_info *bdi;
3065
3066                                 device = map->stripes[stripe_index].dev;
3067                                 if (device->bdev) {
3068                                         bdi = blk_get_backing_dev_info(device->
3069                                                                        bdev);
3070                                         if (bdi->unplug_io_fn)
3071                                                 bdi->unplug_io_fn(bdi,
3072                                                                   unplug_page);
3073                                 }
3074                         } else {
3075                                 multi->stripes[i].physical =
3076                                         map->stripes[stripe_index].physical +
3077                                         stripe_offset +
3078                                         stripe_nr * map->stripe_len;
3079                                 multi->stripes[i].dev =
3080                                         map->stripes[stripe_index].dev;
3081                         }
3082                         stripe_index++;
3083                 }
3084         }
3085         if (multi_ret) {
3086                 *multi_ret = multi;
3087                 multi->num_stripes = num_stripes;
3088                 multi->max_errors = max_errors;
3089         }
3090 out:
3091         free_extent_map(em);
3092         return 0;
3093 }
3094
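/*
 * map a logical extent to its physical stripes.  this is just
 * __btrfs_map_block() without a readahead unplug page; on return
 * *length is clamped to what a single mapping can cover
 */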
3095 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3096                       u64 logical, u64 *length,
3097                       struct btrfs_multi_bio **multi_ret, int mirror_num)
3098 {
3099         return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
3100                                  mirror_num, NULL);
3101 }
3102
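/*
 * reverse mapping: given a physical byte offset inside the chunk at
 * chunk_start, collect every logical address that maps to it.  a devid
 * of zero matches any device.  *logical gets an allocated array (at
 * most one entry per stripe) that the caller must kfree()
 */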
3103 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3104                      u64 chunk_start, u64 physical, u64 devid,
3105                      u64 **logical, int *naddrs, int *stripe_len)
3106 {
3107         struct extent_map_tree *em_tree = &map_tree->map_tree;
3108         struct extent_map *em;
3109         struct map_lookup *map;
3110         u64 *buf;
3111         u64 bytenr;
3112         u64 length;
3113         u64 stripe_nr;
3114         int i, j, nr = 0;
3115
3116         read_lock(&em_tree->lock);
3117         em = lookup_extent_mapping(em_tree, chunk_start, 1);
3118         read_unlock(&em_tree->lock);
3119
3120         BUG_ON(!em || em->start != chunk_start);
3121         map = (struct map_lookup *)em->bdev;
3122
3123         length = em->len;
3124         if (map->type & BTRFS_BLOCK_GROUP_RAID10)
3125                 do_div(length, map->num_stripes / map->sub_stripes);
3126         else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
3127                 do_div(length, map->num_stripes);
3128
3129         buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
3130         BUG_ON(!buf);
3131
3132         for (i = 0; i < map->num_stripes; i++) {
3133                 if (devid && map->stripes[i].dev->devid != devid)
3134                         continue;
3135                 if (map->stripes[i].physical > physical ||
3136                     map->stripes[i].physical + length <= physical)
3137                         continue;
3138
3139                 stripe_nr = physical - map->stripes[i].physical;
3140                 do_div(stripe_nr, map->stripe_len);
3141
3142                 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3143                         stripe_nr = stripe_nr * map->num_stripes + i;
3144                         do_div(stripe_nr, map->sub_stripes);
3145                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3146                         stripe_nr = stripe_nr * map->num_stripes + i;
3147                 }
3148                 bytenr = chunk_start + stripe_nr * map->stripe_len;
3149                 WARN_ON(nr >= map->num_stripes);
3150                 for (j = 0; j < nr; j++) {
3151                         if (buf[j] == bytenr)
3152                                 break;
3153                 }
3154                 if (j == nr) {
3155                         WARN_ON(nr >= map->num_stripes);
3156                         buf[nr++] = bytenr;
3157                 }
3158         }
3159
3160         *logical = buf;
3161         *naddrs = nr;
3162         *stripe_len = map->stripe_len;
3163
3164         free_extent_map(em);
3165         return 0;
3166 }
3167
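/*
 * walk the stripes backing the single page at 'logical' and kick each
 * backing device's unplug_io_fn so any queued io is started
 */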
3168 int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
3169                       u64 logical, struct page *page)
3170 {
3171         u64 length = PAGE_CACHE_SIZE;
3172         return __btrfs_map_block(map_tree, READ, logical, &length,
3173                                  NULL, 0, page);
3174 }
3175
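/*
 * completion handler shared by all the bios of a multi-stripe request.
 * each failure bumps multi->error; when the last stripe finishes, the
 * original bio is restored and completed, reporting -EIO only if more
 * stripes failed than multi->max_errors tolerates
 */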
3176 static void end_bio_multi_stripe(struct bio *bio, int err)
3177 {
3178         struct btrfs_multi_bio *multi = bio->bi_private;
3179         int is_orig_bio = 0;
3180
3181         if (err)
3182                 atomic_inc(&multi->error);
3183
3184         if (bio == multi->orig_bio)
3185                 is_orig_bio = 1;
3186
3187         if (atomic_dec_and_test(&multi->stripes_pending)) {
3188                 if (!is_orig_bio) {
3189                         bio_put(bio);
3190                         bio = multi->orig_bio;
3191                 }
3192                 bio->bi_private = multi->private;
3193                 bio->bi_end_io = multi->end_io;
3194                 /* only send an error to the higher layers if it is
3195                  * beyond the tolerance of the multi-bio
3196                  */
3197                 if (atomic_read(&multi->error) > multi->max_errors) {
3198                         err = -EIO;
3199                 } else if (err) {
3200                         /*
3201                          * this bio is actually up to date, we didn't
3202                          * go over the max number of errors
3203                          */
3204                         set_bit(BIO_UPTODATE, &bio->bi_flags);
3205                         err = 0;
3206                 }
3207                 kfree(multi);
3208
3209                 bio_endio(bio, err);
3210         } else if (!is_orig_bio) {
3211                 bio_put(bio);
3212         }
3213 }
3214
3215 struct async_sched {
3216         struct bio *bio;
3217         int rw;
3218         struct btrfs_fs_info *info;
3219         struct btrfs_work work;
3220 };
3221
3222 /*
3223  * see run_scheduled_bios for a description of why bios are collected for
3224  * async submit.
3225  *
3226  * This will add one bio to the pending list for a device and make sure
3227  * the work struct is scheduled.
3228  */
3229 static noinline int schedule_bio(struct btrfs_root *root,
3230                                  struct btrfs_device *device,
3231                                  int rw, struct bio *bio)
3232 {
3233         int should_queue = 1;
3234         struct btrfs_pending_bios *pending_bios;
3235
3236         /* don't bother with additional async steps for reads, right now */
3237         if (!(rw & REQ_WRITE)) {
3238                 bio_get(bio);
3239                 submit_bio(rw, bio);
3240                 bio_put(bio);
3241                 return 0;
3242         }
3243
3244         /*
3245          * nr_async_bios allows us to reliably return congestion to the
3246          * higher layers.  Otherwise, the async bio makes it appear we have
3247          * made progress against dirty pages when we've really just put it
3248          * on a queue for later
3249          */
3250         atomic_inc(&root->fs_info->nr_async_bios);
3251         WARN_ON(bio->bi_next);
3252         bio->bi_next = NULL;
3253         bio->bi_rw |= rw;
3254
3255         spin_lock(&device->io_lock);
3256         if (bio->bi_rw & REQ_SYNC)
3257                 pending_bios = &device->pending_sync_bios;
3258         else
3259                 pending_bios = &device->pending_bios;
3260
3261         if (pending_bios->tail)
3262                 pending_bios->tail->bi_next = bio;
3263
3264         pending_bios->tail = bio;
3265         if (!pending_bios->head)
3266                 pending_bios->head = bio;
3267         if (device->running_pending)
3268                 should_queue = 0;
3269
3270         spin_unlock(&device->io_lock);
3271
3272         if (should_queue)
3273                 btrfs_queue_worker(&root->fs_info->submit_workers,
3274                                    &device->work);
3275         return 0;
3276 }
3277
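/*
 * top level bio submission: map the bio onto its physical stripes and
 * send one bio per stripe, cloning for all but the last, which reuses
 * the original.  when more than one stripe is involved every bio
 * completes through end_bio_multi_stripe(), and async_submit defers
 * submission through schedule_bio() instead of submitting directly
 */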
3278 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3279                   int mirror_num, int async_submit)
3280 {
3281         struct btrfs_mapping_tree *map_tree;
3282         struct btrfs_device *dev;
3283         struct bio *first_bio = bio;
3284         u64 logical = (u64)bio->bi_sector << 9;
3285         u64 length = 0;
3286         u64 map_length;
3287         struct btrfs_multi_bio *multi = NULL;
3288         int ret;
3289         int dev_nr = 0;
3290         int total_devs = 1;
3291
3292         length = bio->bi_size;
3293         map_tree = &root->fs_info->mapping_tree;
3294         map_length = length;
3295
3296         ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
3297                               mirror_num);
3298         BUG_ON(ret);
3299
3300         total_devs = multi->num_stripes;
3301         if (map_length < length) {
3302                 printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
3303                        "map len %llu\n", (unsigned long long)logical,
3304                        (unsigned long long)length,
3305                        (unsigned long long)map_length);
3306                 BUG();
3307         }
3308         multi->end_io = first_bio->bi_end_io;
3309         multi->private = first_bio->bi_private;
3310         multi->orig_bio = first_bio;
3311         atomic_set(&multi->stripes_pending, multi->num_stripes);
3312
3313         while (dev_nr < total_devs) {
3314                 if (total_devs > 1) {
3315                         if (dev_nr < total_devs - 1) {
3316                                 bio = bio_clone(first_bio, GFP_NOFS);
3317                                 BUG_ON(!bio);
3318                         } else {
3319                                 bio = first_bio;
3320                         }
3321                         bio->bi_private = multi;
3322                         bio->bi_end_io = end_bio_multi_stripe;
3323                 }
3324                 bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
3325                 dev = multi->stripes[dev_nr].dev;
3326                 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
3327                         bio->bi_bdev = dev->bdev;
3328                         if (async_submit)
3329                                 schedule_bio(root, dev, rw, bio);
3330                         else
3331                                 submit_bio(rw, bio);
3332                 } else {
3333                         bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
3334                         bio->bi_sector = logical >> 9;
3335                         bio_endio(bio, -EIO);
3336                 }
3337                 dev_nr++;
3338         }
3339         if (total_devs == 1)
3340                 kfree(multi);
3341         return 0;
3342 }
3343
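/*
 * find a device by devid (and optionally uuid), following the chain of
 * seed fs_devices.  a NULL fsid matches any filesystem in the chain
 */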
3344 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
3345                                        u8 *uuid, u8 *fsid)
3346 {
3347         struct btrfs_device *device;
3348         struct btrfs_fs_devices *cur_devices;
3349
3350         cur_devices = root->fs_info->fs_devices;
3351         while (cur_devices) {
3352                 if (!fsid ||
3353                     !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
3354                         device = __find_device(&cur_devices->devices,
3355                                                devid, uuid);
3356                         if (device)
3357                                 return device;
3358                 }
3359                 cur_devices = cur_devices->seed;
3360         }
3361         return NULL;
3362 }
3363
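/*
 * create a placeholder btrfs_device for a devid the metadata references
 * but that wasn't present at mount time.  it is flagged ->missing so a
 * degraded mount can carry on without the disk
 */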
3364 static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
3365                                             u64 devid, u8 *dev_uuid)
3366 {
3367         struct btrfs_device *device;
3368         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
3369
3370         device = kzalloc(sizeof(*device), GFP_NOFS);
3371         if (!device)
3372                 return NULL;
3373         list_add(&device->dev_list,
3374                  &fs_devices->devices);
3375         device->dev_root = root->fs_info->dev_root;
3376         device->devid = devid;
3377         device->work.func = pending_bios_fn;
3378         device->fs_devices = fs_devices;
3379         device->missing = 1;
3380         fs_devices->num_devices++;
3381         fs_devices->missing_devices++;
3382         spin_lock_init(&device->io_lock);
3383         INIT_LIST_HEAD(&device->dev_alloc_list);
3384         memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
3385         return device;
3386 }
3387
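/*
 * turn one chunk item (from the chunk tree or the sys array) into a
 * map_lookup and insert it into the logical->physical mapping tree.
 * stripes on absent devices are fatal (-EIO) unless we are mounted
 * degraded, in which case placeholder devices are created instead
 */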
3388 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
3389                           struct extent_buffer *leaf,
3390                           struct btrfs_chunk *chunk)
3391 {
3392         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
3393         struct map_lookup *map;
3394         struct extent_map *em;
3395         u64 logical;
3396         u64 length;
3397         u64 devid;
3398         u8 uuid[BTRFS_UUID_SIZE];
3399         int num_stripes;
3400         int ret;
3401         int i;
3402
3403         logical = key->offset;
3404         length = btrfs_chunk_length(leaf, chunk);
3405
3406         read_lock(&map_tree->map_tree.lock);
3407         em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
3408         read_unlock(&map_tree->map_tree.lock);
3409
3410         /* already mapped? */
3411         if (em && em->start <= logical && em->start + em->len > logical) {
3412                 free_extent_map(em);
3413                 return 0;
3414         } else if (em) {
3415                 free_extent_map(em);
3416         }
3417
3418         em = alloc_extent_map(GFP_NOFS);
3419         if (!em)
3420                 return -ENOMEM;
3421         num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3422         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
3423         if (!map) {
3424                 free_extent_map(em);
3425                 return -ENOMEM;
3426         }
3427
3428         em->bdev = (struct block_device *)map;
3429         em->start = logical;
3430         em->len = length;
3431         em->block_start = 0;
3432         em->block_len = em->len;
3433
3434         map->num_stripes = num_stripes;
3435         map->io_width = btrfs_chunk_io_width(leaf, chunk);
3436         map->io_align = btrfs_chunk_io_align(leaf, chunk);
3437         map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
3438         map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
3439         map->type = btrfs_chunk_type(leaf, chunk);
3440         map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
3441         for (i = 0; i < num_stripes; i++) {
3442                 map->stripes[i].physical =
3443                         btrfs_stripe_offset_nr(leaf, chunk, i);
3444                 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
3445                 read_extent_buffer(leaf, uuid, (unsigned long)
3446                                    btrfs_stripe_dev_uuid_nr(chunk, i),
3447                                    BTRFS_UUID_SIZE);
3448                 map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
3449                                                         NULL);
3450                 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
3451                         kfree(map);
3452                         free_extent_map(em);
3453                         return -EIO;
3454                 }
3455                 if (!map->stripes[i].dev) {
3456                         map->stripes[i].dev =
3457                                 add_missing_dev(root, devid, uuid);
3458                         if (!map->stripes[i].dev) {
3459                                 kfree(map);
3460                                 free_extent_map(em);
3461                                 return -EIO;
3462                         }
3463                 }
3464                 map->stripes[i].dev->in_fs_metadata = 1;
3465         }
3466
3467         write_lock(&map_tree->map_tree.lock);
3468         ret = add_extent_mapping(&map_tree->map_tree, em);
3469         write_unlock(&map_tree->map_tree.lock);
3470         BUG_ON(ret);
3471         free_extent_map(em);
3472
3473         return 0;
3474 }
3475
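/*
 * copy the persistent fields of a dev item out of the leaf into the
 * in-memory btrfs_device
 */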
3476 static int fill_device_from_item(struct extent_buffer *leaf,
3477                                  struct btrfs_dev_item *dev_item,
3478                                  struct btrfs_device *device)
3479 {
3480         unsigned long ptr;
3481
3482         device->devid = btrfs_device_id(leaf, dev_item);
3483         device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
3484         device->total_bytes = device->disk_total_bytes;
3485         device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
3486         device->type = btrfs_device_type(leaf, dev_item);
3487         device->io_align = btrfs_device_io_align(leaf, dev_item);
3488         device->io_width = btrfs_device_io_width(leaf, dev_item);
3489         device->sector_size = btrfs_device_sector_size(leaf, dev_item);
3490
3491         ptr = (unsigned long)btrfs_device_uuid(dev_item);
3492         read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
3493
3494         return 0;
3495 }
3496
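/*
 * a dev item with a foreign fsid means this filesystem was grown from
 * a seed device.  find (or clone and open read-only) the matching
 * fs_devices and link it into our seed chain so its chunks stay
 * readable
 */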
3497 static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
3498 {
3499         struct btrfs_fs_devices *fs_devices;
3500         int ret;
3501
3502         mutex_lock(&uuid_mutex);
3503
3504         fs_devices = root->fs_info->fs_devices->seed;
3505         while (fs_devices) {
3506                 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
3507                         ret = 0;
3508                         goto out;
3509                 }
3510                 fs_devices = fs_devices->seed;
3511         }
3512
3513         fs_devices = find_fsid(fsid);
3514         if (!fs_devices) {
3515                 ret = -ENOENT;
3516                 goto out;
3517         }
3518
3519         fs_devices = clone_fs_devices(fs_devices);
3520         if (IS_ERR(fs_devices)) {
3521                 ret = PTR_ERR(fs_devices);
3522                 goto out;
3523         }
3524
3525         ret = __btrfs_open_devices(fs_devices, FMODE_READ,
3526                                    root->fs_info->bdev_holder);
3527         if (ret)
3528                 goto out;
3529
3530         if (!fs_devices->seeding) {
3531                 __btrfs_close_devices(fs_devices);
3532                 free_fs_devices(fs_devices);
3533                 ret = -EINVAL;
3534                 goto out;
3535         }
3536
3537         fs_devices->seed = root->fs_info->fs_devices->seed;
3538         root->fs_info->fs_devices->seed = fs_devices;
3539 out:
3540         mutex_unlock(&uuid_mutex);
3541         return ret;
3542 }
3543
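/*
 * process one dev item from the chunk tree: hook up seed devices when
 * the fsid is foreign, match (or, for degraded mounts, fabricate) the
 * btrfs_device and refresh its in-memory fields from the item
 */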
3544 static int read_one_dev(struct btrfs_root *root,
3545                         struct extent_buffer *leaf,
3546                         struct btrfs_dev_item *dev_item)
3547 {
3548         struct btrfs_device *device;
3549         u64 devid;
3550         int ret;
3551         u8 fs_uuid[BTRFS_UUID_SIZE];
3552         u8 dev_uuid[BTRFS_UUID_SIZE];
3553
3554         devid = btrfs_device_id(leaf, dev_item);
3555         read_extent_buffer(leaf, dev_uuid,
3556                            (unsigned long)btrfs_device_uuid(dev_item),
3557                            BTRFS_UUID_SIZE);
3558         read_extent_buffer(leaf, fs_uuid,
3559                            (unsigned long)btrfs_device_fsid(dev_item),
3560                            BTRFS_UUID_SIZE);
3561
3562         if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
3563                 ret = open_seed_devices(root, fs_uuid);
3564                 if (ret && !btrfs_test_opt(root, DEGRADED))
3565                         return ret;
3566         }
3567
3568         device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
3569         if (!device || !device->bdev) {
3570                 if (!btrfs_test_opt(root, DEGRADED))
3571                         return -EIO;
3572
3573                 if (!device) {
3574                         printk(KERN_WARNING "devid %llu missing\n",
3575                                (unsigned long long)devid);
3576                         device = add_missing_dev(root, devid, dev_uuid);
3577                         if (!device)
3578                                 return -ENOMEM;
3579                 } else if (!device->missing) {
3580                         /*
3581                          * this happens when a device that was properly set up
3582                          * in the device info lists suddenly goes bad.
3583                          * device->bdev is NULL, and so we have to set
3584                          * device->missing to one here
3585                          */
3586                         root->fs_info->fs_devices->missing_devices++;
3587                         device->missing = 1;
3588                 }
3589         }
3590
3591         if (device->fs_devices != root->fs_info->fs_devices) {
3592                 BUG_ON(device->writeable);
3593                 if (device->generation !=
3594                     btrfs_device_generation(leaf, dev_item))
3595                         return -EINVAL;
3596         }
3597
3598         fill_device_from_item(leaf, dev_item, device);
3599         device->dev_root = root->fs_info->dev_root;
3600         device->in_fs_metadata = 1;
3601         if (device->writeable)
3602                 device->fs_devices->total_rw_bytes += device->total_bytes;
3603         ret = 0;
3604         return ret;
3605 }
3606
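/*
 * the superblock embeds a copy of this device's own dev item; read it
 * just like a chunk tree dev item
 */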
3607 int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
3608 {
3609         struct btrfs_dev_item *dev_item;
3610
3611         dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
3612                                                      dev_item);
3613         return read_one_dev(root, buf, dev_item);
3614 }
3615
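/*
 * the superblock also embeds the chunk items needed to bootstrap
 * reading the chunk tree.  walk sys_chunk_array, a packed sequence of
 * (disk key, chunk item) pairs, and map each one with read_one_chunk()
 */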
3616 int btrfs_read_sys_array(struct btrfs_root *root)
3617 {
3618         struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
3619         struct extent_buffer *sb;
3620         struct btrfs_disk_key *disk_key;
3621         struct btrfs_chunk *chunk;
3622         u8 *ptr;
3623         unsigned long sb_ptr;
3624         int ret = 0;
3625         u32 num_stripes;
3626         u32 array_size;
3627         u32 len = 0;
3628         u32 cur;
3629         struct btrfs_key key;
3630
3631         sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
3632                                           BTRFS_SUPER_INFO_SIZE);
3633         if (!sb)
3634                 return -ENOMEM;
3635         btrfs_set_buffer_uptodate(sb);
3636         btrfs_set_buffer_lockdep_class(sb, 0);
3637
3638         write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
3639         array_size = btrfs_super_sys_array_size(super_copy);
3640
3641         ptr = super_copy->sys_chunk_array;
3642         sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
3643         cur = 0;
3644
3645         while (cur < array_size) {
3646                 disk_key = (struct btrfs_disk_key *)ptr;
3647                 btrfs_disk_key_to_cpu(&key, disk_key);
3648
3649                 len = sizeof(*disk_key); ptr += len;
3650                 sb_ptr += len;
3651                 cur += len;
3652
3653                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
3654                         chunk = (struct btrfs_chunk *)sb_ptr;
3655                         ret = read_one_chunk(root, &key, sb, chunk);
3656                         if (ret)
3657                                 break;
3658                         num_stripes = btrfs_chunk_num_stripes(sb, chunk);
3659                         len = btrfs_chunk_item_size(num_stripes);
3660                 } else {
3661                         ret = -EIO;
3662                         break;
3663                 }
3664                 ptr += len;
3665                 sb_ptr += len;
3666                 cur += len;
3667         }
3668         free_extent_buffer(sb);
3669         return ret;
3670 }
3671
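/*
 * populate the device list and the chunk mapping tree from the chunk
 * root: one pass over the dev items, then a second pass over the chunk
 * items, so every chunk can reference the devices it sits on
 */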
3672 int btrfs_read_chunk_tree(struct btrfs_root *root)
3673 {
3674         struct btrfs_path *path;
3675         struct extent_buffer *leaf;
3676         struct btrfs_key key;
3677         struct btrfs_key found_key;
3678         int ret;
3679         int slot;
3680
3681         root = root->fs_info->chunk_root;
3682
3683         path = btrfs_alloc_path();
3684         if (!path)
3685                 return -ENOMEM;
3686
3687         /* first we search for all of the device items, and then we
3688          * read in all of the chunk items.  This way we can create chunk
3689          * mappings that reference all of the devices that are found
3690          */
3691         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
3692         key.offset = 0;
3693         key.type = 0;
3694 again:
3695         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3696         if (ret < 0)
3697                 goto error;
3698         while (1) {
3699                 leaf = path->nodes[0];
3700                 slot = path->slots[0];
3701                 if (slot >= btrfs_header_nritems(leaf)) {
3702                         ret = btrfs_next_leaf(root, path);
3703                         if (ret == 0)
3704                                 continue;
3705                         if (ret < 0)
3706                                 goto error;
3707                         break;
3708                 }
3709                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3710                 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3711                         if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
3712                                 break;
3713                         if (found_key.type == BTRFS_DEV_ITEM_KEY) {
3714                                 struct btrfs_dev_item *dev_item;
3715                                 dev_item = btrfs_item_ptr(leaf, slot,
3716                                                   struct btrfs_dev_item);
3717                                 ret = read_one_dev(root, leaf, dev_item);
3718                                 if (ret)
3719                                         goto error;
3720                         }
3721                 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
3722                         struct btrfs_chunk *chunk;
3723                         chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3724                         ret = read_one_chunk(root, &found_key, leaf, chunk);
3725                         if (ret)
3726                                 goto error;
3727                 }
3728                 path->slots[0]++;
3729         }
3730         if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3731                 key.objectid = 0;
3732                 btrfs_release_path(root, path);
3733                 goto again;
3734         }
3735         ret = 0;
3736 error:
3737         btrfs_free_path(path);
3738         return ret;
3739 }