btrfs: fix wrong free space information of btrfs
fs/btrfs/volumes.c (pandora-kernel.git)
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/bio.h>
20 #include <linux/slab.h>
21 #include <linux/buffer_head.h>
22 #include <linux/blkdev.h>
23 #include <linux/random.h>
24 #include <linux/iocontext.h>
25 #include <asm/div64.h>
26 #include "compat.h"
27 #include "ctree.h"
28 #include "extent_map.h"
29 #include "disk-io.h"
30 #include "transaction.h"
31 #include "print-tree.h"
32 #include "volumes.h"
33 #include "async-thread.h"
34
35 struct map_lookup {
36         u64 type;
37         int io_align;
38         int io_width;
39         int stripe_len;
40         int sector_size;
41         int num_stripes;
42         int sub_stripes;
43         struct btrfs_bio_stripe stripes[];
44 };
45
46 static int init_first_rw_device(struct btrfs_trans_handle *trans,
47                                 struct btrfs_root *root,
48                                 struct btrfs_device *device);
49 static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
50
51 #define map_lookup_size(n) (sizeof(struct map_lookup) + \
52                             (sizeof(struct btrfs_bio_stripe) * (n)))
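/*
 * total allocation size for a map_lookup with n stripes, e.g.
 * map_lookup_size(2) == sizeof(struct map_lookup) +
 *                       2 * sizeof(struct btrfs_bio_stripe)
 */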
53
54 static DEFINE_MUTEX(uuid_mutex);
55 static LIST_HEAD(fs_uuids);
56
57 void btrfs_lock_volumes(void)
58 {
59         mutex_lock(&uuid_mutex);
60 }
61
62 void btrfs_unlock_volumes(void)
63 {
64         mutex_unlock(&uuid_mutex);
65 }
66
67 static void lock_chunks(struct btrfs_root *root)
68 {
69         mutex_lock(&root->fs_info->chunk_mutex);
70 }
71
72 static void unlock_chunks(struct btrfs_root *root)
73 {
74         mutex_unlock(&root->fs_info->chunk_mutex);
75 }
76
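/*
 * free every device on the fs_devices list and then the struct itself;
 * the devices must already be closed (opened == 0) before this is called
 */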
77 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
78 {
79         struct btrfs_device *device;
80         WARN_ON(fs_devices->opened);
81         while (!list_empty(&fs_devices->devices)) {
82                 device = list_entry(fs_devices->devices.next,
83                                     struct btrfs_device, dev_list);
84                 list_del(&device->dev_list);
85                 kfree(device->name);
86                 kfree(device);
87         }
88         kfree(fs_devices);
89 }
90
91 int btrfs_cleanup_fs_uuids(void)
92 {
93         struct btrfs_fs_devices *fs_devices;
94
95         while (!list_empty(&fs_uuids)) {
96                 fs_devices = list_entry(fs_uuids.next,
97                                         struct btrfs_fs_devices, list);
98                 list_del(&fs_devices->list);
99                 free_fs_devices(fs_devices);
100         }
101         return 0;
102 }
103
104 static noinline struct btrfs_device *__find_device(struct list_head *head,
105                                                    u64 devid, u8 *uuid)
106 {
107         struct btrfs_device *dev;
108
109         list_for_each_entry(dev, head, dev_list) {
110                 if (dev->devid == devid &&
111                     (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
112                         return dev;
113                 }
114         }
115         return NULL;
116 }
117
118 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
119 {
120         struct btrfs_fs_devices *fs_devices;
121
122         list_for_each_entry(fs_devices, &fs_uuids, list) {
123                 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
124                         return fs_devices;
125         }
126         return NULL;
127 }
128
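/*
 * splice a chain of bios (head..tail) back onto the front of the pending
 * list, preserving submission order; called with device->io_lock held
 */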
129 static void requeue_list(struct btrfs_pending_bios *pending_bios,
130                         struct bio *head, struct bio *tail)
131 {
132
133         struct bio *old_head;
134
135         old_head = pending_bios->head;
136         pending_bios->head = head;
137         if (pending_bios->tail)
138                 tail->bi_next = old_head;
139         else
140                 pending_bios->tail = tail;
141 }
142
143 /*
144  * we try to collect pending bios for a device so we don't get a large
145  * number of procs sending bios down to the same device.  This greatly
146  * improves the scheduler's ability to collect and merge the bios.
147  *
148  * But, it also turns into a long list of bios to process and that is sure
149  * to eventually make the worker thread block.  The solution here is to
150  * make some progress and then put this work struct back at the end of
151  * the list if the block device is congested.  This way, multiple devices
152  * can make progress from a single worker thread.
153  */
154 static noinline int run_scheduled_bios(struct btrfs_device *device)
155 {
156         struct bio *pending;
157         struct backing_dev_info *bdi;
158         struct btrfs_fs_info *fs_info;
159         struct btrfs_pending_bios *pending_bios;
160         struct bio *tail;
161         struct bio *cur;
162         int again = 0;
163         unsigned long num_run;
164         unsigned long num_sync_run;
165         unsigned long batch_run = 0;
166         unsigned long limit;
167         unsigned long last_waited = 0;
168         int force_reg = 0;
169
170         bdi = blk_get_backing_dev_info(device->bdev);
171         fs_info = device->dev_root->fs_info;
172         limit = btrfs_async_submit_limit(fs_info);
173         limit = limit * 2 / 3;
174
175         /* we want to make sure that every time we switch from the sync
176          * list to the normal list, we unplug
177          */
178         num_sync_run = 0;
179
180 loop:
181         spin_lock(&device->io_lock);
182
183 loop_lock:
184         num_run = 0;
185
186         /* take all the bios off the list at once and process them
187          * later on (without the lock held).  But, remember the
188          * tail and other pointers so the bios can be properly reinserted
189          * into the list if we hit congestion
190          */
191         if (!force_reg && device->pending_sync_bios.head) {
192                 pending_bios = &device->pending_sync_bios;
193                 force_reg = 1;
194         } else {
195                 pending_bios = &device->pending_bios;
196                 force_reg = 0;
197         }
198
199         pending = pending_bios->head;
200         tail = pending_bios->tail;
201         WARN_ON(pending && !tail);
202
203         /*
204          * if pending was null this time around, no bios need processing
205          * at all and we can stop.  Otherwise it'll loop back up again
206          * and do an additional check so no bios are missed.
207          *
208          * device->running_pending is used to synchronize with the
209          * schedule_bio code.
210          */
211         if (device->pending_sync_bios.head == NULL &&
212             device->pending_bios.head == NULL) {
213                 again = 0;
214                 device->running_pending = 0;
215         } else {
216                 again = 1;
217                 device->running_pending = 1;
218         }
219
220         pending_bios->head = NULL;
221         pending_bios->tail = NULL;
222
223         spin_unlock(&device->io_lock);
224
225         /*
226          * if we're doing the regular priority list, make sure we unplug
227          * for any high prio bios we've sent down
228          */
229         if (pending_bios == &device->pending_bios && num_sync_run > 0) {
230                 num_sync_run = 0;
231                 blk_run_backing_dev(bdi, NULL);
232         }
233
234         while (pending) {
235
236                 rmb();
237                 /* we want to work on both lists, but do more bios on the
238                  * sync list than the regular list
239                  */
240                 if ((num_run > 32 &&
241                     pending_bios != &device->pending_sync_bios &&
242                     device->pending_sync_bios.head) ||
243                    (num_run > 64 && pending_bios == &device->pending_sync_bios &&
244                     device->pending_bios.head)) {
245                         spin_lock(&device->io_lock);
246                         requeue_list(pending_bios, pending, tail);
247                         goto loop_lock;
248                 }
249
250                 cur = pending;
251                 pending = pending->bi_next;
252                 cur->bi_next = NULL;
253                 atomic_dec(&fs_info->nr_async_bios);
254
255                 if (atomic_read(&fs_info->nr_async_bios) < limit &&
256                     waitqueue_active(&fs_info->async_submit_wait))
257                         wake_up(&fs_info->async_submit_wait);
258
259                 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
260
261                 if (cur->bi_rw & REQ_SYNC)
262                         num_sync_run++;
263
264                 submit_bio(cur->bi_rw, cur);
265                 num_run++;
266                 batch_run++;
267                 if (need_resched()) {
268                         if (num_sync_run) {
269                                 blk_run_backing_dev(bdi, NULL);
270                                 num_sync_run = 0;
271                         }
272                         cond_resched();
273                 }
274
275                 /*
276                  * we made progress, there is more work to do and the bdi
277                  * is now congested.  Back off and let other work structs
278                  * run instead
279                  */
280                 if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
281                     fs_info->fs_devices->open_devices > 1) {
282                         struct io_context *ioc;
283
284                         ioc = current->io_context;
285
286                         /*
287                          * the main goal here is that we don't want to
288                          * block if we're going to be able to submit
289                          * more requests without blocking.
290                          *
291                          * This code does two great things, it pokes into
292                          * the elevator code from a filesystem _and_
293                          * it makes assumptions about how batching works.
294                          */
295                         if (ioc && ioc->nr_batch_requests > 0 &&
296                             time_before(jiffies, ioc->last_waited + HZ/50UL) &&
297                             (last_waited == 0 ||
298                              ioc->last_waited == last_waited)) {
299                                 /*
300                                  * we want to go through our batch of
301                                  * requests and stop.  So, we copy out
302                                  * the ioc->last_waited time and test
303                                  * against it before looping
304                                  */
305                                 last_waited = ioc->last_waited;
306                                 if (need_resched()) {
307                                         if (num_sync_run) {
308                                                 blk_run_backing_dev(bdi, NULL);
309                                                 num_sync_run = 0;
310                                         }
311                                         cond_resched();
312                                 }
313                                 continue;
314                         }
315                         spin_lock(&device->io_lock);
316                         requeue_list(pending_bios, pending, tail);
317                         device->running_pending = 1;
318
319                         spin_unlock(&device->io_lock);
320                         btrfs_requeue_work(&device->work);
321                         goto done;
322                 }
323         }
324
325         if (num_sync_run) {
326                 num_sync_run = 0;
327                 blk_run_backing_dev(bdi, NULL);
328         }
329         /*
330          * IO has already been through a long path to get here.  Checksumming,
331          * async helper threads, perhaps compression.  We've done a pretty
332          * good job of collecting a batch of IO and should just unplug
333          * the device right away.
334          *
335  * This will help anyone who is waiting on the IO; they might have
336          * already unplugged, but managed to do so before the bio they
337          * cared about found its way down here.
338          */
339         blk_run_backing_dev(bdi, NULL);
340
341         cond_resched();
342         if (again)
343                 goto loop;
344
345         spin_lock(&device->io_lock);
346         if (device->pending_bios.head || device->pending_sync_bios.head)
347                 goto loop_lock;
348         spin_unlock(&device->io_lock);
349
350 done:
351         return 0;
352 }
353
354 static void pending_bios_fn(struct btrfs_work *work)
355 {
356         struct btrfs_device *device;
357
358         device = container_of(work, struct btrfs_device, work);
359         run_scheduled_bios(device);
360 }
361
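/*
 * add a newly scanned device to the in-memory device list, creating a new
 * btrfs_fs_devices entry if this is the first device seen for the fsid.
 * If the devid is already known, just refresh the path name.
 */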
362 static noinline int device_list_add(const char *path,
363                            struct btrfs_super_block *disk_super,
364                            u64 devid, struct btrfs_fs_devices **fs_devices_ret)
365 {
366         struct btrfs_device *device;
367         struct btrfs_fs_devices *fs_devices;
368         u64 found_transid = btrfs_super_generation(disk_super);
369         char *name;
370
371         fs_devices = find_fsid(disk_super->fsid);
372         if (!fs_devices) {
373                 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
374                 if (!fs_devices)
375                         return -ENOMEM;
376                 INIT_LIST_HEAD(&fs_devices->devices);
377                 INIT_LIST_HEAD(&fs_devices->alloc_list);
378                 list_add(&fs_devices->list, &fs_uuids);
379                 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
380                 fs_devices->latest_devid = devid;
381                 fs_devices->latest_trans = found_transid;
382                 mutex_init(&fs_devices->device_list_mutex);
383                 device = NULL;
384         } else {
385                 device = __find_device(&fs_devices->devices, devid,
386                                        disk_super->dev_item.uuid);
387         }
388         if (!device) {
389                 if (fs_devices->opened)
390                         return -EBUSY;
391
392                 device = kzalloc(sizeof(*device), GFP_NOFS);
393                 if (!device) {
394                         /* we can safely leave the fs_devices entry around */
395                         return -ENOMEM;
396                 }
397                 device->devid = devid;
398                 device->work.func = pending_bios_fn;
399                 memcpy(device->uuid, disk_super->dev_item.uuid,
400                        BTRFS_UUID_SIZE);
401                 device->barriers = 1;
402                 spin_lock_init(&device->io_lock);
403                 device->name = kstrdup(path, GFP_NOFS);
404                 if (!device->name) {
405                         kfree(device);
406                         return -ENOMEM;
407                 }
408                 INIT_LIST_HEAD(&device->dev_alloc_list);
409
410                 mutex_lock(&fs_devices->device_list_mutex);
411                 list_add(&device->dev_list, &fs_devices->devices);
412                 mutex_unlock(&fs_devices->device_list_mutex);
413
414                 device->fs_devices = fs_devices;
415                 fs_devices->num_devices++;
416         } else if (!device->name || strcmp(device->name, path)) {
417                 name = kstrdup(path, GFP_NOFS);
418                 if (!name)
419                         return -ENOMEM;
420                 kfree(device->name);
421                 device->name = name;
422                 if (device->missing) {
423                         fs_devices->missing_devices--;
424                         device->missing = 0;
425                 }
426         }
427
428         if (found_transid > fs_devices->latest_trans) {
429                 fs_devices->latest_devid = devid;
430                 fs_devices->latest_trans = found_transid;
431         }
432         *fs_devices_ret = fs_devices;
433         return 0;
434 }
435
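/*
 * build a private copy of an fs_devices list; used when sprouting a
 * writeable filesystem on top of a seed filesystem
 */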
436 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
437 {
438         struct btrfs_fs_devices *fs_devices;
439         struct btrfs_device *device;
440         struct btrfs_device *orig_dev;
441
442         fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
443         if (!fs_devices)
444                 return ERR_PTR(-ENOMEM);
445
446         INIT_LIST_HEAD(&fs_devices->devices);
447         INIT_LIST_HEAD(&fs_devices->alloc_list);
448         INIT_LIST_HEAD(&fs_devices->list);
449         mutex_init(&fs_devices->device_list_mutex);
450         fs_devices->latest_devid = orig->latest_devid;
451         fs_devices->latest_trans = orig->latest_trans;
452         memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
453
454         mutex_lock(&orig->device_list_mutex);
455         list_for_each_entry(orig_dev, &orig->devices, dev_list) {
456                 device = kzalloc(sizeof(*device), GFP_NOFS);
457                 if (!device)
458                         goto error;
459
460                 device->name = kstrdup(orig_dev->name, GFP_NOFS);
461                 if (!device->name) {
462                         kfree(device);
463                         goto error;
464                 }
465
466                 device->devid = orig_dev->devid;
467                 device->work.func = pending_bios_fn;
468                 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
469                 device->barriers = 1;
470                 spin_lock_init(&device->io_lock);
471                 INIT_LIST_HEAD(&device->dev_list);
472                 INIT_LIST_HEAD(&device->dev_alloc_list);
473
474                 list_add(&device->dev_list, &fs_devices->devices);
475                 device->fs_devices = fs_devices;
476                 fs_devices->num_devices++;
477         }
478         mutex_unlock(&orig->device_list_mutex);
479         return fs_devices;
480 error:
481         mutex_unlock(&orig->device_list_mutex);
482         free_fs_devices(fs_devices);
483         return ERR_PTR(-ENOMEM);
484 }
485
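/*
 * close and free any scanned device that was not found in the filesystem
 * metadata; this prunes stale entries left over from earlier scans,
 * walking down the chain of seed devices as well
 */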
486 int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
487 {
488         struct btrfs_device *device, *next;
489
490         mutex_lock(&uuid_mutex);
491 again:
492         mutex_lock(&fs_devices->device_list_mutex);
493         list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
494                 if (device->in_fs_metadata)
495                         continue;
496
497                 if (device->bdev) {
498                         close_bdev_exclusive(device->bdev, device->mode);
499                         device->bdev = NULL;
500                         fs_devices->open_devices--;
501                 }
502                 if (device->writeable) {
503                         list_del_init(&device->dev_alloc_list);
504                         device->writeable = 0;
505                         fs_devices->rw_devices--;
506                 }
507                 list_del_init(&device->dev_list);
508                 fs_devices->num_devices--;
509                 kfree(device->name);
510                 kfree(device);
511         }
512         mutex_unlock(&fs_devices->device_list_mutex);
513
514         if (fs_devices->seed) {
515                 fs_devices = fs_devices->seed;
516                 goto again;
517         }
518
519         mutex_unlock(&uuid_mutex);
520         return 0;
521 }
522
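/*
 * drop one open reference on fs_devices; when the last reference goes
 * away, close every block device and clear the writeable and
 * in_fs_metadata state.  Caller must hold uuid_mutex.
 */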
523 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
524 {
525         struct btrfs_device *device;
526
527         if (--fs_devices->opened > 0)
528                 return 0;
529
530         list_for_each_entry(device, &fs_devices->devices, dev_list) {
531                 if (device->bdev) {
532                         close_bdev_exclusive(device->bdev, device->mode);
533                         fs_devices->open_devices--;
534                 }
535                 if (device->writeable) {
536                         list_del_init(&device->dev_alloc_list);
537                         fs_devices->rw_devices--;
538                 }
539
540                 device->bdev = NULL;
541                 device->writeable = 0;
542                 device->in_fs_metadata = 0;
543         }
544         WARN_ON(fs_devices->open_devices);
545         WARN_ON(fs_devices->rw_devices);
546         fs_devices->opened = 0;
547         fs_devices->seeding = 0;
548
549         return 0;
550 }
551
552 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
553 {
554         struct btrfs_fs_devices *seed_devices = NULL;
555         int ret;
556
557         mutex_lock(&uuid_mutex);
558         ret = __btrfs_close_devices(fs_devices);
559         if (!fs_devices->opened) {
560                 seed_devices = fs_devices->seed;
561                 fs_devices->seed = NULL;
562         }
563         mutex_unlock(&uuid_mutex);
564
565         while (seed_devices) {
566                 fs_devices = seed_devices;
567                 seed_devices = fs_devices->seed;
568                 __btrfs_close_devices(fs_devices);
569                 free_fs_devices(fs_devices);
570         }
571         return ret;
572 }
573
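/*
 * open every device on the list, validate the devid and uuid against the
 * on-disk super block, and remember the device with the highest generation
 * as the latest copy of the super
 */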
574 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
575                                 fmode_t flags, void *holder)
576 {
577         struct block_device *bdev;
578         struct list_head *head = &fs_devices->devices;
579         struct btrfs_device *device;
580         struct block_device *latest_bdev = NULL;
581         struct buffer_head *bh;
582         struct btrfs_super_block *disk_super;
583         u64 latest_devid = 0;
584         u64 latest_transid = 0;
585         u64 devid;
586         int seeding = 1;
587         int ret = 0;
588
589         list_for_each_entry(device, head, dev_list) {
590                 if (device->bdev)
591                         continue;
592                 if (!device->name)
593                         continue;
594
595                 bdev = open_bdev_exclusive(device->name, flags, holder);
596                 if (IS_ERR(bdev)) {
597                         printk(KERN_INFO "open %s failed\n", device->name);
598                         goto error;
599                 }
600                 set_blocksize(bdev, 4096);
601
602                 bh = btrfs_read_dev_super(bdev);
603                 if (!bh)
604                         goto error_close;
605
606                 disk_super = (struct btrfs_super_block *)bh->b_data;
607                 devid = btrfs_stack_device_id(&disk_super->dev_item);
608                 if (devid != device->devid)
609                         goto error_brelse;
610
611                 if (memcmp(device->uuid, disk_super->dev_item.uuid,
612                            BTRFS_UUID_SIZE))
613                         goto error_brelse;
614
615                 device->generation = btrfs_super_generation(disk_super);
616                 if (!latest_transid || device->generation > latest_transid) {
617                         latest_devid = devid;
618                         latest_transid = device->generation;
619                         latest_bdev = bdev;
620                 }
621
622                 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
623                         device->writeable = 0;
624                 } else {
625                         device->writeable = !bdev_read_only(bdev);
626                         seeding = 0;
627                 }
628
629                 device->bdev = bdev;
630                 device->in_fs_metadata = 0;
631                 device->mode = flags;
632
633                 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
634                         fs_devices->rotating = 1;
635
636                 fs_devices->open_devices++;
637                 if (device->writeable) {
638                         fs_devices->rw_devices++;
639                         list_add(&device->dev_alloc_list,
640                                  &fs_devices->alloc_list);
641                 }
642                 continue;
643
644 error_brelse:
645                 brelse(bh);
646 error_close:
647                 close_bdev_exclusive(bdev, flags);
648 error:
649                 continue;
650         }
651         if (fs_devices->open_devices == 0) {
652                 ret = -EIO;
653                 goto out;
654         }
655         fs_devices->seeding = seeding;
656         fs_devices->opened = 1;
657         fs_devices->latest_bdev = latest_bdev;
658         fs_devices->latest_devid = latest_devid;
659         fs_devices->latest_trans = latest_transid;
660         fs_devices->total_rw_bytes = 0;
661 out:
662         return ret;
663 }
664
665 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
666                        fmode_t flags, void *holder)
667 {
668         int ret;
669
670         mutex_lock(&uuid_mutex);
671         if (fs_devices->opened) {
672                 fs_devices->opened++;
673                 ret = 0;
674         } else {
675                 ret = __btrfs_open_devices(fs_devices, flags, holder);
676         }
677         mutex_unlock(&uuid_mutex);
678         return ret;
679 }
680
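/*
 * read the super block off a candidate device and, if it looks like a
 * btrfs device, register it in the global list of scanned devices
 */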
681 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
682                           struct btrfs_fs_devices **fs_devices_ret)
683 {
684         struct btrfs_super_block *disk_super;
685         struct block_device *bdev;
686         struct buffer_head *bh;
687         int ret;
688         u64 devid;
689         u64 transid;
690
691         mutex_lock(&uuid_mutex);
692
693         bdev = open_bdev_exclusive(path, flags, holder);
694
695         if (IS_ERR(bdev)) {
696                 ret = PTR_ERR(bdev);
697                 goto error;
698         }
699
700         ret = set_blocksize(bdev, 4096);
701         if (ret)
702                 goto error_close;
703         bh = btrfs_read_dev_super(bdev);
704         if (!bh) {
705                 ret = -EIO;
706                 goto error_close;
707         }
708         disk_super = (struct btrfs_super_block *)bh->b_data;
709         devid = btrfs_stack_device_id(&disk_super->dev_item);
710         transid = btrfs_super_generation(disk_super);
711         if (disk_super->label[0])
712                 printk(KERN_INFO "device label %s ", disk_super->label);
713         else {
714                 /* FIXME, make a real uuid parser */
715                 printk(KERN_INFO "device fsid %llx-%llx ",
716                        *(unsigned long long *)disk_super->fsid,
717                        *(unsigned long long *)(disk_super->fsid + 8));
718         }
719         printk(KERN_CONT "devid %llu transid %llu %s\n",
720                (unsigned long long)devid, (unsigned long long)transid, path);
721         ret = device_list_add(path, disk_super, devid, fs_devices_ret);
722
723         brelse(bh);
724 error_close:
725         close_bdev_exclusive(bdev, flags);
726 error:
727         mutex_unlock(&uuid_mutex);
728         return ret;
729 }
730
731 /* helper to account the used device extent space in the range [start, end] */
732 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
733                                    u64 end, u64 *length)
734 {
735         struct btrfs_key key;
736         struct btrfs_root *root = device->dev_root;
737         struct btrfs_dev_extent *dev_extent;
738         struct btrfs_path *path;
739         u64 extent_end;
740         int ret;
741         int slot;
742         struct extent_buffer *l;
743
744         *length = 0;
745
746         if (start >= device->total_bytes)
747                 return 0;
748
749         path = btrfs_alloc_path();
750         if (!path)
751                 return -ENOMEM;
752         path->reada = 2;
753
754         key.objectid = device->devid;
755         key.offset = start;
756         key.type = BTRFS_DEV_EXTENT_KEY;
757
758         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
759         if (ret < 0)
760                 goto out;
761         if (ret > 0) {
762                 ret = btrfs_previous_item(root, path, key.objectid, key.type);
763                 if (ret < 0)
764                         goto out;
765         }
766
767         while (1) {
768                 l = path->nodes[0];
769                 slot = path->slots[0];
770                 if (slot >= btrfs_header_nritems(l)) {
771                         ret = btrfs_next_leaf(root, path);
772                         if (ret == 0)
773                                 continue;
774                         if (ret < 0)
775                                 goto out;
776
777                         break;
778                 }
779                 btrfs_item_key_to_cpu(l, &key, slot);
780
781                 if (key.objectid < device->devid)
782                         goto next;
783
784                 if (key.objectid > device->devid)
785                         break;
786
787                 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
788                         goto next;
789
790                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
791                 extent_end = key.offset + btrfs_dev_extent_length(l,
792                                                                   dev_extent);
793                 if (key.offset <= start && extent_end > end) {
794                         *length = end - start + 1;
795                         break;
796                 } else if (key.offset <= start && extent_end > start)
797                         *length += extent_end - start;
798                 else if (key.offset > start && extent_end <= end)
799                         *length += extent_end - key.offset;
800                 else if (key.offset > start && key.offset <= end) {
801                         *length += end - key.offset + 1;
802                         break;
803                 } else if (key.offset > end)
804                         break;
805
806 next:
807                 path->slots[0]++;
808         }
809         ret = 0;
810 out:
811         btrfs_free_path(path);
812         return ret;
813 }
814
815 /*
816  * find_free_dev_extent - find free space in the specified device
817  * @trans:      transaction handle
818  * @device:     the device which we search the free space in
819  * @num_bytes:  the size of the free space that we need
820  * @start:      store the start of the free space.
821  * @len:        the size of the free space that we find, or the size of the max
822  *              free space if we don't find suitable free space
823  *
824  * this uses a pretty simple search; the expectation is that it is
825  * called very infrequently and that a given device has a small number
826  * of extents
827  *
828  * @start is used to store the start of the free space if we find it. But if we
829  * don't find suitable free space, it will be used to store the start position
830  * of the max free space.
831  *
832  * @len is used to store the size of the free space that we find.
833  * But if we don't find suitable free space, it is used to store the size of
834  * the max free space.
835  */
836 int find_free_dev_extent(struct btrfs_trans_handle *trans,
837                          struct btrfs_device *device, u64 num_bytes,
838                          u64 *start, u64 *len)
839 {
840         struct btrfs_key key;
841         struct btrfs_root *root = device->dev_root;
842         struct btrfs_dev_extent *dev_extent;
843         struct btrfs_path *path;
844         u64 hole_size;
845         u64 max_hole_start;
846         u64 max_hole_size;
847         u64 extent_end;
848         u64 search_start;
849         u64 search_end = device->total_bytes;
850         int ret;
851         int slot;
852         struct extent_buffer *l;
853
854         /* FIXME use last free of some kind */
855
856         /* we don't want to overwrite the superblock on the drive,
857          * so we make sure to start at an offset of at least 1MB
858          */
859         search_start = 1024 * 1024;
860
861         if (root->fs_info->alloc_start + num_bytes <= search_end)
862                 search_start = max(root->fs_info->alloc_start, search_start);
863
864         max_hole_start = search_start;
865         max_hole_size = 0;
866
867         if (search_start >= search_end) {
868                 ret = -ENOSPC;
869                 goto error;
870         }
871
872         path = btrfs_alloc_path();
873         if (!path) {
874                 ret = -ENOMEM;
875                 goto error;
876         }
877         path->reada = 2;
878
879         key.objectid = device->devid;
880         key.offset = search_start;
881         key.type = BTRFS_DEV_EXTENT_KEY;
882
883         ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
884         if (ret < 0)
885                 goto out;
886         if (ret > 0) {
887                 ret = btrfs_previous_item(root, path, key.objectid, key.type);
888                 if (ret < 0)
889                         goto out;
890         }
891
892         while (1) {
893                 l = path->nodes[0];
894                 slot = path->slots[0];
895                 if (slot >= btrfs_header_nritems(l)) {
896                         ret = btrfs_next_leaf(root, path);
897                         if (ret == 0)
898                                 continue;
899                         if (ret < 0)
900                                 goto out;
901
902                         break;
903                 }
904                 btrfs_item_key_to_cpu(l, &key, slot);
905
906                 if (key.objectid < device->devid)
907                         goto next;
908
909                 if (key.objectid > device->devid)
910                         break;
911
912                 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
913                         goto next;
914
915                 if (key.offset > search_start) {
916                         hole_size = key.offset - search_start;
917
918                         if (hole_size > max_hole_size) {
919                                 max_hole_start = search_start;
920                                 max_hole_size = hole_size;
921                         }
922
923                         /*
924                          * If this free space is greater than what we need,
925                          * it must be the max free space that we have found
926                          * until now, so max_hole_start must point to the start
927                          * of this free space and the length of this free space
928                          * is stored in max_hole_size. Thus, we return
929                          * max_hole_start and max_hole_size and go back to the
930                          * caller.
931                          */
932                         if (hole_size >= num_bytes) {
933                                 ret = 0;
934                                 goto out;
935                         }
936                 }
937
938                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
939                 extent_end = key.offset + btrfs_dev_extent_length(l,
940                                                                   dev_extent);
941                 if (extent_end > search_start)
942                         search_start = extent_end;
943 next:
944                 path->slots[0]++;
945                 cond_resched();
946         }
947
948         hole_size = search_end - search_start;
949         if (hole_size > max_hole_size) {
950                 max_hole_start = search_start;
951                 max_hole_size = hole_size;
952         }
953
954         /* See above. */
955         if (hole_size < num_bytes)
956                 ret = -ENOSPC;
957         else
958                 ret = 0;
959
960 out:
961         btrfs_free_path(path);
962 error:
963         *start = max_hole_start;
964         if (len)
965                 *len = max_hole_size;
966         return ret;
967 }
968
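/*
 * remove the dev extent item that covers @start on @device and give the
 * space back by reducing device->bytes_used
 */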
969 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
970                           struct btrfs_device *device,
971                           u64 start)
972 {
973         int ret;
974         struct btrfs_path *path;
975         struct btrfs_root *root = device->dev_root;
976         struct btrfs_key key;
977         struct btrfs_key found_key;
978         struct extent_buffer *leaf = NULL;
979         struct btrfs_dev_extent *extent = NULL;
980
981         path = btrfs_alloc_path();
982         if (!path)
983                 return -ENOMEM;
984
985         key.objectid = device->devid;
986         key.offset = start;
987         key.type = BTRFS_DEV_EXTENT_KEY;
988
989         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
990         if (ret > 0) {
991                 ret = btrfs_previous_item(root, path, key.objectid,
992                                           BTRFS_DEV_EXTENT_KEY);
993                 BUG_ON(ret);
994                 leaf = path->nodes[0];
995                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
996                 extent = btrfs_item_ptr(leaf, path->slots[0],
997                                         struct btrfs_dev_extent);
998                 BUG_ON(found_key.offset > start || found_key.offset +
999                        btrfs_dev_extent_length(leaf, extent) < start);
1000                 ret = 0;
1001         } else if (ret == 0) {
1002                 leaf = path->nodes[0];
1003                 extent = btrfs_item_ptr(leaf, path->slots[0],
1004                                         struct btrfs_dev_extent);
1005         }
1006         BUG_ON(ret);
1007
1008         if (device->bytes_used > 0)
1009                 device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
1010         ret = btrfs_del_item(trans, root, path);
1011         BUG_ON(ret);
1012
1013         btrfs_free_path(path);
1014         return ret;
1015 }
1016
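/*
 * record a new dev extent in the device tree, tying the range
 * [start, start + num_bytes) on this device to the given chunk
 */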
1017 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1018                            struct btrfs_device *device,
1019                            u64 chunk_tree, u64 chunk_objectid,
1020                            u64 chunk_offset, u64 start, u64 num_bytes)
1021 {
1022         int ret;
1023         struct btrfs_path *path;
1024         struct btrfs_root *root = device->dev_root;
1025         struct btrfs_dev_extent *extent;
1026         struct extent_buffer *leaf;
1027         struct btrfs_key key;
1028
1029         WARN_ON(!device->in_fs_metadata);
1030         path = btrfs_alloc_path();
1031         if (!path)
1032                 return -ENOMEM;
1033
1034         key.objectid = device->devid;
1035         key.offset = start;
1036         key.type = BTRFS_DEV_EXTENT_KEY;
1037         ret = btrfs_insert_empty_item(trans, root, path, &key,
1038                                       sizeof(*extent));
1039         BUG_ON(ret);
1040
1041         leaf = path->nodes[0];
1042         extent = btrfs_item_ptr(leaf, path->slots[0],
1043                                 struct btrfs_dev_extent);
1044         btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
1045         btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
1046         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1047
1048         write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
1049                     (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
1050                     BTRFS_UUID_SIZE);
1051
1052         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1053         btrfs_mark_buffer_dirty(leaf);
1054         btrfs_free_path(path);
1055         return ret;
1056 }
1057
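/*
 * find the logical offset just past the last chunk allocated for
 * @objectid, so the next chunk can be placed after it
 */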
1058 static noinline int find_next_chunk(struct btrfs_root *root,
1059                                     u64 objectid, u64 *offset)
1060 {
1061         struct btrfs_path *path;
1062         int ret;
1063         struct btrfs_key key;
1064         struct btrfs_chunk *chunk;
1065         struct btrfs_key found_key;
1066
1067         path = btrfs_alloc_path();
1068         BUG_ON(!path);
1069
1070         key.objectid = objectid;
1071         key.offset = (u64)-1;
1072         key.type = BTRFS_CHUNK_ITEM_KEY;
1073
1074         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1075         if (ret < 0)
1076                 goto error;
1077
1078         BUG_ON(ret == 0);
1079
1080         ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
1081         if (ret) {
1082                 *offset = 0;
1083         } else {
1084                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1085                                       path->slots[0]);
1086                 if (found_key.objectid != objectid)
1087                         *offset = 0;
1088                 else {
1089                         chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
1090                                                struct btrfs_chunk);
1091                         *offset = found_key.offset +
1092                                 btrfs_chunk_length(path->nodes[0], chunk);
1093                 }
1094         }
1095         ret = 0;
1096 error:
1097         btrfs_free_path(path);
1098         return ret;
1099 }
1100
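/* find the next unused device id by looking at the last dev item */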
1101 static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
1102 {
1103         int ret;
1104         struct btrfs_key key;
1105         struct btrfs_key found_key;
1106         struct btrfs_path *path;
1107
1108         root = root->fs_info->chunk_root;
1109
1110         path = btrfs_alloc_path();
1111         if (!path)
1112                 return -ENOMEM;
1113
1114         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1115         key.type = BTRFS_DEV_ITEM_KEY;
1116         key.offset = (u64)-1;
1117
1118         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1119         if (ret < 0)
1120                 goto error;
1121
1122         BUG_ON(ret == 0);
1123
1124         ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
1125                                   BTRFS_DEV_ITEM_KEY);
1126         if (ret) {
1127                 *objectid = 1;
1128         } else {
1129                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1130                                       path->slots[0]);
1131                 *objectid = found_key.offset + 1;
1132         }
1133         ret = 0;
1134 error:
1135         btrfs_free_path(path);
1136         return ret;
1137 }
1138
1139 /*
1140  * the device information is stored in the chunk root
1141  * the btrfs_device struct should be fully filled in
1142  */
1143 int btrfs_add_device(struct btrfs_trans_handle *trans,
1144                      struct btrfs_root *root,
1145                      struct btrfs_device *device)
1146 {
1147         int ret;
1148         struct btrfs_path *path;
1149         struct btrfs_dev_item *dev_item;
1150         struct extent_buffer *leaf;
1151         struct btrfs_key key;
1152         unsigned long ptr;
1153
1154         root = root->fs_info->chunk_root;
1155
1156         path = btrfs_alloc_path();
1157         if (!path)
1158                 return -ENOMEM;
1159
1160         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1161         key.type = BTRFS_DEV_ITEM_KEY;
1162         key.offset = device->devid;
1163
1164         ret = btrfs_insert_empty_item(trans, root, path, &key,
1165                                       sizeof(*dev_item));
1166         if (ret)
1167                 goto out;
1168
1169         leaf = path->nodes[0];
1170         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1171
1172         btrfs_set_device_id(leaf, dev_item, device->devid);
1173         btrfs_set_device_generation(leaf, dev_item, 0);
1174         btrfs_set_device_type(leaf, dev_item, device->type);
1175         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1176         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1177         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1178         btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
1179         btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1180         btrfs_set_device_group(leaf, dev_item, 0);
1181         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1182         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1183         btrfs_set_device_start_offset(leaf, dev_item, 0);
1184
1185         ptr = (unsigned long)btrfs_device_uuid(dev_item);
1186         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1187         ptr = (unsigned long)btrfs_device_fsid(dev_item);
1188         write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
1189         btrfs_mark_buffer_dirty(leaf);
1190
1191         ret = 0;
1192 out:
1193         btrfs_free_path(path);
1194         return ret;
1195 }
1196
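/*
 * delete the dev item for @device from the chunk tree; runs in its own
 * transaction with the chunk mutex held
 */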
1197 static int btrfs_rm_dev_item(struct btrfs_root *root,
1198                              struct btrfs_device *device)
1199 {
1200         int ret;
1201         struct btrfs_path *path;
1202         struct btrfs_key key;
1203         struct btrfs_trans_handle *trans;
1204
1205         root = root->fs_info->chunk_root;
1206
1207         path = btrfs_alloc_path();
1208         if (!path)
1209                 return -ENOMEM;
1210
1211         trans = btrfs_start_transaction(root, 0);
             if (IS_ERR(trans)) {
                     btrfs_free_path(path);
                     return PTR_ERR(trans);
             }
1212         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1213         key.type = BTRFS_DEV_ITEM_KEY;
1214         key.offset = device->devid;
1215         lock_chunks(root);
1216
1217         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1218         if (ret < 0)
1219                 goto out;
1220
1221         if (ret > 0) {
1222                 ret = -ENOENT;
1223                 goto out;
1224         }
1225
1226         ret = btrfs_del_item(trans, root, path);
1227         if (ret)
1228                 goto out;
1229 out:
1230         btrfs_free_path(path);
1231         unlock_chunks(root);
1232         btrfs_commit_transaction(trans, root);
1233         return ret;
1234 }
1235
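/*
 * remove a device from the filesystem: shrink it to zero, delete its dev
 * item and dev extents, and drop it from the device list.  The special
 * path name "missing" removes the first device that has no bdev.
 */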
1236 int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1237 {
1238         struct btrfs_device *device;
1239         struct btrfs_device *next_device;
1240         struct block_device *bdev;
1241         struct buffer_head *bh = NULL;
1242         struct btrfs_super_block *disk_super;
1243         u64 all_avail;
1244         u64 devid;
1245         u64 num_devices;
1246         u8 *dev_uuid;
1247         int ret = 0;
1248
1249         mutex_lock(&uuid_mutex);
1250         mutex_lock(&root->fs_info->volume_mutex);
1251
1252         all_avail = root->fs_info->avail_data_alloc_bits |
1253                 root->fs_info->avail_system_alloc_bits |
1254                 root->fs_info->avail_metadata_alloc_bits;
1255
1256         if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
1257             root->fs_info->fs_devices->num_devices <= 4) {
1258                 printk(KERN_ERR "btrfs: unable to go below four devices "
1259                        "on raid10\n");
1260                 ret = -EINVAL;
1261                 goto out;
1262         }
1263
1264         if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
1265             root->fs_info->fs_devices->num_devices <= 2) {
1266                 printk(KERN_ERR "btrfs: unable to go below two "
1267                        "devices on raid1\n");
1268                 ret = -EINVAL;
1269                 goto out;
1270         }
1271
1272         if (strcmp(device_path, "missing") == 0) {
1273                 struct list_head *devices;
1274                 struct btrfs_device *tmp;
1275
1276                 device = NULL;
1277                 devices = &root->fs_info->fs_devices->devices;
1278                 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1279                 list_for_each_entry(tmp, devices, dev_list) {
1280                         if (tmp->in_fs_metadata && !tmp->bdev) {
1281                                 device = tmp;
1282                                 break;
1283                         }
1284                 }
1285                 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1286                 bdev = NULL;
1287                 bh = NULL;
1288                 disk_super = NULL;
1289                 if (!device) {
1290                         printk(KERN_ERR "btrfs: no missing devices found to "
1291                                "remove\n");
                             ret = -ENOENT;
1292                         goto out;
1293                 }
1294         } else {
1295                 bdev = open_bdev_exclusive(device_path, FMODE_READ,
1296                                       root->fs_info->bdev_holder);
1297                 if (IS_ERR(bdev)) {
1298                         ret = PTR_ERR(bdev);
1299                         goto out;
1300                 }
1301
1302                 set_blocksize(bdev, 4096);
1303                 bh = btrfs_read_dev_super(bdev);
1304                 if (!bh) {
1305                         ret = -EIO;
1306                         goto error_close;
1307                 }
1308                 disk_super = (struct btrfs_super_block *)bh->b_data;
1309                 devid = btrfs_stack_device_id(&disk_super->dev_item);
1310                 dev_uuid = disk_super->dev_item.uuid;
1311                 device = btrfs_find_device(root, devid, dev_uuid,
1312                                            disk_super->fsid);
1313                 if (!device) {
1314                         ret = -ENOENT;
1315                         goto error_brelse;
1316                 }
1317         }
1318
1319         if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1320                 printk(KERN_ERR "btrfs: unable to remove the only writeable "
1321                        "device\n");
1322                 ret = -EINVAL;
1323                 goto error_brelse;
1324         }
1325
1326         if (device->writeable) {
1327                 list_del_init(&device->dev_alloc_list);
1328                 root->fs_info->fs_devices->rw_devices--;
1329         }
1330
1331         ret = btrfs_shrink_device(device, 0);
1332         if (ret)
1333                 goto error_brelse;
1334
1335         ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1336         if (ret)
1337                 goto error_brelse;
1338
1339         device->in_fs_metadata = 0;
1340
1341         /*
1342          * the device list mutex makes sure that we don't change
1343          * the device list while someone else is writing out all
1344          * the device supers.
1345          */
1346         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1347         list_del_init(&device->dev_list);
1348         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1349
1350         device->fs_devices->num_devices--;
1351
1352         if (device->missing)
1353                 root->fs_info->fs_devices->missing_devices--;
1354
1355         next_device = list_entry(root->fs_info->fs_devices->devices.next,
1356                                  struct btrfs_device, dev_list);
1357         if (device->bdev == root->fs_info->sb->s_bdev)
1358                 root->fs_info->sb->s_bdev = next_device->bdev;
1359         if (device->bdev == root->fs_info->fs_devices->latest_bdev)
1360                 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1361
1362         if (device->bdev) {
1363                 close_bdev_exclusive(device->bdev, device->mode);
1364                 device->bdev = NULL;
1365                 device->fs_devices->open_devices--;
1366         }
1367
1368         num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
1369         btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
1370
1371         if (device->fs_devices->open_devices == 0) {
1372                 struct btrfs_fs_devices *fs_devices;
1373                 fs_devices = root->fs_info->fs_devices;
1374                 while (fs_devices) {
1375                         if (fs_devices->seed == device->fs_devices)
1376                                 break;
1377                         fs_devices = fs_devices->seed;
1378                 }
1379                 fs_devices->seed = device->fs_devices->seed;
1380                 device->fs_devices->seed = NULL;
1381                 __btrfs_close_devices(device->fs_devices);
1382                 free_fs_devices(device->fs_devices);
1383         }
1384
1385         /*
1386          * at this point, the device is zero sized.  We want to
1387          * remove it from the devices list and zero out the old super
1388          */
1389         if (device->writeable) {
1390                 /* make sure this device isn't detected as part of
1391                  * the FS anymore
1392                  */
1393                 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
1394                 set_buffer_dirty(bh);
1395                 sync_dirty_buffer(bh);
1396         }
1397
1398         kfree(device->name);
1399         kfree(device);
1400         ret = 0;
1401
1402 error_brelse:
1403         brelse(bh);
1404 error_close:
1405         if (bdev)
1406                 close_bdev_exclusive(bdev, FMODE_READ);
1407 out:
1408         mutex_unlock(&root->fs_info->volume_mutex);
1409         mutex_unlock(&uuid_mutex);
1410         return ret;
1411 }
1412
1413 /*
1414  * does all the dirty work required for changing the file system's UUID.
1415  */
1416 static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
1417                                 struct btrfs_root *root)
1418 {
1419         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1420         struct btrfs_fs_devices *old_devices;
1421         struct btrfs_fs_devices *seed_devices;
1422         struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
1423         struct btrfs_device *device;
1424         u64 super_flags;
1425
1426         BUG_ON(!mutex_is_locked(&uuid_mutex));
1427         if (!fs_devices->seeding)
1428                 return -EINVAL;
1429
1430         seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
1431         if (!seed_devices)
1432                 return -ENOMEM;
1433
1434         old_devices = clone_fs_devices(fs_devices);
1435         if (IS_ERR(old_devices)) {
1436                 kfree(seed_devices);
1437                 return PTR_ERR(old_devices);
1438         }
1439
1440         list_add(&old_devices->list, &fs_uuids);
1441
1442         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
1443         seed_devices->opened = 1;
1444         INIT_LIST_HEAD(&seed_devices->devices);
1445         INIT_LIST_HEAD(&seed_devices->alloc_list);
1446         mutex_init(&seed_devices->device_list_mutex);
1447         list_splice_init(&fs_devices->devices, &seed_devices->devices);
1448         list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1449         list_for_each_entry(device, &seed_devices->devices, dev_list) {
1450                 device->fs_devices = seed_devices;
1451         }
1452
1453         fs_devices->seeding = 0;
1454         fs_devices->num_devices = 0;
1455         fs_devices->open_devices = 0;
1456         fs_devices->seed = seed_devices;
1457
1458         generate_random_uuid(fs_devices->fsid);
1459         memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1460         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1461         super_flags = btrfs_super_flags(disk_super) &
1462                       ~BTRFS_SUPER_FLAG_SEEDING;
1463         btrfs_set_super_flags(disk_super, super_flags);
1464
1465         return 0;
1466 }
1467
1468 /*
1469  * store the expected generation for seed devices in device items.
1470  */
1471 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
1472                                struct btrfs_root *root)
1473 {
1474         struct btrfs_path *path;
1475         struct extent_buffer *leaf;
1476         struct btrfs_dev_item *dev_item;
1477         struct btrfs_device *device;
1478         struct btrfs_key key;
1479         u8 fs_uuid[BTRFS_UUID_SIZE];
1480         u8 dev_uuid[BTRFS_UUID_SIZE];
1481         u64 devid;
1482         int ret;
1483
1484         path = btrfs_alloc_path();
1485         if (!path)
1486                 return -ENOMEM;
1487
1488         root = root->fs_info->chunk_root;
1489         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1490         key.offset = 0;
1491         key.type = BTRFS_DEV_ITEM_KEY;
1492
1493         while (1) {
1494                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1495                 if (ret < 0)
1496                         goto error;
1497
1498                 leaf = path->nodes[0];
1499 next_slot:
1500                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1501                         ret = btrfs_next_leaf(root, path);
1502                         if (ret > 0)
1503                                 break;
1504                         if (ret < 0)
1505                                 goto error;
1506                         leaf = path->nodes[0];
1507                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1508                         btrfs_release_path(root, path);
1509                         continue;
1510                 }
1511
1512                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1513                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
1514                     key.type != BTRFS_DEV_ITEM_KEY)
1515                         break;
1516
1517                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
1518                                           struct btrfs_dev_item);
1519                 devid = btrfs_device_id(leaf, dev_item);
1520                 read_extent_buffer(leaf, dev_uuid,
1521                                    (unsigned long)btrfs_device_uuid(dev_item),
1522                                    BTRFS_UUID_SIZE);
1523                 read_extent_buffer(leaf, fs_uuid,
1524                                    (unsigned long)btrfs_device_fsid(dev_item),
1525                                    BTRFS_UUID_SIZE);
1526                 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
1527                 BUG_ON(!device);
1528
1529                 if (device->fs_devices->seeding) {
1530                         btrfs_set_device_generation(leaf, dev_item,
1531                                                     device->generation);
1532                         btrfs_mark_buffer_dirty(leaf);
1533                 }
1534
1535                 path->slots[0]++;
1536                 goto next_slot;
1537         }
1538         ret = 0;
1539 error:
1540         btrfs_free_path(path);
1541         return ret;
1542 }
1543
1544 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1545 {
1546         struct btrfs_trans_handle *trans;
1547         struct btrfs_device *device;
1548         struct block_device *bdev;
1549         struct list_head *devices;
1550         struct super_block *sb = root->fs_info->sb;
1551         u64 total_bytes;
1552         int seeding_dev = 0;
1553         int ret = 0;
1554
1555         if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1556                 return -EINVAL;
1557
1558         bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
1559         if (IS_ERR(bdev))
1560                 return PTR_ERR(bdev);
1561
1562         if (root->fs_info->fs_devices->seeding) {
1563                 seeding_dev = 1;
1564                 down_write(&sb->s_umount);
1565                 mutex_lock(&uuid_mutex);
1566         }
1567
1568         filemap_write_and_wait(bdev->bd_inode->i_mapping);
1569         mutex_lock(&root->fs_info->volume_mutex);
1570
1571         devices = &root->fs_info->fs_devices->devices;
1572         /*
1573          * we have the volume lock, so we don't need the extra
1574          * device list mutex while reading the list here.
1575          */
1576         list_for_each_entry(device, devices, dev_list) {
1577                 if (device->bdev == bdev) {
1578                         ret = -EEXIST;
1579                         goto error;
1580                 }
1581         }
1582
1583         device = kzalloc(sizeof(*device), GFP_NOFS);
1584         if (!device) {
1585                 /* we can safely leave the fs_devices entry around */
1586                 ret = -ENOMEM;
1587                 goto error;
1588         }
1589
1590         device->name = kstrdup(device_path, GFP_NOFS);
1591         if (!device->name) {
1592                 kfree(device);
1593                 ret = -ENOMEM;
1594                 goto error;
1595         }
1596
1597         ret = find_next_devid(root, &device->devid);
1598         if (ret) {
1599                 kfree(device);
1600                 goto error;
1601         }
1602
1603         trans = btrfs_start_transaction(root, 0);
1604         lock_chunks(root);
1605
1606         device->barriers = 1;
1607         device->writeable = 1;
1608         device->work.func = pending_bios_fn;
1609         generate_random_uuid(device->uuid);
1610         spin_lock_init(&device->io_lock);
1611         device->generation = trans->transid;
1612         device->io_width = root->sectorsize;
1613         device->io_align = root->sectorsize;
1614         device->sector_size = root->sectorsize;
1615         device->total_bytes = i_size_read(bdev->bd_inode);
1616         device->disk_total_bytes = device->total_bytes;
1617         device->dev_root = root->fs_info->dev_root;
1618         device->bdev = bdev;
1619         device->in_fs_metadata = 1;
1620         device->mode = 0;
1621         set_blocksize(device->bdev, 4096);
1622
1623         if (seeding_dev) {
1624                 sb->s_flags &= ~MS_RDONLY;
1625                 ret = btrfs_prepare_sprout(trans, root);
1626                 BUG_ON(ret);
1627         }
1628
1629         device->fs_devices = root->fs_info->fs_devices;
1630
1631         /*
1632          * we don't want write_supers to jump in here with our device
1633          * half set up
1634          */
1635         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1636         list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
1637         list_add(&device->dev_alloc_list,
1638                  &root->fs_info->fs_devices->alloc_list);
1639         root->fs_info->fs_devices->num_devices++;
1640         root->fs_info->fs_devices->open_devices++;
1641         root->fs_info->fs_devices->rw_devices++;
1642         root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1643
1644         if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1645                 root->fs_info->fs_devices->rotating = 1;
1646
1647         total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
1648         btrfs_set_super_total_bytes(&root->fs_info->super_copy,
1649                                     total_bytes + device->total_bytes);
1650
1651         total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
1652         btrfs_set_super_num_devices(&root->fs_info->super_copy,
1653                                     total_bytes + 1);
1654         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1655
1656         if (seeding_dev) {
1657                 ret = init_first_rw_device(trans, root, device);
1658                 BUG_ON(ret);
1659                 ret = btrfs_finish_sprout(trans, root);
1660                 BUG_ON(ret);
1661         } else {
1662                 ret = btrfs_add_device(trans, root, device);
1663         }
1664
1665         /*
1666          * we've got more storage, clear any full flags on the space
1667          * infos
1668          */
1669         btrfs_clear_space_info_full(root->fs_info);
1670
1671         unlock_chunks(root);
1672         btrfs_commit_transaction(trans, root);
1673
1674         if (seeding_dev) {
1675                 mutex_unlock(&uuid_mutex);
1676                 up_write(&sb->s_umount);
1677
1678                 ret = btrfs_relocate_sys_chunks(root);
1679                 BUG_ON(ret);
1680         }
1681 out:
1682         mutex_unlock(&root->fs_info->volume_mutex);
1683         return ret;
1684 error:
1685         close_bdev_exclusive(bdev, 0);
1686         if (seeding_dev) {
1687                 mutex_unlock(&uuid_mutex);
1688                 up_write(&sb->s_umount);
1689         }
1690         goto out;
1691 }
1692
1693 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1694                                         struct btrfs_device *device)
1695 {
1696         int ret;
1697         struct btrfs_path *path;
1698         struct btrfs_root *root;
1699         struct btrfs_dev_item *dev_item;
1700         struct extent_buffer *leaf;
1701         struct btrfs_key key;
1702
1703         root = device->dev_root->fs_info->chunk_root;
1704
1705         path = btrfs_alloc_path();
1706         if (!path)
1707                 return -ENOMEM;
1708
1709         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1710         key.type = BTRFS_DEV_ITEM_KEY;
1711         key.offset = device->devid;
1712
1713         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1714         if (ret < 0)
1715                 goto out;
1716
1717         if (ret > 0) {
1718                 ret = -ENOENT;
1719                 goto out;
1720         }
1721
1722         leaf = path->nodes[0];
1723         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1724
1725         btrfs_set_device_id(leaf, dev_item, device->devid);
1726         btrfs_set_device_type(leaf, dev_item, device->type);
1727         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1728         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1729         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1730         btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
1731         btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1732         btrfs_mark_buffer_dirty(leaf);
1733
1734 out:
1735         btrfs_free_path(path);
1736         return ret;
1737 }
1738
1739 static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1740                       struct btrfs_device *device, u64 new_size)
1741 {
1742         struct btrfs_super_block *super_copy =
1743                 &device->dev_root->fs_info->super_copy;
1744         u64 old_total = btrfs_super_total_bytes(super_copy);
1745         u64 diff = new_size - device->total_bytes;
1746
1747         if (!device->writeable)
1748                 return -EACCES;
1749         if (new_size <= device->total_bytes)
1750                 return -EINVAL;
1751
1752         btrfs_set_super_total_bytes(super_copy, old_total + diff);
1753         device->fs_devices->total_rw_bytes += diff;
1754
1755         device->total_bytes = new_size;
1756         device->disk_total_bytes = new_size;
1757         btrfs_clear_space_info_full(device->dev_root->fs_info);
1758
1759         return btrfs_update_device(trans, device);
1760 }
1761
1762 int btrfs_grow_device(struct btrfs_trans_handle *trans,
1763                       struct btrfs_device *device, u64 new_size)
1764 {
1765         int ret;
1766         lock_chunks(device->dev_root);
1767         ret = __btrfs_grow_device(trans, device, new_size);
1768         unlock_chunks(device->dev_root);
1769         return ret;
1770 }
1771
1772 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1773                             struct btrfs_root *root,
1774                             u64 chunk_tree, u64 chunk_objectid,
1775                             u64 chunk_offset)
1776 {
1777         int ret;
1778         struct btrfs_path *path;
1779         struct btrfs_key key;
1780
1781         root = root->fs_info->chunk_root;
1782         path = btrfs_alloc_path();
1783         if (!path)
1784                 return -ENOMEM;
1785
1786         key.objectid = chunk_objectid;
1787         key.offset = chunk_offset;
1788         key.type = BTRFS_CHUNK_ITEM_KEY;
1789
1790         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1791         BUG_ON(ret);
1792
1793         ret = btrfs_del_item(trans, root, path);
1794         BUG_ON(ret);
1795
1796         btrfs_free_path(path);
1797         return 0;
1798 }
1799
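/*
 * The super block's sys_chunk_array is a packed run of
 * (struct btrfs_disk_key, struct btrfs_chunk + its stripes) pairs:
 *
 *   [key0][chunk0 stripes...][key1][chunk1 stripes...]
 *
 * btrfs_del_sys_chunk() below memmove()s the tail of the array over the
 * matching entry; btrfs_add_system_chunk() appends a new pair at the end.
 */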
1800 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid,
1801                                u64 chunk_offset)
1802 {
1803         struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1804         struct btrfs_disk_key *disk_key;
1805         struct btrfs_chunk *chunk;
1806         u8 *ptr;
1807         int ret = 0;
1808         u32 num_stripes;
1809         u32 array_size;
1810         u32 len = 0;
1811         u32 cur;
1812         struct btrfs_key key;
1813
1814         array_size = btrfs_super_sys_array_size(super_copy);
1815
1816         ptr = super_copy->sys_chunk_array;
1817         cur = 0;
1818
1819         while (cur < array_size) {
1820                 disk_key = (struct btrfs_disk_key *)ptr;
1821                 btrfs_disk_key_to_cpu(&key, disk_key);
1822
1823                 len = sizeof(*disk_key);
1824
1825                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1826                         chunk = (struct btrfs_chunk *)(ptr + len);
1827                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
1828                         len += btrfs_chunk_item_size(num_stripes);
1829                 } else {
1830                         ret = -EIO;
1831                         break;
1832                 }
1833                 if (key.objectid == chunk_objectid &&
1834                     key.offset == chunk_offset) {
1835                         memmove(ptr, ptr + len, array_size - (cur + len));
1836                         array_size -= len;
1837                         btrfs_set_super_sys_array_size(super_copy, array_size);
1838                 } else {
1839                         ptr += len;
1840                         cur += len;
1841                 }
1842         }
1843         return ret;
1844 }
1845
1846 static int btrfs_relocate_chunk(struct btrfs_root *root,
1847                          u64 chunk_tree, u64 chunk_objectid,
1848                          u64 chunk_offset)
1849 {
1850         struct extent_map_tree *em_tree;
1851         struct btrfs_root *extent_root;
1852         struct btrfs_trans_handle *trans;
1853         struct extent_map *em;
1854         struct map_lookup *map;
1855         int ret;
1856         int i;
1857
1858         root = root->fs_info->chunk_root;
1859         extent_root = root->fs_info->extent_root;
1860         em_tree = &root->fs_info->mapping_tree.map_tree;
1861
1862         ret = btrfs_can_relocate(extent_root, chunk_offset);
1863         if (ret)
1864                 return -ENOSPC;
1865
1866         /* step one, relocate all the extents inside this chunk */
1867         ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1868         if (ret)
1869                 return ret;
1870
1871         trans = btrfs_start_transaction(root, 0);
1872         BUG_ON(!trans);
1873
1874         lock_chunks(root);
1875
1876         /*
1877          * step two, delete the device extents and the
1878          * chunk tree entries
1879          */
1880         read_lock(&em_tree->lock);
1881         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1882         read_unlock(&em_tree->lock);
1883
1884         BUG_ON(em->start > chunk_offset ||
1885                em->start + em->len < chunk_offset);
1886         map = (struct map_lookup *)em->bdev;
1887
1888         for (i = 0; i < map->num_stripes; i++) {
1889                 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
1890                                             map->stripes[i].physical);
1891                 BUG_ON(ret);
1892
1893                 if (map->stripes[i].dev) {
1894                         ret = btrfs_update_device(trans, map->stripes[i].dev);
1895                         BUG_ON(ret);
1896                 }
1897         }
1898         ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
1899                                chunk_offset);
1900
1901         BUG_ON(ret);
1902
1903         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
1904                 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
1905                 BUG_ON(ret);
1906         }
1907
1908         ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
1909         BUG_ON(ret);
1910
1911         write_lock(&em_tree->lock);
1912         remove_extent_mapping(em_tree, em);
1913         write_unlock(&em_tree->lock);
1914
1915         kfree(map);
1916         em->bdev = NULL;
1917
1918         /* once for the tree */
1919         free_extent_map(em);
1920         /* once for us */
1921         free_extent_map(em);
1922
1923         unlock_chunks(root);
1924         btrfs_end_transaction(trans, root);
1925         return 0;
1926 }
1927
1928 static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
1929 {
1930         struct btrfs_root *chunk_root = root->fs_info->chunk_root;
1931         struct btrfs_path *path;
1932         struct extent_buffer *leaf;
1933         struct btrfs_chunk *chunk;
1934         struct btrfs_key key;
1935         struct btrfs_key found_key;
1936         u64 chunk_tree = chunk_root->root_key.objectid;
1937         u64 chunk_type;
1938         bool retried = false;
1939         int failed = 0;
1940         int ret;
1941
1942         path = btrfs_alloc_path();
1943         if (!path)
1944                 return -ENOMEM;
1945
1946 again:
1947         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1948         key.offset = (u64)-1;
1949         key.type = BTRFS_CHUNK_ITEM_KEY;
1950
1951         while (1) {
1952                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
1953                 if (ret < 0)
1954                         goto error;
1955                 BUG_ON(ret == 0);
1956
1957                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
1958                                           key.type);
1959                 if (ret < 0)
1960                         goto error;
1961                 if (ret > 0)
1962                         break;
1963
1964                 leaf = path->nodes[0];
1965                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1966
1967                 chunk = btrfs_item_ptr(leaf, path->slots[0],
1968                                        struct btrfs_chunk);
1969                 chunk_type = btrfs_chunk_type(leaf, chunk);
1970                 btrfs_release_path(chunk_root, path);
1971
1972                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
1973                         ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
1974                                                    found_key.objectid,
1975                                                    found_key.offset);
1976                         if (ret == -ENOSPC)
1977                                 failed++;
1978                         else if (ret)
1979                                 BUG();
1980                 }
1981
1982                 if (found_key.offset == 0)
1983                         break;
1984                 key.offset = found_key.offset - 1;
1985         }
1986         ret = 0;
1987         if (failed && !retried) {
1988                 failed = 0;
1989                 retried = true;
1990                 goto again;
1991         } else if (failed && retried) {
1992                 WARN_ON(1);
1993                 ret = -ENOSPC;
1994         }
1995 error:
1996         btrfs_free_path(path);
1997         return ret;
1998 }
1999
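/*
 * Return num scaled by factor/10, e.g. div_factor(n, 1) is n / 10
 * (integer division). btrfs_balance() below uses it to pick how much
 * room to free on each device, capped at 1MB by the min() that follows.
 */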
2000 static u64 div_factor(u64 num, int factor)
2001 {
2002         if (factor == 10)
2003                 return num;
2004         num *= factor;
2005         do_div(num, 10);
2006         return num;
2007 }
2008
2009 int btrfs_balance(struct btrfs_root *dev_root)
2010 {
2011         int ret;
2012         struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
2013         struct btrfs_device *device;
2014         u64 old_size;
2015         u64 size_to_free;
2016         struct btrfs_path *path;
2017         struct btrfs_key key;
2018         struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
2019         struct btrfs_trans_handle *trans;
2020         struct btrfs_key found_key;
2021
2022         if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
2023                 return -EROFS;
2024
2025         mutex_lock(&dev_root->fs_info->volume_mutex);
2026         dev_root = dev_root->fs_info->dev_root;
2027
2028         /* step one, make some room on all the devices */
2029         list_for_each_entry(device, devices, dev_list) {
2030                 old_size = device->total_bytes;
2031                 size_to_free = div_factor(old_size, 1);
2032                 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
2033                 if (!device->writeable ||
2034                     device->total_bytes - device->bytes_used > size_to_free)
2035                         continue;
2036
2037                 ret = btrfs_shrink_device(device, old_size - size_to_free);
2038                 if (ret == -ENOSPC)
2039                         break;
2040                 BUG_ON(ret);
2041
2042                 trans = btrfs_start_transaction(dev_root, 0);
2043                 BUG_ON(!trans);
2044
2045                 ret = btrfs_grow_device(trans, device, old_size);
2046                 BUG_ON(ret);
2047
2048                 btrfs_end_transaction(trans, dev_root);
2049         }
2050
2051         /* step two, relocate all the chunks */
2052         path = btrfs_alloc_path();
2053         BUG_ON(!path);
2054
2055         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2056         key.offset = (u64)-1;
2057         key.type = BTRFS_CHUNK_ITEM_KEY;
2058
2059         while (1) {
2060                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2061                 if (ret < 0)
2062                         goto error;
2063
2064                 /*
2065                  * this shouldn't happen, it means the last relocate
2066                  * failed
2067                  */
2068                 if (ret == 0)
2069                         break;
2070
2071                 ret = btrfs_previous_item(chunk_root, path, 0,
2072                                           BTRFS_CHUNK_ITEM_KEY);
2073                 if (ret)
2074                         break;
2075
2076                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2077                                       path->slots[0]);
2078                 if (found_key.objectid != key.objectid)
2079                         break;
2080
2081                 /* chunk zero is special */
2082                 if (found_key.offset == 0)
2083                         break;
2084
2085                 btrfs_release_path(chunk_root, path);
2086                 ret = btrfs_relocate_chunk(chunk_root,
2087                                            chunk_root->root_key.objectid,
2088                                            found_key.objectid,
2089                                            found_key.offset);
2090                 BUG_ON(ret && ret != -ENOSPC);
2091                 key.offset = found_key.offset - 1;
2092         }
2093         ret = 0;
2094 error:
2095         btrfs_free_path(path);
2096         mutex_unlock(&dev_root->fs_info->volume_mutex);
2097         return ret;
2098 }
2099
2100 /*
2101  * shrinking a device means finding all of the device extents past
2102  * the new size, and then following the back refs to the chunks.
2103  * The chunk relocation code actually frees the device extents.
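 *
 * (Sketch of the loop below: device->total_bytes is lowered up front so
 * the allocator stops handing out the tail, every dev extent ending past
 * new_size is relocated, -ENOSPC failures are counted and retried once,
 * and only if the retry fails too is the old size restored and -ENOSPC
 * returned.)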
2104  */
2105 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2106 {
2107         struct btrfs_trans_handle *trans;
2108         struct btrfs_root *root = device->dev_root;
2109         struct btrfs_dev_extent *dev_extent = NULL;
2110         struct btrfs_path *path;
2111         u64 length;
2112         u64 chunk_tree;
2113         u64 chunk_objectid;
2114         u64 chunk_offset;
2115         int ret;
2116         int slot;
2117         int failed = 0;
2118         bool retried = false;
2119         struct extent_buffer *l;
2120         struct btrfs_key key;
2121         struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
2122         u64 old_total = btrfs_super_total_bytes(super_copy);
2123         u64 old_size = device->total_bytes;
2124         u64 diff = device->total_bytes - new_size;
2125
2126         if (new_size >= device->total_bytes)
2127                 return -EINVAL;
2128
2129         path = btrfs_alloc_path();
2130         if (!path)
2131                 return -ENOMEM;
2132
2133         path->reada = 2;
2134
2135         lock_chunks(root);
2136
2137         device->total_bytes = new_size;
2138         if (device->writeable)
2139                 device->fs_devices->total_rw_bytes -= diff;
2140         unlock_chunks(root);
2141
2142 again:
2143         key.objectid = device->devid;
2144         key.offset = (u64)-1;
2145         key.type = BTRFS_DEV_EXTENT_KEY;
2146
2147         while (1) {
2148                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2149                 if (ret < 0)
2150                         goto done;
2151
2152                 ret = btrfs_previous_item(root, path, 0, key.type);
2153                 if (ret < 0)
2154                         goto done;
2155                 if (ret) {
2156                         ret = 0;
2157                         btrfs_release_path(root, path);
2158                         break;
2159                 }
2160
2161                 l = path->nodes[0];
2162                 slot = path->slots[0];
2163                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
2164
2165                 if (key.objectid != device->devid) {
2166                         btrfs_release_path(root, path);
2167                         break;
2168                 }
2169
2170                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2171                 length = btrfs_dev_extent_length(l, dev_extent);
2172
2173                 if (key.offset + length <= new_size) {
2174                         btrfs_release_path(root, path);
2175                         break;
2176                 }
2177
2178                 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
2179                 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
2180                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
2181                 btrfs_release_path(root, path);
2182
2183                 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
2184                                            chunk_offset);
2185                 if (ret && ret != -ENOSPC)
2186                         goto done;
2187                 if (ret == -ENOSPC)
2188                         failed++;
2189                 key.offset -= 1;
2190         }
2191
2192         if (failed && !retried) {
2193                 failed = 0;
2194                 retried = true;
2195                 goto again;
2196         } else if (failed && retried) {
2197                 ret = -ENOSPC;
2198                 lock_chunks(root);
2199
2200                 device->total_bytes = old_size;
2201                 if (device->writeable)
2202                         device->fs_devices->total_rw_bytes += diff;
2203                 unlock_chunks(root);
2204                 goto done;
2205         }
2206
2207         /* Shrinking succeeded, else we would be at "done". */
2208         trans = btrfs_start_transaction(root, 0);
2209         lock_chunks(root);
2210
2211         device->disk_total_bytes = new_size;
2212         /* Now btrfs_update_device() will change the on-disk size. */
2213         ret = btrfs_update_device(trans, device);
2214         if (ret) {
2215                 unlock_chunks(root);
2216                 btrfs_end_transaction(trans, root);
2217                 goto done;
2218         }
2219         WARN_ON(diff > old_total);
2220         btrfs_set_super_total_bytes(super_copy, old_total - diff);
2221         unlock_chunks(root);
2222         btrfs_end_transaction(trans, root);
2223 done:
2224         btrfs_free_path(path);
2225         return ret;
2226 }
2227
2228 static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
2229                            struct btrfs_root *root,
2230                            struct btrfs_key *key,
2231                            struct btrfs_chunk *chunk, int item_size)
2232 {
2233         struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
2234         struct btrfs_disk_key disk_key;
2235         u32 array_size;
2236         u8 *ptr;
2237
2238         array_size = btrfs_super_sys_array_size(super_copy);
2239         if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
2240                 return -EFBIG;
2241
2242         ptr = super_copy->sys_chunk_array + array_size;
2243         btrfs_cpu_key_to_disk(&disk_key, key);
2244         memcpy(ptr, &disk_key, sizeof(disk_key));
2245         ptr += sizeof(disk_key);
2246         memcpy(ptr, chunk, item_size);
2247         item_size += sizeof(disk_key);
2248         btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
2249         return 0;
2250 }
2251
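/*
 * Total chunk length for a given stripe size; e.g. with calc_size 256MB,
 * RAID1/DUP expose one stripe's worth (256MB), RAID10 with 4 stripes and
 * 2 sub_stripes exposes 2 * 256MB, and RAID0 with 4 stripes 4 * 256MB.
 */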
2252 static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
2253                                         int num_stripes, int sub_stripes)
2254 {
2255         if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
2256                 return calc_size;
2257         else if (type & BTRFS_BLOCK_GROUP_RAID10)
2258                 return calc_size * (num_stripes / sub_stripes);
2259         else
2260                 return calc_size * num_stripes;
2261 }
2262
2263 /* Used to sort the devices by max_avail (descending sort) */
2264 int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2)
2265 {
2266         if (((struct btrfs_device_info *)dev_info1)->max_avail >
2267             ((struct btrfs_device_info *)dev_info2)->max_avail)
2268                 return -1;
2269         else if (((struct btrfs_device_info *)dev_info1)->max_avail <
2270                  ((struct btrfs_device_info *)dev_info2)->max_avail)
2271                 return 1;
2272         else
2273                 return 0;
2274 }
2275
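/*
 * Pick the stripe counts for the requested raid type; e.g. RAID10 on
 * five rw devices gives num_stripes = 4 (rounded down to a multiple of
 * the two sub_stripes), while RAID1 on one rw device fails with -ENOSPC.
 */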
2276 static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type,
2277                                  int *num_stripes, int *min_stripes,
2278                                  int *sub_stripes)
2279 {
2280         *num_stripes = 1;
2281         *min_stripes = 1;
2282         *sub_stripes = 0;
2283
2284         if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
2285                 *num_stripes = fs_devices->rw_devices;
2286                 *min_stripes = 2;
2287         }
2288         if (type & (BTRFS_BLOCK_GROUP_DUP)) {
2289                 *num_stripes = 2;
2290                 *min_stripes = 2;
2291         }
2292         if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
2293                 if (fs_devices->rw_devices < 2)
2294                         return -ENOSPC;
2295                 *num_stripes = 2;
2296                 *min_stripes = 2;
2297         }
2298         if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2299                 *num_stripes = fs_devices->rw_devices;
2300                 if (*num_stripes < 4)
2301                         return -ENOSPC;
2302                 *num_stripes &= ~(u32)1;
2303                 *sub_stripes = 2;
2304                 *min_stripes = 4;
2305         }
2306
2307         return 0;
2308 }
2309
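/*
 * Clamp the proposed stripe size to the per-type limits; e.g. for RAID1
 * metadata (ncopies 2, max_chunk_size 256MB, assuming the 10%-of-rw-space
 * cap does not bind), a proposed 1GB with num_stripes 2 is cut back to
 * calc_size = 256MB * 2 / 2 = 256MB, rounded to BTRFS_STRIPE_LEN.
 */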
2310 static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices,
2311                                     u64 proposed_size, u64 type,
2312                                     int num_stripes, int small_stripe)
2313 {
2314         int min_stripe_size = 1 * 1024 * 1024;
2315         u64 calc_size = proposed_size;
2316         u64 max_chunk_size = calc_size;
2317         int ncopies = 1;
2318
2319         if (type & (BTRFS_BLOCK_GROUP_RAID1 |
2320                     BTRFS_BLOCK_GROUP_DUP |
2321                     BTRFS_BLOCK_GROUP_RAID10))
2322                 ncopies = 2;
2323
2324         if (type & BTRFS_BLOCK_GROUP_DATA) {
2325                 max_chunk_size = 10 * calc_size;
2326                 min_stripe_size = 64 * 1024 * 1024;
2327         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
2328                 max_chunk_size = 256 * 1024 * 1024;
2329                 min_stripe_size = 32 * 1024 * 1024;
2330         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
2331                 calc_size = 8 * 1024 * 1024;
2332                 max_chunk_size = calc_size * 2;
2333                 min_stripe_size = 1 * 1024 * 1024;
2334         }
2335
2336         /* we don't want a chunk larger than 10% of writeable space */
2337         max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
2338                              max_chunk_size);
2339
2340         if (calc_size * num_stripes > max_chunk_size * ncopies) {
2341                 calc_size = max_chunk_size * ncopies;
2342                 do_div(calc_size, num_stripes);
2343                 do_div(calc_size, BTRFS_STRIPE_LEN);
2344                 calc_size *= BTRFS_STRIPE_LEN;
2345         }
2346
2347         /* we don't want tiny stripes */
2348         if (!small_stripe)
2349                 calc_size = max_t(u64, min_stripe_size, calc_size);
2350
2351         /*
2352          * we're about to do_div by BTRFS_STRIPE_LEN, so let's make sure
2353          * we end up with something bigger than a stripe
2354          */
2355         calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN);
2356
2357         do_div(calc_size, BTRFS_STRIPE_LEN);
2358         calc_size *= BTRFS_STRIPE_LEN;
2359
2360         return calc_size;
2361 }
2362
2363 static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map,
2364                                                       int num_stripes)
2365 {
2366         struct map_lookup *new;
2367         size_t len = map_lookup_size(num_stripes);
2368
2369         BUG_ON(map->num_stripes < num_stripes);
2370
2371         if (map->num_stripes == num_stripes)
2372                 return map;
2373
2374         new = kmalloc(len, GFP_NOFS);
2375         if (!new) {
2376                 /* just change map->num_stripes */
2377                 map->num_stripes = num_stripes;
2378                 return map;
2379         }
2380
2381         memcpy(new, map, len);
2382         new->num_stripes = num_stripes;
2383         kfree(map);
2384         return new;
2385 }
2386
2387 /*
2388  * helper to allocate device space from btrfs_device_info, in which we have
2389  * stored the max free space information of every device. It is used when we
2390  * cannot allocate chunks of the default size.
2391  *
2392  * With this helper, we can allocate a new chunk as large as possible.
2393  */
2394 static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans,
2395                                     struct btrfs_fs_devices *fs_devices,
2396                                     struct btrfs_device_info *devices,
2397                                     int nr_device, u64 type,
2398                                     struct map_lookup **map_lookup,
2399                                     int min_stripes, u64 *stripe_size)
2400 {
2401         int i, index, sort_again = 0;
2402         int min_devices = min_stripes;
2403         u64 max_avail, min_free;
2404         struct map_lookup *map = *map_lookup;
2405         int ret;
2406
2407         if (nr_device < min_stripes)
2408                 return -ENOSPC;
2409
2410         btrfs_descending_sort_devices(devices, nr_device);
2411
2412         max_avail = devices[0].max_avail;
2413         if (!max_avail)
2414                 return -ENOSPC;
2415
2416         for (i = 0; i < nr_device; i++) {
2417                 /*
2418                  * if dev_offset == 0, the free space of this device is less
2419                  * than what we need, and we haven't yet searched for the max
2420                  * avail extent on this device, so do it now.
2421                  */
2422                 if (!devices[i].dev_offset) {
2423                         ret = find_free_dev_extent(trans, devices[i].dev,
2424                                                    max_avail,
2425                                                    &devices[i].dev_offset,
2426                                                    &devices[i].max_avail);
2427                         if (ret != 0 && ret != -ENOSPC)
2428                                 return ret;
2429                         sort_again = 1;
2430                 }
2431         }
2432
2433         /* we updated the max avail free extent of each device, so sort again */
2434         if (sort_again)
2435                 btrfs_descending_sort_devices(devices, nr_device);
2436
2437         if (type & BTRFS_BLOCK_GROUP_DUP)
2438                 min_devices = 1;
2439
2440         if (!devices[min_devices - 1].max_avail)
2441                 return -ENOSPC;
2442
2443         max_avail = devices[min_devices - 1].max_avail;
2444         if (type & BTRFS_BLOCK_GROUP_DUP)
2445                 do_div(max_avail, 2);
2446
2447         max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type,
2448                                              min_stripes, 1);
2449         if (type & BTRFS_BLOCK_GROUP_DUP)
2450                 min_free = max_avail * 2;
2451         else
2452                 min_free = max_avail;
2453
2454         if (min_free > devices[min_devices - 1].max_avail)
2455                 return -ENOSPC;
2456
2457         map = __shrink_map_lookup_stripes(map, min_stripes);
2458         *stripe_size = max_avail;
2459
2460         index = 0;
2461         for (i = 0; i < min_stripes; i++) {
2462                 map->stripes[i].dev = devices[index].dev;
2463                 map->stripes[i].physical = devices[index].dev_offset;
2464                 if (type & BTRFS_BLOCK_GROUP_DUP) {
2465                         i++;
2466                         map->stripes[i].dev = devices[index].dev;
2467                         map->stripes[i].physical = devices[index].dev_offset +
2468                                                    max_avail;
2469                 }
2470                 index++;
2471         }
2472         *map_lookup = map;
2473
2474         return 0;
2475 }
2476
2477 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2478                                struct btrfs_root *extent_root,
2479                                struct map_lookup **map_ret,
2480                                u64 *num_bytes, u64 *stripe_size,
2481                                u64 start, u64 type)
2482 {
2483         struct btrfs_fs_info *info = extent_root->fs_info;
2484         struct btrfs_device *device = NULL;
2485         struct btrfs_fs_devices *fs_devices = info->fs_devices;
2486         struct list_head *cur;
2487         struct map_lookup *map;
2488         struct extent_map_tree *em_tree;
2489         struct extent_map *em;
2490         struct btrfs_device_info *devices_info;
2491         struct list_head private_devs;
2492         u64 calc_size = 1024 * 1024 * 1024;
2493         u64 min_free;
2494         u64 avail;
2495         u64 dev_offset;
2496         int num_stripes;
2497         int min_stripes;
2498         int sub_stripes;
2499         int min_devices;        /* the min number of devices we need */
2500         int i;
2501         int ret;
2502         int index;
2503
2504         if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
2505             (type & BTRFS_BLOCK_GROUP_DUP)) {
2506                 WARN_ON(1);
2507                 type &= ~BTRFS_BLOCK_GROUP_DUP;
2508         }
2509         if (list_empty(&fs_devices->alloc_list))
2510                 return -ENOSPC;
2511
2512         ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes,
2513                                     &min_stripes, &sub_stripes);
2514         if (ret)
2515                 return ret;
2516
2517         devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
2518                                GFP_NOFS);
2519         if (!devices_info)
2520                 return -ENOMEM;
2521
2522         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2523         if (!map) {
2524                 ret = -ENOMEM;
2525                 goto error;
2526         }
2527         map->num_stripes = num_stripes;
2528
2529         cur = fs_devices->alloc_list.next;
2530         index = 0;
2531         i = 0;
2532
2533         calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type,
2534                                              num_stripes, 0);
2535
2536         if (type & BTRFS_BLOCK_GROUP_DUP) {
2537                 min_free = calc_size * 2;
2538                 min_devices = 1;
2539         } else {
2540                 min_free = calc_size;
2541                 min_devices = min_stripes;
2542         }
2543
2544         INIT_LIST_HEAD(&private_devs);
2545         while (index < num_stripes) {
2546                 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
2547                 BUG_ON(!device->writeable);
2548                 if (device->total_bytes > device->bytes_used)
2549                         avail = device->total_bytes - device->bytes_used;
2550                 else
2551                         avail = 0;
2552                 cur = cur->next;
2553
2554                 if (device->in_fs_metadata && avail >= min_free) {
2555                         ret = find_free_dev_extent(trans, device, min_free,
2556                                                    &devices_info[i].dev_offset,
2557                                                    &devices_info[i].max_avail);
2558                         if (ret == 0) {
2559                                 list_move_tail(&device->dev_alloc_list,
2560                                                &private_devs);
2561                                 map->stripes[index].dev = device;
2562                                 map->stripes[index].physical =
2563                                                 devices_info[i].dev_offset;
2564                                 index++;
2565                                 if (type & BTRFS_BLOCK_GROUP_DUP) {
2566                                         map->stripes[index].dev = device;
2567                                         map->stripes[index].physical =
2568                                                 devices_info[i].dev_offset +
2569                                                 calc_size;
2570                                         index++;
2571                                 }
2572                         } else if (ret != -ENOSPC)
2573                                 goto error;
2574
2575                         devices_info[i].dev = device;
2576                         i++;
2577                 } else if (device->in_fs_metadata &&
2578                            avail >= BTRFS_STRIPE_LEN) {
2579                         devices_info[i].dev = device;
2580                         devices_info[i].max_avail = avail;
2581                         i++;
2582                 }
2583
2584                 if (cur == &fs_devices->alloc_list)
2585                         break;
2586         }
2587
2588         list_splice(&private_devs, &fs_devices->alloc_list);
2589         if (index < num_stripes) {
2590                 if (index >= min_stripes) {
2591                         num_stripes = index;
2592                         if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2593                                 num_stripes /= sub_stripes;
2594                                 num_stripes *= sub_stripes;
2595                         }
2596
2597                         map = __shrink_map_lookup_stripes(map, num_stripes);
2598                 } else if (i >= min_devices) {
2599                         ret = __btrfs_alloc_tiny_space(trans, fs_devices,
2600                                                        devices_info, i, type,
2601                                                        &map, min_stripes,
2602                                                        &calc_size);
2603                         if (ret)
2604                                 goto error;
2605                 } else {
2606                         ret = -ENOSPC;
2607                         goto error;
2608                 }
2609         }
2610         map->sector_size = extent_root->sectorsize;
2611         map->stripe_len = BTRFS_STRIPE_LEN;
2612         map->io_align = BTRFS_STRIPE_LEN;
2613         map->io_width = BTRFS_STRIPE_LEN;
2614         map->type = type;
2615         map->sub_stripes = sub_stripes;
2616
2617         *map_ret = map;
2618         *stripe_size = calc_size;
2619         *num_bytes = chunk_bytes_by_type(type, calc_size,
2620                                          map->num_stripes, sub_stripes);
2621
2622         em = alloc_extent_map(GFP_NOFS);
2623         if (!em) {
2624                 ret = -ENOMEM;
2625                 goto error;
2626         }
2627         em->bdev = (struct block_device *)map;
2628         em->start = start;
2629         em->len = *num_bytes;
2630         em->block_start = 0;
2631         em->block_len = em->len;
2632
2633         em_tree = &extent_root->fs_info->mapping_tree.map_tree;
2634         write_lock(&em_tree->lock);
2635         ret = add_extent_mapping(em_tree, em);
2636         write_unlock(&em_tree->lock);
2637         BUG_ON(ret);
2638         free_extent_map(em);
2639
2640         ret = btrfs_make_block_group(trans, extent_root, 0, type,
2641                                      BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2642                                      start, *num_bytes);
2643         BUG_ON(ret);
2644
2645         index = 0;
2646         while (index < map->num_stripes) {
2647                 device = map->stripes[index].dev;
2648                 dev_offset = map->stripes[index].physical;
2649
2650                 ret = btrfs_alloc_dev_extent(trans, device,
2651                                 info->chunk_root->root_key.objectid,
2652                                 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2653                                 start, dev_offset, calc_size);
2654                 BUG_ON(ret);
2655                 index++;
2656         }
2657
2658         kfree(devices_info);
2659         return 0;
2660
2661 error:
2662         kfree(map);
2663         kfree(devices_info);
2664         return ret;
2665 }
2666
2667 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2668                                 struct btrfs_root *extent_root,
2669                                 struct map_lookup *map, u64 chunk_offset,
2670                                 u64 chunk_size, u64 stripe_size)
2671 {
2672         u64 dev_offset;
2673         struct btrfs_key key;
2674         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2675         struct btrfs_device *device;
2676         struct btrfs_chunk *chunk;
2677         struct btrfs_stripe *stripe;
2678         size_t item_size = btrfs_chunk_item_size(map->num_stripes);
2679         int index = 0;
2680         int ret;
2681
2682         chunk = kzalloc(item_size, GFP_NOFS);
2683         if (!chunk)
2684                 return -ENOMEM;
2685
2686         index = 0;
2687         while (index < map->num_stripes) {
2688                 device = map->stripes[index].dev;
2689                 device->bytes_used += stripe_size;
2690                 ret = btrfs_update_device(trans, device);
2691                 BUG_ON(ret);
2692                 index++;
2693         }
2694
2695         index = 0;
2696         stripe = &chunk->stripe;
2697         while (index < map->num_stripes) {
2698                 device = map->stripes[index].dev;
2699                 dev_offset = map->stripes[index].physical;
2700
2701                 btrfs_set_stack_stripe_devid(stripe, device->devid);
2702                 btrfs_set_stack_stripe_offset(stripe, dev_offset);
2703                 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
2704                 stripe++;
2705                 index++;
2706         }
2707
2708         btrfs_set_stack_chunk_length(chunk, chunk_size);
2709         btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
2710         btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
2711         btrfs_set_stack_chunk_type(chunk, map->type);
2712         btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
2713         btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
2714         btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
2715         btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
2716         btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
2717
2718         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2719         key.type = BTRFS_CHUNK_ITEM_KEY;
2720         key.offset = chunk_offset;
2721
2722         ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
2723         BUG_ON(ret);
2724
2725         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2726                 ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
2727                                              item_size);
2728                 BUG_ON(ret);
2729         }
2730         kfree(chunk);
2731         return 0;
2732 }
2733
2734 /*
2735  * Chunk allocation falls into two parts. The first part does the work
2736  * that makes the newly allocated chunk usable, but does not do any
2737  * operation that modifies the chunk tree. The second part does the work
2738  * that requires modifying the chunk tree. This division is important for the
2739  * bootstrap process of adding storage to a seed btrfs.
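 * (Here, part one is __btrfs_alloc_chunk() and part two is
 * __finish_chunk_alloc(); init_first_rw_device() below depends on
 * running both part-one allocations before either part two.)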
2740  */
2741 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2742                       struct btrfs_root *extent_root, u64 type)
2743 {
2744         u64 chunk_offset;
2745         u64 chunk_size;
2746         u64 stripe_size;
2747         struct map_lookup *map;
2748         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2749         int ret;
2750
2751         ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2752                               &chunk_offset);
2753         if (ret)
2754                 return ret;
2755
2756         ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2757                                   &stripe_size, chunk_offset, type);
2758         if (ret)
2759                 return ret;
2760
2761         ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2762                                    chunk_size, stripe_size);
2763         BUG_ON(ret);
2764         return 0;
2765 }
2766
2767 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
2768                                          struct btrfs_root *root,
2769                                          struct btrfs_device *device)
2770 {
2771         u64 chunk_offset;
2772         u64 sys_chunk_offset;
2773         u64 chunk_size;
2774         u64 sys_chunk_size;
2775         u64 stripe_size;
2776         u64 sys_stripe_size;
2777         u64 alloc_profile;
2778         struct map_lookup *map;
2779         struct map_lookup *sys_map;
2780         struct btrfs_fs_info *fs_info = root->fs_info;
2781         struct btrfs_root *extent_root = fs_info->extent_root;
2782         int ret;
2783
2784         ret = find_next_chunk(fs_info->chunk_root,
2785                               BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
2786         BUG_ON(ret);
2787
2788         alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
2789                         (fs_info->metadata_alloc_profile &
2790                          fs_info->avail_metadata_alloc_bits);
2791         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2792
2793         ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2794                                   &stripe_size, chunk_offset, alloc_profile);
2795         BUG_ON(ret);
2796
2797         sys_chunk_offset = chunk_offset + chunk_size;
2798
2799         alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
2800                         (fs_info->system_alloc_profile &
2801                          fs_info->avail_system_alloc_bits);
2802         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2803
2804         ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
2805                                   &sys_chunk_size, &sys_stripe_size,
2806                                   sys_chunk_offset, alloc_profile);
2807         BUG_ON(ret);
2808
2809         ret = btrfs_add_device(trans, fs_info->chunk_root, device);
2810         BUG_ON(ret);
2811
2812         /*
2813          * Modifying the chunk tree requires allocating new blocks from
2814          * both the system block group and the metadata block group, so we
2815          * can only perform operations that modify the chunk tree after
2816          * both block groups have been created.
2817          */
2818         ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2819                                    chunk_size, stripe_size);
2820         BUG_ON(ret);
2821
2822         ret = __finish_chunk_alloc(trans, extent_root, sys_map,
2823                                    sys_chunk_offset, sys_chunk_size,
2824                                    sys_stripe_size);
2825         BUG_ON(ret);
2826         return 0;
2827 }
2828
2829 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
2830 {
2831         struct extent_map *em;
2832         struct map_lookup *map;
2833         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
2834         int readonly = 0;
2835         int i;
2836
2837         read_lock(&map_tree->map_tree.lock);
2838         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2839         read_unlock(&map_tree->map_tree.lock);
2840         if (!em)
2841                 return 1;
2842
2843         if (btrfs_test_opt(root, DEGRADED)) {
2844                 free_extent_map(em);
2845                 return 0;
2846         }
2847
2848         map = (struct map_lookup *)em->bdev;
2849         for (i = 0; i < map->num_stripes; i++) {
2850                 if (!map->stripes[i].dev->writeable) {
2851                         readonly = 1;
2852                         break;
2853                 }
2854         }
2855         free_extent_map(em);
2856         return readonly;
2857 }
2858
2859 void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
2860 {
2861         extent_map_tree_init(&tree->map_tree, GFP_NOFS);
2862 }
2863
2864 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
2865 {
2866         struct extent_map *em;
2867
2868         while (1) {
2869                 write_lock(&tree->map_tree.lock);
2870                 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
2871                 if (em)
2872                         remove_extent_mapping(&tree->map_tree, em);
2873                 write_unlock(&tree->map_tree.lock);
2874                 if (!em)
2875                         break;
2876                 kfree(em->bdev);
2877                 /* once for us */
2878                 free_extent_map(em);
2879                 /* once for the tree */
2880                 free_extent_map(em);
2881         }
2882 }
2883
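/*
 * Number of copies of the data at this logical range, judged from the
 * chunk type: DUP/RAID1 report num_stripes, RAID10 reports sub_stripes,
 * anything else (single, RAID0) reports 1.
 */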
2884 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
2885 {
2886         struct extent_map *em;
2887         struct map_lookup *map;
2888         struct extent_map_tree *em_tree = &map_tree->map_tree;
2889         int ret;
2890
2891         read_lock(&em_tree->lock);
2892         em = lookup_extent_mapping(em_tree, logical, len);
2893         read_unlock(&em_tree->lock);
2894         BUG_ON(!em);
2895
2896         BUG_ON(em->start > logical || em->start + em->len < logical);
2897         map = (struct map_lookup *)em->bdev;
2898         if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
2899                 ret = map->num_stripes;
2900         else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
2901                 ret = map->sub_stripes;
2902         else
2903                 ret = 1;
2904         free_extent_map(em);
2905         return ret;
2906 }
2907
2908 static int find_live_mirror(struct map_lookup *map, int first, int num,
2909                             int optimal)
2910 {
2911         int i;
2912         if (map->stripes[optimal].dev->bdev)
2913                 return optimal;
2914         for (i = first; i < first + num; i++) {
2915                 if (map->stripes[i].dev->bdev)
2916                         return i;
2917         }
2918         /* we couldn't find a live mirror.  Just return something
2919          * and the io error handling code will clean up eventually
2920          */
2921         return optimal;
2922 }
2923
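     /*
      * map a logical extent onto the physical stripes that back it.
      *
      * stripe_nr = offset / stripe_len and stripe_offset = offset %
      * stripe_len locate the block inside the chunk; e.g. with a 64K
      * stripe_len, an offset of 150K gives stripe_nr 2 and a 22K
      * stripe_offset.  *length is trimmed so striped profiles never
      * cross a stripe boundary.  Writes get every mirror filled into
      * the multi-bio, reads a single (preferably live) one, and when
      * unplug_page is set we only kick the backing devices.
      */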
2924 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2925                              u64 logical, u64 *length,
2926                              struct btrfs_multi_bio **multi_ret,
2927                              int mirror_num, struct page *unplug_page)
2928 {
2929         struct extent_map *em;
2930         struct map_lookup *map;
2931         struct extent_map_tree *em_tree = &map_tree->map_tree;
2932         u64 offset;
2933         u64 stripe_offset;
2934         u64 stripe_nr;
2935         int stripes_allocated = 8;
2936         int stripes_required = 1;
2937         int stripe_index;
2938         int i;
2939         int num_stripes;
2940         int max_errors = 0;
2941         struct btrfs_multi_bio *multi = NULL;
2942
2943         if (multi_ret && !(rw & REQ_WRITE))
2944                 stripes_allocated = 1;
2945 again:
2946         if (multi_ret) {
2947                 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
2948                                 GFP_NOFS);
2949                 if (!multi)
2950                         return -ENOMEM;
2951
2952                 atomic_set(&multi->error, 0);
2953         }
2954
2955         read_lock(&em_tree->lock);
2956         em = lookup_extent_mapping(em_tree, logical, *length);
2957         read_unlock(&em_tree->lock);
2958
2959         if (!em && unplug_page) {
2960                 kfree(multi);
2961                 return 0;
2962         }
2963
2964         if (!em) {
2965                 printk(KERN_CRIT "unable to find logical %llu len %llu\n",
2966                        (unsigned long long)logical,
2967                        (unsigned long long)*length);
2968                 BUG();
2969         }
2970
2971         BUG_ON(em->start > logical || em->start + em->len < logical);
2972         map = (struct map_lookup *)em->bdev;
2973         offset = logical - em->start;
2974
2975         if (mirror_num > map->num_stripes)
2976                 mirror_num = 0;
2977
2978         /* if our multi bio struct is too small, back off and try again */
2979         if (rw & REQ_WRITE) {
2980                 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2981                                  BTRFS_BLOCK_GROUP_DUP)) {
2982                         stripes_required = map->num_stripes;
2983                         max_errors = 1;
2984                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2985                         stripes_required = map->sub_stripes;
2986                         max_errors = 1;
2987                 }
2988         }
2989         if (multi_ret && (rw & REQ_WRITE) &&
2990             stripes_allocated < stripes_required) {
2991                 stripes_allocated = map->num_stripes;
2992                 free_extent_map(em);
2993                 kfree(multi);
2994                 goto again;
2995         }
2996         stripe_nr = offset;
2997         /*
2998          * stripe_nr counts the total number of stripes we have to stride
2999          * to get to this block
3000          */
3001         do_div(stripe_nr, map->stripe_len);
3002
3003         stripe_offset = stripe_nr * map->stripe_len;
3004         BUG_ON(offset < stripe_offset);
3005
3006         /* stripe_offset is the offset of this block in its stripe */
3007         stripe_offset = offset - stripe_offset;
3008
3009         if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
3010                          BTRFS_BLOCK_GROUP_RAID10 |
3011                          BTRFS_BLOCK_GROUP_DUP)) {
3012                 /* we limit the length of each bio to what fits in a stripe */
3013                 *length = min_t(u64, em->len - offset,
3014                               map->stripe_len - stripe_offset);
3015         } else {
3016                 *length = em->len - offset;
3017         }
3018
3019         if (!multi_ret && !unplug_page)
3020                 goto out;
3021
3022         num_stripes = 1;
3023         stripe_index = 0;
3024         if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3025                 if (unplug_page || (rw & REQ_WRITE))
3026                         num_stripes = map->num_stripes;
3027                 else if (mirror_num)
3028                         stripe_index = mirror_num - 1;
3029                 else {
3030                         stripe_index = find_live_mirror(map, 0,
3031                                             map->num_stripes,
3032                                             current->pid % map->num_stripes);
3033                 }
3034
3035         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3036                 if (rw & REQ_WRITE)
3037                         num_stripes = map->num_stripes;
3038                 else if (mirror_num)
3039                         stripe_index = mirror_num - 1;
3040
3041         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3042                 int factor = map->num_stripes / map->sub_stripes;
3043
3044                 stripe_index = do_div(stripe_nr, factor);
3045                 stripe_index *= map->sub_stripes;
3046
3047                 if (unplug_page || (rw & REQ_WRITE))
3048                         num_stripes = map->sub_stripes;
3049                 else if (mirror_num)
3050                         stripe_index += mirror_num - 1;
3051                 else {
3052                         stripe_index = find_live_mirror(map, stripe_index,
3053                                               map->sub_stripes, stripe_index +
3054                                               current->pid % map->sub_stripes);
3055                 }
3056         } else {
3057                 /*
3058                  * after this do_div call, stripe_nr is the number of stripes
3059                  * on this device we have to walk to find the data, and
3060                  * stripe_index is the number of our device in the stripe array
3061                  */
3062                 stripe_index = do_div(stripe_nr, map->num_stripes);
3063         }
3064         BUG_ON(stripe_index >= map->num_stripes);
3065
3066         for (i = 0; i < num_stripes; i++) {
3067                 if (unplug_page) {
3068                         struct btrfs_device *device;
3069                         struct backing_dev_info *bdi;
3070
3071                         device = map->stripes[stripe_index].dev;
3072                         if (device->bdev) {
3073                                 bdi = blk_get_backing_dev_info(device->bdev);
3074                                 if (bdi->unplug_io_fn)
3075                                         bdi->unplug_io_fn(bdi, unplug_page);
3076                         }
3077                 } else {
3078                         multi->stripes[i].physical =
3079                                 map->stripes[stripe_index].physical +
3080                                 stripe_offset + stripe_nr * map->stripe_len;
3081                         multi->stripes[i].dev = map->stripes[stripe_index].dev;
3082                 }
3083                 stripe_index++;
3084         }
3085         if (multi_ret) {
3086                 *multi_ret = multi;
3087                 multi->num_stripes = num_stripes;
3088                 multi->max_errors = max_errors;
3089         }
3090 out:
3091         free_extent_map(em);
3092         return 0;
3093 }
3094
3095 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3096                       u64 logical, u64 *length,
3097                       struct btrfs_multi_bio **multi_ret, int mirror_num)
3098 {
3099         return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
3100                                  mirror_num, NULL);
3101 }
3102
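     /*
      * reverse mapping: compute the logical addresses that refer to the
      * physical byte at 'physical' on device 'devid' inside the chunk
      * starting at chunk_start.  The caller gets the deduplicated
      * address array in *logical and must kfree() it.
      */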
3103 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3104                      u64 chunk_start, u64 physical, u64 devid,
3105                      u64 **logical, int *naddrs, int *stripe_len)
3106 {
3107         struct extent_map_tree *em_tree = &map_tree->map_tree;
3108         struct extent_map *em;
3109         struct map_lookup *map;
3110         u64 *buf;
3111         u64 bytenr;
3112         u64 length;
3113         u64 stripe_nr;
3114         int i, j, nr = 0;
3115
3116         read_lock(&em_tree->lock);
3117         em = lookup_extent_mapping(em_tree, chunk_start, 1);
3118         read_unlock(&em_tree->lock);
3119
3120         BUG_ON(!em || em->start != chunk_start);
3121         map = (struct map_lookup *)em->bdev;
3122
3123         length = em->len;
3124         if (map->type & BTRFS_BLOCK_GROUP_RAID10)
3125                 do_div(length, map->num_stripes / map->sub_stripes);
3126         else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
3127                 do_div(length, map->num_stripes);
3128
3129         buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
3130         BUG_ON(!buf);
3131
3132         for (i = 0; i < map->num_stripes; i++) {
3133                 if (devid && map->stripes[i].dev->devid != devid)
3134                         continue;
3135                 if (map->stripes[i].physical > physical ||
3136                     map->stripes[i].physical + length <= physical)
3137                         continue;
3138
3139                 stripe_nr = physical - map->stripes[i].physical;
3140                 do_div(stripe_nr, map->stripe_len);
3141
3142                 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3143                         stripe_nr = stripe_nr * map->num_stripes + i;
3144                         do_div(stripe_nr, map->sub_stripes);
3145                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3146                         stripe_nr = stripe_nr * map->num_stripes + i;
3147                 }
3148                 bytenr = chunk_start + stripe_nr * map->stripe_len;
3149                 WARN_ON(nr >= map->num_stripes);
3150                 for (j = 0; j < nr; j++) {
3151                         if (buf[j] == bytenr)
3152                                 break;
3153                 }
3154                 if (j == nr) {
3155                         WARN_ON(nr >= map->num_stripes);
3156                         buf[nr++] = bytenr;
3157                 }
3158         }
3159
3160         *logical = buf;
3161         *naddrs = nr;
3162         *stripe_len = map->stripe_len;
3163
3164         free_extent_map(em);
3165         return 0;
3166 }
3167
3168 int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
3169                       u64 logical, struct page *page)
3170 {
3171         u64 length = PAGE_CACHE_SIZE;
3172         return __btrfs_map_block(map_tree, READ, logical, &length,
3173                                  NULL, 0, page);
3174 }
3175
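     /*
      * completion handler for one stripe of a multi-stripe bio.  The
      * original bio is only ended once every stripe has finished, and it
      * only sees -EIO when more stripes failed than max_errors allows.
      */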
3176 static void end_bio_multi_stripe(struct bio *bio, int err)
3177 {
3178         struct btrfs_multi_bio *multi = bio->bi_private;
3179         int is_orig_bio = 0;
3180
3181         if (err)
3182                 atomic_inc(&multi->error);
3183
3184         if (bio == multi->orig_bio)
3185                 is_orig_bio = 1;
3186
3187         if (atomic_dec_and_test(&multi->stripes_pending)) {
3188                 if (!is_orig_bio) {
3189                         bio_put(bio);
3190                         bio = multi->orig_bio;
3191                 }
3192                 bio->bi_private = multi->private;
3193                 bio->bi_end_io = multi->end_io;
3194                 /* only send an error to the higher layers if it is
3195                  * beyond the tolerance of the multi-bio
3196                  */
3197                 if (atomic_read(&multi->error) > multi->max_errors) {
3198                         err = -EIO;
3199                 } else if (err) {
3200                         /*
3201                          * this bio is actually up to date; we didn't
3202                          * go over the max number of errors
3203                          */
3204                         set_bit(BIO_UPTODATE, &bio->bi_flags);
3205                         err = 0;
3206                 }
3207                 kfree(multi);
3208
3209                 bio_endio(bio, err);
3210         } else if (!is_orig_bio) {
3211                 bio_put(bio);
3212         }
3213 }
3214
3215 struct async_sched {
3216         struct bio *bio;
3217         int rw;
3218         struct btrfs_fs_info *info;
3219         struct btrfs_work work;
3220 };
3221
3222 /*
3223  * see run_scheduled_bios for a description of why bios are collected for
3224  * async submit.
3225  *
3226  * This will add one bio to the pending list for a device and make sure
3227  * the work struct is scheduled.
3228  */
3229 static noinline int schedule_bio(struct btrfs_root *root,
3230                                  struct btrfs_device *device,
3231                                  int rw, struct bio *bio)
3232 {
3233         int should_queue = 1;
3234         struct btrfs_pending_bios *pending_bios;
3235
3236         /* don't bother with additional async steps for reads, right now */
3237         if (!(rw & REQ_WRITE)) {
3238                 bio_get(bio);
3239                 submit_bio(rw, bio);
3240                 bio_put(bio);
3241                 return 0;
3242         }
3243
3244         /*
3245          * nr_async_bios allows us to reliably return congestion to the
3246          * higher layers.  Otherwise, the async bio makes it appear we have
3247          * made progress against dirty pages when we've really just put it
3248          * on a queue for later
3249          */
3250         atomic_inc(&root->fs_info->nr_async_bios);
3251         WARN_ON(bio->bi_next);
3252         bio->bi_next = NULL;
3253         bio->bi_rw |= rw;
3254
3255         spin_lock(&device->io_lock);
3256         if (bio->bi_rw & REQ_SYNC)
3257                 pending_bios = &device->pending_sync_bios;
3258         else
3259                 pending_bios = &device->pending_bios;
3260
3261         if (pending_bios->tail)
3262                 pending_bios->tail->bi_next = bio;
3263
3264         pending_bios->tail = bio;
3265         if (!pending_bios->head)
3266                 pending_bios->head = bio;
3267         if (device->running_pending)
3268                 should_queue = 0;
3269
3270         spin_unlock(&device->io_lock);
3271
3272         if (should_queue)
3273                 btrfs_queue_worker(&root->fs_info->submit_workers,
3274                                    &device->work);
3275         return 0;
3276 }
3277
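     /*
      * map a bio's logical address to physical stripes and send one copy
      * to each device: the original bio covers the last stripe and
      * clones cover the rest.  Stripes without a usable device are
      * failed with -EIO on the spot.
      */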
3278 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3279                   int mirror_num, int async_submit)
3280 {
3281         struct btrfs_mapping_tree *map_tree;
3282         struct btrfs_device *dev;
3283         struct bio *first_bio = bio;
3284         u64 logical = (u64)bio->bi_sector << 9;
3285         u64 length = 0;
3286         u64 map_length;
3287         struct btrfs_multi_bio *multi = NULL;
3288         int ret;
3289         int dev_nr = 0;
3290         int total_devs = 1;
3291
3292         length = bio->bi_size;
3293         map_tree = &root->fs_info->mapping_tree;
3294         map_length = length;
3295
3296         ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
3297                               mirror_num);
3298         BUG_ON(ret);
3299
3300         total_devs = multi->num_stripes;
3301         if (map_length < length) {
3302                 printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
3303                        "map len %llu\n", (unsigned long long)logical,
3304                        (unsigned long long)length,
3305                        (unsigned long long)map_length);
3306                 BUG();
3307         }
3308         multi->end_io = first_bio->bi_end_io;
3309         multi->private = first_bio->bi_private;
3310         multi->orig_bio = first_bio;
3311         atomic_set(&multi->stripes_pending, multi->num_stripes);
3312
3313         while (dev_nr < total_devs) {
3314                 if (total_devs > 1) {
3315                         if (dev_nr < total_devs - 1) {
3316                                 bio = bio_clone(first_bio, GFP_NOFS);
3317                                 BUG_ON(!bio);
3318                         } else {
3319                                 bio = first_bio;
3320                         }
3321                         bio->bi_private = multi;
3322                         bio->bi_end_io = end_bio_multi_stripe;
3323                 }
3324                 bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
3325                 dev = multi->stripes[dev_nr].dev;
3326                 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
3327                         bio->bi_bdev = dev->bdev;
3328                         if (async_submit)
3329                                 schedule_bio(root, dev, rw, bio);
3330                         else
3331                                 submit_bio(rw, bio);
3332                 } else {
3333                         bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
3334                         bio->bi_sector = logical >> 9;
3335                         bio_endio(bio, -EIO);
3336                 }
3337                 dev_nr++;
3338         }
3339         if (total_devs == 1)
3340                 kfree(multi);
3341         return 0;
3342 }
3343
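     /*
      * search the current filesystem and any seed filesystems chained
      * behind it for a device matching devid and uuid; fsid, when given,
      * restricts the search to one filesystem in the chain.
      */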
3344 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
3345                                        u8 *uuid, u8 *fsid)
3346 {
3347         struct btrfs_device *device;
3348         struct btrfs_fs_devices *cur_devices;
3349
3350         cur_devices = root->fs_info->fs_devices;
3351         while (cur_devices) {
3352                 if (!fsid ||
3353                     !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
3354                         device = __find_device(&cur_devices->devices,
3355                                                devid, uuid);
3356                         if (device)
3357                                 return device;
3358                 }
3359                 cur_devices = cur_devices->seed;
3360         }
3361         return NULL;
3362 }
3363
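     /*
      * create a placeholder btrfs_device for a devid the metadata
      * references but that wasn't found during the device scan, so a
      * degraded mount can carry on without it.
      */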
3364 static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
3365                                             u64 devid, u8 *dev_uuid)
3366 {
3367         struct btrfs_device *device;
3368         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
3369
3370         device = kzalloc(sizeof(*device), GFP_NOFS);
3371         if (!device)
3372                 return NULL;
3373         list_add(&device->dev_list,
3374                  &fs_devices->devices);
3375         device->barriers = 1;
3376         device->dev_root = root->fs_info->dev_root;
3377         device->devid = devid;
3378         device->work.func = pending_bios_fn;
3379         device->fs_devices = fs_devices;
3380         device->missing = 1;
3381         fs_devices->num_devices++;
3382         fs_devices->missing_devices++;
3383         spin_lock_init(&device->io_lock);
3384         INIT_LIST_HEAD(&device->dev_alloc_list);
3385         memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
3386         return device;
3387 }
3388
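     /*
      * turn an on-disk chunk item into a map_lookup and cache it in the
      * mapping tree as an extent_map, resolving each stripe to its
      * btrfs_device (or a placeholder when mounted degraded).
      */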
3389 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
3390                           struct extent_buffer *leaf,
3391                           struct btrfs_chunk *chunk)
3392 {
3393         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
3394         struct map_lookup *map;
3395         struct extent_map *em;
3396         u64 logical;
3397         u64 length;
3398         u64 devid;
3399         u8 uuid[BTRFS_UUID_SIZE];
3400         int num_stripes;
3401         int ret;
3402         int i;
3403
3404         logical = key->offset;
3405         length = btrfs_chunk_length(leaf, chunk);
3406
3407         read_lock(&map_tree->map_tree.lock);
3408         em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
3409         read_unlock(&map_tree->map_tree.lock);
3410
3411         /* already mapped? */
3412         if (em && em->start <= logical && em->start + em->len > logical) {
3413                 free_extent_map(em);
3414                 return 0;
3415         } else if (em) {
3416                 free_extent_map(em);
3417         }
3418
3419         em = alloc_extent_map(GFP_NOFS);
3420         if (!em)
3421                 return -ENOMEM;
3422         num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3423         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
3424         if (!map) {
3425                 free_extent_map(em);
3426                 return -ENOMEM;
3427         }
3428
3429         em->bdev = (struct block_device *)map;
3430         em->start = logical;
3431         em->len = length;
3432         em->block_start = 0;
3433         em->block_len = em->len;
3434
3435         map->num_stripes = num_stripes;
3436         map->io_width = btrfs_chunk_io_width(leaf, chunk);
3437         map->io_align = btrfs_chunk_io_align(leaf, chunk);
3438         map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
3439         map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
3440         map->type = btrfs_chunk_type(leaf, chunk);
3441         map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
3442         for (i = 0; i < num_stripes; i++) {
3443                 map->stripes[i].physical =
3444                         btrfs_stripe_offset_nr(leaf, chunk, i);
3445                 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
3446                 read_extent_buffer(leaf, uuid, (unsigned long)
3447                                    btrfs_stripe_dev_uuid_nr(chunk, i),
3448                                    BTRFS_UUID_SIZE);
3449                 map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
3450                                                         NULL);
3451                 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
3452                         kfree(map);
3453                         free_extent_map(em);
3454                         return -EIO;
3455                 }
3456                 if (!map->stripes[i].dev) {
3457                         map->stripes[i].dev =
3458                                 add_missing_dev(root, devid, uuid);
3459                         if (!map->stripes[i].dev) {
3460                                 kfree(map);
3461                                 free_extent_map(em);
3462                                 return -EIO;
3463                         }
3464                 }
3465                 map->stripes[i].dev->in_fs_metadata = 1;
3466         }
3467
3468         write_lock(&map_tree->map_tree.lock);
3469         ret = add_extent_mapping(&map_tree->map_tree, em);
3470         write_unlock(&map_tree->map_tree.lock);
3471         BUG_ON(ret);
3472         free_extent_map(em);
3473
3474         return 0;
3475 }
3476
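     /*
      * copy the persistent fields of an on-disk dev item into the
      * in-memory btrfs_device.
      */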
3477 static int fill_device_from_item(struct extent_buffer *leaf,
3478                                  struct btrfs_dev_item *dev_item,
3479                                  struct btrfs_device *device)
3480 {
3481         unsigned long ptr;
3482
3483         device->devid = btrfs_device_id(leaf, dev_item);
3484         device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
3485         device->total_bytes = device->disk_total_bytes;
3486         device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
3487         device->type = btrfs_device_type(leaf, dev_item);
3488         device->io_align = btrfs_device_io_align(leaf, dev_item);
3489         device->io_width = btrfs_device_io_width(leaf, dev_item);
3490         device->sector_size = btrfs_device_sector_size(leaf, dev_item);
3491
3492         ptr = (unsigned long)btrfs_device_uuid(dev_item);
3493         read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
3494
3495         return 0;
3496 }
3497
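     /*
      * make sure the seed filesystem 'fsid' is open: if it isn't chained
      * on ->seed already, find it among the scanned filesystems, clone
      * and open it read-only and hook it onto the seed list.
      */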
3498 static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
3499 {
3500         struct btrfs_fs_devices *fs_devices;
3501         int ret;
3502
3503         mutex_lock(&uuid_mutex);
3504
3505         fs_devices = root->fs_info->fs_devices->seed;
3506         while (fs_devices) {
3507                 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
3508                         ret = 0;
3509                         goto out;
3510                 }
3511                 fs_devices = fs_devices->seed;
3512         }
3513
3514         fs_devices = find_fsid(fsid);
3515         if (!fs_devices) {
3516                 ret = -ENOENT;
3517                 goto out;
3518         }
3519
3520         fs_devices = clone_fs_devices(fs_devices);
3521         if (IS_ERR(fs_devices)) {
3522                 ret = PTR_ERR(fs_devices);
3523                 goto out;
3524         }
3525
3526         ret = __btrfs_open_devices(fs_devices, FMODE_READ,
3527                                    root->fs_info->bdev_holder);
3528         if (ret)
3529                 goto out;
3530
3531         if (!fs_devices->seeding) {
3532                 __btrfs_close_devices(fs_devices);
3533                 free_fs_devices(fs_devices);
3534                 ret = -EINVAL;
3535                 goto out;
3536         }
3537
3538         fs_devices->seed = root->fs_info->fs_devices->seed;
3539         root->fs_info->fs_devices->seed = fs_devices;
3540 out:
3541         mutex_unlock(&uuid_mutex);
3542         return ret;
3543 }
3544
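     /*
      * match a dev item from the tree against the devices found at scan
      * time.  Seed devices are opened on demand; a missing device only
      * gets a placeholder when mounted degraded.
      */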
3545 static int read_one_dev(struct btrfs_root *root,
3546                         struct extent_buffer *leaf,
3547                         struct btrfs_dev_item *dev_item)
3548 {
3549         struct btrfs_device *device;
3550         u64 devid;
3551         int ret;
3552         u8 fs_uuid[BTRFS_UUID_SIZE];
3553         u8 dev_uuid[BTRFS_UUID_SIZE];
3554
3555         devid = btrfs_device_id(leaf, dev_item);
3556         read_extent_buffer(leaf, dev_uuid,
3557                            (unsigned long)btrfs_device_uuid(dev_item),
3558                            BTRFS_UUID_SIZE);
3559         read_extent_buffer(leaf, fs_uuid,
3560                            (unsigned long)btrfs_device_fsid(dev_item),
3561                            BTRFS_UUID_SIZE);
3562
3563         if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
3564                 ret = open_seed_devices(root, fs_uuid);
3565                 if (ret && !btrfs_test_opt(root, DEGRADED))
3566                         return ret;
3567         }
3568
3569         device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
3570         if (!device || !device->bdev) {
3571                 if (!btrfs_test_opt(root, DEGRADED))
3572                         return -EIO;
3573
3574                 if (!device) {
3575                         printk(KERN_WARNING "devid %llu missing\n",
3576                                (unsigned long long)devid);
3577                         device = add_missing_dev(root, devid, dev_uuid);
3578                         if (!device)
3579                                 return -ENOMEM;
3580                 } else if (!device->missing) {
3581                         /*
3582                          * this happens when a device that was properly set up
3583                          * in the device info lists suddenly goes bad.
3584                          * device->bdev is NULL, and so we have to set
3585                          * device->missing to one here
3586                          */
3587                         root->fs_info->fs_devices->missing_devices++;
3588                         device->missing = 1;
3589                 }
3590         }
3591
3592         if (device->fs_devices != root->fs_info->fs_devices) {
3593                 BUG_ON(device->writeable);
3594                 if (device->generation !=
3595                     btrfs_device_generation(leaf, dev_item))
3596                         return -EINVAL;
3597         }
3598
3599         fill_device_from_item(leaf, dev_item, device);
3600         device->dev_root = root->fs_info->dev_root;
3601         device->in_fs_metadata = 1;
3602         if (device->writeable)
3603                 device->fs_devices->total_rw_bytes += device->total_bytes;
3604         ret = 0;
3605         return ret;
3606 }
3607
3608 int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
3609 {
3610         struct btrfs_dev_item *dev_item;
3611
3612         dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
3613                                                      dev_item);
3614         return read_one_dev(root, buf, dev_item);
3615 }
3616
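     /*
      * the super block embeds copies of the system chunk items in
      * sys_chunk_array; map them first so the chunk tree itself can be
      * read at all.
      */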
3617 int btrfs_read_sys_array(struct btrfs_root *root)
3618 {
3619         struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
3620         struct extent_buffer *sb;
3621         struct btrfs_disk_key *disk_key;
3622         struct btrfs_chunk *chunk;
3623         u8 *ptr;
3624         unsigned long sb_ptr;
3625         int ret = 0;
3626         u32 num_stripes;
3627         u32 array_size;
3628         u32 len = 0;
3629         u32 cur;
3630         struct btrfs_key key;
3631
3632         sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
3633                                           BTRFS_SUPER_INFO_SIZE);
3634         if (!sb)
3635                 return -ENOMEM;
3636         btrfs_set_buffer_uptodate(sb);
3637         btrfs_set_buffer_lockdep_class(sb, 0);
3638
3639         write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
3640         array_size = btrfs_super_sys_array_size(super_copy);
3641
3642         ptr = super_copy->sys_chunk_array;
3643         sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
3644         cur = 0;
3645
3646         while (cur < array_size) {
3647                 disk_key = (struct btrfs_disk_key *)ptr;
3648                 btrfs_disk_key_to_cpu(&key, disk_key);
3649
3650                 len = sizeof(*disk_key); ptr += len;
3651                 sb_ptr += len;
3652                 cur += len;
3653
3654                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
3655                         chunk = (struct btrfs_chunk *)sb_ptr;
3656                         ret = read_one_chunk(root, &key, sb, chunk);
3657                         if (ret)
3658                                 break;
3659                         num_stripes = btrfs_chunk_num_stripes(sb, chunk);
3660                         len = btrfs_chunk_item_size(num_stripes);
3661                 } else {
3662                         ret = -EIO;
3663                         break;
3664                 }
3665                 ptr += len;
3666                 sb_ptr += len;
3667                 cur += len;
3668         }
3669         free_extent_buffer(sb);
3670         return ret;
3671 }
3672
3673 int btrfs_read_chunk_tree(struct btrfs_root *root)
3674 {
3675         struct btrfs_path *path;
3676         struct extent_buffer *leaf;
3677         struct btrfs_key key;
3678         struct btrfs_key found_key;
3679         int ret;
3680         int slot;
3681
3682         root = root->fs_info->chunk_root;
3683
3684         path = btrfs_alloc_path();
3685         if (!path)
3686                 return -ENOMEM;
3687
3688         /* first we search for all of the device items, and then we
3689          * read in all of the chunk items.  This way we can create chunk
3690          * mappings that reference all of the devices that are found
3691          */
3692         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
3693         key.offset = 0;
3694         key.type = 0;
3695 again:
3696         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3697         if (ret < 0)
3698                 goto error;
3699         while (1) {
3700                 leaf = path->nodes[0];
3701                 slot = path->slots[0];
3702                 if (slot >= btrfs_header_nritems(leaf)) {
3703                         ret = btrfs_next_leaf(root, path);
3704                         if (ret == 0)
3705                                 continue;
3706                         if (ret < 0)
3707                                 goto error;
3708                         break;
3709                 }
3710                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3711                 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3712                         if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
3713                                 break;
3714                         if (found_key.type == BTRFS_DEV_ITEM_KEY) {
3715                                 struct btrfs_dev_item *dev_item;
3716                                 dev_item = btrfs_item_ptr(leaf, slot,
3717                                                   struct btrfs_dev_item);
3718                                 ret = read_one_dev(root, leaf, dev_item);
3719                                 if (ret)
3720                                         goto error;
3721                         }
3722                 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
3723                         struct btrfs_chunk *chunk;
3724                         chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3725                         ret = read_one_chunk(root, &found_key, leaf, chunk);
3726                         if (ret)
3727                                 goto error;
3728                 }
3729                 path->slots[0]++;
3730         }
3731         if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3732                 key.objectid = 0;
3733                 btrfs_release_path(root, path);
3734                 goto again;
3735         }
3736         ret = 0;
3737 error:
3738         btrfs_free_path(path);
3739         return ret;
3740 }