Btrfs: move data checksumming into a dedicated tree
[linux-3.10.git] / fs / btrfs / volumes.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/bio.h>
20 #include <linux/buffer_head.h>
21 #include <linux/blkdev.h>
22 #include <linux/random.h>
23 #include <linux/version.h>
24 #include <asm/div64.h>
25 #include "compat.h"
26 #include "ctree.h"
27 #include "extent_map.h"
28 #include "disk-io.h"
29 #include "transaction.h"
30 #include "print-tree.h"
31 #include "volumes.h"
32 #include "async-thread.h"
33
34 struct map_lookup {
35         u64 type;
36         int io_align;
37         int io_width;
38         int stripe_len;
39         int sector_size;
40         int num_stripes;
41         int sub_stripes;
42         struct btrfs_bio_stripe stripes[];
43 };
44
45 static int init_first_rw_device(struct btrfs_trans_handle *trans,
46                                 struct btrfs_root *root,
47                                 struct btrfs_device *device);
48 static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
49
50
51 #define map_lookup_size(n) (sizeof(struct map_lookup) + \
52                             (sizeof(struct btrfs_bio_stripe) * (n)))
53
54 static DEFINE_MUTEX(uuid_mutex);
55 static LIST_HEAD(fs_uuids);
56
57 void btrfs_lock_volumes(void)
58 {
59         mutex_lock(&uuid_mutex);
60 }
61
62 void btrfs_unlock_volumes(void)
63 {
64         mutex_unlock(&uuid_mutex);
65 }
66
67 static void lock_chunks(struct btrfs_root *root)
68 {
69         mutex_lock(&root->fs_info->chunk_mutex);
70 }
71
72 static void unlock_chunks(struct btrfs_root *root)
73 {
74         mutex_unlock(&root->fs_info->chunk_mutex);
75 }
76
77 int btrfs_cleanup_fs_uuids(void)
78 {
79         struct btrfs_fs_devices *fs_devices;
80         struct btrfs_device *dev;
81
82         while (!list_empty(&fs_uuids)) {
83                 fs_devices = list_entry(fs_uuids.next,
84                                         struct btrfs_fs_devices, list);
85                 list_del(&fs_devices->list);
86                 while(!list_empty(&fs_devices->devices)) {
87                         dev = list_entry(fs_devices->devices.next,
88                                          struct btrfs_device, dev_list);
89                         if (dev->bdev) {
90                                 close_bdev_exclusive(dev->bdev, dev->mode);
91                                 fs_devices->open_devices--;
92                         }
93                         fs_devices->num_devices--;
94                         if (dev->writeable)
95                                 fs_devices->rw_devices--;
96                         list_del(&dev->dev_list);
97                         list_del(&dev->dev_alloc_list);
98                         kfree(dev->name);
99                         kfree(dev);
100                 }
101                 WARN_ON(fs_devices->num_devices);
102                 WARN_ON(fs_devices->open_devices);
103                 WARN_ON(fs_devices->rw_devices);
104                 kfree(fs_devices);
105         }
106         return 0;
107 }
108
109 static noinline struct btrfs_device *__find_device(struct list_head *head,
110                                                    u64 devid, u8 *uuid)
111 {
112         struct btrfs_device *dev;
113         struct list_head *cur;
114
115         list_for_each(cur, head) {
116                 dev = list_entry(cur, struct btrfs_device, dev_list);
117                 if (dev->devid == devid &&
118                     (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
119                         return dev;
120                 }
121         }
122         return NULL;
123 }
124
125 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
126 {
127         struct list_head *cur;
128         struct btrfs_fs_devices *fs_devices;
129
130         list_for_each(cur, &fs_uuids) {
131                 fs_devices = list_entry(cur, struct btrfs_fs_devices, list);
132                 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
133                         return fs_devices;
134         }
135         return NULL;
136 }
137
138 /*
139  * we try to collect pending bios for a device so we don't get a large
140  * number of procs sending bios down to the same device.  This greatly
141  * improves the schedulers ability to collect and merge the bios.
142  *
143  * But, it also turns into a long list of bios to process and that is sure
144  * to eventually make the worker thread block.  The solution here is to
145  * make some progress and then put this work struct back at the end of
146  * the list if the block device is congested.  This way, multiple devices
147  * can make progress from a single worker thread.
148  */
149 static int noinline run_scheduled_bios(struct btrfs_device *device)
150 {
151         struct bio *pending;
152         struct backing_dev_info *bdi;
153         struct btrfs_fs_info *fs_info;
154         struct bio *tail;
155         struct bio *cur;
156         int again = 0;
157         unsigned long num_run = 0;
158         unsigned long limit;
159
160         bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
161         fs_info = device->dev_root->fs_info;
162         limit = btrfs_async_submit_limit(fs_info);
163         limit = limit * 2 / 3;
164
165 loop:
166         spin_lock(&device->io_lock);
167
168         /* take all the bios off the list at once and process them
169          * later on (without the lock held).  But, remember the
170          * tail and other pointers so the bios can be properly reinserted
171          * into the list if we hit congestion
172          */
173         pending = device->pending_bios;
174         tail = device->pending_bio_tail;
175         WARN_ON(pending && !tail);
176         device->pending_bios = NULL;
177         device->pending_bio_tail = NULL;
178
179         /*
180          * if pending was null this time around, no bios need processing
181          * at all and we can stop.  Otherwise it'll loop back up again
182          * and do an additional check so no bios are missed.
183          *
184          * device->running_pending is used to synchronize with the
185          * schedule_bio code.
186          */
187         if (pending) {
188                 again = 1;
189                 device->running_pending = 1;
190         } else {
191                 again = 0;
192                 device->running_pending = 0;
193         }
194         spin_unlock(&device->io_lock);
195
196         while(pending) {
197                 cur = pending;
198                 pending = pending->bi_next;
199                 cur->bi_next = NULL;
200                 atomic_dec(&fs_info->nr_async_bios);
201
202                 if (atomic_read(&fs_info->nr_async_bios) < limit &&
203                     waitqueue_active(&fs_info->async_submit_wait))
204                         wake_up(&fs_info->async_submit_wait);
205
206                 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
207                 bio_get(cur);
208                 submit_bio(cur->bi_rw, cur);
209                 bio_put(cur);
210                 num_run++;
211
212                 /*
213                  * we made progress, there is more work to do and the bdi
214                  * is now congested.  Back off and let other work structs
215                  * run instead
216                  */
217                 if (pending && bdi_write_congested(bdi) &&
218                     fs_info->fs_devices->open_devices > 1) {
219                         struct bio *old_head;
220
221                         spin_lock(&device->io_lock);
222
223                         old_head = device->pending_bios;
224                         device->pending_bios = pending;
225                         if (device->pending_bio_tail)
226                                 tail->bi_next = old_head;
227                         else
228                                 device->pending_bio_tail = tail;
229
230                         spin_unlock(&device->io_lock);
231                         btrfs_requeue_work(&device->work);
232                         goto done;
233                 }
234         }
235         if (again)
236                 goto loop;
237 done:
238         return 0;
239 }
240
241 static void pending_bios_fn(struct btrfs_work *work)
242 {
243         struct btrfs_device *device;
244
245         device = container_of(work, struct btrfs_device, work);
246         run_scheduled_bios(device);
247 }
248
249 static noinline int device_list_add(const char *path,
250                            struct btrfs_super_block *disk_super,
251                            u64 devid, struct btrfs_fs_devices **fs_devices_ret)
252 {
253         struct btrfs_device *device;
254         struct btrfs_fs_devices *fs_devices;
255         u64 found_transid = btrfs_super_generation(disk_super);
256
257         fs_devices = find_fsid(disk_super->fsid);
258         if (!fs_devices) {
259                 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
260                 if (!fs_devices)
261                         return -ENOMEM;
262                 INIT_LIST_HEAD(&fs_devices->devices);
263                 INIT_LIST_HEAD(&fs_devices->alloc_list);
264                 list_add(&fs_devices->list, &fs_uuids);
265                 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
266                 fs_devices->latest_devid = devid;
267                 fs_devices->latest_trans = found_transid;
268                 device = NULL;
269         } else {
270                 device = __find_device(&fs_devices->devices, devid,
271                                        disk_super->dev_item.uuid);
272         }
273         if (!device) {
274                 if (fs_devices->opened)
275                         return -EBUSY;
276
277                 device = kzalloc(sizeof(*device), GFP_NOFS);
278                 if (!device) {
279                         /* we can safely leave the fs_devices entry around */
280                         return -ENOMEM;
281                 }
282                 device->devid = devid;
283                 device->work.func = pending_bios_fn;
284                 memcpy(device->uuid, disk_super->dev_item.uuid,
285                        BTRFS_UUID_SIZE);
286                 device->barriers = 1;
287                 spin_lock_init(&device->io_lock);
288                 device->name = kstrdup(path, GFP_NOFS);
289                 if (!device->name) {
290                         kfree(device);
291                         return -ENOMEM;
292                 }
293                 INIT_LIST_HEAD(&device->dev_alloc_list);
294                 list_add(&device->dev_list, &fs_devices->devices);
295                 device->fs_devices = fs_devices;
296                 fs_devices->num_devices++;
297         }
298
299         if (found_transid > fs_devices->latest_trans) {
300                 fs_devices->latest_devid = devid;
301                 fs_devices->latest_trans = found_transid;
302         }
303         *fs_devices_ret = fs_devices;
304         return 0;
305 }
306
307 int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
308 {
309         struct list_head *tmp;
310         struct list_head *cur;
311         struct btrfs_device *device;
312         int seed_devices = 0;
313
314         mutex_lock(&uuid_mutex);
315 again:
316         list_for_each_safe(cur, tmp, &fs_devices->devices) {
317                 device = list_entry(cur, struct btrfs_device, dev_list);
318                 if (device->in_fs_metadata)
319                         continue;
320
321                 if (device->bdev) {
322                         close_bdev_exclusive(device->bdev, device->mode);
323                         device->bdev = NULL;
324                         fs_devices->open_devices--;
325                 }
326                 if (device->writeable) {
327                         list_del_init(&device->dev_alloc_list);
328                         device->writeable = 0;
329                         fs_devices->rw_devices--;
330                 }
331                 if (!seed_devices) {
332                         list_del_init(&device->dev_list);
333                         fs_devices->num_devices--;
334                         kfree(device->name);
335                         kfree(device);
336                 }
337         }
338
339         if (fs_devices->seed) {
340                 fs_devices = fs_devices->seed;
341                 seed_devices = 1;
342                 goto again;
343         }
344
345         mutex_unlock(&uuid_mutex);
346         return 0;
347 }
348
349 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
350 {
351         struct btrfs_fs_devices *seed_devices;
352         struct list_head *cur;
353         struct btrfs_device *device;
354 again:
355         if (--fs_devices->opened > 0)
356                 return 0;
357
358         list_for_each(cur, &fs_devices->devices) {
359                 device = list_entry(cur, struct btrfs_device, dev_list);
360                 if (device->bdev) {
361                         close_bdev_exclusive(device->bdev, device->mode);
362                         fs_devices->open_devices--;
363                 }
364                 if (device->writeable) {
365                         list_del_init(&device->dev_alloc_list);
366                         fs_devices->rw_devices--;
367                 }
368
369                 device->bdev = NULL;
370                 device->writeable = 0;
371                 device->in_fs_metadata = 0;
372         }
373         fs_devices->opened = 0;
374         fs_devices->seeding = 0;
375         fs_devices->sprouted = 0;
376
377         seed_devices = fs_devices->seed;
378         fs_devices->seed = NULL;
379         if (seed_devices) {
380                 fs_devices = seed_devices;
381                 goto again;
382         }
383         return 0;
384 }
385
386 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
387 {
388         int ret;
389
390         mutex_lock(&uuid_mutex);
391         ret = __btrfs_close_devices(fs_devices);
392         mutex_unlock(&uuid_mutex);
393         return ret;
394 }
395
396 int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
397                          fmode_t flags, void *holder)
398 {
399         struct block_device *bdev;
400         struct list_head *head = &fs_devices->devices;
401         struct list_head *cur;
402         struct btrfs_device *device;
403         struct block_device *latest_bdev = NULL;
404         struct buffer_head *bh;
405         struct btrfs_super_block *disk_super;
406         u64 latest_devid = 0;
407         u64 latest_transid = 0;
408         u64 devid;
409         int seeding = 1;
410         int ret = 0;
411
412         list_for_each(cur, head) {
413                 device = list_entry(cur, struct btrfs_device, dev_list);
414                 if (device->bdev)
415                         continue;
416                 if (!device->name)
417                         continue;
418
419                 bdev = open_bdev_exclusive(device->name, flags, holder);
420                 if (IS_ERR(bdev)) {
421                         printk("open %s failed\n", device->name);
422                         goto error;
423                 }
424                 set_blocksize(bdev, 4096);
425
426                 bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
427                 if (!bh)
428                         goto error_close;
429
430                 disk_super = (struct btrfs_super_block *)bh->b_data;
431                 if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
432                     sizeof(disk_super->magic)))
433                         goto error_brelse;
434
435                 devid = le64_to_cpu(disk_super->dev_item.devid);
436                 if (devid != device->devid)
437                         goto error_brelse;
438
439                 if (memcmp(device->uuid, disk_super->dev_item.uuid,
440                            BTRFS_UUID_SIZE))
441                         goto error_brelse;
442
443                 device->generation = btrfs_super_generation(disk_super);
444                 if (!latest_transid || device->generation > latest_transid) {
445                         latest_devid = devid;
446                         latest_transid = device->generation;
447                         latest_bdev = bdev;
448                 }
449
450                 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
451                         device->writeable = 0;
452                 } else {
453                         device->writeable = !bdev_read_only(bdev);
454                         seeding = 0;
455                 }
456
457                 device->bdev = bdev;
458                 device->in_fs_metadata = 0;
459                 device->mode = flags;
460
461                 fs_devices->open_devices++;
462                 if (device->writeable) {
463                         fs_devices->rw_devices++;
464                         list_add(&device->dev_alloc_list,
465                                  &fs_devices->alloc_list);
466                 }
467                 continue;
468
469 error_brelse:
470                 brelse(bh);
471 error_close:
472                 close_bdev_exclusive(bdev, FMODE_READ);
473 error:
474                 continue;
475         }
476         if (fs_devices->open_devices == 0) {
477                 ret = -EIO;
478                 goto out;
479         }
480         fs_devices->seeding = seeding;
481         fs_devices->opened = 1;
482         fs_devices->latest_bdev = latest_bdev;
483         fs_devices->latest_devid = latest_devid;
484         fs_devices->latest_trans = latest_transid;
485         fs_devices->total_rw_bytes = 0;
486 out:
487         return ret;
488 }
489
490 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
491                        fmode_t flags, void *holder)
492 {
493         int ret;
494
495         mutex_lock(&uuid_mutex);
496         if (fs_devices->opened) {
497                 if (fs_devices->sprouted) {
498                         ret = -EBUSY;
499                 } else {
500                         fs_devices->opened++;
501                         ret = 0;
502                 }
503         } else {
504                 ret = __btrfs_open_devices(fs_devices, flags, holder);
505         }
506         mutex_unlock(&uuid_mutex);
507         return ret;
508 }
509
510 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
511                           struct btrfs_fs_devices **fs_devices_ret)
512 {
513         struct btrfs_super_block *disk_super;
514         struct block_device *bdev;
515         struct buffer_head *bh;
516         int ret;
517         u64 devid;
518         u64 transid;
519
520         mutex_lock(&uuid_mutex);
521
522         bdev = open_bdev_exclusive(path, flags, holder);
523
524         if (IS_ERR(bdev)) {
525                 ret = PTR_ERR(bdev);
526                 goto error;
527         }
528
529         ret = set_blocksize(bdev, 4096);
530         if (ret)
531                 goto error_close;
532         bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
533         if (!bh) {
534                 ret = -EIO;
535                 goto error_close;
536         }
537         disk_super = (struct btrfs_super_block *)bh->b_data;
538         if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
539             sizeof(disk_super->magic))) {
540                 ret = -EINVAL;
541                 goto error_brelse;
542         }
543         devid = le64_to_cpu(disk_super->dev_item.devid);
544         transid = btrfs_super_generation(disk_super);
545         if (disk_super->label[0])
546                 printk("device label %s ", disk_super->label);
547         else {
548                 /* FIXME, make a readl uuid parser */
549                 printk("device fsid %llx-%llx ",
550                        *(unsigned long long *)disk_super->fsid,
551                        *(unsigned long long *)(disk_super->fsid + 8));
552         }
553         printk("devid %Lu transid %Lu %s\n", devid, transid, path);
554         ret = device_list_add(path, disk_super, devid, fs_devices_ret);
555
556 error_brelse:
557         brelse(bh);
558 error_close:
559         close_bdev_exclusive(bdev, flags);
560 error:
561         mutex_unlock(&uuid_mutex);
562         return ret;
563 }
564
565 /*
566  * this uses a pretty simple search, the expectation is that it is
567  * called very infrequently and that a given device has a small number
568  * of extents
569  */
570 static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
571                                          struct btrfs_device *device,
572                                          u64 num_bytes, u64 *start)
573 {
574         struct btrfs_key key;
575         struct btrfs_root *root = device->dev_root;
576         struct btrfs_dev_extent *dev_extent = NULL;
577         struct btrfs_path *path;
578         u64 hole_size = 0;
579         u64 last_byte = 0;
580         u64 search_start = 0;
581         u64 search_end = device->total_bytes;
582         int ret;
583         int slot = 0;
584         int start_found;
585         struct extent_buffer *l;
586
587         path = btrfs_alloc_path();
588         if (!path)
589                 return -ENOMEM;
590         path->reada = 2;
591         start_found = 0;
592
593         /* FIXME use last free of some kind */
594
595         /* we don't want to overwrite the superblock on the drive,
596          * so we make sure to start at an offset of at least 1MB
597          */
598         search_start = max((u64)1024 * 1024, search_start);
599
600         if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
601                 search_start = max(root->fs_info->alloc_start, search_start);
602
603         key.objectid = device->devid;
604         key.offset = search_start;
605         key.type = BTRFS_DEV_EXTENT_KEY;
606         ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
607         if (ret < 0)
608                 goto error;
609         ret = btrfs_previous_item(root, path, 0, key.type);
610         if (ret < 0)
611                 goto error;
612         l = path->nodes[0];
613         btrfs_item_key_to_cpu(l, &key, path->slots[0]);
614         while (1) {
615                 l = path->nodes[0];
616                 slot = path->slots[0];
617                 if (slot >= btrfs_header_nritems(l)) {
618                         ret = btrfs_next_leaf(root, path);
619                         if (ret == 0)
620                                 continue;
621                         if (ret < 0)
622                                 goto error;
623 no_more_items:
624                         if (!start_found) {
625                                 if (search_start >= search_end) {
626                                         ret = -ENOSPC;
627                                         goto error;
628                                 }
629                                 *start = search_start;
630                                 start_found = 1;
631                                 goto check_pending;
632                         }
633                         *start = last_byte > search_start ?
634                                 last_byte : search_start;
635                         if (search_end <= *start) {
636                                 ret = -ENOSPC;
637                                 goto error;
638                         }
639                         goto check_pending;
640                 }
641                 btrfs_item_key_to_cpu(l, &key, slot);
642
643                 if (key.objectid < device->devid)
644                         goto next;
645
646                 if (key.objectid > device->devid)
647                         goto no_more_items;
648
649                 if (key.offset >= search_start && key.offset > last_byte &&
650                     start_found) {
651                         if (last_byte < search_start)
652                                 last_byte = search_start;
653                         hole_size = key.offset - last_byte;
654                         if (key.offset > last_byte &&
655                             hole_size >= num_bytes) {
656                                 *start = last_byte;
657                                 goto check_pending;
658                         }
659                 }
660                 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) {
661                         goto next;
662                 }
663
664                 start_found = 1;
665                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
666                 last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
667 next:
668                 path->slots[0]++;
669                 cond_resched();
670         }
671 check_pending:
672         /* we have to make sure we didn't find an extent that has already
673          * been allocated by the map tree or the original allocation
674          */
675         BUG_ON(*start < search_start);
676
677         if (*start + num_bytes > search_end) {
678                 ret = -ENOSPC;
679                 goto error;
680         }
681         /* check for pending inserts here */
682         ret = 0;
683
684 error:
685         btrfs_free_path(path);
686         return ret;
687 }
688
689 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
690                           struct btrfs_device *device,
691                           u64 start)
692 {
693         int ret;
694         struct btrfs_path *path;
695         struct btrfs_root *root = device->dev_root;
696         struct btrfs_key key;
697         struct btrfs_key found_key;
698         struct extent_buffer *leaf = NULL;
699         struct btrfs_dev_extent *extent = NULL;
700
701         path = btrfs_alloc_path();
702         if (!path)
703                 return -ENOMEM;
704
705         key.objectid = device->devid;
706         key.offset = start;
707         key.type = BTRFS_DEV_EXTENT_KEY;
708
709         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
710         if (ret > 0) {
711                 ret = btrfs_previous_item(root, path, key.objectid,
712                                           BTRFS_DEV_EXTENT_KEY);
713                 BUG_ON(ret);
714                 leaf = path->nodes[0];
715                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
716                 extent = btrfs_item_ptr(leaf, path->slots[0],
717                                         struct btrfs_dev_extent);
718                 BUG_ON(found_key.offset > start || found_key.offset +
719                        btrfs_dev_extent_length(leaf, extent) < start);
720                 ret = 0;
721         } else if (ret == 0) {
722                 leaf = path->nodes[0];
723                 extent = btrfs_item_ptr(leaf, path->slots[0],
724                                         struct btrfs_dev_extent);
725         }
726         BUG_ON(ret);
727
728         if (device->bytes_used > 0)
729                 device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
730         ret = btrfs_del_item(trans, root, path);
731         BUG_ON(ret);
732
733         btrfs_free_path(path);
734         return ret;
735 }
736
737 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
738                            struct btrfs_device *device,
739                            u64 chunk_tree, u64 chunk_objectid,
740                            u64 chunk_offset, u64 start, u64 num_bytes)
741 {
742         int ret;
743         struct btrfs_path *path;
744         struct btrfs_root *root = device->dev_root;
745         struct btrfs_dev_extent *extent;
746         struct extent_buffer *leaf;
747         struct btrfs_key key;
748
749         WARN_ON(!device->in_fs_metadata);
750         path = btrfs_alloc_path();
751         if (!path)
752                 return -ENOMEM;
753
754         key.objectid = device->devid;
755         key.offset = start;
756         key.type = BTRFS_DEV_EXTENT_KEY;
757         ret = btrfs_insert_empty_item(trans, root, path, &key,
758                                       sizeof(*extent));
759         BUG_ON(ret);
760
761         leaf = path->nodes[0];
762         extent = btrfs_item_ptr(leaf, path->slots[0],
763                                 struct btrfs_dev_extent);
764         btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
765         btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
766         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
767
768         write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
769                     (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
770                     BTRFS_UUID_SIZE);
771
772         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
773         btrfs_mark_buffer_dirty(leaf);
774         btrfs_free_path(path);
775         return ret;
776 }
777
778 static noinline int find_next_chunk(struct btrfs_root *root,
779                                     u64 objectid, u64 *offset)
780 {
781         struct btrfs_path *path;
782         int ret;
783         struct btrfs_key key;
784         struct btrfs_chunk *chunk;
785         struct btrfs_key found_key;
786
787         path = btrfs_alloc_path();
788         BUG_ON(!path);
789
790         key.objectid = objectid;
791         key.offset = (u64)-1;
792         key.type = BTRFS_CHUNK_ITEM_KEY;
793
794         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
795         if (ret < 0)
796                 goto error;
797
798         BUG_ON(ret == 0);
799
800         ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
801         if (ret) {
802                 *offset = 0;
803         } else {
804                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
805                                       path->slots[0]);
806                 if (found_key.objectid != objectid)
807                         *offset = 0;
808                 else {
809                         chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
810                                                struct btrfs_chunk);
811                         *offset = found_key.offset +
812                                 btrfs_chunk_length(path->nodes[0], chunk);
813                 }
814         }
815         ret = 0;
816 error:
817         btrfs_free_path(path);
818         return ret;
819 }
820
821 static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
822 {
823         int ret;
824         struct btrfs_key key;
825         struct btrfs_key found_key;
826         struct btrfs_path *path;
827
828         root = root->fs_info->chunk_root;
829
830         path = btrfs_alloc_path();
831         if (!path)
832                 return -ENOMEM;
833
834         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
835         key.type = BTRFS_DEV_ITEM_KEY;
836         key.offset = (u64)-1;
837
838         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
839         if (ret < 0)
840                 goto error;
841
842         BUG_ON(ret == 0);
843
844         ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
845                                   BTRFS_DEV_ITEM_KEY);
846         if (ret) {
847                 *objectid = 1;
848         } else {
849                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
850                                       path->slots[0]);
851                 *objectid = found_key.offset + 1;
852         }
853         ret = 0;
854 error:
855         btrfs_free_path(path);
856         return ret;
857 }
858
859 /*
860  * the device information is stored in the chunk root
861  * the btrfs_device struct should be fully filled in
862  */
863 int btrfs_add_device(struct btrfs_trans_handle *trans,
864                      struct btrfs_root *root,
865                      struct btrfs_device *device)
866 {
867         int ret;
868         struct btrfs_path *path;
869         struct btrfs_dev_item *dev_item;
870         struct extent_buffer *leaf;
871         struct btrfs_key key;
872         unsigned long ptr;
873
874         root = root->fs_info->chunk_root;
875
876         path = btrfs_alloc_path();
877         if (!path)
878                 return -ENOMEM;
879
880         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
881         key.type = BTRFS_DEV_ITEM_KEY;
882         key.offset = device->devid;
883
884         ret = btrfs_insert_empty_item(trans, root, path, &key,
885                                       sizeof(*dev_item));
886         if (ret)
887                 goto out;
888
889         leaf = path->nodes[0];
890         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
891
892         btrfs_set_device_id(leaf, dev_item, device->devid);
893         btrfs_set_device_generation(leaf, dev_item, 0);
894         btrfs_set_device_type(leaf, dev_item, device->type);
895         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
896         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
897         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
898         btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
899         btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
900         btrfs_set_device_group(leaf, dev_item, 0);
901         btrfs_set_device_seek_speed(leaf, dev_item, 0);
902         btrfs_set_device_bandwidth(leaf, dev_item, 0);
903
904         ptr = (unsigned long)btrfs_device_uuid(dev_item);
905         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
906         ptr = (unsigned long)btrfs_device_fsid(dev_item);
907         write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
908         btrfs_mark_buffer_dirty(leaf);
909
910         ret = 0;
911 out:
912         btrfs_free_path(path);
913         return ret;
914 }
915
916 static int btrfs_rm_dev_item(struct btrfs_root *root,
917                              struct btrfs_device *device)
918 {
919         int ret;
920         struct btrfs_path *path;
921         struct btrfs_key key;
922         struct btrfs_trans_handle *trans;
923
924         root = root->fs_info->chunk_root;
925
926         path = btrfs_alloc_path();
927         if (!path)
928                 return -ENOMEM;
929
930         trans = btrfs_start_transaction(root, 1);
931         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
932         key.type = BTRFS_DEV_ITEM_KEY;
933         key.offset = device->devid;
934         lock_chunks(root);
935
936         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
937         if (ret < 0)
938                 goto out;
939
940         if (ret > 0) {
941                 ret = -ENOENT;
942                 goto out;
943         }
944
945         ret = btrfs_del_item(trans, root, path);
946         if (ret)
947                 goto out;
948 out:
949         btrfs_free_path(path);
950         unlock_chunks(root);
951         btrfs_commit_transaction(trans, root);
952         return ret;
953 }
954
955 int btrfs_rm_device(struct btrfs_root *root, char *device_path)
956 {
957         struct btrfs_device *device;
958         struct btrfs_device *next_device;
959         struct block_device *bdev;
960         struct buffer_head *bh = NULL;
961         struct btrfs_super_block *disk_super;
962         u64 all_avail;
963         u64 devid;
964         u64 num_devices;
965         u8 *dev_uuid;
966         int ret = 0;
967
968         mutex_lock(&uuid_mutex);
969         mutex_lock(&root->fs_info->volume_mutex);
970
971         all_avail = root->fs_info->avail_data_alloc_bits |
972                 root->fs_info->avail_system_alloc_bits |
973                 root->fs_info->avail_metadata_alloc_bits;
974
975         if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
976             root->fs_info->fs_devices->rw_devices <= 4) {
977                 printk("btrfs: unable to go below four devices on raid10\n");
978                 ret = -EINVAL;
979                 goto out;
980         }
981
982         if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
983             root->fs_info->fs_devices->rw_devices <= 2) {
984                 printk("btrfs: unable to go below two devices on raid1\n");
985                 ret = -EINVAL;
986                 goto out;
987         }
988
989         if (strcmp(device_path, "missing") == 0) {
990                 struct list_head *cur;
991                 struct list_head *devices;
992                 struct btrfs_device *tmp;
993
994                 device = NULL;
995                 devices = &root->fs_info->fs_devices->devices;
996                 list_for_each(cur, devices) {
997                         tmp = list_entry(cur, struct btrfs_device, dev_list);
998                         if (tmp->in_fs_metadata && !tmp->bdev) {
999                                 device = tmp;
1000                                 break;
1001                         }
1002                 }
1003                 bdev = NULL;
1004                 bh = NULL;
1005                 disk_super = NULL;
1006                 if (!device) {
1007                         printk("btrfs: no missing devices found to remove\n");
1008                         goto out;
1009                 }
1010         } else {
1011                 bdev = open_bdev_exclusive(device_path, FMODE_READ,
1012                                       root->fs_info->bdev_holder);
1013                 if (IS_ERR(bdev)) {
1014                         ret = PTR_ERR(bdev);
1015                         goto out;
1016                 }
1017
1018                 set_blocksize(bdev, 4096);
1019                 bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
1020                 if (!bh) {
1021                         ret = -EIO;
1022                         goto error_close;
1023                 }
1024                 disk_super = (struct btrfs_super_block *)bh->b_data;
1025                 if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
1026                             sizeof(disk_super->magic))) {
1027                         ret = -ENOENT;
1028                         goto error_brelse;
1029                 }
1030                 devid = le64_to_cpu(disk_super->dev_item.devid);
1031                 dev_uuid = disk_super->dev_item.uuid;
1032                 device = btrfs_find_device(root, devid, dev_uuid,
1033                                            disk_super->fsid);
1034                 if (!device) {
1035                         ret = -ENOENT;
1036                         goto error_brelse;
1037                 }
1038         }
1039
1040         if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1041                 printk("btrfs: unable to remove the only writeable device\n");
1042                 ret = -EINVAL;
1043                 goto error_brelse;
1044         }
1045
1046         if (device->writeable) {
1047                 list_del_init(&device->dev_alloc_list);
1048                 root->fs_info->fs_devices->rw_devices--;
1049         }
1050
1051         ret = btrfs_shrink_device(device, 0);
1052         if (ret)
1053                 goto error_brelse;
1054
1055         ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1056         if (ret)
1057                 goto error_brelse;
1058
1059         device->in_fs_metadata = 0;
1060         if (device->fs_devices == root->fs_info->fs_devices) {
1061                 list_del_init(&device->dev_list);
1062                 root->fs_info->fs_devices->num_devices--;
1063                 if (device->bdev)
1064                         device->fs_devices->open_devices--;
1065         }
1066
1067         next_device = list_entry(root->fs_info->fs_devices->devices.next,
1068                                  struct btrfs_device, dev_list);
1069         if (device->bdev == root->fs_info->sb->s_bdev)
1070                 root->fs_info->sb->s_bdev = next_device->bdev;
1071         if (device->bdev == root->fs_info->fs_devices->latest_bdev)
1072                 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1073
1074         num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
1075         btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
1076
1077         if (device->fs_devices != root->fs_info->fs_devices) {
1078                 BUG_ON(device->writeable);
1079                 brelse(bh);
1080                 if (bdev)
1081                         close_bdev_exclusive(bdev, FMODE_READ);
1082
1083                 if (device->bdev) {
1084                         close_bdev_exclusive(device->bdev, device->mode);
1085                         device->bdev = NULL;
1086                         device->fs_devices->open_devices--;
1087                 }
1088                 if (device->fs_devices->open_devices == 0) {
1089                         struct btrfs_fs_devices *fs_devices;
1090                         fs_devices = root->fs_info->fs_devices;
1091                         while (fs_devices) {
1092                                 if (fs_devices->seed == device->fs_devices)
1093                                         break;
1094                                 fs_devices = fs_devices->seed;
1095                         }
1096                         fs_devices->seed = device->fs_devices->seed;
1097                         device->fs_devices->seed = NULL;
1098                         __btrfs_close_devices(device->fs_devices);
1099                 }
1100                 ret = 0;
1101                 goto out;
1102         }
1103
1104         /*
1105          * at this point, the device is zero sized.  We want to
1106          * remove it from the devices list and zero out the old super
1107          */
1108         if (device->writeable) {
1109                 /* make sure this device isn't detected as part of
1110                  * the FS anymore
1111                  */
1112                 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
1113                 set_buffer_dirty(bh);
1114                 sync_dirty_buffer(bh);
1115         }
1116         brelse(bh);
1117
1118         if (device->bdev) {
1119                 /* one close for the device struct or super_block */
1120                 close_bdev_exclusive(device->bdev, device->mode);
1121         }
1122         if (bdev) {
1123                 /* one close for us */
1124                 close_bdev_exclusive(bdev, FMODE_READ);
1125         }
1126         kfree(device->name);
1127         kfree(device);
1128         ret = 0;
1129         goto out;
1130
1131 error_brelse:
1132         brelse(bh);
1133 error_close:
1134         if (bdev)
1135                 close_bdev_exclusive(bdev, FMODE_READ);
1136 out:
1137         mutex_unlock(&root->fs_info->volume_mutex);
1138         mutex_unlock(&uuid_mutex);
1139         return ret;
1140 }
1141
1142 /*
1143  * does all the dirty work required for changing file system's UUID.
1144  */
1145 static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
1146                                 struct btrfs_root *root)
1147 {
1148         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1149         struct btrfs_fs_devices *old_devices;
1150         struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
1151         struct btrfs_device *device;
1152         u64 super_flags;
1153
1154         BUG_ON(!mutex_is_locked(&uuid_mutex));
1155         if (!fs_devices->seeding || fs_devices->opened != 1)
1156                 return -EINVAL;
1157
1158         old_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
1159         if (!old_devices)
1160                 return -ENOMEM;
1161
1162         memcpy(old_devices, fs_devices, sizeof(*old_devices));
1163         old_devices->opened = 1;
1164         old_devices->sprouted = 1;
1165         INIT_LIST_HEAD(&old_devices->devices);
1166         INIT_LIST_HEAD(&old_devices->alloc_list);
1167         list_splice_init(&fs_devices->devices, &old_devices->devices);
1168         list_splice_init(&fs_devices->alloc_list, &old_devices->alloc_list);
1169         list_for_each_entry(device, &old_devices->devices, dev_list) {
1170                 device->fs_devices = old_devices;
1171         }
1172         list_add(&old_devices->list, &fs_uuids);
1173
1174         fs_devices->seeding = 0;
1175         fs_devices->num_devices = 0;
1176         fs_devices->open_devices = 0;
1177         fs_devices->seed = old_devices;
1178
1179         generate_random_uuid(fs_devices->fsid);
1180         memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1181         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1182         super_flags = btrfs_super_flags(disk_super) &
1183                       ~BTRFS_SUPER_FLAG_SEEDING;
1184         btrfs_set_super_flags(disk_super, super_flags);
1185
1186         return 0;
1187 }
1188
1189 /*
1190  * strore the expected generation for seed devices in device items.
1191  */
1192 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
1193                                struct btrfs_root *root)
1194 {
1195         struct btrfs_path *path;
1196         struct extent_buffer *leaf;
1197         struct btrfs_dev_item *dev_item;
1198         struct btrfs_device *device;
1199         struct btrfs_key key;
1200         u8 fs_uuid[BTRFS_UUID_SIZE];
1201         u8 dev_uuid[BTRFS_UUID_SIZE];
1202         u64 devid;
1203         int ret;
1204
1205         path = btrfs_alloc_path();
1206         if (!path)
1207                 return -ENOMEM;
1208
1209         root = root->fs_info->chunk_root;
1210         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1211         key.offset = 0;
1212         key.type = BTRFS_DEV_ITEM_KEY;
1213
1214         while (1) {
1215                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1216                 if (ret < 0)
1217                         goto error;
1218
1219                 leaf = path->nodes[0];
1220 next_slot:
1221                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1222                         ret = btrfs_next_leaf(root, path);
1223                         if (ret > 0)
1224                                 break;
1225                         if (ret < 0)
1226                                 goto error;
1227                         leaf = path->nodes[0];
1228                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1229                         btrfs_release_path(root, path);
1230                         continue;
1231                 }
1232
1233                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1234                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
1235                     key.type != BTRFS_DEV_ITEM_KEY)
1236                         break;
1237
1238                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
1239                                           struct btrfs_dev_item);
1240                 devid = btrfs_device_id(leaf, dev_item);
1241                 read_extent_buffer(leaf, dev_uuid,
1242                                    (unsigned long)btrfs_device_uuid(dev_item),
1243                                    BTRFS_UUID_SIZE);
1244                 read_extent_buffer(leaf, fs_uuid,
1245                                    (unsigned long)btrfs_device_fsid(dev_item),
1246                                    BTRFS_UUID_SIZE);
1247                 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
1248                 BUG_ON(!device);
1249
1250                 if (device->fs_devices->seeding) {
1251                         btrfs_set_device_generation(leaf, dev_item,
1252                                                     device->generation);
1253                         btrfs_mark_buffer_dirty(leaf);
1254                 }
1255
1256                 path->slots[0]++;
1257                 goto next_slot;
1258         }
1259         ret = 0;
1260 error:
1261         btrfs_free_path(path);
1262         return ret;
1263 }
1264
1265 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1266 {
1267         struct btrfs_trans_handle *trans;
1268         struct btrfs_device *device;
1269         struct block_device *bdev;
1270         struct list_head *cur;
1271         struct list_head *devices;
1272         struct super_block *sb = root->fs_info->sb;
1273         u64 total_bytes;
1274         int seeding_dev = 0;
1275         int ret = 0;
1276
1277         if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1278                 return -EINVAL;
1279
1280         bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
1281         if (!bdev) {
1282                 return -EIO;
1283         }
1284
1285         if (root->fs_info->fs_devices->seeding) {
1286                 seeding_dev = 1;
1287                 down_write(&sb->s_umount);
1288                 mutex_lock(&uuid_mutex);
1289         }
1290
1291         filemap_write_and_wait(bdev->bd_inode->i_mapping);
1292         mutex_lock(&root->fs_info->volume_mutex);
1293
1294         devices = &root->fs_info->fs_devices->devices;
1295         list_for_each(cur, devices) {
1296                 device = list_entry(cur, struct btrfs_device, dev_list);
1297                 if (device->bdev == bdev) {
1298                         ret = -EEXIST;
1299                         goto error;
1300                 }
1301         }
1302
1303         device = kzalloc(sizeof(*device), GFP_NOFS);
1304         if (!device) {
1305                 /* we can safely leave the fs_devices entry around */
1306                 ret = -ENOMEM;
1307                 goto error;
1308         }
1309
1310         device->name = kstrdup(device_path, GFP_NOFS);
1311         if (!device->name) {
1312                 kfree(device);
1313                 ret = -ENOMEM;
1314                 goto error;
1315         }
1316
1317         ret = find_next_devid(root, &device->devid);
1318         if (ret) {
1319                 kfree(device);
1320                 goto error;
1321         }
1322
1323         trans = btrfs_start_transaction(root, 1);
1324         lock_chunks(root);
1325
1326         device->barriers = 1;
1327         device->writeable = 1;
1328         device->work.func = pending_bios_fn;
1329         generate_random_uuid(device->uuid);
1330         spin_lock_init(&device->io_lock);
1331         device->generation = trans->transid;
1332         device->io_width = root->sectorsize;
1333         device->io_align = root->sectorsize;
1334         device->sector_size = root->sectorsize;
1335         device->total_bytes = i_size_read(bdev->bd_inode);
1336         device->dev_root = root->fs_info->dev_root;
1337         device->bdev = bdev;
1338         device->in_fs_metadata = 1;
1339         device->mode = 0;
1340         set_blocksize(device->bdev, 4096);
1341
1342         if (seeding_dev) {
1343                 sb->s_flags &= ~MS_RDONLY;
1344                 ret = btrfs_prepare_sprout(trans, root);
1345                 BUG_ON(ret);
1346         }
1347
1348         device->fs_devices = root->fs_info->fs_devices;
1349         list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
1350         list_add(&device->dev_alloc_list,
1351                  &root->fs_info->fs_devices->alloc_list);
1352         root->fs_info->fs_devices->num_devices++;
1353         root->fs_info->fs_devices->open_devices++;
1354         root->fs_info->fs_devices->rw_devices++;
1355         root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1356
1357         total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
1358         btrfs_set_super_total_bytes(&root->fs_info->super_copy,
1359                                     total_bytes + device->total_bytes);
1360
1361         total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
1362         btrfs_set_super_num_devices(&root->fs_info->super_copy,
1363                                     total_bytes + 1);
1364
1365         if (seeding_dev) {
1366                 ret = init_first_rw_device(trans, root, device);
1367                 BUG_ON(ret);
1368                 ret = btrfs_finish_sprout(trans, root);
1369                 BUG_ON(ret);
1370         } else {
1371                 ret = btrfs_add_device(trans, root, device);
1372         }
1373
1374         unlock_chunks(root);
1375         btrfs_commit_transaction(trans, root);
1376
1377         if (seeding_dev) {
1378                 mutex_unlock(&uuid_mutex);
1379                 up_write(&sb->s_umount);
1380
1381                 ret = btrfs_relocate_sys_chunks(root);
1382                 BUG_ON(ret);
1383         }
1384 out:
1385         mutex_unlock(&root->fs_info->volume_mutex);
1386         return ret;
1387 error:
1388         close_bdev_exclusive(bdev, 0);
1389         if (seeding_dev) {
1390                 mutex_unlock(&uuid_mutex);
1391                 up_write(&sb->s_umount);
1392         }
1393         goto out;
1394 }
1395
1396 static int noinline btrfs_update_device(struct btrfs_trans_handle *trans,
1397                                  struct btrfs_device *device)
1398 {
1399         int ret;
1400         struct btrfs_path *path;
1401         struct btrfs_root *root;
1402         struct btrfs_dev_item *dev_item;
1403         struct extent_buffer *leaf;
1404         struct btrfs_key key;
1405
1406         root = device->dev_root->fs_info->chunk_root;
1407
1408         path = btrfs_alloc_path();
1409         if (!path)
1410                 return -ENOMEM;
1411
1412         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1413         key.type = BTRFS_DEV_ITEM_KEY;
1414         key.offset = device->devid;
1415
1416         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1417         if (ret < 0)
1418                 goto out;
1419
1420         if (ret > 0) {
1421                 ret = -ENOENT;
1422                 goto out;
1423         }
1424
1425         leaf = path->nodes[0];
1426         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1427
1428         btrfs_set_device_id(leaf, dev_item, device->devid);
1429         btrfs_set_device_type(leaf, dev_item, device->type);
1430         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1431         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1432         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1433         btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
1434         btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1435         btrfs_mark_buffer_dirty(leaf);
1436
1437 out:
1438         btrfs_free_path(path);
1439         return ret;
1440 }
1441
1442 static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1443                       struct btrfs_device *device, u64 new_size)
1444 {
1445         struct btrfs_super_block *super_copy =
1446                 &device->dev_root->fs_info->super_copy;
1447         u64 old_total = btrfs_super_total_bytes(super_copy);
1448         u64 diff = new_size - device->total_bytes;
1449
1450         if (!device->writeable)
1451                 return -EACCES;
1452         if (new_size <= device->total_bytes)
1453                 return -EINVAL;
1454
1455         btrfs_set_super_total_bytes(super_copy, old_total + diff);
1456         device->fs_devices->total_rw_bytes += diff;
1457
1458         device->total_bytes = new_size;
1459         return btrfs_update_device(trans, device);
1460 }
1461
1462 int btrfs_grow_device(struct btrfs_trans_handle *trans,
1463                       struct btrfs_device *device, u64 new_size)
1464 {
1465         int ret;
1466         lock_chunks(device->dev_root);
1467         ret = __btrfs_grow_device(trans, device, new_size);
1468         unlock_chunks(device->dev_root);
1469         return ret;
1470 }
1471
1472 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1473                             struct btrfs_root *root,
1474                             u64 chunk_tree, u64 chunk_objectid,
1475                             u64 chunk_offset)
1476 {
1477         int ret;
1478         struct btrfs_path *path;
1479         struct btrfs_key key;
1480
1481         root = root->fs_info->chunk_root;
1482         path = btrfs_alloc_path();
1483         if (!path)
1484                 return -ENOMEM;
1485
1486         key.objectid = chunk_objectid;
1487         key.offset = chunk_offset;
1488         key.type = BTRFS_CHUNK_ITEM_KEY;
1489
1490         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1491         BUG_ON(ret);
1492
1493         ret = btrfs_del_item(trans, root, path);
1494         BUG_ON(ret);
1495
1496         btrfs_free_path(path);
1497         return 0;
1498 }
1499
1500 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
1501                         chunk_offset)
1502 {
1503         struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1504         struct btrfs_disk_key *disk_key;
1505         struct btrfs_chunk *chunk;
1506         u8 *ptr;
1507         int ret = 0;
1508         u32 num_stripes;
1509         u32 array_size;
1510         u32 len = 0;
1511         u32 cur;
1512         struct btrfs_key key;
1513
1514         array_size = btrfs_super_sys_array_size(super_copy);
1515
1516         ptr = super_copy->sys_chunk_array;
1517         cur = 0;
1518
1519         while (cur < array_size) {
1520                 disk_key = (struct btrfs_disk_key *)ptr;
1521                 btrfs_disk_key_to_cpu(&key, disk_key);
1522
1523                 len = sizeof(*disk_key);
1524
1525                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1526                         chunk = (struct btrfs_chunk *)(ptr + len);
1527                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
1528                         len += btrfs_chunk_item_size(num_stripes);
1529                 } else {
1530                         ret = -EIO;
1531                         break;
1532                 }
1533                 if (key.objectid == chunk_objectid &&
1534                     key.offset == chunk_offset) {
1535                         memmove(ptr, ptr + len, array_size - (cur + len));
1536                         array_size -= len;
1537                         btrfs_set_super_sys_array_size(super_copy, array_size);
1538                 } else {
1539                         ptr += len;
1540                         cur += len;
1541                 }
1542         }
1543         return ret;
1544 }
1545
1546 static int btrfs_relocate_chunk(struct btrfs_root *root,
1547                          u64 chunk_tree, u64 chunk_objectid,
1548                          u64 chunk_offset)
1549 {
1550         struct extent_map_tree *em_tree;
1551         struct btrfs_root *extent_root;
1552         struct btrfs_trans_handle *trans;
1553         struct extent_map *em;
1554         struct map_lookup *map;
1555         int ret;
1556         int i;
1557
1558         printk("btrfs relocating chunk %llu\n",
1559                (unsigned long long)chunk_offset);
1560         root = root->fs_info->chunk_root;
1561         extent_root = root->fs_info->extent_root;
1562         em_tree = &root->fs_info->mapping_tree.map_tree;
1563
1564         /* step one, relocate all the extents inside this chunk */
1565         ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1566         BUG_ON(ret);
1567
1568         trans = btrfs_start_transaction(root, 1);
1569         BUG_ON(!trans);
1570
1571         lock_chunks(root);
1572
1573         /*
1574          * step two, delete the device extents and the
1575          * chunk tree entries
1576          */
1577         spin_lock(&em_tree->lock);
1578         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1579         spin_unlock(&em_tree->lock);
1580
1581         BUG_ON(em->start > chunk_offset ||
1582                em->start + em->len < chunk_offset);
1583         map = (struct map_lookup *)em->bdev;
1584
1585         for (i = 0; i < map->num_stripes; i++) {
1586                 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
1587                                             map->stripes[i].physical);
1588                 BUG_ON(ret);
1589
1590                 if (map->stripes[i].dev) {
1591                         ret = btrfs_update_device(trans, map->stripes[i].dev);
1592                         BUG_ON(ret);
1593                 }
1594         }
1595         ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
1596                                chunk_offset);
1597
1598         BUG_ON(ret);
1599
1600         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
1601                 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
1602                 BUG_ON(ret);
1603         }
1604
1605         ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
1606         BUG_ON(ret);
1607
1608         spin_lock(&em_tree->lock);
1609         remove_extent_mapping(em_tree, em);
1610         spin_unlock(&em_tree->lock);
1611
1612         kfree(map);
1613         em->bdev = NULL;
1614
1615         /* once for the tree */
1616         free_extent_map(em);
1617         /* once for us */
1618         free_extent_map(em);
1619
1620         unlock_chunks(root);
1621         btrfs_end_transaction(trans, root);
1622         return 0;
1623 }
1624
1625 static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
1626 {
1627         struct btrfs_root *chunk_root = root->fs_info->chunk_root;
1628         struct btrfs_path *path;
1629         struct extent_buffer *leaf;
1630         struct btrfs_chunk *chunk;
1631         struct btrfs_key key;
1632         struct btrfs_key found_key;
1633         u64 chunk_tree = chunk_root->root_key.objectid;
1634         u64 chunk_type;
1635         int ret;
1636
1637         path = btrfs_alloc_path();
1638         if (!path)
1639                 return -ENOMEM;
1640
1641         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1642         key.offset = (u64)-1;
1643         key.type = BTRFS_CHUNK_ITEM_KEY;
1644
1645         while (1) {
1646                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
1647                 if (ret < 0)
1648                         goto error;
1649                 BUG_ON(ret == 0);
1650
1651                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
1652                                           key.type);
1653                 if (ret < 0)
1654                         goto error;
1655                 if (ret > 0)
1656                         break;
1657
1658                 leaf = path->nodes[0];
1659                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1660
1661                 chunk = btrfs_item_ptr(leaf, path->slots[0],
1662                                        struct btrfs_chunk);
1663                 chunk_type = btrfs_chunk_type(leaf, chunk);
1664                 btrfs_release_path(chunk_root, path);
1665
1666                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
1667                         ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
1668                                                    found_key.objectid,
1669                                                    found_key.offset);
1670                         BUG_ON(ret);
1671                 }
1672
1673                 if (found_key.offset == 0)
1674                         break;
1675                 key.offset = found_key.offset - 1;
1676         }
1677         ret = 0;
1678 error:
1679         btrfs_free_path(path);
1680         return ret;
1681 }
1682
1683 static u64 div_factor(u64 num, int factor)
1684 {
1685         if (factor == 10)
1686                 return num;
1687         num *= factor;
1688         do_div(num, 10);
1689         return num;
1690 }
1691
1692 int btrfs_balance(struct btrfs_root *dev_root)
1693 {
1694         int ret;
1695         struct list_head *cur;
1696         struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
1697         struct btrfs_device *device;
1698         u64 old_size;
1699         u64 size_to_free;
1700         struct btrfs_path *path;
1701         struct btrfs_key key;
1702         struct btrfs_chunk *chunk;
1703         struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
1704         struct btrfs_trans_handle *trans;
1705         struct btrfs_key found_key;
1706
1707         if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
1708                 return -EROFS;
1709
1710         mutex_lock(&dev_root->fs_info->volume_mutex);
1711         dev_root = dev_root->fs_info->dev_root;
1712
1713         /* step one make some room on all the devices */
1714         list_for_each(cur, devices) {
1715                 device = list_entry(cur, struct btrfs_device, dev_list);
1716                 old_size = device->total_bytes;
1717                 size_to_free = div_factor(old_size, 1);
1718                 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
1719                 if (!device->writeable ||
1720                     device->total_bytes - device->bytes_used > size_to_free)
1721                         continue;
1722
1723                 ret = btrfs_shrink_device(device, old_size - size_to_free);
1724                 BUG_ON(ret);
1725
1726                 trans = btrfs_start_transaction(dev_root, 1);
1727                 BUG_ON(!trans);
1728
1729                 ret = btrfs_grow_device(trans, device, old_size);
1730                 BUG_ON(ret);
1731
1732                 btrfs_end_transaction(trans, dev_root);
1733         }
1734
1735         /* step two, relocate all the chunks */
1736         path = btrfs_alloc_path();
1737         BUG_ON(!path);
1738
1739         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1740         key.offset = (u64)-1;
1741         key.type = BTRFS_CHUNK_ITEM_KEY;
1742
1743         while(1) {
1744                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
1745                 if (ret < 0)
1746                         goto error;
1747
1748                 /*
1749                  * this shouldn't happen, it means the last relocate
1750                  * failed
1751                  */
1752                 if (ret == 0)
1753                         break;
1754
1755                 ret = btrfs_previous_item(chunk_root, path, 0,
1756                                           BTRFS_CHUNK_ITEM_KEY);
1757                 if (ret)
1758                         break;
1759
1760                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1761                                       path->slots[0]);
1762                 if (found_key.objectid != key.objectid)
1763                         break;
1764
1765                 chunk = btrfs_item_ptr(path->nodes[0],
1766                                        path->slots[0],
1767                                        struct btrfs_chunk);
1768                 key.offset = found_key.offset;
1769                 /* chunk zero is special */
1770                 if (key.offset == 0)
1771                         break;
1772
1773                 btrfs_release_path(chunk_root, path);
1774                 ret = btrfs_relocate_chunk(chunk_root,
1775                                            chunk_root->root_key.objectid,
1776                                            found_key.objectid,
1777                                            found_key.offset);
1778                 BUG_ON(ret);
1779         }
1780         ret = 0;
1781 error:
1782         btrfs_free_path(path);
1783         mutex_unlock(&dev_root->fs_info->volume_mutex);
1784         return ret;
1785 }
1786
1787 /*
1788  * shrinking a device means finding all of the device extents past
1789  * the new size, and then following the back refs to the chunks.
1790  * The chunk relocation code actually frees the device extent
1791  */
1792 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
1793 {
1794         struct btrfs_trans_handle *trans;
1795         struct btrfs_root *root = device->dev_root;
1796         struct btrfs_dev_extent *dev_extent = NULL;
1797         struct btrfs_path *path;
1798         u64 length;
1799         u64 chunk_tree;
1800         u64 chunk_objectid;
1801         u64 chunk_offset;
1802         int ret;
1803         int slot;
1804         struct extent_buffer *l;
1805         struct btrfs_key key;
1806         struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1807         u64 old_total = btrfs_super_total_bytes(super_copy);
1808         u64 diff = device->total_bytes - new_size;
1809
1810         if (new_size >= device->total_bytes)
1811                 return -EINVAL;
1812
1813         path = btrfs_alloc_path();
1814         if (!path)
1815                 return -ENOMEM;
1816
1817         trans = btrfs_start_transaction(root, 1);
1818         if (!trans) {
1819                 ret = -ENOMEM;
1820                 goto done;
1821         }
1822
1823         path->reada = 2;
1824
1825         lock_chunks(root);
1826
1827         device->total_bytes = new_size;
1828         if (device->writeable)
1829                 device->fs_devices->total_rw_bytes -= diff;
1830         ret = btrfs_update_device(trans, device);
1831         if (ret) {
1832                 unlock_chunks(root);
1833                 btrfs_end_transaction(trans, root);
1834                 goto done;
1835         }
1836         WARN_ON(diff > old_total);
1837         btrfs_set_super_total_bytes(super_copy, old_total - diff);
1838         unlock_chunks(root);
1839         btrfs_end_transaction(trans, root);
1840
1841         key.objectid = device->devid;
1842         key.offset = (u64)-1;
1843         key.type = BTRFS_DEV_EXTENT_KEY;
1844
1845         while (1) {
1846                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1847                 if (ret < 0)
1848                         goto done;
1849
1850                 ret = btrfs_previous_item(root, path, 0, key.type);
1851                 if (ret < 0)
1852                         goto done;
1853                 if (ret) {
1854                         ret = 0;
1855                         goto done;
1856                 }
1857
1858                 l = path->nodes[0];
1859                 slot = path->slots[0];
1860                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
1861
1862                 if (key.objectid != device->devid)
1863                         goto done;
1864
1865                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1866                 length = btrfs_dev_extent_length(l, dev_extent);
1867
1868                 if (key.offset + length <= new_size)
1869                         goto done;
1870
1871                 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
1872                 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
1873                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
1874                 btrfs_release_path(root, path);
1875
1876                 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
1877                                            chunk_offset);
1878                 if (ret)
1879                         goto done;
1880         }
1881
1882 done:
1883         btrfs_free_path(path);
1884         return ret;
1885 }
1886
1887 static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
1888                            struct btrfs_root *root,
1889                            struct btrfs_key *key,
1890                            struct btrfs_chunk *chunk, int item_size)
1891 {
1892         struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1893         struct btrfs_disk_key disk_key;
1894         u32 array_size;
1895         u8 *ptr;
1896
1897         array_size = btrfs_super_sys_array_size(super_copy);
1898         if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
1899                 return -EFBIG;
1900
1901         ptr = super_copy->sys_chunk_array + array_size;
1902         btrfs_cpu_key_to_disk(&disk_key, key);
1903         memcpy(ptr, &disk_key, sizeof(disk_key));
1904         ptr += sizeof(disk_key);
1905         memcpy(ptr, chunk, item_size);
1906         item_size += sizeof(disk_key);
1907         btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
1908         return 0;
1909 }
1910
1911 static u64 noinline chunk_bytes_by_type(u64 type, u64 calc_size,
1912                                         int num_stripes, int sub_stripes)
1913 {
1914         if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
1915                 return calc_size;
1916         else if (type & BTRFS_BLOCK_GROUP_RAID10)
1917                 return calc_size * (num_stripes / sub_stripes);
1918         else
1919                 return calc_size * num_stripes;
1920 }
1921
1922 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
1923                                struct btrfs_root *extent_root,
1924                                struct map_lookup **map_ret,
1925                                u64 *num_bytes, u64 *stripe_size,
1926                                u64 start, u64 type)
1927 {
1928         struct btrfs_fs_info *info = extent_root->fs_info;
1929         struct btrfs_device *device = NULL;
1930         struct btrfs_fs_devices *fs_devices = info->fs_devices;
1931         struct list_head *cur;
1932         struct map_lookup *map = NULL;
1933         struct extent_map_tree *em_tree;
1934         struct extent_map *em;
1935         struct list_head private_devs;
1936         int min_stripe_size = 1 * 1024 * 1024;
1937         u64 calc_size = 1024 * 1024 * 1024;
1938         u64 max_chunk_size = calc_size;
1939         u64 min_free;
1940         u64 avail;
1941         u64 max_avail = 0;
1942         u64 dev_offset;
1943         int num_stripes = 1;
1944         int min_stripes = 1;
1945         int sub_stripes = 0;
1946         int looped = 0;
1947         int ret;
1948         int index;
1949         int stripe_len = 64 * 1024;
1950
1951         if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
1952             (type & BTRFS_BLOCK_GROUP_DUP)) {
1953                 WARN_ON(1);
1954                 type &= ~BTRFS_BLOCK_GROUP_DUP;
1955         }
1956         if (list_empty(&fs_devices->alloc_list))
1957                 return -ENOSPC;
1958
1959         if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
1960                 num_stripes = fs_devices->rw_devices;
1961                 min_stripes = 2;
1962         }
1963         if (type & (BTRFS_BLOCK_GROUP_DUP)) {
1964                 num_stripes = 2;
1965                 min_stripes = 2;
1966         }
1967         if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
1968                 num_stripes = min_t(u64, 2, fs_devices->rw_devices);
1969                 if (num_stripes < 2)
1970                         return -ENOSPC;
1971                 min_stripes = 2;
1972         }
1973         if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
1974                 num_stripes = fs_devices->rw_devices;
1975                 if (num_stripes < 4)
1976                         return -ENOSPC;
1977                 num_stripes &= ~(u32)1;
1978                 sub_stripes = 2;
1979                 min_stripes = 4;
1980         }
1981
1982         if (type & BTRFS_BLOCK_GROUP_DATA) {
1983                 max_chunk_size = 10 * calc_size;
1984                 min_stripe_size = 64 * 1024 * 1024;
1985         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
1986                 max_chunk_size = 4 * calc_size;
1987                 min_stripe_size = 32 * 1024 * 1024;
1988         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
1989                 calc_size = 8 * 1024 * 1024;
1990                 max_chunk_size = calc_size * 2;
1991                 min_stripe_size = 1 * 1024 * 1024;
1992         }
1993
1994         /* we don't want a chunk larger than 10% of writeable space */
1995         max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
1996                              max_chunk_size);
1997
1998 again:
1999         if (!map || map->num_stripes != num_stripes) {
2000                 kfree(map);
2001                 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2002                 if (!map)
2003                         return -ENOMEM;
2004                 map->num_stripes = num_stripes;
2005         }
2006
2007         if (calc_size * num_stripes > max_chunk_size) {
2008                 calc_size = max_chunk_size;
2009                 do_div(calc_size, num_stripes);
2010                 do_div(calc_size, stripe_len);
2011                 calc_size *= stripe_len;
2012         }
2013         /* we don't want tiny stripes */
2014         calc_size = max_t(u64, min_stripe_size, calc_size);
2015
2016         do_div(calc_size, stripe_len);
2017         calc_size *= stripe_len;
2018
2019         cur = fs_devices->alloc_list.next;
2020         index = 0;
2021
2022         if (type & BTRFS_BLOCK_GROUP_DUP)
2023                 min_free = calc_size * 2;
2024         else
2025                 min_free = calc_size;
2026
2027         /*
2028          * we add 1MB because we never use the first 1MB of the device, unless
2029          * we've looped, then we are likely allocating the maximum amount of
2030          * space left already
2031          */
2032         if (!looped)
2033                 min_free += 1024 * 1024;
2034
2035         INIT_LIST_HEAD(&private_devs);
2036         while(index < num_stripes) {
2037                 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
2038                 BUG_ON(!device->writeable);
2039                 if (device->total_bytes > device->bytes_used)
2040                         avail = device->total_bytes - device->bytes_used;
2041                 else
2042                         avail = 0;
2043                 cur = cur->next;
2044
2045                 if (device->in_fs_metadata && avail >= min_free) {
2046                         ret = find_free_dev_extent(trans, device,
2047                                                    min_free, &dev_offset);
2048                         if (ret == 0) {
2049                                 list_move_tail(&device->dev_alloc_list,
2050                                                &private_devs);
2051                                 map->stripes[index].dev = device;
2052                                 map->stripes[index].physical = dev_offset;
2053                                 index++;
2054                                 if (type & BTRFS_BLOCK_GROUP_DUP) {
2055                                         map->stripes[index].dev = device;
2056                                         map->stripes[index].physical =
2057                                                 dev_offset + calc_size;
2058                                         index++;
2059                                 }
2060                         }
2061                 } else if (device->in_fs_metadata && avail > max_avail)
2062                         max_avail = avail;
2063                 if (cur == &fs_devices->alloc_list)
2064                         break;
2065         }
2066         list_splice(&private_devs, &fs_devices->alloc_list);
2067         if (index < num_stripes) {
2068                 if (index >= min_stripes) {
2069                         num_stripes = index;
2070                         if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2071                                 num_stripes /= sub_stripes;
2072                                 num_stripes *= sub_stripes;
2073                         }
2074                         looped = 1;
2075                         goto again;
2076                 }
2077                 if (!looped && max_avail > 0) {
2078                         looped = 1;
2079                         calc_size = max_avail;
2080                         goto again;
2081                 }
2082                 kfree(map);
2083                 return -ENOSPC;
2084         }
2085         map->sector_size = extent_root->sectorsize;
2086         map->stripe_len = stripe_len;
2087         map->io_align = stripe_len;
2088         map->io_width = stripe_len;
2089         map->type = type;
2090         map->num_stripes = num_stripes;
2091         map->sub_stripes = sub_stripes;
2092
2093         *map_ret = map;
2094         *stripe_size = calc_size;
2095         *num_bytes = chunk_bytes_by_type(type, calc_size,
2096                                          num_stripes, sub_stripes);
2097
2098         em = alloc_extent_map(GFP_NOFS);
2099         if (!em) {
2100                 kfree(map);
2101                 return -ENOMEM;
2102         }
2103         em->bdev = (struct block_device *)map;
2104         em->start = start;
2105         em->len = *num_bytes;
2106         em->block_start = 0;
2107         em->block_len = em->len;
2108
2109         em_tree = &extent_root->fs_info->mapping_tree.map_tree;
2110         spin_lock(&em_tree->lock);
2111         ret = add_extent_mapping(em_tree, em);
2112         spin_unlock(&em_tree->lock);
2113         BUG_ON(ret);
2114         free_extent_map(em);
2115
2116         ret = btrfs_make_block_group(trans, extent_root, 0, type,
2117                                      BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2118                                      start, *num_bytes);
2119         BUG_ON(ret);
2120
2121         index = 0;
2122         while (index < map->num_stripes) {
2123                 device = map->stripes[index].dev;
2124                 dev_offset = map->stripes[index].physical;
2125
2126                 ret = btrfs_alloc_dev_extent(trans, device,
2127                                 info->chunk_root->root_key.objectid,
2128                                 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2129                                 start, dev_offset, calc_size);
2130                 BUG_ON(ret);
2131                 index++;
2132         }
2133
2134         return 0;
2135 }
2136
2137 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2138                                 struct btrfs_root *extent_root,
2139                                 struct map_lookup *map, u64 chunk_offset,
2140                                 u64 chunk_size, u64 stripe_size)
2141 {
2142         u64 dev_offset;
2143         struct btrfs_key key;
2144         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2145         struct btrfs_device *device;
2146         struct btrfs_chunk *chunk;
2147         struct btrfs_stripe *stripe;
2148         size_t item_size = btrfs_chunk_item_size(map->num_stripes);
2149         int index = 0;
2150         int ret;
2151
2152         chunk = kzalloc(item_size, GFP_NOFS);
2153         if (!chunk)
2154                 return -ENOMEM;
2155
2156         index = 0;
2157         while (index < map->num_stripes) {
2158                 device = map->stripes[index].dev;
2159                 device->bytes_used += stripe_size;
2160                 ret = btrfs_update_device(trans, device);
2161                 BUG_ON(ret);
2162                 index++;
2163         }
2164
2165         index = 0;
2166         stripe = &chunk->stripe;
2167         while (index < map->num_stripes) {
2168                 device = map->stripes[index].dev;
2169                 dev_offset = map->stripes[index].physical;
2170
2171                 btrfs_set_stack_stripe_devid(stripe, device->devid);
2172                 btrfs_set_stack_stripe_offset(stripe, dev_offset);
2173                 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
2174                 stripe++;
2175                 index++;
2176         }
2177
2178         btrfs_set_stack_chunk_length(chunk, chunk_size);
2179         btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
2180         btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
2181         btrfs_set_stack_chunk_type(chunk, map->type);
2182         btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
2183         btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
2184         btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
2185         btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
2186         btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
2187
2188         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2189         key.type = BTRFS_CHUNK_ITEM_KEY;
2190         key.offset = chunk_offset;
2191
2192         ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
2193         BUG_ON(ret);
2194
2195         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2196                 ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
2197                                              item_size);
2198                 BUG_ON(ret);
2199         }
2200         kfree(chunk);
2201         return 0;
2202 }
2203
2204 /*
2205  * Chunk allocation falls into two parts. The first part does works
2206  * that make the new allocated chunk useable, but not do any operation
2207  * that modifies the chunk tree. The second part does the works that
2208  * require modifying the chunk tree. This division is important for the
2209  * bootstrap process of adding storage to a seed btrfs.
2210  */
2211 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2212                       struct btrfs_root *extent_root, u64 type)
2213 {
2214         u64 chunk_offset;
2215         u64 chunk_size;
2216         u64 stripe_size;
2217         struct map_lookup *map;
2218         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2219         int ret;
2220
2221         ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2222                               &chunk_offset);
2223         if (ret)
2224                 return ret;
2225
2226         ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2227                                   &stripe_size, chunk_offset, type);
2228         if (ret)
2229                 return ret;
2230
2231         ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2232                                    chunk_size, stripe_size);
2233         BUG_ON(ret);
2234         return 0;
2235 }
2236
2237 static int noinline init_first_rw_device(struct btrfs_trans_handle *trans,
2238                                          struct btrfs_root *root,
2239                                          struct btrfs_device *device)
2240 {
2241         u64 chunk_offset;
2242         u64 sys_chunk_offset;
2243         u64 chunk_size;
2244         u64 sys_chunk_size;
2245         u64 stripe_size;
2246         u64 sys_stripe_size;
2247         u64 alloc_profile;
2248         struct map_lookup *map;
2249         struct map_lookup *sys_map;
2250         struct btrfs_fs_info *fs_info = root->fs_info;
2251         struct btrfs_root *extent_root = fs_info->extent_root;
2252         int ret;
2253
2254         ret = find_next_chunk(fs_info->chunk_root,
2255                               BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
2256         BUG_ON(ret);
2257
2258         alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
2259                         (fs_info->metadata_alloc_profile &
2260                          fs_info->avail_metadata_alloc_bits);
2261         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2262
2263         ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2264                                   &stripe_size, chunk_offset, alloc_profile);
2265         BUG_ON(ret);
2266
2267         sys_chunk_offset = chunk_offset + chunk_size;
2268
2269         alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
2270                         (fs_info->system_alloc_profile &
2271                          fs_info->avail_system_alloc_bits);
2272         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2273
2274         ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
2275                                   &sys_chunk_size, &sys_stripe_size,
2276                                   sys_chunk_offset, alloc_profile);
2277         BUG_ON(ret);
2278
2279         ret = btrfs_add_device(trans, fs_info->chunk_root, device);
2280         BUG_ON(ret);
2281
2282         /*
2283          * Modifying chunk tree needs allocating new blocks from both
2284          * system block group and metadata block group. So we only can
2285          * do operations require modifying the chunk tree after both
2286          * block groups were created.
2287          */
2288         ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2289                                    chunk_size, stripe_size);
2290         BUG_ON(ret);
2291
2292         ret = __finish_chunk_alloc(trans, extent_root, sys_map,
2293                                    sys_chunk_offset, sys_chunk_size,
2294                                    sys_stripe_size);
2295         BUG_ON(ret);
2296         return 0;
2297 }
2298
2299 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
2300 {
2301         struct extent_map *em;
2302         struct map_lookup *map;
2303         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
2304         int readonly = 0;
2305         int i;
2306
2307         spin_lock(&map_tree->map_tree.lock);
2308         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2309         spin_unlock(&map_tree->map_tree.lock);
2310         if (!em)
2311                 return 1;
2312
2313         map = (struct map_lookup *)em->bdev;
2314         for (i = 0; i < map->num_stripes; i++) {
2315                 if (!map->stripes[i].dev->writeable) {
2316                         readonly = 1;
2317                         break;
2318                 }
2319         }
2320         free_extent_map(em);
2321         return readonly;
2322 }
2323
2324 void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
2325 {
2326         extent_map_tree_init(&tree->map_tree, GFP_NOFS);
2327 }
2328
2329 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
2330 {
2331         struct extent_map *em;
2332
2333         while(1) {
2334                 spin_lock(&tree->map_tree.lock);
2335                 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
2336                 if (em)
2337                         remove_extent_mapping(&tree->map_tree, em);
2338                 spin_unlock(&tree->map_tree.lock);
2339                 if (!em)
2340                         break;
2341                 kfree(em->bdev);
2342                 /* once for us */
2343                 free_extent_map(em);
2344                 /* once for the tree */
2345                 free_extent_map(em);
2346         }
2347 }
2348
2349 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
2350 {
2351         struct extent_map *em;
2352         struct map_lookup *map;
2353         struct extent_map_tree *em_tree = &map_tree->map_tree;
2354         int ret;
2355
2356         spin_lock(&em_tree->lock);
2357         em = lookup_extent_mapping(em_tree, logical, len);
2358         spin_unlock(&em_tree->lock);
2359         BUG_ON(!em);
2360
2361         BUG_ON(em->start > logical || em->start + em->len < logical);
2362         map = (struct map_lookup *)em->bdev;
2363         if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
2364                 ret = map->num_stripes;
2365         else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
2366                 ret = map->sub_stripes;
2367         else
2368                 ret = 1;
2369         free_extent_map(em);
2370         return ret;
2371 }
2372
2373 static int find_live_mirror(struct map_lookup *map, int first, int num,
2374                             int optimal)
2375 {
2376         int i;
2377         if (map->stripes[optimal].dev->bdev)
2378                 return optimal;
2379         for (i = first; i < first + num; i++) {
2380                 if (map->stripes[i].dev->bdev)
2381                         return i;
2382         }
2383         /* we couldn't find one that doesn't fail.  Just return something
2384          * and the io error handling code will clean up eventually
2385          */
2386         return optimal;
2387 }
2388
2389 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2390                              u64 logical, u64 *length,
2391                              struct btrfs_multi_bio **multi_ret,
2392                              int mirror_num, struct page *unplug_page)
2393 {
2394         struct extent_map *em;
2395         struct map_lookup *map;
2396         struct extent_map_tree *em_tree = &map_tree->map_tree;
2397         u64 offset;
2398         u64 stripe_offset;
2399         u64 stripe_nr;
2400         int stripes_allocated = 8;
2401         int stripes_required = 1;
2402         int stripe_index;
2403         int i;
2404         int num_stripes;
2405         int max_errors = 0;
2406         struct btrfs_multi_bio *multi = NULL;
2407
2408         if (multi_ret && !(rw & (1 << BIO_RW))) {
2409                 stripes_allocated = 1;
2410         }
2411 again:
2412         if (multi_ret) {
2413                 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
2414                                 GFP_NOFS);
2415                 if (!multi)
2416                         return -ENOMEM;
2417
2418                 atomic_set(&multi->error, 0);
2419         }
2420
2421         spin_lock(&em_tree->lock);
2422         em = lookup_extent_mapping(em_tree, logical, *length);
2423         spin_unlock(&em_tree->lock);
2424
2425         if (!em && unplug_page)
2426                 return 0;
2427
2428         if (!em) {
2429                 printk("unable to find logical %Lu len %Lu\n", logical, *length);
2430                 BUG();
2431         }
2432
2433         BUG_ON(em->start > logical || em->start + em->len < logical);
2434         map = (struct map_lookup *)em->bdev;
2435         offset = logical - em->start;
2436
2437         if (mirror_num > map->num_stripes)
2438                 mirror_num = 0;
2439
2440         /* if our multi bio struct is too small, back off and try again */
2441         if (rw & (1 << BIO_RW)) {
2442                 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2443                                  BTRFS_BLOCK_GROUP_DUP)) {
2444                         stripes_required = map->num_stripes;
2445                         max_errors = 1;
2446                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2447                         stripes_required = map->sub_stripes;
2448                         max_errors = 1;
2449                 }
2450         }
2451         if (multi_ret && rw == WRITE &&
2452             stripes_allocated < stripes_required) {
2453                 stripes_allocated = map->num_stripes;
2454                 free_extent_map(em);
2455                 kfree(multi);
2456                 goto again;
2457         }
2458         stripe_nr = offset;
2459         /*
2460          * stripe_nr counts the total number of stripes we have to stride
2461          * to get to this block
2462          */
2463         do_div(stripe_nr, map->stripe_len);
2464
2465         stripe_offset = stripe_nr * map->stripe_len;
2466         BUG_ON(offset < stripe_offset);
2467
2468         /* stripe_offset is the offset of this block in its stripe*/
2469         stripe_offset = offset - stripe_offset;
2470
2471         if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
2472                          BTRFS_BLOCK_GROUP_RAID10 |
2473                          BTRFS_BLOCK_GROUP_DUP)) {
2474                 /* we limit the length of each bio to what fits in a stripe */
2475                 *length = min_t(u64, em->len - offset,
2476                               map->stripe_len - stripe_offset);
2477         } else {
2478                 *length = em->len - offset;
2479         }
2480
2481         if (!multi_ret && !unplug_page)
2482                 goto out;
2483
2484         num_stripes = 1;
2485         stripe_index = 0;
2486         if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2487                 if (unplug_page || (rw & (1 << BIO_RW)))
2488                         num_stripes = map->num_stripes;
2489                 else if (mirror_num)
2490                         stripe_index = mirror_num - 1;
2491                 else {
2492                         stripe_index = find_live_mirror(map, 0,
2493                                             map->num_stripes,
2494                                             current->pid % map->num_stripes);
2495                 }
2496
2497         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2498                 if (rw & (1 << BIO_RW))
2499                         num_stripes = map->num_stripes;
2500                 else if (mirror_num)
2501                         stripe_index = mirror_num - 1;
2502
2503         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2504                 int factor = map->num_stripes / map->sub_stripes;
2505
2506                 stripe_index = do_div(stripe_nr, factor);
2507                 stripe_index *= map->sub_stripes;
2508
2509                 if (unplug_page || (rw & (1 << BIO_RW)))
2510                         num_stripes = map->sub_stripes;
2511                 else if (mirror_num)
2512                         stripe_index += mirror_num - 1;
2513                 else {
2514                         stripe_index = find_live_mirror(map, stripe_index,
2515                                               map->sub_stripes, stripe_index +
2516                                               current->pid % map->sub_stripes);
2517                 }
2518         } else {
2519                 /*
2520                  * after this do_div call, stripe_nr is the number of stripes
2521                  * on this device we have to walk to find the data, and
2522                  * stripe_index is the number of our device in the stripe array
2523                  */
2524                 stripe_index = do_div(stripe_nr, map->num_stripes);
2525         }
2526         BUG_ON(stripe_index >= map->num_stripes);
2527
2528         for (i = 0; i < num_stripes; i++) {
2529                 if (unplug_page) {
2530                         struct btrfs_device *device;
2531                         struct backing_dev_info *bdi;
2532
2533                         device = map->stripes[stripe_index].dev;
2534                         if (device->bdev) {
2535                                 bdi = blk_get_backing_dev_info(device->bdev);
2536                                 if (bdi->unplug_io_fn) {
2537                                         bdi->unplug_io_fn(bdi, unplug_page);
2538                                 }
2539                         }
2540                 } else {
2541                         multi->stripes[i].physical =
2542                                 map->stripes[stripe_index].physical +
2543                                 stripe_offset + stripe_nr * map->stripe_len;
2544                         multi->stripes[i].dev = map->stripes[stripe_index].dev;
2545                 }
2546                 stripe_index++;
2547         }
2548         if (multi_ret) {
2549                 *multi_ret = multi;
2550                 multi->num_stripes = num_stripes;
2551                 multi->max_errors = max_errors;
2552         }
2553 out:
2554         free_extent_map(em);
2555         return 0;
2556 }
2557
2558 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2559                       u64 logical, u64 *length,
2560                       struct btrfs_multi_bio **multi_ret, int mirror_num)
2561 {
2562         return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
2563                                  mirror_num, NULL);
2564 }
2565
2566 int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
2567                       u64 logical, struct page *page)
2568 {
2569         u64 length = PAGE_CACHE_SIZE;
2570         return __btrfs_map_block(map_tree, READ, logical, &length,
2571                                  NULL, 0, page);
2572 }
2573
2574
2575 static void end_bio_multi_stripe(struct bio *bio, int err)
2576 {
2577         struct btrfs_multi_bio *multi = bio->bi_private;
2578         int is_orig_bio = 0;
2579
2580         if (err)
2581                 atomic_inc(&multi->error);
2582
2583         if (bio == multi->orig_bio)
2584                 is_orig_bio = 1;
2585
2586         if (atomic_dec_and_test(&multi->stripes_pending)) {
2587                 if (!is_orig_bio) {
2588                         bio_put(bio);
2589                         bio = multi->orig_bio;
2590                 }
2591                 bio->bi_private = multi->private;
2592                 bio->bi_end_io = multi->end_io;
2593                 /* only send an error to the higher layers if it is
2594                  * beyond the tolerance of the multi-bio
2595                  */
2596                 if (atomic_read(&multi->error) > multi->max_errors) {
2597                         err = -EIO;
2598                 } else if (err) {
2599                         /*
2600                          * this bio is actually up to date, we didn't
2601                          * go over the max number of errors
2602                          */
2603                         set_bit(BIO_UPTODATE, &bio->bi_flags);
2604                         err = 0;
2605                 }
2606                 kfree(multi);
2607
2608                 bio_endio(bio, err);
2609         } else if (!is_orig_bio) {
2610                 bio_put(bio);
2611         }
2612 }
2613
2614 struct async_sched {
2615         struct bio *bio;
2616         int rw;
2617         struct btrfs_fs_info *info;
2618         struct btrfs_work work;
2619 };
2620
2621 /*
2622  * see run_scheduled_bios for a description of why bios are collected for
2623  * async submit.
2624  *
2625  * This will add one bio to the pending list for a device and make sure
2626  * the work struct is scheduled.
2627  */
2628 static int noinline schedule_bio(struct btrfs_root *root,
2629                                  struct btrfs_device *device,
2630                                  int rw, struct bio *bio)
2631 {
2632         int should_queue = 1;
2633
2634         /* don't bother with additional async steps for reads, right now */
2635         if (!(rw & (1 << BIO_RW))) {
2636                 bio_get(bio);
2637                 submit_bio(rw, bio);
2638                 bio_put(bio);
2639                 return 0;
2640         }
2641
2642         /*
2643          * nr_async_bios allows us to reliably return congestion to the
2644          * higher layers.  Otherwise, the async bio makes it appear we have
2645          * made progress against dirty pages when we've really just put it
2646          * on a queue for later
2647          */
2648         atomic_inc(&root->fs_info->nr_async_bios);
2649         WARN_ON(bio->bi_next);
2650         bio->bi_next = NULL;
2651         bio->bi_rw |= rw;
2652
2653         spin_lock(&device->io_lock);
2654
2655         if (device->pending_bio_tail)
2656                 device->pending_bio_tail->bi_next = bio;
2657
2658         device->pending_bio_tail = bio;
2659         if (!device->pending_bios)
2660                 device->pending_bios = bio;
2661         if (device->running_pending)
2662                 should_queue = 0;
2663
2664         spin_unlock(&device->io_lock);
2665
2666         if (should_queue)
2667                 btrfs_queue_worker(&root->fs_info->submit_workers,
2668                                    &device->work);
2669         return 0;
2670 }
2671
2672 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
2673                   int mirror_num, int async_submit)
2674 {
2675         struct btrfs_mapping_tree *map_tree;
2676         struct btrfs_device *dev;
2677         struct bio *first_bio = bio;
2678         u64 logical = (u64)bio->bi_sector << 9;
2679         u64 length = 0;
2680         u64 map_length;
2681         struct btrfs_multi_bio *multi = NULL;
2682         int ret;
2683         int dev_nr = 0;
2684         int total_devs = 1;
2685
2686         length = bio->bi_size;
2687         map_tree = &root->fs_info->mapping_tree;
2688         map_length = length;
2689
2690         ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
2691                               mirror_num);
2692         BUG_ON(ret);
2693
2694         total_devs = multi->num_stripes;
2695         if (map_length < length) {
2696                 printk("mapping failed logical %Lu bio len %Lu "
2697                        "len %Lu\n", logical, length, map_length);
2698                 BUG();
2699         }
2700         multi->end_io = first_bio->bi_end_io;
2701         multi->private = first_bio->bi_private;
2702         multi->orig_bio = first_bio;
2703         atomic_set(&multi->stripes_pending, multi->num_stripes);
2704
2705         while(dev_nr < total_devs) {
2706                 if (total_devs > 1) {
2707                         if (dev_nr < total_devs - 1) {
2708                                 bio = bio_clone(first_bio, GFP_NOFS);
2709                                 BUG_ON(!bio);
2710                         } else {
2711                                 bio = first_bio;
2712                         }
2713                         bio->bi_private = multi;
2714                         bio->bi_end_io = end_bio_multi_stripe;
2715                 }
2716                 bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
2717                 dev = multi->stripes[dev_nr].dev;
2718                 BUG_ON(rw == WRITE && !dev->writeable);
2719                 if (dev && dev->bdev) {
2720                         bio->bi_bdev = dev->bdev;
2721                         if (async_submit)
2722                                 schedule_bio(root, dev, rw, bio);
2723                         else
2724                                 submit_bio(rw, bio);
2725                 } else {
2726                         bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
2727                         bio->bi_sector = logical >> 9;
2728                         bio_endio(bio, -EIO);
2729                 }
2730                 dev_nr++;
2731         }
2732         if (total_devs == 1)
2733                 kfree(multi);
2734         return 0;
2735 }
2736
2737 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
2738                                        u8 *uuid, u8 *fsid)
2739 {
2740         struct btrfs_device *device;
2741         struct btrfs_fs_devices *cur_devices;
2742
2743         cur_devices = root->fs_info->fs_devices;
2744         while (cur_devices) {
2745                 if (!fsid ||
2746                     !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
2747                         device = __find_device(&cur_devices->devices,
2748                                                devid, uuid);
2749                         if (device)
2750                                 return device;
2751                 }
2752                 cur_devices = cur_devices->seed;
2753         }
2754         return NULL;
2755 }
2756
2757 static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
2758                                             u64 devid, u8 *dev_uuid)
2759 {
2760         struct btrfs_device *device;
2761         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
2762
2763         device = kzalloc(sizeof(*device), GFP_NOFS);
2764         if (!device)
2765                 return NULL;
2766         list_add(&device->dev_list,
2767                  &fs_devices->devices);
2768         device->barriers = 1;
2769         device->dev_root = root->fs_info->dev_root;
2770         device->devid = devid;
2771         device->work.func = pending_bios_fn;
2772         fs_devices->num_devices++;
2773         spin_lock_init(&device->io_lock);
2774         INIT_LIST_HEAD(&device->dev_alloc_list);
2775         memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
2776         return device;
2777 }
2778
2779 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
2780                           struct extent_buffer *leaf,
2781                           struct btrfs_chunk *chunk)
2782 {
2783         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
2784         struct map_lookup *map;
2785         struct extent_map *em;
2786         u64 logical;
2787         u64 length;
2788         u64 devid;
2789         u8 uuid[BTRFS_UUID_SIZE];
2790         int num_stripes;
2791         int ret;
2792         int i;
2793
2794         logical = key->offset;
2795         length = btrfs_chunk_length(leaf, chunk);
2796
2797         spin_lock(&map_tree->map_tree.lock);
2798         em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
2799         spin_unlock(&map_tree->map_tree.lock);
2800
2801         /* already mapped? */
2802         if (em && em->start <= logical && em->start + em->len > logical) {
2803                 free_extent_map(em);
2804                 return 0;
2805         } else if (em) {
2806                 free_extent_map(em);
2807         }
2808
2809         map = kzalloc(sizeof(*map), GFP_NOFS);
2810         if (!map)
2811                 return -ENOMEM;
2812
2813         em = alloc_extent_map(GFP_NOFS);
2814         if (!em)
2815                 return -ENOMEM;
2816         num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2817         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2818         if (!map) {
2819                 free_extent_map(em);
2820                 return -ENOMEM;
2821         }
2822
2823         em->bdev = (struct block_device *)map;
2824         em->start = logical;
2825         em->len = length;
2826         em->block_start = 0;
2827         em->block_len = em->len;
2828
2829         map->num_stripes = num_stripes;
2830         map->io_width = btrfs_chunk_io_width(leaf, chunk);
2831         map->io_align = btrfs_chunk_io_align(leaf, chunk);
2832         map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
2833         map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
2834         map->type = btrfs_chunk_type(leaf, chunk);
2835         map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
2836         for (i = 0; i < num_stripes; i++) {
2837                 map->stripes[i].physical =
2838                         btrfs_stripe_offset_nr(leaf, chunk, i);
2839                 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
2840                 read_extent_buffer(leaf, uuid, (unsigned long)
2841                                    btrfs_stripe_dev_uuid_nr(chunk, i),
2842                                    BTRFS_UUID_SIZE);
2843                 map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
2844                                                         NULL);
2845                 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
2846                         kfree(map);
2847                         free_extent_map(em);
2848                         return -EIO;
2849                 }
2850                 if (!map->stripes[i].dev) {
2851                         map->stripes[i].dev =
2852                                 add_missing_dev(root, devid, uuid);
2853                         if (!map->stripes[i].dev) {
2854                                 kfree(map);
2855                                 free_extent_map(em);
2856                                 return -EIO;
2857                         }
2858                 }
2859                 map->stripes[i].dev->in_fs_metadata = 1;
2860         }
2861
2862         spin_lock(&map_tree->map_tree.lock);
2863         ret = add_extent_mapping(&map_tree->map_tree, em);
2864         spin_unlock(&map_tree->map_tree.lock);
2865         BUG_ON(ret);
2866         free_extent_map(em);
2867
2868         return 0;
2869 }
2870
2871 static int fill_device_from_item(struct extent_buffer *leaf,
2872                                  struct btrfs_dev_item *dev_item,
2873                                  struct btrfs_device *device)
2874 {
2875         unsigned long ptr;
2876
2877         device->devid = btrfs_device_id(leaf, dev_item);
2878         device->total_bytes = btrfs_device_total_bytes(leaf, dev_item);
2879         device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
2880         device->type = btrfs_device_type(leaf, dev_item);
2881         device->io_align = btrfs_device_io_align(leaf, dev_item);
2882         device->io_width = btrfs_device_io_width(leaf, dev_item);
2883         device->sector_size = btrfs_device_sector_size(leaf, dev_item);
2884
2885         ptr = (unsigned long)btrfs_device_uuid(dev_item);
2886         read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
2887
2888         return 0;
2889 }
2890
2891 static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
2892 {
2893         struct btrfs_fs_devices *fs_devices;
2894         int ret;
2895
2896         mutex_lock(&uuid_mutex);
2897
2898         fs_devices = root->fs_info->fs_devices->seed;
2899         while (fs_devices) {
2900                 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
2901                         ret = 0;
2902                         goto out;
2903                 }
2904                 fs_devices = fs_devices->seed;
2905         }
2906
2907         fs_devices = find_fsid(fsid);
2908         if (!fs_devices) {
2909                 ret = -ENOENT;
2910                 goto out;
2911         }
2912         if (fs_devices->opened) {
2913                 ret = -EBUSY;
2914                 goto out;
2915         }
2916
2917         ret = __btrfs_open_devices(fs_devices, FMODE_READ,
2918                                    root->fs_info->bdev_holder);
2919         if (ret)
2920                 goto out;
2921
2922         if (!fs_devices->seeding) {
2923                 __btrfs_close_devices(fs_devices);
2924                 ret = -EINVAL;
2925                 goto out;
2926         }
2927
2928         fs_devices->seed = root->fs_info->fs_devices->seed;
2929         root->fs_info->fs_devices->seed = fs_devices;
2930         fs_devices->sprouted = 1;
2931 out:
2932         mutex_unlock(&uuid_mutex);
2933         return ret;
2934 }
2935
2936 static int read_one_dev(struct btrfs_root *root,
2937                         struct extent_buffer *leaf,
2938                         struct btrfs_dev_item *dev_item)
2939 {
2940         struct btrfs_device *device;
2941         u64 devid;
2942         int ret;
2943         int seed_devices = 0;
2944         u8 fs_uuid[BTRFS_UUID_SIZE];
2945         u8 dev_uuid[BTRFS_UUID_SIZE];
2946
2947         devid = btrfs_device_id(leaf, dev_item);
2948         read_extent_buffer(leaf, dev_uuid,
2949                            (unsigned long)btrfs_device_uuid(dev_item),
2950                            BTRFS_UUID_SIZE);
2951         read_extent_buffer(leaf, fs_uuid,
2952                            (unsigned long)btrfs_device_fsid(dev_item),
2953                            BTRFS_UUID_SIZE);
2954
2955         if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
2956                 ret = open_seed_devices(root, fs_uuid);
2957                 if (ret)
2958                         return ret;
2959                 seed_devices = 1;
2960         }
2961
2962         device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
2963         if (!device || !device->bdev) {
2964                 if (!btrfs_test_opt(root, DEGRADED) || seed_devices)
2965                         return -EIO;
2966
2967                 if (!device) {
2968                         printk("warning devid %Lu missing\n", devid);
2969                         device = add_missing_dev(root, devid, dev_uuid);
2970                         if (!device)
2971                                 return -ENOMEM;
2972                 }
2973         }
2974
2975         if (device->fs_devices != root->fs_info->fs_devices) {
2976                 BUG_ON(device->writeable);
2977                 if (device->generation !=
2978                     btrfs_device_generation(leaf, dev_item))
2979                         return -EINVAL;
2980         }
2981
2982         fill_device_from_item(leaf, dev_item, device);
2983         device->dev_root = root->fs_info->dev_root;
2984         device->in_fs_metadata = 1;
2985         if (device->writeable)
2986                 device->fs_devices->total_rw_bytes += device->total_bytes;
2987         ret = 0;
2988 #if 0
2989         ret = btrfs_open_device(device);
2990         if (ret) {
2991                 kfree(device);
2992         }
2993 #endif
2994         return ret;
2995 }
2996
2997 int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
2998 {
2999         struct btrfs_dev_item *dev_item;
3000
3001         dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
3002                                                      dev_item);
3003         return read_one_dev(root, buf, dev_item);
3004 }
3005
3006 int btrfs_read_sys_array(struct btrfs_root *root)
3007 {
3008         struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
3009         struct extent_buffer *sb;
3010         struct btrfs_disk_key *disk_key;
3011         struct btrfs_chunk *chunk;
3012         u8 *ptr;
3013         unsigned long sb_ptr;
3014         int ret = 0;
3015         u32 num_stripes;
3016         u32 array_size;
3017         u32 len = 0;
3018         u32 cur;
3019         struct btrfs_key key;
3020
3021         sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
3022                                           BTRFS_SUPER_INFO_SIZE);
3023         if (!sb)
3024                 return -ENOMEM;
3025         btrfs_set_buffer_uptodate(sb);
3026         write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
3027         array_size = btrfs_super_sys_array_size(super_copy);
3028
3029         ptr = super_copy->sys_chunk_array;
3030         sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
3031         cur = 0;
3032
3033         while (cur < array_size) {
3034                 disk_key = (struct btrfs_disk_key *)ptr;
3035                 btrfs_disk_key_to_cpu(&key, disk_key);
3036
3037                 len = sizeof(*disk_key); ptr += len;
3038                 sb_ptr += len;
3039                 cur += len;
3040
3041                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
3042                         chunk = (struct btrfs_chunk *)sb_ptr;
3043                         ret = read_one_chunk(root, &key, sb, chunk);
3044                         if (ret)
3045                                 break;
3046                         num_stripes = btrfs_chunk_num_stripes(sb, chunk);
3047                         len = btrfs_chunk_item_size(num_stripes);
3048                 } else {
3049                         ret = -EIO;
3050                         break;
3051                 }
3052                 ptr += len;
3053                 sb_ptr += len;
3054                 cur += len;
3055         }
3056         free_extent_buffer(sb);
3057         return ret;
3058 }
3059
3060 int btrfs_read_chunk_tree(struct btrfs_root *root)
3061 {
3062         struct btrfs_path *path;
3063         struct extent_buffer *leaf;
3064         struct btrfs_key key;
3065         struct btrfs_key found_key;
3066         int ret;
3067         int slot;
3068
3069         root = root->fs_info->chunk_root;
3070
3071         path = btrfs_alloc_path();
3072         if (!path)
3073                 return -ENOMEM;
3074
3075         /* first we search for all of the device items, and then we
3076          * read in all of the chunk items.  This way we can create chunk
3077          * mappings that reference all of the devices that are afound
3078          */
3079         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
3080         key.offset = 0;
3081         key.type = 0;
3082 again:
3083         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3084         while(1) {
3085                 leaf = path->nodes[0];
3086                 slot = path->slots[0];
3087                 if (slot >= btrfs_header_nritems(leaf)) {
3088                         ret = btrfs_next_leaf(root, path);
3089                         if (ret == 0)
3090                                 continue;
3091                         if (ret < 0)
3092                                 goto error;
3093                         break;
3094                 }
3095                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3096                 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3097                         if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
3098                                 break;
3099                         if (found_key.type == BTRFS_DEV_ITEM_KEY) {
3100                                 struct btrfs_dev_item *dev_item;
3101                                 dev_item = btrfs_item_ptr(leaf, slot,
3102                                                   struct btrfs_dev_item);
3103                                 ret = read_one_dev(root, leaf, dev_item);
3104                                 if (ret)
3105                                         goto error;
3106                         }
3107                 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
3108                         struct btrfs_chunk *chunk;
3109                         chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3110                         ret = read_one_chunk(root, &found_key, leaf, chunk);
3111                         if (ret)
3112                                 goto error;
3113                 }
3114                 path->slots[0]++;
3115         }
3116         if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3117                 key.objectid = 0;
3118                 btrfs_release_path(root, path);
3119                 goto again;
3120         }
3121         ret = 0;
3122 error:
3123         btrfs_free_path(path);
3124         return ret;
3125 }