Btrfs: Introduce contexts for metadata reservation
[linux-2.6.git] / fs / btrfs / relocation.c
1 /*
2  * Copyright (C) 2009 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/sched.h>
20 #include <linux/pagemap.h>
21 #include <linux/writeback.h>
22 #include <linux/blkdev.h>
23 #include <linux/rbtree.h>
24 #include <linux/slab.h>
25 #include "ctree.h"
26 #include "disk-io.h"
27 #include "transaction.h"
28 #include "volumes.h"
29 #include "locking.h"
30 #include "btrfs_inode.h"
31 #include "async-thread.h"
32
33 /*
34  * backref_node, mapping_node and tree_block start with this
35  */
36 struct tree_entry {
37         struct rb_node rb_node;
38         u64 bytenr;
39 };
40
41 /*
42  * present a tree block in the backref cache
43  */
44 struct backref_node {
45         struct rb_node rb_node;
46         u64 bytenr;
47         /* objectid tree block owner */
48         u64 owner;
49         /* list of upper level blocks reference this block */
50         struct list_head upper;
51         /* list of child blocks in the cache */
52         struct list_head lower;
53         /* NULL if this node is not tree root */
54         struct btrfs_root *root;
55         /* extent buffer got by COW the block */
56         struct extent_buffer *eb;
57         /* level of tree block */
58         unsigned int level:8;
59         /* 1 if the block is root of old snapshot */
60         unsigned int old_root:1;
61         /* 1 if no child blocks in the cache */
62         unsigned int lowest:1;
63         /* is the extent buffer locked */
64         unsigned int locked:1;
65         /* has the block been processed */
66         unsigned int processed:1;
67         /* have backrefs of this block been checked */
68         unsigned int checked:1;
69 };
70
71 /*
72  * present a block pointer in the backref cache
73  */
74 struct backref_edge {
75         struct list_head list[2];
76         struct backref_node *node[2];
77         u64 blockptr;
78 };
79
80 #define LOWER   0
81 #define UPPER   1
82
83 struct backref_cache {
84         /* red black tree of all backref nodes in the cache */
85         struct rb_root rb_root;
86         /* list of backref nodes with no child block in the cache */
87         struct list_head pending[BTRFS_MAX_LEVEL];
88         spinlock_t lock;
89 };
90
91 /*
92  * map address of tree root to tree
93  */
94 struct mapping_node {
95         struct rb_node rb_node;
96         u64 bytenr;
97         void *data;
98 };
99
100 struct mapping_tree {
101         struct rb_root rb_root;
102         spinlock_t lock;
103 };
104
105 /*
106  * present a tree block to process
107  */
108 struct tree_block {
109         struct rb_node rb_node;
110         u64 bytenr;
111         struct btrfs_key key;
112         unsigned int level:8;
113         unsigned int key_ready:1;
114 };
115
116 /* inode vector */
117 #define INODEVEC_SIZE 16
118
119 struct inodevec {
120         struct list_head list;
121         struct inode *inode[INODEVEC_SIZE];
122         int nr;
123 };
124
125 #define MAX_EXTENTS 128
126
127 struct file_extent_cluster {
128         u64 start;
129         u64 end;
130         u64 boundary[MAX_EXTENTS];
131         unsigned int nr;
132 };
133
134 struct reloc_control {
135         /* block group to relocate */
136         struct btrfs_block_group_cache *block_group;
137         /* extent tree */
138         struct btrfs_root *extent_root;
139         /* inode for moving data */
140         struct inode *data_inode;
141         struct btrfs_workers workers;
142         /* tree blocks have been processed */
143         struct extent_io_tree processed_blocks;
144         /* map start of tree root to corresponding reloc tree */
145         struct mapping_tree reloc_root_tree;
146         /* list of reloc trees */
147         struct list_head reloc_roots;
148         u64 search_start;
149         u64 extents_found;
150         u64 extents_skipped;
151         int stage;
152         int create_reloc_root;
153         unsigned int found_file_extent:1;
154         unsigned int found_old_snapshot:1;
155 };
156
157 /* stages of data relocation */
158 #define MOVE_DATA_EXTENTS       0
159 #define UPDATE_DATA_PTRS        1
160
161 /*
162  * merge reloc tree to corresponding fs tree in worker threads
163  */
164 struct async_merge {
165         struct btrfs_work work;
166         struct reloc_control *rc;
167         struct btrfs_root *root;
168         struct completion *done;
169         atomic_t *num_pending;
170 };
171
172 static void mapping_tree_init(struct mapping_tree *tree)
173 {
174         tree->rb_root = RB_ROOT;
175         spin_lock_init(&tree->lock);
176 }
177
178 static void backref_cache_init(struct backref_cache *cache)
179 {
180         int i;
181         cache->rb_root = RB_ROOT;
182         for (i = 0; i < BTRFS_MAX_LEVEL; i++)
183                 INIT_LIST_HEAD(&cache->pending[i]);
184         spin_lock_init(&cache->lock);
185 }
186
187 static void backref_node_init(struct backref_node *node)
188 {
189         memset(node, 0, sizeof(*node));
190         INIT_LIST_HEAD(&node->upper);
191         INIT_LIST_HEAD(&node->lower);
192         RB_CLEAR_NODE(&node->rb_node);
193 }
194
195 static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
196                                    struct rb_node *node)
197 {
198         struct rb_node **p = &root->rb_node;
199         struct rb_node *parent = NULL;
200         struct tree_entry *entry;
201
202         while (*p) {
203                 parent = *p;
204                 entry = rb_entry(parent, struct tree_entry, rb_node);
205
206                 if (bytenr < entry->bytenr)
207                         p = &(*p)->rb_left;
208                 else if (bytenr > entry->bytenr)
209                         p = &(*p)->rb_right;
210                 else
211                         return parent;
212         }
213
214         rb_link_node(node, parent, p);
215         rb_insert_color(node, root);
216         return NULL;
217 }
218
219 static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
220 {
221         struct rb_node *n = root->rb_node;
222         struct tree_entry *entry;
223
224         while (n) {
225                 entry = rb_entry(n, struct tree_entry, rb_node);
226
227                 if (bytenr < entry->bytenr)
228                         n = n->rb_left;
229                 else if (bytenr > entry->bytenr)
230                         n = n->rb_right;
231                 else
232                         return n;
233         }
234         return NULL;
235 }
236
237 /*
238  * walk up backref nodes until reach node presents tree root
239  */
240 static struct backref_node *walk_up_backref(struct backref_node *node,
241                                             struct backref_edge *edges[],
242                                             int *index)
243 {
244         struct backref_edge *edge;
245         int idx = *index;
246
247         while (!list_empty(&node->upper)) {
248                 edge = list_entry(node->upper.next,
249                                   struct backref_edge, list[LOWER]);
250                 edges[idx++] = edge;
251                 node = edge->node[UPPER];
252         }
253         *index = idx;
254         return node;
255 }
256
257 /*
258  * walk down backref nodes to find start of next reference path
259  */
260 static struct backref_node *walk_down_backref(struct backref_edge *edges[],
261                                               int *index)
262 {
263         struct backref_edge *edge;
264         struct backref_node *lower;
265         int idx = *index;
266
267         while (idx > 0) {
268                 edge = edges[idx - 1];
269                 lower = edge->node[LOWER];
270                 if (list_is_last(&edge->list[LOWER], &lower->upper)) {
271                         idx--;
272                         continue;
273                 }
274                 edge = list_entry(edge->list[LOWER].next,
275                                   struct backref_edge, list[LOWER]);
276                 edges[idx - 1] = edge;
277                 *index = idx;
278                 return edge->node[UPPER];
279         }
280         *index = 0;
281         return NULL;
282 }
283
284 static void drop_node_buffer(struct backref_node *node)
285 {
286         if (node->eb) {
287                 if (node->locked) {
288                         btrfs_tree_unlock(node->eb);
289                         node->locked = 0;
290                 }
291                 free_extent_buffer(node->eb);
292                 node->eb = NULL;
293         }
294 }
295
296 static void drop_backref_node(struct backref_cache *tree,
297                               struct backref_node *node)
298 {
299         BUG_ON(!node->lowest);
300         BUG_ON(!list_empty(&node->upper));
301
302         drop_node_buffer(node);
303         list_del(&node->lower);
304
305         rb_erase(&node->rb_node, &tree->rb_root);
306         kfree(node);
307 }
308
309 /*
310  * remove a backref node from the backref cache
311  */
312 static void remove_backref_node(struct backref_cache *cache,
313                                 struct backref_node *node)
314 {
315         struct backref_node *upper;
316         struct backref_edge *edge;
317
318         if (!node)
319                 return;
320
321         BUG_ON(!node->lowest);
322         while (!list_empty(&node->upper)) {
323                 edge = list_entry(node->upper.next, struct backref_edge,
324                                   list[LOWER]);
325                 upper = edge->node[UPPER];
326                 list_del(&edge->list[LOWER]);
327                 list_del(&edge->list[UPPER]);
328                 kfree(edge);
329                 /*
330                  * add the node to pending list if no other
331                  * child block cached.
332                  */
333                 if (list_empty(&upper->lower)) {
334                         list_add_tail(&upper->lower,
335                                       &cache->pending[upper->level]);
336                         upper->lowest = 1;
337                 }
338         }
339         drop_backref_node(cache, node);
340 }
341
342 /*
343  * find reloc tree by address of tree root
344  */
345 static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
346                                           u64 bytenr)
347 {
348         struct rb_node *rb_node;
349         struct mapping_node *node;
350         struct btrfs_root *root = NULL;
351
352         spin_lock(&rc->reloc_root_tree.lock);
353         rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
354         if (rb_node) {
355                 node = rb_entry(rb_node, struct mapping_node, rb_node);
356                 root = (struct btrfs_root *)node->data;
357         }
358         spin_unlock(&rc->reloc_root_tree.lock);
359         return root;
360 }
361
362 static int is_cowonly_root(u64 root_objectid)
363 {
364         if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
365             root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
366             root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
367             root_objectid == BTRFS_DEV_TREE_OBJECTID ||
368             root_objectid == BTRFS_TREE_LOG_OBJECTID ||
369             root_objectid == BTRFS_CSUM_TREE_OBJECTID)
370                 return 1;
371         return 0;
372 }
373
374 static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
375                                         u64 root_objectid)
376 {
377         struct btrfs_key key;
378
379         key.objectid = root_objectid;
380         key.type = BTRFS_ROOT_ITEM_KEY;
381         if (is_cowonly_root(root_objectid))
382                 key.offset = 0;
383         else
384                 key.offset = (u64)-1;
385
386         return btrfs_read_fs_root_no_name(fs_info, &key);
387 }
388
389 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
390 static noinline_for_stack
391 struct btrfs_root *find_tree_root(struct reloc_control *rc,
392                                   struct extent_buffer *leaf,
393                                   struct btrfs_extent_ref_v0 *ref0)
394 {
395         struct btrfs_root *root;
396         u64 root_objectid = btrfs_ref_root_v0(leaf, ref0);
397         u64 generation = btrfs_ref_generation_v0(leaf, ref0);
398
399         BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID);
400
401         root = read_fs_root(rc->extent_root->fs_info, root_objectid);
402         BUG_ON(IS_ERR(root));
403
404         if (root->ref_cows &&
405             generation != btrfs_root_generation(&root->root_item))
406                 return NULL;
407
408         return root;
409 }
410 #endif
411
412 static noinline_for_stack
413 int find_inline_backref(struct extent_buffer *leaf, int slot,
414                         unsigned long *ptr, unsigned long *end)
415 {
416         struct btrfs_extent_item *ei;
417         struct btrfs_tree_block_info *bi;
418         u32 item_size;
419
420         item_size = btrfs_item_size_nr(leaf, slot);
421 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
422         if (item_size < sizeof(*ei)) {
423                 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
424                 return 1;
425         }
426 #endif
427         ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
428         WARN_ON(!(btrfs_extent_flags(leaf, ei) &
429                   BTRFS_EXTENT_FLAG_TREE_BLOCK));
430
431         if (item_size <= sizeof(*ei) + sizeof(*bi)) {
432                 WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
433                 return 1;
434         }
435
436         bi = (struct btrfs_tree_block_info *)(ei + 1);
437         *ptr = (unsigned long)(bi + 1);
438         *end = (unsigned long)ei + item_size;
439         return 0;
440 }
441
442 /*
443  * build backref tree for a given tree block. root of the backref tree
444  * corresponds the tree block, leaves of the backref tree correspond
445  * roots of b-trees that reference the tree block.
446  *
447  * the basic idea of this function is check backrefs of a given block
448  * to find upper level blocks that refernece the block, and then check
449  * bakcrefs of these upper level blocks recursively. the recursion stop
450  * when tree root is reached or backrefs for the block is cached.
451  *
452  * NOTE: if we find backrefs for a block are cached, we know backrefs
453  * for all upper level blocks that directly/indirectly reference the
454  * block are also cached.
455  */
456 static struct backref_node *build_backref_tree(struct reloc_control *rc,
457                                                struct backref_cache *cache,
458                                                struct btrfs_key *node_key,
459                                                int level, u64 bytenr)
460 {
461         struct btrfs_path *path1;
462         struct btrfs_path *path2;
463         struct extent_buffer *eb;
464         struct btrfs_root *root;
465         struct backref_node *cur;
466         struct backref_node *upper;
467         struct backref_node *lower;
468         struct backref_node *node = NULL;
469         struct backref_node *exist = NULL;
470         struct backref_edge *edge;
471         struct rb_node *rb_node;
472         struct btrfs_key key;
473         unsigned long end;
474         unsigned long ptr;
475         LIST_HEAD(list);
476         int ret;
477         int err = 0;
478
479         path1 = btrfs_alloc_path();
480         path2 = btrfs_alloc_path();
481         if (!path1 || !path2) {
482                 err = -ENOMEM;
483                 goto out;
484         }
485
486         node = kmalloc(sizeof(*node), GFP_NOFS);
487         if (!node) {
488                 err = -ENOMEM;
489                 goto out;
490         }
491
492         backref_node_init(node);
493         node->bytenr = bytenr;
494         node->owner = 0;
495         node->level = level;
496         node->lowest = 1;
497         cur = node;
498 again:
499         end = 0;
500         ptr = 0;
501         key.objectid = cur->bytenr;
502         key.type = BTRFS_EXTENT_ITEM_KEY;
503         key.offset = (u64)-1;
504
505         path1->search_commit_root = 1;
506         path1->skip_locking = 1;
507         ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
508                                 0, 0);
509         if (ret < 0) {
510                 err = ret;
511                 goto out;
512         }
513         BUG_ON(!ret || !path1->slots[0]);
514
515         path1->slots[0]--;
516
517         WARN_ON(cur->checked);
518         if (!list_empty(&cur->upper)) {
519                 /*
520                  * the backref was added previously when processsing
521                  * backref of type BTRFS_TREE_BLOCK_REF_KEY
522                  */
523                 BUG_ON(!list_is_singular(&cur->upper));
524                 edge = list_entry(cur->upper.next, struct backref_edge,
525                                   list[LOWER]);
526                 BUG_ON(!list_empty(&edge->list[UPPER]));
527                 exist = edge->node[UPPER];
528                 /*
529                  * add the upper level block to pending list if we need
530                  * check its backrefs
531                  */
532                 if (!exist->checked)
533                         list_add_tail(&edge->list[UPPER], &list);
534         } else {
535                 exist = NULL;
536         }
537
538         while (1) {
539                 cond_resched();
540                 eb = path1->nodes[0];
541
542                 if (ptr >= end) {
543                         if (path1->slots[0] >= btrfs_header_nritems(eb)) {
544                                 ret = btrfs_next_leaf(rc->extent_root, path1);
545                                 if (ret < 0) {
546                                         err = ret;
547                                         goto out;
548                                 }
549                                 if (ret > 0)
550                                         break;
551                                 eb = path1->nodes[0];
552                         }
553
554                         btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
555                         if (key.objectid != cur->bytenr) {
556                                 WARN_ON(exist);
557                                 break;
558                         }
559
560                         if (key.type == BTRFS_EXTENT_ITEM_KEY) {
561                                 ret = find_inline_backref(eb, path1->slots[0],
562                                                           &ptr, &end);
563                                 if (ret)
564                                         goto next;
565                         }
566                 }
567
568                 if (ptr < end) {
569                         /* update key for inline back ref */
570                         struct btrfs_extent_inline_ref *iref;
571                         iref = (struct btrfs_extent_inline_ref *)ptr;
572                         key.type = btrfs_extent_inline_ref_type(eb, iref);
573                         key.offset = btrfs_extent_inline_ref_offset(eb, iref);
574                         WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
575                                 key.type != BTRFS_SHARED_BLOCK_REF_KEY);
576                 }
577
578                 if (exist &&
579                     ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
580                       exist->owner == key.offset) ||
581                      (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
582                       exist->bytenr == key.offset))) {
583                         exist = NULL;
584                         goto next;
585                 }
586
587 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
588                 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
589                     key.type == BTRFS_EXTENT_REF_V0_KEY) {
590                         if (key.objectid == key.offset &&
591                             key.type == BTRFS_EXTENT_REF_V0_KEY) {
592                                 struct btrfs_extent_ref_v0 *ref0;
593                                 ref0 = btrfs_item_ptr(eb, path1->slots[0],
594                                                 struct btrfs_extent_ref_v0);
595                                 root = find_tree_root(rc, eb, ref0);
596                                 if (root)
597                                         cur->root = root;
598                                 else
599                                         cur->old_root = 1;
600                                 break;
601                         }
602 #else
603                 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
604                 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
605 #endif
606                         if (key.objectid == key.offset) {
607                                 /*
608                                  * only root blocks of reloc trees use
609                                  * backref of this type.
610                                  */
611                                 root = find_reloc_root(rc, cur->bytenr);
612                                 BUG_ON(!root);
613                                 cur->root = root;
614                                 break;
615                         }
616
617                         edge = kzalloc(sizeof(*edge), GFP_NOFS);
618                         if (!edge) {
619                                 err = -ENOMEM;
620                                 goto out;
621                         }
622                         rb_node = tree_search(&cache->rb_root, key.offset);
623                         if (!rb_node) {
624                                 upper = kmalloc(sizeof(*upper), GFP_NOFS);
625                                 if (!upper) {
626                                         kfree(edge);
627                                         err = -ENOMEM;
628                                         goto out;
629                                 }
630                                 backref_node_init(upper);
631                                 upper->bytenr = key.offset;
632                                 upper->owner = 0;
633                                 upper->level = cur->level + 1;
634                                 /*
635                                  *  backrefs for the upper level block isn't
636                                  *  cached, add the block to pending list
637                                  */
638                                 list_add_tail(&edge->list[UPPER], &list);
639                         } else {
640                                 upper = rb_entry(rb_node, struct backref_node,
641                                                  rb_node);
642                                 INIT_LIST_HEAD(&edge->list[UPPER]);
643                         }
644                         list_add(&edge->list[LOWER], &cur->upper);
645                         edge->node[UPPER] = upper;
646                         edge->node[LOWER] = cur;
647
648                         goto next;
649                 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
650                         goto next;
651                 }
652
653                 /* key.type == BTRFS_TREE_BLOCK_REF_KEY */
654                 root = read_fs_root(rc->extent_root->fs_info, key.offset);
655                 if (IS_ERR(root)) {
656                         err = PTR_ERR(root);
657                         goto out;
658                 }
659
660                 if (btrfs_root_level(&root->root_item) == cur->level) {
661                         /* tree root */
662                         BUG_ON(btrfs_root_bytenr(&root->root_item) !=
663                                cur->bytenr);
664                         cur->root = root;
665                         break;
666                 }
667
668                 level = cur->level + 1;
669
670                 /*
671                  * searching the tree to find upper level blocks
672                  * reference the block.
673                  */
674                 path2->search_commit_root = 1;
675                 path2->skip_locking = 1;
676                 path2->lowest_level = level;
677                 ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
678                 path2->lowest_level = 0;
679                 if (ret < 0) {
680                         err = ret;
681                         goto out;
682                 }
683                 if (ret > 0 && path2->slots[level] > 0)
684                         path2->slots[level]--;
685
686                 eb = path2->nodes[level];
687                 WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
688                         cur->bytenr);
689
690                 lower = cur;
691                 for (; level < BTRFS_MAX_LEVEL; level++) {
692                         if (!path2->nodes[level]) {
693                                 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
694                                        lower->bytenr);
695                                 lower->root = root;
696                                 break;
697                         }
698
699                         edge = kzalloc(sizeof(*edge), GFP_NOFS);
700                         if (!edge) {
701                                 err = -ENOMEM;
702                                 goto out;
703                         }
704
705                         eb = path2->nodes[level];
706                         rb_node = tree_search(&cache->rb_root, eb->start);
707                         if (!rb_node) {
708                                 upper = kmalloc(sizeof(*upper), GFP_NOFS);
709                                 if (!upper) {
710                                         kfree(edge);
711                                         err = -ENOMEM;
712                                         goto out;
713                                 }
714                                 backref_node_init(upper);
715                                 upper->bytenr = eb->start;
716                                 upper->owner = btrfs_header_owner(eb);
717                                 upper->level = lower->level + 1;
718
719                                 /*
720                                  * if we know the block isn't shared
721                                  * we can void checking its backrefs.
722                                  */
723                                 if (btrfs_block_can_be_shared(root, eb))
724                                         upper->checked = 0;
725                                 else
726                                         upper->checked = 1;
727
728                                 /*
729                                  * add the block to pending list if we
730                                  * need check its backrefs. only block
731                                  * at 'cur->level + 1' is added to the
732                                  * tail of pending list. this guarantees
733                                  * we check backrefs from lower level
734                                  * blocks to upper level blocks.
735                                  */
736                                 if (!upper->checked &&
737                                     level == cur->level + 1) {
738                                         list_add_tail(&edge->list[UPPER],
739                                                       &list);
740                                 } else
741                                         INIT_LIST_HEAD(&edge->list[UPPER]);
742                         } else {
743                                 upper = rb_entry(rb_node, struct backref_node,
744                                                  rb_node);
745                                 BUG_ON(!upper->checked);
746                                 INIT_LIST_HEAD(&edge->list[UPPER]);
747                         }
748                         list_add_tail(&edge->list[LOWER], &lower->upper);
749                         edge->node[UPPER] = upper;
750                         edge->node[LOWER] = lower;
751
752                         if (rb_node)
753                                 break;
754                         lower = upper;
755                         upper = NULL;
756                 }
757                 btrfs_release_path(root, path2);
758 next:
759                 if (ptr < end) {
760                         ptr += btrfs_extent_inline_ref_size(key.type);
761                         if (ptr >= end) {
762                                 WARN_ON(ptr > end);
763                                 ptr = 0;
764                                 end = 0;
765                         }
766                 }
767                 if (ptr >= end)
768                         path1->slots[0]++;
769         }
770         btrfs_release_path(rc->extent_root, path1);
771
772         cur->checked = 1;
773         WARN_ON(exist);
774
775         /* the pending list isn't empty, take the first block to process */
776         if (!list_empty(&list)) {
777                 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
778                 list_del_init(&edge->list[UPPER]);
779                 cur = edge->node[UPPER];
780                 goto again;
781         }
782
783         /*
784          * everything goes well, connect backref nodes and insert backref nodes
785          * into the cache.
786          */
787         BUG_ON(!node->checked);
788         rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
789         BUG_ON(rb_node);
790
791         list_for_each_entry(edge, &node->upper, list[LOWER])
792                 list_add_tail(&edge->list[UPPER], &list);
793
794         while (!list_empty(&list)) {
795                 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
796                 list_del_init(&edge->list[UPPER]);
797                 upper = edge->node[UPPER];
798
799                 if (!RB_EMPTY_NODE(&upper->rb_node)) {
800                         if (upper->lowest) {
801                                 list_del_init(&upper->lower);
802                                 upper->lowest = 0;
803                         }
804
805                         list_add_tail(&edge->list[UPPER], &upper->lower);
806                         continue;
807                 }
808
809                 BUG_ON(!upper->checked);
810                 rb_node = tree_insert(&cache->rb_root, upper->bytenr,
811                                       &upper->rb_node);
812                 BUG_ON(rb_node);
813
814                 list_add_tail(&edge->list[UPPER], &upper->lower);
815
816                 list_for_each_entry(edge, &upper->upper, list[LOWER])
817                         list_add_tail(&edge->list[UPPER], &list);
818         }
819 out:
820         btrfs_free_path(path1);
821         btrfs_free_path(path2);
822         if (err) {
823                 INIT_LIST_HEAD(&list);
824                 upper = node;
825                 while (upper) {
826                         if (RB_EMPTY_NODE(&upper->rb_node)) {
827                                 list_splice_tail(&upper->upper, &list);
828                                 kfree(upper);
829                         }
830
831                         if (list_empty(&list))
832                                 break;
833
834                         edge = list_entry(list.next, struct backref_edge,
835                                           list[LOWER]);
836                         upper = edge->node[UPPER];
837                         kfree(edge);
838                 }
839                 return ERR_PTR(err);
840         }
841         return node;
842 }
843
844 /*
845  * helper to add 'address of tree root -> reloc tree' mapping
846  */
847 static int __add_reloc_root(struct btrfs_root *root)
848 {
849         struct rb_node *rb_node;
850         struct mapping_node *node;
851         struct reloc_control *rc = root->fs_info->reloc_ctl;
852
853         node = kmalloc(sizeof(*node), GFP_NOFS);
854         BUG_ON(!node);
855
856         node->bytenr = root->node->start;
857         node->data = root;
858
859         spin_lock(&rc->reloc_root_tree.lock);
860         rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
861                               node->bytenr, &node->rb_node);
862         spin_unlock(&rc->reloc_root_tree.lock);
863         BUG_ON(rb_node);
864
865         list_add_tail(&root->root_list, &rc->reloc_roots);
866         return 0;
867 }
868
869 /*
870  * helper to update/delete the 'address of tree root -> reloc tree'
871  * mapping
872  */
873 static int __update_reloc_root(struct btrfs_root *root, int del)
874 {
875         struct rb_node *rb_node;
876         struct mapping_node *node = NULL;
877         struct reloc_control *rc = root->fs_info->reloc_ctl;
878
879         spin_lock(&rc->reloc_root_tree.lock);
880         rb_node = tree_search(&rc->reloc_root_tree.rb_root,
881                               root->commit_root->start);
882         if (rb_node) {
883                 node = rb_entry(rb_node, struct mapping_node, rb_node);
884                 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
885         }
886         spin_unlock(&rc->reloc_root_tree.lock);
887
888         BUG_ON((struct btrfs_root *)node->data != root);
889
890         if (!del) {
891                 spin_lock(&rc->reloc_root_tree.lock);
892                 node->bytenr = root->node->start;
893                 rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
894                                       node->bytenr, &node->rb_node);
895                 spin_unlock(&rc->reloc_root_tree.lock);
896                 BUG_ON(rb_node);
897         } else {
898                 list_del_init(&root->root_list);
899                 kfree(node);
900         }
901         return 0;
902 }
903
904 /*
905  * create reloc tree for a given fs tree. reloc tree is just a
906  * snapshot of the fs tree with special root objectid.
907  */
908 int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
909                           struct btrfs_root *root)
910 {
911         struct btrfs_root *reloc_root;
912         struct extent_buffer *eb;
913         struct btrfs_root_item *root_item;
914         struct btrfs_key root_key;
915         int ret;
916
917         if (root->reloc_root) {
918                 reloc_root = root->reloc_root;
919                 reloc_root->last_trans = trans->transid;
920                 return 0;
921         }
922
923         if (!root->fs_info->reloc_ctl ||
924             !root->fs_info->reloc_ctl->create_reloc_root ||
925             root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
926                 return 0;
927
928         root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
929         BUG_ON(!root_item);
930
931         root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
932         root_key.type = BTRFS_ROOT_ITEM_KEY;
933         root_key.offset = root->root_key.objectid;
934
935         ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
936                               BTRFS_TREE_RELOC_OBJECTID);
937         BUG_ON(ret);
938
939         btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
940         memcpy(root_item, &root->root_item, sizeof(*root_item));
941         btrfs_set_root_refs(root_item, 1);
942         btrfs_set_root_bytenr(root_item, eb->start);
943         btrfs_set_root_level(root_item, btrfs_header_level(eb));
944         btrfs_set_root_generation(root_item, trans->transid);
945         memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key));
946         root_item->drop_level = 0;
947
948         btrfs_tree_unlock(eb);
949         free_extent_buffer(eb);
950
951         ret = btrfs_insert_root(trans, root->fs_info->tree_root,
952                                 &root_key, root_item);
953         BUG_ON(ret);
954         kfree(root_item);
955
956         reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
957                                                  &root_key);
958         BUG_ON(IS_ERR(reloc_root));
959         reloc_root->last_trans = trans->transid;
960
961         __add_reloc_root(reloc_root);
962         root->reloc_root = reloc_root;
963         return 0;
964 }
965
966 /*
967  * update root item of reloc tree
968  */
969 int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
970                             struct btrfs_root *root)
971 {
972         struct btrfs_root *reloc_root;
973         struct btrfs_root_item *root_item;
974         int del = 0;
975         int ret;
976
977         if (!root->reloc_root)
978                 return 0;
979
980         reloc_root = root->reloc_root;
981         root_item = &reloc_root->root_item;
982
983         if (btrfs_root_refs(root_item) == 0) {
984                 root->reloc_root = NULL;
985                 del = 1;
986         }
987
988         __update_reloc_root(reloc_root, del);
989
990         if (reloc_root->commit_root != reloc_root->node) {
991                 btrfs_set_root_node(root_item, reloc_root->node);
992                 free_extent_buffer(reloc_root->commit_root);
993                 reloc_root->commit_root = btrfs_root_node(reloc_root);
994         }
995
996         ret = btrfs_update_root(trans, root->fs_info->tree_root,
997                                 &reloc_root->root_key, root_item);
998         BUG_ON(ret);
999         return 0;
1000 }
1001
1002 /*
1003  * helper to find first cached inode with inode number >= objectid
1004  * in a subvolume
1005  */
1006 static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
1007 {
1008         struct rb_node *node;
1009         struct rb_node *prev;
1010         struct btrfs_inode *entry;
1011         struct inode *inode;
1012
1013         spin_lock(&root->inode_lock);
1014 again:
1015         node = root->inode_tree.rb_node;
1016         prev = NULL;
1017         while (node) {
1018                 prev = node;
1019                 entry = rb_entry(node, struct btrfs_inode, rb_node);
1020
1021                 if (objectid < entry->vfs_inode.i_ino)
1022                         node = node->rb_left;
1023                 else if (objectid > entry->vfs_inode.i_ino)
1024                         node = node->rb_right;
1025                 else
1026                         break;
1027         }
1028         if (!node) {
1029                 while (prev) {
1030                         entry = rb_entry(prev, struct btrfs_inode, rb_node);
1031                         if (objectid <= entry->vfs_inode.i_ino) {
1032                                 node = prev;
1033                                 break;
1034                         }
1035                         prev = rb_next(prev);
1036                 }
1037         }
1038         while (node) {
1039                 entry = rb_entry(node, struct btrfs_inode, rb_node);
1040                 inode = igrab(&entry->vfs_inode);
1041                 if (inode) {
1042                         spin_unlock(&root->inode_lock);
1043                         return inode;
1044                 }
1045
1046                 objectid = entry->vfs_inode.i_ino + 1;
1047                 if (cond_resched_lock(&root->inode_lock))
1048                         goto again;
1049
1050                 node = rb_next(node);
1051         }
1052         spin_unlock(&root->inode_lock);
1053         return NULL;
1054 }
1055
1056 static int in_block_group(u64 bytenr,
1057                           struct btrfs_block_group_cache *block_group)
1058 {
1059         if (bytenr >= block_group->key.objectid &&
1060             bytenr < block_group->key.objectid + block_group->key.offset)
1061                 return 1;
1062         return 0;
1063 }
1064
1065 /*
1066  * get new location of data
1067  */
1068 static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1069                             u64 bytenr, u64 num_bytes)
1070 {
1071         struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1072         struct btrfs_path *path;
1073         struct btrfs_file_extent_item *fi;
1074         struct extent_buffer *leaf;
1075         int ret;
1076
1077         path = btrfs_alloc_path();
1078         if (!path)
1079                 return -ENOMEM;
1080
1081         bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1082         ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
1083                                        bytenr, 0);
1084         if (ret < 0)
1085                 goto out;
1086         if (ret > 0) {
1087                 ret = -ENOENT;
1088                 goto out;
1089         }
1090
1091         leaf = path->nodes[0];
1092         fi = btrfs_item_ptr(leaf, path->slots[0],
1093                             struct btrfs_file_extent_item);
1094
1095         BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1096                btrfs_file_extent_compression(leaf, fi) ||
1097                btrfs_file_extent_encryption(leaf, fi) ||
1098                btrfs_file_extent_other_encoding(leaf, fi));
1099
1100         if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
1101                 ret = 1;
1102                 goto out;
1103         }
1104
1105         if (new_bytenr)
1106                 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1107         ret = 0;
1108 out:
1109         btrfs_free_path(path);
1110         return ret;
1111 }
1112
1113 /*
1114  * update file extent items in the tree leaf to point to
1115  * the new locations.
1116  */
1117 static int replace_file_extents(struct btrfs_trans_handle *trans,
1118                                 struct reloc_control *rc,
1119                                 struct btrfs_root *root,
1120                                 struct extent_buffer *leaf,
1121                                 struct list_head *inode_list)
1122 {
1123         struct btrfs_key key;
1124         struct btrfs_file_extent_item *fi;
1125         struct inode *inode = NULL;
1126         struct inodevec *ivec = NULL;
1127         u64 parent;
1128         u64 bytenr;
1129         u64 new_bytenr;
1130         u64 num_bytes;
1131         u64 end;
1132         u32 nritems;
1133         u32 i;
1134         int ret;
1135         int first = 1;
1136         int dirty = 0;
1137
1138         if (rc->stage != UPDATE_DATA_PTRS)
1139                 return 0;
1140
1141         /* reloc trees always use full backref */
1142         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1143                 parent = leaf->start;
1144         else
1145                 parent = 0;
1146
1147         nritems = btrfs_header_nritems(leaf);
1148         for (i = 0; i < nritems; i++) {
1149                 cond_resched();
1150                 btrfs_item_key_to_cpu(leaf, &key, i);
1151                 if (key.type != BTRFS_EXTENT_DATA_KEY)
1152                         continue;
1153                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1154                 if (btrfs_file_extent_type(leaf, fi) ==
1155                     BTRFS_FILE_EXTENT_INLINE)
1156                         continue;
1157                 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1158                 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1159                 if (bytenr == 0)
1160                         continue;
1161                 if (!in_block_group(bytenr, rc->block_group))
1162                         continue;
1163
1164                 /*
1165                  * if we are modifying block in fs tree, wait for readpage
1166                  * to complete and drop the extent cache
1167                  */
1168                 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1169                         if (!ivec || ivec->nr == INODEVEC_SIZE) {
1170                                 ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
1171                                 BUG_ON(!ivec);
1172                                 ivec->nr = 0;
1173                                 list_add_tail(&ivec->list, inode_list);
1174                         }
1175                         if (first) {
1176                                 inode = find_next_inode(root, key.objectid);
1177                                 if (inode)
1178                                         ivec->inode[ivec->nr++] = inode;
1179                                 first = 0;
1180                         } else if (inode && inode->i_ino < key.objectid) {
1181                                 inode = find_next_inode(root, key.objectid);
1182                                 if (inode)
1183                                         ivec->inode[ivec->nr++] = inode;
1184                         }
1185                         if (inode && inode->i_ino == key.objectid) {
1186                                 end = key.offset +
1187                                       btrfs_file_extent_num_bytes(leaf, fi);
1188                                 WARN_ON(!IS_ALIGNED(key.offset,
1189                                                     root->sectorsize));
1190                                 WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1191                                 end--;
1192                                 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1193                                                       key.offset, end,
1194                                                       GFP_NOFS);
1195                                 if (!ret)
1196                                         continue;
1197
1198                                 btrfs_drop_extent_cache(inode, key.offset, end,
1199                                                         1);
1200                                 unlock_extent(&BTRFS_I(inode)->io_tree,
1201                                               key.offset, end, GFP_NOFS);
1202                         }
1203                 }
1204
1205                 ret = get_new_location(rc->data_inode, &new_bytenr,
1206                                        bytenr, num_bytes);
1207                 if (ret > 0)
1208                         continue;
1209                 BUG_ON(ret < 0);
1210
1211                 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1212                 dirty = 1;
1213
1214                 key.offset -= btrfs_file_extent_offset(leaf, fi);
1215                 ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
1216                                            num_bytes, parent,
1217                                            btrfs_header_owner(leaf),
1218                                            key.objectid, key.offset);
1219                 BUG_ON(ret);
1220
1221                 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1222                                         parent, btrfs_header_owner(leaf),
1223                                         key.objectid, key.offset);
1224                 BUG_ON(ret);
1225         }
1226         if (dirty)
1227                 btrfs_mark_buffer_dirty(leaf);
1228         return 0;
1229 }
1230
1231 static noinline_for_stack
1232 int memcmp_node_keys(struct extent_buffer *eb, int slot,
1233                      struct btrfs_path *path, int level)
1234 {
1235         struct btrfs_disk_key key1;
1236         struct btrfs_disk_key key2;
1237         btrfs_node_key(eb, &key1, slot);
1238         btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1239         return memcmp(&key1, &key2, sizeof(key1));
1240 }
1241
1242 /*
1243  * try to replace tree blocks in fs tree with the new blocks
1244  * in reloc tree. tree blocks haven't been modified since the
1245  * reloc tree was create can be replaced.
1246  *
1247  * if a block was replaced, level of the block + 1 is returned.
1248  * if no block got replaced, 0 is returned. if there are other
1249  * errors, a negative error number is returned.
1250  */
1251 static int replace_path(struct btrfs_trans_handle *trans,
1252                         struct btrfs_root *dest, struct btrfs_root *src,
1253                         struct btrfs_path *path, struct btrfs_key *next_key,
1254                         struct extent_buffer **leaf,
1255                         int lowest_level, int max_level)
1256 {
1257         struct extent_buffer *eb;
1258         struct extent_buffer *parent;
1259         struct btrfs_key key;
1260         u64 old_bytenr;
1261         u64 new_bytenr;
1262         u64 old_ptr_gen;
1263         u64 new_ptr_gen;
1264         u64 last_snapshot;
1265         u32 blocksize;
1266         int level;
1267         int ret;
1268         int slot;
1269
1270         BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1271         BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1272         BUG_ON(lowest_level > 1 && leaf);
1273
1274         last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1275
1276         slot = path->slots[lowest_level];
1277         btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1278
1279         eb = btrfs_lock_root_node(dest);
1280         btrfs_set_lock_blocking(eb);
1281         level = btrfs_header_level(eb);
1282
1283         if (level < lowest_level) {
1284                 btrfs_tree_unlock(eb);
1285                 free_extent_buffer(eb);
1286                 return 0;
1287         }
1288
1289         ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1290         BUG_ON(ret);
1291         btrfs_set_lock_blocking(eb);
1292
1293         if (next_key) {
1294                 next_key->objectid = (u64)-1;
1295                 next_key->type = (u8)-1;
1296                 next_key->offset = (u64)-1;
1297         }
1298
1299         parent = eb;
1300         while (1) {
1301                 level = btrfs_header_level(parent);
1302                 BUG_ON(level < lowest_level);
1303
1304                 ret = btrfs_bin_search(parent, &key, level, &slot);
1305                 if (ret && slot > 0)
1306                         slot--;
1307
1308                 if (next_key && slot + 1 < btrfs_header_nritems(parent))
1309                         btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1310
1311                 old_bytenr = btrfs_node_blockptr(parent, slot);
1312                 blocksize = btrfs_level_size(dest, level - 1);
1313                 old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1314
1315                 if (level <= max_level) {
1316                         eb = path->nodes[level];
1317                         new_bytenr = btrfs_node_blockptr(eb,
1318                                                         path->slots[level]);
1319                         new_ptr_gen = btrfs_node_ptr_generation(eb,
1320                                                         path->slots[level]);
1321                 } else {
1322                         new_bytenr = 0;
1323                         new_ptr_gen = 0;
1324                 }
1325
1326                 if (new_bytenr > 0 && new_bytenr == old_bytenr) {
1327                         WARN_ON(1);
1328                         ret = level;
1329                         break;
1330                 }
1331
1332                 if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1333                     memcmp_node_keys(parent, slot, path, level)) {
1334                         if (level <= lowest_level && !leaf) {
1335                                 ret = 0;
1336                                 break;
1337                         }
1338
1339                         eb = read_tree_block(dest, old_bytenr, blocksize,
1340                                              old_ptr_gen);
1341                         btrfs_tree_lock(eb);
1342                         ret = btrfs_cow_block(trans, dest, eb, parent,
1343                                               slot, &eb);
1344                         BUG_ON(ret);
1345                         btrfs_set_lock_blocking(eb);
1346
1347                         if (level <= lowest_level) {
1348                                 *leaf = eb;
1349                                 ret = 0;
1350                                 break;
1351                         }
1352
1353                         btrfs_tree_unlock(parent);
1354                         free_extent_buffer(parent);
1355
1356                         parent = eb;
1357                         continue;
1358                 }
1359
1360                 btrfs_node_key_to_cpu(path->nodes[level], &key,
1361                                       path->slots[level]);
1362                 btrfs_release_path(src, path);
1363
1364                 path->lowest_level = level;
1365                 ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1366                 path->lowest_level = 0;
1367                 BUG_ON(ret);
1368
1369                 /*
1370                  * swap blocks in fs tree and reloc tree.
1371                  */
1372                 btrfs_set_node_blockptr(parent, slot, new_bytenr);
1373                 btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1374                 btrfs_mark_buffer_dirty(parent);
1375
1376                 btrfs_set_node_blockptr(path->nodes[level],
1377                                         path->slots[level], old_bytenr);
1378                 btrfs_set_node_ptr_generation(path->nodes[level],
1379                                               path->slots[level], old_ptr_gen);
1380                 btrfs_mark_buffer_dirty(path->nodes[level]);
1381
1382                 ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
1383                                         path->nodes[level]->start,
1384                                         src->root_key.objectid, level - 1, 0);
1385                 BUG_ON(ret);
1386                 ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
1387                                         0, dest->root_key.objectid, level - 1,
1388                                         0);
1389                 BUG_ON(ret);
1390
1391                 ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
1392                                         path->nodes[level]->start,
1393                                         src->root_key.objectid, level - 1, 0);
1394                 BUG_ON(ret);
1395
1396                 ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
1397                                         0, dest->root_key.objectid, level - 1,
1398                                         0);
1399                 BUG_ON(ret);
1400
1401                 btrfs_unlock_up_safe(path, 0);
1402
1403                 ret = level;
1404                 break;
1405         }
1406         btrfs_tree_unlock(parent);
1407         free_extent_buffer(parent);
1408         return ret;
1409 }
1410
1411 /*
1412  * helper to find next relocated block in reloc tree
1413  */
1414 static noinline_for_stack
1415 int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1416                        int *level)
1417 {
1418         struct extent_buffer *eb;
1419         int i;
1420         u64 last_snapshot;
1421         u32 nritems;
1422
1423         last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1424
1425         for (i = 0; i < *level; i++) {
1426                 free_extent_buffer(path->nodes[i]);
1427                 path->nodes[i] = NULL;
1428         }
1429
1430         for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1431                 eb = path->nodes[i];
1432                 nritems = btrfs_header_nritems(eb);
1433                 while (path->slots[i] + 1 < nritems) {
1434                         path->slots[i]++;
1435                         if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
1436                             last_snapshot)
1437                                 continue;
1438
1439                         *level = i;
1440                         return 0;
1441                 }
1442                 free_extent_buffer(path->nodes[i]);
1443                 path->nodes[i] = NULL;
1444         }
1445         return 1;
1446 }
1447
1448 /*
1449  * walk down reloc tree to find relocated block of lowest level
1450  */
1451 static noinline_for_stack
1452 int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1453                          int *level)
1454 {
1455         struct extent_buffer *eb = NULL;
1456         int i;
1457         u64 bytenr;
1458         u64 ptr_gen = 0;
1459         u64 last_snapshot;
1460         u32 blocksize;
1461         u32 nritems;
1462
1463         last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1464
1465         for (i = *level; i > 0; i--) {
1466                 eb = path->nodes[i];
1467                 nritems = btrfs_header_nritems(eb);
1468                 while (path->slots[i] < nritems) {
1469                         ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
1470                         if (ptr_gen > last_snapshot)
1471                                 break;
1472                         path->slots[i]++;
1473                 }
1474                 if (path->slots[i] >= nritems) {
1475                         if (i == *level)
1476                                 break;
1477                         *level = i + 1;
1478                         return 0;
1479                 }
1480                 if (i == 1) {
1481                         *level = i;
1482                         return 0;
1483                 }
1484
1485                 bytenr = btrfs_node_blockptr(eb, path->slots[i]);
1486                 blocksize = btrfs_level_size(root, i - 1);
1487                 eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
1488                 BUG_ON(btrfs_header_level(eb) != i - 1);
1489                 path->nodes[i - 1] = eb;
1490                 path->slots[i - 1] = 0;
1491         }
1492         return 1;
1493 }
1494
1495 /*
1496  * invalidate extent cache for file extents whose key in range of
1497  * [min_key, max_key)
1498  */
1499 static int invalidate_extent_cache(struct btrfs_root *root,
1500                                    struct btrfs_key *min_key,
1501                                    struct btrfs_key *max_key)
1502 {
1503         struct inode *inode = NULL;
1504         u64 objectid;
1505         u64 start, end;
1506
1507         objectid = min_key->objectid;
1508         while (1) {
1509                 cond_resched();
1510                 iput(inode);
1511
1512                 if (objectid > max_key->objectid)
1513                         break;
1514
1515                 inode = find_next_inode(root, objectid);
1516                 if (!inode)
1517                         break;
1518
1519                 if (inode->i_ino > max_key->objectid) {
1520                         iput(inode);
1521                         break;
1522                 }
1523
1524                 objectid = inode->i_ino + 1;
1525                 if (!S_ISREG(inode->i_mode))
1526                         continue;
1527
1528                 if (unlikely(min_key->objectid == inode->i_ino)) {
1529                         if (min_key->type > BTRFS_EXTENT_DATA_KEY)
1530                                 continue;
1531                         if (min_key->type < BTRFS_EXTENT_DATA_KEY)
1532                                 start = 0;
1533                         else {
1534                                 start = min_key->offset;
1535                                 WARN_ON(!IS_ALIGNED(start, root->sectorsize));
1536                         }
1537                 } else {
1538                         start = 0;
1539                 }
1540
1541                 if (unlikely(max_key->objectid == inode->i_ino)) {
1542                         if (max_key->type < BTRFS_EXTENT_DATA_KEY)
1543                                 continue;
1544                         if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
1545                                 end = (u64)-1;
1546                         } else {
1547                                 if (max_key->offset == 0)
1548                                         continue;
1549                                 end = max_key->offset;
1550                                 WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1551                                 end--;
1552                         }
1553                 } else {
1554                         end = (u64)-1;
1555                 }
1556
1557                 /* the lock_extent waits for readpage to complete */
1558                 lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
1559                 btrfs_drop_extent_cache(inode, start, end, 1);
1560                 unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
1561         }
1562         return 0;
1563 }
1564
1565 static void put_inodes(struct list_head *list)
1566 {
1567         struct inodevec *ivec;
1568         while (!list_empty(list)) {
1569                 ivec = list_entry(list->next, struct inodevec, list);
1570                 list_del(&ivec->list);
1571                 while (ivec->nr > 0) {
1572                         ivec->nr--;
1573                         iput(ivec->inode[ivec->nr]);
1574                 }
1575                 kfree(ivec);
1576         }
1577 }
1578
1579 static int find_next_key(struct btrfs_path *path, int level,
1580                          struct btrfs_key *key)
1581
1582 {
1583         while (level < BTRFS_MAX_LEVEL) {
1584                 if (!path->nodes[level])
1585                         break;
1586                 if (path->slots[level] + 1 <
1587                     btrfs_header_nritems(path->nodes[level])) {
1588                         btrfs_node_key_to_cpu(path->nodes[level], key,
1589                                               path->slots[level] + 1);
1590                         return 0;
1591                 }
1592                 level++;
1593         }
1594         return 1;
1595 }
1596
1597 /*
1598  * merge the relocated tree blocks in reloc tree with corresponding
1599  * fs tree.
1600  */
1601 static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1602                                                struct btrfs_root *root)
1603 {
1604         LIST_HEAD(inode_list);
1605         struct btrfs_key key;
1606         struct btrfs_key next_key;
1607         struct btrfs_trans_handle *trans;
1608         struct btrfs_root *reloc_root;
1609         struct btrfs_root_item *root_item;
1610         struct btrfs_path *path;
1611         struct extent_buffer *leaf = NULL;
1612         unsigned long nr;
1613         int level;
1614         int max_level;
1615         int replaced = 0;
1616         int ret;
1617         int err = 0;
1618
1619         path = btrfs_alloc_path();
1620         if (!path)
1621                 return -ENOMEM;
1622
1623         reloc_root = root->reloc_root;
1624         root_item = &reloc_root->root_item;
1625
1626         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
1627                 level = btrfs_root_level(root_item);
1628                 extent_buffer_get(reloc_root->node);
1629                 path->nodes[level] = reloc_root->node;
1630                 path->slots[level] = 0;
1631         } else {
1632                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
1633
1634                 level = root_item->drop_level;
1635                 BUG_ON(level == 0);
1636                 path->lowest_level = level;
1637                 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
1638                 path->lowest_level = 0;
1639                 if (ret < 0) {
1640                         btrfs_free_path(path);
1641                         return ret;
1642                 }
1643
1644                 btrfs_node_key_to_cpu(path->nodes[level], &next_key,
1645                                       path->slots[level]);
1646                 WARN_ON(memcmp(&key, &next_key, sizeof(key)));
1647
1648                 btrfs_unlock_up_safe(path, 0);
1649         }
1650
1651         if (level == 0 && rc->stage == UPDATE_DATA_PTRS) {
1652                 trans = btrfs_start_transaction(root, 1);
1653
1654                 leaf = path->nodes[0];
1655                 btrfs_item_key_to_cpu(leaf, &key, 0);
1656                 btrfs_release_path(reloc_root, path);
1657
1658                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1659                 if (ret < 0) {
1660                         err = ret;
1661                         goto out;
1662                 }
1663
1664                 leaf = path->nodes[0];
1665                 btrfs_unlock_up_safe(path, 1);
1666                 ret = replace_file_extents(trans, rc, root, leaf,
1667                                            &inode_list);
1668                 if (ret < 0)
1669                         err = ret;
1670                 goto out;
1671         }
1672
1673         memset(&next_key, 0, sizeof(next_key));
1674
1675         while (1) {
1676                 leaf = NULL;
1677                 replaced = 0;
1678                 trans = btrfs_start_transaction(root, 1);
1679                 max_level = level;
1680
1681                 ret = walk_down_reloc_tree(reloc_root, path, &level);
1682                 if (ret < 0) {
1683                         err = ret;
1684                         goto out;
1685                 }
1686                 if (ret > 0)
1687                         break;
1688
1689                 if (!find_next_key(path, level, &key) &&
1690                     btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
1691                         ret = 0;
1692                 } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
1693                         ret = replace_path(trans, root, reloc_root,
1694                                            path, &next_key, &leaf,
1695                                            level, max_level);
1696                 } else {
1697                         ret = replace_path(trans, root, reloc_root,
1698                                            path, &next_key, NULL,
1699                                            level, max_level);
1700                 }
1701                 if (ret < 0) {
1702                         err = ret;
1703                         goto out;
1704                 }
1705
1706                 if (ret > 0) {
1707                         level = ret;
1708                         btrfs_node_key_to_cpu(path->nodes[level], &key,
1709                                               path->slots[level]);
1710                         replaced = 1;
1711                 } else if (leaf) {
1712                         /*
1713                          * no block got replaced, try replacing file extents
1714                          */
1715                         btrfs_item_key_to_cpu(leaf, &key, 0);
1716                         ret = replace_file_extents(trans, rc, root, leaf,
1717                                                    &inode_list);
1718                         btrfs_tree_unlock(leaf);
1719                         free_extent_buffer(leaf);
1720                         BUG_ON(ret < 0);
1721                 }
1722
1723                 ret = walk_up_reloc_tree(reloc_root, path, &level);
1724                 if (ret > 0)
1725                         break;
1726
1727                 BUG_ON(level == 0);
1728                 /*
1729                  * save the merging progress in the drop_progress.
1730                  * this is OK since root refs == 1 in this case.
1731                  */
1732                 btrfs_node_key(path->nodes[level], &root_item->drop_progress,
1733                                path->slots[level]);
1734                 root_item->drop_level = level;
1735
1736                 nr = trans->blocks_used;
1737                 btrfs_end_transaction(trans, root);
1738
1739                 btrfs_btree_balance_dirty(root, nr);
1740
1741                 /*
1742                  * put inodes outside transaction, otherwise we may deadlock.
1743                  */
1744                 put_inodes(&inode_list);
1745
1746                 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1747                         invalidate_extent_cache(root, &key, &next_key);
1748         }
1749
1750         /*
1751          * handle the case only one block in the fs tree need to be
1752          * relocated and the block is tree root.
1753          */
1754         leaf = btrfs_lock_root_node(root);
1755         ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
1756         btrfs_tree_unlock(leaf);
1757         free_extent_buffer(leaf);
1758         if (ret < 0)
1759                 err = ret;
1760 out:
1761         btrfs_free_path(path);
1762
1763         if (err == 0) {
1764                 memset(&root_item->drop_progress, 0,
1765                        sizeof(root_item->drop_progress));
1766                 root_item->drop_level = 0;
1767                 btrfs_set_root_refs(root_item, 0);
1768         }
1769
1770         nr = trans->blocks_used;
1771         btrfs_end_transaction(trans, root);
1772
1773         btrfs_btree_balance_dirty(root, nr);
1774
1775         put_inodes(&inode_list);
1776
1777         if (replaced && rc->stage == UPDATE_DATA_PTRS)
1778                 invalidate_extent_cache(root, &key, &next_key);
1779
1780         return err;
1781 }
1782
1783 /*
1784  * callback for the work threads.
1785  * this function merges reloc tree with corresponding fs tree,
1786  * and then drops the reloc tree.
1787  */
1788 static void merge_func(struct btrfs_work *work)
1789 {
1790         struct btrfs_trans_handle *trans;
1791         struct btrfs_root *root;
1792         struct btrfs_root *reloc_root;
1793         struct async_merge *async;
1794
1795         async = container_of(work, struct async_merge, work);
1796         reloc_root = async->root;
1797
1798         if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1799                 root = read_fs_root(reloc_root->fs_info,
1800                                     reloc_root->root_key.offset);
1801                 BUG_ON(IS_ERR(root));
1802                 BUG_ON(root->reloc_root != reloc_root);
1803
1804                 merge_reloc_root(async->rc, root);
1805
1806                 trans = btrfs_start_transaction(root, 1);
1807                 btrfs_update_reloc_root(trans, root);
1808                 btrfs_end_transaction(trans, root);
1809         }
1810
1811         btrfs_drop_snapshot(reloc_root, 0);
1812
1813         if (atomic_dec_and_test(async->num_pending))
1814                 complete(async->done);
1815
1816         kfree(async);
1817 }
1818
1819 static int merge_reloc_roots(struct reloc_control *rc)
1820 {
1821         struct async_merge *async;
1822         struct btrfs_root *root;
1823         struct completion done;
1824         atomic_t num_pending;
1825
1826         init_completion(&done);
1827         atomic_set(&num_pending, 1);
1828
1829         while (!list_empty(&rc->reloc_roots)) {
1830                 root = list_entry(rc->reloc_roots.next,
1831                                   struct btrfs_root, root_list);
1832                 list_del_init(&root->root_list);
1833
1834                 async = kmalloc(sizeof(*async), GFP_NOFS);
1835                 BUG_ON(!async);
1836                 async->work.func = merge_func;
1837                 async->work.flags = 0;
1838                 async->rc = rc;
1839                 async->root = root;
1840                 async->done = &done;
1841                 async->num_pending = &num_pending;
1842                 atomic_inc(&num_pending);
1843                 btrfs_queue_worker(&rc->workers, &async->work);
1844         }
1845
1846         if (!atomic_dec_and_test(&num_pending))
1847                 wait_for_completion(&done);
1848
1849         BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
1850         return 0;
1851 }
1852
1853 static void free_block_list(struct rb_root *blocks)
1854 {
1855         struct tree_block *block;
1856         struct rb_node *rb_node;
1857         while ((rb_node = rb_first(blocks))) {
1858                 block = rb_entry(rb_node, struct tree_block, rb_node);
1859                 rb_erase(rb_node, blocks);
1860                 kfree(block);
1861         }
1862 }
1863
1864 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
1865                                       struct btrfs_root *reloc_root)
1866 {
1867         struct btrfs_root *root;
1868
1869         if (reloc_root->last_trans == trans->transid)
1870                 return 0;
1871
1872         root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset);
1873         BUG_ON(IS_ERR(root));
1874         BUG_ON(root->reloc_root != reloc_root);
1875
1876         return btrfs_record_root_in_trans(trans, root);
1877 }
1878
1879 /*
1880  * select one tree from trees that references the block.
1881  * for blocks in refernce counted trees, we preper reloc tree.
1882  * if no reloc tree found and reloc_only is true, NULL is returned.
1883  */
1884 static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
1885                                             struct backref_node *node,
1886                                             struct backref_edge *edges[],
1887                                             int *nr, int reloc_only)
1888 {
1889         struct backref_node *next;
1890         struct btrfs_root *root;
1891         int index;
1892         int loop = 0;
1893 again:
1894         index = 0;
1895         next = node;
1896         while (1) {
1897                 cond_resched();
1898                 next = walk_up_backref(next, edges, &index);
1899                 root = next->root;
1900                 if (!root) {
1901                         BUG_ON(!node->old_root);
1902                         goto skip;
1903                 }
1904
1905                 /* no other choice for non-refernce counted tree */
1906                 if (!root->ref_cows) {
1907                         BUG_ON(reloc_only);
1908                         break;
1909                 }
1910
1911                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
1912                         record_reloc_root_in_trans(trans, root);
1913                         break;
1914                 }
1915
1916                 if (loop) {
1917                         btrfs_record_root_in_trans(trans, root);
1918                         break;
1919                 }
1920
1921                 if (reloc_only || next != node) {
1922                         if (!root->reloc_root)
1923                                 btrfs_record_root_in_trans(trans, root);
1924                         root = root->reloc_root;
1925                         /*
1926                          * if the reloc tree was created in current
1927                          * transation, there is no node in backref tree
1928                          * corresponds to the root of the reloc tree.
1929                          */
1930                         if (btrfs_root_last_snapshot(&root->root_item) ==
1931                             trans->transid - 1)
1932                                 break;
1933                 }
1934 skip:
1935                 root = NULL;
1936                 next = walk_down_backref(edges, &index);
1937                 if (!next || next->level <= node->level)
1938                         break;
1939         }
1940
1941         if (!root && !loop && !reloc_only) {
1942                 loop = 1;
1943                 goto again;
1944         }
1945
1946         if (root)
1947                 *nr = index;
1948         else
1949                 *nr = 0;
1950
1951         return root;
1952 }
1953
1954 static noinline_for_stack
1955 struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
1956                                    struct backref_node *node)
1957 {
1958         struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
1959         int nr;
1960         return __select_one_root(trans, node, edges, &nr, 0);
1961 }
1962
1963 static noinline_for_stack
1964 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
1965                                      struct backref_node *node,
1966                                      struct backref_edge *edges[], int *nr)
1967 {
1968         return __select_one_root(trans, node, edges, nr, 1);
1969 }
1970
1971 static void grab_path_buffers(struct btrfs_path *path,
1972                               struct backref_node *node,
1973                               struct backref_edge *edges[], int nr)
1974 {
1975         int i = 0;
1976         while (1) {
1977                 drop_node_buffer(node);
1978                 node->eb = path->nodes[node->level];
1979                 BUG_ON(!node->eb);
1980                 if (path->locks[node->level])
1981                         node->locked = 1;
1982                 path->nodes[node->level] = NULL;
1983                 path->locks[node->level] = 0;
1984
1985                 if (i >= nr)
1986                         break;
1987
1988                 edges[i]->blockptr = node->eb->start;
1989                 node = edges[i]->node[UPPER];
1990                 i++;
1991         }
1992 }
1993
1994 /*
1995  * relocate a block tree, and then update pointers in upper level
1996  * blocks that reference the block to point to the new location.
1997  *
1998  * if called by link_to_upper, the block has already been relocated.
1999  * in that case this function just updates pointers.
2000  */
2001 static int do_relocation(struct btrfs_trans_handle *trans,
2002                          struct backref_node *node,
2003                          struct btrfs_key *key,
2004                          struct btrfs_path *path, int lowest)
2005 {
2006         struct backref_node *upper;
2007         struct backref_edge *edge;
2008         struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2009         struct btrfs_root *root;
2010         struct extent_buffer *eb;
2011         u32 blocksize;
2012         u64 bytenr;
2013         u64 generation;
2014         int nr;
2015         int slot;
2016         int ret;
2017         int err = 0;
2018
2019         BUG_ON(lowest && node->eb);
2020
2021         path->lowest_level = node->level + 1;
2022         list_for_each_entry(edge, &node->upper, list[LOWER]) {
2023                 cond_resched();
2024                 if (node->eb && node->eb->start == edge->blockptr)
2025                         continue;
2026
2027                 upper = edge->node[UPPER];
2028                 root = select_reloc_root(trans, upper, edges, &nr);
2029                 if (!root)
2030                         continue;
2031
2032                 if (upper->eb && !upper->locked)
2033                         drop_node_buffer(upper);
2034
2035                 if (!upper->eb) {
2036                         ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2037                         if (ret < 0) {
2038                                 err = ret;
2039                                 break;
2040                         }
2041                         BUG_ON(ret > 0);
2042
2043                         slot = path->slots[upper->level];
2044
2045                         btrfs_unlock_up_safe(path, upper->level + 1);
2046                         grab_path_buffers(path, upper, edges, nr);
2047
2048                         btrfs_release_path(NULL, path);
2049                 } else {
2050                         ret = btrfs_bin_search(upper->eb, key, upper->level,
2051                                                &slot);
2052                         BUG_ON(ret);
2053                 }
2054
2055                 bytenr = btrfs_node_blockptr(upper->eb, slot);
2056                 if (!lowest) {
2057                         if (node->eb->start == bytenr) {
2058                                 btrfs_tree_unlock(upper->eb);
2059                                 upper->locked = 0;
2060                                 continue;
2061                         }
2062                 } else {
2063                         BUG_ON(node->bytenr != bytenr);
2064                 }
2065
2066                 blocksize = btrfs_level_size(root, node->level);
2067                 generation = btrfs_node_ptr_generation(upper->eb, slot);
2068                 eb = read_tree_block(root, bytenr, blocksize, generation);
2069                 btrfs_tree_lock(eb);
2070                 btrfs_set_lock_blocking(eb);
2071
2072                 if (!node->eb) {
2073                         ret = btrfs_cow_block(trans, root, eb, upper->eb,
2074                                               slot, &eb);
2075                         if (ret < 0) {
2076                                 err = ret;
2077                                 break;
2078                         }
2079                         btrfs_set_lock_blocking(eb);
2080                         node->eb = eb;
2081                         node->locked = 1;
2082                 } else {
2083                         btrfs_set_node_blockptr(upper->eb, slot,
2084                                                 node->eb->start);
2085                         btrfs_set_node_ptr_generation(upper->eb, slot,
2086                                                       trans->transid);
2087                         btrfs_mark_buffer_dirty(upper->eb);
2088
2089                         ret = btrfs_inc_extent_ref(trans, root,
2090                                                 node->eb->start, blocksize,
2091                                                 upper->eb->start,
2092                                                 btrfs_header_owner(upper->eb),
2093                                                 node->level, 0);
2094                         BUG_ON(ret);
2095
2096                         ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2097                         BUG_ON(ret);
2098                 }
2099                 if (!lowest) {
2100                         btrfs_tree_unlock(upper->eb);
2101                         upper->locked = 0;
2102                 }
2103         }
2104         path->lowest_level = 0;
2105         return err;
2106 }
2107
2108 static int link_to_upper(struct btrfs_trans_handle *trans,
2109                          struct backref_node *node,
2110                          struct btrfs_path *path)
2111 {
2112         struct btrfs_key key;
2113         if (!node->eb || list_empty(&node->upper))
2114                 return 0;
2115
2116         btrfs_node_key_to_cpu(node->eb, &key, 0);
2117         return do_relocation(trans, node, &key, path, 0);
2118 }
2119
2120 static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2121                                 struct backref_cache *cache,
2122                                 struct btrfs_path *path)
2123 {
2124         struct backref_node *node;
2125         int level;
2126         int ret;
2127         int err = 0;
2128
2129         for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2130                 while (!list_empty(&cache->pending[level])) {
2131                         node = list_entry(cache->pending[level].next,
2132                                           struct backref_node, lower);
2133                         BUG_ON(node->level != level);
2134
2135                         ret = link_to_upper(trans, node, path);
2136                         if (ret < 0)
2137                                 err = ret;
2138                         /*
2139                          * this remove the node from the pending list and
2140                          * may add some other nodes to the level + 1
2141                          * pending list
2142                          */
2143                         remove_backref_node(cache, node);
2144                 }
2145         }
2146         BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
2147         return err;
2148 }
2149
2150 static void mark_block_processed(struct reloc_control *rc,
2151                                  struct backref_node *node)
2152 {
2153         u32 blocksize;
2154         if (node->level == 0 ||
2155             in_block_group(node->bytenr, rc->block_group)) {
2156                 blocksize = btrfs_level_size(rc->extent_root, node->level);
2157                 set_extent_bits(&rc->processed_blocks, node->bytenr,
2158                                 node->bytenr + blocksize - 1, EXTENT_DIRTY,
2159                                 GFP_NOFS);
2160         }
2161         node->processed = 1;
2162 }
2163
2164 /*
2165  * mark a block and all blocks directly/indirectly reference the block
2166  * as processed.
2167  */
2168 static void update_processed_blocks(struct reloc_control *rc,
2169                                     struct backref_node *node)
2170 {
2171         struct backref_node *next = node;
2172         struct backref_edge *edge;
2173         struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2174         int index = 0;
2175
2176         while (next) {
2177                 cond_resched();
2178                 while (1) {
2179                         if (next->processed)
2180                                 break;
2181
2182                         mark_block_processed(rc, next);
2183
2184                         if (list_empty(&next->upper))
2185                                 break;
2186
2187                         edge = list_entry(next->upper.next,
2188                                           struct backref_edge, list[LOWER]);
2189                         edges[index++] = edge;
2190                         next = edge->node[UPPER];
2191                 }
2192                 next = walk_down_backref(edges, &index);
2193         }
2194 }
2195
2196 static int tree_block_processed(u64 bytenr, u32 blocksize,
2197                                 struct reloc_control *rc)
2198 {
2199         if (test_range_bit(&rc->processed_blocks, bytenr,
2200                            bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2201                 return 1;
2202         return 0;
2203 }
2204
2205 /*
2206  * check if there are any file extent pointers in the leaf point to
2207  * data require processing
2208  */
2209 static int check_file_extents(struct reloc_control *rc,
2210                               u64 bytenr, u32 blocksize, u64 ptr_gen)
2211 {
2212         struct btrfs_key found_key;
2213         struct btrfs_file_extent_item *fi;
2214         struct extent_buffer *leaf;
2215         u32 nritems;
2216         int i;
2217         int ret = 0;
2218
2219         leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
2220
2221         nritems = btrfs_header_nritems(leaf);
2222         for (i = 0; i < nritems; i++) {
2223                 cond_resched();
2224                 btrfs_item_key_to_cpu(leaf, &found_key, i);
2225                 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
2226                         continue;
2227                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
2228                 if (btrfs_file_extent_type(leaf, fi) ==
2229                     BTRFS_FILE_EXTENT_INLINE)
2230                         continue;
2231                 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
2232                 if (bytenr == 0)
2233                         continue;
2234                 if (in_block_group(bytenr, rc->block_group)) {
2235                         ret = 1;
2236                         break;
2237                 }
2238         }
2239         free_extent_buffer(leaf);
2240         return ret;
2241 }
2242
2243 /*
2244  * scan child blocks of a given block to find blocks require processing
2245  */
2246 static int add_child_blocks(struct btrfs_trans_handle *trans,
2247                             struct reloc_control *rc,
2248                             struct backref_node *node,
2249                             struct rb_root *blocks)
2250 {
2251         struct tree_block *block;
2252         struct rb_node *rb_node;
2253         u64 bytenr;
2254         u64 ptr_gen;
2255         u32 blocksize;
2256         u32 nritems;
2257         int i;
2258         int err = 0;
2259
2260         nritems = btrfs_header_nritems(node->eb);
2261         blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
2262         for (i = 0; i < nritems; i++) {
2263                 cond_resched();
2264                 bytenr = btrfs_node_blockptr(node->eb, i);
2265                 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2266                 if (ptr_gen == trans->transid)
2267                         continue;
2268                 if (!in_block_group(bytenr, rc->block_group) &&
2269                     (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2270                         continue;
2271                 if (tree_block_processed(bytenr, blocksize, rc))
2272                         continue;
2273
2274                 readahead_tree_block(rc->extent_root,
2275                                      bytenr, blocksize, ptr_gen);
2276         }
2277
2278         for (i = 0; i < nritems; i++) {
2279                 cond_resched();
2280                 bytenr = btrfs_node_blockptr(node->eb, i);
2281                 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2282                 if (ptr_gen == trans->transid)
2283                         continue;
2284                 if (!in_block_group(bytenr, rc->block_group) &&
2285                     (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2286                         continue;
2287                 if (tree_block_processed(bytenr, blocksize, rc))
2288                         continue;
2289                 if (!in_block_group(bytenr, rc->block_group) &&
2290                     !check_file_extents(rc, bytenr, blocksize, ptr_gen))
2291                         continue;
2292
2293                 block = kmalloc(sizeof(*block), GFP_NOFS);
2294                 if (!block) {
2295                         err = -ENOMEM;
2296                         break;
2297                 }
2298                 block->bytenr = bytenr;
2299                 btrfs_node_key_to_cpu(node->eb, &block->key, i);
2300                 block->level = node->level - 1;
2301                 block->key_ready = 1;
2302                 rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2303                 BUG_ON(rb_node);
2304         }
2305         if (err)
2306                 free_block_list(blocks);
2307         return err;
2308 }
2309
2310 /*
2311  * find adjacent blocks require processing
2312  */
2313 static noinline_for_stack
2314 int add_adjacent_blocks(struct btrfs_trans_handle *trans,
2315                         struct reloc_control *rc,
2316                         struct backref_cache *cache,
2317                         struct rb_root *blocks, int level,
2318                         struct backref_node **upper)
2319 {
2320         struct backref_node *node;
2321         int ret = 0;
2322
2323         WARN_ON(!list_empty(&cache->pending[level]));
2324
2325         if (list_empty(&cache->pending[level + 1]))
2326                 return 1;
2327
2328         node = list_entry(cache->pending[level + 1].next,
2329                           struct backref_node, lower);
2330         if (node->eb)
2331                 ret = add_child_blocks(trans, rc, node, blocks);
2332
2333         *upper = node;
2334         return ret;
2335 }
2336
2337 static int get_tree_block_key(struct reloc_control *rc,
2338                               struct tree_block *block)
2339 {
2340         struct extent_buffer *eb;
2341
2342         BUG_ON(block->key_ready);
2343         eb = read_tree_block(rc->extent_root, block->bytenr,
2344                              block->key.objectid, block->key.offset);
2345         WARN_ON(btrfs_header_level(eb) != block->level);
2346         if (block->level == 0)
2347                 btrfs_item_key_to_cpu(eb, &block->key, 0);
2348         else
2349                 btrfs_node_key_to_cpu(eb, &block->key, 0);
2350         free_extent_buffer(eb);
2351         block->key_ready = 1;
2352         return 0;
2353 }
2354
2355 static int reada_tree_block(struct reloc_control *rc,
2356                             struct tree_block *block)
2357 {
2358         BUG_ON(block->key_ready);
2359         readahead_tree_block(rc->extent_root, block->bytenr,
2360                              block->key.objectid, block->key.offset);
2361         return 0;
2362 }
2363
2364 /*
2365  * helper function to relocate a tree block
2366  */
2367 static int relocate_tree_block(struct btrfs_trans_handle *trans,
2368                                 struct reloc_control *rc,
2369                                 struct backref_node *node,
2370                                 struct btrfs_key *key,
2371                                 struct btrfs_path *path)
2372 {
2373         struct btrfs_root *root;
2374         int ret;
2375
2376         root = select_one_root(trans, node);
2377         if (unlikely(!root)) {
2378                 rc->found_old_snapshot = 1;
2379                 update_processed_blocks(rc, node);
2380                 return 0;
2381         }
2382
2383         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2384                 ret = do_relocation(trans, node, key, path, 1);
2385                 if (ret < 0)
2386                         goto out;
2387                 if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
2388                         ret = replace_file_extents(trans, rc, root,
2389                                                    node->eb, NULL);
2390                         if (ret < 0)
2391                                 goto out;
2392                 }
2393                 drop_node_buffer(node);
2394         } else if (!root->ref_cows) {
2395                 path->lowest_level = node->level;
2396                 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2397                 btrfs_release_path(root, path);
2398                 if (ret < 0)
2399                         goto out;
2400         } else if (root != node->root) {
2401                 WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
2402         }
2403
2404         update_processed_blocks(rc, node);
2405         ret = 0;
2406 out:
2407         drop_node_buffer(node);
2408         return ret;
2409 }
2410
2411 /*
2412  * relocate a list of blocks
2413  */
2414 static noinline_for_stack
2415 int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2416                          struct reloc_control *rc, struct rb_root *blocks)
2417 {
2418         struct backref_cache *cache;
2419         struct backref_node *node;
2420         struct btrfs_path *path;
2421         struct tree_block *block;
2422         struct rb_node *rb_node;
2423         int level = -1;
2424         int ret;
2425         int err = 0;
2426
2427         path = btrfs_alloc_path();
2428         if (!path)
2429                 return -ENOMEM;
2430
2431         cache = kmalloc(sizeof(*cache), GFP_NOFS);
2432         if (!cache) {
2433                 btrfs_free_path(path);
2434                 return -ENOMEM;
2435         }
2436
2437         backref_cache_init(cache);
2438
2439         rb_node = rb_first(blocks);
2440         while (rb_node) {
2441                 block = rb_entry(rb_node, struct tree_block, rb_node);
2442                 if (level == -1)
2443                         level = block->level;
2444                 else
2445                         BUG_ON(level != block->level);
2446                 if (!block->key_ready)
2447                         reada_tree_block(rc, block);
2448                 rb_node = rb_next(rb_node);
2449         }
2450
2451         rb_node = rb_first(blocks);
2452         while (rb_node) {
2453                 block = rb_entry(rb_node, struct tree_block, rb_node);
2454                 if (!block->key_ready)
2455                         get_tree_block_key(rc, block);
2456                 rb_node = rb_next(rb_node);
2457         }
2458
2459         rb_node = rb_first(blocks);
2460         while (rb_node) {
2461                 block = rb_entry(rb_node, struct tree_block, rb_node);
2462
2463                 node = build_backref_tree(rc, cache, &block->key,
2464                                           block->level, block->bytenr);
2465                 if (IS_ERR(node)) {
2466                         err = PTR_ERR(node);
2467                         goto out;
2468                 }
2469
2470                 ret = relocate_tree_block(trans, rc, node, &block->key,
2471                                           path);
2472                 if (ret < 0) {
2473                         err = ret;
2474                         goto out;
2475                 }
2476                 remove_backref_node(cache, node);
2477                 rb_node = rb_next(rb_node);
2478         }
2479
2480         if (level > 0)
2481                 goto out;
2482
2483         free_block_list(blocks);
2484
2485         /*
2486          * now backrefs of some upper level tree blocks have been cached,
2487          * try relocating blocks referenced by these upper level blocks.
2488          */
2489         while (1) {
2490                 struct backref_node *upper = NULL;
2491                 if (trans->transaction->in_commit ||
2492                     trans->transaction->delayed_refs.flushing)
2493                         break;
2494
2495                 ret = add_adjacent_blocks(trans, rc, cache, blocks, level,
2496                                           &upper);
2497                 if (ret < 0)
2498                         err = ret;
2499                 if (ret != 0)
2500                         break;
2501
2502                 rb_node = rb_first(blocks);
2503                 while (rb_node) {
2504                         block = rb_entry(rb_node, struct tree_block, rb_node);
2505                         if (trans->transaction->in_commit ||
2506                             trans->transaction->delayed_refs.flushing)
2507                                 goto out;
2508                         BUG_ON(!block->key_ready);
2509                         node = build_backref_tree(rc, cache, &block->key,
2510                                                   level, block->bytenr);
2511                         if (IS_ERR(node)) {
2512                                 err = PTR_ERR(node);
2513                                 goto out;
2514                         }
2515
2516                         ret = relocate_tree_block(trans, rc, node,
2517                                                   &block->key, path);
2518                         if (ret < 0) {
2519                                 err = ret;
2520                                 goto out;
2521                         }
2522                         remove_backref_node(cache, node);
2523                         rb_node = rb_next(rb_node);
2524                 }
2525                 free_block_list(blocks);
2526
2527                 if (upper) {
2528                         ret = link_to_upper(trans, upper, path);
2529                         if (ret < 0) {
2530                                 err = ret;
2531                                 break;
2532                         }
2533                         remove_backref_node(cache, upper);
2534                 }
2535         }
2536 out:
2537         free_block_list(blocks);
2538
2539         ret = finish_pending_nodes(trans, cache, path);
2540         if (ret < 0)
2541                 err = ret;
2542
2543         kfree(cache);
2544         btrfs_free_path(path);
2545         return err;
2546 }
2547
2548 static noinline_for_stack
2549 int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
2550                          u64 block_start)
2551 {
2552         struct btrfs_root *root = BTRFS_I(inode)->root;
2553         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2554         struct extent_map *em;
2555         int ret = 0;
2556
2557         em = alloc_extent_map(GFP_NOFS);
2558         if (!em)
2559                 return -ENOMEM;
2560
2561         em->start = start;
2562         em->len = end + 1 - start;
2563         em->block_len = em->len;
2564         em->block_start = block_start;
2565         em->bdev = root->fs_info->fs_devices->latest_bdev;
2566         set_bit(EXTENT_FLAG_PINNED, &em->flags);
2567
2568         lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2569         while (1) {
2570                 write_lock(&em_tree->lock);
2571                 ret = add_extent_mapping(em_tree, em);
2572                 write_unlock(&em_tree->lock);
2573                 if (ret != -EEXIST) {
2574                         free_extent_map(em);
2575                         break;
2576                 }
2577                 btrfs_drop_extent_cache(inode, start, end, 0);
2578         }
2579         unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2580         return ret;
2581 }
2582
2583 static int relocate_file_extent_cluster(struct inode *inode,
2584                                         struct file_extent_cluster *cluster)
2585 {
2586         u64 page_start;
2587         u64 page_end;
2588         u64 offset = BTRFS_I(inode)->index_cnt;
2589         unsigned long index;
2590         unsigned long last_index;
2591         unsigned int dirty_page = 0;
2592         struct page *page;
2593         struct file_ra_state *ra;
2594         int nr = 0;
2595         int ret = 0;
2596
2597         if (!cluster->nr)
2598                 return 0;
2599
2600         ra = kzalloc(sizeof(*ra), GFP_NOFS);
2601         if (!ra)
2602                 return -ENOMEM;
2603
2604         index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2605         last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2606
2607         mutex_lock(&inode->i_mutex);
2608
2609         i_size_write(inode, cluster->end + 1 - offset);
2610         ret = setup_extent_mapping(inode, cluster->start - offset,
2611                                    cluster->end - offset, cluster->start);
2612         if (ret)
2613                 goto out_unlock;
2614
2615         file_ra_state_init(ra, inode->i_mapping);
2616
2617         WARN_ON(cluster->start != cluster->boundary[0]);
2618         while (index <= last_index) {
2619                 page = find_lock_page(inode->i_mapping, index);
2620                 if (!page) {
2621                         page_cache_sync_readahead(inode->i_mapping,
2622                                                   ra, NULL, index,
2623                                                   last_index + 1 - index);
2624                         page = grab_cache_page(inode->i_mapping, index);
2625                         if (!page) {
2626                                 ret = -ENOMEM;
2627                                 goto out_unlock;
2628                         }
2629                 }
2630
2631                 if (PageReadahead(page)) {
2632                         page_cache_async_readahead(inode->i_mapping,
2633                                                    ra, NULL, page, index,
2634                                                    last_index + 1 - index);
2635                 }
2636
2637                 if (!PageUptodate(page)) {
2638                         btrfs_readpage(NULL, page);
2639                         lock_page(page);
2640                         if (!PageUptodate(page)) {
2641                                 unlock_page(page);
2642                                 page_cache_release(page);
2643                                 ret = -EIO;
2644                                 goto out_unlock;
2645                         }
2646                 }
2647
2648                 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2649                 page_end = page_start + PAGE_CACHE_SIZE - 1;
2650
2651                 lock_extent(&BTRFS_I(inode)->io_tree,
2652                             page_start, page_end, GFP_NOFS);
2653
2654                 set_page_extent_mapped(page);
2655
2656                 if (nr < cluster->nr &&
2657                     page_start + offset == cluster->boundary[nr]) {
2658                         set_extent_bits(&BTRFS_I(inode)->io_tree,
2659                                         page_start, page_end,
2660                                         EXTENT_BOUNDARY, GFP_NOFS);
2661                         nr++;
2662                 }
2663                 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
2664
2665                 set_page_dirty(page);
2666                 dirty_page++;
2667
2668                 unlock_extent(&BTRFS_I(inode)->io_tree,
2669                               page_start, page_end, GFP_NOFS);
2670                 unlock_page(page);
2671                 page_cache_release(page);
2672
2673                 index++;
2674                 if (nr < cluster->nr &&
2675                     page_end + 1 + offset == cluster->boundary[nr]) {
2676                         balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2677                                                            dirty_page);
2678                         dirty_page = 0;
2679                 }
2680         }
2681         if (dirty_page) {
2682                 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2683                                                    dirty_page);
2684         }
2685         WARN_ON(nr != cluster->nr);
2686 out_unlock:
2687         mutex_unlock(&inode->i_mutex);
2688         kfree(ra);
2689         return ret;
2690 }
2691
2692 static noinline_for_stack
2693 int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
2694                          struct file_extent_cluster *cluster)
2695 {
2696         int ret;
2697
2698         if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
2699                 ret = relocate_file_extent_cluster(inode, cluster);
2700                 if (ret)
2701                         return ret;
2702                 cluster->nr = 0;
2703         }
2704
2705         if (!cluster->nr)
2706                 cluster->start = extent_key->objectid;
2707         else
2708                 BUG_ON(cluster->nr >= MAX_EXTENTS);
2709         cluster->end = extent_key->objectid + extent_key->offset - 1;
2710         cluster->boundary[cluster->nr] = extent_key->objectid;
2711         cluster->nr++;
2712
2713         if (cluster->nr >= MAX_EXTENTS) {
2714                 ret = relocate_file_extent_cluster(inode, cluster);
2715                 if (ret)
2716                         return ret;
2717                 cluster->nr = 0;
2718         }
2719         return 0;
2720 }
2721
2722 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2723 static int get_ref_objectid_v0(struct reloc_control *rc,
2724                                struct btrfs_path *path,
2725                                struct btrfs_key *extent_key,
2726                                u64 *ref_objectid, int *path_change)
2727 {
2728         struct btrfs_key key;
2729         struct extent_buffer *leaf;
2730         struct btrfs_extent_ref_v0 *ref0;
2731         int ret;
2732         int slot;
2733
2734         leaf = path->nodes[0];
2735         slot = path->slots[0];
2736         while (1) {
2737                 if (slot >= btrfs_header_nritems(leaf)) {
2738                         ret = btrfs_next_leaf(rc->extent_root, path);
2739                         if (ret < 0)
2740                                 return ret;
2741                         BUG_ON(ret > 0);
2742                         leaf = path->nodes[0];
2743                         slot = path->slots[0];
2744                         if (path_change)
2745                                 *path_change = 1;
2746                 }
2747                 btrfs_item_key_to_cpu(leaf, &key, slot);
2748                 if (key.objectid != extent_key->objectid)
2749                         return -ENOENT;
2750
2751                 if (key.type != BTRFS_EXTENT_REF_V0_KEY) {
2752                         slot++;
2753                         continue;
2754                 }
2755                 ref0 = btrfs_item_ptr(leaf, slot,
2756                                 struct btrfs_extent_ref_v0);
2757                 *ref_objectid = btrfs_ref_objectid_v0(leaf, ref0);
2758                 break;
2759         }
2760         return 0;
2761 }
2762 #endif
2763
2764 /*
2765  * helper to add a tree block to the list.
2766  * the major work is getting the generation and level of the block
2767  */
2768 static int add_tree_block(struct reloc_control *rc,
2769                           struct btrfs_key *extent_key,
2770                           struct btrfs_path *path,
2771                           struct rb_root *blocks)
2772 {
2773         struct extent_buffer *eb;
2774         struct btrfs_extent_item *ei;
2775         struct btrfs_tree_block_info *bi;
2776         struct tree_block *block;
2777         struct rb_node *rb_node;
2778         u32 item_size;
2779         int level = -1;
2780         int generation;
2781
2782         eb =  path->nodes[0];
2783         item_size = btrfs_item_size_nr(eb, path->slots[0]);
2784
2785         if (item_size >= sizeof(*ei) + sizeof(*bi)) {
2786                 ei = btrfs_item_ptr(eb, path->slots[0],
2787                                 struct btrfs_extent_item);
2788                 bi = (struct btrfs_tree_block_info *)(ei + 1);
2789                 generation = btrfs_extent_generation(eb, ei);
2790                 level = btrfs_tree_block_level(eb, bi);
2791         } else {
2792 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2793                 u64 ref_owner;
2794                 int ret;
2795
2796                 BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2797                 ret = get_ref_objectid_v0(rc, path, extent_key,
2798                                           &ref_owner, NULL);
2799                 BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
2800                 level = (int)ref_owner;
2801                 /* FIXME: get real generation */
2802                 generation = 0;
2803 #else
2804                 BUG();
2805 #endif
2806         }
2807
2808         btrfs_release_path(rc->extent_root, path);
2809
2810         BUG_ON(level == -1);
2811
2812         block = kmalloc(sizeof(*block), GFP_NOFS);
2813         if (!block)
2814                 return -ENOMEM;
2815
2816         block->bytenr = extent_key->objectid;
2817         block->key.objectid = extent_key->offset;
2818         block->key.offset = generation;
2819         block->level = level;
2820         block->key_ready = 0;
2821
2822         rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2823         BUG_ON(rb_node);
2824
2825         return 0;
2826 }
2827
2828 /*
2829  * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
2830  */
2831 static int __add_tree_block(struct reloc_control *rc,
2832                             u64 bytenr, u32 blocksize,
2833                             struct rb_root *blocks)
2834 {
2835         struct btrfs_path *path;
2836         struct btrfs_key key;
2837         int ret;
2838
2839         if (tree_block_processed(bytenr, blocksize, rc))
2840                 return 0;
2841
2842         if (tree_search(blocks, bytenr))
2843                 return 0;
2844
2845         path = btrfs_alloc_path();
2846         if (!path)
2847                 return -ENOMEM;
2848
2849         key.objectid = bytenr;
2850         key.type = BTRFS_EXTENT_ITEM_KEY;
2851         key.offset = blocksize;
2852
2853         path->search_commit_root = 1;
2854         path->skip_locking = 1;
2855         ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
2856         if (ret < 0)
2857                 goto out;
2858         BUG_ON(ret);
2859
2860         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2861         ret = add_tree_block(rc, &key, path, blocks);
2862 out:
2863         btrfs_free_path(path);
2864         return ret;
2865 }
2866
2867 /*
2868  * helper to check if the block use full backrefs for pointers in it
2869  */
2870 static int block_use_full_backref(struct reloc_control *rc,
2871                                   struct extent_buffer *eb)
2872 {
2873         struct btrfs_path *path;
2874         struct btrfs_extent_item *ei;
2875         struct btrfs_key key;
2876         u64 flags;
2877         int ret;
2878
2879         if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
2880             btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
2881                 return 1;
2882
2883         path = btrfs_alloc_path();
2884         BUG_ON(!path);
2885
2886         key.objectid = eb->start;
2887         key.type = BTRFS_EXTENT_ITEM_KEY;
2888         key.offset = eb->len;
2889
2890         path->search_commit_root = 1;
2891         path->skip_locking = 1;
2892         ret = btrfs_search_slot(NULL, rc->extent_root,
2893                                 &key, path, 0, 0);
2894         BUG_ON(ret);
2895
2896         ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
2897                             struct btrfs_extent_item);
2898         flags = btrfs_extent_flags(path->nodes[0], ei);
2899         BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2900         if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
2901                 ret = 1;
2902         else
2903                 ret = 0;
2904         btrfs_free_path(path);
2905         return ret;
2906 }
2907
2908 /*
2909  * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
2910  * this function scans fs tree to find blocks reference the data extent
2911  */
2912 static int find_data_references(struct reloc_control *rc,
2913                                 struct btrfs_key *extent_key,
2914                                 struct extent_buffer *leaf,
2915                                 struct btrfs_extent_data_ref *ref,
2916                                 struct rb_root *blocks)
2917 {
2918         struct btrfs_path *path;
2919         struct tree_block *block;
2920         struct btrfs_root *root;
2921         struct btrfs_file_extent_item *fi;
2922         struct rb_node *rb_node;
2923         struct btrfs_key key;
2924         u64 ref_root;
2925         u64 ref_objectid;
2926         u64 ref_offset;
2927         u32 ref_count;
2928         u32 nritems;
2929         int err = 0;
2930         int added = 0;
2931         int counted;
2932         int ret;
2933
2934         path = btrfs_alloc_path();
2935         if (!path)
2936                 return -ENOMEM;
2937
2938         ref_root = btrfs_extent_data_ref_root(leaf, ref);
2939         ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
2940         ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
2941         ref_count = btrfs_extent_data_ref_count(leaf, ref);
2942
2943         root = read_fs_root(rc->extent_root->fs_info, ref_root);
2944         if (IS_ERR(root)) {
2945                 err = PTR_ERR(root);
2946                 goto out;
2947         }
2948
2949         key.objectid = ref_objectid;
2950         key.offset = ref_offset;
2951         key.type = BTRFS_EXTENT_DATA_KEY;
2952
2953         path->search_commit_root = 1;
2954         path->skip_locking = 1;
2955         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2956         if (ret < 0) {
2957                 err = ret;
2958                 goto out;
2959         }
2960
2961         leaf = path->nodes[0];
2962         nritems = btrfs_header_nritems(leaf);
2963         /*
2964          * the references in tree blocks that use full backrefs
2965          * are not counted in
2966          */
2967         if (block_use_full_backref(rc, leaf))
2968                 counted = 0;
2969         else
2970                 counted = 1;
2971         rb_node = tree_search(blocks, leaf->start);
2972         if (rb_node) {
2973                 if (counted)
2974                         added = 1;
2975                 else
2976                         path->slots[0] = nritems;
2977         }
2978
2979         while (ref_count > 0) {
2980                 while (path->slots[0] >= nritems) {
2981                         ret = btrfs_next_leaf(root, path);
2982                         if (ret < 0) {
2983                                 err = ret;
2984                                 goto out;
2985                         }
2986                         if (ret > 0) {
2987                                 WARN_ON(1);
2988                                 goto out;
2989                         }
2990
2991                         leaf = path->nodes[0];
2992                         nritems = btrfs_header_nritems(leaf);
2993                         added = 0;
2994
2995                         if (block_use_full_backref(rc, leaf))
2996                                 counted = 0;
2997                         else
2998                                 counted = 1;
2999                         rb_node = tree_search(blocks, leaf->start);
3000                         if (rb_node) {
3001                                 if (counted)
3002                                         added = 1;
3003                                 else
3004                                         path->slots[0] = nritems;
3005                         }
3006                 }
3007
3008                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3009                 if (key.objectid != ref_objectid ||
3010                     key.type != BTRFS_EXTENT_DATA_KEY) {
3011                         WARN_ON(1);
3012                         break;
3013                 }
3014
3015                 fi = btrfs_item_ptr(leaf, path->slots[0],
3016                                     struct btrfs_file_extent_item);
3017
3018                 if (btrfs_file_extent_type(leaf, fi) ==
3019                     BTRFS_FILE_EXTENT_INLINE)
3020                         goto next;
3021
3022                 if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
3023                     extent_key->objectid)
3024                         goto next;
3025
3026                 key.offset -= btrfs_file_extent_offset(leaf, fi);
3027                 if (key.offset != ref_offset)
3028                         goto next;
3029
3030                 if (counted)
3031                         ref_count--;
3032                 if (added)
3033                         goto next;
3034
3035                 if (!tree_block_processed(leaf->start, leaf->len, rc)) {
3036                         block = kmalloc(sizeof(*block), GFP_NOFS);
3037                         if (!block) {
3038                                 err = -ENOMEM;
3039                                 break;
3040                         }
3041                         block->bytenr = leaf->start;
3042                         btrfs_item_key_to_cpu(leaf, &block->key, 0);
3043                         block->level = 0;
3044                         block->key_ready = 1;
3045                         rb_node = tree_insert(blocks, block->bytenr,
3046                                               &block->rb_node);
3047                         BUG_ON(rb_node);
3048                 }
3049                 if (counted)
3050                         added = 1;
3051                 else
3052                         path->slots[0] = nritems;
3053 next:
3054                 path->slots[0]++;
3055
3056         }
3057 out:
3058         btrfs_free_path(path);
3059         return err;
3060 }
3061
3062 /*
3063  * hepler to find all tree blocks that reference a given data extent
3064  */
3065 static noinline_for_stack
3066 int add_data_references(struct reloc_control *rc,
3067                         struct btrfs_key *extent_key,
3068                         struct btrfs_path *path,
3069                         struct rb_root *blocks)
3070 {
3071         struct btrfs_key key;
3072         struct extent_buffer *eb;
3073         struct btrfs_extent_data_ref *dref;
3074         struct btrfs_extent_inline_ref *iref;
3075         unsigned long ptr;
3076         unsigned long end;
3077         u32 blocksize;
3078         int ret;
3079         int err = 0;
3080
3081         ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
3082                                extent_key->offset);
3083         BUG_ON(ret < 0);
3084         if (ret > 0) {
3085                 /* the relocated data is fragmented */
3086                 rc->extents_skipped++;
3087                 btrfs_release_path(rc->extent_root, path);
3088                 return 0;
3089         }
3090
3091         blocksize = btrfs_level_size(rc->extent_root, 0);
3092
3093         eb = path->nodes[0];
3094         ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3095         end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
3096 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3097         if (ptr + sizeof(struct btrfs_extent_item_v0) == end)
3098                 ptr = end;
3099         else
3100 #endif
3101                 ptr += sizeof(struct btrfs_extent_item);
3102
3103         while (ptr < end) {
3104                 iref = (struct btrfs_extent_inline_ref *)ptr;
3105                 key.type = btrfs_extent_inline_ref_type(eb, iref);
3106                 if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3107                         key.offset = btrfs_extent_inline_ref_offset(eb, iref);
3108                         ret = __add_tree_block(rc, key.offset, blocksize,
3109                                                blocks);
3110                 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3111                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
3112                         ret = find_data_references(rc, extent_key,
3113                                                    eb, dref, blocks);
3114                 } else {
3115                         BUG();
3116                 }
3117                 ptr += btrfs_extent_inline_ref_size(key.type);
3118         }
3119         WARN_ON(ptr > end);
3120
3121         while (1) {
3122                 cond_resched();
3123                 eb = path->nodes[0];
3124                 if (path->slots[0] >= btrfs_header_nritems(eb)) {
3125                         ret = btrfs_next_leaf(rc->extent_root, path);
3126                         if (ret < 0) {
3127                                 err = ret;
3128                                 break;
3129                         }
3130                         if (ret > 0)
3131                                 break;
3132                         eb = path->nodes[0];
3133                 }
3134
3135                 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
3136                 if (key.objectid != extent_key->objectid)
3137                         break;
3138
3139 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3140                 if (key.type == BTRFS_SHARED_DATA_REF_KEY ||
3141                     key.type == BTRFS_EXTENT_REF_V0_KEY) {
3142 #else
3143                 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
3144                 if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3145 #endif
3146                         ret = __add_tree_block(rc, key.offset, blocksize,
3147                                                blocks);
3148                 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3149                         dref = btrfs_item_ptr(eb, path->slots[0],
3150                                               struct btrfs_extent_data_ref);
3151                         ret = find_data_references(rc, extent_key,
3152                                                    eb, dref, blocks);
3153                 } else {
3154                         ret = 0;
3155                 }
3156                 if (ret) {
3157                         err = ret;
3158                         break;
3159                 }
3160                 path->slots[0]++;
3161         }
3162         btrfs_release_path(rc->extent_root, path);
3163         if (err)
3164                 free_block_list(blocks);
3165         return err;
3166 }
3167
3168 /*
3169  * hepler to find next unprocessed extent
3170  */
3171 static noinline_for_stack
3172 int find_next_extent(struct btrfs_trans_handle *trans,
3173                      struct reloc_control *rc, struct btrfs_path *path)
3174 {
3175         struct btrfs_key key;
3176         struct extent_buffer *leaf;
3177         u64 start, end, last;
3178         int ret;
3179
3180         last = rc->block_group->key.objectid + rc->block_group->key.offset;
3181         while (1) {
3182                 cond_resched();
3183                 if (rc->search_start >= last) {
3184                         ret = 1;
3185                         break;
3186                 }
3187
3188                 key.objectid = rc->search_start;
3189                 key.type = BTRFS_EXTENT_ITEM_KEY;
3190                 key.offset = 0;
3191
3192                 path->search_commit_root = 1;
3193                 path->skip_locking = 1;
3194                 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3195                                         0, 0);
3196                 if (ret < 0)
3197                         break;
3198 next:
3199                 leaf = path->nodes[0];
3200                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3201                         ret = btrfs_next_leaf(rc->extent_root, path);
3202                         if (ret != 0)
3203                                 break;
3204                         leaf = path->nodes[0];
3205                 }
3206
3207                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3208                 if (key.objectid >= last) {
3209                         ret = 1;
3210                         break;
3211                 }
3212
3213                 if (key.type != BTRFS_EXTENT_ITEM_KEY ||
3214                     key.objectid + key.offset <= rc->search_start) {
3215                         path->slots[0]++;
3216                         goto next;
3217                 }
3218
3219                 ret = find_first_extent_bit(&rc->processed_blocks,
3220                                             key.objectid, &start, &end,
3221                                             EXTENT_DIRTY);
3222
3223                 if (ret == 0 && start <= key.objectid) {
3224                         btrfs_release_path(rc->extent_root, path);
3225                         rc->search_start = end + 1;
3226                 } else {
3227                         rc->search_start = key.objectid + key.offset;
3228                         return 0;
3229                 }
3230         }
3231         btrfs_release_path(rc->extent_root, path);
3232         return ret;
3233 }
3234
3235 static void set_reloc_control(struct reloc_control *rc)
3236 {
3237         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3238         mutex_lock(&fs_info->trans_mutex);
3239         fs_info->reloc_ctl = rc;
3240         mutex_unlock(&fs_info->trans_mutex);
3241 }
3242
3243 static void unset_reloc_control(struct reloc_control *rc)
3244 {
3245         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3246         mutex_lock(&fs_info->trans_mutex);
3247         fs_info->reloc_ctl = NULL;
3248         mutex_unlock(&fs_info->trans_mutex);
3249 }
3250
3251 static int check_extent_flags(u64 flags)
3252 {
3253         if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3254             (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3255                 return 1;
3256         if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3257             !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3258                 return 1;
3259         if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3260             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3261                 return 1;
3262         return 0;
3263 }
3264
3265
3266 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3267 {
3268         struct rb_root blocks = RB_ROOT;
3269         struct btrfs_key key;
3270         struct file_extent_cluster *cluster;
3271         struct btrfs_trans_handle *trans = NULL;
3272         struct btrfs_path *path;
3273         struct btrfs_extent_item *ei;
3274         unsigned long nr;
3275         u64 flags;
3276         u32 item_size;
3277         int ret;
3278         int err = 0;
3279
3280         cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
3281         if (!cluster)
3282                 return -ENOMEM;
3283
3284         path = btrfs_alloc_path();
3285         if (!path) {
3286                 kfree(cluster);
3287                 return -ENOMEM;
3288         }
3289
3290         rc->extents_found = 0;
3291         rc->extents_skipped = 0;
3292
3293         rc->search_start = rc->block_group->key.objectid;
3294         clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3295                           GFP_NOFS);
3296
3297         rc->create_reloc_root = 1;
3298         set_reloc_control(rc);
3299
3300         trans = btrfs_start_transaction(rc->extent_root, 1);
3301         btrfs_commit_transaction(trans, rc->extent_root);
3302
3303         while (1) {
3304                 trans = btrfs_start_transaction(rc->extent_root, 1);
3305
3306                 ret = find_next_extent(trans, rc, path);
3307                 if (ret < 0)
3308                         err = ret;
3309                 if (ret != 0)
3310                         break;
3311
3312                 rc->extents_found++;
3313
3314                 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3315                                     struct btrfs_extent_item);
3316                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3317                 item_size = btrfs_item_size_nr(path->nodes[0],
3318                                                path->slots[0]);
3319                 if (item_size >= sizeof(*ei)) {
3320                         flags = btrfs_extent_flags(path->nodes[0], ei);
3321                         ret = check_extent_flags(flags);
3322                         BUG_ON(ret);
3323
3324                 } else {
3325 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3326                         u64 ref_owner;
3327                         int path_change = 0;
3328
3329                         BUG_ON(item_size !=
3330                                sizeof(struct btrfs_extent_item_v0));
3331                         ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
3332                                                   &path_change);
3333                         if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
3334                                 flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
3335                         else
3336                                 flags = BTRFS_EXTENT_FLAG_DATA;
3337
3338                         if (path_change) {
3339                                 btrfs_release_path(rc->extent_root, path);
3340
3341                                 path->search_commit_root = 1;
3342                                 path->skip_locking = 1;
3343                                 ret = btrfs_search_slot(NULL, rc->extent_root,
3344                                                         &key, path, 0, 0);
3345                                 if (ret < 0) {
3346                                         err = ret;
3347                                         break;
3348                                 }
3349                                 BUG_ON(ret > 0);
3350                         }
3351 #else
3352                         BUG();
3353 #endif
3354                 }
3355
3356                 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3357                         ret = add_tree_block(rc, &key, path, &blocks);
3358                 } else if (rc->stage == UPDATE_DATA_PTRS &&
3359                          (flags & BTRFS_EXTENT_FLAG_DATA)) {
3360                         ret = add_data_references(rc, &key, path, &blocks);
3361                 } else {
3362                         btrfs_release_path(rc->extent_root, path);
3363                         ret = 0;
3364                 }
3365                 if (ret < 0) {
3366                         err = 0;
3367                         break;
3368                 }
3369
3370                 if (!RB_EMPTY_ROOT(&blocks)) {
3371                         ret = relocate_tree_blocks(trans, rc, &blocks);
3372                         if (ret < 0) {
3373                                 err = ret;
3374                                 break;
3375                         }
3376                 }
3377
3378                 nr = trans->blocks_used;
3379                 btrfs_end_transaction(trans, rc->extent_root);
3380                 trans = NULL;
3381                 btrfs_btree_balance_dirty(rc->extent_root, nr);
3382
3383                 if (rc->stage == MOVE_DATA_EXTENTS &&
3384                     (flags & BTRFS_EXTENT_FLAG_DATA)) {
3385                         rc->found_file_extent = 1;
3386                         ret = relocate_data_extent(rc->data_inode,
3387                                                    &key, cluster);
3388                         if (ret < 0) {
3389                                 err = ret;
3390                                 break;
3391                         }
3392                 }
3393         }
3394         btrfs_free_path(path);
3395
3396         if (trans) {
3397                 nr = trans->blocks_used;
3398                 btrfs_end_transaction(trans, rc->extent_root);
3399                 btrfs_btree_balance_dirty(rc->extent_root, nr);
3400         }
3401
3402         if (!err) {
3403                 ret = relocate_file_extent_cluster(rc->data_inode, cluster);
3404                 if (ret < 0)
3405                         err = ret;
3406         }
3407
3408         kfree(cluster);
3409
3410         rc->create_reloc_root = 0;
3411         smp_mb();
3412
3413         if (rc->extents_found > 0) {
3414                 trans = btrfs_start_transaction(rc->extent_root, 1);
3415                 btrfs_commit_transaction(trans, rc->extent_root);
3416         }
3417
3418         merge_reloc_roots(rc);
3419
3420         unset_reloc_control(rc);
3421
3422         /* get rid of pinned extents */
3423         trans = btrfs_start_transaction(rc->extent_root, 1);
3424         btrfs_commit_transaction(trans, rc->extent_root);
3425
3426         return err;
3427 }
3428
3429 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3430                                  struct btrfs_root *root, u64 objectid)
3431 {
3432         struct btrfs_path *path;
3433         struct btrfs_inode_item *item;
3434         struct extent_buffer *leaf;
3435         int ret;
3436
3437         path = btrfs_alloc_path();
3438         if (!path)
3439                 return -ENOMEM;
3440
3441         ret = btrfs_insert_empty_inode(trans, root, path, objectid);
3442         if (ret)
3443                 goto out;
3444
3445         leaf = path->nodes[0];
3446         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
3447         memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
3448         btrfs_set_inode_generation(leaf, item, 1);
3449         btrfs_set_inode_size(leaf, item, 0);
3450         btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3451         btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
3452         btrfs_mark_buffer_dirty(leaf);
3453         btrfs_release_path(root, path);
3454 out:
3455         btrfs_free_path(path);
3456         return ret;
3457 }
3458
3459 /*
3460  * helper to create inode for data relocation.
3461  * the inode is in data relocation tree and its link count is 0
3462  */
3463 static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3464                                         struct btrfs_block_group_cache *group)
3465 {
3466         struct inode *inode = NULL;
3467         struct btrfs_trans_handle *trans;
3468         struct btrfs_root *root;
3469         struct btrfs_key key;
3470         unsigned long nr;
3471         u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
3472         int err = 0;
3473
3474         root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
3475         if (IS_ERR(root))
3476                 return ERR_CAST(root);
3477
3478         trans = btrfs_start_transaction(root, 1);
3479         BUG_ON(!trans);
3480
3481         err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
3482         if (err)
3483                 goto out;
3484
3485         err = __insert_orphan_inode(trans, root, objectid);
3486         BUG_ON(err);
3487
3488         key.objectid = objectid;
3489         key.type = BTRFS_INODE_ITEM_KEY;
3490         key.offset = 0;
3491         inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
3492         BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
3493         BTRFS_I(inode)->index_cnt = group->key.objectid;
3494
3495         err = btrfs_orphan_add(trans, inode);
3496 out:
3497         nr = trans->blocks_used;
3498         btrfs_end_transaction(trans, root);
3499
3500         btrfs_btree_balance_dirty(root, nr);
3501         if (err) {
3502                 if (inode)
3503                         iput(inode);
3504                 inode = ERR_PTR(err);
3505         }
3506         return inode;
3507 }
3508
3509 /*
3510  * function to relocate all extents in a block group.
3511  */
3512 int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3513 {
3514         struct btrfs_fs_info *fs_info = extent_root->fs_info;
3515         struct reloc_control *rc;
3516         int ret;
3517         int rw = 0;
3518         int err = 0;
3519
3520         rc = kzalloc(sizeof(*rc), GFP_NOFS);
3521         if (!rc)
3522                 return -ENOMEM;
3523
3524         mapping_tree_init(&rc->reloc_root_tree);
3525         extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3526         INIT_LIST_HEAD(&rc->reloc_roots);
3527
3528         rc->extent_root = extent_root;
3529         rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
3530         BUG_ON(!rc->block_group);
3531
3532         if (!rc->block_group->ro) {
3533                 ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
3534                 if (ret) {
3535                         err = ret;
3536                         goto out;
3537                 }
3538                 rw = 1;
3539         }
3540
3541         btrfs_init_workers(&rc->workers, "relocate",
3542                            fs_info->thread_pool_size, NULL);
3543
3544         rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3545         if (IS_ERR(rc->data_inode)) {
3546                 err = PTR_ERR(rc->data_inode);
3547                 rc->data_inode = NULL;
3548                 goto out;
3549         }
3550
3551         printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n",
3552                (unsigned long long)rc->block_group->key.objectid,
3553                (unsigned long long)rc->block_group->flags);
3554
3555         btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
3556         btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
3557
3558         while (1) {
3559                 rc->extents_found = 0;
3560                 rc->extents_skipped = 0;
3561
3562                 mutex_lock(&fs_info->cleaner_mutex);
3563
3564                 btrfs_clean_old_snapshots(fs_info->tree_root);
3565                 ret = relocate_block_group(rc);
3566
3567                 mutex_unlock(&fs_info->cleaner_mutex);
3568                 if (ret < 0) {
3569                         err = ret;
3570                         break;
3571                 }
3572
3573                 if (rc->extents_found == 0)
3574                         break;
3575
3576                 printk(KERN_INFO "btrfs: found %llu extents\n",
3577                         (unsigned long long)rc->extents_found);
3578
3579                 if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
3580                         btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1);
3581                         invalidate_mapping_pages(rc->data_inode->i_mapping,
3582                                                  0, -1);
3583                         rc->stage = UPDATE_DATA_PTRS;
3584                 } else if (rc->stage == UPDATE_DATA_PTRS &&
3585                            rc->extents_skipped >= rc->extents_found) {
3586                         iput(rc->data_inode);
3587                         rc->data_inode = create_reloc_inode(fs_info,
3588                                                             rc->block_group);
3589                         if (IS_ERR(rc->data_inode)) {
3590                                 err = PTR_ERR(rc->data_inode);
3591                                 rc->data_inode = NULL;
3592                                 break;
3593                         }
3594                         rc->stage = MOVE_DATA_EXTENTS;
3595                         rc->found_file_extent = 0;
3596                 }
3597         }
3598
3599         filemap_write_and_wait_range(fs_info->btree_inode->i_mapping,
3600                                      rc->block_group->key.objectid,
3601                                      rc->block_group->key.objectid +
3602                                      rc->block_group->key.offset - 1);
3603
3604         WARN_ON(rc->block_group->pinned > 0);
3605         WARN_ON(rc->block_group->reserved > 0);
3606         WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
3607 out:
3608         if (err && rw)
3609                 btrfs_set_block_group_rw(extent_root, rc->block_group);
3610         iput(rc->data_inode);
3611         btrfs_stop_workers(&rc->workers);
3612         btrfs_put_block_group(rc->block_group);
3613         kfree(rc);
3614         return err;
3615 }
3616
3617 static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
3618 {
3619         struct btrfs_trans_handle *trans;
3620         int ret;
3621
3622         trans = btrfs_start_transaction(root->fs_info->tree_root, 1);
3623
3624         memset(&root->root_item.drop_progress, 0,
3625                 sizeof(root->root_item.drop_progress));
3626         root->root_item.drop_level = 0;
3627         btrfs_set_root_refs(&root->root_item, 0);
3628         ret = btrfs_update_root(trans, root->fs_info->tree_root,
3629                                 &root->root_key, &root->root_item);
3630         BUG_ON(ret);
3631
3632         ret = btrfs_end_transaction(trans, root->fs_info->tree_root);
3633         BUG_ON(ret);
3634         return 0;
3635 }
3636
3637 /*
3638  * recover relocation interrupted by system crash.
3639  *
3640  * this function resumes merging reloc trees with corresponding fs trees.
3641  * this is important for keeping the sharing of tree blocks
3642  */
3643 int btrfs_recover_relocation(struct btrfs_root *root)
3644 {
3645         LIST_HEAD(reloc_roots);
3646         struct btrfs_key key;
3647         struct btrfs_root *fs_root;
3648         struct btrfs_root *reloc_root;
3649         struct btrfs_path *path;
3650         struct extent_buffer *leaf;
3651         struct reloc_control *rc = NULL;
3652         struct btrfs_trans_handle *trans;
3653         int ret;
3654         int err = 0;
3655
3656         path = btrfs_alloc_path();
3657         if (!path)
3658                 return -ENOMEM;
3659
3660         key.objectid = BTRFS_TREE_RELOC_OBJECTID;
3661         key.type = BTRFS_ROOT_ITEM_KEY;
3662         key.offset = (u64)-1;
3663
3664         while (1) {
3665                 ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key,
3666                                         path, 0, 0);
3667                 if (ret < 0) {
3668                         err = ret;
3669                         goto out;
3670                 }
3671                 if (ret > 0) {
3672                         if (path->slots[0] == 0)
3673                                 break;
3674                         path->slots[0]--;
3675                 }
3676                 leaf = path->nodes[0];
3677                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3678                 btrfs_release_path(root->fs_info->tree_root, path);
3679
3680                 if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
3681                     key.type != BTRFS_ROOT_ITEM_KEY)
3682                         break;
3683
3684                 reloc_root = btrfs_read_fs_root_no_radix(root, &key);
3685                 if (IS_ERR(reloc_root)) {
3686                         err = PTR_ERR(reloc_root);
3687                         goto out;
3688                 }
3689
3690                 list_add(&reloc_root->root_list, &reloc_roots);
3691
3692                 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
3693                         fs_root = read_fs_root(root->fs_info,
3694                                                reloc_root->root_key.offset);
3695                         if (IS_ERR(fs_root)) {
3696                                 ret = PTR_ERR(fs_root);
3697                                 if (ret != -ENOENT) {
3698                                         err = ret;
3699                                         goto out;
3700                                 }
3701                                 mark_garbage_root(reloc_root);
3702                         }
3703                 }
3704
3705                 if (key.offset == 0)
3706                         break;
3707
3708                 key.offset--;
3709         }
3710         btrfs_release_path(root->fs_info->tree_root, path);
3711
3712         if (list_empty(&reloc_roots))
3713                 goto out;
3714
3715         rc = kzalloc(sizeof(*rc), GFP_NOFS);
3716         if (!rc) {
3717                 err = -ENOMEM;
3718                 goto out;
3719         }
3720
3721         mapping_tree_init(&rc->reloc_root_tree);
3722         INIT_LIST_HEAD(&rc->reloc_roots);
3723         btrfs_init_workers(&rc->workers, "relocate",
3724                            root->fs_info->thread_pool_size, NULL);
3725         rc->extent_root = root->fs_info->extent_root;
3726
3727         set_reloc_control(rc);
3728
3729         while (!list_empty(&reloc_roots)) {
3730                 reloc_root = list_entry(reloc_roots.next,
3731                                         struct btrfs_root, root_list);
3732                 list_del(&reloc_root->root_list);
3733
3734                 if (btrfs_root_refs(&reloc_root->root_item) == 0) {
3735                         list_add_tail(&reloc_root->root_list,
3736                                       &rc->reloc_roots);
3737                         continue;
3738                 }
3739
3740                 fs_root = read_fs_root(root->fs_info,
3741                                        reloc_root->root_key.offset);
3742                 BUG_ON(IS_ERR(fs_root));
3743
3744                 __add_reloc_root(reloc_root);
3745                 fs_root->reloc_root = reloc_root;
3746         }
3747
3748         trans = btrfs_start_transaction(rc->extent_root, 1);
3749         btrfs_commit_transaction(trans, rc->extent_root);
3750
3751         merge_reloc_roots(rc);
3752
3753         unset_reloc_control(rc);
3754
3755         trans = btrfs_start_transaction(rc->extent_root, 1);
3756         btrfs_commit_transaction(trans, rc->extent_root);
3757 out:
3758         if (rc) {
3759                 btrfs_stop_workers(&rc->workers);
3760                 kfree(rc);
3761         }
3762         while (!list_empty(&reloc_roots)) {
3763                 reloc_root = list_entry(reloc_roots.next,
3764                                         struct btrfs_root, root_list);
3765                 list_del(&reloc_root->root_list);
3766                 free_extent_buffer(reloc_root->node);
3767                 free_extent_buffer(reloc_root->commit_root);
3768                 kfree(reloc_root);
3769         }
3770         btrfs_free_path(path);
3771
3772         if (err == 0) {
3773                 /* cleanup orphan inode in data relocation tree */
3774                 fs_root = read_fs_root(root->fs_info,
3775                                        BTRFS_DATA_RELOC_TREE_OBJECTID);
3776                 if (IS_ERR(fs_root))
3777                         err = PTR_ERR(fs_root);
3778                 else
3779                         btrfs_orphan_cleanup(fs_root);
3780         }
3781         return err;
3782 }
3783
3784 /*
3785  * helper to add ordered checksum for data relocation.
3786  *
3787  * cloning checksum properly handles the nodatasum extents.
3788  * it also saves CPU time to re-calculate the checksum.
3789  */
3790 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
3791 {
3792         struct btrfs_ordered_sum *sums;
3793         struct btrfs_sector_sum *sector_sum;
3794         struct btrfs_ordered_extent *ordered;
3795         struct btrfs_root *root = BTRFS_I(inode)->root;
3796         size_t offset;
3797         int ret;
3798         u64 disk_bytenr;
3799         LIST_HEAD(list);
3800
3801         ordered = btrfs_lookup_ordered_extent(inode, file_pos);
3802         BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
3803
3804         disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
3805         ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
3806                                        disk_bytenr + len - 1, &list);
3807
3808         while (!list_empty(&list)) {
3809                 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
3810                 list_del_init(&sums->list);
3811
3812                 sector_sum = sums->sums;
3813                 sums->bytenr = ordered->start;
3814
3815                 offset = 0;
3816                 while (offset < sums->len) {
3817                         sector_sum->bytenr += ordered->start - disk_bytenr;
3818                         sector_sum++;
3819                         offset += root->sectorsize;
3820                 }
3821
3822                 btrfs_add_ordered_sum(inode, ordered, sums);
3823         }
3824         btrfs_put_ordered_extent(ordered);
3825         return 0;
3826 }