Btrfs: Avoid orphan inodes cleanup while replaying log
[linux-2.6.git] / fs / btrfs / relocation.c
1 /*
2  * Copyright (C) 2009 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/sched.h>
20 #include <linux/pagemap.h>
21 #include <linux/writeback.h>
22 #include <linux/blkdev.h>
23 #include <linux/rbtree.h>
24 #include "ctree.h"
25 #include "disk-io.h"
26 #include "transaction.h"
27 #include "volumes.h"
28 #include "locking.h"
29 #include "btrfs_inode.h"
30 #include "async-thread.h"
31
32 /*
33  * backref_node, mapping_node and tree_block start with this
34  */
35 struct tree_entry {
36         struct rb_node rb_node;
37         u64 bytenr;
38 };
39
40 /*
41  * present a tree block in the backref cache
42  */
43 struct backref_node {
44         struct rb_node rb_node;
45         u64 bytenr;
46         /* objectid tree block owner */
47         u64 owner;
48         /* list of upper level blocks reference this block */
49         struct list_head upper;
50         /* list of child blocks in the cache */
51         struct list_head lower;
52         /* NULL if this node is not tree root */
53         struct btrfs_root *root;
54         /* extent buffer got by COW the block */
55         struct extent_buffer *eb;
56         /* level of tree block */
57         unsigned int level:8;
58         /* 1 if the block is root of old snapshot */
59         unsigned int old_root:1;
60         /* 1 if no child blocks in the cache */
61         unsigned int lowest:1;
62         /* is the extent buffer locked */
63         unsigned int locked:1;
64         /* has the block been processed */
65         unsigned int processed:1;
66         /* have backrefs of this block been checked */
67         unsigned int checked:1;
68 };
69
70 /*
71  * present a block pointer in the backref cache
72  */
73 struct backref_edge {
74         struct list_head list[2];
75         struct backref_node *node[2];
76         u64 blockptr;
77 };
78
79 #define LOWER   0
80 #define UPPER   1
81
82 struct backref_cache {
83         /* red black tree of all backref nodes in the cache */
84         struct rb_root rb_root;
85         /* list of backref nodes with no child block in the cache */
86         struct list_head pending[BTRFS_MAX_LEVEL];
87         spinlock_t lock;
88 };
89
90 /*
91  * map address of tree root to tree
92  */
93 struct mapping_node {
94         struct rb_node rb_node;
95         u64 bytenr;
96         void *data;
97 };
98
99 struct mapping_tree {
100         struct rb_root rb_root;
101         spinlock_t lock;
102 };
103
104 /*
105  * present a tree block to process
106  */
107 struct tree_block {
108         struct rb_node rb_node;
109         u64 bytenr;
110         struct btrfs_key key;
111         unsigned int level:8;
112         unsigned int key_ready:1;
113 };
114
115 /* inode vector */
116 #define INODEVEC_SIZE 16
117
118 struct inodevec {
119         struct list_head list;
120         struct inode *inode[INODEVEC_SIZE];
121         int nr;
122 };
123
124 #define MAX_EXTENTS 128
125
126 struct file_extent_cluster {
127         u64 start;
128         u64 end;
129         u64 boundary[MAX_EXTENTS];
130         unsigned int nr;
131 };
132
133 struct reloc_control {
134         /* block group to relocate */
135         struct btrfs_block_group_cache *block_group;
136         /* extent tree */
137         struct btrfs_root *extent_root;
138         /* inode for moving data */
139         struct inode *data_inode;
140         struct btrfs_workers workers;
141         /* tree blocks have been processed */
142         struct extent_io_tree processed_blocks;
143         /* map start of tree root to corresponding reloc tree */
144         struct mapping_tree reloc_root_tree;
145         /* list of reloc trees */
146         struct list_head reloc_roots;
147         u64 search_start;
148         u64 extents_found;
149         u64 extents_skipped;
150         int stage;
151         int create_reloc_root;
152         unsigned int found_file_extent:1;
153         unsigned int found_old_snapshot:1;
154 };
155
156 /* stages of data relocation */
157 #define MOVE_DATA_EXTENTS       0
158 #define UPDATE_DATA_PTRS        1
159
160 /*
161  * merge reloc tree to corresponding fs tree in worker threads
162  */
163 struct async_merge {
164         struct btrfs_work work;
165         struct reloc_control *rc;
166         struct btrfs_root *root;
167         struct completion *done;
168         atomic_t *num_pending;
169 };
170
171 static void mapping_tree_init(struct mapping_tree *tree)
172 {
173         tree->rb_root.rb_node = NULL;
174         spin_lock_init(&tree->lock);
175 }
176
177 static void backref_cache_init(struct backref_cache *cache)
178 {
179         int i;
180         cache->rb_root.rb_node = NULL;
181         for (i = 0; i < BTRFS_MAX_LEVEL; i++)
182                 INIT_LIST_HEAD(&cache->pending[i]);
183         spin_lock_init(&cache->lock);
184 }
185
186 static void backref_node_init(struct backref_node *node)
187 {
188         memset(node, 0, sizeof(*node));
189         INIT_LIST_HEAD(&node->upper);
190         INIT_LIST_HEAD(&node->lower);
191         RB_CLEAR_NODE(&node->rb_node);
192 }
193
194 static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
195                                    struct rb_node *node)
196 {
197         struct rb_node **p = &root->rb_node;
198         struct rb_node *parent = NULL;
199         struct tree_entry *entry;
200
201         while (*p) {
202                 parent = *p;
203                 entry = rb_entry(parent, struct tree_entry, rb_node);
204
205                 if (bytenr < entry->bytenr)
206                         p = &(*p)->rb_left;
207                 else if (bytenr > entry->bytenr)
208                         p = &(*p)->rb_right;
209                 else
210                         return parent;
211         }
212
213         rb_link_node(node, parent, p);
214         rb_insert_color(node, root);
215         return NULL;
216 }
217
218 static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
219 {
220         struct rb_node *n = root->rb_node;
221         struct tree_entry *entry;
222
223         while (n) {
224                 entry = rb_entry(n, struct tree_entry, rb_node);
225
226                 if (bytenr < entry->bytenr)
227                         n = n->rb_left;
228                 else if (bytenr > entry->bytenr)
229                         n = n->rb_right;
230                 else
231                         return n;
232         }
233         return NULL;
234 }
235
236 /*
237  * walk up backref nodes until reach node presents tree root
238  */
239 static struct backref_node *walk_up_backref(struct backref_node *node,
240                                             struct backref_edge *edges[],
241                                             int *index)
242 {
243         struct backref_edge *edge;
244         int idx = *index;
245
246         while (!list_empty(&node->upper)) {
247                 edge = list_entry(node->upper.next,
248                                   struct backref_edge, list[LOWER]);
249                 edges[idx++] = edge;
250                 node = edge->node[UPPER];
251         }
252         *index = idx;
253         return node;
254 }
255
256 /*
257  * walk down backref nodes to find start of next reference path
258  */
259 static struct backref_node *walk_down_backref(struct backref_edge *edges[],
260                                               int *index)
261 {
262         struct backref_edge *edge;
263         struct backref_node *lower;
264         int idx = *index;
265
266         while (idx > 0) {
267                 edge = edges[idx - 1];
268                 lower = edge->node[LOWER];
269                 if (list_is_last(&edge->list[LOWER], &lower->upper)) {
270                         idx--;
271                         continue;
272                 }
273                 edge = list_entry(edge->list[LOWER].next,
274                                   struct backref_edge, list[LOWER]);
275                 edges[idx - 1] = edge;
276                 *index = idx;
277                 return edge->node[UPPER];
278         }
279         *index = 0;
280         return NULL;
281 }
282
283 static void drop_node_buffer(struct backref_node *node)
284 {
285         if (node->eb) {
286                 if (node->locked) {
287                         btrfs_tree_unlock(node->eb);
288                         node->locked = 0;
289                 }
290                 free_extent_buffer(node->eb);
291                 node->eb = NULL;
292         }
293 }
294
295 static void drop_backref_node(struct backref_cache *tree,
296                               struct backref_node *node)
297 {
298         BUG_ON(!node->lowest);
299         BUG_ON(!list_empty(&node->upper));
300
301         drop_node_buffer(node);
302         list_del(&node->lower);
303
304         rb_erase(&node->rb_node, &tree->rb_root);
305         kfree(node);
306 }
307
308 /*
309  * remove a backref node from the backref cache
310  */
311 static void remove_backref_node(struct backref_cache *cache,
312                                 struct backref_node *node)
313 {
314         struct backref_node *upper;
315         struct backref_edge *edge;
316
317         if (!node)
318                 return;
319
320         BUG_ON(!node->lowest);
321         while (!list_empty(&node->upper)) {
322                 edge = list_entry(node->upper.next, struct backref_edge,
323                                   list[LOWER]);
324                 upper = edge->node[UPPER];
325                 list_del(&edge->list[LOWER]);
326                 list_del(&edge->list[UPPER]);
327                 kfree(edge);
328                 /*
329                  * add the node to pending list if no other
330                  * child block cached.
331                  */
332                 if (list_empty(&upper->lower)) {
333                         list_add_tail(&upper->lower,
334                                       &cache->pending[upper->level]);
335                         upper->lowest = 1;
336                 }
337         }
338         drop_backref_node(cache, node);
339 }
340
341 /*
342  * find reloc tree by address of tree root
343  */
344 static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
345                                           u64 bytenr)
346 {
347         struct rb_node *rb_node;
348         struct mapping_node *node;
349         struct btrfs_root *root = NULL;
350
351         spin_lock(&rc->reloc_root_tree.lock);
352         rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
353         if (rb_node) {
354                 node = rb_entry(rb_node, struct mapping_node, rb_node);
355                 root = (struct btrfs_root *)node->data;
356         }
357         spin_unlock(&rc->reloc_root_tree.lock);
358         return root;
359 }
360
361 static int is_cowonly_root(u64 root_objectid)
362 {
363         if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
364             root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
365             root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
366             root_objectid == BTRFS_DEV_TREE_OBJECTID ||
367             root_objectid == BTRFS_TREE_LOG_OBJECTID ||
368             root_objectid == BTRFS_CSUM_TREE_OBJECTID)
369                 return 1;
370         return 0;
371 }
372
373 static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
374                                         u64 root_objectid)
375 {
376         struct btrfs_key key;
377
378         key.objectid = root_objectid;
379         key.type = BTRFS_ROOT_ITEM_KEY;
380         if (is_cowonly_root(root_objectid))
381                 key.offset = 0;
382         else
383                 key.offset = (u64)-1;
384
385         return btrfs_read_fs_root_no_name(fs_info, &key);
386 }
387
388 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
389 static noinline_for_stack
390 struct btrfs_root *find_tree_root(struct reloc_control *rc,
391                                   struct extent_buffer *leaf,
392                                   struct btrfs_extent_ref_v0 *ref0)
393 {
394         struct btrfs_root *root;
395         u64 root_objectid = btrfs_ref_root_v0(leaf, ref0);
396         u64 generation = btrfs_ref_generation_v0(leaf, ref0);
397
398         BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID);
399
400         root = read_fs_root(rc->extent_root->fs_info, root_objectid);
401         BUG_ON(IS_ERR(root));
402
403         if (root->ref_cows &&
404             generation != btrfs_root_generation(&root->root_item))
405                 return NULL;
406
407         return root;
408 }
409 #endif
410
411 static noinline_for_stack
412 int find_inline_backref(struct extent_buffer *leaf, int slot,
413                         unsigned long *ptr, unsigned long *end)
414 {
415         struct btrfs_extent_item *ei;
416         struct btrfs_tree_block_info *bi;
417         u32 item_size;
418
419         item_size = btrfs_item_size_nr(leaf, slot);
420 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
421         if (item_size < sizeof(*ei)) {
422                 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
423                 return 1;
424         }
425 #endif
426         ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
427         WARN_ON(!(btrfs_extent_flags(leaf, ei) &
428                   BTRFS_EXTENT_FLAG_TREE_BLOCK));
429
430         if (item_size <= sizeof(*ei) + sizeof(*bi)) {
431                 WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
432                 return 1;
433         }
434
435         bi = (struct btrfs_tree_block_info *)(ei + 1);
436         *ptr = (unsigned long)(bi + 1);
437         *end = (unsigned long)ei + item_size;
438         return 0;
439 }
440
441 /*
442  * build backref tree for a given tree block. root of the backref tree
443  * corresponds the tree block, leaves of the backref tree correspond
444  * roots of b-trees that reference the tree block.
445  *
446  * the basic idea of this function is check backrefs of a given block
447  * to find upper level blocks that refernece the block, and then check
448  * bakcrefs of these upper level blocks recursively. the recursion stop
449  * when tree root is reached or backrefs for the block is cached.
450  *
451  * NOTE: if we find backrefs for a block are cached, we know backrefs
452  * for all upper level blocks that directly/indirectly reference the
453  * block are also cached.
454  */
455 static struct backref_node *build_backref_tree(struct reloc_control *rc,
456                                                struct backref_cache *cache,
457                                                struct btrfs_key *node_key,
458                                                int level, u64 bytenr)
459 {
460         struct btrfs_path *path1;
461         struct btrfs_path *path2;
462         struct extent_buffer *eb;
463         struct btrfs_root *root;
464         struct backref_node *cur;
465         struct backref_node *upper;
466         struct backref_node *lower;
467         struct backref_node *node = NULL;
468         struct backref_node *exist = NULL;
469         struct backref_edge *edge;
470         struct rb_node *rb_node;
471         struct btrfs_key key;
472         unsigned long end;
473         unsigned long ptr;
474         LIST_HEAD(list);
475         int ret;
476         int err = 0;
477
478         path1 = btrfs_alloc_path();
479         path2 = btrfs_alloc_path();
480         if (!path1 || !path2) {
481                 err = -ENOMEM;
482                 goto out;
483         }
484
485         node = kmalloc(sizeof(*node), GFP_NOFS);
486         if (!node) {
487                 err = -ENOMEM;
488                 goto out;
489         }
490
491         backref_node_init(node);
492         node->bytenr = bytenr;
493         node->owner = 0;
494         node->level = level;
495         node->lowest = 1;
496         cur = node;
497 again:
498         end = 0;
499         ptr = 0;
500         key.objectid = cur->bytenr;
501         key.type = BTRFS_EXTENT_ITEM_KEY;
502         key.offset = (u64)-1;
503
504         path1->search_commit_root = 1;
505         path1->skip_locking = 1;
506         ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
507                                 0, 0);
508         if (ret < 0) {
509                 err = ret;
510                 goto out;
511         }
512         BUG_ON(!ret || !path1->slots[0]);
513
514         path1->slots[0]--;
515
516         WARN_ON(cur->checked);
517         if (!list_empty(&cur->upper)) {
518                 /*
519                  * the backref was added previously when processsing
520                  * backref of type BTRFS_TREE_BLOCK_REF_KEY
521                  */
522                 BUG_ON(!list_is_singular(&cur->upper));
523                 edge = list_entry(cur->upper.next, struct backref_edge,
524                                   list[LOWER]);
525                 BUG_ON(!list_empty(&edge->list[UPPER]));
526                 exist = edge->node[UPPER];
527                 /*
528                  * add the upper level block to pending list if we need
529                  * check its backrefs
530                  */
531                 if (!exist->checked)
532                         list_add_tail(&edge->list[UPPER], &list);
533         } else {
534                 exist = NULL;
535         }
536
537         while (1) {
538                 cond_resched();
539                 eb = path1->nodes[0];
540
541                 if (ptr >= end) {
542                         if (path1->slots[0] >= btrfs_header_nritems(eb)) {
543                                 ret = btrfs_next_leaf(rc->extent_root, path1);
544                                 if (ret < 0) {
545                                         err = ret;
546                                         goto out;
547                                 }
548                                 if (ret > 0)
549                                         break;
550                                 eb = path1->nodes[0];
551                         }
552
553                         btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
554                         if (key.objectid != cur->bytenr) {
555                                 WARN_ON(exist);
556                                 break;
557                         }
558
559                         if (key.type == BTRFS_EXTENT_ITEM_KEY) {
560                                 ret = find_inline_backref(eb, path1->slots[0],
561                                                           &ptr, &end);
562                                 if (ret)
563                                         goto next;
564                         }
565                 }
566
567                 if (ptr < end) {
568                         /* update key for inline back ref */
569                         struct btrfs_extent_inline_ref *iref;
570                         iref = (struct btrfs_extent_inline_ref *)ptr;
571                         key.type = btrfs_extent_inline_ref_type(eb, iref);
572                         key.offset = btrfs_extent_inline_ref_offset(eb, iref);
573                         WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
574                                 key.type != BTRFS_SHARED_BLOCK_REF_KEY);
575                 }
576
577                 if (exist &&
578                     ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
579                       exist->owner == key.offset) ||
580                      (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
581                       exist->bytenr == key.offset))) {
582                         exist = NULL;
583                         goto next;
584                 }
585
586 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
587                 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
588                     key.type == BTRFS_EXTENT_REF_V0_KEY) {
589                         if (key.objectid == key.offset &&
590                             key.type == BTRFS_EXTENT_REF_V0_KEY) {
591                                 struct btrfs_extent_ref_v0 *ref0;
592                                 ref0 = btrfs_item_ptr(eb, path1->slots[0],
593                                                 struct btrfs_extent_ref_v0);
594                                 root = find_tree_root(rc, eb, ref0);
595                                 if (root)
596                                         cur->root = root;
597                                 else
598                                         cur->old_root = 1;
599                                 break;
600                         }
601 #else
602                 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
603                 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
604 #endif
605                         if (key.objectid == key.offset) {
606                                 /*
607                                  * only root blocks of reloc trees use
608                                  * backref of this type.
609                                  */
610                                 root = find_reloc_root(rc, cur->bytenr);
611                                 BUG_ON(!root);
612                                 cur->root = root;
613                                 break;
614                         }
615
616                         edge = kzalloc(sizeof(*edge), GFP_NOFS);
617                         if (!edge) {
618                                 err = -ENOMEM;
619                                 goto out;
620                         }
621                         rb_node = tree_search(&cache->rb_root, key.offset);
622                         if (!rb_node) {
623                                 upper = kmalloc(sizeof(*upper), GFP_NOFS);
624                                 if (!upper) {
625                                         kfree(edge);
626                                         err = -ENOMEM;
627                                         goto out;
628                                 }
629                                 backref_node_init(upper);
630                                 upper->bytenr = key.offset;
631                                 upper->owner = 0;
632                                 upper->level = cur->level + 1;
633                                 /*
634                                  *  backrefs for the upper level block isn't
635                                  *  cached, add the block to pending list
636                                  */
637                                 list_add_tail(&edge->list[UPPER], &list);
638                         } else {
639                                 upper = rb_entry(rb_node, struct backref_node,
640                                                  rb_node);
641                                 INIT_LIST_HEAD(&edge->list[UPPER]);
642                         }
643                         list_add(&edge->list[LOWER], &cur->upper);
644                         edge->node[UPPER] = upper;
645                         edge->node[LOWER] = cur;
646
647                         goto next;
648                 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
649                         goto next;
650                 }
651
652                 /* key.type == BTRFS_TREE_BLOCK_REF_KEY */
653                 root = read_fs_root(rc->extent_root->fs_info, key.offset);
654                 if (IS_ERR(root)) {
655                         err = PTR_ERR(root);
656                         goto out;
657                 }
658
659                 if (btrfs_root_level(&root->root_item) == cur->level) {
660                         /* tree root */
661                         BUG_ON(btrfs_root_bytenr(&root->root_item) !=
662                                cur->bytenr);
663                         cur->root = root;
664                         break;
665                 }
666
667                 level = cur->level + 1;
668
669                 /*
670                  * searching the tree to find upper level blocks
671                  * reference the block.
672                  */
673                 path2->search_commit_root = 1;
674                 path2->skip_locking = 1;
675                 path2->lowest_level = level;
676                 ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
677                 path2->lowest_level = 0;
678                 if (ret < 0) {
679                         err = ret;
680                         goto out;
681                 }
682                 if (ret > 0 && path2->slots[level] > 0)
683                         path2->slots[level]--;
684
685                 eb = path2->nodes[level];
686                 WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
687                         cur->bytenr);
688
689                 lower = cur;
690                 for (; level < BTRFS_MAX_LEVEL; level++) {
691                         if (!path2->nodes[level]) {
692                                 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
693                                        lower->bytenr);
694                                 lower->root = root;
695                                 break;
696                         }
697
698                         edge = kzalloc(sizeof(*edge), GFP_NOFS);
699                         if (!edge) {
700                                 err = -ENOMEM;
701                                 goto out;
702                         }
703
704                         eb = path2->nodes[level];
705                         rb_node = tree_search(&cache->rb_root, eb->start);
706                         if (!rb_node) {
707                                 upper = kmalloc(sizeof(*upper), GFP_NOFS);
708                                 if (!upper) {
709                                         kfree(edge);
710                                         err = -ENOMEM;
711                                         goto out;
712                                 }
713                                 backref_node_init(upper);
714                                 upper->bytenr = eb->start;
715                                 upper->owner = btrfs_header_owner(eb);
716                                 upper->level = lower->level + 1;
717
718                                 /*
719                                  * if we know the block isn't shared
720                                  * we can void checking its backrefs.
721                                  */
722                                 if (btrfs_block_can_be_shared(root, eb))
723                                         upper->checked = 0;
724                                 else
725                                         upper->checked = 1;
726
727                                 /*
728                                  * add the block to pending list if we
729                                  * need check its backrefs. only block
730                                  * at 'cur->level + 1' is added to the
731                                  * tail of pending list. this guarantees
732                                  * we check backrefs from lower level
733                                  * blocks to upper level blocks.
734                                  */
735                                 if (!upper->checked &&
736                                     level == cur->level + 1) {
737                                         list_add_tail(&edge->list[UPPER],
738                                                       &list);
739                                 } else
740                                         INIT_LIST_HEAD(&edge->list[UPPER]);
741                         } else {
742                                 upper = rb_entry(rb_node, struct backref_node,
743                                                  rb_node);
744                                 BUG_ON(!upper->checked);
745                                 INIT_LIST_HEAD(&edge->list[UPPER]);
746                         }
747                         list_add_tail(&edge->list[LOWER], &lower->upper);
748                         edge->node[UPPER] = upper;
749                         edge->node[LOWER] = lower;
750
751                         if (rb_node)
752                                 break;
753                         lower = upper;
754                         upper = NULL;
755                 }
756                 btrfs_release_path(root, path2);
757 next:
758                 if (ptr < end) {
759                         ptr += btrfs_extent_inline_ref_size(key.type);
760                         if (ptr >= end) {
761                                 WARN_ON(ptr > end);
762                                 ptr = 0;
763                                 end = 0;
764                         }
765                 }
766                 if (ptr >= end)
767                         path1->slots[0]++;
768         }
769         btrfs_release_path(rc->extent_root, path1);
770
771         cur->checked = 1;
772         WARN_ON(exist);
773
774         /* the pending list isn't empty, take the first block to process */
775         if (!list_empty(&list)) {
776                 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
777                 list_del_init(&edge->list[UPPER]);
778                 cur = edge->node[UPPER];
779                 goto again;
780         }
781
782         /*
783          * everything goes well, connect backref nodes and insert backref nodes
784          * into the cache.
785          */
786         BUG_ON(!node->checked);
787         rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
788         BUG_ON(rb_node);
789
790         list_for_each_entry(edge, &node->upper, list[LOWER])
791                 list_add_tail(&edge->list[UPPER], &list);
792
793         while (!list_empty(&list)) {
794                 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
795                 list_del_init(&edge->list[UPPER]);
796                 upper = edge->node[UPPER];
797
798                 if (!RB_EMPTY_NODE(&upper->rb_node)) {
799                         if (upper->lowest) {
800                                 list_del_init(&upper->lower);
801                                 upper->lowest = 0;
802                         }
803
804                         list_add_tail(&edge->list[UPPER], &upper->lower);
805                         continue;
806                 }
807
808                 BUG_ON(!upper->checked);
809                 rb_node = tree_insert(&cache->rb_root, upper->bytenr,
810                                       &upper->rb_node);
811                 BUG_ON(rb_node);
812
813                 list_add_tail(&edge->list[UPPER], &upper->lower);
814
815                 list_for_each_entry(edge, &upper->upper, list[LOWER])
816                         list_add_tail(&edge->list[UPPER], &list);
817         }
818 out:
819         btrfs_free_path(path1);
820         btrfs_free_path(path2);
821         if (err) {
822                 INIT_LIST_HEAD(&list);
823                 upper = node;
824                 while (upper) {
825                         if (RB_EMPTY_NODE(&upper->rb_node)) {
826                                 list_splice_tail(&upper->upper, &list);
827                                 kfree(upper);
828                         }
829
830                         if (list_empty(&list))
831                                 break;
832
833                         edge = list_entry(list.next, struct backref_edge,
834                                           list[LOWER]);
835                         upper = edge->node[UPPER];
836                         kfree(edge);
837                 }
838                 return ERR_PTR(err);
839         }
840         return node;
841 }
842
843 /*
844  * helper to add 'address of tree root -> reloc tree' mapping
845  */
846 static int __add_reloc_root(struct btrfs_root *root)
847 {
848         struct rb_node *rb_node;
849         struct mapping_node *node;
850         struct reloc_control *rc = root->fs_info->reloc_ctl;
851
852         node = kmalloc(sizeof(*node), GFP_NOFS);
853         BUG_ON(!node);
854
855         node->bytenr = root->node->start;
856         node->data = root;
857
858         spin_lock(&rc->reloc_root_tree.lock);
859         rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
860                               node->bytenr, &node->rb_node);
861         spin_unlock(&rc->reloc_root_tree.lock);
862         BUG_ON(rb_node);
863
864         list_add_tail(&root->root_list, &rc->reloc_roots);
865         return 0;
866 }
867
868 /*
869  * helper to update/delete the 'address of tree root -> reloc tree'
870  * mapping
871  */
872 static int __update_reloc_root(struct btrfs_root *root, int del)
873 {
874         struct rb_node *rb_node;
875         struct mapping_node *node = NULL;
876         struct reloc_control *rc = root->fs_info->reloc_ctl;
877
878         spin_lock(&rc->reloc_root_tree.lock);
879         rb_node = tree_search(&rc->reloc_root_tree.rb_root,
880                               root->commit_root->start);
881         if (rb_node) {
882                 node = rb_entry(rb_node, struct mapping_node, rb_node);
883                 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
884         }
885         spin_unlock(&rc->reloc_root_tree.lock);
886
887         BUG_ON((struct btrfs_root *)node->data != root);
888
889         if (!del) {
890                 spin_lock(&rc->reloc_root_tree.lock);
891                 node->bytenr = root->node->start;
892                 rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
893                                       node->bytenr, &node->rb_node);
894                 spin_unlock(&rc->reloc_root_tree.lock);
895                 BUG_ON(rb_node);
896         } else {
897                 list_del_init(&root->root_list);
898                 kfree(node);
899         }
900         return 0;
901 }
902
903 /*
904  * create reloc tree for a given fs tree. reloc tree is just a
905  * snapshot of the fs tree with special root objectid.
906  */
907 int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
908                           struct btrfs_root *root)
909 {
910         struct btrfs_root *reloc_root;
911         struct extent_buffer *eb;
912         struct btrfs_root_item *root_item;
913         struct btrfs_key root_key;
914         int ret;
915
916         if (root->reloc_root) {
917                 reloc_root = root->reloc_root;
918                 reloc_root->last_trans = trans->transid;
919                 return 0;
920         }
921
922         if (!root->fs_info->reloc_ctl ||
923             !root->fs_info->reloc_ctl->create_reloc_root ||
924             root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
925                 return 0;
926
927         root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
928         BUG_ON(!root_item);
929
930         root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
931         root_key.type = BTRFS_ROOT_ITEM_KEY;
932         root_key.offset = root->root_key.objectid;
933
934         ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
935                               BTRFS_TREE_RELOC_OBJECTID);
936         BUG_ON(ret);
937
938         btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
939         memcpy(root_item, &root->root_item, sizeof(*root_item));
940         btrfs_set_root_refs(root_item, 1);
941         btrfs_set_root_bytenr(root_item, eb->start);
942         btrfs_set_root_level(root_item, btrfs_header_level(eb));
943         btrfs_set_root_generation(root_item, trans->transid);
944         memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key));
945         root_item->drop_level = 0;
946
947         btrfs_tree_unlock(eb);
948         free_extent_buffer(eb);
949
950         ret = btrfs_insert_root(trans, root->fs_info->tree_root,
951                                 &root_key, root_item);
952         BUG_ON(ret);
953         kfree(root_item);
954
955         reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
956                                                  &root_key);
957         BUG_ON(IS_ERR(reloc_root));
958         reloc_root->last_trans = trans->transid;
959
960         __add_reloc_root(reloc_root);
961         root->reloc_root = reloc_root;
962         return 0;
963 }
964
965 /*
966  * update root item of reloc tree
967  */
968 int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
969                             struct btrfs_root *root)
970 {
971         struct btrfs_root *reloc_root;
972         struct btrfs_root_item *root_item;
973         int del = 0;
974         int ret;
975
976         if (!root->reloc_root)
977                 return 0;
978
979         reloc_root = root->reloc_root;
980         root_item = &reloc_root->root_item;
981
982         if (btrfs_root_refs(root_item) == 0) {
983                 root->reloc_root = NULL;
984                 del = 1;
985         }
986
987         __update_reloc_root(reloc_root, del);
988
989         if (reloc_root->commit_root != reloc_root->node) {
990                 btrfs_set_root_node(root_item, reloc_root->node);
991                 free_extent_buffer(reloc_root->commit_root);
992                 reloc_root->commit_root = btrfs_root_node(reloc_root);
993         }
994
995         ret = btrfs_update_root(trans, root->fs_info->tree_root,
996                                 &reloc_root->root_key, root_item);
997         BUG_ON(ret);
998         return 0;
999 }
1000
1001 /*
1002  * helper to find first cached inode with inode number >= objectid
1003  * in a subvolume
1004  */
1005 static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
1006 {
1007         struct rb_node *node;
1008         struct rb_node *prev;
1009         struct btrfs_inode *entry;
1010         struct inode *inode;
1011
1012         spin_lock(&root->inode_lock);
1013 again:
1014         node = root->inode_tree.rb_node;
1015         prev = NULL;
1016         while (node) {
1017                 prev = node;
1018                 entry = rb_entry(node, struct btrfs_inode, rb_node);
1019
1020                 if (objectid < entry->vfs_inode.i_ino)
1021                         node = node->rb_left;
1022                 else if (objectid > entry->vfs_inode.i_ino)
1023                         node = node->rb_right;
1024                 else
1025                         break;
1026         }
1027         if (!node) {
1028                 while (prev) {
1029                         entry = rb_entry(prev, struct btrfs_inode, rb_node);
1030                         if (objectid <= entry->vfs_inode.i_ino) {
1031                                 node = prev;
1032                                 break;
1033                         }
1034                         prev = rb_next(prev);
1035                 }
1036         }
1037         while (node) {
1038                 entry = rb_entry(node, struct btrfs_inode, rb_node);
1039                 inode = igrab(&entry->vfs_inode);
1040                 if (inode) {
1041                         spin_unlock(&root->inode_lock);
1042                         return inode;
1043                 }
1044
1045                 objectid = entry->vfs_inode.i_ino + 1;
1046                 if (cond_resched_lock(&root->inode_lock))
1047                         goto again;
1048
1049                 node = rb_next(node);
1050         }
1051         spin_unlock(&root->inode_lock);
1052         return NULL;
1053 }
1054
1055 static int in_block_group(u64 bytenr,
1056                           struct btrfs_block_group_cache *block_group)
1057 {
1058         if (bytenr >= block_group->key.objectid &&
1059             bytenr < block_group->key.objectid + block_group->key.offset)
1060                 return 1;
1061         return 0;
1062 }
1063
1064 /*
1065  * get new location of data
1066  */
1067 static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1068                             u64 bytenr, u64 num_bytes)
1069 {
1070         struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1071         struct btrfs_path *path;
1072         struct btrfs_file_extent_item *fi;
1073         struct extent_buffer *leaf;
1074         int ret;
1075
1076         path = btrfs_alloc_path();
1077         if (!path)
1078                 return -ENOMEM;
1079
1080         bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1081         ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
1082                                        bytenr, 0);
1083         if (ret < 0)
1084                 goto out;
1085         if (ret > 0) {
1086                 ret = -ENOENT;
1087                 goto out;
1088         }
1089
1090         leaf = path->nodes[0];
1091         fi = btrfs_item_ptr(leaf, path->slots[0],
1092                             struct btrfs_file_extent_item);
1093
1094         BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1095                btrfs_file_extent_compression(leaf, fi) ||
1096                btrfs_file_extent_encryption(leaf, fi) ||
1097                btrfs_file_extent_other_encoding(leaf, fi));
1098
1099         if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
1100                 ret = 1;
1101                 goto out;
1102         }
1103
1104         if (new_bytenr)
1105                 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1106         ret = 0;
1107 out:
1108         btrfs_free_path(path);
1109         return ret;
1110 }
1111
1112 /*
1113  * update file extent items in the tree leaf to point to
1114  * the new locations.
1115  */
1116 static int replace_file_extents(struct btrfs_trans_handle *trans,
1117                                 struct reloc_control *rc,
1118                                 struct btrfs_root *root,
1119                                 struct extent_buffer *leaf,
1120                                 struct list_head *inode_list)
1121 {
1122         struct btrfs_key key;
1123         struct btrfs_file_extent_item *fi;
1124         struct inode *inode = NULL;
1125         struct inodevec *ivec = NULL;
1126         u64 parent;
1127         u64 bytenr;
1128         u64 new_bytenr;
1129         u64 num_bytes;
1130         u64 end;
1131         u32 nritems;
1132         u32 i;
1133         int ret;
1134         int first = 1;
1135         int dirty = 0;
1136
1137         if (rc->stage != UPDATE_DATA_PTRS)
1138                 return 0;
1139
1140         /* reloc trees always use full backref */
1141         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1142                 parent = leaf->start;
1143         else
1144                 parent = 0;
1145
1146         nritems = btrfs_header_nritems(leaf);
1147         for (i = 0; i < nritems; i++) {
1148                 cond_resched();
1149                 btrfs_item_key_to_cpu(leaf, &key, i);
1150                 if (key.type != BTRFS_EXTENT_DATA_KEY)
1151                         continue;
1152                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1153                 if (btrfs_file_extent_type(leaf, fi) ==
1154                     BTRFS_FILE_EXTENT_INLINE)
1155                         continue;
1156                 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1157                 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1158                 if (bytenr == 0)
1159                         continue;
1160                 if (!in_block_group(bytenr, rc->block_group))
1161                         continue;
1162
1163                 /*
1164                  * if we are modifying block in fs tree, wait for readpage
1165                  * to complete and drop the extent cache
1166                  */
1167                 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1168                         if (!ivec || ivec->nr == INODEVEC_SIZE) {
1169                                 ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
1170                                 BUG_ON(!ivec);
1171                                 ivec->nr = 0;
1172                                 list_add_tail(&ivec->list, inode_list);
1173                         }
1174                         if (first) {
1175                                 inode = find_next_inode(root, key.objectid);
1176                                 if (inode)
1177                                         ivec->inode[ivec->nr++] = inode;
1178                                 first = 0;
1179                         } else if (inode && inode->i_ino < key.objectid) {
1180                                 inode = find_next_inode(root, key.objectid);
1181                                 if (inode)
1182                                         ivec->inode[ivec->nr++] = inode;
1183                         }
1184                         if (inode && inode->i_ino == key.objectid) {
1185                                 end = key.offset +
1186                                       btrfs_file_extent_num_bytes(leaf, fi);
1187                                 WARN_ON(!IS_ALIGNED(key.offset,
1188                                                     root->sectorsize));
1189                                 WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1190                                 end--;
1191                                 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1192                                                       key.offset, end,
1193                                                       GFP_NOFS);
1194                                 if (!ret)
1195                                         continue;
1196
1197                                 btrfs_drop_extent_cache(inode, key.offset, end,
1198                                                         1);
1199                                 unlock_extent(&BTRFS_I(inode)->io_tree,
1200                                               key.offset, end, GFP_NOFS);
1201                         }
1202                 }
1203
1204                 ret = get_new_location(rc->data_inode, &new_bytenr,
1205                                        bytenr, num_bytes);
1206                 if (ret > 0)
1207                         continue;
1208                 BUG_ON(ret < 0);
1209
1210                 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1211                 dirty = 1;
1212
1213                 key.offset -= btrfs_file_extent_offset(leaf, fi);
1214                 ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
1215                                            num_bytes, parent,
1216                                            btrfs_header_owner(leaf),
1217                                            key.objectid, key.offset);
1218                 BUG_ON(ret);
1219
1220                 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1221                                         parent, btrfs_header_owner(leaf),
1222                                         key.objectid, key.offset);
1223                 BUG_ON(ret);
1224         }
1225         if (dirty)
1226                 btrfs_mark_buffer_dirty(leaf);
1227         return 0;
1228 }
1229
1230 static noinline_for_stack
1231 int memcmp_node_keys(struct extent_buffer *eb, int slot,
1232                      struct btrfs_path *path, int level)
1233 {
1234         struct btrfs_disk_key key1;
1235         struct btrfs_disk_key key2;
1236         btrfs_node_key(eb, &key1, slot);
1237         btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1238         return memcmp(&key1, &key2, sizeof(key1));
1239 }
1240
1241 /*
1242  * try to replace tree blocks in fs tree with the new blocks
1243  * in reloc tree. tree blocks haven't been modified since the
1244  * reloc tree was create can be replaced.
1245  *
1246  * if a block was replaced, level of the block + 1 is returned.
1247  * if no block got replaced, 0 is returned. if there are other
1248  * errors, a negative error number is returned.
1249  */
1250 static int replace_path(struct btrfs_trans_handle *trans,
1251                         struct btrfs_root *dest, struct btrfs_root *src,
1252                         struct btrfs_path *path, struct btrfs_key *next_key,
1253                         struct extent_buffer **leaf,
1254                         int lowest_level, int max_level)
1255 {
1256         struct extent_buffer *eb;
1257         struct extent_buffer *parent;
1258         struct btrfs_key key;
1259         u64 old_bytenr;
1260         u64 new_bytenr;
1261         u64 old_ptr_gen;
1262         u64 new_ptr_gen;
1263         u64 last_snapshot;
1264         u32 blocksize;
1265         int level;
1266         int ret;
1267         int slot;
1268
1269         BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1270         BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1271         BUG_ON(lowest_level > 1 && leaf);
1272
1273         last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1274
1275         slot = path->slots[lowest_level];
1276         btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1277
1278         eb = btrfs_lock_root_node(dest);
1279         btrfs_set_lock_blocking(eb);
1280         level = btrfs_header_level(eb);
1281
1282         if (level < lowest_level) {
1283                 btrfs_tree_unlock(eb);
1284                 free_extent_buffer(eb);
1285                 return 0;
1286         }
1287
1288         ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1289         BUG_ON(ret);
1290         btrfs_set_lock_blocking(eb);
1291
1292         if (next_key) {
1293                 next_key->objectid = (u64)-1;
1294                 next_key->type = (u8)-1;
1295                 next_key->offset = (u64)-1;
1296         }
1297
1298         parent = eb;
1299         while (1) {
1300                 level = btrfs_header_level(parent);
1301                 BUG_ON(level < lowest_level);
1302
1303                 ret = btrfs_bin_search(parent, &key, level, &slot);
1304                 if (ret && slot > 0)
1305                         slot--;
1306
1307                 if (next_key && slot + 1 < btrfs_header_nritems(parent))
1308                         btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1309
1310                 old_bytenr = btrfs_node_blockptr(parent, slot);
1311                 blocksize = btrfs_level_size(dest, level - 1);
1312                 old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1313
1314                 if (level <= max_level) {
1315                         eb = path->nodes[level];
1316                         new_bytenr = btrfs_node_blockptr(eb,
1317                                                         path->slots[level]);
1318                         new_ptr_gen = btrfs_node_ptr_generation(eb,
1319                                                         path->slots[level]);
1320                 } else {
1321                         new_bytenr = 0;
1322                         new_ptr_gen = 0;
1323                 }
1324
1325                 if (new_bytenr > 0 && new_bytenr == old_bytenr) {
1326                         WARN_ON(1);
1327                         ret = level;
1328                         break;
1329                 }
1330
1331                 if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1332                     memcmp_node_keys(parent, slot, path, level)) {
1333                         if (level <= lowest_level && !leaf) {
1334                                 ret = 0;
1335                                 break;
1336                         }
1337
1338                         eb = read_tree_block(dest, old_bytenr, blocksize,
1339                                              old_ptr_gen);
1340                         btrfs_tree_lock(eb);
1341                         ret = btrfs_cow_block(trans, dest, eb, parent,
1342                                               slot, &eb);
1343                         BUG_ON(ret);
1344                         btrfs_set_lock_blocking(eb);
1345
1346                         if (level <= lowest_level) {
1347                                 *leaf = eb;
1348                                 ret = 0;
1349                                 break;
1350                         }
1351
1352                         btrfs_tree_unlock(parent);
1353                         free_extent_buffer(parent);
1354
1355                         parent = eb;
1356                         continue;
1357                 }
1358
1359                 btrfs_node_key_to_cpu(path->nodes[level], &key,
1360                                       path->slots[level]);
1361                 btrfs_release_path(src, path);
1362
1363                 path->lowest_level = level;
1364                 ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1365                 path->lowest_level = 0;
1366                 BUG_ON(ret);
1367
1368                 /*
1369                  * swap blocks in fs tree and reloc tree.
1370                  */
1371                 btrfs_set_node_blockptr(parent, slot, new_bytenr);
1372                 btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1373                 btrfs_mark_buffer_dirty(parent);
1374
1375                 btrfs_set_node_blockptr(path->nodes[level],
1376                                         path->slots[level], old_bytenr);
1377                 btrfs_set_node_ptr_generation(path->nodes[level],
1378                                               path->slots[level], old_ptr_gen);
1379                 btrfs_mark_buffer_dirty(path->nodes[level]);
1380
1381                 ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
1382                                         path->nodes[level]->start,
1383                                         src->root_key.objectid, level - 1, 0);
1384                 BUG_ON(ret);
1385                 ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
1386                                         0, dest->root_key.objectid, level - 1,
1387                                         0);
1388                 BUG_ON(ret);
1389
1390                 ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
1391                                         path->nodes[level]->start,
1392                                         src->root_key.objectid, level - 1, 0);
1393                 BUG_ON(ret);
1394
1395                 ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
1396                                         0, dest->root_key.objectid, level - 1,
1397                                         0);
1398                 BUG_ON(ret);
1399
1400                 btrfs_unlock_up_safe(path, 0);
1401
1402                 ret = level;
1403                 break;
1404         }
1405         btrfs_tree_unlock(parent);
1406         free_extent_buffer(parent);
1407         return ret;
1408 }
1409
1410 /*
1411  * helper to find next relocated block in reloc tree
1412  */
1413 static noinline_for_stack
1414 int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1415                        int *level)
1416 {
1417         struct extent_buffer *eb;
1418         int i;
1419         u64 last_snapshot;
1420         u32 nritems;
1421
1422         last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1423
1424         for (i = 0; i < *level; i++) {
1425                 free_extent_buffer(path->nodes[i]);
1426                 path->nodes[i] = NULL;
1427         }
1428
1429         for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1430                 eb = path->nodes[i];
1431                 nritems = btrfs_header_nritems(eb);
1432                 while (path->slots[i] + 1 < nritems) {
1433                         path->slots[i]++;
1434                         if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
1435                             last_snapshot)
1436                                 continue;
1437
1438                         *level = i;
1439                         return 0;
1440                 }
1441                 free_extent_buffer(path->nodes[i]);
1442                 path->nodes[i] = NULL;
1443         }
1444         return 1;
1445 }
1446
1447 /*
1448  * walk down reloc tree to find relocated block of lowest level
1449  */
1450 static noinline_for_stack
1451 int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1452                          int *level)
1453 {
1454         struct extent_buffer *eb = NULL;
1455         int i;
1456         u64 bytenr;
1457         u64 ptr_gen = 0;
1458         u64 last_snapshot;
1459         u32 blocksize;
1460         u32 nritems;
1461
1462         last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1463
1464         for (i = *level; i > 0; i--) {
1465                 eb = path->nodes[i];
1466                 nritems = btrfs_header_nritems(eb);
1467                 while (path->slots[i] < nritems) {
1468                         ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
1469                         if (ptr_gen > last_snapshot)
1470                                 break;
1471                         path->slots[i]++;
1472                 }
1473                 if (path->slots[i] >= nritems) {
1474                         if (i == *level)
1475                                 break;
1476                         *level = i + 1;
1477                         return 0;
1478                 }
1479                 if (i == 1) {
1480                         *level = i;
1481                         return 0;
1482                 }
1483
1484                 bytenr = btrfs_node_blockptr(eb, path->slots[i]);
1485                 blocksize = btrfs_level_size(root, i - 1);
1486                 eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
1487                 BUG_ON(btrfs_header_level(eb) != i - 1);
1488                 path->nodes[i - 1] = eb;
1489                 path->slots[i - 1] = 0;
1490         }
1491         return 1;
1492 }
1493
1494 /*
1495  * invalidate extent cache for file extents whose key in range of
1496  * [min_key, max_key)
1497  */
1498 static int invalidate_extent_cache(struct btrfs_root *root,
1499                                    struct btrfs_key *min_key,
1500                                    struct btrfs_key *max_key)
1501 {
1502         struct inode *inode = NULL;
1503         u64 objectid;
1504         u64 start, end;
1505
1506         objectid = min_key->objectid;
1507         while (1) {
1508                 cond_resched();
1509                 iput(inode);
1510
1511                 if (objectid > max_key->objectid)
1512                         break;
1513
1514                 inode = find_next_inode(root, objectid);
1515                 if (!inode)
1516                         break;
1517
1518                 if (inode->i_ino > max_key->objectid) {
1519                         iput(inode);
1520                         break;
1521                 }
1522
1523                 objectid = inode->i_ino + 1;
1524                 if (!S_ISREG(inode->i_mode))
1525                         continue;
1526
1527                 if (unlikely(min_key->objectid == inode->i_ino)) {
1528                         if (min_key->type > BTRFS_EXTENT_DATA_KEY)
1529                                 continue;
1530                         if (min_key->type < BTRFS_EXTENT_DATA_KEY)
1531                                 start = 0;
1532                         else {
1533                                 start = min_key->offset;
1534                                 WARN_ON(!IS_ALIGNED(start, root->sectorsize));
1535                         }
1536                 } else {
1537                         start = 0;
1538                 }
1539
1540                 if (unlikely(max_key->objectid == inode->i_ino)) {
1541                         if (max_key->type < BTRFS_EXTENT_DATA_KEY)
1542                                 continue;
1543                         if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
1544                                 end = (u64)-1;
1545                         } else {
1546                                 if (max_key->offset == 0)
1547                                         continue;
1548                                 end = max_key->offset;
1549                                 WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1550                                 end--;
1551                         }
1552                 } else {
1553                         end = (u64)-1;
1554                 }
1555
1556                 /* the lock_extent waits for readpage to complete */
1557                 lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
1558                 btrfs_drop_extent_cache(inode, start, end, 1);
1559                 unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
1560         }
1561         return 0;
1562 }
1563
1564 static int find_next_key(struct btrfs_path *path, int level,
1565                          struct btrfs_key *key)
1566
1567 {
1568         while (level < BTRFS_MAX_LEVEL) {
1569                 if (!path->nodes[level])
1570                         break;
1571                 if (path->slots[level] + 1 <
1572                     btrfs_header_nritems(path->nodes[level])) {
1573                         btrfs_node_key_to_cpu(path->nodes[level], key,
1574                                               path->slots[level] + 1);
1575                         return 0;
1576                 }
1577                 level++;
1578         }
1579         return 1;
1580 }
1581
1582 /*
1583  * merge the relocated tree blocks in reloc tree with corresponding
1584  * fs tree.
1585  */
1586 static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1587                                                struct btrfs_root *root)
1588 {
1589         LIST_HEAD(inode_list);
1590         struct btrfs_key key;
1591         struct btrfs_key next_key;
1592         struct btrfs_trans_handle *trans;
1593         struct btrfs_root *reloc_root;
1594         struct btrfs_root_item *root_item;
1595         struct btrfs_path *path;
1596         struct extent_buffer *leaf = NULL;
1597         unsigned long nr;
1598         int level;
1599         int max_level;
1600         int replaced = 0;
1601         int ret;
1602         int err = 0;
1603
1604         path = btrfs_alloc_path();
1605         if (!path)
1606                 return -ENOMEM;
1607
1608         reloc_root = root->reloc_root;
1609         root_item = &reloc_root->root_item;
1610
1611         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
1612                 level = btrfs_root_level(root_item);
1613                 extent_buffer_get(reloc_root->node);
1614                 path->nodes[level] = reloc_root->node;
1615                 path->slots[level] = 0;
1616         } else {
1617                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
1618
1619                 level = root_item->drop_level;
1620                 BUG_ON(level == 0);
1621                 path->lowest_level = level;
1622                 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
1623                 path->lowest_level = 0;
1624                 if (ret < 0) {
1625                         btrfs_free_path(path);
1626                         return ret;
1627                 }
1628
1629                 btrfs_node_key_to_cpu(path->nodes[level], &next_key,
1630                                       path->slots[level]);
1631                 WARN_ON(memcmp(&key, &next_key, sizeof(key)));
1632
1633                 btrfs_unlock_up_safe(path, 0);
1634         }
1635
1636         if (level == 0 && rc->stage == UPDATE_DATA_PTRS) {
1637                 trans = btrfs_start_transaction(root, 1);
1638
1639                 leaf = path->nodes[0];
1640                 btrfs_item_key_to_cpu(leaf, &key, 0);
1641                 btrfs_release_path(reloc_root, path);
1642
1643                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1644                 if (ret < 0) {
1645                         err = ret;
1646                         goto out;
1647                 }
1648
1649                 leaf = path->nodes[0];
1650                 btrfs_unlock_up_safe(path, 1);
1651                 ret = replace_file_extents(trans, rc, root, leaf,
1652                                            &inode_list);
1653                 if (ret < 0)
1654                         err = ret;
1655                 goto out;
1656         }
1657
1658         memset(&next_key, 0, sizeof(next_key));
1659
1660         while (1) {
1661                 leaf = NULL;
1662                 replaced = 0;
1663                 trans = btrfs_start_transaction(root, 1);
1664                 max_level = level;
1665
1666                 ret = walk_down_reloc_tree(reloc_root, path, &level);
1667                 if (ret < 0) {
1668                         err = ret;
1669                         goto out;
1670                 }
1671                 if (ret > 0)
1672                         break;
1673
1674                 if (!find_next_key(path, level, &key) &&
1675                     btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
1676                         ret = 0;
1677                 } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
1678                         ret = replace_path(trans, root, reloc_root,
1679                                            path, &next_key, &leaf,
1680                                            level, max_level);
1681                 } else {
1682                         ret = replace_path(trans, root, reloc_root,
1683                                            path, &next_key, NULL,
1684                                            level, max_level);
1685                 }
1686                 if (ret < 0) {
1687                         err = ret;
1688                         goto out;
1689                 }
1690
1691                 if (ret > 0) {
1692                         level = ret;
1693                         btrfs_node_key_to_cpu(path->nodes[level], &key,
1694                                               path->slots[level]);
1695                         replaced = 1;
1696                 } else if (leaf) {
1697                         /*
1698                          * no block got replaced, try replacing file extents
1699                          */
1700                         btrfs_item_key_to_cpu(leaf, &key, 0);
1701                         ret = replace_file_extents(trans, rc, root, leaf,
1702                                                    &inode_list);
1703                         btrfs_tree_unlock(leaf);
1704                         free_extent_buffer(leaf);
1705                         BUG_ON(ret < 0);
1706                 }
1707
1708                 ret = walk_up_reloc_tree(reloc_root, path, &level);
1709                 if (ret > 0)
1710                         break;
1711
1712                 BUG_ON(level == 0);
1713                 /*
1714                  * save the merging progress in the drop_progress.
1715                  * this is OK since root refs == 1 in this case.
1716                  */
1717                 btrfs_node_key(path->nodes[level], &root_item->drop_progress,
1718                                path->slots[level]);
1719                 root_item->drop_level = level;
1720
1721                 nr = trans->blocks_used;
1722                 btrfs_end_transaction(trans, root);
1723
1724                 btrfs_btree_balance_dirty(root, nr);
1725
1726                 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1727                         invalidate_extent_cache(root, &key, &next_key);
1728         }
1729
1730         /*
1731          * handle the case only one block in the fs tree need to be
1732          * relocated and the block is tree root.
1733          */
1734         leaf = btrfs_lock_root_node(root);
1735         ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
1736         btrfs_tree_unlock(leaf);
1737         free_extent_buffer(leaf);
1738         if (ret < 0)
1739                 err = ret;
1740 out:
1741         btrfs_free_path(path);
1742
1743         if (err == 0) {
1744                 memset(&root_item->drop_progress, 0,
1745                        sizeof(root_item->drop_progress));
1746                 root_item->drop_level = 0;
1747                 btrfs_set_root_refs(root_item, 0);
1748         }
1749
1750         nr = trans->blocks_used;
1751         btrfs_end_transaction(trans, root);
1752
1753         btrfs_btree_balance_dirty(root, nr);
1754
1755         /*
1756          * put inodes while we aren't holding the tree locks
1757          */
1758         while (!list_empty(&inode_list)) {
1759                 struct inodevec *ivec;
1760                 ivec = list_entry(inode_list.next, struct inodevec, list);
1761                 list_del(&ivec->list);
1762                 while (ivec->nr > 0) {
1763                         ivec->nr--;
1764                         iput(ivec->inode[ivec->nr]);
1765                 }
1766                 kfree(ivec);
1767         }
1768
1769         if (replaced && rc->stage == UPDATE_DATA_PTRS)
1770                 invalidate_extent_cache(root, &key, &next_key);
1771
1772         return err;
1773 }
1774
1775 /*
1776  * callback for the work threads.
1777  * this function merges reloc tree with corresponding fs tree,
1778  * and then drops the reloc tree.
1779  */
1780 static void merge_func(struct btrfs_work *work)
1781 {
1782         struct btrfs_trans_handle *trans;
1783         struct btrfs_root *root;
1784         struct btrfs_root *reloc_root;
1785         struct async_merge *async;
1786
1787         async = container_of(work, struct async_merge, work);
1788         reloc_root = async->root;
1789
1790         if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1791                 root = read_fs_root(reloc_root->fs_info,
1792                                     reloc_root->root_key.offset);
1793                 BUG_ON(IS_ERR(root));
1794                 BUG_ON(root->reloc_root != reloc_root);
1795
1796                 merge_reloc_root(async->rc, root);
1797
1798                 trans = btrfs_start_transaction(root, 1);
1799                 btrfs_update_reloc_root(trans, root);
1800                 btrfs_end_transaction(trans, root);
1801         }
1802
1803         btrfs_drop_snapshot(reloc_root, 0);
1804
1805         if (atomic_dec_and_test(async->num_pending))
1806                 complete(async->done);
1807
1808         kfree(async);
1809 }
1810
1811 static int merge_reloc_roots(struct reloc_control *rc)
1812 {
1813         struct async_merge *async;
1814         struct btrfs_root *root;
1815         struct completion done;
1816         atomic_t num_pending;
1817
1818         init_completion(&done);
1819         atomic_set(&num_pending, 1);
1820
1821         while (!list_empty(&rc->reloc_roots)) {
1822                 root = list_entry(rc->reloc_roots.next,
1823                                   struct btrfs_root, root_list);
1824                 list_del_init(&root->root_list);
1825
1826                 async = kmalloc(sizeof(*async), GFP_NOFS);
1827                 BUG_ON(!async);
1828                 async->work.func = merge_func;
1829                 async->work.flags = 0;
1830                 async->rc = rc;
1831                 async->root = root;
1832                 async->done = &done;
1833                 async->num_pending = &num_pending;
1834                 atomic_inc(&num_pending);
1835                 btrfs_queue_worker(&rc->workers, &async->work);
1836         }
1837
1838         if (!atomic_dec_and_test(&num_pending))
1839                 wait_for_completion(&done);
1840
1841         BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
1842         return 0;
1843 }
1844
1845 static void free_block_list(struct rb_root *blocks)
1846 {
1847         struct tree_block *block;
1848         struct rb_node *rb_node;
1849         while ((rb_node = rb_first(blocks))) {
1850                 block = rb_entry(rb_node, struct tree_block, rb_node);
1851                 rb_erase(rb_node, blocks);
1852                 kfree(block);
1853         }
1854 }
1855
1856 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
1857                                       struct btrfs_root *reloc_root)
1858 {
1859         struct btrfs_root *root;
1860
1861         if (reloc_root->last_trans == trans->transid)
1862                 return 0;
1863
1864         root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset);
1865         BUG_ON(IS_ERR(root));
1866         BUG_ON(root->reloc_root != reloc_root);
1867
1868         return btrfs_record_root_in_trans(trans, root);
1869 }
1870
1871 /*
1872  * select one tree from trees that references the block.
1873  * for blocks in refernce counted trees, we preper reloc tree.
1874  * if no reloc tree found and reloc_only is true, NULL is returned.
1875  */
1876 static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
1877                                             struct backref_node *node,
1878                                             struct backref_edge *edges[],
1879                                             int *nr, int reloc_only)
1880 {
1881         struct backref_node *next;
1882         struct btrfs_root *root;
1883         int index;
1884         int loop = 0;
1885 again:
1886         index = 0;
1887         next = node;
1888         while (1) {
1889                 cond_resched();
1890                 next = walk_up_backref(next, edges, &index);
1891                 root = next->root;
1892                 if (!root) {
1893                         BUG_ON(!node->old_root);
1894                         goto skip;
1895                 }
1896
1897                 /* no other choice for non-refernce counted tree */
1898                 if (!root->ref_cows) {
1899                         BUG_ON(reloc_only);
1900                         break;
1901                 }
1902
1903                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
1904                         record_reloc_root_in_trans(trans, root);
1905                         break;
1906                 }
1907
1908                 if (loop) {
1909                         btrfs_record_root_in_trans(trans, root);
1910                         break;
1911                 }
1912
1913                 if (reloc_only || next != node) {
1914                         if (!root->reloc_root)
1915                                 btrfs_record_root_in_trans(trans, root);
1916                         root = root->reloc_root;
1917                         /*
1918                          * if the reloc tree was created in current
1919                          * transation, there is no node in backref tree
1920                          * corresponds to the root of the reloc tree.
1921                          */
1922                         if (btrfs_root_last_snapshot(&root->root_item) ==
1923                             trans->transid - 1)
1924                                 break;
1925                 }
1926 skip:
1927                 root = NULL;
1928                 next = walk_down_backref(edges, &index);
1929                 if (!next || next->level <= node->level)
1930                         break;
1931         }
1932
1933         if (!root && !loop && !reloc_only) {
1934                 loop = 1;
1935                 goto again;
1936         }
1937
1938         if (root)
1939                 *nr = index;
1940         else
1941                 *nr = 0;
1942
1943         return root;
1944 }
1945
1946 static noinline_for_stack
1947 struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
1948                                    struct backref_node *node)
1949 {
1950         struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
1951         int nr;
1952         return __select_one_root(trans, node, edges, &nr, 0);
1953 }
1954
1955 static noinline_for_stack
1956 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
1957                                      struct backref_node *node,
1958                                      struct backref_edge *edges[], int *nr)
1959 {
1960         return __select_one_root(trans, node, edges, nr, 1);
1961 }
1962
1963 static void grab_path_buffers(struct btrfs_path *path,
1964                               struct backref_node *node,
1965                               struct backref_edge *edges[], int nr)
1966 {
1967         int i = 0;
1968         while (1) {
1969                 drop_node_buffer(node);
1970                 node->eb = path->nodes[node->level];
1971                 BUG_ON(!node->eb);
1972                 if (path->locks[node->level])
1973                         node->locked = 1;
1974                 path->nodes[node->level] = NULL;
1975                 path->locks[node->level] = 0;
1976
1977                 if (i >= nr)
1978                         break;
1979
1980                 edges[i]->blockptr = node->eb->start;
1981                 node = edges[i]->node[UPPER];
1982                 i++;
1983         }
1984 }
1985
1986 /*
1987  * relocate a block tree, and then update pointers in upper level
1988  * blocks that reference the block to point to the new location.
1989  *
1990  * if called by link_to_upper, the block has already been relocated.
1991  * in that case this function just updates pointers.
1992  */
1993 static int do_relocation(struct btrfs_trans_handle *trans,
1994                          struct backref_node *node,
1995                          struct btrfs_key *key,
1996                          struct btrfs_path *path, int lowest)
1997 {
1998         struct backref_node *upper;
1999         struct backref_edge *edge;
2000         struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2001         struct btrfs_root *root;
2002         struct extent_buffer *eb;
2003         u32 blocksize;
2004         u64 bytenr;
2005         u64 generation;
2006         int nr;
2007         int slot;
2008         int ret;
2009         int err = 0;
2010
2011         BUG_ON(lowest && node->eb);
2012
2013         path->lowest_level = node->level + 1;
2014         list_for_each_entry(edge, &node->upper, list[LOWER]) {
2015                 cond_resched();
2016                 if (node->eb && node->eb->start == edge->blockptr)
2017                         continue;
2018
2019                 upper = edge->node[UPPER];
2020                 root = select_reloc_root(trans, upper, edges, &nr);
2021                 if (!root)
2022                         continue;
2023
2024                 if (upper->eb && !upper->locked)
2025                         drop_node_buffer(upper);
2026
2027                 if (!upper->eb) {
2028                         ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2029                         if (ret < 0) {
2030                                 err = ret;
2031                                 break;
2032                         }
2033                         BUG_ON(ret > 0);
2034
2035                         slot = path->slots[upper->level];
2036
2037                         btrfs_unlock_up_safe(path, upper->level + 1);
2038                         grab_path_buffers(path, upper, edges, nr);
2039
2040                         btrfs_release_path(NULL, path);
2041                 } else {
2042                         ret = btrfs_bin_search(upper->eb, key, upper->level,
2043                                                &slot);
2044                         BUG_ON(ret);
2045                 }
2046
2047                 bytenr = btrfs_node_blockptr(upper->eb, slot);
2048                 if (!lowest) {
2049                         if (node->eb->start == bytenr) {
2050                                 btrfs_tree_unlock(upper->eb);
2051                                 upper->locked = 0;
2052                                 continue;
2053                         }
2054                 } else {
2055                         BUG_ON(node->bytenr != bytenr);
2056                 }
2057
2058                 blocksize = btrfs_level_size(root, node->level);
2059                 generation = btrfs_node_ptr_generation(upper->eb, slot);
2060                 eb = read_tree_block(root, bytenr, blocksize, generation);
2061                 btrfs_tree_lock(eb);
2062                 btrfs_set_lock_blocking(eb);
2063
2064                 if (!node->eb) {
2065                         ret = btrfs_cow_block(trans, root, eb, upper->eb,
2066                                               slot, &eb);
2067                         if (ret < 0) {
2068                                 err = ret;
2069                                 break;
2070                         }
2071                         btrfs_set_lock_blocking(eb);
2072                         node->eb = eb;
2073                         node->locked = 1;
2074                 } else {
2075                         btrfs_set_node_blockptr(upper->eb, slot,
2076                                                 node->eb->start);
2077                         btrfs_set_node_ptr_generation(upper->eb, slot,
2078                                                       trans->transid);
2079                         btrfs_mark_buffer_dirty(upper->eb);
2080
2081                         ret = btrfs_inc_extent_ref(trans, root,
2082                                                 node->eb->start, blocksize,
2083                                                 upper->eb->start,
2084                                                 btrfs_header_owner(upper->eb),
2085                                                 node->level, 0);
2086                         BUG_ON(ret);
2087
2088                         ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2089                         BUG_ON(ret);
2090                 }
2091                 if (!lowest) {
2092                         btrfs_tree_unlock(upper->eb);
2093                         upper->locked = 0;
2094                 }
2095         }
2096         path->lowest_level = 0;
2097         return err;
2098 }
2099
2100 static int link_to_upper(struct btrfs_trans_handle *trans,
2101                          struct backref_node *node,
2102                          struct btrfs_path *path)
2103 {
2104         struct btrfs_key key;
2105         if (!node->eb || list_empty(&node->upper))
2106                 return 0;
2107
2108         btrfs_node_key_to_cpu(node->eb, &key, 0);
2109         return do_relocation(trans, node, &key, path, 0);
2110 }
2111
2112 static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2113                                 struct backref_cache *cache,
2114                                 struct btrfs_path *path)
2115 {
2116         struct backref_node *node;
2117         int level;
2118         int ret;
2119         int err = 0;
2120
2121         for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2122                 while (!list_empty(&cache->pending[level])) {
2123                         node = list_entry(cache->pending[level].next,
2124                                           struct backref_node, lower);
2125                         BUG_ON(node->level != level);
2126
2127                         ret = link_to_upper(trans, node, path);
2128                         if (ret < 0)
2129                                 err = ret;
2130                         /*
2131                          * this remove the node from the pending list and
2132                          * may add some other nodes to the level + 1
2133                          * pending list
2134                          */
2135                         remove_backref_node(cache, node);
2136                 }
2137         }
2138         BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
2139         return err;
2140 }
2141
2142 static void mark_block_processed(struct reloc_control *rc,
2143                                  struct backref_node *node)
2144 {
2145         u32 blocksize;
2146         if (node->level == 0 ||
2147             in_block_group(node->bytenr, rc->block_group)) {
2148                 blocksize = btrfs_level_size(rc->extent_root, node->level);
2149                 set_extent_bits(&rc->processed_blocks, node->bytenr,
2150                                 node->bytenr + blocksize - 1, EXTENT_DIRTY,
2151                                 GFP_NOFS);
2152         }
2153         node->processed = 1;
2154 }
2155
2156 /*
2157  * mark a block and all blocks directly/indirectly reference the block
2158  * as processed.
2159  */
2160 static void update_processed_blocks(struct reloc_control *rc,
2161                                     struct backref_node *node)
2162 {
2163         struct backref_node *next = node;
2164         struct backref_edge *edge;
2165         struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2166         int index = 0;
2167
2168         while (next) {
2169                 cond_resched();
2170                 while (1) {
2171                         if (next->processed)
2172                                 break;
2173
2174                         mark_block_processed(rc, next);
2175
2176                         if (list_empty(&next->upper))
2177                                 break;
2178
2179                         edge = list_entry(next->upper.next,
2180                                           struct backref_edge, list[LOWER]);
2181                         edges[index++] = edge;
2182                         next = edge->node[UPPER];
2183                 }
2184                 next = walk_down_backref(edges, &index);
2185         }
2186 }
2187
2188 static int tree_block_processed(u64 bytenr, u32 blocksize,
2189                                 struct reloc_control *rc)
2190 {
2191         if (test_range_bit(&rc->processed_blocks, bytenr,
2192                            bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2193                 return 1;
2194         return 0;
2195 }
2196
2197 /*
2198  * check if there are any file extent pointers in the leaf point to
2199  * data require processing
2200  */
2201 static int check_file_extents(struct reloc_control *rc,
2202                               u64 bytenr, u32 blocksize, u64 ptr_gen)
2203 {
2204         struct btrfs_key found_key;
2205         struct btrfs_file_extent_item *fi;
2206         struct extent_buffer *leaf;
2207         u32 nritems;
2208         int i;
2209         int ret = 0;
2210
2211         leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
2212
2213         nritems = btrfs_header_nritems(leaf);
2214         for (i = 0; i < nritems; i++) {
2215                 cond_resched();
2216                 btrfs_item_key_to_cpu(leaf, &found_key, i);
2217                 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
2218                         continue;
2219                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
2220                 if (btrfs_file_extent_type(leaf, fi) ==
2221                     BTRFS_FILE_EXTENT_INLINE)
2222                         continue;
2223                 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
2224                 if (bytenr == 0)
2225                         continue;
2226                 if (in_block_group(bytenr, rc->block_group)) {
2227                         ret = 1;
2228                         break;
2229                 }
2230         }
2231         free_extent_buffer(leaf);
2232         return ret;
2233 }
2234
2235 /*
2236  * scan child blocks of a given block to find blocks require processing
2237  */
2238 static int add_child_blocks(struct btrfs_trans_handle *trans,
2239                             struct reloc_control *rc,
2240                             struct backref_node *node,
2241                             struct rb_root *blocks)
2242 {
2243         struct tree_block *block;
2244         struct rb_node *rb_node;
2245         u64 bytenr;
2246         u64 ptr_gen;
2247         u32 blocksize;
2248         u32 nritems;
2249         int i;
2250         int err = 0;
2251
2252         nritems = btrfs_header_nritems(node->eb);
2253         blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
2254         for (i = 0; i < nritems; i++) {
2255                 cond_resched();
2256                 bytenr = btrfs_node_blockptr(node->eb, i);
2257                 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2258                 if (ptr_gen == trans->transid)
2259                         continue;
2260                 if (!in_block_group(bytenr, rc->block_group) &&
2261                     (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2262                         continue;
2263                 if (tree_block_processed(bytenr, blocksize, rc))
2264                         continue;
2265
2266                 readahead_tree_block(rc->extent_root,
2267                                      bytenr, blocksize, ptr_gen);
2268         }
2269
2270         for (i = 0; i < nritems; i++) {
2271                 cond_resched();
2272                 bytenr = btrfs_node_blockptr(node->eb, i);
2273                 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2274                 if (ptr_gen == trans->transid)
2275                         continue;
2276                 if (!in_block_group(bytenr, rc->block_group) &&
2277                     (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2278                         continue;
2279                 if (tree_block_processed(bytenr, blocksize, rc))
2280                         continue;
2281                 if (!in_block_group(bytenr, rc->block_group) &&
2282                     !check_file_extents(rc, bytenr, blocksize, ptr_gen))
2283                         continue;
2284
2285                 block = kmalloc(sizeof(*block), GFP_NOFS);
2286                 if (!block) {
2287                         err = -ENOMEM;
2288                         break;
2289                 }
2290                 block->bytenr = bytenr;
2291                 btrfs_node_key_to_cpu(node->eb, &block->key, i);
2292                 block->level = node->level - 1;
2293                 block->key_ready = 1;
2294                 rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2295                 BUG_ON(rb_node);
2296         }
2297         if (err)
2298                 free_block_list(blocks);
2299         return err;
2300 }
2301
2302 /*
2303  * find adjacent blocks require processing
2304  */
2305 static noinline_for_stack
2306 int add_adjacent_blocks(struct btrfs_trans_handle *trans,
2307                         struct reloc_control *rc,
2308                         struct backref_cache *cache,
2309                         struct rb_root *blocks, int level,
2310                         struct backref_node **upper)
2311 {
2312         struct backref_node *node;
2313         int ret = 0;
2314
2315         WARN_ON(!list_empty(&cache->pending[level]));
2316
2317         if (list_empty(&cache->pending[level + 1]))
2318                 return 1;
2319
2320         node = list_entry(cache->pending[level + 1].next,
2321                           struct backref_node, lower);
2322         if (node->eb)
2323                 ret = add_child_blocks(trans, rc, node, blocks);
2324
2325         *upper = node;
2326         return ret;
2327 }
2328
2329 static int get_tree_block_key(struct reloc_control *rc,
2330                               struct tree_block *block)
2331 {
2332         struct extent_buffer *eb;
2333
2334         BUG_ON(block->key_ready);
2335         eb = read_tree_block(rc->extent_root, block->bytenr,
2336                              block->key.objectid, block->key.offset);
2337         WARN_ON(btrfs_header_level(eb) != block->level);
2338         if (block->level == 0)
2339                 btrfs_item_key_to_cpu(eb, &block->key, 0);
2340         else
2341                 btrfs_node_key_to_cpu(eb, &block->key, 0);
2342         free_extent_buffer(eb);
2343         block->key_ready = 1;
2344         return 0;
2345 }
2346
2347 static int reada_tree_block(struct reloc_control *rc,
2348                             struct tree_block *block)
2349 {
2350         BUG_ON(block->key_ready);
2351         readahead_tree_block(rc->extent_root, block->bytenr,
2352                              block->key.objectid, block->key.offset);
2353         return 0;
2354 }
2355
2356 /*
2357  * helper function to relocate a tree block
2358  */
2359 static int relocate_tree_block(struct btrfs_trans_handle *trans,
2360                                 struct reloc_control *rc,
2361                                 struct backref_node *node,
2362                                 struct btrfs_key *key,
2363                                 struct btrfs_path *path)
2364 {
2365         struct btrfs_root *root;
2366         int ret;
2367
2368         root = select_one_root(trans, node);
2369         if (unlikely(!root)) {
2370                 rc->found_old_snapshot = 1;
2371                 update_processed_blocks(rc, node);
2372                 return 0;
2373         }
2374
2375         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2376                 ret = do_relocation(trans, node, key, path, 1);
2377                 if (ret < 0)
2378                         goto out;
2379                 if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
2380                         ret = replace_file_extents(trans, rc, root,
2381                                                    node->eb, NULL);
2382                         if (ret < 0)
2383                                 goto out;
2384                 }
2385                 drop_node_buffer(node);
2386         } else if (!root->ref_cows) {
2387                 path->lowest_level = node->level;
2388                 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2389                 btrfs_release_path(root, path);
2390                 if (ret < 0)
2391                         goto out;
2392         } else if (root != node->root) {
2393                 WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
2394         }
2395
2396         update_processed_blocks(rc, node);
2397         ret = 0;
2398 out:
2399         drop_node_buffer(node);
2400         return ret;
2401 }
2402
2403 /*
2404  * relocate a list of blocks
2405  */
2406 static noinline_for_stack
2407 int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2408                          struct reloc_control *rc, struct rb_root *blocks)
2409 {
2410         struct backref_cache *cache;
2411         struct backref_node *node;
2412         struct btrfs_path *path;
2413         struct tree_block *block;
2414         struct rb_node *rb_node;
2415         int level = -1;
2416         int ret;
2417         int err = 0;
2418
2419         path = btrfs_alloc_path();
2420         if (!path)
2421                 return -ENOMEM;
2422
2423         cache = kmalloc(sizeof(*cache), GFP_NOFS);
2424         if (!cache) {
2425                 btrfs_free_path(path);
2426                 return -ENOMEM;
2427         }
2428
2429         backref_cache_init(cache);
2430
2431         rb_node = rb_first(blocks);
2432         while (rb_node) {
2433                 block = rb_entry(rb_node, struct tree_block, rb_node);
2434                 if (level == -1)
2435                         level = block->level;
2436                 else
2437                         BUG_ON(level != block->level);
2438                 if (!block->key_ready)
2439                         reada_tree_block(rc, block);
2440                 rb_node = rb_next(rb_node);
2441         }
2442
2443         rb_node = rb_first(blocks);
2444         while (rb_node) {
2445                 block = rb_entry(rb_node, struct tree_block, rb_node);
2446                 if (!block->key_ready)
2447                         get_tree_block_key(rc, block);
2448                 rb_node = rb_next(rb_node);
2449         }
2450
2451         rb_node = rb_first(blocks);
2452         while (rb_node) {
2453                 block = rb_entry(rb_node, struct tree_block, rb_node);
2454
2455                 node = build_backref_tree(rc, cache, &block->key,
2456                                           block->level, block->bytenr);
2457                 if (IS_ERR(node)) {
2458                         err = PTR_ERR(node);
2459                         goto out;
2460                 }
2461
2462                 ret = relocate_tree_block(trans, rc, node, &block->key,
2463                                           path);
2464                 if (ret < 0) {
2465                         err = ret;
2466                         goto out;
2467                 }
2468                 remove_backref_node(cache, node);
2469                 rb_node = rb_next(rb_node);
2470         }
2471
2472         if (level > 0)
2473                 goto out;
2474
2475         free_block_list(blocks);
2476
2477         /*
2478          * now backrefs of some upper level tree blocks have been cached,
2479          * try relocating blocks referenced by these upper level blocks.
2480          */
2481         while (1) {
2482                 struct backref_node *upper = NULL;
2483                 if (trans->transaction->in_commit ||
2484                     trans->transaction->delayed_refs.flushing)
2485                         break;
2486
2487                 ret = add_adjacent_blocks(trans, rc, cache, blocks, level,
2488                                           &upper);
2489                 if (ret < 0)
2490                         err = ret;
2491                 if (ret != 0)
2492                         break;
2493
2494                 rb_node = rb_first(blocks);
2495                 while (rb_node) {
2496                         block = rb_entry(rb_node, struct tree_block, rb_node);
2497                         if (trans->transaction->in_commit ||
2498                             trans->transaction->delayed_refs.flushing)
2499                                 goto out;
2500                         BUG_ON(!block->key_ready);
2501                         node = build_backref_tree(rc, cache, &block->key,
2502                                                   level, block->bytenr);
2503                         if (IS_ERR(node)) {
2504                                 err = PTR_ERR(node);
2505                                 goto out;
2506                         }
2507
2508                         ret = relocate_tree_block(trans, rc, node,
2509                                                   &block->key, path);
2510                         if (ret < 0) {
2511                                 err = ret;
2512                                 goto out;
2513                         }
2514                         remove_backref_node(cache, node);
2515                         rb_node = rb_next(rb_node);
2516                 }
2517                 free_block_list(blocks);
2518
2519                 if (upper) {
2520                         ret = link_to_upper(trans, upper, path);
2521                         if (ret < 0) {
2522                                 err = ret;
2523                                 break;
2524                         }
2525                         remove_backref_node(cache, upper);
2526                 }
2527         }
2528 out:
2529         free_block_list(blocks);
2530
2531         ret = finish_pending_nodes(trans, cache, path);
2532         if (ret < 0)
2533                 err = ret;
2534
2535         kfree(cache);
2536         btrfs_free_path(path);
2537         return err;
2538 }
2539
2540 static noinline_for_stack
2541 int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
2542                          u64 block_start)
2543 {
2544         struct btrfs_root *root = BTRFS_I(inode)->root;
2545         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2546         struct extent_map *em;
2547         int ret = 0;
2548
2549         em = alloc_extent_map(GFP_NOFS);
2550         if (!em)
2551                 return -ENOMEM;
2552
2553         em->start = start;
2554         em->len = end + 1 - start;
2555         em->block_len = em->len;
2556         em->block_start = block_start;
2557         em->bdev = root->fs_info->fs_devices->latest_bdev;
2558         set_bit(EXTENT_FLAG_PINNED, &em->flags);
2559
2560         lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2561         while (1) {
2562                 write_lock(&em_tree->lock);
2563                 ret = add_extent_mapping(em_tree, em);
2564                 write_unlock(&em_tree->lock);
2565                 if (ret != -EEXIST) {
2566                         free_extent_map(em);
2567                         break;
2568                 }
2569                 btrfs_drop_extent_cache(inode, start, end, 0);
2570         }
2571         unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2572         return ret;
2573 }
2574
2575 static int relocate_file_extent_cluster(struct inode *inode,
2576                                         struct file_extent_cluster *cluster)
2577 {
2578         u64 page_start;
2579         u64 page_end;
2580         u64 offset = BTRFS_I(inode)->index_cnt;
2581         unsigned long index;
2582         unsigned long last_index;
2583         unsigned int dirty_page = 0;
2584         struct page *page;
2585         struct file_ra_state *ra;
2586         int nr = 0;
2587         int ret = 0;
2588
2589         if (!cluster->nr)
2590                 return 0;
2591
2592         ra = kzalloc(sizeof(*ra), GFP_NOFS);
2593         if (!ra)
2594                 return -ENOMEM;
2595
2596         index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2597         last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2598
2599         mutex_lock(&inode->i_mutex);
2600
2601         i_size_write(inode, cluster->end + 1 - offset);
2602         ret = setup_extent_mapping(inode, cluster->start - offset,
2603                                    cluster->end - offset, cluster->start);
2604         if (ret)
2605                 goto out_unlock;
2606
2607         file_ra_state_init(ra, inode->i_mapping);
2608
2609         WARN_ON(cluster->start != cluster->boundary[0]);
2610         while (index <= last_index) {
2611                 page = find_lock_page(inode->i_mapping, index);
2612                 if (!page) {
2613                         page_cache_sync_readahead(inode->i_mapping,
2614                                                   ra, NULL, index,
2615                                                   last_index + 1 - index);
2616                         page = grab_cache_page(inode->i_mapping, index);
2617                         if (!page) {
2618                                 ret = -ENOMEM;
2619                                 goto out_unlock;
2620                         }
2621                 }
2622
2623                 if (PageReadahead(page)) {
2624                         page_cache_async_readahead(inode->i_mapping,
2625                                                    ra, NULL, page, index,
2626                                                    last_index + 1 - index);
2627                 }
2628
2629                 if (!PageUptodate(page)) {
2630                         btrfs_readpage(NULL, page);
2631                         lock_page(page);
2632                         if (!PageUptodate(page)) {
2633                                 unlock_page(page);
2634                                 page_cache_release(page);
2635                                 ret = -EIO;
2636                                 goto out_unlock;
2637                         }
2638                 }
2639
2640                 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2641                 page_end = page_start + PAGE_CACHE_SIZE - 1;
2642
2643                 lock_extent(&BTRFS_I(inode)->io_tree,
2644                             page_start, page_end, GFP_NOFS);
2645
2646                 set_page_extent_mapped(page);
2647
2648                 if (nr < cluster->nr &&
2649                     page_start + offset == cluster->boundary[nr]) {
2650                         set_extent_bits(&BTRFS_I(inode)->io_tree,
2651                                         page_start, page_end,
2652                                         EXTENT_BOUNDARY, GFP_NOFS);
2653                         nr++;
2654                 }
2655                 btrfs_set_extent_delalloc(inode, page_start, page_end);
2656
2657                 set_page_dirty(page);
2658                 dirty_page++;
2659
2660                 unlock_extent(&BTRFS_I(inode)->io_tree,
2661                               page_start, page_end, GFP_NOFS);
2662                 unlock_page(page);
2663                 page_cache_release(page);
2664
2665                 index++;
2666                 if (nr < cluster->nr &&
2667                     page_end + 1 + offset == cluster->boundary[nr]) {
2668                         balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2669                                                            dirty_page);
2670                         dirty_page = 0;
2671                 }
2672         }
2673         if (dirty_page) {
2674                 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2675                                                    dirty_page);
2676         }
2677         WARN_ON(nr != cluster->nr);
2678 out_unlock:
2679         mutex_unlock(&inode->i_mutex);
2680         kfree(ra);
2681         return ret;
2682 }
2683
2684 static noinline_for_stack
2685 int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
2686                          struct file_extent_cluster *cluster)
2687 {
2688         int ret;
2689
2690         if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
2691                 ret = relocate_file_extent_cluster(inode, cluster);
2692                 if (ret)
2693                         return ret;
2694                 cluster->nr = 0;
2695         }
2696
2697         if (!cluster->nr)
2698                 cluster->start = extent_key->objectid;
2699         else
2700                 BUG_ON(cluster->nr >= MAX_EXTENTS);
2701         cluster->end = extent_key->objectid + extent_key->offset - 1;
2702         cluster->boundary[cluster->nr] = extent_key->objectid;
2703         cluster->nr++;
2704
2705         if (cluster->nr >= MAX_EXTENTS) {
2706                 ret = relocate_file_extent_cluster(inode, cluster);
2707                 if (ret)
2708                         return ret;
2709                 cluster->nr = 0;
2710         }
2711         return 0;
2712 }
2713
2714 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2715 static int get_ref_objectid_v0(struct reloc_control *rc,
2716                                struct btrfs_path *path,
2717                                struct btrfs_key *extent_key,
2718                                u64 *ref_objectid, int *path_change)
2719 {
2720         struct btrfs_key key;
2721         struct extent_buffer *leaf;
2722         struct btrfs_extent_ref_v0 *ref0;
2723         int ret;
2724         int slot;
2725
2726         leaf = path->nodes[0];
2727         slot = path->slots[0];
2728         while (1) {
2729                 if (slot >= btrfs_header_nritems(leaf)) {
2730                         ret = btrfs_next_leaf(rc->extent_root, path);
2731                         if (ret < 0)
2732                                 return ret;
2733                         BUG_ON(ret > 0);
2734                         leaf = path->nodes[0];
2735                         slot = path->slots[0];
2736                         if (path_change)
2737                                 *path_change = 1;
2738                 }
2739                 btrfs_item_key_to_cpu(leaf, &key, slot);
2740                 if (key.objectid != extent_key->objectid)
2741                         return -ENOENT;
2742
2743                 if (key.type != BTRFS_EXTENT_REF_V0_KEY) {
2744                         slot++;
2745                         continue;
2746                 }
2747                 ref0 = btrfs_item_ptr(leaf, slot,
2748                                 struct btrfs_extent_ref_v0);
2749                 *ref_objectid = btrfs_ref_objectid_v0(leaf, ref0);
2750                 break;
2751         }
2752         return 0;
2753 }
2754 #endif
2755
2756 /*
2757  * helper to add a tree block to the list.
2758  * the major work is getting the generation and level of the block
2759  */
2760 static int add_tree_block(struct reloc_control *rc,
2761                           struct btrfs_key *extent_key,
2762                           struct btrfs_path *path,
2763                           struct rb_root *blocks)
2764 {
2765         struct extent_buffer *eb;
2766         struct btrfs_extent_item *ei;
2767         struct btrfs_tree_block_info *bi;
2768         struct tree_block *block;
2769         struct rb_node *rb_node;
2770         u32 item_size;
2771         int level = -1;
2772         int generation;
2773
2774         eb =  path->nodes[0];
2775         item_size = btrfs_item_size_nr(eb, path->slots[0]);
2776
2777         if (item_size >= sizeof(*ei) + sizeof(*bi)) {
2778                 ei = btrfs_item_ptr(eb, path->slots[0],
2779                                 struct btrfs_extent_item);
2780                 bi = (struct btrfs_tree_block_info *)(ei + 1);
2781                 generation = btrfs_extent_generation(eb, ei);
2782                 level = btrfs_tree_block_level(eb, bi);
2783         } else {
2784 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2785                 u64 ref_owner;
2786                 int ret;
2787
2788                 BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2789                 ret = get_ref_objectid_v0(rc, path, extent_key,
2790                                           &ref_owner, NULL);
2791                 BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
2792                 level = (int)ref_owner;
2793                 /* FIXME: get real generation */
2794                 generation = 0;
2795 #else
2796                 BUG();
2797 #endif
2798         }
2799
2800         btrfs_release_path(rc->extent_root, path);
2801
2802         BUG_ON(level == -1);
2803
2804         block = kmalloc(sizeof(*block), GFP_NOFS);
2805         if (!block)
2806                 return -ENOMEM;
2807
2808         block->bytenr = extent_key->objectid;
2809         block->key.objectid = extent_key->offset;
2810         block->key.offset = generation;
2811         block->level = level;
2812         block->key_ready = 0;
2813
2814         rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2815         BUG_ON(rb_node);
2816
2817         return 0;
2818 }
2819
2820 /*
2821  * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
2822  */
2823 static int __add_tree_block(struct reloc_control *rc,
2824                             u64 bytenr, u32 blocksize,
2825                             struct rb_root *blocks)
2826 {
2827         struct btrfs_path *path;
2828         struct btrfs_key key;
2829         int ret;
2830
2831         if (tree_block_processed(bytenr, blocksize, rc))
2832                 return 0;
2833
2834         if (tree_search(blocks, bytenr))
2835                 return 0;
2836
2837         path = btrfs_alloc_path();
2838         if (!path)
2839                 return -ENOMEM;
2840
2841         key.objectid = bytenr;
2842         key.type = BTRFS_EXTENT_ITEM_KEY;
2843         key.offset = blocksize;
2844
2845         path->search_commit_root = 1;
2846         path->skip_locking = 1;
2847         ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
2848         if (ret < 0)
2849                 goto out;
2850         BUG_ON(ret);
2851
2852         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2853         ret = add_tree_block(rc, &key, path, blocks);
2854 out:
2855         btrfs_free_path(path);
2856         return ret;
2857 }
2858
2859 /*
2860  * helper to check if the block use full backrefs for pointers in it
2861  */
2862 static int block_use_full_backref(struct reloc_control *rc,
2863                                   struct extent_buffer *eb)
2864 {
2865         struct btrfs_path *path;
2866         struct btrfs_extent_item *ei;
2867         struct btrfs_key key;
2868         u64 flags;
2869         int ret;
2870
2871         if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
2872             btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
2873                 return 1;
2874
2875         path = btrfs_alloc_path();
2876         BUG_ON(!path);
2877
2878         key.objectid = eb->start;
2879         key.type = BTRFS_EXTENT_ITEM_KEY;
2880         key.offset = eb->len;
2881
2882         path->search_commit_root = 1;
2883         path->skip_locking = 1;
2884         ret = btrfs_search_slot(NULL, rc->extent_root,
2885                                 &key, path, 0, 0);
2886         BUG_ON(ret);
2887
2888         ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
2889                             struct btrfs_extent_item);
2890         flags = btrfs_extent_flags(path->nodes[0], ei);
2891         BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2892         if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
2893                 ret = 1;
2894         else
2895                 ret = 0;
2896         btrfs_free_path(path);
2897         return ret;
2898 }
2899
2900 /*
2901  * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
2902  * this function scans fs tree to find blocks reference the data extent
2903  */
2904 static int find_data_references(struct reloc_control *rc,
2905                                 struct btrfs_key *extent_key,
2906                                 struct extent_buffer *leaf,
2907                                 struct btrfs_extent_data_ref *ref,
2908                                 struct rb_root *blocks)
2909 {
2910         struct btrfs_path *path;
2911         struct tree_block *block;
2912         struct btrfs_root *root;
2913         struct btrfs_file_extent_item *fi;
2914         struct rb_node *rb_node;
2915         struct btrfs_key key;
2916         u64 ref_root;
2917         u64 ref_objectid;
2918         u64 ref_offset;
2919         u32 ref_count;
2920         u32 nritems;
2921         int err = 0;
2922         int added = 0;
2923         int counted;
2924         int ret;
2925
2926         path = btrfs_alloc_path();
2927         if (!path)
2928                 return -ENOMEM;
2929
2930         ref_root = btrfs_extent_data_ref_root(leaf, ref);
2931         ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
2932         ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
2933         ref_count = btrfs_extent_data_ref_count(leaf, ref);
2934
2935         root = read_fs_root(rc->extent_root->fs_info, ref_root);
2936         if (IS_ERR(root)) {
2937                 err = PTR_ERR(root);
2938                 goto out;
2939         }
2940
2941         key.objectid = ref_objectid;
2942         key.offset = ref_offset;
2943         key.type = BTRFS_EXTENT_DATA_KEY;
2944
2945         path->search_commit_root = 1;
2946         path->skip_locking = 1;
2947         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2948         if (ret < 0) {
2949                 err = ret;
2950                 goto out;
2951         }
2952
2953         leaf = path->nodes[0];
2954         nritems = btrfs_header_nritems(leaf);
2955         /*
2956          * the references in tree blocks that use full backrefs
2957          * are not counted in
2958          */
2959         if (block_use_full_backref(rc, leaf))
2960                 counted = 0;
2961         else
2962                 counted = 1;
2963         rb_node = tree_search(blocks, leaf->start);
2964         if (rb_node) {
2965                 if (counted)
2966                         added = 1;
2967                 else
2968                         path->slots[0] = nritems;
2969         }
2970
2971         while (ref_count > 0) {
2972                 while (path->slots[0] >= nritems) {
2973                         ret = btrfs_next_leaf(root, path);
2974                         if (ret < 0) {
2975                                 err = ret;
2976                                 goto out;
2977                         }
2978                         if (ret > 0) {
2979                                 WARN_ON(1);
2980                                 goto out;
2981                         }
2982
2983                         leaf = path->nodes[0];
2984                         nritems = btrfs_header_nritems(leaf);
2985                         added = 0;
2986
2987                         if (block_use_full_backref(rc, leaf))
2988                                 counted = 0;
2989                         else
2990                                 counted = 1;
2991                         rb_node = tree_search(blocks, leaf->start);
2992                         if (rb_node) {
2993                                 if (counted)
2994                                         added = 1;
2995                                 else
2996                                         path->slots[0] = nritems;
2997                         }
2998                 }
2999
3000                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3001                 if (key.objectid != ref_objectid ||
3002                     key.type != BTRFS_EXTENT_DATA_KEY) {
3003                         WARN_ON(1);
3004                         break;
3005                 }
3006
3007                 fi = btrfs_item_ptr(leaf, path->slots[0],
3008                                     struct btrfs_file_extent_item);
3009
3010                 if (btrfs_file_extent_type(leaf, fi) ==
3011                     BTRFS_FILE_EXTENT_INLINE)
3012                         goto next;
3013
3014                 if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
3015                     extent_key->objectid)
3016                         goto next;
3017
3018                 key.offset -= btrfs_file_extent_offset(leaf, fi);
3019                 if (key.offset != ref_offset)
3020                         goto next;
3021
3022                 if (counted)
3023                         ref_count--;
3024                 if (added)
3025                         goto next;
3026
3027                 if (!tree_block_processed(leaf->start, leaf->len, rc)) {
3028                         block = kmalloc(sizeof(*block), GFP_NOFS);
3029                         if (!block) {
3030                                 err = -ENOMEM;
3031                                 break;
3032                         }
3033                         block->bytenr = leaf->start;
3034                         btrfs_item_key_to_cpu(leaf, &block->key, 0);
3035                         block->level = 0;
3036                         block->key_ready = 1;
3037                         rb_node = tree_insert(blocks, block->bytenr,
3038                                               &block->rb_node);
3039                         BUG_ON(rb_node);
3040                 }
3041                 if (counted)
3042                         added = 1;
3043                 else
3044                         path->slots[0] = nritems;
3045 next:
3046                 path->slots[0]++;
3047
3048         }
3049 out:
3050         btrfs_free_path(path);
3051         return err;
3052 }
3053
3054 /*
3055  * hepler to find all tree blocks that reference a given data extent
3056  */
3057 static noinline_for_stack
3058 int add_data_references(struct reloc_control *rc,
3059                         struct btrfs_key *extent_key,
3060                         struct btrfs_path *path,
3061                         struct rb_root *blocks)
3062 {
3063         struct btrfs_key key;
3064         struct extent_buffer *eb;
3065         struct btrfs_extent_data_ref *dref;
3066         struct btrfs_extent_inline_ref *iref;
3067         unsigned long ptr;
3068         unsigned long end;
3069         u32 blocksize;
3070         int ret;
3071         int err = 0;
3072
3073         ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
3074                                extent_key->offset);
3075         BUG_ON(ret < 0);
3076         if (ret > 0) {
3077                 /* the relocated data is fragmented */
3078                 rc->extents_skipped++;
3079                 btrfs_release_path(rc->extent_root, path);
3080                 return 0;
3081         }
3082
3083         blocksize = btrfs_level_size(rc->extent_root, 0);
3084
3085         eb = path->nodes[0];
3086         ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3087         end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
3088 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3089         if (ptr + sizeof(struct btrfs_extent_item_v0) == end)
3090                 ptr = end;
3091         else
3092 #endif
3093                 ptr += sizeof(struct btrfs_extent_item);
3094
3095         while (ptr < end) {
3096                 iref = (struct btrfs_extent_inline_ref *)ptr;
3097                 key.type = btrfs_extent_inline_ref_type(eb, iref);
3098                 if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3099                         key.offset = btrfs_extent_inline_ref_offset(eb, iref);
3100                         ret = __add_tree_block(rc, key.offset, blocksize,
3101                                                blocks);
3102                 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3103                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
3104                         ret = find_data_references(rc, extent_key,
3105                                                    eb, dref, blocks);
3106                 } else {
3107                         BUG();
3108                 }
3109                 ptr += btrfs_extent_inline_ref_size(key.type);
3110         }
3111         WARN_ON(ptr > end);
3112
3113         while (1) {
3114                 cond_resched();
3115                 eb = path->nodes[0];
3116                 if (path->slots[0] >= btrfs_header_nritems(eb)) {
3117                         ret = btrfs_next_leaf(rc->extent_root, path);
3118                         if (ret < 0) {
3119                                 err = ret;
3120                                 break;
3121                         }
3122                         if (ret > 0)
3123                                 break;
3124                         eb = path->nodes[0];
3125                 }
3126
3127                 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
3128                 if (key.objectid != extent_key->objectid)
3129                         break;
3130
3131 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3132                 if (key.type == BTRFS_SHARED_DATA_REF_KEY ||
3133                     key.type == BTRFS_EXTENT_REF_V0_KEY) {
3134 #else
3135                 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
3136                 if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3137 #endif
3138                         ret = __add_tree_block(rc, key.offset, blocksize,
3139                                                blocks);
3140                 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3141                         dref = btrfs_item_ptr(eb, path->slots[0],
3142                                               struct btrfs_extent_data_ref);
3143                         ret = find_data_references(rc, extent_key,
3144                                                    eb, dref, blocks);
3145                 } else {
3146                         ret = 0;
3147                 }
3148                 if (ret) {
3149                         err = ret;
3150                         break;
3151                 }
3152                 path->slots[0]++;
3153         }
3154         btrfs_release_path(rc->extent_root, path);
3155         if (err)
3156                 free_block_list(blocks);
3157         return err;
3158 }
3159
3160 /*
3161  * hepler to find next unprocessed extent
3162  */
3163 static noinline_for_stack
3164 int find_next_extent(struct btrfs_trans_handle *trans,
3165                      struct reloc_control *rc, struct btrfs_path *path)
3166 {
3167         struct btrfs_key key;
3168         struct extent_buffer *leaf;
3169         u64 start, end, last;
3170         int ret;
3171
3172         last = rc->block_group->key.objectid + rc->block_group->key.offset;
3173         while (1) {
3174                 cond_resched();
3175                 if (rc->search_start >= last) {
3176                         ret = 1;
3177                         break;
3178                 }
3179
3180                 key.objectid = rc->search_start;
3181                 key.type = BTRFS_EXTENT_ITEM_KEY;
3182                 key.offset = 0;
3183
3184                 path->search_commit_root = 1;
3185                 path->skip_locking = 1;
3186                 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3187                                         0, 0);
3188                 if (ret < 0)
3189                         break;
3190 next:
3191                 leaf = path->nodes[0];
3192                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3193                         ret = btrfs_next_leaf(rc->extent_root, path);
3194                         if (ret != 0)
3195                                 break;
3196                         leaf = path->nodes[0];
3197                 }
3198
3199                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3200                 if (key.objectid >= last) {
3201                         ret = 1;
3202                         break;
3203                 }
3204
3205                 if (key.type != BTRFS_EXTENT_ITEM_KEY ||
3206                     key.objectid + key.offset <= rc->search_start) {
3207                         path->slots[0]++;
3208                         goto next;
3209                 }
3210
3211                 ret = find_first_extent_bit(&rc->processed_blocks,
3212                                             key.objectid, &start, &end,
3213                                             EXTENT_DIRTY);
3214
3215                 if (ret == 0 && start <= key.objectid) {
3216                         btrfs_release_path(rc->extent_root, path);
3217                         rc->search_start = end + 1;
3218                 } else {
3219                         rc->search_start = key.objectid + key.offset;
3220                         return 0;
3221                 }
3222         }
3223         btrfs_release_path(rc->extent_root, path);
3224         return ret;
3225 }
3226
3227 static void set_reloc_control(struct reloc_control *rc)
3228 {
3229         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3230         mutex_lock(&fs_info->trans_mutex);
3231         fs_info->reloc_ctl = rc;
3232         mutex_unlock(&fs_info->trans_mutex);
3233 }
3234
3235 static void unset_reloc_control(struct reloc_control *rc)
3236 {
3237         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3238         mutex_lock(&fs_info->trans_mutex);
3239         fs_info->reloc_ctl = NULL;
3240         mutex_unlock(&fs_info->trans_mutex);
3241 }
3242
3243 static int check_extent_flags(u64 flags)
3244 {
3245         if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3246             (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3247                 return 1;
3248         if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3249             !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3250                 return 1;
3251         if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3252             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3253                 return 1;
3254         return 0;
3255 }
3256
3257
3258 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3259 {
3260         struct rb_root blocks = RB_ROOT;
3261         struct btrfs_key key;
3262         struct file_extent_cluster *cluster;
3263         struct btrfs_trans_handle *trans = NULL;
3264         struct btrfs_path *path;
3265         struct btrfs_extent_item *ei;
3266         unsigned long nr;
3267         u64 flags;
3268         u32 item_size;
3269         int ret;
3270         int err = 0;
3271
3272         cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
3273         if (!cluster)
3274                 return -ENOMEM;
3275
3276         path = btrfs_alloc_path();
3277         if (!path)
3278                 return -ENOMEM;
3279
3280         rc->extents_found = 0;
3281         rc->extents_skipped = 0;
3282
3283         rc->search_start = rc->block_group->key.objectid;
3284         clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3285                           GFP_NOFS);
3286
3287         rc->create_reloc_root = 1;
3288         set_reloc_control(rc);
3289
3290         trans = btrfs_start_transaction(rc->extent_root, 1);
3291         btrfs_commit_transaction(trans, rc->extent_root);
3292
3293         while (1) {
3294                 trans = btrfs_start_transaction(rc->extent_root, 1);
3295
3296                 ret = find_next_extent(trans, rc, path);
3297                 if (ret < 0)
3298                         err = ret;
3299                 if (ret != 0)
3300                         break;
3301
3302                 rc->extents_found++;
3303
3304                 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3305                                     struct btrfs_extent_item);
3306                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3307                 item_size = btrfs_item_size_nr(path->nodes[0],
3308                                                path->slots[0]);
3309                 if (item_size >= sizeof(*ei)) {
3310                         flags = btrfs_extent_flags(path->nodes[0], ei);
3311                         ret = check_extent_flags(flags);
3312                         BUG_ON(ret);
3313
3314                 } else {
3315 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3316                         u64 ref_owner;
3317                         int path_change = 0;
3318
3319                         BUG_ON(item_size !=
3320                                sizeof(struct btrfs_extent_item_v0));
3321                         ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
3322                                                   &path_change);
3323                         if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
3324                                 flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
3325                         else
3326                                 flags = BTRFS_EXTENT_FLAG_DATA;
3327
3328                         if (path_change) {
3329                                 btrfs_release_path(rc->extent_root, path);
3330
3331                                 path->search_commit_root = 1;
3332                                 path->skip_locking = 1;
3333                                 ret = btrfs_search_slot(NULL, rc->extent_root,
3334                                                         &key, path, 0, 0);
3335                                 if (ret < 0) {
3336                                         err = ret;
3337                                         break;
3338                                 }
3339                                 BUG_ON(ret > 0);
3340                         }
3341 #else
3342                         BUG();
3343 #endif
3344                 }
3345
3346                 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3347                         ret = add_tree_block(rc, &key, path, &blocks);
3348                 } else if (rc->stage == UPDATE_DATA_PTRS &&
3349                          (flags & BTRFS_EXTENT_FLAG_DATA)) {
3350                         ret = add_data_references(rc, &key, path, &blocks);
3351                 } else {
3352                         btrfs_release_path(rc->extent_root, path);
3353                         ret = 0;
3354                 }
3355                 if (ret < 0) {
3356                         err = 0;
3357                         break;
3358                 }
3359
3360                 if (!RB_EMPTY_ROOT(&blocks)) {
3361                         ret = relocate_tree_blocks(trans, rc, &blocks);
3362                         if (ret < 0) {
3363                                 err = ret;
3364                                 break;
3365                         }
3366                 }
3367
3368                 nr = trans->blocks_used;
3369                 btrfs_end_transaction(trans, rc->extent_root);
3370                 trans = NULL;
3371                 btrfs_btree_balance_dirty(rc->extent_root, nr);
3372
3373                 if (rc->stage == MOVE_DATA_EXTENTS &&
3374                     (flags & BTRFS_EXTENT_FLAG_DATA)) {
3375                         rc->found_file_extent = 1;
3376                         ret = relocate_data_extent(rc->data_inode,
3377                                                    &key, cluster);
3378                         if (ret < 0) {
3379                                 err = ret;
3380                                 break;
3381                         }
3382                 }
3383         }
3384         btrfs_free_path(path);
3385
3386         if (trans) {
3387                 nr = trans->blocks_used;
3388                 btrfs_end_transaction(trans, rc->extent_root);
3389                 btrfs_btree_balance_dirty(rc->extent_root, nr);
3390         }
3391
3392         if (!err) {
3393                 ret = relocate_file_extent_cluster(rc->data_inode, cluster);
3394                 if (ret < 0)
3395                         err = ret;
3396         }
3397
3398         kfree(cluster);
3399
3400         rc->create_reloc_root = 0;
3401         smp_mb();
3402
3403         if (rc->extents_found > 0) {
3404                 trans = btrfs_start_transaction(rc->extent_root, 1);
3405                 btrfs_commit_transaction(trans, rc->extent_root);
3406         }
3407
3408         merge_reloc_roots(rc);
3409
3410         unset_reloc_control(rc);
3411
3412         /* get rid of pinned extents */
3413         trans = btrfs_start_transaction(rc->extent_root, 1);
3414         btrfs_commit_transaction(trans, rc->extent_root);
3415
3416         return err;
3417 }
3418
3419 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3420                                  struct btrfs_root *root, u64 objectid)
3421 {
3422         struct btrfs_path *path;
3423         struct btrfs_inode_item *item;
3424         struct extent_buffer *leaf;
3425         int ret;
3426
3427         path = btrfs_alloc_path();
3428         if (!path)
3429                 return -ENOMEM;
3430
3431         ret = btrfs_insert_empty_inode(trans, root, path, objectid);
3432         if (ret)
3433                 goto out;
3434
3435         leaf = path->nodes[0];
3436         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
3437         memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
3438         btrfs_set_inode_generation(leaf, item, 1);
3439         btrfs_set_inode_size(leaf, item, 0);
3440         btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3441         btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
3442         btrfs_mark_buffer_dirty(leaf);
3443         btrfs_release_path(root, path);
3444 out:
3445         btrfs_free_path(path);
3446         return ret;
3447 }
3448
3449 /*
3450  * helper to create inode for data relocation.
3451  * the inode is in data relocation tree and its link count is 0
3452  */
3453 static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3454                                         struct btrfs_block_group_cache *group)
3455 {
3456         struct inode *inode = NULL;
3457         struct btrfs_trans_handle *trans;
3458         struct btrfs_root *root;
3459         struct btrfs_key key;
3460         unsigned long nr;
3461         u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
3462         int err = 0;
3463
3464         root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
3465         if (IS_ERR(root))
3466                 return ERR_CAST(root);
3467
3468         trans = btrfs_start_transaction(root, 1);
3469         BUG_ON(!trans);
3470
3471         err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
3472         if (err)
3473                 goto out;
3474
3475         err = __insert_orphan_inode(trans, root, objectid);
3476         BUG_ON(err);
3477
3478         key.objectid = objectid;
3479         key.type = BTRFS_INODE_ITEM_KEY;
3480         key.offset = 0;
3481         inode = btrfs_iget(root->fs_info->sb, &key, root);
3482         BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
3483         BTRFS_I(inode)->index_cnt = group->key.objectid;
3484
3485         err = btrfs_orphan_add(trans, inode);
3486 out:
3487         nr = trans->blocks_used;
3488         btrfs_end_transaction(trans, root);
3489
3490         btrfs_btree_balance_dirty(root, nr);
3491         if (err) {
3492                 if (inode)
3493                         iput(inode);
3494                 inode = ERR_PTR(err);
3495         }
3496         return inode;
3497 }
3498
3499 /*
3500  * function to relocate all extents in a block group.
3501  */
3502 int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3503 {
3504         struct btrfs_fs_info *fs_info = extent_root->fs_info;
3505         struct reloc_control *rc;
3506         int ret;
3507         int err = 0;
3508
3509         rc = kzalloc(sizeof(*rc), GFP_NOFS);
3510         if (!rc)
3511                 return -ENOMEM;
3512
3513         mapping_tree_init(&rc->reloc_root_tree);
3514         extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3515         INIT_LIST_HEAD(&rc->reloc_roots);
3516
3517         rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
3518         BUG_ON(!rc->block_group);
3519
3520         btrfs_init_workers(&rc->workers, "relocate",
3521                            fs_info->thread_pool_size, NULL);
3522
3523         rc->extent_root = extent_root;
3524         btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
3525
3526         rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3527         if (IS_ERR(rc->data_inode)) {
3528                 err = PTR_ERR(rc->data_inode);
3529                 rc->data_inode = NULL;
3530                 goto out;
3531         }
3532
3533         printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n",
3534                (unsigned long long)rc->block_group->key.objectid,
3535                (unsigned long long)rc->block_group->flags);
3536
3537         btrfs_start_delalloc_inodes(fs_info->tree_root);
3538         btrfs_wait_ordered_extents(fs_info->tree_root, 0);
3539
3540         while (1) {
3541                 rc->extents_found = 0;
3542                 rc->extents_skipped = 0;
3543
3544                 mutex_lock(&fs_info->cleaner_mutex);
3545
3546                 btrfs_clean_old_snapshots(fs_info->tree_root);
3547                 ret = relocate_block_group(rc);
3548
3549                 mutex_unlock(&fs_info->cleaner_mutex);
3550                 if (ret < 0) {
3551                         err = ret;
3552                         break;
3553                 }
3554
3555                 if (rc->extents_found == 0)
3556                         break;
3557
3558                 printk(KERN_INFO "btrfs: found %llu extents\n",
3559                         (unsigned long long)rc->extents_found);
3560
3561                 if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
3562                         btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1);
3563                         invalidate_mapping_pages(rc->data_inode->i_mapping,
3564                                                  0, -1);
3565                         rc->stage = UPDATE_DATA_PTRS;
3566                 } else if (rc->stage == UPDATE_DATA_PTRS &&
3567                            rc->extents_skipped >= rc->extents_found) {
3568                         iput(rc->data_inode);
3569                         rc->data_inode = create_reloc_inode(fs_info,
3570                                                             rc->block_group);
3571                         if (IS_ERR(rc->data_inode)) {
3572                                 err = PTR_ERR(rc->data_inode);
3573                                 rc->data_inode = NULL;
3574                                 break;
3575                         }
3576                         rc->stage = MOVE_DATA_EXTENTS;
3577                         rc->found_file_extent = 0;
3578                 }
3579         }
3580
3581         filemap_write_and_wait_range(fs_info->btree_inode->i_mapping,
3582                                      rc->block_group->key.objectid,
3583                                      rc->block_group->key.objectid +
3584                                      rc->block_group->key.offset - 1);
3585
3586         WARN_ON(rc->block_group->pinned > 0);
3587         WARN_ON(rc->block_group->reserved > 0);
3588         WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
3589 out:
3590         iput(rc->data_inode);
3591         btrfs_stop_workers(&rc->workers);
3592         btrfs_put_block_group(rc->block_group);
3593         kfree(rc);
3594         return err;
3595 }
3596
3597 static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
3598 {
3599         struct btrfs_trans_handle *trans;
3600         int ret;
3601
3602         trans = btrfs_start_transaction(root->fs_info->tree_root, 1);
3603
3604         memset(&root->root_item.drop_progress, 0,
3605                 sizeof(root->root_item.drop_progress));
3606         root->root_item.drop_level = 0;
3607         btrfs_set_root_refs(&root->root_item, 0);
3608         ret = btrfs_update_root(trans, root->fs_info->tree_root,
3609                                 &root->root_key, &root->root_item);
3610         BUG_ON(ret);
3611
3612         ret = btrfs_end_transaction(trans, root->fs_info->tree_root);
3613         BUG_ON(ret);
3614         return 0;
3615 }
3616
3617 /*
3618  * recover relocation interrupted by system crash.
3619  *
3620  * this function resumes merging reloc trees with corresponding fs trees.
3621  * this is important for keeping the sharing of tree blocks
3622  */
3623 int btrfs_recover_relocation(struct btrfs_root *root)
3624 {
3625         LIST_HEAD(reloc_roots);
3626         struct btrfs_key key;
3627         struct btrfs_root *fs_root;
3628         struct btrfs_root *reloc_root;
3629         struct btrfs_path *path;
3630         struct extent_buffer *leaf;
3631         struct reloc_control *rc = NULL;
3632         struct btrfs_trans_handle *trans;
3633         int ret;
3634         int err = 0;
3635
3636         path = btrfs_alloc_path();
3637         if (!path)
3638                 return -ENOMEM;
3639
3640         key.objectid = BTRFS_TREE_RELOC_OBJECTID;
3641         key.type = BTRFS_ROOT_ITEM_KEY;
3642         key.offset = (u64)-1;
3643
3644         while (1) {
3645                 ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key,
3646                                         path, 0, 0);
3647                 if (ret < 0) {
3648                         err = ret;
3649                         goto out;
3650                 }
3651                 if (ret > 0) {
3652                         if (path->slots[0] == 0)
3653                                 break;
3654                         path->slots[0]--;
3655                 }
3656                 leaf = path->nodes[0];
3657                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3658                 btrfs_release_path(root->fs_info->tree_root, path);
3659
3660                 if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
3661                     key.type != BTRFS_ROOT_ITEM_KEY)
3662                         break;
3663
3664                 reloc_root = btrfs_read_fs_root_no_radix(root, &key);
3665                 if (IS_ERR(reloc_root)) {
3666                         err = PTR_ERR(reloc_root);
3667                         goto out;
3668                 }
3669
3670                 list_add(&reloc_root->root_list, &reloc_roots);
3671
3672                 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
3673                         fs_root = read_fs_root(root->fs_info,
3674                                                reloc_root->root_key.offset);
3675                         if (IS_ERR(fs_root)) {
3676                                 ret = PTR_ERR(fs_root);
3677                                 if (ret != -ENOENT) {
3678                                         err = ret;
3679                                         goto out;
3680                                 }
3681                                 mark_garbage_root(reloc_root);
3682                         }
3683                 }
3684
3685                 if (key.offset == 0)
3686                         break;
3687
3688                 key.offset--;
3689         }
3690         btrfs_release_path(root->fs_info->tree_root, path);
3691
3692         if (list_empty(&reloc_roots))
3693                 goto out;
3694
3695         rc = kzalloc(sizeof(*rc), GFP_NOFS);
3696         if (!rc) {
3697                 err = -ENOMEM;
3698                 goto out;
3699         }
3700
3701         mapping_tree_init(&rc->reloc_root_tree);
3702         INIT_LIST_HEAD(&rc->reloc_roots);
3703         btrfs_init_workers(&rc->workers, "relocate",
3704                            root->fs_info->thread_pool_size, NULL);
3705         rc->extent_root = root->fs_info->extent_root;
3706
3707         set_reloc_control(rc);
3708
3709         while (!list_empty(&reloc_roots)) {
3710                 reloc_root = list_entry(reloc_roots.next,
3711                                         struct btrfs_root, root_list);
3712                 list_del(&reloc_root->root_list);
3713
3714                 if (btrfs_root_refs(&reloc_root->root_item) == 0) {
3715                         list_add_tail(&reloc_root->root_list,
3716                                       &rc->reloc_roots);
3717                         continue;
3718                 }
3719
3720                 fs_root = read_fs_root(root->fs_info,
3721                                        reloc_root->root_key.offset);
3722                 BUG_ON(IS_ERR(fs_root));
3723
3724                 __add_reloc_root(reloc_root);
3725                 fs_root->reloc_root = reloc_root;
3726         }
3727
3728         trans = btrfs_start_transaction(rc->extent_root, 1);
3729         btrfs_commit_transaction(trans, rc->extent_root);
3730
3731         merge_reloc_roots(rc);
3732
3733         unset_reloc_control(rc);
3734
3735         trans = btrfs_start_transaction(rc->extent_root, 1);
3736         btrfs_commit_transaction(trans, rc->extent_root);
3737 out:
3738         if (rc) {
3739                 btrfs_stop_workers(&rc->workers);
3740                 kfree(rc);
3741         }
3742         while (!list_empty(&reloc_roots)) {
3743                 reloc_root = list_entry(reloc_roots.next,
3744                                         struct btrfs_root, root_list);
3745                 list_del(&reloc_root->root_list);
3746                 free_extent_buffer(reloc_root->node);
3747                 free_extent_buffer(reloc_root->commit_root);
3748                 kfree(reloc_root);
3749         }
3750         btrfs_free_path(path);
3751
3752         if (err == 0) {
3753                 /* cleanup orphan inode in data relocation tree */
3754                 fs_root = read_fs_root(root->fs_info,
3755                                        BTRFS_DATA_RELOC_TREE_OBJECTID);
3756                 if (IS_ERR(fs_root))
3757                         err = PTR_ERR(fs_root);
3758                 btrfs_orphan_cleanup(fs_root);
3759         }
3760         return err;
3761 }
3762
3763 /*
3764  * helper to add ordered checksum for data relocation.
3765  *
3766  * cloning checksum properly handles the nodatasum extents.
3767  * it also saves CPU time to re-calculate the checksum.
3768  */
3769 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
3770 {
3771         struct btrfs_ordered_sum *sums;
3772         struct btrfs_sector_sum *sector_sum;
3773         struct btrfs_ordered_extent *ordered;
3774         struct btrfs_root *root = BTRFS_I(inode)->root;
3775         size_t offset;
3776         int ret;
3777         u64 disk_bytenr;
3778         LIST_HEAD(list);
3779
3780         ordered = btrfs_lookup_ordered_extent(inode, file_pos);
3781         BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
3782
3783         disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
3784         ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
3785                                        disk_bytenr + len - 1, &list);
3786
3787         while (!list_empty(&list)) {
3788                 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
3789                 list_del_init(&sums->list);
3790
3791                 sector_sum = sums->sums;
3792                 sums->bytenr = ordered->start;
3793
3794                 offset = 0;
3795                 while (offset < sums->len) {
3796                         sector_sum->bytenr += ordered->start - disk_bytenr;
3797                         sector_sum++;
3798                         offset += root->sectorsize;
3799                 }
3800
3801                 btrfs_add_ordered_sum(inode, ordered, sums);
3802         }
3803         btrfs_put_ordered_extent(ordered);
3804         return 0;
3805 }