Btrfs: fix bitmap size tracking
fs/btrfs/free-space-cache.c
/*
 * Copyright (C) 2008 Red Hat.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/math64.h>
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"

#define BITS_PER_BITMAP         (PAGE_CACHE_SIZE * 8)
#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)

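/*
 * A bitmap entry tracks one page worth of bits, each bit covering one
 * sector.  On a machine with 4k pages and a 4k sectorsize (a common
 * configuration, used here only as an example), BITS_PER_BITMAP is
 * 32768 and a single bitmap therefore covers 128MiB of disk.
 * MAX_CACHE_BYTES_PER_GIG caps the memory spent tracking free space at
 * 32k per gigabyte of block group; recalculate_thresholds() below uses
 * it to decide when new free space must go into bitmaps instead of
 * extent entries.
 */
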
static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
                                          u64 offset)
{
        BUG_ON(offset < bitmap_start);
        offset -= bitmap_start;
        return (unsigned long)(div64_u64(offset, sectorsize));
}

static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize)
{
        return (unsigned long)(div64_u64(bytes, sectorsize));
}

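/*
 * Round an offset down to the start of the bitmap that covers it.  For
 * example (again assuming 4k pages and sectors, so 128MiB per bitmap),
 * an offset 200MiB into the block group lands in the second bitmap,
 * whose start is the block group's objectid plus 128MiB.
 */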
static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group,
                                   u64 offset)
{
        u64 bitmap_start;
        u64 bytes_per_bitmap;

        bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize;
        bitmap_start = offset - block_group->key.objectid;
        bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
        bitmap_start *= bytes_per_bitmap;
        bitmap_start += block_group->key.objectid;

        return bitmap_start;
}

static int tree_insert_offset(struct rb_root *root, u64 offset,
                              struct rb_node *node, int bitmap)
{
        struct rb_node **p = &root->rb_node;
        struct rb_node *parent = NULL;
        struct btrfs_free_space *info;

        while (*p) {
                parent = *p;
                info = rb_entry(parent, struct btrfs_free_space, offset_index);

                if (offset < info->offset) {
                        p = &(*p)->rb_left;
                } else if (offset > info->offset) {
                        p = &(*p)->rb_right;
                } else {
                        /*
                         * we could have a bitmap entry and an extent entry
                         * share the same offset.  If this is the case, we want
                         * the extent entry to always be found first if we do a
                         * linear search through the tree, since we want to have
                         * the quickest allocation time, and allocating from an
                         * extent is faster than allocating from a bitmap.  So
                         * if we're inserting a bitmap and we find an entry at
                         * this offset, we want to go right, or after this entry
                         * logically.  If we are inserting an extent and we've
                         * found a bitmap, we want to go left, or before
                         * logically.
                         */
                        if (bitmap) {
                                WARN_ON(info->bitmap);
                                p = &(*p)->rb_right;
                        } else {
                                WARN_ON(!info->bitmap);
                                p = &(*p)->rb_left;
                        }
                }
        }

        rb_link_node(node, parent, p);
        rb_insert_color(node, root);

        return 0;
}

/*
 * searches the tree for the given offset.
 *
 * bitmap_only - If this is set, only return a bitmap entry that starts at the
 * given offset.
 *
 * fuzzy - If this is set, then we are trying to make an allocation, and we just
 * want a section that is at least 'bytes' in size and comes at or after the
 * given offset.
 */
static struct btrfs_free_space *
tree_search_offset(struct btrfs_block_group_cache *block_group,
                   u64 offset, int bitmap_only, int fuzzy)
{
        struct rb_node *n = block_group->free_space_offset.rb_node;
        struct btrfs_free_space *entry, *prev = NULL;

        /* find entry that is closest to the 'offset' */
        while (1) {
                if (!n) {
                        entry = NULL;
                        break;
                }

                entry = rb_entry(n, struct btrfs_free_space, offset_index);
                prev = entry;

                if (offset < entry->offset)
                        n = n->rb_left;
                else if (offset > entry->offset)
                        n = n->rb_right;
                else
                        break;
        }

        if (bitmap_only) {
                if (!entry)
                        return NULL;
                if (entry->bitmap)
                        return entry;

                /*
                 * bitmap entry and extent entry may share same offset,
                 * in that case, bitmap entry comes after extent entry.
                 */
                n = rb_next(n);
                if (!n)
                        return NULL;
                entry = rb_entry(n, struct btrfs_free_space, offset_index);
                if (entry->offset != offset)
                        return NULL;

                WARN_ON(!entry->bitmap);
                return entry;
        } else if (entry) {
                if (entry->bitmap) {
                        /*
                         * if previous extent entry covers the offset,
                         * we should return it instead of the bitmap entry
                         */
                        n = &entry->offset_index;
                        while (1) {
                                n = rb_prev(n);
                                if (!n)
                                        break;
                                prev = rb_entry(n, struct btrfs_free_space,
                                                offset_index);
                                if (!prev->bitmap) {
                                        if (prev->offset + prev->bytes > offset)
                                                entry = prev;
                                        break;
                                }
                        }
                }
                return entry;
        }

        if (!prev)
                return NULL;

        /* find last entry before the 'offset' */
        entry = prev;
        if (entry->offset > offset) {
                n = rb_prev(&entry->offset_index);
                if (n) {
                        entry = rb_entry(n, struct btrfs_free_space,
                                        offset_index);
                        BUG_ON(entry->offset > offset);
                } else {
                        if (fuzzy)
                                return entry;
                        else
                                return NULL;
                }
        }

        if (entry->bitmap) {
                n = &entry->offset_index;
                while (1) {
                        n = rb_prev(n);
                        if (!n)
                                break;
                        prev = rb_entry(n, struct btrfs_free_space,
                                        offset_index);
                        if (!prev->bitmap) {
                                if (prev->offset + prev->bytes > offset)
                                        return prev;
                                break;
                        }
                }
                if (entry->offset + BITS_PER_BITMAP *
                    block_group->sectorsize > offset)
                        return entry;
        } else if (entry->offset + entry->bytes > offset)
                return entry;

        if (!fuzzy)
                return NULL;

        while (1) {
                if (entry->bitmap) {
                        if (entry->offset + BITS_PER_BITMAP *
                            block_group->sectorsize > offset)
                                break;
                } else {
                        if (entry->offset + entry->bytes > offset)
                                break;
                }

                n = rb_next(&entry->offset_index);
                if (!n)
                        return NULL;
                entry = rb_entry(n, struct btrfs_free_space, offset_index);
        }
        return entry;
}

static void unlink_free_space(struct btrfs_block_group_cache *block_group,
                              struct btrfs_free_space *info)
{
        rb_erase(&info->offset_index, &block_group->free_space_offset);
        block_group->free_extents--;
        block_group->free_space -= info->bytes;
}

static int link_free_space(struct btrfs_block_group_cache *block_group,
                           struct btrfs_free_space *info)
{
        int ret = 0;

        BUG_ON(!info->bitmap && !info->bytes);
        ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
                                 &info->offset_index, (info->bitmap != NULL));
        if (ret)
                return ret;

        block_group->free_space += info->bytes;
        block_group->free_extents++;
        return ret;
}

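/*
 * Worked example for the threshold math below (sizes are configuration
 * dependent, 4k pages assumed): a 1GiB block group gets a 32k budget,
 * and each bitmap costs one page, so eight bitmaps alone would exhaust
 * the budget.  Whatever the bitmaps leave over is divided by
 * sizeof(struct btrfs_free_space) to get the new extent threshold.
 */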
static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
{
        u64 max_bytes, possible_bytes;

        /*
         * The goal is to keep the total amount of memory used per 1gb of space
         * at or below 32k, so we need to adjust how much memory we allow to be
         * used by extent based free space tracking
         */
        max_bytes = MAX_CACHE_BYTES_PER_GIG *
                (div64_u64(block_group->key.offset, 1024 * 1024 * 1024));

        possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) +
                (sizeof(struct btrfs_free_space) *
                 block_group->extents_thresh);

        if (possible_bytes > max_bytes) {
                int extent_bytes = max_bytes -
                        (block_group->total_bitmaps * PAGE_CACHE_SIZE);

                if (extent_bytes <= 0) {
                        block_group->extents_thresh = 0;
                        return;
                }

                block_group->extents_thresh = extent_bytes /
                        (sizeof(struct btrfs_free_space));
        }
}

static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
                              struct btrfs_free_space *info, u64 offset,
                              u64 bytes)
{
        unsigned long start, end;
        unsigned long i;

        start = offset_to_bit(info->offset, block_group->sectorsize, offset);
        end = start + bytes_to_bits(bytes, block_group->sectorsize);
        BUG_ON(end > BITS_PER_BITMAP);

        for (i = start; i < end; i++)
                clear_bit(i, info->bitmap);

        info->bytes -= bytes;
        block_group->free_space -= bytes;
}

static void bitmap_set_bits(struct btrfs_block_group_cache *block_group,
                            struct btrfs_free_space *info, u64 offset,
                            u64 bytes)
{
        unsigned long start, end;
        unsigned long i;

        start = offset_to_bit(info->offset, block_group->sectorsize, offset);
        end = start + bytes_to_bits(bytes, block_group->sectorsize);
        BUG_ON(end > BITS_PER_BITMAP);

        for (i = start; i < end; i++)
                set_bit(i, info->bitmap);

        info->bytes += bytes;
        block_group->free_space += bytes;
}

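/*
 * Scan a bitmap for a run of set bits big enough to satisfy the
 * request: walk from the first set bit at or after *offset, and at each
 * candidate check the distance to the next zero bit.  On success the
 * start and length of the run found are returned through *offset and
 * *bytes and the function returns 0; -1 means no run was large enough.
 */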
static int search_bitmap(struct btrfs_block_group_cache *block_group,
                         struct btrfs_free_space *bitmap_info, u64 *offset,
                         u64 *bytes)
{
        unsigned long found_bits = 0;
        unsigned long bits, i;
        unsigned long next_zero;

        i = offset_to_bit(bitmap_info->offset, block_group->sectorsize,
                          max_t(u64, *offset, bitmap_info->offset));
        bits = bytes_to_bits(*bytes, block_group->sectorsize);

        for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
             i < BITS_PER_BITMAP;
             i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
                next_zero = find_next_zero_bit(bitmap_info->bitmap,
                                               BITS_PER_BITMAP, i);
                if ((next_zero - i) >= bits) {
                        found_bits = next_zero - i;
                        break;
                }
                i = next_zero;
        }

        if (found_bits) {
                *offset = (u64)(i * block_group->sectorsize) +
                        bitmap_info->offset;
                *bytes = (u64)(found_bits) * block_group->sectorsize;
                return 0;
        }

        return -1;
}

static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
                                                *block_group, u64 *offset,
                                                u64 *bytes, int debug)
{
        struct btrfs_free_space *entry;
        struct rb_node *node;
        int ret;

        if (!block_group->free_space_offset.rb_node)
                return NULL;

        entry = tree_search_offset(block_group,
                                   offset_to_bitmap(block_group, *offset),
                                   0, 1);
        if (!entry)
                return NULL;

        for (node = &entry->offset_index; node; node = rb_next(node)) {
                entry = rb_entry(node, struct btrfs_free_space, offset_index);
                if (entry->bytes < *bytes)
                        continue;

                if (entry->bitmap) {
                        ret = search_bitmap(block_group, entry, offset, bytes);
                        if (!ret)
                                return entry;
                        continue;
                }

                *offset = entry->offset;
                *bytes = entry->bytes;
                return entry;
        }

        return NULL;
}

static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
                           struct btrfs_free_space *info, u64 offset)
{
        u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
        int max_bitmaps = (int)div64_u64(block_group->key.offset +
                                         bytes_per_bg - 1, bytes_per_bg);
        BUG_ON(block_group->total_bitmaps >= max_bitmaps);

        info->offset = offset_to_bitmap(block_group, offset);
        info->bytes = 0;
        link_free_space(block_group, info);
        block_group->total_bitmaps++;

        recalculate_thresholds(block_group);
}

static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
                              struct btrfs_free_space *bitmap_info,
                              u64 *offset, u64 *bytes)
{
        u64 end;
        u64 search_start, search_bytes;
        int ret;

again:
        end = bitmap_info->offset +
                (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1;

        /*
         * XXX - this can go away after a few releases.
         *
         * since the only user of btrfs_remove_free_space is the tree logging
         * stuff, and the only way to test that is under crash conditions, we
         * want to have this debug stuff here just in case something's not
         * working.  Search the bitmap for the space we are trying to use to
         * make sure it's actually there.  If it's not there then we need to
         * stop because something has gone wrong.
         */
        search_start = *offset;
        search_bytes = *bytes;
        ret = search_bitmap(block_group, bitmap_info, &search_start,
                            &search_bytes);
        BUG_ON(ret < 0 || search_start != *offset);

        if (*offset > bitmap_info->offset && *offset + *bytes > end) {
                bitmap_clear_bits(block_group, bitmap_info, *offset,
                                  end - *offset + 1);
                *bytes -= end - *offset + 1;
                *offset = end + 1;
        } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
                bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes);
                *bytes = 0;
        }

        if (*bytes) {
                struct rb_node *next = rb_next(&bitmap_info->offset_index);
                if (!bitmap_info->bytes) {
                        unlink_free_space(block_group, bitmap_info);
                        kfree(bitmap_info->bitmap);
                        kfree(bitmap_info);
                        block_group->total_bitmaps--;
                        recalculate_thresholds(block_group);
                }

                /*
                 * no entry after this bitmap, but we still have bytes to
                 * remove, so something has gone wrong.
                 */
                if (!next)
                        return -EINVAL;

                bitmap_info = rb_entry(next, struct btrfs_free_space,
                                       offset_index);

                /*
                 * if the next entry isn't a bitmap we need to return to let the
                 * extent stuff do its work.
                 */
                if (!bitmap_info->bitmap)
                        return -EAGAIN;

                /*
                 * Ok the next item is a bitmap, but it may not actually hold
                 * the information for the rest of this free space stuff, so
                 * look for it, and if we don't find it return so we can try
                 * everything over again.
                 */
                search_start = *offset;
                search_bytes = *bytes;
                ret = search_bitmap(block_group, bitmap_info, &search_start,
                                    &search_bytes);
                if (ret < 0 || search_start != *offset)
                        return -EAGAIN;

                goto again;
        } else if (!bitmap_info->bytes) {
                unlink_free_space(block_group, bitmap_info);
                kfree(bitmap_info->bitmap);
                kfree(bitmap_info);
                block_group->total_bitmaps--;
                recalculate_thresholds(block_group);
        }

        return 0;
}

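/*
 * Try to record the new free space in an existing bitmap.  Returns 0 if
 * the space should be added as an extent entry instead, 1 if it was
 * fully accounted for in one or more bitmaps, and a negative errno on
 * allocation failure.  Note that the tree_lock is dropped and retaken
 * when new bitmap memory has to be allocated.
 */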
static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
                              struct btrfs_free_space *info)
{
        struct btrfs_free_space *bitmap_info;
        int added = 0;
        u64 bytes, offset, end;
        int ret;

        /*
         * If we are below the extents threshold then we can add this as an
         * extent, and don't have to deal with the bitmap
         */
        if (block_group->free_extents < block_group->extents_thresh &&
            info->bytes > block_group->sectorsize * 4)
                return 0;

        /*
         * some block groups are so tiny they can't be enveloped by a bitmap, so
         * don't even bother to create a bitmap for this
         */
        if (BITS_PER_BITMAP * block_group->sectorsize >
            block_group->key.offset)
                return 0;

        bytes = info->bytes;
        offset = info->offset;

again:
        bitmap_info = tree_search_offset(block_group,
                                         offset_to_bitmap(block_group, offset),
                                         1, 0);
        if (!bitmap_info) {
                BUG_ON(added);
                goto new_bitmap;
        }

        end = bitmap_info->offset +
                (u64)(BITS_PER_BITMAP * block_group->sectorsize);

        if (offset >= bitmap_info->offset && offset + bytes > end) {
                bitmap_set_bits(block_group, bitmap_info, offset,
                                end - offset);
                bytes -= end - offset;
                offset = end;
                added = 0;
        } else if (offset >= bitmap_info->offset && offset + bytes <= end) {
                bitmap_set_bits(block_group, bitmap_info, offset, bytes);
                bytes = 0;
        } else {
                BUG();
        }

        if (!bytes) {
                ret = 1;
                goto out;
        } else
                goto again;

new_bitmap:
        if (info && info->bitmap) {
                add_new_bitmap(block_group, info, offset);
                added = 1;
                info = NULL;
                goto again;
        } else {
                spin_unlock(&block_group->tree_lock);

                /* no pre-allocated info, allocate a new one */
                if (!info) {
                        info = kzalloc(sizeof(struct btrfs_free_space),
                                       GFP_NOFS);
                        if (!info) {
                                spin_lock(&block_group->tree_lock);
                                ret = -ENOMEM;
                                goto out;
                        }
                }

                /* allocate the bitmap */
                info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
                spin_lock(&block_group->tree_lock);
                if (!info->bitmap) {
                        ret = -ENOMEM;
                        goto out;
                }
                goto again;
        }

out:
        if (info) {
                if (info->bitmap)
                        kfree(info->bitmap);
                kfree(info);
        }

        return ret;
}

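/*
 * Add a chunk of free space to the cache, merging it with any extent
 * entries that directly border it.  If nothing mergeable borders the
 * new space, it may be diverted into a bitmap by insert_into_bitmap()
 * rather than linked as an extent entry.
 */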
int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
                         u64 offset, u64 bytes)
{
        struct btrfs_free_space *right_info = NULL;
        struct btrfs_free_space *left_info = NULL;
        struct btrfs_free_space *info = NULL;
        int ret = 0;

        info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
        if (!info)
                return -ENOMEM;

        info->offset = offset;
        info->bytes = bytes;

        spin_lock(&block_group->tree_lock);

        /*
         * first we want to see if there is free space adjacent to the range we
         * are adding, if there is remove that struct and add a new one to
         * cover the entire range
         */
        right_info = tree_search_offset(block_group, offset + bytes, 0, 0);
        if (right_info && rb_prev(&right_info->offset_index))
                left_info = rb_entry(rb_prev(&right_info->offset_index),
                                     struct btrfs_free_space, offset_index);
        else
                left_info = tree_search_offset(block_group, offset - 1, 0, 0);

        /*
         * If there was no extent directly to the left or right of this new
         * extent then we know we're going to have to allocate a new extent, so
         * before we do that see if we need to drop this into a bitmap
         */
        if ((!left_info || left_info->bitmap) &&
            (!right_info || right_info->bitmap)) {
                ret = insert_into_bitmap(block_group, info);

                if (ret < 0) {
                        goto out;
                } else if (ret) {
                        ret = 0;
                        goto out;
                }
        }

        if (right_info && !right_info->bitmap) {
                unlink_free_space(block_group, right_info);
                info->bytes += right_info->bytes;
                kfree(right_info);
        }

        if (left_info && !left_info->bitmap &&
            left_info->offset + left_info->bytes == offset) {
                unlink_free_space(block_group, left_info);
                info->offset = left_info->offset;
                info->bytes += left_info->bytes;
                kfree(left_info);
        }

        ret = link_free_space(block_group, info);
        if (ret)
                kfree(info);
out:
        spin_unlock(&block_group->tree_lock);

        if (ret) {
                printk(KERN_CRIT "btrfs: unable to add free space: %d\n", ret);
                BUG_ON(ret == -EEXIST);
        }

        return ret;
}

int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
                            u64 offset, u64 bytes)
{
        struct btrfs_free_space *info;
        struct btrfs_free_space *next_info = NULL;
        int ret = 0;

        spin_lock(&block_group->tree_lock);

again:
        info = tree_search_offset(block_group, offset, 0, 0);
        if (!info) {
                /*
                 * oops didn't find an extent that matched the space we wanted
                 * to remove, look for a bitmap instead
                 */
                info = tree_search_offset(block_group,
                                          offset_to_bitmap(block_group, offset),
                                          1, 0);
                if (!info) {
                        WARN_ON(1);
                        goto out_lock;
                }
        }

        if (info->bytes < bytes && rb_next(&info->offset_index)) {
                u64 end;
                next_info = rb_entry(rb_next(&info->offset_index),
                                             struct btrfs_free_space,
                                             offset_index);

                if (next_info->bitmap)
                        end = next_info->offset + BITS_PER_BITMAP *
                                block_group->sectorsize - 1;
                else
                        end = next_info->offset + next_info->bytes;

                if (next_info->bytes < bytes ||
                    next_info->offset > offset || offset > end) {
                        printk(KERN_CRIT "Found free space at %llu, size %llu,"
                              " trying to use %llu\n",
                              (unsigned long long)info->offset,
                              (unsigned long long)info->bytes,
                              (unsigned long long)bytes);
                        WARN_ON(1);
                        ret = -EINVAL;
                        goto out_lock;
                }

                info = next_info;
        }

        if (info->bytes == bytes) {
                unlink_free_space(block_group, info);
                if (info->bitmap) {
                        kfree(info->bitmap);
                        block_group->total_bitmaps--;
                }
                kfree(info);
                goto out_lock;
        }

        if (!info->bitmap && info->offset == offset) {
                unlink_free_space(block_group, info);
                info->offset += bytes;
                info->bytes -= bytes;
                link_free_space(block_group, info);
                goto out_lock;
        }

        if (!info->bitmap && info->offset <= offset &&
            info->offset + info->bytes >= offset + bytes) {
                u64 old_start = info->offset;
                /*
                 * we're freeing space in the middle of the info,
                 * this can happen during tree log replay
                 *
                 * first unlink the old info and then
                 * insert it again after the hole we're creating
                 */
                unlink_free_space(block_group, info);
                if (offset + bytes < info->offset + info->bytes) {
                        u64 old_end = info->offset + info->bytes;

                        info->offset = offset + bytes;
                        info->bytes = old_end - info->offset;
                        ret = link_free_space(block_group, info);
                        WARN_ON(ret);
                        if (ret)
                                goto out_lock;
                } else {
                        /* the hole we're creating ends at the end
                         * of the info struct, just free the info
                         */
                        kfree(info);
                }
                spin_unlock(&block_group->tree_lock);

                /* step two, insert a new info struct to cover
                 * anything before the hole
                 */
                ret = btrfs_add_free_space(block_group, old_start,
                                           offset - old_start);
                WARN_ON(ret);
                goto out;
        }

        ret = remove_from_bitmap(block_group, info, &offset, &bytes);
        if (ret == -EAGAIN)
                goto again;
        BUG_ON(ret);
out_lock:
        spin_unlock(&block_group->tree_lock);
out:
        return ret;
}

void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
                           u64 bytes)
{
        struct btrfs_free_space *info;
        struct rb_node *n;
        int count = 0;

        for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) {
                info = rb_entry(n, struct btrfs_free_space, offset_index);
                if (info->bytes >= bytes)
                        count++;
                printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
                       (unsigned long long)info->offset,
                       (unsigned long long)info->bytes,
                       (info->bitmap) ? "yes" : "no");
        }
        printk(KERN_INFO "block group has cluster?: %s\n",
               list_empty(&block_group->cluster_list) ? "no" : "yes");
        printk(KERN_INFO "%d blocks of free space at or bigger than %llu bytes\n",
               count, (unsigned long long)bytes);
}

u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
{
        struct btrfs_free_space *info;
        struct rb_node *n;
        u64 ret = 0;

        for (n = rb_first(&block_group->free_space_offset); n;
             n = rb_next(n)) {
                info = rb_entry(n, struct btrfs_free_space, offset_index);
                ret += info->bytes;
        }

        return ret;
}

/*
 * for a given cluster, put all of its extents back into the free
 * space cache.  If the block group passed doesn't match the block group
 * pointed to by the cluster, someone else raced in and freed the
 * cluster already.  In that case, we just return without changing anything
 */
static int
__btrfs_return_cluster_to_free_space(
                             struct btrfs_block_group_cache *block_group,
                             struct btrfs_free_cluster *cluster)
{
        struct btrfs_free_space *entry;
        struct rb_node *node;
        bool bitmap;

        spin_lock(&cluster->lock);
        if (cluster->block_group != block_group)
                goto out;

        bitmap = cluster->points_to_bitmap;
        cluster->block_group = NULL;
        cluster->window_start = 0;
        list_del_init(&cluster->block_group_list);
        cluster->points_to_bitmap = false;

        if (bitmap)
                goto out;

        node = rb_first(&cluster->root);
        while (node) {
                entry = rb_entry(node, struct btrfs_free_space, offset_index);
                node = rb_next(&entry->offset_index);
                rb_erase(&entry->offset_index, &cluster->root);
                BUG_ON(entry->bitmap);
                tree_insert_offset(&block_group->free_space_offset,
                                   entry->offset, &entry->offset_index, 0);
        }
        cluster->root.rb_node = NULL;

out:
        spin_unlock(&cluster->lock);
        btrfs_put_block_group(block_group);
        return 0;
}

void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
{
        struct btrfs_free_space *info;
        struct rb_node *node;
        struct btrfs_free_cluster *cluster;
        struct list_head *head;

        spin_lock(&block_group->tree_lock);
        while ((head = block_group->cluster_list.next) !=
               &block_group->cluster_list) {
                cluster = list_entry(head, struct btrfs_free_cluster,
                                     block_group_list);

                WARN_ON(cluster->block_group != block_group);
                __btrfs_return_cluster_to_free_space(block_group, cluster);
                if (need_resched()) {
                        spin_unlock(&block_group->tree_lock);
                        cond_resched();
                        spin_lock(&block_group->tree_lock);
                }
        }

        while ((node = rb_last(&block_group->free_space_offset)) != NULL) {
                info = rb_entry(node, struct btrfs_free_space, offset_index);
                unlink_free_space(block_group, info);
                if (info->bitmap)
                        kfree(info->bitmap);
                kfree(info);
                if (need_resched()) {
                        spin_unlock(&block_group->tree_lock);
                        cond_resched();
                        spin_lock(&block_group->tree_lock);
                }
        }

        spin_unlock(&block_group->tree_lock);
}

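/*
 * Allocate 'bytes' (searching for bytes + empty_size worth of slack)
 * from the free space cache.  Returns the logical start of the
 * allocation, or 0 if nothing large enough was found.
 */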
u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
                               u64 offset, u64 bytes, u64 empty_size)
{
        struct btrfs_free_space *entry = NULL;
        u64 bytes_search = bytes + empty_size;
        u64 ret = 0;

        spin_lock(&block_group->tree_lock);
        entry = find_free_space(block_group, &offset, &bytes_search, 0);
        if (!entry)
                goto out;

        ret = offset;
        if (entry->bitmap) {
                bitmap_clear_bits(block_group, entry, offset, bytes);
                if (!entry->bytes) {
                        unlink_free_space(block_group, entry);
                        kfree(entry->bitmap);
                        kfree(entry);
                        block_group->total_bitmaps--;
                        recalculate_thresholds(block_group);
                }
        } else {
                unlink_free_space(block_group, entry);
                entry->offset += bytes;
                entry->bytes -= bytes;
                if (!entry->bytes)
                        kfree(entry);
                else
                        link_free_space(block_group, entry);
        }

out:
        spin_unlock(&block_group->tree_lock);

        return ret;
}

/*
 * given a cluster, put all of its extents back into the free space
 * cache.  If a block group is passed, this function will only free
 * a cluster that belongs to the passed block group.
 *
 * Otherwise, it'll get a reference on the block group pointed to by the
 * cluster and remove the cluster from it.
 */
int btrfs_return_cluster_to_free_space(
                               struct btrfs_block_group_cache *block_group,
                               struct btrfs_free_cluster *cluster)
{
        int ret;

        /* first, get a safe pointer to the block group */
        spin_lock(&cluster->lock);
        if (!block_group) {
                block_group = cluster->block_group;
                if (!block_group) {
                        spin_unlock(&cluster->lock);
                        return 0;
                }
        } else if (cluster->block_group != block_group) {
                /* someone else has already freed it don't redo their work */
                spin_unlock(&cluster->lock);
                return 0;
        }
        atomic_inc(&block_group->count);
        spin_unlock(&cluster->lock);

        /* now return any extents the cluster had on it */
        spin_lock(&block_group->tree_lock);
        ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
        spin_unlock(&block_group->tree_lock);

        /* finally drop our ref */
        btrfs_put_block_group(block_group);
        return ret;
}

static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
                                   struct btrfs_free_cluster *cluster,
                                   u64 bytes, u64 min_start)
{
        struct btrfs_free_space *entry;
        int err;
        u64 search_start = cluster->window_start;
        u64 search_bytes = bytes;
        u64 ret = 0;

        spin_lock(&block_group->tree_lock);
        spin_lock(&cluster->lock);

        if (!cluster->points_to_bitmap)
                goto out;

        if (cluster->block_group != block_group)
                goto out;

        /*
         * search_start is the beginning of the bitmap, but at some point it may
         * be a good idea to point to the actual start of the free area in the
         * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only
         * to 1 to make sure we get the bitmap entry
         */
        entry = tree_search_offset(block_group,
                                   offset_to_bitmap(block_group, search_start),
                                   1, 0);
        if (!entry || !entry->bitmap)
                goto out;

        search_start = min_start;
        search_bytes = bytes;

        err = search_bitmap(block_group, entry, &search_start,
                            &search_bytes);
        if (err)
                goto out;

        ret = search_start;
        bitmap_clear_bits(block_group, entry, ret, bytes);
out:
        spin_unlock(&cluster->lock);
        spin_unlock(&block_group->tree_lock);

        return ret;
}

/*
 * given a cluster, try to allocate 'bytes' from it, returns 0
 * if it couldn't find anything suitably large, or a logical disk offset
 * if things worked out
 */
u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
                             struct btrfs_free_cluster *cluster, u64 bytes,
                             u64 min_start)
{
        struct btrfs_free_space *entry = NULL;
        struct rb_node *node;
        u64 ret = 0;

        if (cluster->points_to_bitmap)
                return btrfs_alloc_from_bitmap(block_group, cluster, bytes,
                                               min_start);

        spin_lock(&cluster->lock);
        if (bytes > cluster->max_size)
                goto out;

        if (cluster->block_group != block_group)
                goto out;

        node = rb_first(&cluster->root);
        if (!node)
                goto out;

        entry = rb_entry(node, struct btrfs_free_space, offset_index);

        while (1) {
                if (entry->bytes < bytes || entry->offset < min_start) {
                        struct rb_node *node;

                        node = rb_next(&entry->offset_index);
                        if (!node)
                                break;
                        entry = rb_entry(node, struct btrfs_free_space,
                                         offset_index);
                        continue;
                }
                ret = entry->offset;

                entry->offset += bytes;
                entry->bytes -= bytes;

                if (entry->bytes == 0) {
                        rb_erase(&entry->offset_index, &cluster->root);
                        kfree(entry);
                }
                break;
        }
out:
        spin_unlock(&cluster->lock);

        return ret;
}

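/*
 * Point a cluster at a run of bits inside a bitmap entry.  We look for
 * runs of at least min_bytes worth of contiguous bits, keep accumulating
 * until we have seen 'bytes' worth in total, and reset the window if the
 * bits found are spread over more than twice the target range.  On
 * success the cluster records the window start and is marked as pointing
 * to a bitmap rather than owning extent entries of its own.
 */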
static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
                                struct btrfs_free_space *entry,
                                struct btrfs_free_cluster *cluster,
                                u64 offset, u64 bytes, u64 min_bytes)
{
        unsigned long next_zero;
        unsigned long i;
        unsigned long search_bits;
        unsigned long total_bits;
        unsigned long found_bits;
        unsigned long start = 0;
        unsigned long total_found = 0;
        bool found = false;

        i = offset_to_bit(entry->offset, block_group->sectorsize,
                          max_t(u64, offset, entry->offset));
        search_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
        total_bits = bytes_to_bits(bytes, block_group->sectorsize);

again:
        found_bits = 0;
        for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
             i < BITS_PER_BITMAP;
             i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
                next_zero = find_next_zero_bit(entry->bitmap,
                                               BITS_PER_BITMAP, i);
                if (next_zero - i >= search_bits) {
                        found_bits = next_zero - i;
                        break;
                }
                i = next_zero;
        }

        if (!found_bits)
                return -1;

        if (!found) {
                start = i;
                found = true;
        }

        total_found += found_bits;

        if (cluster->max_size < found_bits * block_group->sectorsize)
                cluster->max_size = found_bits * block_group->sectorsize;

        if (total_found < total_bits) {
                i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
                if (i - start > total_bits * 2) {
                        total_found = 0;
                        cluster->max_size = 0;
                        found = false;
                }
                goto again;
        }

        cluster->window_start = start * block_group->sectorsize +
                entry->offset;
        cluster->points_to_bitmap = true;

        return 0;
}

/*
 * here we try to find a cluster of blocks in a block group.  The goal
 * is to find at least bytes free and up to empty_size + bytes free.
 * We might not find them all in one contiguous area.
 *
 * returns zero and sets up cluster if things worked out, otherwise
 * it returns -ENOSPC
 */
int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct btrfs_block_group_cache *block_group,
                             struct btrfs_free_cluster *cluster,
                             u64 offset, u64 bytes, u64 empty_size)
{
        struct btrfs_free_space *entry = NULL;
        struct rb_node *node;
        struct btrfs_free_space *next;
        struct btrfs_free_space *last = NULL;
        u64 min_bytes;
        u64 window_start;
        u64 window_free;
        u64 max_extent = 0;
        bool found_bitmap = false;
        int ret;

        /* for metadata, allow allocations with more holes */
        if (btrfs_test_opt(root, SSD_SPREAD)) {
                min_bytes = bytes + empty_size;
        } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
                /*
                 * we want to do larger allocations when we are
                 * flushing out the delayed refs, it helps prevent
                 * making more work as we go along.
                 */
                if (trans->transaction->delayed_refs.flushing)
                        min_bytes = max(bytes, (bytes + empty_size) >> 1);
                else
                        min_bytes = max(bytes, (bytes + empty_size) >> 4);
        } else
                min_bytes = max(bytes, (bytes + empty_size) >> 2);

        spin_lock(&block_group->tree_lock);
        spin_lock(&cluster->lock);

        /* someone already found a cluster, hooray */
        if (cluster->block_group) {
                ret = 0;
                goto out;
        }
again:
        entry = tree_search_offset(block_group, offset, found_bitmap, 1);
        if (!entry) {
                ret = -ENOSPC;
                goto out;
        }

        /*
         * If found_bitmap is true, we exhausted our search for extent entries,
         * and we just want to search all of the bitmaps that we can find, and
         * ignore any extent entries we find.
         */
        while (entry->bitmap || found_bitmap ||
               (!entry->bitmap && entry->bytes < min_bytes)) {
                struct rb_node *node = rb_next(&entry->offset_index);

                if (entry->bitmap && entry->bytes > bytes + empty_size) {
                        ret = btrfs_bitmap_cluster(block_group, entry, cluster,
                                                   offset, bytes + empty_size,
                                                   min_bytes);
                        if (!ret)
                                goto got_it;
                }

                if (!node) {
                        ret = -ENOSPC;
                        goto out;
                }
                entry = rb_entry(node, struct btrfs_free_space, offset_index);
        }

        /*
         * We already searched all the extent entries from the passed in offset
         * to the end and didn't find enough space for the cluster, and we also
         * didn't find any bitmaps that met our criteria, just go ahead and exit
         */
        if (found_bitmap) {
                ret = -ENOSPC;
                goto out;
        }

        cluster->points_to_bitmap = false;
        window_start = entry->offset;
        window_free = entry->bytes;
        last = entry;
        max_extent = entry->bytes;

        while (1) {
                /* our window is just right, let's fill it */
                if (window_free >= bytes + empty_size)
                        break;

                node = rb_next(&last->offset_index);
                if (!node) {
                        if (found_bitmap)
                                goto again;
                        ret = -ENOSPC;
                        goto out;
                }
                next = rb_entry(node, struct btrfs_free_space, offset_index);

                /*
                 * we found a bitmap, so if this search doesn't result in a
                 * cluster, we know to go and search again for the bitmaps and
                 * start looking for space there
                 */
                if (next->bitmap) {
                        if (!found_bitmap)
                                offset = next->offset;
                        found_bitmap = true;
                        last = next;
                        continue;
                }

                /*
                 * we haven't filled the empty size and the window is
                 * very large.  reset and try again
                 */
                if (next->offset - (last->offset + last->bytes) > 128 * 1024 ||
                    next->offset - window_start > (bytes + empty_size) * 2) {
                        entry = next;
                        window_start = entry->offset;
                        window_free = entry->bytes;
                        last = entry;
                        max_extent = 0;
                } else {
                        last = next;
                        window_free += next->bytes;
                        if (next->bytes > max_extent)
                                max_extent = next->bytes;
                }
        }

        cluster->window_start = entry->offset;

        /*
         * now we've found our entries, pull them out of the free space
         * cache and put them into the cluster rbtree
         *
         * The cluster includes an rbtree, but only uses the offset index
         * of each free space cache entry.
         */
        while (1) {
                node = rb_next(&entry->offset_index);
                if (entry->bitmap && node) {
                        entry = rb_entry(node, struct btrfs_free_space,
                                         offset_index);
                        continue;
                } else if (entry->bitmap && !node) {
                        break;
                }

                rb_erase(&entry->offset_index, &block_group->free_space_offset);
                ret = tree_insert_offset(&cluster->root, entry->offset,
                                         &entry->offset_index, 0);
                BUG_ON(ret);

                if (!node || entry == last)
                        break;

                entry = rb_entry(node, struct btrfs_free_space, offset_index);
        }

        cluster->max_size = max_extent;
got_it:
        ret = 0;
        atomic_inc(&block_group->count);
        list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
        cluster->block_group = block_group;
out:
        spin_unlock(&cluster->lock);
        spin_unlock(&block_group->tree_lock);

        return ret;
}

/*
 * simple code to zero out a cluster
 */
void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
{
        spin_lock_init(&cluster->lock);
        spin_lock_init(&cluster->refill_lock);
        cluster->root.rb_node = NULL;
        cluster->max_size = 0;
        cluster->points_to_bitmap = false;
        INIT_LIST_HEAD(&cluster->block_group_list);
        cluster->block_group = NULL;
}

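/*
 * Illustrative usage sketch (not part of the kernel source): the free
 * space cache is driven by the extent allocator roughly as below.  The
 * caller shown here is hypothetical; the real callers live elsewhere in
 * the tree, such as extent-tree.c and tree-log.c.
 *
 *      // return freed disk space to the cache
 *      btrfs_add_free_space(block_group, start, len);
 *
 *      // carve an allocation back out of it
 *      u64 found = btrfs_find_space_for_alloc(block_group, search_start,
 *                                             num_bytes, empty_size);
 *      if (!found)
 *              ; // fall back to another block group
 */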