[PATCH] OCFS2: The Second Oracle Cluster Filesystem
[linux-2.6.git] / fs / ocfs2 / suballoc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * suballoc.c
5  *
6  * metadata alloc and free
7  * Inspired by ext3 block groups.
8  *
9  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
10  *
11  * This program is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU General Public
13  * License as published by the Free Software Foundation; either
14  * version 2 of the License, or (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public
22  * License along with this program; if not, write to the
23  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24  * Boston, MA 021110-1307, USA.
25  */
26
27 #include <linux/fs.h>
28 #include <linux/types.h>
29 #include <linux/slab.h>
30 #include <linux/highmem.h>
31
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34
35 #include "ocfs2.h"
36
37 #include "alloc.h"
38 #include "dlmglue.h"
39 #include "inode.h"
40 #include "journal.h"
41 #include "localalloc.h"
42 #include "suballoc.h"
43 #include "super.h"
44 #include "sysfile.h"
45 #include "uptodate.h"
46
47 #include "buffer_head_io.h"
48
49 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
50 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
51 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
52 static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
53                                   struct inode *alloc_inode,
54                                   struct buffer_head *bg_bh,
55                                   u64 group_blkno,
56                                   u16 my_chain,
57                                   struct ocfs2_chain_list *cl);
58 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
59                                    struct inode *alloc_inode,
60                                    struct buffer_head *bh);
61
62 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
63                                        struct ocfs2_alloc_context *ac);
64
65 static int ocfs2_cluster_group_search(struct inode *inode,
66                                       struct buffer_head *group_bh,
67                                       u32 bits_wanted, u32 min_bits,
68                                       u16 *bit_off, u16 *bits_found);
69 static int ocfs2_block_group_search(struct inode *inode,
70                                     struct buffer_head *group_bh,
71                                     u32 bits_wanted, u32 min_bits,
72                                     u16 *bit_off, u16 *bits_found);
73 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
74                               u32 bits_wanted,
75                               u32 min_bits,
76                               u16 *bit_off,
77                               unsigned int *num_bits,
78                               u64 *bg_blkno);
79 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
80                                      struct ocfs2_alloc_context *ac,
81                                      u32 bits_wanted,
82                                      u32 min_bits,
83                                      u16 *bit_off,
84                                      unsigned int *num_bits,
85                                      u64 *bg_blkno);
86 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
87                                          int nr);
88 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
89                                              struct buffer_head *bg_bh,
90                                              unsigned int bits_wanted,
91                                              u16 *bit_off,
92                                              u16 *bits_found);
93 static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
94                                              struct inode *alloc_inode,
95                                              struct ocfs2_group_desc *bg,
96                                              struct buffer_head *group_bh,
97                                              unsigned int bit_off,
98                                              unsigned int num_bits);
99 static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
100                                                struct inode *alloc_inode,
101                                                struct ocfs2_group_desc *bg,
102                                                struct buffer_head *group_bh,
103                                                unsigned int bit_off,
104                                                unsigned int num_bits);
105
106 static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
107                                     struct inode *alloc_inode,
108                                     struct buffer_head *fe_bh,
109                                     struct buffer_head *bg_bh,
110                                     struct buffer_head *prev_bg_bh,
111                                     u16 chain);
112 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
113                                                      u32 wanted);
114 static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
115                                     struct inode *alloc_inode,
116                                     struct buffer_head *alloc_bh,
117                                     unsigned int start_bit,
118                                     u64 bg_blkno,
119                                     unsigned int count);
120 static inline u64 ocfs2_which_suballoc_group(u64 block,
121                                              unsigned int bit);
122 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
123                                                    u64 bg_blkno,
124                                                    u16 bg_bit_off);
125 static inline u64 ocfs2_which_cluster_group(struct inode *inode,
126                                             u32 cluster);
127 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
128                                                 u64 data_blkno,
129                                                 u64 *bg_blkno,
130                                                 u16 *bg_bit_off);
131
132 void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
133 {
134         if (ac->ac_inode)
135                 iput(ac->ac_inode);
136         if (ac->ac_bh)
137                 brelse(ac->ac_bh);
138         kfree(ac);
139 }
140
141 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
142 {
143         return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
144 }
145
146 static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
147                                   struct inode *alloc_inode,
148                                   struct buffer_head *bg_bh,
149                                   u64 group_blkno,
150                                   u16 my_chain,
151                                   struct ocfs2_chain_list *cl)
152 {
153         int status = 0;
154         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
155         struct super_block * sb = alloc_inode->i_sb;
156
157         mlog_entry_void();
158
159         if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
160                 ocfs2_error(alloc_inode->i_sb, "group block (%"MLFu64") "
161                             "!= b_blocknr (%llu)", group_blkno,
162                             (unsigned long long) bg_bh->b_blocknr);
163                 status = -EIO;
164                 goto bail;
165         }
166
167         status = ocfs2_journal_access(handle,
168                                       alloc_inode,
169                                       bg_bh,
170                                       OCFS2_JOURNAL_ACCESS_CREATE);
171         if (status < 0) {
172                 mlog_errno(status);
173                 goto bail;
174         }
175
176         memset(bg, 0, sb->s_blocksize);
177         strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
178         bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
179         bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
180         bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
181         bg->bg_chain = cpu_to_le16(my_chain);
182         bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
183         bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
184         bg->bg_blkno = cpu_to_le64(group_blkno);
185         /* set the 1st bit in the bitmap to account for the descriptor block */
186         ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
187         bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
188
189         status = ocfs2_journal_dirty(handle, bg_bh);
190         if (status < 0)
191                 mlog_errno(status);
192
193         /* There is no need to zero out or otherwise initialize the
194          * other blocks in a group - All valid FS metadata in a block
195          * group stores the superblock fs_generation value at
196          * allocation time. */
197
198 bail:
199         mlog_exit(status);
200         return status;
201 }
202
203 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
204 {
205         u16 curr, best;
206
207         best = curr = 0;
208         while (curr < le16_to_cpu(cl->cl_count)) {
209                 if (le32_to_cpu(cl->cl_recs[best].c_total) >
210                     le32_to_cpu(cl->cl_recs[curr].c_total))
211                         best = curr;
212                 curr++;
213         }
214         return best;
215 }
216
217 /*
218  * We expect the block group allocator to already be locked.
219  */
220 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
221                                    struct inode *alloc_inode,
222                                    struct buffer_head *bh)
223 {
224         int status, credits;
225         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
226         struct ocfs2_chain_list *cl;
227         struct ocfs2_alloc_context *ac = NULL;
228         struct ocfs2_journal_handle *handle = NULL;
229         u32 bit_off, num_bits;
230         u16 alloc_rec;
231         u64 bg_blkno;
232         struct buffer_head *bg_bh = NULL;
233         struct ocfs2_group_desc *bg;
234
235         BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
236
237         mlog_entry_void();
238
239         handle = ocfs2_alloc_handle(osb);
240         if (!handle) {
241                 status = -ENOMEM;
242                 mlog_errno(status);
243                 goto bail;
244         }
245
246         cl = &fe->id2.i_chain;
247         status = ocfs2_reserve_clusters(osb,
248                                         handle,
249                                         le16_to_cpu(cl->cl_cpg),
250                                         &ac);
251         if (status < 0) {
252                 if (status != -ENOSPC)
253                         mlog_errno(status);
254                 goto bail;
255         }
256
257         credits = ocfs2_calc_group_alloc_credits(osb->sb,
258                                                  le16_to_cpu(cl->cl_cpg));
259         handle = ocfs2_start_trans(osb, handle, credits);
260         if (IS_ERR(handle)) {
261                 status = PTR_ERR(handle);
262                 handle = NULL;
263                 mlog_errno(status);
264                 goto bail;
265         }
266
267         status = ocfs2_claim_clusters(osb,
268                                       handle,
269                                       ac,
270                                       le16_to_cpu(cl->cl_cpg),
271                                       &bit_off,
272                                       &num_bits);
273         if (status < 0) {
274                 if (status != -ENOSPC)
275                         mlog_errno(status);
276                 goto bail;
277         }
278
279         alloc_rec = ocfs2_find_smallest_chain(cl);
280
281         /* setup the group */
282         bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
283         mlog(0, "new descriptor, record %u, at block %"MLFu64"\n",
284              alloc_rec, bg_blkno);
285
286         bg_bh = sb_getblk(osb->sb, bg_blkno);
287         if (!bg_bh) {
288                 status = -EIO;
289                 mlog_errno(status);
290                 goto bail;
291         }
292         ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh);
293
294         status = ocfs2_block_group_fill(handle,
295                                         alloc_inode,
296                                         bg_bh,
297                                         bg_blkno,
298                                         alloc_rec,
299                                         cl);
300         if (status < 0) {
301                 mlog_errno(status);
302                 goto bail;
303         }
304
305         bg = (struct ocfs2_group_desc *) bg_bh->b_data;
306
307         status = ocfs2_journal_access(handle, alloc_inode,
308                                       bh, OCFS2_JOURNAL_ACCESS_WRITE);
309         if (status < 0) {
310                 mlog_errno(status);
311                 goto bail;
312         }
313
314         le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
315                      le16_to_cpu(bg->bg_free_bits_count));
316         le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
317         cl->cl_recs[alloc_rec].c_blkno  = cpu_to_le64(bg_blkno);
318         if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
319                 le16_add_cpu(&cl->cl_next_free_rec, 1);
320
321         le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
322                                         le16_to_cpu(bg->bg_free_bits_count));
323         le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
324         le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
325
326         status = ocfs2_journal_dirty(handle, bh);
327         if (status < 0) {
328                 mlog_errno(status);
329                 goto bail;
330         }
331
332         spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
333         OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
334         fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
335                                              le32_to_cpu(fe->i_clusters)));
336         spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
337         i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
338         alloc_inode->i_blocks =
339                 ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode));
340
341         status = 0;
342 bail:
343         if (handle)
344                 ocfs2_commit_trans(handle);
345
346         if (ac)
347                 ocfs2_free_alloc_context(ac);
348
349         if (bg_bh)
350                 brelse(bg_bh);
351
352         mlog_exit(status);
353         return status;
354 }
355
356 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
357                                        struct ocfs2_alloc_context *ac)
358 {
359         int status;
360         u32 bits_wanted = ac->ac_bits_wanted;
361         struct inode *alloc_inode = ac->ac_inode;
362         struct buffer_head *bh = NULL;
363         struct ocfs2_journal_handle *handle = ac->ac_handle;
364         struct ocfs2_dinode *fe;
365         u32 free_bits;
366
367         mlog_entry_void();
368
369         BUG_ON(handle->flags & OCFS2_HANDLE_STARTED);
370
371         ocfs2_handle_add_inode(handle, alloc_inode);
372         status = ocfs2_meta_lock(alloc_inode, handle, &bh, 1);
373         if (status < 0) {
374                 mlog_errno(status);
375                 goto bail;
376         }
377
378         fe = (struct ocfs2_dinode *) bh->b_data;
379         if (!OCFS2_IS_VALID_DINODE(fe)) {
380                 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
381                 status = -EIO;
382                 goto bail;
383         }
384         if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
385                 ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator "
386                             "# %"MLFu64, le64_to_cpu(fe->i_blkno));
387                 status = -EIO;
388                 goto bail;
389         }
390
391         free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
392                 le32_to_cpu(fe->id1.bitmap1.i_used);
393
394         if (bits_wanted > free_bits) {
395                 /* cluster bitmap never grows */
396                 if (ocfs2_is_cluster_bitmap(alloc_inode)) {
397                         mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
398                              bits_wanted, free_bits);
399                         status = -ENOSPC;
400                         goto bail;
401                 }
402
403                 status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
404                 if (status < 0) {
405                         if (status != -ENOSPC)
406                                 mlog_errno(status);
407                         goto bail;
408                 }
409                 atomic_inc(&osb->alloc_stats.bg_extends);
410
411                 /* You should never ask for this much metadata */
412                 BUG_ON(bits_wanted >
413                        (le32_to_cpu(fe->id1.bitmap1.i_total)
414                         - le32_to_cpu(fe->id1.bitmap1.i_used)));
415         }
416
417         get_bh(bh);
418         ac->ac_bh = bh;
419 bail:
420         if (bh)
421                 brelse(bh);
422
423         mlog_exit(status);
424         return status;
425 }
426
427 int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
428                                struct ocfs2_journal_handle *handle,
429                                struct ocfs2_dinode *fe,
430                                struct ocfs2_alloc_context **ac)
431 {
432         int status;
433         struct inode *alloc_inode = NULL;
434
435         *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
436         if (!(*ac)) {
437                 status = -ENOMEM;
438                 mlog_errno(status);
439                 goto bail;
440         }
441
442         (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
443         (*ac)->ac_handle = handle;
444         (*ac)->ac_which = OCFS2_AC_USE_META;
445
446 #ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
447         alloc_inode = ocfs2_get_system_file_inode(osb,
448                                                   EXTENT_ALLOC_SYSTEM_INODE,
449                                                   0);
450 #else
451         alloc_inode = ocfs2_get_system_file_inode(osb,
452                                                   EXTENT_ALLOC_SYSTEM_INODE,
453                                                   osb->slot_num);
454 #endif
455         if (!alloc_inode) {
456                 status = -ENOMEM;
457                 mlog_errno(status);
458                 goto bail;
459         }
460
461         (*ac)->ac_inode = igrab(alloc_inode);
462         (*ac)->ac_group_search = ocfs2_block_group_search;
463
464         status = ocfs2_reserve_suballoc_bits(osb, (*ac));
465         if (status < 0) {
466                 if (status != -ENOSPC)
467                         mlog_errno(status);
468                 goto bail;
469         }
470
471         status = 0;
472 bail:
473         if ((status < 0) && *ac) {
474                 ocfs2_free_alloc_context(*ac);
475                 *ac = NULL;
476         }
477
478         if (alloc_inode)
479                 iput(alloc_inode);
480
481         mlog_exit(status);
482         return status;
483 }
484
485 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
486                             struct ocfs2_journal_handle *handle,
487                             struct ocfs2_alloc_context **ac)
488 {
489         int status;
490         struct inode *alloc_inode = NULL;
491
492         *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
493         if (!(*ac)) {
494                 status = -ENOMEM;
495                 mlog_errno(status);
496                 goto bail;
497         }
498
499         (*ac)->ac_bits_wanted = 1;
500         (*ac)->ac_handle = handle;
501         (*ac)->ac_which = OCFS2_AC_USE_INODE;
502
503         alloc_inode = ocfs2_get_system_file_inode(osb,
504                                                   INODE_ALLOC_SYSTEM_INODE,
505                                                   osb->slot_num);
506         if (!alloc_inode) {
507                 status = -ENOMEM;
508                 mlog_errno(status);
509                 goto bail;
510         }
511
512         (*ac)->ac_inode = igrab(alloc_inode);
513         (*ac)->ac_group_search = ocfs2_block_group_search;
514
515         status = ocfs2_reserve_suballoc_bits(osb, *ac);
516         if (status < 0) {
517                 if (status != -ENOSPC)
518                         mlog_errno(status);
519                 goto bail;
520         }
521
522         status = 0;
523 bail:
524         if ((status < 0) && *ac) {
525                 ocfs2_free_alloc_context(*ac);
526                 *ac = NULL;
527         }
528
529         if (alloc_inode)
530                 iput(alloc_inode);
531
532         mlog_exit(status);
533         return status;
534 }
535
536 /* local alloc code has to do the same thing, so rather than do this
537  * twice.. */
538 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
539                                       struct ocfs2_alloc_context *ac)
540 {
541         int status;
542
543         ac->ac_inode = ocfs2_get_system_file_inode(osb,
544                                                    GLOBAL_BITMAP_SYSTEM_INODE,
545                                                    OCFS2_INVALID_SLOT);
546         if (!ac->ac_inode) {
547                 status = -EINVAL;
548                 mlog(ML_ERROR, "Could not get bitmap inode!\n");
549                 goto bail;
550         }
551         ac->ac_which = OCFS2_AC_USE_MAIN;
552         ac->ac_group_search = ocfs2_cluster_group_search;
553
554         status = ocfs2_reserve_suballoc_bits(osb, ac);
555         if (status < 0 && status != -ENOSPC)
556                 mlog_errno(status);
557 bail:
558         return status;
559 }
560
561 /* Callers don't need to care which bitmap (local alloc or main) to
562  * use so we figure it out for them, but unfortunately this clutters
563  * things a bit. */
564 int ocfs2_reserve_clusters(struct ocfs2_super *osb,
565                            struct ocfs2_journal_handle *handle,
566                            u32 bits_wanted,
567                            struct ocfs2_alloc_context **ac)
568 {
569         int status;
570
571         mlog_entry_void();
572
573         BUG_ON(!handle);
574
575         *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
576         if (!(*ac)) {
577                 status = -ENOMEM;
578                 mlog_errno(status);
579                 goto bail;
580         }
581
582         (*ac)->ac_bits_wanted = bits_wanted;
583         (*ac)->ac_handle = handle;
584
585         status = -ENOSPC;
586         if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
587                 status = ocfs2_reserve_local_alloc_bits(osb,
588                                                         handle,
589                                                         bits_wanted,
590                                                         *ac);
591                 if ((status < 0) && (status != -ENOSPC)) {
592                         mlog_errno(status);
593                         goto bail;
594                 } else if (status == -ENOSPC) {
595                         /* reserve_local_bits will return enospc with
596                          * the local alloc inode still locked, so we
597                          * can change this safely here. */
598                         mlog(0, "Disabling local alloc\n");
599                         /* We set to OCFS2_LA_DISABLED so that umount
600                          * can clean up what's left of the local
601                          * allocation */
602                         osb->local_alloc_state = OCFS2_LA_DISABLED;
603                 }
604         }
605
606         if (status == -ENOSPC) {
607                 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
608                 if (status < 0) {
609                         if (status != -ENOSPC)
610                                 mlog_errno(status);
611                         goto bail;
612                 }
613         }
614
615         status = 0;
616 bail:
617         if ((status < 0) && *ac) {
618                 ocfs2_free_alloc_context(*ac);
619                 *ac = NULL;
620         }
621
622         mlog_exit(status);
623         return status;
624 }
625
626 /*
627  * More or less lifted from ext3. I'll leave their description below:
628  *
629  * "For ext3 allocations, we must not reuse any blocks which are
630  * allocated in the bitmap buffer's "last committed data" copy.  This
631  * prevents deletes from freeing up the page for reuse until we have
632  * committed the delete transaction.
633  *
634  * If we didn't do this, then deleting something and reallocating it as
635  * data would allow the old block to be overwritten before the
636  * transaction committed (because we force data to disk before commit).
637  * This would lead to corruption if we crashed between overwriting the
638  * data and committing the delete.
639  *
640  * @@@ We may want to make this allocation behaviour conditional on
641  * data-writes at some point, and disable it for metadata allocations or
642  * sync-data inodes."
643  *
644  * Note: OCFS2 already does this differently for metadata vs data
645  * allocations, as those bitmaps are seperate and undo access is never
646  * called on a metadata group descriptor.
647  */
648 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
649                                          int nr)
650 {
651         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
652
653         if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
654                 return 0;
655         if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data)
656                 return 1;
657
658         bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
659         return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
660 }
661
662 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
663                                              struct buffer_head *bg_bh,
664                                              unsigned int bits_wanted,
665                                              u16 *bit_off,
666                                              u16 *bits_found)
667 {
668         void *bitmap;
669         u16 best_offset, best_size;
670         int offset, start, found, status = 0;
671         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
672
673         if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
674                 OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg);
675                 return -EIO;
676         }
677
678         found = start = best_offset = best_size = 0;
679         bitmap = bg->bg_bitmap;
680
681         while((offset = ocfs2_find_next_zero_bit(bitmap,
682                                                  le16_to_cpu(bg->bg_bits),
683                                                  start)) != -1) {
684                 if (offset == le16_to_cpu(bg->bg_bits))
685                         break;
686
687                 if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
688                         /* We found a zero, but we can't use it as it
689                          * hasn't been put to disk yet! */
690                         found = 0;
691                         start = offset + 1;
692                 } else if (offset == start) {
693                         /* we found a zero */
694                         found++;
695                         /* move start to the next bit to test */
696                         start++;
697                 } else {
698                         /* got a zero after some ones */
699                         found = 1;
700                         start = offset + 1;
701                 }
702                 if (found > best_size) {
703                         best_size = found;
704                         best_offset = start - found;
705                 }
706                 /* we got everything we needed */
707                 if (found == bits_wanted) {
708                         /* mlog(0, "Found it all!\n"); */
709                         break;
710                 }
711         }
712
713         /* XXX: I think the first clause is equivalent to the second
714          *      - jlbec */
715         if (found == bits_wanted) {
716                 *bit_off = start - found;
717                 *bits_found = found;
718         } else if (best_size) {
719                 *bit_off = best_offset;
720                 *bits_found = best_size;
721         } else {
722                 status = -ENOSPC;
723                 /* No error log here -- see the comment above
724                  * ocfs2_test_bg_bit_allocatable */
725         }
726
727         return status;
728 }
729
730 static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
731                                              struct inode *alloc_inode,
732                                              struct ocfs2_group_desc *bg,
733                                              struct buffer_head *group_bh,
734                                              unsigned int bit_off,
735                                              unsigned int num_bits)
736 {
737         int status;
738         void *bitmap = bg->bg_bitmap;
739         int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
740
741         mlog_entry_void();
742
743         if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
744                 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
745                 status = -EIO;
746                 goto bail;
747         }
748         BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
749
750         mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
751              num_bits);
752
753         if (ocfs2_is_cluster_bitmap(alloc_inode))
754                 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
755
756         status = ocfs2_journal_access(handle,
757                                       alloc_inode,
758                                       group_bh,
759                                       journal_type);
760         if (status < 0) {
761                 mlog_errno(status);
762                 goto bail;
763         }
764
765         le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
766
767         while(num_bits--)
768                 ocfs2_set_bit(bit_off++, bitmap);
769
770         status = ocfs2_journal_dirty(handle,
771                                      group_bh);
772         if (status < 0) {
773                 mlog_errno(status);
774                 goto bail;
775         }
776
777 bail:
778         mlog_exit(status);
779         return status;
780 }
781
782 /* find the one with the most empty bits */
783 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
784 {
785         u16 curr, best;
786
787         BUG_ON(!cl->cl_next_free_rec);
788
789         best = curr = 0;
790         while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
791                 if (le32_to_cpu(cl->cl_recs[curr].c_free) >
792                     le32_to_cpu(cl->cl_recs[best].c_free))
793                         best = curr;
794                 curr++;
795         }
796
797         BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
798         return best;
799 }
800
801 static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
802                                     struct inode *alloc_inode,
803                                     struct buffer_head *fe_bh,
804                                     struct buffer_head *bg_bh,
805                                     struct buffer_head *prev_bg_bh,
806                                     u16 chain)
807 {
808         int status;
809         /* there is a really tiny chance the journal calls could fail,
810          * but we wouldn't want inconsistent blocks in *any* case. */
811         u64 fe_ptr, bg_ptr, prev_bg_ptr;
812         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
813         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
814         struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
815
816         if (!OCFS2_IS_VALID_DINODE(fe)) {
817                 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
818                 status = -EIO;
819                 goto out;
820         }
821         if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
822                 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
823                 status = -EIO;
824                 goto out;
825         }
826         if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) {
827                 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg);
828                 status = -EIO;
829                 goto out;
830         }
831
832         mlog(0, "In suballoc %"MLFu64", chain %u, move group %"MLFu64" to "
833              "top, prev = %"MLFu64"\n",
834              fe->i_blkno, chain, bg->bg_blkno, prev_bg->bg_blkno);
835
836         fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
837         bg_ptr = le64_to_cpu(bg->bg_next_group);
838         prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
839
840         status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh,
841                                       OCFS2_JOURNAL_ACCESS_WRITE);
842         if (status < 0) {
843                 mlog_errno(status);
844                 goto out_rollback;
845         }
846
847         prev_bg->bg_next_group = bg->bg_next_group;
848
849         status = ocfs2_journal_dirty(handle, prev_bg_bh);
850         if (status < 0) {
851                 mlog_errno(status);
852                 goto out_rollback;
853         }
854
855         status = ocfs2_journal_access(handle, alloc_inode, bg_bh,
856                                       OCFS2_JOURNAL_ACCESS_WRITE);
857         if (status < 0) {
858                 mlog_errno(status);
859                 goto out_rollback;
860         }
861
862         bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
863
864         status = ocfs2_journal_dirty(handle, bg_bh);
865         if (status < 0) {
866                 mlog_errno(status);
867                 goto out_rollback;
868         }
869
870         status = ocfs2_journal_access(handle, alloc_inode, fe_bh,
871                                       OCFS2_JOURNAL_ACCESS_WRITE);
872         if (status < 0) {
873                 mlog_errno(status);
874                 goto out_rollback;
875         }
876
877         fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
878
879         status = ocfs2_journal_dirty(handle, fe_bh);
880         if (status < 0) {
881                 mlog_errno(status);
882                 goto out_rollback;
883         }
884
885         status = 0;
886 out_rollback:
887         if (status < 0) {
888                 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
889                 bg->bg_next_group = cpu_to_le64(bg_ptr);
890                 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
891         }
892 out:
893         mlog_exit(status);
894         return status;
895 }
896
897 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
898                                                      u32 wanted)
899 {
900         return le16_to_cpu(bg->bg_free_bits_count) > wanted;
901 }
902
903 /* return 0 on success, -ENOSPC to keep searching and any other < 0
904  * value on error. */
905 static int ocfs2_cluster_group_search(struct inode *inode,
906                                       struct buffer_head *group_bh,
907                                       u32 bits_wanted, u32 min_bits,
908                                       u16 *bit_off, u16 *bits_found)
909 {
910         int search = -ENOSPC;
911         int ret;
912         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
913         u16 tmp_off, tmp_found;
914
915         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
916
917         if (bg->bg_free_bits_count) {
918                 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
919                                                         group_bh, bits_wanted,
920                                                         &tmp_off, &tmp_found);
921                 if (ret)
922                         return ret;
923
924                 /* ocfs2_block_group_find_clear_bits() might
925                  * return success, but we still want to return
926                  * -ENOSPC unless it found the minimum number
927                  * of bits. */
928                 if (min_bits <= tmp_found) {
929                         *bit_off = tmp_off;
930                         *bits_found = tmp_found;
931                         search = 0; /* success */
932                 }
933         }
934
935         return search;
936 }
937
938 static int ocfs2_block_group_search(struct inode *inode,
939                                     struct buffer_head *group_bh,
940                                     u32 bits_wanted, u32 min_bits,
941                                     u16 *bit_off, u16 *bits_found)
942 {
943         int ret = -ENOSPC;
944         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
945
946         BUG_ON(min_bits != 1);
947         BUG_ON(ocfs2_is_cluster_bitmap(inode));
948
949         if (bg->bg_free_bits_count)
950                 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
951                                                         group_bh, bits_wanted,
952                                                         bit_off, bits_found);
953
954         return ret;
955 }
956
957 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
958                               u32 bits_wanted,
959                               u32 min_bits,
960                               u16 *bit_off,
961                               unsigned int *num_bits,
962                               u64 *bg_blkno)
963 {
964         int status;
965         u16 chain, tmp_bits;
966         u32 tmp_used;
967         u64 next_group;
968         struct ocfs2_journal_handle *handle = ac->ac_handle;
969         struct inode *alloc_inode = ac->ac_inode;
970         struct buffer_head *group_bh = NULL;
971         struct buffer_head *prev_group_bh = NULL;
972         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
973         struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
974         struct ocfs2_group_desc *bg;
975
976         chain = ac->ac_chain;
977         mlog(0, "trying to alloc %u bits from chain %u, inode %"MLFu64"\n",
978              bits_wanted, chain, OCFS2_I(alloc_inode)->ip_blkno);
979
980         status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
981                                   le64_to_cpu(cl->cl_recs[chain].c_blkno),
982                                   &group_bh, OCFS2_BH_CACHED, alloc_inode);
983         if (status < 0) {
984                 mlog_errno(status);
985                 goto bail;
986         }
987         bg = (struct ocfs2_group_desc *) group_bh->b_data;
988         if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
989                 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
990                 status = -EIO;
991                 goto bail;
992         }
993
994         status = -ENOSPC;
995         /* for now, the chain search is a bit simplistic. We just use
996          * the 1st group with any empty bits. */
997         while ((status = ac->ac_group_search(alloc_inode, group_bh,
998                                              bits_wanted, min_bits, bit_off,
999                                              &tmp_bits)) == -ENOSPC) {
1000                 if (!bg->bg_next_group)
1001                         break;
1002
1003                 if (prev_group_bh) {
1004                         brelse(prev_group_bh);
1005                         prev_group_bh = NULL;
1006                 }
1007                 next_group = le64_to_cpu(bg->bg_next_group);
1008                 prev_group_bh = group_bh;
1009                 group_bh = NULL;
1010                 status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
1011                                           next_group, &group_bh,
1012                                           OCFS2_BH_CACHED, alloc_inode);
1013                 if (status < 0) {
1014                         mlog_errno(status);
1015                         goto bail;
1016                 }
1017                 bg = (struct ocfs2_group_desc *) group_bh->b_data;
1018                 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
1019                         OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
1020                         status = -EIO;
1021                         goto bail;
1022                 }
1023         }
1024         if (status < 0) {
1025                 if (status != -ENOSPC)
1026                         mlog_errno(status);
1027                 goto bail;
1028         }
1029
1030         mlog(0, "alloc succeeds: we give %u bits from block group %"MLFu64"\n",
1031              tmp_bits, bg->bg_blkno);
1032
1033         *num_bits = tmp_bits;
1034
1035         BUG_ON(*num_bits == 0);
1036
1037         /*
1038          * Keep track of previous block descriptor read. When
1039          * we find a target, if we have read more than X
1040          * number of descriptors, and the target is reasonably
1041          * empty, relink him to top of his chain.
1042          *
1043          * We've read 0 extra blocks and only send one more to
1044          * the transaction, yet the next guy to search has a
1045          * much easier time.
1046          *
1047          * Do this *after* figuring out how many bits we're taking out
1048          * of our target group.
1049          */
1050         if (ac->ac_allow_chain_relink &&
1051             (prev_group_bh) &&
1052             (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
1053                 status = ocfs2_relink_block_group(handle, alloc_inode,
1054                                                   ac->ac_bh, group_bh,
1055                                                   prev_group_bh, chain);
1056                 if (status < 0) {
1057                         mlog_errno(status);
1058                         goto bail;
1059                 }
1060         }
1061
1062         /* Ok, claim our bits now: set the info on dinode, chainlist
1063          * and then the group */
1064         status = ocfs2_journal_access(handle,
1065                                       alloc_inode,
1066                                       ac->ac_bh,
1067                                       OCFS2_JOURNAL_ACCESS_WRITE);
1068         if (status < 0) {
1069                 mlog_errno(status);
1070                 goto bail;
1071         }
1072
1073         tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1074         fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
1075         le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
1076
1077         status = ocfs2_journal_dirty(handle,
1078                                      ac->ac_bh);
1079         if (status < 0) {
1080                 mlog_errno(status);
1081                 goto bail;
1082         }
1083
1084         status = ocfs2_block_group_set_bits(handle,
1085                                             alloc_inode,
1086                                             bg,
1087                                             group_bh,
1088                                             *bit_off,
1089                                             *num_bits);
1090         if (status < 0) {
1091                 mlog_errno(status);
1092                 goto bail;
1093         }
1094
1095         mlog(0, "Allocated %u bits from suballocator %"MLFu64"\n",
1096              *num_bits, fe->i_blkno);
1097
1098         *bg_blkno = le64_to_cpu(bg->bg_blkno);
1099 bail:
1100         if (group_bh)
1101                 brelse(group_bh);
1102         if (prev_group_bh)
1103                 brelse(prev_group_bh);
1104
1105         mlog_exit(status);
1106         return status;
1107 }
1108
1109 /* will give out up to bits_wanted contiguous bits. */
1110 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1111                                      struct ocfs2_alloc_context *ac,
1112                                      u32 bits_wanted,
1113                                      u32 min_bits,
1114                                      u16 *bit_off,
1115                                      unsigned int *num_bits,
1116                                      u64 *bg_blkno)
1117 {
1118         int status;
1119         u16 victim, i;
1120         struct ocfs2_chain_list *cl;
1121         struct ocfs2_dinode *fe;
1122
1123         mlog_entry_void();
1124
1125         BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1126         BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1127         BUG_ON(!ac->ac_bh);
1128
1129         fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1130         if (!OCFS2_IS_VALID_DINODE(fe)) {
1131                 OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe);
1132                 status = -EIO;
1133                 goto bail;
1134         }
1135         if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1136             le32_to_cpu(fe->id1.bitmap1.i_total)) {
1137                 ocfs2_error(osb->sb, "Chain allocator dinode %"MLFu64" has %u"
1138                             "used bits but only %u total.",
1139                             le64_to_cpu(fe->i_blkno),
1140                             le32_to_cpu(fe->id1.bitmap1.i_used),
1141                             le32_to_cpu(fe->id1.bitmap1.i_total));
1142                 status = -EIO;
1143                 goto bail;
1144         }
1145
1146         cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1147
1148         victim = ocfs2_find_victim_chain(cl);
1149         ac->ac_chain = victim;
1150         ac->ac_allow_chain_relink = 1;
1151
1152         status = ocfs2_search_chain(ac, bits_wanted, min_bits, bit_off,
1153                                     num_bits, bg_blkno);
1154         if (!status)
1155                 goto bail;
1156         if (status < 0 && status != -ENOSPC) {
1157                 mlog_errno(status);
1158                 goto bail;
1159         }
1160
1161         mlog(0, "Search of victim chain %u came up with nothing, "
1162              "trying all chains now.\n", victim);
1163
1164         /* If we didn't pick a good victim, then just default to
1165          * searching each chain in order. Don't allow chain relinking
1166          * because we only calculate enough journal credits for one
1167          * relink per alloc. */
1168         ac->ac_allow_chain_relink = 0;
1169         for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1170                 if (i == victim)
1171                         continue;
1172                 if (!cl->cl_recs[i].c_free)
1173                         continue;
1174
1175                 ac->ac_chain = i;
1176                 status = ocfs2_search_chain(ac, bits_wanted, min_bits,
1177                                             bit_off, num_bits,
1178                                             bg_blkno);
1179                 if (!status)
1180                         break;
1181                 if (status < 0 && status != -ENOSPC) {
1182                         mlog_errno(status);
1183                         goto bail;
1184                 }
1185         }
1186 bail:
1187
1188         mlog_exit(status);
1189         return status;
1190 }
1191
1192 int ocfs2_claim_metadata(struct ocfs2_super *osb,
1193                          struct ocfs2_journal_handle *handle,
1194                          struct ocfs2_alloc_context *ac,
1195                          u32 bits_wanted,
1196                          u16 *suballoc_bit_start,
1197                          unsigned int *num_bits,
1198                          u64 *blkno_start)
1199 {
1200         int status;
1201         u64 bg_blkno;
1202
1203         BUG_ON(!ac);
1204         BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1205         BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1206         BUG_ON(ac->ac_handle != handle);
1207
1208         status = ocfs2_claim_suballoc_bits(osb,
1209                                            ac,
1210                                            bits_wanted,
1211                                            1,
1212                                            suballoc_bit_start,
1213                                            num_bits,
1214                                            &bg_blkno);
1215         if (status < 0) {
1216                 mlog_errno(status);
1217                 goto bail;
1218         }
1219         atomic_inc(&osb->alloc_stats.bg_allocs);
1220
1221         *blkno_start = bg_blkno + (u64) *suballoc_bit_start;
1222         ac->ac_bits_given += (*num_bits);
1223         status = 0;
1224 bail:
1225         mlog_exit(status);
1226         return status;
1227 }
1228
1229 int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1230                           struct ocfs2_journal_handle *handle,
1231                           struct ocfs2_alloc_context *ac,
1232                           u16 *suballoc_bit,
1233                           u64 *fe_blkno)
1234 {
1235         int status;
1236         unsigned int num_bits;
1237         u64 bg_blkno;
1238
1239         mlog_entry_void();
1240
1241         BUG_ON(!ac);
1242         BUG_ON(ac->ac_bits_given != 0);
1243         BUG_ON(ac->ac_bits_wanted != 1);
1244         BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
1245         BUG_ON(ac->ac_handle != handle);
1246
1247         status = ocfs2_claim_suballoc_bits(osb,
1248                                            ac,
1249                                            1,
1250                                            1,
1251                                            suballoc_bit,
1252                                            &num_bits,
1253                                            &bg_blkno);
1254         if (status < 0) {
1255                 mlog_errno(status);
1256                 goto bail;
1257         }
1258         atomic_inc(&osb->alloc_stats.bg_allocs);
1259
1260         BUG_ON(num_bits != 1);
1261
1262         *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
1263         ac->ac_bits_given++;
1264         status = 0;
1265 bail:
1266         mlog_exit(status);
1267         return status;
1268 }
1269
1270 /* translate a group desc. blkno and it's bitmap offset into
1271  * disk cluster offset. */
1272 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
1273                                                    u64 bg_blkno,
1274                                                    u16 bg_bit_off)
1275 {
1276         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1277         u32 cluster = 0;
1278
1279         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1280
1281         if (bg_blkno != osb->first_cluster_group_blkno)
1282                 cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
1283         cluster += (u32) bg_bit_off;
1284         return cluster;
1285 }
1286
1287 /* given a cluster offset, calculate which block group it belongs to
1288  * and return that block offset. */
1289 static inline u64 ocfs2_which_cluster_group(struct inode *inode,
1290                                             u32 cluster)
1291 {
1292         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1293         u32 group_no;
1294
1295         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1296
1297         group_no = cluster / osb->bitmap_cpg;
1298         if (!group_no)
1299                 return osb->first_cluster_group_blkno;
1300         return ocfs2_clusters_to_blocks(inode->i_sb,
1301                                         group_no * osb->bitmap_cpg);
1302 }
1303
1304 /* given the block number of a cluster start, calculate which cluster
1305  * group and descriptor bitmap offset that corresponds to. */
1306 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1307                                                 u64 data_blkno,
1308                                                 u64 *bg_blkno,
1309                                                 u16 *bg_bit_off)
1310 {
1311         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1312         u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
1313
1314         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1315
1316         *bg_blkno = ocfs2_which_cluster_group(inode,
1317                                               data_cluster);
1318
1319         if (*bg_blkno == osb->first_cluster_group_blkno)
1320                 *bg_bit_off = (u16) data_cluster;
1321         else
1322                 *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
1323                                                              data_blkno - *bg_blkno);
1324 }
1325
1326 /*
1327  * min_bits - minimum contiguous chunk from this total allocation we
1328  * can handle. set to what we asked for originally for a full
1329  * contig. allocation, set to '1' to indicate we can deal with extents
1330  * of any size.
1331  */
1332 int ocfs2_claim_clusters(struct ocfs2_super *osb,
1333                          struct ocfs2_journal_handle *handle,
1334                          struct ocfs2_alloc_context *ac,
1335                          u32 min_clusters,
1336                          u32 *cluster_start,
1337                          u32 *num_clusters)
1338 {
1339         int status;
1340         unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1341         u64 bg_blkno;
1342         u16 bg_bit_off;
1343
1344         mlog_entry_void();
1345
1346         BUG_ON(!ac);
1347         BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1348
1349         BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
1350                && ac->ac_which != OCFS2_AC_USE_MAIN);
1351         BUG_ON(ac->ac_handle != handle);
1352
1353         if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
1354                 status = ocfs2_claim_local_alloc_bits(osb,
1355                                                       handle,
1356                                                       ac,
1357                                                       bits_wanted,
1358                                                       cluster_start,
1359                                                       num_clusters);
1360                 if (!status)
1361                         atomic_inc(&osb->alloc_stats.local_data);
1362         } else {
1363                 if (min_clusters > (osb->bitmap_cpg - 1)) {
1364                         /* The only paths asking for contiguousness
1365                          * should know about this already. */
1366                         mlog(ML_ERROR, "minimum allocation requested exceeds "
1367                                        "group bitmap size!");
1368                         status = -ENOSPC;
1369                         goto bail;
1370                 }
1371                 /* clamp the current request down to a realistic size. */
1372                 if (bits_wanted > (osb->bitmap_cpg - 1))
1373                         bits_wanted = osb->bitmap_cpg - 1;
1374
1375                 status = ocfs2_claim_suballoc_bits(osb,
1376                                                    ac,
1377                                                    bits_wanted,
1378                                                    min_clusters,
1379                                                    &bg_bit_off,
1380                                                    num_clusters,
1381                                                    &bg_blkno);
1382                 if (!status) {
1383                         *cluster_start =
1384                                 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1385                                                                  bg_blkno,
1386                                                                  bg_bit_off);
1387                         atomic_inc(&osb->alloc_stats.bitmap_data);
1388                 }
1389         }
1390         if (status < 0) {
1391                 if (status != -ENOSPC)
1392                         mlog_errno(status);
1393                 goto bail;
1394         }
1395
1396         ac->ac_bits_given += *num_clusters;
1397
1398 bail:
1399         mlog_exit(status);
1400         return status;
1401 }
1402
1403 static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
1404                                                struct inode *alloc_inode,
1405                                                struct ocfs2_group_desc *bg,
1406                                                struct buffer_head *group_bh,
1407                                                unsigned int bit_off,
1408                                                unsigned int num_bits)
1409 {
1410         int status;
1411         unsigned int tmp;
1412         int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1413         struct ocfs2_group_desc *undo_bg = NULL;
1414
1415         mlog_entry_void();
1416
1417         if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
1418                 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
1419                 status = -EIO;
1420                 goto bail;
1421         }
1422
1423         mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1424
1425         if (ocfs2_is_cluster_bitmap(alloc_inode))
1426                 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1427
1428         status = ocfs2_journal_access(handle, alloc_inode, group_bh,
1429                                       journal_type);
1430         if (status < 0) {
1431                 mlog_errno(status);
1432                 goto bail;
1433         }
1434
1435         if (ocfs2_is_cluster_bitmap(alloc_inode))
1436                 undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data;
1437
1438         tmp = num_bits;
1439         while(tmp--) {
1440                 ocfs2_clear_bit((bit_off + tmp),
1441                                 (unsigned long *) bg->bg_bitmap);
1442                 if (ocfs2_is_cluster_bitmap(alloc_inode))
1443                         ocfs2_set_bit(bit_off + tmp,
1444                                       (unsigned long *) undo_bg->bg_bitmap);
1445         }
1446         le16_add_cpu(&bg->bg_free_bits_count, num_bits);
1447
1448         status = ocfs2_journal_dirty(handle, group_bh);
1449         if (status < 0)
1450                 mlog_errno(status);
1451 bail:
1452         return status;
1453 }
1454
1455 /*
1456  * expects the suballoc inode to already be locked.
1457  */
1458 static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
1459                                     struct inode *alloc_inode,
1460                                     struct buffer_head *alloc_bh,
1461                                     unsigned int start_bit,
1462                                     u64 bg_blkno,
1463                                     unsigned int count)
1464 {
1465         int status = 0;
1466         u32 tmp_used;
1467         struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
1468         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
1469         struct ocfs2_chain_list *cl = &fe->id2.i_chain;
1470         struct buffer_head *group_bh = NULL;
1471         struct ocfs2_group_desc *group;
1472
1473         mlog_entry_void();
1474
1475         if (!OCFS2_IS_VALID_DINODE(fe)) {
1476                 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
1477                 status = -EIO;
1478                 goto bail;
1479         }
1480         BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
1481
1482         mlog(0, "suballocator %"MLFu64": freeing %u bits from group %"MLFu64
1483              ", starting at %u\n",
1484              OCFS2_I(alloc_inode)->ip_blkno, count, bg_blkno,
1485              start_bit);
1486
1487         status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED,
1488                                   alloc_inode);
1489         if (status < 0) {
1490                 mlog_errno(status);
1491                 goto bail;
1492         }
1493
1494         group = (struct ocfs2_group_desc *) group_bh->b_data;
1495         if (!OCFS2_IS_VALID_GROUP_DESC(group)) {
1496                 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, group);
1497                 status = -EIO;
1498                 goto bail;
1499         }
1500         BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
1501
1502         status = ocfs2_block_group_clear_bits(handle, alloc_inode,
1503                                               group, group_bh,
1504                                               start_bit, count);
1505         if (status < 0) {
1506                 mlog_errno(status);
1507                 goto bail;
1508         }
1509
1510         status = ocfs2_journal_access(handle, alloc_inode, alloc_bh,
1511                                       OCFS2_JOURNAL_ACCESS_WRITE);
1512         if (status < 0) {
1513                 mlog_errno(status);
1514                 goto bail;
1515         }
1516
1517         le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
1518                      count);
1519         tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1520         fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
1521
1522         status = ocfs2_journal_dirty(handle, alloc_bh);
1523         if (status < 0) {
1524                 mlog_errno(status);
1525                 goto bail;
1526         }
1527
1528 bail:
1529         if (group_bh)
1530                 brelse(group_bh);
1531
1532         mlog_exit(status);
1533         return status;
1534 }
1535
1536 static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
1537 {
1538         u64 group = block - (u64) bit;
1539
1540         return group;
1541 }
1542
1543 int ocfs2_free_dinode(struct ocfs2_journal_handle *handle,
1544                       struct inode *inode_alloc_inode,
1545                       struct buffer_head *inode_alloc_bh,
1546                       struct ocfs2_dinode *di)
1547 {
1548         u64 blk = le64_to_cpu(di->i_blkno);
1549         u16 bit = le16_to_cpu(di->i_suballoc_bit);
1550         u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1551
1552         return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
1553                                         inode_alloc_bh, bit, bg_blkno, 1);
1554 }
1555
1556 int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle,
1557                             struct inode *eb_alloc_inode,
1558                             struct buffer_head *eb_alloc_bh,
1559                             struct ocfs2_extent_block *eb)
1560 {
1561         u64 blk = le64_to_cpu(eb->h_blkno);
1562         u16 bit = le16_to_cpu(eb->h_suballoc_bit);
1563         u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1564
1565         return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh,
1566                                         bit, bg_blkno, 1);
1567 }
1568
1569 int ocfs2_free_clusters(struct ocfs2_journal_handle *handle,
1570                        struct inode *bitmap_inode,
1571                        struct buffer_head *bitmap_bh,
1572                        u64 start_blk,
1573                        unsigned int num_clusters)
1574 {
1575         int status;
1576         u16 bg_start_bit;
1577         u64 bg_blkno;
1578         struct ocfs2_dinode *fe;
1579
1580         /* You can't ever have a contiguous set of clusters
1581          * bigger than a block group bitmap so we never have to worry
1582          * about looping on them. */
1583
1584         mlog_entry_void();
1585
1586         /* This is expensive. We can safely remove once this stuff has
1587          * gotten tested really well. */
1588         BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
1589
1590         fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
1591
1592         ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
1593                                      &bg_start_bit);
1594
1595         mlog(0, "want to free %u clusters starting at block %"MLFu64"\n",
1596              num_clusters, start_blk);
1597         mlog(0, "bg_blkno = %"MLFu64", bg_start_bit = %u\n",
1598              bg_blkno, bg_start_bit);
1599
1600         status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
1601                                           bg_start_bit, bg_blkno,
1602                                           num_clusters);
1603         if (status < 0)
1604                 mlog_errno(status);
1605
1606         mlog_exit(status);
1607         return status;
1608 }
1609
1610 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
1611 {
1612         printk("Block Group:\n");
1613         printk("bg_signature:       %s\n", bg->bg_signature);
1614         printk("bg_size:            %u\n", bg->bg_size);
1615         printk("bg_bits:            %u\n", bg->bg_bits);
1616         printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
1617         printk("bg_chain:           %u\n", bg->bg_chain);
1618         printk("bg_generation:      %u\n", le32_to_cpu(bg->bg_generation));
1619         printk("bg_next_group:      %"MLFu64"\n", bg->bg_next_group);
1620         printk("bg_parent_dinode:   %"MLFu64"\n", bg->bg_parent_dinode);
1621         printk("bg_blkno:           %"MLFu64"\n", bg->bg_blkno);
1622 }
1623
1624 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
1625 {
1626         int i;
1627
1628         printk("Suballoc Inode %"MLFu64":\n", fe->i_blkno);
1629         printk("i_signature:                  %s\n", fe->i_signature);
1630         printk("i_size:                       %"MLFu64"\n", fe->i_size);
1631         printk("i_clusters:                   %u\n", fe->i_clusters);
1632         printk("i_generation:                 %u\n",
1633                le32_to_cpu(fe->i_generation));
1634         printk("id1.bitmap1.i_used:           %u\n",
1635                le32_to_cpu(fe->id1.bitmap1.i_used));
1636         printk("id1.bitmap1.i_total:          %u\n",
1637                le32_to_cpu(fe->id1.bitmap1.i_total));
1638         printk("id2.i_chain.cl_cpg:           %u\n", fe->id2.i_chain.cl_cpg);
1639         printk("id2.i_chain.cl_bpc:           %u\n", fe->id2.i_chain.cl_bpc);
1640         printk("id2.i_chain.cl_count:         %u\n", fe->id2.i_chain.cl_count);
1641         printk("id2.i_chain.cl_next_free_rec: %u\n",
1642                fe->id2.i_chain.cl_next_free_rec);
1643         for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
1644                 printk("fe->id2.i_chain.cl_recs[%d].c_free:  %u\n", i,
1645                        fe->id2.i_chain.cl_recs[i].c_free);
1646                 printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
1647                        fe->id2.i_chain.cl_recs[i].c_total);
1648                 printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %"MLFu64"\n", i,
1649                        fe->id2.i_chain.cl_recs[i].c_blkno);
1650         }
1651 }