Prevent data corruption in logfs_rewrite_block()
[linux-2.6.git] / fs / logfs / readwrite.c
1 /*
2  * fs/logfs/readwrite.c
3  *
4  * As should be obvious for Linux kernel code, license is GPLv2
5  *
6  * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7  *
8  *
9  * Actually contains five sets of very similar functions:
10  * read         read blocks from a file
11  * seek_hole    find next hole
12  * seek_data    find next data block
13  * valid        check whether a block still belongs to a file
14  * write        write blocks to a file
15  * delete       delete a block (for directories and ifile)
16  * rewrite      move existing blocks of a file to a new location (gc helper)
17  * truncate     truncate a file
18  */
19 #include "logfs.h"
20 #include <linux/sched.h>
21
/*
 * adjust_bix - clamp a block index to the first valid index for a level
 *
 * Packing a (bix, level) pair into a page index discards the low bits of
 * the block index, so an unpacked bix can fall below the first block
 * actually reachable through indirection at @level.  Clamp it to the
 * per-level minimum (I0_BLOCKS, I1_BLOCKS, ...).
 */
static u64 adjust_bix(u64 bix, level_t level)
{
	switch (level) {
	case 0:
		return bix;
	case LEVEL(1):
		return max_t(u64, bix, I0_BLOCKS);
	case LEVEL(2):
		return max_t(u64, bix, I1_BLOCKS);
	case LEVEL(3):
		return max_t(u64, bix, I2_BLOCKS);
	case LEVEL(4):
		return max_t(u64, bix, I3_BLOCKS);
	case LEVEL(5):
		return max_t(u64, bix, I4_BLOCKS);
	default:
		WARN_ON(1);
		return bix;
	}
}
42
/* Smallest block index that lies beyond a tree of the given height. */
static inline u64 maxbix(u8 height)
{
	return 1ULL << (LOGFS_BLOCK_BITS * height);
}
47
48 /**
49  * The inode address space is cut in two halves.  Lower half belongs to data
50  * pages, upper half to indirect blocks.  If the high bit (INDIRECT_BIT) is
51  * set, the actual block index (bix) and level can be derived from the page
52  * index.
53  *
54  * The lowest three bits of the block index are set to 0 after packing and
55  * unpacking.  Since the lowest n bits (9 for 4KiB blocksize) are ignored
56  * anyway this is harmless.
57  */
58 #define ARCH_SHIFT      (BITS_PER_LONG - 32)
59 #define INDIRECT_BIT    (0x80000000UL << ARCH_SHIFT)
60 #define LEVEL_SHIFT     (28 + ARCH_SHIFT)
/* Page index of the first indirect block: INDIRECT_BIT set, level 1, bix 0. */
static inline pgoff_t first_indirect_block(void)
{
	return INDIRECT_BIT | (1ULL << LEVEL_SHIFT);
}
65
/*
 * logfs_pack_index - encode a (bix, level) pair into one page cache index
 *
 * Level-0 data pages use the block index verbatim.  Indirect blocks set
 * INDIRECT_BIT, store the level below it and shift the block index down
 * by LOGFS_BLOCK_BITS per level (see the comment above ARCH_SHIFT).
 */
pgoff_t logfs_pack_index(u64 bix, level_t level)
{
	pgoff_t index;

	BUG_ON(bix >= INDIRECT_BIT);
	if (level == 0)
		return bix;

	index  = INDIRECT_BIT;
	index |= (__force long)level << LEVEL_SHIFT;
	index |= bix >> ((__force u8)level * LOGFS_BLOCK_BITS);
	return index;
}
79
/*
 * logfs_unpack_index - decode a page cache index back into (bix, level)
 *
 * Inverse of logfs_pack_index.  Packing dropped the low bits of the
 * block index, so the result is clamped via adjust_bix.
 */
void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level)
{
	u8 __level;

	if (!(index & INDIRECT_BIT)) {
		/* Data page: the index is the block index, level 0. */
		*bix = index;
		*level = 0;
		return;
	}

	__level = (index & ~INDIRECT_BIT) >> LEVEL_SHIFT;
	*level = LEVEL(__level);
	*bix = (index << (__level * LOGFS_BLOCK_BITS)) & ~INDIRECT_BIT;
	*bix = adjust_bix(*bix, *level);
	return;
}
96 #undef ARCH_SHIFT
97 #undef INDIRECT_BIT
98 #undef LEVEL_SHIFT
99
100 /*
101  * Time is stored as nanoseconds since the epoch.
102  */
/* Convert an on-disk timestamp (big-endian ns since epoch) to a timespec. */
static struct timespec be64_to_timespec(__be64 betime)
{
	return ns_to_timespec(be64_to_cpu(betime));
}
107
/* Convert a timespec to the on-disk format (big-endian ns since epoch). */
static __be64 timespec_to_be64(struct timespec tsp)
{
	return cpu_to_be64((u64)tsp.tv_sec * NSEC_PER_SEC + tsp.tv_nsec);
}
112
/*
 * logfs_disk_to_inode - fill an in-core inode from its on-disk image
 *
 * All multi-byte fields are stored big-endian on the medium.
 */
static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode*inode)
{
	struct logfs_inode *li = logfs_inode(inode);
	int i;

	inode->i_mode	= be16_to_cpu(di->di_mode);
	li->li_height	= di->di_height;
	li->li_flags	= be32_to_cpu(di->di_flags);
	inode->i_uid	= be32_to_cpu(di->di_uid);
	inode->i_gid	= be32_to_cpu(di->di_gid);
	inode->i_size	= be64_to_cpu(di->di_size);
	logfs_set_blocks(inode, be64_to_cpu(di->di_used_bytes));
	inode->i_atime	= be64_to_timespec(di->di_atime);
	inode->i_ctime	= be64_to_timespec(di->di_ctime);
	inode->i_mtime	= be64_to_timespec(di->di_mtime);
	inode->i_nlink	= be32_to_cpu(di->di_refcount);
	inode->i_generation = be32_to_cpu(di->di_generation);

	switch (inode->i_mode & S_IFMT) {
	case S_IFSOCK:	/* fall through */
	case S_IFBLK:	/* fall through */
	case S_IFCHR:	/* fall through */
	case S_IFIFO:
		/* Special files keep the device number in the first slot. */
		inode->i_rdev = be64_to_cpu(di->di_data[0]);
		break;
	case S_IFDIR:	/* fall through */
	case S_IFREG:	/* fall through */
	case S_IFLNK:
		/* Regular files, directories and symlinks carry pointers. */
		for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
			li->li_data[i] = be64_to_cpu(di->di_data[i]);
		break;
	default:
		BUG();
	}
}
148
/*
 * logfs_inode_to_disk - fill the on-disk inode image from an in-core inode
 *
 * Mirror image of logfs_disk_to_inode; all fields written big-endian.
 */
static void logfs_inode_to_disk(struct inode *inode, struct logfs_disk_inode*di)
{
	struct logfs_inode *li = logfs_inode(inode);
	int i;

	di->di_mode	= cpu_to_be16(inode->i_mode);
	di->di_height	= li->li_height;
	di->di_pad	= 0;
	di->di_flags	= cpu_to_be32(li->li_flags);
	di->di_uid	= cpu_to_be32(inode->i_uid);
	di->di_gid	= cpu_to_be32(inode->i_gid);
	di->di_size	= cpu_to_be64(i_size_read(inode));
	di->di_used_bytes = cpu_to_be64(li->li_used_bytes);
	di->di_atime	= timespec_to_be64(inode->i_atime);
	di->di_ctime	= timespec_to_be64(inode->i_ctime);
	di->di_mtime	= timespec_to_be64(inode->i_mtime);
	di->di_refcount	= cpu_to_be32(inode->i_nlink);
	di->di_generation = cpu_to_be32(inode->i_generation);

	switch (inode->i_mode & S_IFMT) {
	case S_IFSOCK:	/* fall through */
	case S_IFBLK:	/* fall through */
	case S_IFCHR:	/* fall through */
	case S_IFIFO:
		/* Special files keep the device number in the first slot. */
		di->di_data[0] = cpu_to_be64(inode->i_rdev);
		break;
	case S_IFDIR:	/* fall through */
	case S_IFREG:	/* fall through */
	case S_IFLNK:
		/* Regular files, directories and symlinks carry pointers. */
		for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
			di->di_data[i] = cpu_to_be64(li->li_data[i]);
		break;
	default:
		BUG();
	}
}
185
/*
 * Recompute inode->i_blocks (in 512-byte sectors) from li_used_bytes,
 * saturating at ULONG_MAX.
 * NOTE(review): the overflow guard shifts by the filesystem blocksize
 * while i_blocks is counted in 512-byte units - confirm this is the
 * intended bound.
 */
static void __logfs_set_blocks(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	struct logfs_inode *li = logfs_inode(inode);

	inode->i_blocks = ULONG_MAX;
	if (li->li_used_bytes >> sb->s_blocksize_bits < ULONG_MAX)
		inode->i_blocks = ALIGN(li->li_used_bytes, 512) >> 9;
}
195
/* Set the byte-accurate space usage and derive i_blocks from it. */
void logfs_set_blocks(struct inode *inode, u64 bytes)
{
	struct logfs_inode *li = logfs_inode(inode);

	li->li_used_bytes = bytes;
	__logfs_set_blocks(inode);
}
203
/*
 * Mark an already-locked page as pre-locked (see the deadlock comment
 * above logfs_get_wblocks).  In the GC path (@lock == 0) the page may
 * already be pre-locked by the task blocked on s_write_mutex; nesting is
 * then tracked in s_lock_count instead of setting the flag twice.
 */
static void prelock_page(struct super_block *sb, struct page *page, int lock)
{
	struct logfs_super *super = logfs_super(sb);

	BUG_ON(!PageLocked(page));
	if (lock) {
		BUG_ON(PagePreLocked(page));
		SetPagePreLocked(page);
	} else {
		/* We are in GC path. */
		if (PagePreLocked(page))
			super->s_lock_count++;
		else
			SetPagePreLocked(page);
	}
}
220
/*
 * Undo prelock_page: drop the pre-locked marker, honoring the nesting
 * count maintained for the GC path (@lock == 0).
 */
static void preunlock_page(struct super_block *sb, struct page *page, int lock)
{
	struct logfs_super *super = logfs_super(sb);

	BUG_ON(!PageLocked(page));
	if (lock)
		ClearPagePreLocked(page);
	else {
		/* We are in GC path. */
		BUG_ON(!PagePreLocked(page));
		if (super->s_lock_count)
			super->s_lock_count--;
		else
			ClearPagePreLocked(page);
	}
}
237
238 /*
239  * Logfs is prone to an AB-BA deadlock where one task tries to acquire
240  * s_write_mutex with a locked page and GC tries to get that page while holding
241  * s_write_mutex.
242  * To solve this issue logfs will ignore the page lock iff the page in question
243  * is waiting for s_write_mutex.  We annotate this fact by setting PG_pre_locked
244  * in addition to PG_locked.
245  */
/*
 * Prepare for writing blocks: pre-lock @page (if any) and, unless called
 * from the GC path (@lock == 0), take s_write_mutex and run a GC pass.
 */
static void logfs_get_wblocks(struct super_block *sb, struct page *page,
		int lock)
{
	struct logfs_super *super = logfs_super(sb);

	if (page)
		prelock_page(sb, page, lock);

	if (lock) {
		mutex_lock(&super->s_write_mutex);
		logfs_gc_pass(sb);
		/* FIXME: We also have to check for shadowed space
		 * and mempool fill grade */
	}
}
261
/* Counterpart to logfs_get_wblocks: un-prelock @page, drop the mutex. */
static void logfs_put_wblocks(struct super_block *sb, struct page *page,
		int lock)
{
	struct logfs_super *super = logfs_super(sb);

	if (page)
		preunlock_page(sb, page, lock);
	/* Order matters - we must clear PG_pre_locked before releasing
	 * s_write_mutex or we could race against another task. */
	if (lock)
		mutex_unlock(&super->s_write_mutex);
}
274
/* Get a locked, referenced page cache page for reading (bix, level). */
static struct page *logfs_get_read_page(struct inode *inode, u64 bix,
		level_t level)
{
	return find_or_create_page(inode->i_mapping,
			logfs_pack_index(bix, level), GFP_NOFS);
}
281
/* Unlock and release a page obtained via logfs_get_read_page. */
static void logfs_put_read_page(struct page *page)
{
	unlock_page(page);
	page_cache_release(page);
}
287
/*
 * Lock a page for writing.  If the current lock holder is itself waiting
 * for s_write_mutex (PG_pre_locked set), it is safe to proceed without
 * taking the lock; otherwise spin (yielding via schedule()) until the
 * page becomes free.  The 0x1000-iteration BUG() is a livelock watchdog.
 */
static void logfs_lock_write_page(struct page *page)
{
	int loop = 0;

	while (unlikely(!trylock_page(page))) {
		if (loop++ > 0x1000) {
			/* Has been observed once so far... */
			printk(KERN_ERR "stack at %p\n", &loop);
			BUG();
		}
		if (PagePreLocked(page)) {
			/* Holder of page lock is waiting for us, it
			 * is safe to use this page. */
			break;
		}
		/* Some other process has this page locked and has
		 * nothing to do with us.  Wait for it to finish.
		 */
		schedule();
	}
	BUG_ON(!PageLocked(page));
}
310
311 static struct page *logfs_get_write_page(struct inode *inode, u64 bix,
312                 level_t level)
313 {
314         struct address_space *mapping = inode->i_mapping;
315         pgoff_t index = logfs_pack_index(bix, level);
316         struct page *page;
317         int err;
318
319 repeat:
320         page = find_get_page(mapping, index);
321         if (!page) {
322                 page = __page_cache_alloc(GFP_NOFS);
323                 if (!page)
324                         return NULL;
325                 err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS);
326                 if (unlikely(err)) {
327                         page_cache_release(page);
328                         if (err == -EEXIST)
329                                 goto repeat;
330                         return NULL;
331                 }
332         } else logfs_lock_write_page(page);
333         BUG_ON(!PageLocked(page));
334         return page;
335 }
336
/* Unlock the page unless it was only pre-locked by another task. */
static void logfs_unlock_write_page(struct page *page)
{
	if (!PagePreLocked(page))
		unlock_page(page);
}
342
/* Drop the lock (unless pre-locked elsewhere) and the page reference. */
static void logfs_put_write_page(struct page *page)
{
	logfs_unlock_write_page(page);
	page_cache_release(page);
}
348
349 static struct page *logfs_get_page(struct inode *inode, u64 bix, level_t level,
350                 int rw)
351 {
352         if (rw == READ)
353                 return logfs_get_read_page(inode, bix, level);
354         else
355                 return logfs_get_write_page(inode, bix, level);
356 }
357
358 static void logfs_put_page(struct page *page, int rw)
359 {
360         if (rw == READ)
361                 logfs_put_read_page(page);
362         else
363                 logfs_put_write_page(page);
364 }
365
366 static unsigned long __get_bits(u64 val, int skip, int no)
367 {
368         u64 ret = val;
369
370         ret >>= skip * no;
371         ret <<= 64 - no;
372         ret >>= 64 - no;
373         return ret;
374 }
375
/*
 * Extract the LOGFS_BLOCK_BITS-wide field number @skip from @val; callers
 * use this to find the pointer slot for a block index within an indirect
 * block at the given level.
 */
static unsigned long get_bits(u64 val, level_t skip)
{
	return __get_bits(val, (__force int)skip, LOGFS_BLOCK_BITS);
}
380
/* Initialize both btrees of a shadow tree from the per-sb btree mempool. */
static inline void init_shadow_tree(struct super_block *sb,
		struct shadow_tree *tree)
{
	struct logfs_super *super = logfs_super(sb);

	btree_init_mempool64(&tree->new, super->s_btree_pool);
	btree_init_mempool64(&tree->old, super->s_btree_pool);
}
389
/*
 * Write back an indirect block through logfs_write_buf, taking the page
 * lock around the write.
 */
static void indirect_write_block(struct logfs_block *block)
{
	struct page *page;
	struct inode *inode;
	int ret;

	page = block->page;
	inode = page->mapping->host;
	logfs_lock_write_page(page);
	ret = logfs_write_buf(inode, page, 0);
	logfs_unlock_write_page(page);
	/*
	 * This needs some rework.  Unless you want your filesystem to run
	 * completely synchronously (you don't), the filesystem will always
	 * report writes as 'successful' before the actual work has been
	 * done.  The actual work gets done here and this is where any errors
	 * will show up.  And there isn't much we can do about it, really.
	 *
	 * Some attempts to fix the errors (move from bad blocks, retry io,...)
	 * have already been done, so anything left should be either a broken
	 * device or a bug somewhere in logfs itself.  Being relatively new,
	 * the odds currently favor a bug, so for now the line below isn't
	 * entirely tasteles.
	 */
	BUG_ON(ret);
}
416
417 static void inode_write_block(struct logfs_block *block)
418 {
419         struct inode *inode;
420         int ret;
421
422         inode = block->inode;
423         if (inode->i_ino == LOGFS_INO_MASTER)
424                 logfs_write_anchor(inode->i_sb);
425         else {
426                 ret = __logfs_write_inode(inode, 0);
427                 /* see indirect_write_block comment */
428                 BUG_ON(ret);
429         }
430 }
431
/* GC level of an inode block; never valid for the master inode. */
static gc_level_t inode_block_level(struct logfs_block *block)
{
	BUG_ON(block->inode->i_ino == LOGFS_INO_MASTER);
	return GC_LEVEL(LOGFS_MAX_LEVELS);
}
437
/* Derive the GC level of an indirect block from its packed page index. */
static gc_level_t indirect_block_level(struct logfs_block *block)
{
	struct page *page;
	struct inode *inode;
	u64 bix;
	level_t level;

	page = block->page;
	inode = page->mapping->host;
	logfs_unpack_index(page->index, &bix, &level);
	return expand_level(inode->i_ino, level);
}
450
/*
 * This silences a false, yet annoying gcc warning.  I hate it when my editor
 * jumps into bitops.h each time I recompile this file.
 * TODO: Complain to gcc folks about this and upgrade compiler.
 */
/* Thin wrapper around find_next_bit; same arguments, same semantics. */
static unsigned long fnb(const unsigned long *addr,
		unsigned long size, unsigned long offset)
{
	return find_next_bit(addr, size, offset);
}
461
/*
 * Pack mode, height and flags into the first 64-bit word of the on-disk
 * inode, returned big-endian.
 */
static __be64 inode_val0(struct inode *inode)
{
	struct logfs_inode *li = logfs_inode(inode);
	u64 val;

	/*
	 * Explicit shifting generates good code, but must match the format
	 * of the structure.  Add some paranoia just in case.
	 */
	BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_mode) != 0);
	BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_height) != 2);
	BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_flags) != 4);

	val =	(u64)inode->i_mode << 48 |
		(u64)li->li_height << 40 |
		(u64)li->li_flags;
	return cpu_to_be64(val);
}
480
481 static int inode_write_alias(struct super_block *sb,
482                 struct logfs_block *block, write_alias_t *write_one_alias)
483 {
484         struct inode *inode = block->inode;
485         struct logfs_inode *li = logfs_inode(inode);
486         unsigned long pos;
487         u64 ino , bix;
488         __be64 val;
489         level_t level;
490         int err;
491
492         for (pos = 0; ; pos++) {
493                 pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
494                 if (pos >= LOGFS_EMBEDDED_FIELDS + INODE_POINTER_OFS)
495                         return 0;
496
497                 switch (pos) {
498                 case INODE_HEIGHT_OFS:
499                         val = inode_val0(inode);
500                         break;
501                 case INODE_USED_OFS:
502                         val = cpu_to_be64(li->li_used_bytes);;
503                         break;
504                 case INODE_SIZE_OFS:
505                         val = cpu_to_be64(i_size_read(inode));
506                         break;
507                 case INODE_POINTER_OFS ... INODE_POINTER_OFS + LOGFS_EMBEDDED_FIELDS - 1:
508                         val = cpu_to_be64(li->li_data[pos - INODE_POINTER_OFS]);
509                         break;
510                 default:
511                         BUG();
512                 }
513
514                 ino = LOGFS_INO_MASTER;
515                 bix = inode->i_ino;
516                 level = LEVEL(0);
517                 err = write_one_alias(sb, ino, bix, level, pos, val);
518                 if (err)
519                         return err;
520         }
521 }
522
/*
 * Emit one alias record per dirty pointer slot of an indirect block.
 * Returns 0 when all aliases are written, or the first callback error.
 */
static int indirect_write_alias(struct super_block *sb,
		struct logfs_block *block, write_alias_t *write_one_alias)
{
	unsigned long pos;
	struct page *page = block->page;
	u64 ino , bix;
	__be64 *child, val;
	level_t level;
	int err;

	for (pos = 0; ; pos++) {
		pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
		if (pos >= LOGFS_BLOCK_FACTOR)
			return 0;

		ino = page->mapping->host->i_ino;
		logfs_unpack_index(page->index, &bix, &level);
		/* Read the current pointer value out of the block page. */
		child = kmap_atomic(page, KM_USER0);
		val = child[pos];
		kunmap_atomic(child, KM_USER0);
		err = write_one_alias(sb, ino, bix, level, pos, val);
		if (err)
			return err;
	}
}
548
/* Write journal aliases for every block on the object-alias list. */
int logfs_write_obj_aliases_pagecache(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_block *block;
	int err;

	list_for_each_entry(block, &super->s_object_alias, alias_list) {
		err = block->ops->write_alias(sb, block, write_alias_journal);
		if (err)
			return err;
	}
	return 0;
}
562
/* Unlink a block from the alias list and return it to the mempool. */
void __free_block(struct super_block *sb, struct logfs_block *block)
{
	BUG_ON(!list_empty(&block->item_list));
	list_del(&block->alias_list);
	mempool_free(block, logfs_super(sb)->s_block_pool);
}
569
/* Detach and free the block hanging off an in-core inode. */
static void inode_free_block(struct super_block *sb, struct logfs_block *block)
{
	struct inode *inode = block->inode;

	logfs_inode(inode)->li_block = NULL;
	__free_block(sb, block);
}
577
/* Detach and free the block hanging off a page cache page. */
static void indirect_free_block(struct super_block *sb,
		struct logfs_block *block)
{
	ClearPagePrivate(block->page);
	block->page->private = 0;
	__free_block(sb, block);
}
585
586
/* Callbacks for blocks backed by an in-core inode. */
static struct logfs_block_ops inode_block_ops = {
	.write_block = inode_write_block,
	.block_level = inode_block_level,
	.free_block = inode_free_block,
	.write_alias = inode_write_alias,
};

/* Callbacks for blocks backed by a page cache page. */
struct logfs_block_ops indirect_block_ops = {
	.write_block = indirect_write_block,
	.block_level = indirect_block_level,
	.free_block = indirect_free_block,
	.write_alias = indirect_write_alias,
};
600
/*
 * Allocate and minimally initialize a logfs_block from the mempool.
 * The caller sets ->ops and attaches it to an inode or a page.
 */
struct logfs_block *__alloc_block(struct super_block *sb,
		u64 ino, u64 bix, level_t level)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_block *block;

	block = mempool_alloc(super->s_block_pool, GFP_NOFS);
	memset(block, 0, sizeof(*block));
	INIT_LIST_HEAD(&block->alias_list);
	INIT_LIST_HEAD(&block->item_list);
	block->sb = sb;
	block->ino = ino;
	block->bix = bix;
	block->level = level;
	return block;
}
617
/* Attach a logfs_block to an inode, unless one already exists. */
static void alloc_inode_block(struct inode *inode)
{
	struct logfs_inode *li = logfs_inode(inode);
	struct logfs_block *block;

	if (li->li_block)
		return;

	block = __alloc_block(inode->i_sb, LOGFS_INO_MASTER, inode->i_ino, 0);
	block->inode = inode;
	li->li_block = block;
	block->ops = &inode_block_ops;
}
631
/*
 * Initialize ->partial/->full for an indirect block: partial counts
 * non-null pointers, full counts pointers flagged LOGFS_FULLY_POPULATED.
 * Level-0 data pages keep both counters at zero.
 */
void initialize_block_counters(struct page *page, struct logfs_block *block,
		__be64 *array, int page_is_empty)
{
	u64 ptr;
	int i, start;

	block->partial = 0;
	block->full = 0;
	start = 0;
	if (page->index < first_indirect_block()) {
		/* Counters are pointless on level 0 */
		return;
	}
	if (page->index == first_indirect_block()) {
		/* Skip unused pointers */
		start = I0_BLOCKS;
		block->full = I0_BLOCKS;
	}
	if (!page_is_empty) {
		for (i = start; i < LOGFS_BLOCK_FACTOR; i++) {
			ptr = be64_to_cpu(array[i]);
			if (ptr)
				block->partial++;
			if (ptr & LOGFS_FULLY_POPULATED)
				block->full++;
		}
	}
}
660
/* Attach a logfs_block to a page via page->private, unless one exists. */
static void alloc_data_block(struct inode *inode, struct page *page)
{
	struct logfs_block *block;
	u64 bix;
	level_t level;

	if (PagePrivate(page))
		return;

	logfs_unpack_index(page->index, &bix, &level);
	block = __alloc_block(inode->i_sb, inode->i_ino, bix, level);
	block->page = page;
	SetPagePrivate(page);
	page->private = (unsigned long)block;
	block->ops = &indirect_block_ops;
}
677
/* Like alloc_data_block, but also sets up the partial/full counters. */
static void alloc_indirect_block(struct inode *inode, struct page *page,
		int page_is_empty)
{
	struct logfs_block *block;
	__be64 *array;

	if (PagePrivate(page))
		return;

	alloc_data_block(inode, page);

	block = logfs_block(page);
	array = kmap_atomic(page, KM_USER0);
	initialize_block_counters(page, block, array, page_is_empty);
	kunmap_atomic(array, KM_USER0);
}
694
/*
 * Store pointer @ptr at slot @index of an indirect block page and adjust
 * the partial/full counters by the delta between old and new value.
 */
static void block_set_pointer(struct page *page, int index, u64 ptr)
{
	struct logfs_block *block = logfs_block(page);
	__be64 *array;
	u64 oldptr;

	BUG_ON(!block);
	array = kmap_atomic(page, KM_USER0);
	oldptr = be64_to_cpu(array[index]);
	array[index] = cpu_to_be64(ptr);
	kunmap_atomic(array, KM_USER0);
	SetPageUptodate(page);

	block->full += !!(ptr & LOGFS_FULLY_POPULATED)
		- !!(oldptr & LOGFS_FULLY_POPULATED);
	block->partial += !!ptr - !!oldptr;
}
712
/* Read the 64-bit pointer at slot @index from an indirect block page. */
static u64 block_get_pointer(struct page *page, int index)
{
	__be64 *block;
	u64 ptr;

	block = kmap_atomic(page, KM_USER0);
	ptr = be64_to_cpu(block[index]);
	kunmap_atomic(block, KM_USER0);
	return ptr;
}
723
/* A missing block (hole) reads as zeroes. */
static int logfs_read_empty(struct page *page)
{
	zero_user_segment(page, 0, PAGE_CACHE_SIZE);
	return 0;
}
729
/* Read a block referenced directly from the inode's embedded pointers. */
static int logfs_read_direct(struct inode *inode, struct page *page)
{
	struct logfs_inode *li = logfs_inode(inode);
	pgoff_t index = page->index;
	u64 block;

	block = li->li_data[index];
	if (!block)
		return logfs_read_empty(page);

	return logfs_segment_read(inode, page, block, index, 0);
}
742
/*
 * logfs_read_loop - read a block by walking the indirect block tree
 * @inode:	inode to read from
 * @page:	target page; its index encodes (bix, target_level)
 * @rw_context:	READ or WRITE, selects how intermediate pages are locked
 *
 * Descends from the tree root (li_data[INDIRECT_INDEX]) towards
 * @target_level, following one pointer per level.  A null pointer along
 * the way means the block is a hole and reads as zeroes.
 *
 * NOTE(review): the segment-read error path calls logfs_put_read_page()
 * even when the intermediate page was obtained in a WRITE context -
 * confirm this is safe with respect to pre-locked pages.
 */
static int logfs_read_loop(struct inode *inode, struct page *page,
		int rw_context)
{
	struct logfs_inode *li = logfs_inode(inode);
	u64 bix, bofs = li->li_data[INDIRECT_INDEX];
	level_t level, target_level;
	int ret;
	struct page *ipage;

	logfs_unpack_index(page->index, &bix, &target_level);
	if (!bofs)
		return logfs_read_empty(page);

	if (bix >= maxbix(li->li_height))
		return logfs_read_empty(page);

	for (level = LEVEL(li->li_height);
			(__force u8)level > (__force u8)target_level;
			level = SUBLEVEL(level)){
		ipage = logfs_get_page(inode, bix, level, rw_context);
		if (!ipage)
			return -ENOMEM;

		ret = logfs_segment_read(inode, ipage, bofs, bix, level);
		if (ret) {
			logfs_put_read_page(ipage);
			return ret;
		}

		bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level)));
		logfs_put_page(ipage, rw_context);
		if (!bofs)
			return logfs_read_empty(page);
	}

	return logfs_segment_read(inode, page, bofs, bix, 0);
}
780
781 static int logfs_read_block(struct inode *inode, struct page *page,
782                 int rw_context)
783 {
784         pgoff_t index = page->index;
785
786         if (index < I0_BLOCKS)
787                 return logfs_read_direct(inode, page);
788         return logfs_read_loop(inode, page, rw_context);
789 }
790
/*
 * Walk the indirect tree to check whether a pointer exists for @bix.
 * Returns 1 if the leaf pointer is non-null, 0 for a hole, negative
 * errno on read error.
 */
static int logfs_exist_loop(struct inode *inode, u64 bix)
{
	struct logfs_inode *li = logfs_inode(inode);
	u64 bofs = li->li_data[INDIRECT_INDEX];
	level_t level;
	int ret;
	struct page *ipage;

	if (!bofs)
		return 0;
	if (bix >= maxbix(li->li_height))
		return 0;

	for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
		ipage = logfs_get_read_page(inode, bix, level);
		if (!ipage)
			return -ENOMEM;

		ret = logfs_segment_read(inode, ipage, bofs, bix, level);
		if (ret) {
			logfs_put_read_page(ipage);
			return ret;
		}

		bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level)));
		logfs_put_read_page(ipage);
		if (!bofs)
			return 0;
	}

	return 1;
}
823
824 int logfs_exist_block(struct inode *inode, u64 bix)
825 {
826         struct logfs_inode *li = logfs_inode(inode);
827
828         if (bix < I0_BLOCKS)
829                 return !!li->li_data[bix];
830         return logfs_exist_loop(inode, bix);
831 }
832
833 static u64 seek_holedata_direct(struct inode *inode, u64 bix, int data)
834 {
835         struct logfs_inode *li = logfs_inode(inode);
836
837         for (; bix < I0_BLOCKS; bix++)
838                 if (data ^ (li->li_data[bix] == 0))
839                         return bix;
840         return I0_BLOCKS;
841 }
842
/*
 * seek_holedata_loop - tree walk behind logfs_seek_hole/__logfs_seek_data
 * @inode:	inode to search
 * @bix:	block index to start from
 * @data:	non-zero to look for data, zero to look for a hole
 *
 * Descends one level at a time; on each level, skips forward over slots
 * that cannot contain the sought item (for holes: subtrees marked
 * LOGFS_FULLY_POPULATED; for data: null pointers), advancing @bix by the
 * block span of one slot.  Returns the (possibly unchanged) block index;
 * I/O errors simply terminate the search at the current position.
 */
static u64 seek_holedata_loop(struct inode *inode, u64 bix, int data)
{
	struct logfs_inode *li = logfs_inode(inode);
	__be64 *rblock;
	u64 increment, bofs = li->li_data[INDIRECT_INDEX];
	level_t level;
	int ret, slot;
	struct page *page;

	BUG_ON(!bofs);

	for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
		/* Number of data blocks covered by one slot at this level. */
		increment = 1 << (LOGFS_BLOCK_BITS * ((__force u8)level-1));
		page = logfs_get_read_page(inode, bix, level);
		if (!page)
			return bix;

		ret = logfs_segment_read(inode, page, bofs, bix, level);
		if (ret) {
			logfs_put_read_page(page);
			return bix;
		}

		slot = get_bits(bix, SUBLEVEL(level));
		rblock = kmap_atomic(page, KM_USER0);
		while (slot < LOGFS_BLOCK_FACTOR) {
			if (data && (rblock[slot] != 0))
				break;
			if (!data && !(be64_to_cpu(rblock[slot]) & LOGFS_FULLY_POPULATED))
				break;
			slot++;
			bix += increment;
			bix &= ~(increment - 1);
		}
		if (slot >= LOGFS_BLOCK_FACTOR) {
			kunmap_atomic(rblock, KM_USER0);
			logfs_put_read_page(page);
			return bix;
		}
		bofs = be64_to_cpu(rblock[slot]);
		kunmap_atomic(rblock, KM_USER0);
		logfs_put_read_page(page);
		if (!bofs) {
			BUG_ON(data);
			return bix;
		}
	}
	return bix;
}
892
893 /**
894  * logfs_seek_hole - find next hole starting at a given block index
895  * @inode:              inode to search in
896  * @bix:                block index to start searching
897  *
898  * Returns next hole.  If the file doesn't contain any further holes, the
899  * block address next to eof is returned instead.
900  */
u64 logfs_seek_hole(struct inode *inode, u64 bix)
{
	struct logfs_inode *li = logfs_inode(inode);

	/* Direct pointers first, then the indirect tree. */
	if (bix < I0_BLOCKS) {
		bix = seek_holedata_direct(inode, bix, 0);
		if (bix < I0_BLOCKS)
			return bix;
	}

	if (!li->li_data[INDIRECT_INDEX])
		return bix;
	else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED)
		/* Whole tree is populated - first hole is past its end. */
		bix = maxbix(li->li_height);
	else {
		bix = seek_holedata_loop(inode, bix, 0);
		if (bix < maxbix(li->li_height))
			return bix;
		/* Should not happen anymore.  But if some port writes semi-
		 * corrupt images (as this one used to) we might run into it.
		 */
		WARN_ON_ONCE(bix == maxbix(li->li_height));
	}

	return bix;
}
927
928 static u64 __logfs_seek_data(struct inode *inode, u64 bix)
929 {
930         struct logfs_inode *li = logfs_inode(inode);
931
932         if (bix < I0_BLOCKS) {
933                 bix = seek_holedata_direct(inode, bix, 1);
934                 if (bix < I0_BLOCKS)
935                         return bix;
936         }
937
938         if (bix < maxbix(li->li_height)) {
939                 if (!li->li_data[INDIRECT_INDEX])
940                         bix = maxbix(li->li_height);
941                 else
942                         return seek_holedata_loop(inode, bix, 1);
943         }
944
945         return bix;
946 }
947
948 /**
949  * logfs_seek_data - find next data block after a given block index
950  * @inode:              inode to search in
951  * @bix:                block index to start searching
952  *
953  * Returns next data block.  If the file doesn't contain any further data
954  * blocks, the last block in the file is returned instead.
955  */
956 u64 logfs_seek_data(struct inode *inode, u64 bix)
957 {
958         struct super_block *sb = inode->i_sb;
959         u64 ret, end;
960
961         ret = __logfs_seek_data(inode, bix);
962         end = i_size_read(inode) >> sb->s_blocksize_bits;
963         if (ret >= end)
964                 ret = max(bix, end);
965         return ret;
966 }
967
968 static int logfs_is_valid_direct(struct logfs_inode *li, u64 bix, u64 ofs)
969 {
970         return pure_ofs(li->li_data[bix]) == ofs;
971 }
972
/* Walk the indirect-block chain from root offset @bofs down towards
 * @bix, returning 1 as soon as a pointer along the path equals @ofs.
 * Read errors and holes terminate the walk with 0. */
static int __logfs_is_valid_loop(struct inode *inode, u64 bix,
		u64 ofs, u64 bofs)
{
	struct logfs_inode *li = logfs_inode(inode);
	level_t level;
	int ret;
	struct page *page;

	for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)){
		page = logfs_get_write_page(inode, bix, level);
		BUG_ON(!page);

		ret = logfs_segment_read(inode, page, bofs, bix, level);
		if (ret) {
			logfs_put_write_page(page);
			return 0;
		}

		/* Descend one level: fetch the child pointer slot for @bix. */
		bofs = block_get_pointer(page, get_bits(bix, SUBLEVEL(level)));
		logfs_put_write_page(page);
		if (!bofs)
			/* Hole - the block cannot be reachable. */
			return 0;

		if (pure_ofs(bofs) == ofs)
			return 1;
	}
	return 0;
}
1001
1002 static int logfs_is_valid_loop(struct inode *inode, u64 bix, u64 ofs)
1003 {
1004         struct logfs_inode *li = logfs_inode(inode);
1005         u64 bofs = li->li_data[INDIRECT_INDEX];
1006
1007         if (!bofs)
1008                 return 0;
1009
1010         if (bix >= maxbix(li->li_height))
1011                 return 0;
1012
1013         if (pure_ofs(bofs) == ofs)
1014                 return 1;
1015
1016         return __logfs_is_valid_loop(inode, bix, ofs, bofs);
1017 }
1018
1019 static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs)
1020 {
1021         struct logfs_inode *li = logfs_inode(inode);
1022
1023         if ((inode->i_nlink == 0) && atomic_read(&inode->i_count) == 1)
1024                 return 0;
1025
1026         if (bix < I0_BLOCKS)
1027                 return logfs_is_valid_direct(li, bix, ofs);
1028         return logfs_is_valid_loop(inode, bix, ofs);
1029 }
1030
1031 /**
1032  * logfs_is_valid_block - check whether this block is still valid
1033  *
1034  * @sb  - superblock
1035  * @ofs - block physical offset
1036  * @ino - block inode number
1037  * @bix - block index
1038  * @level - block level
1039  *
1040  * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will
1041  * become invalid once the journal is written.
1042  */
int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix,
		gc_level_t gc_level)
{
	struct logfs_super *super = logfs_super(sb);
	struct inode *inode;
	int ret, cookie;

	/* Umount closes a segment with free blocks remaining.  Those
	 * blocks are by definition invalid. */
	if (ino == -1)
		return 0;

	/* iget takes an unsigned long; the inode number must fit. */
	LOGFS_BUG_ON((u64)(u_long)ino != ino, sb);

	inode = logfs_safe_iget(sb, ino, &cookie);
	if (IS_ERR(inode))
		goto invalid;

	ret = __logfs_is_valid_block(inode, bix, ofs);
	logfs_safe_iput(inode, cookie);
	if (ret)
		return ret;

invalid:
	/* Block is nominally invalid, but may still sit in the shadow tree,
	 * waiting for a journal commit.
	 */
	if (btree_lookup64(&super->s_shadow_tree.old, ofs))
		return 2;
	return 0;
}
1074
1075 int logfs_readpage_nolock(struct page *page)
1076 {
1077         struct inode *inode = page->mapping->host;
1078         int ret = -EIO;
1079
1080         ret = logfs_read_block(inode, page, READ);
1081
1082         if (ret) {
1083                 ClearPageUptodate(page);
1084                 SetPageError(page);
1085         } else {
1086                 SetPageUptodate(page);
1087                 ClearPageError(page);
1088         }
1089         flush_dcache_page(page);
1090
1091         return ret;
1092 }
1093
/* Check whether @bytes of new data still fit on the medium.  Available
 * space is current free bytes plus bytes that will be freed by pending
 * deletes, minus bytes pending writes and per-page reservations.
 * The last s_root_reserve bytes are held back for CAP_SYS_RESOURCE
 * holders.  Returns 0 on success, -ENOSPC otherwise.
 * NOTE: capable() is only consulted after the size check - keep this
 * order; capable() has audit side effects. */
static int logfs_reserve_bytes(struct inode *inode, int bytes)
{
	struct logfs_super *super = logfs_super(inode->i_sb);
	u64 available = super->s_free_bytes + super->s_dirty_free_bytes
			- super->s_dirty_used_bytes - super->s_dirty_pages;

	if (!bytes)
		return 0;

	if (available < bytes)
		return -ENOSPC;

	if (available < bytes + super->s_root_reserve &&
			!capable(CAP_SYS_RESOURCE))
		return -ENOSPC;

	return 0;
}
1112
/* Reserve worst-case space (6 * LOGFS_MAX_OBJECTSIZE) for a dirty page,
 * remembered both in the page's logfs_block and in s_dirty_pages -
 * presumably so later writeback cannot fail with -ENOSPC (TODO confirm).
 * A page that already holds a reservation is left untouched.
 * Returns 0 on success or -ENOSPC.  Takes and drops the write blocks. */
int get_page_reserve(struct inode *inode, struct page *page)
{
	struct logfs_super *super = logfs_super(inode->i_sb);
	int ret;

	if (logfs_block(page) && logfs_block(page)->reserved_bytes)
		return 0;

	logfs_get_wblocks(inode->i_sb, page, WF_LOCK);
	ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE);
	if (!ret) {
		/* Attach block state so the reservation has a home. */
		alloc_data_block(inode, page);
		logfs_block(page)->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE;
		super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE;
	}
	logfs_put_wblocks(inode->i_sb, page, WF_LOCK);
	return ret;
}
1131
1132 /*
1133  * We are protected by write lock.  Push victims up to superblock level
1134  * and release transaction when appropriate.
1135  */
1136 /* FIXME: This is currently called from the wrong spots. */
/* Detach the transaction from the inode's block and record/clear its
 * state in the superblock.  *_1 states record a victim inode and/or
 * rename position, later states verify and clear them again; the
 * transaction is freed in whichever state ends it.  Only the master
 * inode is handled so far. */
static void logfs_handle_transaction(struct inode *inode,
		struct logfs_transaction *ta)
{
	struct logfs_super *super = logfs_super(inode->i_sb);

	if (!ta)
		return;
	logfs_inode(inode)->li_block->ta = NULL;

	if (inode->i_ino != LOGFS_INO_MASTER) {
		BUG(); /* FIXME: Yes, this needs more thought */
		/* just remember the transaction until inode is written */
		//BUG_ON(logfs_inode(inode)->li_transaction);
		//logfs_inode(inode)->li_transaction = ta;
		return;
	}

	switch (ta->state) {
	case CREATE_1: /* fall through */
	case UNLINK_1:
		/* Phase 1: record the victim inode. */
		BUG_ON(super->s_victim_ino);
		super->s_victim_ino = ta->ino;
		break;
	case CREATE_2: /* fall through */
	case UNLINK_2:
		/* Phase 2: the recorded victim must match, then clear it. */
		BUG_ON(super->s_victim_ino != ta->ino);
		super->s_victim_ino = 0;
		/* transaction ends here - free it */
		kfree(ta);
		break;
	case CROSS_RENAME_1:
		BUG_ON(super->s_rename_dir);
		BUG_ON(super->s_rename_pos);
		super->s_rename_dir = ta->dir;
		super->s_rename_pos = ta->pos;
		break;
	case CROSS_RENAME_2:
		BUG_ON(super->s_rename_dir != ta->dir);
		BUG_ON(super->s_rename_pos != ta->pos);
		super->s_rename_dir = 0;
		super->s_rename_pos = 0;
		kfree(ta);
		break;
	case TARGET_RENAME_1:
		BUG_ON(super->s_rename_dir);
		BUG_ON(super->s_rename_pos);
		BUG_ON(super->s_victim_ino);
		super->s_rename_dir = ta->dir;
		super->s_rename_pos = ta->pos;
		super->s_victim_ino = ta->ino;
		break;
	case TARGET_RENAME_2:
		BUG_ON(super->s_rename_dir != ta->dir);
		BUG_ON(super->s_rename_pos != ta->pos);
		BUG_ON(super->s_victim_ino != ta->ino);
		super->s_rename_dir = 0;
		super->s_rename_pos = 0;
		break;
	case TARGET_RENAME_3:
		/* Final phase: only the victim remains to be cleared. */
		BUG_ON(super->s_rename_dir);
		BUG_ON(super->s_rename_pos);
		BUG_ON(super->s_victim_ino != ta->ino);
		super->s_victim_ino = 0;
		kfree(ta);
		break;
	default:
		BUG();
	}
}
1206
1207 /*
1208  * Not strictly a reservation, but rather a check that we still have enough
1209  * space to satisfy the write.
1210  */
1211 static int logfs_reserve_blocks(struct inode *inode, int blocks)
1212 {
1213         return logfs_reserve_bytes(inode, blocks * LOGFS_MAX_OBJECTSIZE);
1214 }
1215
/* State passed down and back up the recursive write/truncate paths. */
struct write_control {
	u64 ofs;	/* medium offset of the block; 0 denotes a hole and
			 * LOGFS_FULLY_POPULATED may be or'ed in, see
			 * logfs_write_i0() */
	long flags;	/* WF_* flags (WF_WRITE, WF_DELETE, ...) */
};
1220
1221 static struct logfs_shadow *alloc_shadow(struct inode *inode, u64 bix,
1222                 level_t level, u64 old_ofs)
1223 {
1224         struct logfs_super *super = logfs_super(inode->i_sb);
1225         struct logfs_shadow *shadow;
1226
1227         shadow = mempool_alloc(super->s_shadow_pool, GFP_NOFS);
1228         memset(shadow, 0, sizeof(*shadow));
1229         shadow->ino = inode->i_ino;
1230         shadow->bix = bix;
1231         shadow->gc_level = expand_level(inode->i_ino, level);
1232         shadow->old_ofs = old_ofs & ~LOGFS_FULLY_POPULATED;
1233         return shadow;
1234 }
1235
1236 static void free_shadow(struct inode *inode, struct logfs_shadow *shadow)
1237 {
1238         struct logfs_super *super = logfs_super(inode->i_sb);
1239
1240         mempool_free(shadow, super->s_shadow_pool);
1241 }
1242
1243 /**
1244  * fill_shadow_tree - Propagate shadow tree changes due to a write
1245  * @inode:      Inode owning the page
1246  * @page:       Struct page that was written
1247  * @shadow:     Shadow for the current write
1248  *
1249  * Writes in logfs can result in two semi-valid objects.  The old object
1250  * is still valid as long as it can be reached by following pointers on
1251  * the medium.  Only when writes propagate all the way up to the journal
1252  * has the new object safely replaced the old one.
1253  *
1254  * To handle this problem, a struct logfs_shadow is used to represent
1255  * every single write.  It is attached to the indirect block, which is
1256  * marked dirty.  When the indirect block is written, its shadows are
1257  * handed up to the next indirect block (or inode).  Untimately they
1258  * will reach the master inode and be freed upon journal commit.
1259  *
1260  * This function handles a single step in the propagation.  It adds the
1261  * shadow for the current write to the tree, along with any shadows in
1262  * the page's tree, in case it was an indirect block.  If a page is
1263  * written, the inode parameter is left NULL, if an inode is written,
1264  * the page parameter is left NULL.
1265  */
static void fill_shadow_tree(struct inode *inode, struct page *page,
		struct logfs_shadow *shadow)
{
	struct logfs_super *super = logfs_super(inode->i_sb);
	struct logfs_block *block = logfs_block(page);
	struct shadow_tree *tree = &super->s_shadow_tree;

	if (PagePrivate(page)) {
		/* The block just went to the medium: its object aliases
		 * are now obsolete, its transaction (if any) advances, and
		 * the private block state can be freed. */
		if (block->alias_map)
			super->s_no_object_aliases -= bitmap_weight(
					block->alias_map, LOGFS_BLOCK_FACTOR);
		logfs_handle_transaction(inode, block->ta);
		block->ops->free_block(inode->i_sb, block);
	}
	if (shadow) {
		/* Remember the write until journal commit: shadows that
		 * replace an old block go into the "old" tree, fresh
		 * writes into the "new" tree. */
		if (shadow->old_ofs)
			btree_insert64(&tree->old, shadow->old_ofs, shadow,
					GFP_NOFS);
		else
			btree_insert64(&tree->new, shadow->new_ofs, shadow,
					GFP_NOFS);

		/* Both sides stay "dirty" until the journal is written. */
		super->s_dirty_used_bytes += shadow->new_len;
		super->s_dirty_free_bytes += shadow->old_len;
	}
}
1292
1293 static void logfs_set_alias(struct super_block *sb, struct logfs_block *block,
1294                 long child_no)
1295 {
1296         struct logfs_super *super = logfs_super(sb);
1297
1298         if (block->inode && block->inode->i_ino == LOGFS_INO_MASTER) {
1299                 /* Aliases in the master inode are pointless. */
1300                 return;
1301         }
1302
1303         if (!test_bit(child_no, block->alias_map)) {
1304                 set_bit(child_no, block->alias_map);
1305                 super->s_no_object_aliases++;
1306         }
1307         list_move_tail(&block->alias_list, &super->s_object_alias);
1308 }
1309
1310 /*
1311  * Object aliases can and often do change the size and occupied space of a
1312  * file.  So not only do we have to change the pointers, we also have to
1313  * change inode->i_size and li->li_used_bytes.  Which is done by setting
1314  * another two object aliases for the inode itself.
1315  */
/* Account a change of on-medium size after a write: adjust
 * li_used_bytes and mark the inode's size/usage fields as object
 * aliases so the change eventually reaches the medium too. */
static void set_iused(struct inode *inode, struct logfs_shadow *shadow)
{
	struct logfs_inode *li = logfs_inode(inode);

	if (shadow->new_len == shadow->old_len)
		return;

	alloc_inode_block(inode);
	li->li_used_bytes += shadow->new_len - shadow->old_len;
	__logfs_set_blocks(inode);
	logfs_set_alias(inode->i_sb, li->li_block, INODE_USED_OFS);
	logfs_set_alias(inode->i_sb, li->li_block, INODE_SIZE_OFS);
}
1329
/* Bottom step of the write recursion: write and/or delete one block as
 * requested by wc->flags.  On success wc->ofs is replaced with the new
 * medium offset (0 after a pure delete), with LOGFS_FULLY_POPULATED
 * or'ed in for data blocks and for completely filled indirect blocks.
 * Returns 0, -ENOSPC (new block, no space) or a write error. */
static int logfs_write_i0(struct inode *inode, struct page *page,
		struct write_control *wc)
{
	struct logfs_shadow *shadow;
	u64 bix;
	level_t level;
	int full, err = 0;

	logfs_unpack_index(page->index, &bix, &level);
	/* A brand-new block (no old offset) consumes fresh space. */
	if (wc->ofs == 0)
		if (logfs_reserve_blocks(inode, 1))
			return -ENOSPC;

	shadow = alloc_shadow(inode, bix, level, wc->ofs);
	if (wc->flags & WF_WRITE)
		err = logfs_segment_write(inode, page, shadow);
	if (wc->flags & WF_DELETE)
		logfs_segment_delete(inode, shadow);
	if (err) {
		free_shadow(inode, shadow);
		return err;
	}

	set_iused(inode, shadow);
	full = 1;
	if (level != 0) {
		/* Indirect blocks are "full" only when every slot is set. */
		alloc_indirect_block(inode, page, 0);
		full = logfs_block(page)->full == LOGFS_BLOCK_FACTOR;
	}
	fill_shadow_tree(inode, page, shadow);
	wc->ofs = shadow->new_ofs;
	if (wc->ofs && full)
		wc->ofs |= LOGFS_FULLY_POPULATED;
	return 0;
}
1365
1366 static int logfs_write_direct(struct inode *inode, struct page *page,
1367                 long flags)
1368 {
1369         struct logfs_inode *li = logfs_inode(inode);
1370         struct write_control wc = {
1371                 .ofs = li->li_data[page->index],
1372                 .flags = flags,
1373         };
1374         int err;
1375
1376         alloc_inode_block(inode);
1377
1378         err = logfs_write_i0(inode, page, &wc);
1379         if (err)
1380                 return err;
1381
1382         li->li_data[page->index] = wc.ofs;
1383         logfs_set_alias(inode->i_sb, li->li_block,
1384                         page->index + INODE_POINTER_OFS);
1385         return 0;
1386 }
1387
1388 static int ptr_change(u64 ofs, struct page *page)
1389 {
1390         struct logfs_block *block = logfs_block(page);
1391         int empty0, empty1, full0, full1;
1392
1393         empty0 = ofs == 0;
1394         empty1 = block->partial == 0;
1395         if (empty0 != empty1)
1396                 return 1;
1397
1398         /* The !! is necessary to shrink result to int */
1399         full0 = !!(ofs & LOGFS_FULLY_POPULATED);
1400         full1 = block->full == LOGFS_BLOCK_FACTOR;
1401         if (full0 != full1)
1402                 return 1;
1403         return 0;
1404 }
1405
/* One recursion step of the indirect write path: read (or create) the
 * indirect block covering @bix at @level, recurse (or write @page at
 * the bottom), store the child's new pointer and finally either write
 * this indirect block too or just record an alias for the slot. */
static int __logfs_write_rec(struct inode *inode, struct page *page,
		struct write_control *this_wc,
		pgoff_t bix, level_t target_level, level_t level)
{
	int ret, page_empty = 0;
	int child_no = get_bits(bix, SUBLEVEL(level));
	struct page *ipage;
	struct write_control child_wc = {
		.flags = this_wc->flags,
	};

	ipage = logfs_get_write_page(inode, bix, level);
	if (!ipage)
		return -ENOMEM;

	if (this_wc->ofs) {
		/* Existing indirect block - read it in. */
		ret = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level);
		if (ret)
			goto out;
	} else if (!PageUptodate(ipage)) {
		/* Hole - start from an all-zero indirect block. */
		page_empty = 1;
		logfs_read_empty(ipage);
	}

	child_wc.ofs = block_get_pointer(ipage, child_no);

	if ((__force u8)level-1 > (__force u8)target_level)
		ret = __logfs_write_rec(inode, page, &child_wc, bix,
				target_level, SUBLEVEL(level));
	else
		ret = logfs_write_i0(inode, page, &child_wc);

	if (ret)
		goto out;

	alloc_indirect_block(inode, ipage, page_empty);
	block_set_pointer(ipage, child_no, child_wc.ofs);
	/* FIXME: first condition seems superfluous */
	if (child_wc.ofs || logfs_block(ipage)->partial)
		this_wc->flags |= WF_WRITE;
	/* the condition on this_wc->ofs ensures that we won't consume extra
	 * space for indirect blocks in the future, which we cannot reserve */
	if (!this_wc->ofs || ptr_change(this_wc->ofs, ipage))
		ret = logfs_write_i0(inode, ipage, this_wc);
	else
		logfs_set_alias(inode->i_sb, logfs_block(ipage), child_no);
out:
	logfs_put_write_page(ipage);
	return ret;
}
1456
/* Entry point for writes through the indirect tree: recurse from the
 * inode's root block down to @target_level, then store the (possibly
 * changed) root pointer back into li_data[INDIRECT_INDEX] and alias
 * it so the inode change hits the medium. */
static int logfs_write_rec(struct inode *inode, struct page *page,
		pgoff_t bix, level_t target_level, long flags)
{
	struct logfs_inode *li = logfs_inode(inode);
	struct write_control wc = {
		.ofs = li->li_data[INDIRECT_INDEX],
		.flags = flags,
	};
	int ret;

	alloc_inode_block(inode);

	if (li->li_height > (__force u8)target_level)
		ret = __logfs_write_rec(inode, page, &wc, bix, target_level,
				LEVEL(li->li_height));
	else
		/* Tree is exactly one block high - write it directly. */
		ret = logfs_write_i0(inode, page, &wc);
	if (ret)
		return ret;

	if (li->li_data[INDIRECT_INDEX] != wc.ofs) {
		li->li_data[INDIRECT_INDEX] = wc.ofs;
		logfs_set_alias(inode->i_sb, li->li_block,
				INDIRECT_INDEX + INODE_POINTER_OFS);
	}
	return ret;
}
1484
1485 void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta)
1486 {
1487         alloc_inode_block(inode);
1488         logfs_inode(inode)->li_block->ta = ta;
1489 }
1490
1491 void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta)
1492 {
1493         struct logfs_block *block = logfs_inode(inode)->li_block;
1494
1495         if (block && block->ta)
1496                 block->ta = NULL;
1497 }
1498
/* Grow the indirect tree until it is tall enough to both reach @level
 * and address @bix: repeatedly write a new root block whose slot 0
 * points to the old root, then bump li_height.
 * NOTE(review): the new root page is looked up at index I0_BLOCKS + 1 -
 * presumably a scratch index for the not-yet-current height; confirm
 * against logfs_get_write_page() semantics. */
static int grow_inode(struct inode *inode, u64 bix, level_t level)
{
	struct logfs_inode *li = logfs_inode(inode);
	u8 height = (__force u8)level;
	struct page *page;
	struct write_control wc = {
		.flags = WF_WRITE,
	};
	int err;

	BUG_ON(height > 5 || li->li_height > 5);
	while (height > li->li_height || bix >= maxbix(li->li_height)) {
		page = logfs_get_write_page(inode, I0_BLOCKS + 1,
				LEVEL(li->li_height + 1));
		if (!page)
			return -ENOMEM;
		logfs_read_empty(page);
		alloc_indirect_block(inode, page, 1);
		/* Old root becomes child 0 of the new root. */
		block_set_pointer(page, 0, li->li_data[INDIRECT_INDEX]);
		err = logfs_write_i0(inode, page, &wc);
		logfs_put_write_page(page);
		if (err)
			return err;
		li->li_data[INDIRECT_INDEX] = wc.ofs;
		wc.ofs = 0;
		li->li_height++;
		logfs_set_alias(inode->i_sb, li->li_block, INODE_HEIGHT_OFS);
	}
	return 0;
}
1529
/* Locked part of logfs_write_buf(): consume any per-page reservation,
 * then route the page either through the direct pointers or - after
 * growing the tree far enough - through the recursive indirect path. */
static int __logfs_write_buf(struct inode *inode, struct page *page, long flags)
{
	struct logfs_super *super = logfs_super(inode->i_sb);
	pgoff_t index = page->index;
	u64 bix;
	level_t level;
	int err;

	flags |= WF_WRITE | WF_DELETE;
	inode->i_ctime = inode->i_mtime = CURRENT_TIME;

	logfs_unpack_index(index, &bix, &level);
	/* The reservation taken by get_page_reserve() is used up now. */
	if (logfs_block(page) && logfs_block(page)->reserved_bytes)
		super->s_dirty_pages -= logfs_block(page)->reserved_bytes;

	if (index < I0_BLOCKS)
		return logfs_write_direct(inode, page, flags);

	bix = adjust_bix(bix, level);
	err = grow_inode(inode, bix, level);
	if (err)
		return err;
	return logfs_write_rec(inode, page, bix, level, flags);
}
1554
1555 int logfs_write_buf(struct inode *inode, struct page *page, long flags)
1556 {
1557         struct super_block *sb = inode->i_sb;
1558         int ret;
1559
1560         logfs_get_wblocks(sb, page, flags & WF_LOCK);
1561         ret = __logfs_write_buf(inode, page, flags);
1562         logfs_put_wblocks(sb, page, flags & WF_LOCK);
1563         return ret;
1564 }
1565
1566 static int __logfs_delete(struct inode *inode, struct page *page)
1567 {
1568         long flags = WF_DELETE;
1569
1570         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1571
1572         if (page->index < I0_BLOCKS)
1573                 return logfs_write_direct(inode, page, flags);
1574         return logfs_write_rec(inode, page, page->index, 0, flags);
1575 }
1576
1577 int logfs_delete(struct inode *inode, pgoff_t index,
1578                 struct shadow_tree *shadow_tree)
1579 {
1580         struct super_block *sb = inode->i_sb;
1581         struct page *page;
1582         int ret;
1583
1584         page = logfs_get_read_page(inode, index, 0);
1585         if (!page)
1586                 return -ENOMEM;
1587
1588         logfs_get_wblocks(sb, page, 1);
1589         ret = __logfs_delete(inode, page);
1590         logfs_put_wblocks(sb, page, 1);
1591
1592         logfs_put_read_page(page);
1593
1594         return ret;
1595 }
1596
/* Move a still-valid block to a new location on behalf of the garbage
 * collector: read it from @ofs and push it back through the regular
 * write path.  For data blocks (gc level 0) the owning inode is written
 * out right away, since rewrite runs outside the normal dirty-inode
 * machinery.  Returns 0 or a negative error. */
int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
		gc_level_t gc_level, long flags)
{
	level_t level = shrink_level(gc_level);
	struct page *page;
	int err;

	page = logfs_get_write_page(inode, bix, level);
	if (!page)
		return -ENOMEM;

	err = logfs_segment_read(inode, page, ofs, bix, level);
	if (!err) {
		/* Attach block state before writing indirect blocks -
		 * presumably so alias bookkeeping survives the rewrite;
		 * confirm against alloc_indirect_block(). */
		if (level != 0)
			alloc_indirect_block(inode, page, 0);
		err = logfs_write_buf(inode, page, flags);
		if (!err && shrink_level(gc_level) == 0) {
			/* Rewrite cannot mark the inode dirty but has to
			 * write it immediately.
			 * Q: Can't we just create an alias for the inode
			 * instead?  And if not, why not?
			 */
			if (inode->i_ino == LOGFS_INO_MASTER)
				logfs_write_anchor(inode->i_sb);
			else {
				err = __logfs_write_inode(inode, flags);
			}
		}
	}
	logfs_put_write_page(page);
	return err;
}
1629
1630 static int truncate_data_block(struct inode *inode, struct page *page,
1631                 u64 ofs, struct logfs_shadow *shadow, u64 size)
1632 {
1633         loff_t pageofs = page->index << inode->i_sb->s_blocksize_bits;
1634         u64 bix;
1635         level_t level;
1636         int err;
1637
1638         /* Does truncation happen within this page? */
1639         if (size <= pageofs || size - pageofs >= PAGE_SIZE)
1640                 return 0;
1641
1642         logfs_unpack_index(page->index, &bix, &level);
1643         BUG_ON(level != 0);
1644
1645         err = logfs_segment_read(inode, page, ofs, bix, level);
1646         if (err)
1647                 return err;
1648
1649         zero_user_segment(page, size - pageofs, PAGE_CACHE_SIZE);
1650         return logfs_segment_write(inode, page, shadow);
1651 }
1652
/* Truncate a single level-0 block: rewrite its surviving head via
 * truncate_data_block() when @size cuts through it, then delete the
 * old object and update the accounting/shadow bookkeeping.  wc->ofs is
 * replaced with the new block offset. */
static int logfs_truncate_i0(struct inode *inode, struct page *page,
		struct write_control *wc, u64 size)
{
	struct logfs_shadow *shadow;
	u64 bix;
	level_t level;
	int err = 0;

	logfs_unpack_index(page->index, &bix, &level);
	BUG_ON(level != 0);
	shadow = alloc_shadow(inode, bix, level, wc->ofs);

	err = truncate_data_block(inode, page, wc->ofs, shadow, size);
	if (err) {
		free_shadow(inode, shadow);
		return err;
	}

	logfs_segment_delete(inode, shadow);
	set_iused(inode, shadow);
	fill_shadow_tree(inode, page, shadow);
	wc->ofs = shadow->new_ofs;
	return 0;
}
1677
/* Truncate blocks addressed by the inode's direct pointers, walking
 * from the highest pointer down and stopping at the first block that
 * lies fully below @size. */
static int logfs_truncate_direct(struct inode *inode, u64 size)
{
	struct logfs_inode *li = logfs_inode(inode);
	struct write_control wc;
	struct page *page;
	int e;
	int err;

	alloc_inode_block(inode);

	for (e = I0_BLOCKS - 1; e >= 0; e--) {
		/* Blocks entirely below the new size survive. */
		if (size > (e+1) * LOGFS_BLOCKSIZE)
			break;

		wc.ofs = li->li_data[e];
		if (!wc.ofs)
			/* Already a hole. */
			continue;

		page = logfs_get_write_page(inode, e, 0);
		if (!page)
			return -ENOMEM;
		err = logfs_segment_read(inode, page, wc.ofs, e, 0);
		if (err) {
			logfs_put_write_page(page);
			return err;
		}
		err = logfs_truncate_i0(inode, page, &wc, size);
		logfs_put_write_page(page);
		if (err)
			return err;

		li->li_data[e] = wc.ofs;
	}
	return 0;
}
1713
1714 /* FIXME: these need to become per-sb once we support different blocksizes */
/* Number of blocks spanned by one pointer slot at each level -
 * presumably indexed by raw level via logfs_step(); confirm against
 * the bix arithmetic in __logfs_truncate_rec(). */
static u64 __logfs_step[] = {
	1,
	I1_BLOCKS,
	I2_BLOCKS,
	I3_BLOCKS,
};

/* First block index that needs at least the given indirection level;
 * consumed through logfs_start_index()/logfs_unpack_raw_index(). */
static u64 __logfs_start_index[] = {
	I0_BLOCKS,
	I1_BLOCKS,
	I2_BLOCKS,
	I3_BLOCKS
};
1728
/* Step width (in blocks) of one pointer slot at @level. */
static inline u64 logfs_step(level_t level)
{
	return __logfs_step[(__force u8)level];
}

/* Bytes covered by one whole subtree at @level. */
static inline u64 logfs_factor(u8 level)
{
	return __logfs_step[level] * LOGFS_BLOCKSIZE;
}

/* First block index served by indirection level @level. */
static inline u64 logfs_start_index(level_t level)
{
	return __logfs_start_index[(__force u8)level];
}
1743
/* Like logfs_unpack_index(), but indices at or below the sublevel's
 * start index are normalized to bix 0 - the subtree root covers them.
 * NOTE(review): the boundary is <=, not < - confirm intended. */
static void logfs_unpack_raw_index(pgoff_t index, u64 *bix, level_t *level)
{
	logfs_unpack_index(index, bix, level);
	if (*bix <= logfs_start_index(SUBLEVEL(*level)))
		*bix = 0;
}
1750
1751 static int __logfs_truncate_rec(struct inode *inode, struct page *ipage,
1752                 struct write_control *this_wc, u64 size)
1753 {
1754         int truncate_happened = 0;
1755         int e, err = 0;
1756         u64 bix, child_bix, next_bix;
1757         level_t level;
1758         struct page *page;
1759         struct write_control child_wc = { /* FIXME: flags */ };
1760
1761         logfs_unpack_raw_index(ipage->index, &bix, &level);
1762         err = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level);
1763         if (err)
1764                 return err;
1765
1766         for (e = LOGFS_BLOCK_FACTOR - 1; e >= 0; e--) {
1767                 child_bix = bix + e * logfs_step(SUBLEVEL(level));
1768                 next_bix = child_bix + logfs_step(SUBLEVEL(level));
1769                 if (size > next_bix * LOGFS_BLOCKSIZE)
1770                         break;
1771
1772                 child_wc.ofs = pure_ofs(block_get_pointer(ipage, e));
1773                 if (!child_wc.ofs)
1774                         continue;
1775
1776                 page = logfs_get_write_page(inode, child_bix, SUBLEVEL(level));
1777                 if (!page)
1778                         return -ENOMEM;
1779
1780                 if ((__force u8)level > 1)
1781                         err = __logfs_truncate_rec(inode, page, &child_wc, size);
1782                 else
1783                         err = logfs_truncate_i0(inode, page, &child_wc, size);
1784                 logfs_put_write_page(page);
1785                 if (err)
1786                         return err;
1787
1788                 truncate_happened = 1;
1789                 alloc_indirect_block(inode, ipage, 0);
1790                 block_set_pointer(ipage, e, child_wc.ofs);
1791         }
1792
1793         if (!truncate_happened) {
1794                 printk("ineffectual truncate (%lx, %lx, %llx)\n", inode->i_ino, ipage->index, size);
1795                 return 0;
1796         }
1797
1798         this_wc->flags = WF_DELETE;
1799         if (logfs_block(ipage)->partial)
1800                 this_wc->flags |= WF_WRITE;
1801
1802         return logfs_write_i0(inode, ipage, this_wc);
1803 }
1804
1805 static int logfs_truncate_rec(struct inode *inode, u64 size)
1806 {
1807         struct logfs_inode *li = logfs_inode(inode);
1808         struct write_control wc = {
1809                 .ofs = li->li_data[INDIRECT_INDEX],
1810         };
1811         struct page *page;
1812         int err;
1813
1814         alloc_inode_block(inode);
1815
1816         if (!wc.ofs)
1817                 return 0;
1818
1819         page = logfs_get_write_page(inode, 0, LEVEL(li->li_height));
1820         if (!page)
1821                 return -ENOMEM;
1822
1823         err = __logfs_truncate_rec(inode, page, &wc, size);
1824         logfs_put_write_page(page);
1825         if (err)
1826                 return err;
1827
1828         if (li->li_data[INDIRECT_INDEX] != wc.ofs)
1829                 li->li_data[INDIRECT_INDEX] = wc.ofs;
1830         return 0;
1831 }
1832
1833 static int __logfs_truncate(struct inode *inode, u64 size)
1834 {
1835         int ret;
1836
1837         if (size >= logfs_factor(logfs_inode(inode)->li_height))
1838                 return 0;
1839
1840         ret = logfs_truncate_rec(inode, size);
1841         if (ret)
1842                 return ret;
1843
1844         return logfs_truncate_direct(inode, size);
1845 }
1846
/*
 * Truncate @inode to @size under the write lock, write the inode back,
 * then update the VFS view via vmtruncate().
 */
int logfs_truncate(struct inode *inode, u64 size)
{
        struct super_block *sb = inode->i_sb;
        int err;

        logfs_get_wblocks(sb, NULL, 1);
        err = __logfs_truncate(inode, size);
        if (!err)
                /* flags=0: the write lock is already held here */
                err = __logfs_write_inode(inode, 0);
        logfs_put_wblocks(sb, NULL, 1);

        if (!err)
                err = vmtruncate(inode, size);

        /* I don't trust error recovery yet. */
        WARN_ON(err);
        return err;
}
1865
/*
 * Transfer ownership of the logfs_block attached to @page to @inode's
 * li_block, switching the block to inode_block_ops and detaching it from
 * the page.  No-op if the page carries no block.
 */
static void move_page_to_inode(struct inode *inode, struct page *page)
{
        struct logfs_inode *li = logfs_inode(inode);
        struct logfs_block *block = logfs_block(page);

        if (!block)
                return;

        log_blockmove("move_page_to_inode(%llx, %llx, %x)\n",
                        block->ino, block->bix, block->level);
        /* The inode must not already own a block. */
        BUG_ON(li->li_block);
        block->ops = &inode_block_ops;
        block->inode = inode;
        li->li_block = block;

        /* Sever the page's reference to the block. */
        block->page = NULL;
        page->private = 0;
        ClearPagePrivate(page);
}
1885
/*
 * Inverse of move_page_to_inode(): hand @inode's li_block over to @page,
 * switching it to indirect_block_ops.  No-op if the inode owns no block.
 */
static void move_inode_to_page(struct page *page, struct inode *inode)
{
        struct logfs_inode *li = logfs_inode(inode);
        struct logfs_block *block = li->li_block;

        if (!block)
                return;

        log_blockmove("move_inode_to_page(%llx, %llx, %x)\n",
                        block->ino, block->bix, block->level);
        /* The page must not already carry a block. */
        BUG_ON(PagePrivate(page));
        block->ops = &indirect_block_ops;
        block->page = page;
        page->private = (unsigned long)block;
        SetPagePrivate(page);

        /* Sever the inode's reference to the block. */
        block->inode = NULL;
        li->li_block = NULL;
}
1905
/*
 * Read the on-medium inode for @inode->i_ino from the master inode
 * (ifile) and populate the VFS inode.  Returns -ENODATA if the inode was
 * never written, 0 on success, or a negative error from the page read.
 */
int logfs_read_inode(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        struct logfs_super *super = logfs_super(sb);
        struct inode *master_inode = super->s_master_inode;
        struct page *page;
        struct logfs_disk_inode *di;
        u64 ino = inode->i_ino;

        /* Beyond the ifile's size no inode can have been written yet. */
        if (ino << sb->s_blocksize_bits > i_size_read(master_inode))
                return -ENODATA;
        if (!logfs_exist_block(master_inode, ino))
                return -ENODATA;

        page = read_cache_page(master_inode->i_mapping, ino,
                        (filler_t *)logfs_readpage, NULL);
        if (IS_ERR(page))
                return PTR_ERR(page);

        di = kmap_atomic(page, KM_USER0);
        logfs_disk_to_inode(di, inode);
        kunmap_atomic(di, KM_USER0);
        /* Take over any alias block from the ifile page. */
        move_page_to_inode(inode, page);
        page_cache_release(page);
        return 0;
}
1932
/*
 * Convert @inode into its on-medium format inside the matching ifile
 * page and move the inode's alias block onto that page.  Returns the
 * locked write page or NULL on allocation failure.
 * Caller must logfs_put_write_page(page);
 */
static struct page *inode_to_page(struct inode *inode)
{
        struct inode *master_inode = logfs_super(inode->i_sb)->s_master_inode;
        struct logfs_disk_inode *di;
        struct page *page;

        /* The master inode is not itself stored in the ifile. */
        BUG_ON(inode->i_ino == LOGFS_INO_MASTER);

        page = logfs_get_write_page(master_inode, inode->i_ino, 0);
        if (!page)
                return NULL;

        di = kmap_atomic(page, KM_USER0);
        logfs_inode_to_disk(inode, di);
        kunmap_atomic(di, KM_USER0);
        move_inode_to_page(page, inode);
        return page;
}
1952
/* Cheaper version of write_inode.  All changes are concealed in
 * aliases, which are moved back.  No write to the medium happens.
 */
void logfs_clear_inode(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        struct logfs_inode *li = logfs_inode(inode);
        struct logfs_block *block = li->li_block;
        struct page *page;

        /* Only deleted files may be dirty at this point */
        BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink);
        if (!block)
                return;
        /* During shutdown aliases are simply dropped, not preserved. */
        if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) {
                block->ops->free_block(inode->i_sb, block);
                return;
        }

        /* Reserved inodes never reach this path with an alias block. */
        BUG_ON(inode->i_ino < LOGFS_RESERVED_INOS);
        page = inode_to_page(inode);
        BUG_ON(!page); /* FIXME: Use emergency page */
        logfs_put_write_page(page);
}
1977
1978 static int do_write_inode(struct inode *inode)
1979 {
1980         struct super_block *sb = inode->i_sb;
1981         struct inode *master_inode = logfs_super(sb)->s_master_inode;
1982         loff_t size = (inode->i_ino + 1) << inode->i_sb->s_blocksize_bits;
1983         struct page *page;
1984         int err;
1985
1986         BUG_ON(inode->i_ino == LOGFS_INO_MASTER);
1987         /* FIXME: lock inode */
1988
1989         if (i_size_read(master_inode) < size)
1990                 i_size_write(master_inode, size);
1991
1992         /* TODO: Tell vfs this inode is clean now */
1993
1994         page = inode_to_page(inode);
1995         if (!page)
1996                 return -ENOMEM;
1997
1998         /* FIXME: transaction is part of logfs_block now.  Is that enough? */
1999         err = logfs_write_buf(master_inode, page, 0);
2000         logfs_put_write_page(page);
2001         return err;
2002 }
2003
/*
 * Read-modify(-write) one segment entry in the segment file.
 *
 * @segno:      segment whose entry is accessed
 * @write:      nonzero if @change_se modifies the entry; the change is then
 *              recorded as an alias instead of being written immediately
 * @change_se:  callback applied to the mapped entry
 * @arg:        opaque argument passed through to @change_se
 */
static void logfs_mod_segment_entry(struct super_block *sb, u32 segno,
                int write,
                void (*change_se)(struct logfs_segment_entry *, long),
                long arg)
{
        struct logfs_super *super = logfs_super(sb);
        struct inode *inode;
        struct page *page;
        struct logfs_segment_entry *se;
        pgoff_t page_no;
        int child_no;

        /* Entries are 8 bytes, so one block holds s_blocksize/8 of them. */
        page_no = segno >> (sb->s_blocksize_bits - 3);
        child_no = segno & ((sb->s_blocksize >> 3) - 1);

        inode = super->s_segfile_inode;
        page = logfs_get_write_page(inode, page_no, 0);
        BUG_ON(!page); /* FIXME: We need some reserve page for this case */
        if (!PageUptodate(page))
                logfs_read_block(inode, page, WRITE);

        if (write)
                alloc_indirect_block(inode, page, 0);
        se = kmap_atomic(page, KM_USER0);
        change_se(se + child_no, arg);
        if (write) {
                /* Remember the modified slot as an alias for later writeout. */
                logfs_set_alias(sb, logfs_block(page), child_no);
                BUG_ON((int)be32_to_cpu(se[child_no].valid) > super->s_segsize);
        }
        kunmap_atomic(se, KM_USER0);

        logfs_put_write_page(page);
}
2037
2038 static void __get_segment_entry(struct logfs_segment_entry *se, long _target)
2039 {
2040         struct logfs_segment_entry *target = (void *)_target;
2041
2042         *target = *se;
2043 }
2044
/* Read the segment entry for @segno into @se (no modification). */
void logfs_get_segment_entry(struct super_block *sb, u32 segno,
                struct logfs_segment_entry *se)
{
        logfs_mod_segment_entry(sb, segno, 0, __get_segment_entry, (long)se);
}
2050
2051 static void __set_segment_used(struct logfs_segment_entry *se, long increment)
2052 {
2053         u32 valid;
2054
2055         valid = be32_to_cpu(se->valid);
2056         valid += increment;
2057         se->valid = cpu_to_be32(valid);
2058 }
2059
2060 void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment)
2061 {
2062         struct logfs_super *super = logfs_super(sb);
2063         u32 segno = ofs >> super->s_segshift;
2064
2065         if (!increment)
2066                 return;
2067
2068         logfs_mod_segment_entry(sb, segno, 1, __set_segment_used, increment);
2069 }
2070
/* Callback: store the packed erase-count/gc-level word in an entry. */
static void __set_segment_erased(struct logfs_segment_entry *se, long ec_level)
{
        se->ec_level = cpu_to_be32(ec_level);
}
2075
/*
 * Record that @segno was erased: pack the erase count into the upper 28
 * bits and the gc level into the lower 4 bits of ec_level.
 */
void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec,
                gc_level_t gc_level)
{
        u32 ec_level = ec << 4 | (__force u8)gc_level;

        logfs_mod_segment_entry(sb, segno, 1, __set_segment_erased, ec_level);
}
2083
/* Callback: mark an entry as reserved via the RESERVED sentinel. */
static void __set_segment_reserved(struct logfs_segment_entry *se, long ignore)
{
        se->valid = cpu_to_be32(RESERVED);
}
2088
/* Mark segment @segno as reserved in the segment file. */
void logfs_set_segment_reserved(struct super_block *sb, u32 segno)
{
        logfs_mod_segment_entry(sb, segno, 1, __set_segment_reserved, 0);
}
2093
/* Callback: clear the valid count and set the new erase-count word. */
static void __set_segment_unreserved(struct logfs_segment_entry *se,
                long ec_level)
{
        se->valid = 0;
        se->ec_level = cpu_to_be32(ec_level);
}
2100
2101 void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec)
2102 {
2103         u32 ec_level = ec << 4;
2104
2105         logfs_mod_segment_entry(sb, segno, 1, __set_segment_unreserved,
2106                         ec_level);
2107 }
2108
/*
 * Write @inode back to the ifile.  Takes the write lock only when
 * WF_LOCK is set in @flags; callers already holding it pass 0.
 */
int __logfs_write_inode(struct inode *inode, long flags)
{
        struct super_block *sb = inode->i_sb;
        int ret;

        logfs_get_wblocks(sb, NULL, flags & WF_LOCK);
        ret = do_write_inode(inode);
        logfs_put_wblocks(sb, NULL, flags & WF_LOCK);
        return ret;
}
2119
/*
 * Delete @inode's block from the master inode (ifile).  Moves the alias
 * block onto the ifile page first so pending changes are not lost.
 */
static int do_delete_inode(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        struct inode *master_inode = logfs_super(sb)->s_master_inode;
        struct page *page;
        int ret;

        page = logfs_get_write_page(master_inode, inode->i_ino, 0);
        if (!page)
                return -ENOMEM;

        move_inode_to_page(page, inode);

        /* Deletion happens under the write lock, pinned to this page. */
        logfs_get_wblocks(sb, page, 1);
        ret = __logfs_delete(master_inode, page);
        logfs_put_wblocks(sb, page, 1);

        logfs_put_write_page(page);
        return ret;
}
2140
2141 /*
2142  * ZOMBIE inodes have already been deleted before and should remain dead,
2143  * if it weren't for valid checking.  No need to kill them again here.
2144  */
void logfs_delete_inode(struct inode *inode)
{
        struct logfs_inode *li = logfs_inode(inode);

        if (!(li->li_flags & LOGFS_IF_ZOMBIE)) {
                /* Mark as zombie first so it is only killed once. */
                li->li_flags |= LOGFS_IF_ZOMBIE;
                if (i_size_read(inode) > 0)
                        logfs_truncate(inode, 0);
                do_delete_inode(inode);
        }
        truncate_inode_pages(&inode->i_data, 0);
        clear_inode(inode);
}
2158
/*
 * Write out one aliased block tracked in the btree: look up its inode,
 * re-read the block's page and push it to the medium.  The BUG_ONs pin
 * the invariant that the page still carries exactly this block before
 * the write and no block afterwards.
 */
void btree_write_block(struct logfs_block *block)
{
        struct inode *inode;
        struct page *page;
        int err, cookie;

        inode = logfs_safe_iget(block->sb, block->ino, &cookie);
        page = logfs_get_write_page(inode, block->bix, block->level);

        err = logfs_readpage_nolock(page);
        BUG_ON(err);
        BUG_ON(!PagePrivate(page));
        BUG_ON(logfs_block(page) != block);
        err = __logfs_write_buf(inode, page, 0);
        BUG_ON(err);
        BUG_ON(PagePrivate(page) || page->private);

        logfs_put_write_page(page);
        logfs_safe_iput(inode, cookie);
}
2179
2180 /**
2181  * logfs_inode_write - write inode or dentry objects
2182  *
2183  * @inode:              parent inode (ifile or directory)
2184  * @buf:                object to write (inode or dentry)
2185  * @n:                  object size
2186  * @_pos:               object number (file position in blocks/objects)
2187  * @flags:              write flags
2188  * @lock:               0 if write lock is already taken, 1 otherwise
2189  * @shadow_tree:        shadow below this inode
2190  *
2191  * FIXME: All caller of this put a 200-300 byte variable on the stack,
2192  * only to call here and do a memcpy from that stack variable.  A good
2193  * example of wasted performance and stack space.
2194  */
2195 int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
2196                 loff_t bix, long flags, struct shadow_tree *shadow_tree)
2197 {
2198         loff_t pos = bix << inode->i_sb->s_blocksize_bits;
2199         int err;
2200         struct page *page;
2201         void *pagebuf;
2202
2203         BUG_ON(pos & (LOGFS_BLOCKSIZE-1));
2204         BUG_ON(count > LOGFS_BLOCKSIZE);
2205         page = logfs_get_write_page(inode, bix, 0);
2206         if (!page)
2207                 return -ENOMEM;
2208
2209         pagebuf = kmap_atomic(page, KM_USER0);
2210         memcpy(pagebuf, buf, count);
2211         flush_dcache_page(page);
2212         kunmap_atomic(pagebuf, KM_USER0);
2213
2214         if (i_size_read(inode) < pos + LOGFS_BLOCKSIZE)
2215                 i_size_write(inode, pos + LOGFS_BLOCKSIZE);
2216
2217         err = logfs_write_buf(inode, page, flags);
2218         logfs_put_write_page(page);
2219         return err;
2220 }
2221
2222 int logfs_open_segfile(struct super_block *sb)
2223 {
2224         struct logfs_super *super = logfs_super(sb);
2225         struct inode *inode;
2226
2227         inode = logfs_read_meta_inode(sb, LOGFS_INO_SEGFILE);
2228         if (IS_ERR(inode))
2229                 return PTR_ERR(inode);
2230         super->s_segfile_inode = inode;
2231         return 0;
2232 }
2233
2234 int logfs_init_rw(struct super_block *sb)
2235 {
2236         struct logfs_super *super = logfs_super(sb);
2237         int min_fill = 3 * super->s_no_blocks;
2238
2239         INIT_LIST_HEAD(&super->s_object_alias);
2240         mutex_init(&super->s_write_mutex);
2241         super->s_block_pool = mempool_create_kmalloc_pool(min_fill,
2242                         sizeof(struct logfs_block));
2243         super->s_shadow_pool = mempool_create_kmalloc_pool(min_fill,
2244                         sizeof(struct logfs_shadow));
2245         return 0;
2246 }
2247
/*
 * Tear down what logfs_init_rw() and logfs_open_segfile() set up.  The
 * NULL checks matter: the pools may never have been created if mounting
 * failed early, and mempool_destroy() does not accept NULL here.
 */
void logfs_cleanup_rw(struct super_block *sb)
{
        struct logfs_super *super = logfs_super(sb);

        destroy_meta_inode(super->s_segfile_inode);
        if (super->s_block_pool)
                mempool_destroy(super->s_block_pool);
        if (super->s_shadow_pool)
                mempool_destroy(super->s_shadow_pool);
}