xfs: avoid repeated pointer dereferences
[linux-2.6.git] / fs / xfs / xfs_log_recover.c
1 /*
2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18 #include "xfs.h"
19 #include "xfs_fs.h"
20 #include "xfs_types.h"
21 #include "xfs_bit.h"
22 #include "xfs_log.h"
23 #include "xfs_inum.h"
24 #include "xfs_trans.h"
25 #include "xfs_sb.h"
26 #include "xfs_ag.h"
27 #include "xfs_dir2.h"
28 #include "xfs_dmapi.h"
29 #include "xfs_mount.h"
30 #include "xfs_error.h"
31 #include "xfs_bmap_btree.h"
32 #include "xfs_alloc_btree.h"
33 #include "xfs_ialloc_btree.h"
34 #include "xfs_dir2_sf.h"
35 #include "xfs_attr_sf.h"
36 #include "xfs_dinode.h"
37 #include "xfs_inode.h"
38 #include "xfs_inode_item.h"
39 #include "xfs_alloc.h"
40 #include "xfs_ialloc.h"
41 #include "xfs_log_priv.h"
42 #include "xfs_buf_item.h"
43 #include "xfs_log_recover.h"
44 #include "xfs_extfree_item.h"
45 #include "xfs_trans_priv.h"
46 #include "xfs_quota.h"
47 #include "xfs_rw.h"
48 #include "xfs_utils.h"
49 #include "xfs_trace.h"
50
/* Forward declarations of file-local (STATIC) routines. */
51 STATIC int      xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
52 STATIC int      xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
/*
 * xlog_recover_check_summary() is only a real function when DEBUG is
 * defined; otherwise the macro below makes every call expand to nothing.
 */
53 #if defined(DEBUG)
54 STATIC void     xlog_recover_check_summary(xlog_t *);
55 #else
56 #define xlog_recover_check_summary(log)
57 #endif
58
59 /*
60  * Sector aligned buffer routines for buffer create/read/write/access
61  */
62
63 /* Number of basic blocks in a log sector: 1 shifted left by l_sectbb_log */
64 #define xlog_sectbb(log) (1 << (log)->l_sectbb_log)
65
66 /*
67  * Verify the given count of basic blocks is valid number of blocks
68  * to specify for an operation involving the given XFS log buffer.
69  * Returns nonzero if the count is valid, 0 otherwise.
70  */
71
72 static inline int
73 xlog_buf_bbcount_valid(
74         xlog_t          *log,
75         int             bbcount)
76 {
77         return bbcount > 0 && bbcount <= log->l_logBBsize;
78 }
79
80 /*
81  * Allocate a buffer to hold log data.  The buffer needs to be able
82  * to map to a range of nbblks basic blocks at any valid (basic
83  * block) offset within the log.
84  */
85 STATIC xfs_buf_t *
86 xlog_get_bp(
87         xlog_t          *log,
88         int             nbblks)
89 {
90         if (!xlog_buf_bbcount_valid(log, nbblks)) {
91                 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
92                         nbblks);
93                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
94                 return NULL;
95         }
96
97         /*
98          * We do log I/O in units of log sectors (a power-of-2
99          * multiple of the basic block size), so we round up the
100          * requested size to acommodate the basic blocks required
101          * for complete log sectors.
102          *
103          * In addition, the buffer may be used for a non-sector-
104          * aligned block offset, in which case an I/O of the
105          * requested size could extend beyond the end of the
106          * buffer.  If the requested size is only 1 basic block it
107          * will never straddle a sector boundary, so this won't be
108          * an issue.  Nor will this be a problem if the log I/O is
109          * done in basic blocks (sector size 1).  But otherwise we
110          * extend the buffer by one extra log sector to ensure
111          * there's space to accomodate this possiblility.
112          */
113         if (nbblks > 1 && log->l_sectbb_log)
114                 nbblks += xlog_sectbb(log);
115         nbblks = round_up(nbblks, xlog_sectbb(log));
116
117         return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
118 }
119
/*
 * Release a log buffer by handing it to xfs_buf_free().  The routines
 * in this file use it to dispose of buffers obtained from xlog_get_bp().
 */
120 STATIC void
121 xlog_put_bp(
122         xfs_buf_t       *bp)
123 {
124         xfs_buf_free(bp);
125 }
126
127 STATIC xfs_caddr_t
128 xlog_align(
129         xlog_t          *log,
130         xfs_daddr_t     blk_no,
131         int             nbblks,
132         xfs_buf_t       *bp)
133 {
134         xfs_caddr_t     ptr;
135
136         if (!log->l_sectbb_log)
137                 return XFS_BUF_PTR(bp);
138
139         ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
140         ASSERT(XFS_BUF_SIZE(bp) >=
141                 BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
142         return ptr;
143 }
144
145
146 /*
147  * nbblks should be uint, but oh well.  Just want to catch that 32-bit length.
148  */
149 STATIC int
150 xlog_bread_noalign(
151         xlog_t          *log,
152         xfs_daddr_t     blk_no,
153         int             nbblks,
154         xfs_buf_t       *bp)
155 {
156         int             error;
157
158         if (!xlog_buf_bbcount_valid(log, nbblks)) {
159                 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
160                         nbblks);
161                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
162                 return EFSCORRUPTED;
163         }
164
165         blk_no = round_down(blk_no, xlog_sectbb(log));
166         nbblks = round_up(nbblks, xlog_sectbb(log));
167
168         ASSERT(nbblks > 0);
169         ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
170         ASSERT(bp);
171
172         XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
173         XFS_BUF_READ(bp);
174         XFS_BUF_BUSY(bp);
175         XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
176         XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
177
178         xfsbdstrat(log->l_mp, bp);
179         error = xfs_iowait(bp);
180         if (error)
181                 xfs_ioerror_alert("xlog_bread", log->l_mp,
182                                   bp, XFS_BUF_ADDR(bp));
183         return error;
184 }
185
186 STATIC int
187 xlog_bread(
188         xlog_t          *log,
189         xfs_daddr_t     blk_no,
190         int             nbblks,
191         xfs_buf_t       *bp,
192         xfs_caddr_t     *offset)
193 {
194         int             error;
195
196         error = xlog_bread_noalign(log, blk_no, nbblks, bp);
197         if (error)
198                 return error;
199
200         *offset = xlog_align(log, blk_no, nbblks, bp);
201         return 0;
202 }
203
204 /*
205  * Write out the buffer at the given block for the given number of blocks.
206  * The buffer is kept locked across the write and is returned locked.
207  * This can only be used for synchronous log writes.
208  */
209 STATIC int
210 xlog_bwrite(
211         xlog_t          *log,
212         xfs_daddr_t     blk_no,
213         int             nbblks,
214         xfs_buf_t       *bp)
215 {
216         int             error;
217
218         if (!xlog_buf_bbcount_valid(log, nbblks)) {
219                 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
220                         nbblks);
221                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
222                 return EFSCORRUPTED;
223         }
224
225         blk_no = round_down(blk_no, xlog_sectbb(log));
226         nbblks = round_up(nbblks, xlog_sectbb(log));
227
228         ASSERT(nbblks > 0);
229         ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
230
231         XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
232         XFS_BUF_ZEROFLAGS(bp);
233         XFS_BUF_BUSY(bp);
234         XFS_BUF_HOLD(bp);
235         XFS_BUF_PSEMA(bp, PRIBIO);
236         XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
237         XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
238
239         if ((error = xfs_bwrite(log->l_mp, bp)))
240                 xfs_ioerror_alert("xlog_bwrite", log->l_mp,
241                                   bp, XFS_BUF_ADDR(bp));
242         return error;
243 }
244
245 #ifdef DEBUG
246 /*
247  * dump debug superblock and log record information
 *
 * Emits two CE_DEBUG messages: the superblock uuid from the mount
 * together with XLOG_FMT, then the uuid and format (h_fmt, converted
 * from big-endian) recorded in the log record header.
 *
 * When DEBUG is not defined the macro in the #else branch makes calls
 * to this routine expand to nothing.
248  */
249 STATIC void
250 xlog_header_check_dump(
251         xfs_mount_t             *mp,
252         xlog_rec_header_t       *head)
253 {
254         cmn_err(CE_DEBUG, "%s:  SB : uuid = %pU, fmt = %d\n",
255                 __func__, &mp->m_sb.sb_uuid, XLOG_FMT);
256         cmn_err(CE_DEBUG, "    log : uuid = %pU, fmt = %d\n",
257                 &head->h_fs_uuid, be32_to_cpu(head->h_fmt));
258 }
259 #else
260 #define xlog_header_check_dump(mp, head)
261 #endif
262
263 /*
264  * check log record header for recovery
265  */
266 STATIC int
267 xlog_header_check_recover(
268         xfs_mount_t             *mp,
269         xlog_rec_header_t       *head)
270 {
271         ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM);
272
273         /*
274          * IRIX doesn't write the h_fmt field and leaves it zeroed
275          * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
276          * a dirty log created in IRIX.
277          */
278         if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) {
279                 xlog_warn(
280         "XFS: dirty log written in incompatible format - can't recover");
281                 xlog_header_check_dump(mp, head);
282                 XFS_ERROR_REPORT("xlog_header_check_recover(1)",
283                                  XFS_ERRLEVEL_HIGH, mp);
284                 return XFS_ERROR(EFSCORRUPTED);
285         } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
286                 xlog_warn(
287         "XFS: dirty log entry has mismatched uuid - can't recover");
288                 xlog_header_check_dump(mp, head);
289                 XFS_ERROR_REPORT("xlog_header_check_recover(2)",
290                                  XFS_ERRLEVEL_HIGH, mp);
291                 return XFS_ERROR(EFSCORRUPTED);
292         }
293         return 0;
294 }
295
296 /*
297  * read the head block of the log and check the header
298  */
299 STATIC int
300 xlog_header_check_mount(
301         xfs_mount_t             *mp,
302         xlog_rec_header_t       *head)
303 {
304         ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM);
305
306         if (uuid_is_nil(&head->h_fs_uuid)) {
307                 /*
308                  * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
309                  * h_fs_uuid is nil, we assume this log was last mounted
310                  * by IRIX and continue.
311                  */
312                 xlog_warn("XFS: nil uuid in log - IRIX style log");
313         } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
314                 xlog_warn("XFS: log has mismatched uuid - can't recover");
315                 xlog_header_check_dump(mp, head);
316                 XFS_ERROR_REPORT("xlog_header_check_mount",
317                                  XFS_ERRLEVEL_HIGH, mp);
318                 return XFS_ERROR(EFSCORRUPTED);
319         }
320         return 0;
321 }
322
323 STATIC void
324 xlog_recover_iodone(
325         struct xfs_buf  *bp)
326 {
327         if (XFS_BUF_GETERROR(bp)) {
328                 /*
329                  * We're not going to bother about retrying
330                  * this during recovery. One strike!
331                  */
332                 xfs_ioerror_alert("xlog_recover_iodone",
333                                   bp->b_mount, bp, XFS_BUF_ADDR(bp));
334                 xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
335         }
336         bp->b_mount = NULL;
337         XFS_BUF_CLR_IODONE_FUNC(bp);
338         xfs_biodone(bp);
339 }
340
341 /*
342  * This routine finds (to an approximation) the first block in the physical
343  * log which contains the given cycle.  It uses a binary search algorithm.
344  * Note that the algorithm can not be perfect because the disk will not
345  * necessarily be perfect.
346  */
347 STATIC int
348 xlog_find_cycle_start(
349         xlog_t          *log,
350         xfs_buf_t       *bp,
351         xfs_daddr_t     first_blk,
352         xfs_daddr_t     *last_blk,
353         uint            cycle)
354 {
355         xfs_caddr_t     offset;
356         xfs_daddr_t     mid_blk;
357         xfs_daddr_t     end_blk;
358         uint            mid_cycle;
359         int             error;
360
361         end_blk = *last_blk;
362         mid_blk = BLK_AVG(first_blk, end_blk);
363         while (mid_blk != first_blk && mid_blk != end_blk) {
364                 error = xlog_bread(log, mid_blk, 1, bp, &offset);
365                 if (error)
366                         return error;
367                 mid_cycle = xlog_get_cycle(offset);
368                 if (mid_cycle == cycle)
369                         end_blk = mid_blk;   /* last_half_cycle == mid_cycle */
370                 else
371                         first_blk = mid_blk; /* first_half_cycle == mid_cycle */
372                 mid_blk = BLK_AVG(first_blk, end_blk);
373         }
374         ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
375                (mid_blk == end_blk && mid_blk-1 == first_blk));
376
377         *last_blk = end_blk;
378
379         return 0;
380 }
381
382 /*
383  * Check that the range of blocks does not contain the cycle number
384  * given.  The scan needs to occur from front to back and the ptr into the
385  * region must be updated since a later routine will need to perform another
386  * test.  If the region is completely good, we end up returning the same
387  * last block number.
388  *
389  * Set blkno to -1 if we encounter no errors.  This is an invalid block number
390  * since we don't ever expect logs to get this large.
391  */
392 STATIC int
393 xlog_find_verify_cycle(
394         xlog_t          *log,
395         xfs_daddr_t     start_blk,
396         int             nbblks,
397         uint            stop_on_cycle_no,
398         xfs_daddr_t     *new_blk)
399 {
400         xfs_daddr_t     i, j;
401         uint            cycle;
402         xfs_buf_t       *bp;
403         xfs_daddr_t     bufblks;
404         xfs_caddr_t     buf = NULL;
405         int             error = 0;
406
407         /*
408          * Greedily allocate a buffer big enough to handle the full
409          * range of basic blocks we'll be examining.  If that fails,
410          * try a smaller size.  We need to be able to read at least
411          * a log sector, or we're out of luck.
412          */
413         bufblks = 1 << ffs(nbblks);
414         while (!(bp = xlog_get_bp(log, bufblks))) {
415                 bufblks >>= 1;
416                 if (bufblks < xlog_sectbb(log))
417                         return ENOMEM;
418         }
419
420         for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
421                 int     bcount;
422
423                 bcount = min(bufblks, (start_blk + nbblks - i));
424
425                 error = xlog_bread(log, i, bcount, bp, &buf);
426                 if (error)
427                         goto out;
428
429                 for (j = 0; j < bcount; j++) {
430                         cycle = xlog_get_cycle(buf);
431                         if (cycle == stop_on_cycle_no) {
432                                 *new_blk = i+j;
433                                 goto out;
434                         }
435
436                         buf += BBSIZE;
437                 }
438         }
439
440         *new_blk = -1;
441
442 out:
443         xlog_put_bp(bp);
444         return error;
445 }
446
447 /*
448  * Potentially backup over partial log record write.
449  *
450  * In the typical case, last_blk is the number of the block directly after
451  * a good log record.  Therefore, we subtract one to get the block number
452  * of the last block in the given buffer.  extra_bblks contains the number
453  * of blocks we would have read on a previous read.  This happens when the
454  * last log record is split over the end of the physical log.
455  *
456  * extra_bblks is the number of blocks potentially verified on a previous
457  * call to this routine.
458  */
459 STATIC int
460 xlog_find_verify_log_record(
461         xlog_t                  *log,
462         xfs_daddr_t             start_blk,
463         xfs_daddr_t             *last_blk,
464         int                     extra_bblks)
465 {
466         xfs_daddr_t             i;
467         xfs_buf_t               *bp;
468         xfs_caddr_t             offset = NULL;
469         xlog_rec_header_t       *head = NULL;
470         int                     error = 0;
471         int                     smallmem = 0;
472         int                     num_blks = *last_blk - start_blk;
473         int                     xhdrs;
474
475         ASSERT(start_blk != 0 || *last_blk != start_blk);
476
477         if (!(bp = xlog_get_bp(log, num_blks))) {
478                 if (!(bp = xlog_get_bp(log, 1)))
479                         return ENOMEM;
480                 smallmem = 1;
481         } else {
482                 error = xlog_bread(log, start_blk, num_blks, bp, &offset);
483                 if (error)
484                         goto out;
485                 offset += ((num_blks - 1) << BBSHIFT);
486         }
487
488         for (i = (*last_blk) - 1; i >= 0; i--) {
489                 if (i < start_blk) {
490                         /* valid log record not found */
491                         xlog_warn(
492                 "XFS: Log inconsistent (didn't find previous header)");
493                         ASSERT(0);
494                         error = XFS_ERROR(EIO);
495                         goto out;
496                 }
497
498                 if (smallmem) {
499                         error = xlog_bread(log, i, 1, bp, &offset);
500                         if (error)
501                                 goto out;
502                 }
503
504                 head = (xlog_rec_header_t *)offset;
505
506                 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(head->h_magicno))
507                         break;
508
509                 if (!smallmem)
510                         offset -= BBSIZE;
511         }
512
513         /*
514          * We hit the beginning of the physical log & still no header.  Return
515          * to caller.  If caller can handle a return of -1, then this routine
516          * will be called again for the end of the physical log.
517          */
518         if (i == -1) {
519                 error = -1;
520                 goto out;
521         }
522
523         /*
524          * We have the final block of the good log (the first block
525          * of the log record _before_ the head. So we check the uuid.
526          */
527         if ((error = xlog_header_check_mount(log->l_mp, head)))
528                 goto out;
529
530         /*
531          * We may have found a log record header before we expected one.
532          * last_blk will be the 1st block # with a given cycle #.  We may end
533          * up reading an entire log record.  In this case, we don't want to
534          * reset last_blk.  Only when last_blk points in the middle of a log
535          * record do we update last_blk.
536          */
537         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
538                 uint    h_size = be32_to_cpu(head->h_size);
539
540                 xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
541                 if (h_size % XLOG_HEADER_CYCLE_SIZE)
542                         xhdrs++;
543         } else {
544                 xhdrs = 1;
545         }
546
547         if (*last_blk - i + extra_bblks !=
548             BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
549                 *last_blk = i;
550
551 out:
552         xlog_put_bp(bp);
553         return error;
554 }
555
556 /*
557  * Head is defined to be the point of the log where the next log write
558  * could go.  This means that incomplete LR writes at the end are
559  * eliminated when calculating the head.  We aren't guaranteed that previous
560  * LR have complete transactions.  We only know that a cycle number of
561  * current cycle number -1 won't be present in the log if we start writing
562  * from our current block number.
563  *
564  * last_blk contains the block number of the first block with a given
565  * cycle number.
566  *
 * On success the head block is stored in *return_head_blk; a computed
 * head equal to the log size (l_logBBsize) is reported as block 0.
 *
567  * Return: zero if normal, non-zero if error.
 * ENOMEM is returned when the one-block scratch buffer cannot be
 * allocated, and a -1 from the final log record verification is
 * converted to EIO.
568  */
569 STATIC int
570 xlog_find_head(
571         xlog_t          *log,
572         xfs_daddr_t     *return_head_blk)
573 {
574         xfs_buf_t       *bp;
575         xfs_caddr_t     offset;
576         xfs_daddr_t     new_blk, first_blk, start_blk, last_blk, head_blk;
577         int             num_scan_bblks;
578         uint            first_half_cycle, last_half_cycle;
579         uint            stop_on_cycle;
580         int             error, log_bbnum = log->l_logBBsize;
581
582         /* Is the end of the log device zeroed? */
583         if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
584                 *return_head_blk = first_blk;
585
586                 /* Is the whole lot zeroed? */
587                 if (!first_blk) {
588                         /* Linux XFS shouldn't generate totally zeroed logs -
589                          * mkfs etc write a dummy unmount record to a fresh
590                          * log so we can store the uuid in there
591                          */
592                         xlog_warn("XFS: totally zeroed log");
593                 }
594
595                 return 0;
596         } else if (error) {
597                 xlog_warn("XFS: empty log check failed");
598                 return error;
599         }
600
601         first_blk = 0;                  /* get cycle # of 1st block */
602         bp = xlog_get_bp(log, 1);
603         if (!bp)
604                 return ENOMEM;
605
606         error = xlog_bread(log, 0, 1, bp, &offset);
607         if (error)
608                 goto bp_err;
609
610         first_half_cycle = xlog_get_cycle(offset);
611
612         last_blk = head_blk = log_bbnum - 1;    /* get cycle # of last block */
613         error = xlog_bread(log, last_blk, 1, bp, &offset);
614         if (error)
615                 goto bp_err;
616
617         last_half_cycle = xlog_get_cycle(offset);
618         ASSERT(last_half_cycle != 0);
619
620         /*
621          * If the 1st half cycle number is equal to the last half cycle number,
622          * then the entire log is stamped with the same cycle number.  In this
623          * case, head_blk can't be set to zero (which makes sense).  The below
624          * math doesn't work out properly with head_blk equal to zero.  Instead,
625          * we set it to log_bbnum which is an invalid block number, but this
626          * value makes the math correct.  If head_blk doesn't change through
627          * all the tests below, *head_blk is set to zero at the very end rather
628          * than log_bbnum.  In a sense, log_bbnum and zero are the same block
629          * in a circular file.
630          */
631         if (first_half_cycle == last_half_cycle) {
632                 /*
633                  * In this case we believe that the entire log should have
634                  * cycle number last_half_cycle.  We need to scan backwards
635                  * from the end verifying that there are no holes still
636                  * containing last_half_cycle - 1.  If we find such a hole,
637                  * then the start of that hole will be the new head.  The
638                  * simple case looks like
639                  *        x | x ... | x - 1 | x
640                  * Another case that fits this picture would be
641                  *        x | x + 1 | x ... | x
642                  * In this case the head really is somewhere at the end of the
643                  * log, as one of the latest writes at the beginning was
644                  * incomplete.
645                  * One more case is
646                  *        x | x + 1 | x ... | x - 1 | x
647                  * This is really the combination of the above two cases, and
648                  * the head has to end up at the start of the x-1 hole at the
649                  * end of the log.
650                  *
651                  * In the 256k log case, we will read from the beginning to the
652                  * end of the log and search for cycle numbers equal to x-1.
653                  * We don't worry about the x+1 blocks that we encounter,
654                  * because we know that they cannot be the head since the log
655                  * started with x.
656                  */
657                 head_blk = log_bbnum;
658                 stop_on_cycle = last_half_cycle - 1;
659         } else {
660                 /*
661                  * In this case we want to find the first block with cycle
662                  * number matching last_half_cycle.  We expect the log to be
663                  * some variation on
664                  *        x + 1 ... | x ...
665                  * The first block with cycle number x (last_half_cycle) will
666                  * be where the new head belongs.  First we do a binary search
667                  * for the first occurrence of last_half_cycle.  The binary
668                  * search may not be totally accurate, so then we scan back
669                  * from there looking for occurrences of last_half_cycle before
670                  * us.  If that backwards scan wraps around the beginning of
671                  * the log, then we look for occurrences of last_half_cycle - 1
672                  * at the end of the log.  The cases we're looking for look
673                  * like
674                  *        x + 1 ... | x | x + 1 | x ...
675                  *                               ^ binary search stopped here
676                  * or
677                  *        x + 1 ... | x ... | x - 1 | x
678                  *        <---------> less than scan distance
679                  */
680                 stop_on_cycle = last_half_cycle;
681                 if ((error = xlog_find_cycle_start(log, bp, first_blk,
682                                                 &head_blk, last_half_cycle)))
683                         goto bp_err;
684         }
685
686         /*
687          * Now validate the answer.  Scan back some number of maximum possible
688          * blocks and make sure each one has the expected cycle number.  The
689          * maximum is determined by the total possible amount of buffering
690          * in the in-core log.  The following number can be made tighter if
691          * we actually look at the block size of the filesystem.
692          */
693         num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
694         if (head_blk >= num_scan_bblks) {
695                 /*
696                  * We are guaranteed that the entire check can be performed
697                  * in one buffer.
698                  */
699                 start_blk = head_blk - num_scan_bblks;
700                 if ((error = xlog_find_verify_cycle(log,
701                                                 start_blk, num_scan_bblks,
702                                                 stop_on_cycle, &new_blk)))
703                         goto bp_err;
704                 if (new_blk != -1)
705                         head_blk = new_blk;
706         } else {                /* need to read 2 parts of log */
707                 /*
708                  * We are going to scan backwards in the log in two parts.
709                  * First we scan the physical end of the log.  In this part
710                  * of the log, we are looking for blocks with cycle number
711                  * last_half_cycle - 1.
712                  * If we find one, then we know that the log starts there, as
713                  * we've found a hole that didn't get written in going around
714                  * the end of the physical log.  The simple case for this is
715                  *        x + 1 ... | x ... | x - 1 | x
716                  *        <---------> less than scan distance
717                  * If all of the blocks at the end of the log have cycle number
718                  * last_half_cycle, then we check the blocks at the start of
719                  * the log looking for occurrences of last_half_cycle.  If we
720                  * find one, then our current estimate for the location of the
721                  * first occurrence of last_half_cycle is wrong and we move
722                  * back to the hole we've found.  This case looks like
723                  *        x + 1 ... | x | x + 1 | x ...
724                  *                               ^ binary search stopped here
725                  * Another case we need to handle that only occurs in 256k
726                  * logs is
727                  *        x + 1 ... | x ... | x+1 | x ...
728                  *                   ^ binary search stops here
729                  * In a 256k log, the scan at the end of the log will see the
730                  * x + 1 blocks.  We need to skip past those since that is
731                  * certainly not the head of the log.  By searching for
732                  * last_half_cycle-1 we accomplish that.
733                  */
734                 start_blk = log_bbnum - num_scan_bblks + head_blk;
735                 ASSERT(head_blk <= INT_MAX &&
736                         (xfs_daddr_t) num_scan_bblks - head_blk >= 0);
737                 if ((error = xlog_find_verify_cycle(log, start_blk,
738                                         num_scan_bblks - (int)head_blk,
739                                         (stop_on_cycle - 1), &new_blk)))
740                         goto bp_err;
741                 if (new_blk != -1) {
742                         head_blk = new_blk;
743                         goto validate_head;
744                 }
745
746                 /*
747                  * Scan beginning of log now.  The last part of the physical
748                  * log is good.  This scan needs to verify that it doesn't find
749                  * the last_half_cycle.
750                  */
751                 start_blk = 0;
752                 ASSERT(head_blk <= INT_MAX);
753                 if ((error = xlog_find_verify_cycle(log,
754                                         start_blk, (int)head_blk,
755                                         stop_on_cycle, &new_blk)))
756                         goto bp_err;
757                 if (new_blk != -1)
758                         head_blk = new_blk;
759         }
760
761 validate_head:
762         /*
763          * Now we need to make sure head_blk is not pointing to a block in
764          * the middle of a log record.
765          */
766         num_scan_bblks = XLOG_REC_SHIFT(log);
767         if (head_blk >= num_scan_bblks) {
768                 start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
769
770                 /* start ptr at last block ptr before head_blk */
771                 if ((error = xlog_find_verify_log_record(log, start_blk,
772                                                         &head_blk, 0)) == -1) {
773                         error = XFS_ERROR(EIO);
774                         goto bp_err;
775                 } else if (error)
776                         goto bp_err;
777         } else {
778                 start_blk = 0;
779                 ASSERT(head_blk <= INT_MAX);
780                 if ((error = xlog_find_verify_log_record(log, start_blk,
781                                                         &head_blk, 0)) == -1) {
782                         /* We hit the beginning of the log during our search */
783                         start_blk = log_bbnum - num_scan_bblks + head_blk;
784                         new_blk = log_bbnum;
785                         ASSERT(start_blk <= INT_MAX &&
786                                 (xfs_daddr_t) log_bbnum-start_blk >= 0);
787                         ASSERT(head_blk <= INT_MAX);
788                         if ((error = xlog_find_verify_log_record(log,
789                                                         start_blk, &new_blk,
790                                                         (int)head_blk)) == -1) {
791                                 error = XFS_ERROR(EIO);
792                                 goto bp_err;
793                         } else if (error)
794                                 goto bp_err;
795                         if (new_blk != log_bbnum)
796                                 head_blk = new_blk;
797                 } else if (error)
798                         goto bp_err;
799         }
800
801         xlog_put_bp(bp);
802         if (head_blk == log_bbnum)
803                 *return_head_blk = 0;
804         else
805                 *return_head_blk = head_blk;
806         /*
807          * When returning here, we have a good block number.  Bad block
808          * means that during a previous crash, we didn't have a clean break
809          * from cycle number N to cycle number N-1.  In this case, we need
810          * to find the first block with cycle number N-1.
811          */
812         return 0;
813
        /* Common error exit: free the scratch buffer and warn. */
814  bp_err:
815         xlog_put_bp(bp);
816
817         if (error)
818             xlog_warn("XFS: failed to find log head");
819         return error;
820 }
821
822 /*
823  * Find the sync block number or the tail of the log.
824  *
825  * This will be the block number of the last record to have its
826  * associated buffers synced to disk.  Every log record header has
827  * a sync lsn embedded in it.  LSNs hold block numbers, so it is easy
828  * to get a sync block number.  The only concern is to figure out which
829  * log record header to believe.
830  *
831  * The following algorithm uses the log record header with the largest
832  * lsn.  The entire log record does not need to be valid.  We only care
833  * that the header is valid.
834  *
835  * We could speed up search by using current head_blk buffer, but it is not
836  * available.
837  */
838 STATIC int
839 xlog_find_tail(
840         xlog_t                  *log,
841         xfs_daddr_t             *head_blk,
842         xfs_daddr_t             *tail_blk)
843 {
844         xlog_rec_header_t       *rhead;
845         xlog_op_header_t        *op_head;
846         xfs_caddr_t             offset = NULL;
847         xfs_buf_t               *bp;
848         int                     error, i, found;
849         xfs_daddr_t             umount_data_blk;
850         xfs_daddr_t             after_umount_blk;
851         xfs_lsn_t               tail_lsn;
852         int                     hblks;
853
854         found = 0;
855
856         /*
857          * Find previous log record
858          */
859         if ((error = xlog_find_head(log, head_blk)))
860                 return error;
861
862         bp = xlog_get_bp(log, 1);
863         if (!bp)
864                 return ENOMEM;
865         if (*head_blk == 0) {                           /* special case */
866                 error = xlog_bread(log, 0, 1, bp, &offset);
867                 if (error)
868                         goto done;
869
870                 if (xlog_get_cycle(offset) == 0) {
871                         *tail_blk = 0;
872                         /* leave all other log inited values alone */
873                         goto done;
874                 }
875         }
876
877         /*
878          * Search backwards looking for log record header block
879          */
880         ASSERT(*head_blk < INT_MAX);
881         for (i = (int)(*head_blk) - 1; i >= 0; i--) {
882                 error = xlog_bread(log, i, 1, bp, &offset);
883                 if (error)
884                         goto done;
885
886                 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) {
887                         found = 1;
888                         break;
889                 }
890         }
891         /*
892          * If we haven't found the log record header block, start looking
893          * again from the end of the physical log.  XXXmiken: There should be
894          * a check here to make sure we didn't search more than N blocks in
895          * the previous code.
896          */
897         if (!found) {
898                 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
899                         error = xlog_bread(log, i, 1, bp, &offset);
900                         if (error)
901                                 goto done;
902
903                         if (XLOG_HEADER_MAGIC_NUM ==
904                             be32_to_cpu(*(__be32 *)offset)) {
905                                 found = 2;
906                                 break;
907                         }
908                 }
909         }
910         if (!found) {
911                 xlog_warn("XFS: xlog_find_tail: couldn't find sync record");
912                 ASSERT(0);
913                 return XFS_ERROR(EIO);
914         }
915
916         /* find blk_no of tail of log */
917         rhead = (xlog_rec_header_t *)offset;
918         *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
919
920         /*
921          * Reset log values according to the state of the log when we
922          * crashed.  In the case where head_blk == 0, we bump curr_cycle
923          * one because the next write starts a new cycle rather than
924          * continuing the cycle of the last good log record.  At this
925          * point we have guaranteed that all partial log records have been
926          * accounted for.  Therefore, we know that the last good log record
927          * written was complete and ended exactly on the end boundary
928          * of the physical log.
929          */
930         log->l_prev_block = i;
931         log->l_curr_block = (int)*head_blk;
932         log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
933         if (found == 2)
934                 log->l_curr_cycle++;
935         log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn);
936         log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn);
937         log->l_grant_reserve_cycle = log->l_curr_cycle;
938         log->l_grant_reserve_bytes = BBTOB(log->l_curr_block);
939         log->l_grant_write_cycle = log->l_curr_cycle;
940         log->l_grant_write_bytes = BBTOB(log->l_curr_block);
941
942         /*
943          * Look for unmount record.  If we find it, then we know there
944          * was a clean unmount.  Since 'i' could be the last block in
945          * the physical log, we convert to a log block before comparing
946          * to the head_blk.
947          *
948          * Save the current tail lsn to use to pass to
949          * xlog_clear_stale_blocks() below.  We won't want to clear the
950          * unmount record if there is one, so we pass the lsn of the
951          * unmount record rather than the block after it.
952          */
953         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
954                 int     h_size = be32_to_cpu(rhead->h_size);
955                 int     h_version = be32_to_cpu(rhead->h_version);
956
957                 if ((h_version & XLOG_VERSION_2) &&
958                     (h_size > XLOG_HEADER_CYCLE_SIZE)) {
959                         hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
960                         if (h_size % XLOG_HEADER_CYCLE_SIZE)
961                                 hblks++;
962                 } else {
963                         hblks = 1;
964                 }
965         } else {
966                 hblks = 1;
967         }
968         after_umount_blk = (i + hblks + (int)
969                 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
970         tail_lsn = log->l_tail_lsn;
971         if (*head_blk == after_umount_blk &&
972             be32_to_cpu(rhead->h_num_logops) == 1) {
973                 umount_data_blk = (i + hblks) % log->l_logBBsize;
974                 error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
975                 if (error)
976                         goto done;
977
978                 op_head = (xlog_op_header_t *)offset;
979                 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
980                         /*
981                          * Set tail and last sync so that newly written
982                          * log records will point recovery to after the
983                          * current unmount record.
984                          */
985                         log->l_tail_lsn =
986                                 xlog_assign_lsn(log->l_curr_cycle,
987                                                 after_umount_blk);
988                         log->l_last_sync_lsn =
989                                 xlog_assign_lsn(log->l_curr_cycle,
990                                                 after_umount_blk);
991                         *tail_blk = after_umount_blk;
992
993                         /*
994                          * Note that the unmount was clean. If the unmount
995                          * was not clean, we need to know this to rebuild the
996                          * superblock counters from the perag headers if we
997                          * have a filesystem using non-persistent counters.
998                          */
999                         log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
1000                 }
1001         }
1002
1003         /*
1004          * Make sure that there are no blocks in front of the head
1005          * with the same cycle number as the head.  This can happen
1006          * because we allow multiple outstanding log writes concurrently,
1007          * and the later writes might make it out before earlier ones.
1008          *
1009          * We use the lsn from before modifying it so that we'll never
1010          * overwrite the unmount record after a clean unmount.
1011          *
1012          * Do this only if we are going to recover the filesystem
1013          *
1014          * NOTE: This used to say "if (!readonly)"
1015          * However on Linux, we can & do recover a read-only filesystem.
1016          * We only skip recovery if NORECOVERY is specified on mount,
1017          * in which case we would not be here.
1018          *
1019          * But... if the -device- itself is readonly, just skip this.
1020          * We can't recover this device anyway, so it won't matter.
1021          */
1022         if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
1023                 error = xlog_clear_stale_blocks(log, tail_lsn);
1024
1025 done:
1026         xlog_put_bp(bp);
1027
1028         if (error)
1029                 xlog_warn("XFS: failed to locate log tail");
1030         return error;
1031 }
1032
1033 /*
1034  * Is the log zeroed at all?
1035  *
1036  * The last binary search should be changed to perform an X block read
1037  * once X becomes small enough.  You can then search linearly through
1038  * the X blocks.  This will cut down on the number of reads we need to do.
1039  *
1040  * If the log is partially zeroed, this routine will pass back the blkno
1041  * of the first block with cycle number 0.  It won't have a complete LR
1042  * preceding it.
1043  *
1044  * Return:
1045  *      0  => the log is completely written to
1046  *      -1 => use *blk_no as the first block of the log
1047  *      >0 => error has occurred
1048  */
1049 STATIC int
1050 xlog_find_zeroed(
1051         xlog_t          *log,
1052         xfs_daddr_t     *blk_no)
1053 {
1054         xfs_buf_t       *bp;
1055         xfs_caddr_t     offset;
1056         uint            first_cycle, last_cycle;
1057         xfs_daddr_t     new_blk, last_blk, start_blk;
1058         xfs_daddr_t     num_scan_bblks;
1059         int             error, log_bbnum = log->l_logBBsize;
1060
1061         *blk_no = 0;
1062
1063         /* check totally zeroed log */
1064         bp = xlog_get_bp(log, 1);
1065         if (!bp)
1066                 return ENOMEM;
1067         error = xlog_bread(log, 0, 1, bp, &offset);
1068         if (error)
1069                 goto bp_err;
1070
1071         first_cycle = xlog_get_cycle(offset);
1072         if (first_cycle == 0) {         /* completely zeroed log */
1073                 *blk_no = 0;
1074                 xlog_put_bp(bp);
1075                 return -1;
1076         }
1077
1078         /* check partially zeroed log */
1079         error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
1080         if (error)
1081                 goto bp_err;
1082
1083         last_cycle = xlog_get_cycle(offset);
1084         if (last_cycle != 0) {          /* log completely written to */
1085                 xlog_put_bp(bp);
1086                 return 0;
1087         } else if (first_cycle != 1) {
1088                 /*
1089                  * If the cycle of the last block is zero, the cycle of
1090                  * the first block must be 1. If it's not, maybe we're
1091                  * not looking at a log... Bail out.
1092                  */
1093                 xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)");
1094                 return XFS_ERROR(EINVAL);
1095         }
1096
1097         /* we have a partially zeroed log */
1098         last_blk = log_bbnum-1;
1099         if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
1100                 goto bp_err;
1101
1102         /*
1103          * Validate the answer.  Because there is no way to guarantee that
1104          * the entire log is made up of log records which are the same size,
1105          * we scan over the defined maximum blocks.  At this point, the maximum
1106          * is not chosen to mean anything special.   XXXmiken
1107          */
1108         num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
1109         ASSERT(num_scan_bblks <= INT_MAX);
1110
1111         if (last_blk < num_scan_bblks)
1112                 num_scan_bblks = last_blk;
1113         start_blk = last_blk - num_scan_bblks;
1114
1115         /*
1116          * We search for any instances of cycle number 0 that occur before
1117          * our current estimate of the head.  What we're trying to detect is
1118          *        1 ... | 0 | 1 | 0...
1119          *                       ^ binary search ends here
1120          */
1121         if ((error = xlog_find_verify_cycle(log, start_blk,
1122                                          (int)num_scan_bblks, 0, &new_blk)))
1123                 goto bp_err;
1124         if (new_blk != -1)
1125                 last_blk = new_blk;
1126
1127         /*
1128          * Potentially backup over partial log record write.  We don't need
1129          * to search the end of the log because we know it is zero.
1130          */
1131         if ((error = xlog_find_verify_log_record(log, start_blk,
1132                                 &last_blk, 0)) == -1) {
1133             error = XFS_ERROR(EIO);
1134             goto bp_err;
1135         } else if (error)
1136             goto bp_err;
1137
1138         *blk_no = last_blk;
1139 bp_err:
1140         xlog_put_bp(bp);
1141         if (error)
1142                 return error;
1143         return -1;
1144 }
1145
1146 /*
1147  * These are simple subroutines used by xlog_clear_stale_blocks() below
1148  * to initialize a buffer full of empty log record headers and write
1149  * them into the log.
1150  */
1151 STATIC void
1152 xlog_add_record(
1153         xlog_t                  *log,
1154         xfs_caddr_t             buf,
1155         int                     cycle,
1156         int                     block,
1157         int                     tail_cycle,
1158         int                     tail_block)
1159 {
1160         xlog_rec_header_t       *recp = (xlog_rec_header_t *)buf;
1161
1162         memset(buf, 0, BBSIZE);
1163         recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1164         recp->h_cycle = cpu_to_be32(cycle);
1165         recp->h_version = cpu_to_be32(
1166                         xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
1167         recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
1168         recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
1169         recp->h_fmt = cpu_to_be32(XLOG_FMT);
1170         memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
1171 }
1172
1173 STATIC int
1174 xlog_write_log_records(
1175         xlog_t          *log,
1176         int             cycle,
1177         int             start_block,
1178         int             blocks,
1179         int             tail_cycle,
1180         int             tail_block)
1181 {
1182         xfs_caddr_t     offset;
1183         xfs_buf_t       *bp;
1184         int             balign, ealign;
1185         int             sectbb = xlog_sectbb(log);
1186         int             end_block = start_block + blocks;
1187         int             bufblks;
1188         int             error = 0;
1189         int             i, j = 0;
1190
1191         /*
1192          * Greedily allocate a buffer big enough to handle the full
1193          * range of basic blocks to be written.  If that fails, try
1194          * a smaller size.  We need to be able to write at least a
1195          * log sector, or we're out of luck.
1196          */
1197         bufblks = 1 << ffs(blocks);
1198         while (!(bp = xlog_get_bp(log, bufblks))) {
1199                 bufblks >>= 1;
1200                 if (bufblks < xlog_sectbb(log))
1201                         return ENOMEM;
1202         }
1203
1204         /* We may need to do a read at the start to fill in part of
1205          * the buffer in the starting sector not covered by the first
1206          * write below.
1207          */
1208         balign = round_down(start_block, sectbb);
1209         if (balign != start_block) {
1210                 error = xlog_bread_noalign(log, start_block, 1, bp);
1211                 if (error)
1212                         goto out_put_bp;
1213
1214                 j = start_block - balign;
1215         }
1216
1217         for (i = start_block; i < end_block; i += bufblks) {
1218                 int             bcount, endcount;
1219
1220                 bcount = min(bufblks, end_block - start_block);
1221                 endcount = bcount - j;
1222
1223                 /* We may need to do a read at the end to fill in part of
1224                  * the buffer in the final sector not covered by the write.
1225                  * If this is the same sector as the above read, skip it.
1226                  */
1227                 ealign = round_down(end_block, sectbb);
1228                 if (j == 0 && (start_block + endcount > ealign)) {
1229                         offset = XFS_BUF_PTR(bp);
1230                         balign = BBTOB(ealign - start_block);
1231                         error = XFS_BUF_SET_PTR(bp, offset + balign,
1232                                                 BBTOB(sectbb));
1233                         if (error)
1234                                 break;
1235
1236                         error = xlog_bread_noalign(log, ealign, sectbb, bp);
1237                         if (error)
1238                                 break;
1239
1240                         error = XFS_BUF_SET_PTR(bp, offset, bufblks);
1241                         if (error)
1242                                 break;
1243                 }
1244
1245                 offset = xlog_align(log, start_block, endcount, bp);
1246                 for (; j < endcount; j++) {
1247                         xlog_add_record(log, offset, cycle, i+j,
1248                                         tail_cycle, tail_block);
1249                         offset += BBSIZE;
1250                 }
1251                 error = xlog_bwrite(log, start_block, endcount, bp);
1252                 if (error)
1253                         break;
1254                 start_block += endcount;
1255                 j = 0;
1256         }
1257
1258  out_put_bp:
1259         xlog_put_bp(bp);
1260         return error;
1261 }
1262
1263 /*
1264  * This routine is called to blow away any incomplete log writes out
1265  * in front of the log head.  We do this so that we won't become confused
1266  * if we come up, write only a little bit more, and then crash again.
1267  * If we leave the partial log records out there, this situation could
1268  * cause us to think those partial writes are valid blocks since they
1269  * have the current cycle number.  We get rid of them by overwriting them
1270  * with empty log records with the old cycle number rather than the
1271  * current one.
1272  *
1273  * The tail lsn is passed in rather than taken from
1274  * the log so that we will not write over the unmount record after a
1275  * clean unmount in a 512 block log.  Doing so would leave the log without
1276  * any valid log records in it until a new one was written.  If we crashed
1277  * during that time we would not be able to recover.
1278  */
1279 STATIC int
1280 xlog_clear_stale_blocks(
1281         xlog_t          *log,
1282         xfs_lsn_t       tail_lsn)
1283 {
1284         int             tail_cycle, head_cycle;
1285         int             tail_block, head_block;
1286         int             tail_distance, max_distance;
1287         int             distance;
1288         int             error;
1289
1290         tail_cycle = CYCLE_LSN(tail_lsn);
1291         tail_block = BLOCK_LSN(tail_lsn);
1292         head_cycle = log->l_curr_cycle;
1293         head_block = log->l_curr_block;
1294
1295         /*
1296          * Figure out the distance between the new head of the log
1297          * and the tail.  We want to write over any blocks beyond the
1298          * head that we may have written just before the crash, but
1299          * we don't want to overwrite the tail of the log.
1300          */
1301         if (head_cycle == tail_cycle) {
1302                 /*
1303                  * The tail is behind the head in the physical log,
1304                  * so the distance from the head to the tail is the
1305                  * distance from the head to the end of the log plus
1306                  * the distance from the beginning of the log to the
1307                  * tail.
1308                  */
1309                 if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
1310                         XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
1311                                          XFS_ERRLEVEL_LOW, log->l_mp);
1312                         return XFS_ERROR(EFSCORRUPTED);
1313                 }
1314                 tail_distance = tail_block + (log->l_logBBsize - head_block);
1315         } else {
1316                 /*
1317                  * The head is behind the tail in the physical log,
1318                  * so the distance from the head to the tail is just
1319                  * the tail block minus the head block.
1320                  */
1321                 if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
1322                         XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
1323                                          XFS_ERRLEVEL_LOW, log->l_mp);
1324                         return XFS_ERROR(EFSCORRUPTED);
1325                 }
1326                 tail_distance = tail_block - head_block;
1327         }
1328
1329         /*
1330          * If the head is right up against the tail, we can't clear
1331          * anything.
1332          */
1333         if (tail_distance <= 0) {
1334                 ASSERT(tail_distance == 0);
1335                 return 0;
1336         }
1337
1338         max_distance = XLOG_TOTAL_REC_SHIFT(log);
1339         /*
1340          * Take the smaller of the maximum amount of outstanding I/O
1341          * we could have and the distance to the tail to clear out.
1342          * We take the smaller so that we don't overwrite the tail and
1343          * we don't waste all day writing from the head to the tail
1344          * for no reason.
1345          */
1346         max_distance = MIN(max_distance, tail_distance);
1347
1348         if ((head_block + max_distance) <= log->l_logBBsize) {
1349                 /*
1350                  * We can stomp all the blocks we need to without
1351                  * wrapping around the end of the log.  Just do it
1352                  * in a single write.  Use the cycle number of the
1353                  * current cycle minus one so that the log will look like:
1354                  *     n ... | n - 1 ...
1355                  */
1356                 error = xlog_write_log_records(log, (head_cycle - 1),
1357                                 head_block, max_distance, tail_cycle,
1358                                 tail_block);
1359                 if (error)
1360                         return error;
1361         } else {
1362                 /*
1363                  * We need to wrap around the end of the physical log in
1364                  * order to clear all the blocks.  Do it in two separate
1365                  * I/Os.  The first write should be from the head to the
1366                  * end of the physical log, and it should use the current
1367                  * cycle number minus one just like above.
1368                  */
1369                 distance = log->l_logBBsize - head_block;
1370                 error = xlog_write_log_records(log, (head_cycle - 1),
1371                                 head_block, distance, tail_cycle,
1372                                 tail_block);
1373
1374                 if (error)
1375                         return error;
1376
1377                 /*
1378                  * Now write the blocks at the start of the physical log.
1379                  * This writes the remainder of the blocks we want to clear.
1380                  * It uses the current cycle number since we're now on the
1381                  * same cycle as the head so that we get:
1382                  *    n ... n ... | n - 1 ...
1383                  *    ^^^^^ blocks we're writing
1384                  */
1385                 distance = max_distance - (log->l_logBBsize - head_block);
1386                 error = xlog_write_log_records(log, head_cycle, 0, distance,
1387                                 tail_cycle, tail_block);
1388                 if (error)
1389                         return error;
1390         }
1391
1392         return 0;
1393 }
1394
1395 /******************************************************************************
1396  *
1397  *              Log recover routines
1398  *
1399  ******************************************************************************
1400  */
1401
1402 STATIC xlog_recover_t *
1403 xlog_recover_find_tid(
1404         struct hlist_head       *head,
1405         xlog_tid_t              tid)
1406 {
1407         xlog_recover_t          *trans;
1408         struct hlist_node       *n;
1409
1410         hlist_for_each_entry(trans, n, head, r_list) {
1411                 if (trans->r_log_tid == tid)
1412                         return trans;
1413         }
1414         return NULL;
1415 }
1416
1417 STATIC void
1418 xlog_recover_new_tid(
1419         struct hlist_head       *head,
1420         xlog_tid_t              tid,
1421         xfs_lsn_t               lsn)
1422 {
1423         xlog_recover_t          *trans;
1424
1425         trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1426         trans->r_log_tid   = tid;
1427         trans->r_lsn       = lsn;
1428         INIT_LIST_HEAD(&trans->r_itemq);
1429
1430         INIT_HLIST_NODE(&trans->r_list);
1431         hlist_add_head(&trans->r_list, head);
1432 }
1433
1434 STATIC void
1435 xlog_recover_add_item(
1436         struct list_head        *head)
1437 {
1438         xlog_recover_item_t     *item;
1439
1440         item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
1441         INIT_LIST_HEAD(&item->ri_list);
1442         list_add_tail(&item->ri_list, head);
1443 }
1444
1445 STATIC int
1446 xlog_recover_add_to_cont_trans(
1447         struct log              *log,
1448         xlog_recover_t          *trans,
1449         xfs_caddr_t             dp,
1450         int                     len)
1451 {
1452         xlog_recover_item_t     *item;
1453         xfs_caddr_t             ptr, old_ptr;
1454         int                     old_len;
1455
1456         if (list_empty(&trans->r_itemq)) {
1457                 /* finish copying rest of trans header */
1458                 xlog_recover_add_item(&trans->r_itemq);
1459                 ptr = (xfs_caddr_t) &trans->r_theader +
1460                                 sizeof(xfs_trans_header_t) - len;
1461                 memcpy(ptr, dp, len); /* d, s, l */
1462                 return 0;
1463         }
1464         /* take the tail entry */
1465         item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1466
1467         old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
1468         old_len = item->ri_buf[item->ri_cnt-1].i_len;
1469
1470         ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0u);
1471         memcpy(&ptr[old_len], dp, len); /* d, s, l */
1472         item->ri_buf[item->ri_cnt-1].i_len += len;
1473         item->ri_buf[item->ri_cnt-1].i_addr = ptr;
1474         trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
1475         return 0;
1476 }
1477
1478 /*
1479  * The next region to add is the start of a new region.  It could be
1480  * a whole region or it could be the first part of a new region.  Because
1481  * of this, the assumption here is that the type and size fields of all
1482  * format structures fit into the first 32 bits of the structure.
1483  *
1484  * This works because all regions must be 32 bit aligned.  Therefore, we
1485  * either have both fields or we have neither field.  In the case we have
1486  * neither field, the data part of the region is zero length.  We only have
1487  * a log_op_header and can throw away the header since a new one will appear
1488  * later.  If we have at least 4 bytes, then we can determine how many regions
1489  * will appear in the current log item.
1490  */
1491 STATIC int
1492 xlog_recover_add_to_trans(
1493         struct log              *log,
1494         xlog_recover_t          *trans,
1495         xfs_caddr_t             dp,
1496         int                     len)
1497 {
1498         xfs_inode_log_format_t  *in_f;                  /* any will do */
1499         xlog_recover_item_t     *item;
1500         xfs_caddr_t             ptr;
1501
1502         if (!len)
1503                 return 0;
1504         if (list_empty(&trans->r_itemq)) {
1505                 /* we need to catch log corruptions here */
1506                 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1507                         xlog_warn("XFS: xlog_recover_add_to_trans: "
1508                                   "bad header magic number");
1509                         ASSERT(0);
1510                         return XFS_ERROR(EIO);
1511                 }
1512                 if (len == sizeof(xfs_trans_header_t))
1513                         xlog_recover_add_item(&trans->r_itemq);
1514                 memcpy(&trans->r_theader, dp, len); /* d, s, l */
1515                 return 0;
1516         }
1517
1518         ptr = kmem_alloc(len, KM_SLEEP);
1519         memcpy(ptr, dp, len);
1520         in_f = (xfs_inode_log_format_t *)ptr;
1521
1522         /* take the tail entry */
1523         item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1524         if (item->ri_total != 0 &&
1525              item->ri_total == item->ri_cnt) {
1526                 /* tail item is in use, get a new one */
1527                 xlog_recover_add_item(&trans->r_itemq);
1528                 item = list_entry(trans->r_itemq.prev,
1529                                         xlog_recover_item_t, ri_list);
1530         }
1531
1532         if (item->ri_total == 0) {              /* first region to be added */
1533                 if (in_f->ilf_size == 0 ||
1534                     in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
1535                         xlog_warn(
1536         "XFS: bad number of regions (%d) in inode log format",
1537                                   in_f->ilf_size);
1538                         ASSERT(0);
1539                         return XFS_ERROR(EIO);
1540                 }
1541
1542                 item->ri_total = in_f->ilf_size;
1543                 item->ri_buf =
1544                         kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
1545                                     KM_SLEEP);
1546         }
1547         ASSERT(item->ri_total > item->ri_cnt);
1548         /* Description region is ri_buf[0] */
1549         item->ri_buf[item->ri_cnt].i_addr = ptr;
1550         item->ri_buf[item->ri_cnt].i_len  = len;
1551         item->ri_cnt++;
1552         trace_xfs_log_recover_item_add(log, trans, item, 0);
1553         return 0;
1554 }
1555
1556 /*
1557  * Sort the log items in the transaction. Cancelled buffers need
1558  * to be put first so they are processed before any items that might
1559  * modify the buffers. If they are cancelled, then the modifications
1560  * don't need to be replayed.
1561  */
1562 STATIC int
1563 xlog_recover_reorder_trans(
1564         struct log              *log,
1565         xlog_recover_t          *trans,
1566         int                     pass)
1567 {
1568         xlog_recover_item_t     *item, *n;
1569         LIST_HEAD(sort_list);
1570
1571         list_splice_init(&trans->r_itemq, &sort_list);
1572         list_for_each_entry_safe(item, n, &sort_list, ri_list) {
1573                 xfs_buf_log_format_t    *buf_f;
1574
1575                 buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
1576
1577                 switch (ITEM_TYPE(item)) {
1578                 case XFS_LI_BUF:
1579                         if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) {
1580                                 trace_xfs_log_recover_item_reorder_head(log,
1581                                                         trans, item, pass);
1582                                 list_move(&item->ri_list, &trans->r_itemq);
1583                                 break;
1584                         }
1585                 case XFS_LI_INODE:
1586                 case XFS_LI_DQUOT:
1587                 case XFS_LI_QUOTAOFF:
1588                 case XFS_LI_EFD:
1589                 case XFS_LI_EFI:
1590                         trace_xfs_log_recover_item_reorder_tail(log,
1591                                                         trans, item, pass);
1592                         list_move_tail(&item->ri_list, &trans->r_itemq);
1593                         break;
1594                 default:
1595                         xlog_warn(
1596         "XFS: xlog_recover_reorder_trans: unrecognized type of log operation");
1597                         ASSERT(0);
1598                         return XFS_ERROR(EIO);
1599                 }
1600         }
1601         ASSERT(list_empty(&sort_list));
1602         return 0;
1603 }
1604
1605 /*
1606  * Build up the table of buf cancel records so that we don't replay
1607  * cancelled data in the second pass.  For buffer records that are
1608  * not cancel records, there is nothing to do here so we just return.
1609  *
1610  * If we get a cancel record which is already in the table, this indicates
1611  * that the buffer was cancelled multiple times.  In order to ensure
1612  * that during pass 2 we keep the record in the table until we reach its
1613  * last occurrence in the log, we keep a reference count in the cancel
1614  * record in the table to tell us how many times we expect to see this
1615  * record during the second pass.
1616  */
1617 STATIC void
1618 xlog_recover_do_buffer_pass1(
1619         xlog_t                  *log,
1620         xfs_buf_log_format_t    *buf_f)
1621 {
1622         xfs_buf_cancel_t        *bcp;
1623         xfs_buf_cancel_t        *nextp;
1624         xfs_buf_cancel_t        *prevp;
1625         xfs_buf_cancel_t        **bucket;
1626         xfs_daddr_t             blkno = 0;
1627         uint                    len = 0;
1628         ushort                  flags = 0;
1629
1630         switch (buf_f->blf_type) {
1631         case XFS_LI_BUF:
1632                 blkno = buf_f->blf_blkno;
1633                 len = buf_f->blf_len;
1634                 flags = buf_f->blf_flags;
1635                 break;
1636         }
1637
1638         /*
1639          * If this isn't a cancel buffer item, then just return.
1640          */
1641         if (!(flags & XFS_BLI_CANCEL)) {
1642                 trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1643                 return;
1644         }
1645
1646         /*
1647          * Insert an xfs_buf_cancel record into the hash table of
1648          * them.  If there is already an identical record, bump
1649          * its reference count.
1650          */
1651         bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1652                                           XLOG_BC_TABLE_SIZE];
1653         /*
1654          * If the hash bucket is empty then just insert a new record into
1655          * the bucket.
1656          */
1657         if (*bucket == NULL) {
1658                 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1659                                                      KM_SLEEP);
1660                 bcp->bc_blkno = blkno;
1661                 bcp->bc_len = len;
1662                 bcp->bc_refcount = 1;
1663                 bcp->bc_next = NULL;
1664                 *bucket = bcp;
1665                 return;
1666         }
1667
1668         /*
1669          * The hash bucket is not empty, so search for duplicates of our
1670          * record.  If we find one them just bump its refcount.  If not
1671          * then add us at the end of the list.
1672          */
1673         prevp = NULL;
1674         nextp = *bucket;
1675         while (nextp != NULL) {
1676                 if (nextp->bc_blkno == blkno && nextp->bc_len == len) {
1677                         nextp->bc_refcount++;
1678                         trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
1679                         return;
1680                 }
1681                 prevp = nextp;
1682                 nextp = nextp->bc_next;
1683         }
1684         ASSERT(prevp != NULL);
1685         bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1686                                              KM_SLEEP);
1687         bcp->bc_blkno = blkno;
1688         bcp->bc_len = len;
1689         bcp->bc_refcount = 1;
1690         bcp->bc_next = NULL;
1691         prevp->bc_next = bcp;
1692         trace_xfs_log_recover_buf_cancel_add(log, buf_f);
1693 }
1694
1695 /*
1696  * Check to see whether the buffer being recovered has a corresponding
1697  * entry in the buffer cancel record table.  If it does then return 1
1698  * so that it will be cancelled, otherwise return 0.  If the buffer is
1699  * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement
1700  * the refcount on the entry in the table and remove it from the table
1701  * if this is the last reference.
1702  *
1703  * We remove the cancel record from the table when we encounter its
1704  * last occurrence in the log so that if the same buffer is re-used
1705  * again after its last cancellation we actually replay the changes
1706  * made at that point.
1707  */
1708 STATIC int
1709 xlog_check_buffer_cancelled(
1710         xlog_t                  *log,
1711         xfs_daddr_t             blkno,
1712         uint                    len,
1713         ushort                  flags)
1714 {
1715         xfs_buf_cancel_t        *bcp;
1716         xfs_buf_cancel_t        *prevp;
1717         xfs_buf_cancel_t        **bucket;
1718
1719         if (log->l_buf_cancel_table == NULL) {
1720                 /*
1721                  * There is nothing in the table built in pass one,
1722                  * so this buffer must not be cancelled.
1723                  */
1724                 ASSERT(!(flags & XFS_BLI_CANCEL));
1725                 return 0;
1726         }
1727
1728         bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1729                                           XLOG_BC_TABLE_SIZE];
1730         bcp = *bucket;
1731         if (bcp == NULL) {
1732                 /*
1733                  * There is no corresponding entry in the table built
1734                  * in pass one, so this buffer has not been cancelled.
1735                  */
1736                 ASSERT(!(flags & XFS_BLI_CANCEL));
1737                 return 0;
1738         }
1739
1740         /*
1741          * Search for an entry in the buffer cancel table that
1742          * matches our buffer.
1743          */
1744         prevp = NULL;
1745         while (bcp != NULL) {
1746                 if (bcp->bc_blkno == blkno && bcp->bc_len == len) {
1747                         /*
1748                          * We've go a match, so return 1 so that the
1749                          * recovery of this buffer is cancelled.
1750                          * If this buffer is actually a buffer cancel
1751                          * log item, then decrement the refcount on the
1752                          * one in the table and remove it if this is the
1753                          * last reference.
1754                          */
1755                         if (flags & XFS_BLI_CANCEL) {
1756                                 bcp->bc_refcount--;
1757                                 if (bcp->bc_refcount == 0) {
1758                                         if (prevp == NULL) {
1759                                                 *bucket = bcp->bc_next;
1760                                         } else {
1761                                                 prevp->bc_next = bcp->bc_next;
1762                                         }
1763                                         kmem_free(bcp);
1764                                 }
1765                         }
1766                         return 1;
1767                 }
1768                 prevp = bcp;
1769                 bcp = bcp->bc_next;
1770         }
1771         /*
1772          * We didn't find a corresponding entry in the table, so
1773          * return 0 so that the buffer is NOT cancelled.
1774          */
1775         ASSERT(!(flags & XFS_BLI_CANCEL));
1776         return 0;
1777 }
1778
1779 STATIC int
1780 xlog_recover_do_buffer_pass2(
1781         xlog_t                  *log,
1782         xfs_buf_log_format_t    *buf_f)
1783 {
1784         xfs_daddr_t             blkno = 0;
1785         ushort                  flags = 0;
1786         uint                    len = 0;
1787
1788         switch (buf_f->blf_type) {
1789         case XFS_LI_BUF:
1790                 blkno = buf_f->blf_blkno;
1791                 flags = buf_f->blf_flags;
1792                 len = buf_f->blf_len;
1793                 break;
1794         }
1795
1796         return xlog_check_buffer_cancelled(log, blkno, len, flags);
1797 }
1798
1799 /*
1800  * Perform recovery for a buffer full of inodes.  In these buffers,
1801  * the only data which should be recovered is that which corresponds
1802  * to the di_next_unlinked pointers in the on disk inode structures.
1803  * The rest of the data for the inodes is always logged through the
1804  * inodes themselves rather than the inode buffer and is recovered
1805  * in xlog_recover_do_inode_trans().
1806  *
1807  * The only time when buffers full of inodes are fully recovered is
1808  * when the buffer is full of newly allocated inodes.  In this case
1809  * the buffer will not be marked as an inode buffer and so will be
1810  * sent to xlog_recover_do_reg_buffer() below during recovery.
1811  */
1812 STATIC int
1813 xlog_recover_do_inode_buffer(
1814         xfs_mount_t             *mp,
1815         xlog_recover_item_t     *item,
1816         xfs_buf_t               *bp,
1817         xfs_buf_log_format_t    *buf_f)
1818 {
1819         int                     i;
1820         int                     item_index;
1821         int                     bit;
1822         int                     nbits;
1823         int                     reg_buf_offset;
1824         int                     reg_buf_bytes;
1825         int                     next_unlinked_offset;
1826         int                     inodes_per_buf;
1827         xfs_agino_t             *logged_nextp;
1828         xfs_agino_t             *buffer_nextp;
1829         unsigned int            *data_map = NULL;
1830         unsigned int            map_size = 0;
1831
1832         trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
1833
1834         switch (buf_f->blf_type) {
1835         case XFS_LI_BUF:
1836                 data_map = buf_f->blf_data_map;
1837                 map_size = buf_f->blf_map_size;
1838                 break;
1839         }
1840         /*
1841          * Set the variables corresponding to the current region to
1842          * 0 so that we'll initialize them on the first pass through
1843          * the loop.
1844          */
1845         reg_buf_offset = 0;
1846         reg_buf_bytes = 0;
1847         bit = 0;
1848         nbits = 0;
1849         item_index = 0;
1850         inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
1851         for (i = 0; i < inodes_per_buf; i++) {
1852                 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
1853                         offsetof(xfs_dinode_t, di_next_unlinked);
1854
1855                 while (next_unlinked_offset >=
1856                        (reg_buf_offset + reg_buf_bytes)) {
1857                         /*
1858                          * The next di_next_unlinked field is beyond
1859                          * the current logged region.  Find the next
1860                          * logged region that contains or is beyond
1861                          * the current di_next_unlinked field.
1862                          */
1863                         bit += nbits;
1864                         bit = xfs_next_bit(data_map, map_size, bit);
1865
1866                         /*
1867                          * If there are no more logged regions in the
1868                          * buffer, then we're done.
1869                          */
1870                         if (bit == -1) {
1871                                 return 0;
1872                         }
1873
1874                         nbits = xfs_contig_bits(data_map, map_size,
1875                                                          bit);
1876                         ASSERT(nbits > 0);
1877                         reg_buf_offset = bit << XFS_BLI_SHIFT;
1878                         reg_buf_bytes = nbits << XFS_BLI_SHIFT;
1879                         item_index++;
1880                 }
1881
1882                 /*
1883                  * If the current logged region starts after the current
1884                  * di_next_unlinked field, then move on to the next
1885                  * di_next_unlinked field.
1886                  */
1887                 if (next_unlinked_offset < reg_buf_offset) {
1888                         continue;
1889                 }
1890
1891                 ASSERT(item->ri_buf[item_index].i_addr != NULL);
1892                 ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0);
1893                 ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
1894
1895                 /*
1896                  * The current logged region contains a copy of the
1897                  * current di_next_unlinked field.  Extract its value
1898                  * and copy it to the buffer copy.
1899                  */
1900                 logged_nextp = (xfs_agino_t *)
1901                                ((char *)(item->ri_buf[item_index].i_addr) +
1902                                 (next_unlinked_offset - reg_buf_offset));
1903                 if (unlikely(*logged_nextp == 0)) {
1904                         xfs_fs_cmn_err(CE_ALERT, mp,
1905                                 "bad inode buffer log record (ptr = 0x%p, bp = 0x%p).  XFS trying to replay bad (0) inode di_next_unlinked field",
1906                                 item, bp);
1907                         XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
1908                                          XFS_ERRLEVEL_LOW, mp);
1909                         return XFS_ERROR(EFSCORRUPTED);
1910                 }
1911
1912                 buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
1913                                               next_unlinked_offset);
1914                 *buffer_nextp = *logged_nextp;
1915         }
1916
1917         return 0;
1918 }
1919
1920 /*
1921  * Perform a 'normal' buffer recovery.  Each logged region of the
1922  * buffer should be copied over the corresponding region in the
1923  * given buffer.  The bitmap in the buf log format structure indicates
1924  * where to place the logged data.
1925  */
1926 /*ARGSUSED*/
1927 STATIC void
1928 xlog_recover_do_reg_buffer(
1929         struct xfs_mount        *mp,
1930         xlog_recover_item_t     *item,
1931         xfs_buf_t               *bp,
1932         xfs_buf_log_format_t    *buf_f)
1933 {
1934         int                     i;
1935         int                     bit;
1936         int                     nbits;
1937         unsigned int            *data_map = NULL;
1938         unsigned int            map_size = 0;
1939         int                     error;
1940
1941         trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
1942
1943         switch (buf_f->blf_type) {
1944         case XFS_LI_BUF:
1945                 data_map = buf_f->blf_data_map;
1946                 map_size = buf_f->blf_map_size;
1947                 break;
1948         }
1949         bit = 0;
1950         i = 1;  /* 0 is the buf format structure */
1951         while (1) {
1952                 bit = xfs_next_bit(data_map, map_size, bit);
1953                 if (bit == -1)
1954                         break;
1955                 nbits = xfs_contig_bits(data_map, map_size, bit);
1956                 ASSERT(nbits > 0);
1957                 ASSERT(item->ri_buf[i].i_addr != NULL);
1958                 ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0);
1959                 ASSERT(XFS_BUF_COUNT(bp) >=
1960                        ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT));
1961
1962                 /*
1963                  * Do a sanity check if this is a dquot buffer. Just checking
1964                  * the first dquot in the buffer should do. XXXThis is
1965                  * probably a good thing to do for other buf types also.
1966                  */
1967                 error = 0;
1968                 if (buf_f->blf_flags &
1969                    (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
1970                         if (item->ri_buf[i].i_addr == NULL) {
1971                                 cmn_err(CE_ALERT,
1972                                         "XFS: NULL dquot in %s.", __func__);
1973                                 goto next;
1974                         }
1975                         if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
1976                                 cmn_err(CE_ALERT,
1977                                         "XFS: dquot too small (%d) in %s.",
1978                                         item->ri_buf[i].i_len, __func__);
1979                                 goto next;
1980                         }
1981                         error = xfs_qm_dqcheck((xfs_disk_dquot_t *)
1982                                                item->ri_buf[i].i_addr,
1983                                                -1, 0, XFS_QMOPT_DOWARN,
1984                                                "dquot_buf_recover");
1985                         if (error)
1986                                 goto next;
1987                 }
1988
1989                 memcpy(xfs_buf_offset(bp,
1990                         (uint)bit << XFS_BLI_SHIFT),    /* dest */
1991                         item->ri_buf[i].i_addr,         /* source */
1992                         nbits<<XFS_BLI_SHIFT);          /* length */
1993  next:
1994                 i++;
1995                 bit += nbits;
1996         }
1997
1998         /* Shouldn't be any more regions */
1999         ASSERT(i == item->ri_total);
2000 }
2001
2002 /*
2003  * Do some primitive error checking on ondisk dquot data structures.
2004  */
2005 int
2006 xfs_qm_dqcheck(
2007         xfs_disk_dquot_t *ddq,
2008         xfs_dqid_t       id,
2009         uint             type,    /* used only when IO_dorepair is true */
2010         uint             flags,
2011         char             *str)
2012 {
2013         xfs_dqblk_t      *d = (xfs_dqblk_t *)ddq;
2014         int             errs = 0;
2015
2016         /*
2017          * We can encounter an uninitialized dquot buffer for 2 reasons:
2018          * 1. If we crash while deleting the quotainode(s), and those blks got
2019          *    used for user data. This is because we take the path of regular
2020          *    file deletion; however, the size field of quotainodes is never
2021          *    updated, so all the tricks that we play in itruncate_finish
2022          *    don't quite matter.
2023          *
2024          * 2. We don't play the quota buffers when there's a quotaoff logitem.
2025          *    But the allocation will be replayed so we'll end up with an
2026          *    uninitialized quota block.
2027          *
2028          * This is all fine; things are still consistent, and we haven't lost
2029          * any quota information. Just don't complain about bad dquot blks.
2030          */
2031         if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) {
2032                 if (flags & XFS_QMOPT_DOWARN)
2033                         cmn_err(CE_ALERT,
2034                         "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
2035                         str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
2036                 errs++;
2037         }
2038         if (ddq->d_version != XFS_DQUOT_VERSION) {
2039                 if (flags & XFS_QMOPT_DOWARN)
2040                         cmn_err(CE_ALERT,
2041                         "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
2042                         str, id, ddq->d_version, XFS_DQUOT_VERSION);
2043                 errs++;
2044         }
2045
2046         if (ddq->d_flags != XFS_DQ_USER &&
2047             ddq->d_flags != XFS_DQ_PROJ &&
2048             ddq->d_flags != XFS_DQ_GROUP) {
2049                 if (flags & XFS_QMOPT_DOWARN)
2050                         cmn_err(CE_ALERT,
2051                         "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
2052                         str, id, ddq->d_flags);
2053                 errs++;
2054         }
2055
2056         if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
2057                 if (flags & XFS_QMOPT_DOWARN)
2058                         cmn_err(CE_ALERT,
2059                         "%s : ondisk-dquot 0x%p, ID mismatch: "
2060                         "0x%x expected, found id 0x%x",
2061                         str, ddq, id, be32_to_cpu(ddq->d_id));
2062                 errs++;
2063         }
2064
2065         if (!errs && ddq->d_id) {
2066                 if (ddq->d_blk_softlimit &&
2067                     be64_to_cpu(ddq->d_bcount) >=
2068                                 be64_to_cpu(ddq->d_blk_softlimit)) {
2069                         if (!ddq->d_btimer) {
2070                                 if (flags & XFS_QMOPT_DOWARN)
2071                                         cmn_err(CE_ALERT,
2072                                         "%s : Dquot ID 0x%x (0x%p) "
2073                                         "BLK TIMER NOT STARTED",
2074                                         str, (int)be32_to_cpu(ddq->d_id), ddq);
2075                                 errs++;
2076                         }
2077                 }
2078                 if (ddq->d_ino_softlimit &&
2079                     be64_to_cpu(ddq->d_icount) >=
2080                                 be64_to_cpu(ddq->d_ino_softlimit)) {
2081                         if (!ddq->d_itimer) {
2082                                 if (flags & XFS_QMOPT_DOWARN)
2083                                         cmn_err(CE_ALERT,
2084                                         "%s : Dquot ID 0x%x (0x%p) "
2085                                         "INODE TIMER NOT STARTED",
2086                                         str, (int)be32_to_cpu(ddq->d_id), ddq);
2087                                 errs++;
2088                         }
2089                 }
2090                 if (ddq->d_rtb_softlimit &&
2091                     be64_to_cpu(ddq->d_rtbcount) >=
2092                                 be64_to_cpu(ddq->d_rtb_softlimit)) {
2093                         if (!ddq->d_rtbtimer) {
2094                                 if (flags & XFS_QMOPT_DOWARN)
2095                                         cmn_err(CE_ALERT,
2096                                         "%s : Dquot ID 0x%x (0x%p) "
2097                                         "RTBLK TIMER NOT STARTED",
2098                                         str, (int)be32_to_cpu(ddq->d_id), ddq);
2099                                 errs++;
2100                         }
2101                 }
2102         }
2103
2104         if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
2105                 return errs;
2106
2107         if (flags & XFS_QMOPT_DOWARN)
2108                 cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id);
2109
2110         /*
2111          * Typically, a repair is only requested by quotacheck.
2112          */
2113         ASSERT(id != -1);
2114         ASSERT(flags & XFS_QMOPT_DQREPAIR);
2115         memset(d, 0, sizeof(xfs_dqblk_t));
2116
2117         d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
2118         d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
2119         d->dd_diskdq.d_flags = type;
2120         d->dd_diskdq.d_id = cpu_to_be32(id);
2121
2122         return errs;
2123 }
2124
2125 /*
2126  * Perform a dquot buffer recovery.
2127  * Simple algorithm: if we have found a QUOTAOFF logitem of the same type
2128  * (ie. USR or GRP), then just toss this buffer away; don't recover it.
2129  * Else, treat it as a regular buffer and do recovery.
2130  */
2131 STATIC void
2132 xlog_recover_do_dquot_buffer(
2133         xfs_mount_t             *mp,
2134         xlog_t                  *log,
2135         xlog_recover_item_t     *item,
2136         xfs_buf_t               *bp,
2137         xfs_buf_log_format_t    *buf_f)
2138 {
2139         uint                    type;
2140
2141         trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
2142
2143         /*
2144          * Filesystems are required to send in quota flags at mount time.
2145          */
2146         if (mp->m_qflags == 0) {
2147                 return;
2148         }
2149
2150         type = 0;
2151         if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF)
2152                 type |= XFS_DQ_USER;
2153         if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF)
2154                 type |= XFS_DQ_PROJ;
2155         if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF)
2156                 type |= XFS_DQ_GROUP;
2157         /*
2158          * This type of quotas was turned off, so ignore this buffer
2159          */
2160         if (log->l_quotaoffs_flag & type)
2161                 return;
2162
2163         xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2164 }
2165
2166 /*
2167  * This routine replays a modification made to a buffer at runtime.
2168  * There are actually two types of buffer, regular and inode, which
2169  * are handled differently.  Inode buffers are handled differently
2170  * in that we only recover a specific set of data from them, namely
2171  * the inode di_next_unlinked fields.  This is because all other inode
2172  * data is actually logged via inode records and any data we replay
2173  * here which overlaps that may be stale.
2174  *
2175  * When meta-data buffers are freed at run time we log a buffer item
2176  * with the XFS_BLI_CANCEL bit set to indicate that previous copies
2177  * of the buffer in the log should not be replayed at recovery time.
2178  * This is so that if the blocks covered by the buffer are reused for
2179  * file data before we crash we don't end up replaying old, freed
2180  * meta-data into a user's file.
2181  *
2182  * To handle the cancellation of buffer log items, we make two passes
2183  * over the log during recovery.  During the first we build a table of
2184  * those buffers which have been cancelled, and during the second we
2185  * only replay those buffers which do not have corresponding cancel
2186  * records in the table.  See xlog_recover_do_buffer_pass[1,2] above
2187  * for more details on the implementation of the table of cancel records.
2188  */
2189 STATIC int
2190 xlog_recover_do_buffer_trans(
2191         xlog_t                  *log,
2192         xlog_recover_item_t     *item,
2193         int                     pass)
2194 {
2195         xfs_buf_log_format_t    *buf_f;
2196         xfs_mount_t             *mp;
2197         xfs_buf_t               *bp;
2198         int                     error;
2199         int                     cancel;
2200         xfs_daddr_t             blkno;
2201         int                     len;
2202         ushort                  flags;
2203         uint                    buf_flags;
2204
2205         buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
2206
2207         if (pass == XLOG_RECOVER_PASS1) {
2208                 /*
2209                  * In this pass we're only looking for buf items
2210                  * with the XFS_BLI_CANCEL bit set.
2211                  */
2212                 xlog_recover_do_buffer_pass1(log, buf_f);
2213                 return 0;
2214         } else {
2215                 /*
2216                  * In this pass we want to recover all the buffers
2217                  * which have not been cancelled and are not
2218                  * cancellation buffers themselves.  The routine
2219                  * we call here will tell us whether or not to
2220                  * continue with the replay of this buffer.
2221                  */
2222                 cancel = xlog_recover_do_buffer_pass2(log, buf_f);
2223                 if (cancel) {
2224                         trace_xfs_log_recover_buf_cancel(log, buf_f);
2225                         return 0;
2226                 }
2227         }
2228         trace_xfs_log_recover_buf_recover(log, buf_f);
2229         switch (buf_f->blf_type) {
2230         case XFS_LI_BUF:
2231                 blkno = buf_f->blf_blkno;
2232                 len = buf_f->blf_len;
2233                 flags = buf_f->blf_flags;
2234                 break;
2235         default:
2236                 xfs_fs_cmn_err(CE_ALERT, log->l_mp,
2237                         "xfs_log_recover: unknown buffer type 0x%x, logdev %s",
2238                         buf_f->blf_type, log->l_mp->m_logname ?
2239                         log->l_mp->m_logname : "internal");
2240                 XFS_ERROR_REPORT("xlog_recover_do_buffer_trans",
2241                                  XFS_ERRLEVEL_LOW, log->l_mp);
2242                 return XFS_ERROR(EFSCORRUPTED);
2243         }
2244
2245         mp = log->l_mp;
2246         buf_flags = XBF_LOCK;
2247         if (!(flags & XFS_BLI_INODE_BUF))
2248                 buf_flags |= XBF_MAPPED;
2249
2250         bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
2251         if (XFS_BUF_ISERROR(bp)) {
2252                 xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp,
2253                                   bp, blkno);
2254                 error = XFS_BUF_GETERROR(bp);
2255                 xfs_buf_relse(bp);
2256                 return error;
2257         }
2258
2259         error = 0;
2260         if (flags & XFS_BLI_INODE_BUF) {
2261                 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2262         } else if (flags &
2263                   (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
2264                 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2265         } else {
2266                 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2267         }
2268         if (error)
2269                 return XFS_ERROR(error);
2270
2271         /*
2272          * Perform delayed write on the buffer.  Asynchronous writes will be
2273          * slower when taking into account all the buffers to be flushed.
2274          *
2275          * Also make sure that only inode buffers with good sizes stay in
2276          * the buffer cache.  The kernel moves inodes in buffers of 1 block
2277          * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger.  The inode
2278          * buffers in the log can be a different size if the log was generated
2279          * by an older kernel using unclustered inode buffers or a newer kernel
2280          * running with a different inode cluster size.  Regardless, if
2281          * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE)
2282          * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep
2283          * the buffer out of the buffer cache so that the buffer won't
2284          * overlap with future reads of those inodes.
2285          */
2286         if (XFS_DINODE_MAGIC ==
2287             be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
2288             (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize,
2289                         (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
2290                 XFS_BUF_STALE(bp);
2291                 error = xfs_bwrite(mp, bp);
2292         } else {
2293                 ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2294                 bp->b_mount = mp;
2295                 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2296                 xfs_bdwrite(mp, bp);
2297         }
2298
2299         return (error);
2300 }
2301
2302 STATIC int
2303 xlog_recover_do_inode_trans(
2304         xlog_t                  *log,
2305         xlog_recover_item_t     *item,
2306         int                     pass)
2307 {
2308         xfs_inode_log_format_t  *in_f;
2309         xfs_mount_t             *mp;
2310         xfs_buf_t               *bp;
2311         xfs_dinode_t            *dip;
2312         xfs_ino_t               ino;
2313         int                     len;
2314         xfs_caddr_t             src;
2315         xfs_caddr_t             dest;
2316         int                     error;
2317         int                     attr_index;
2318         uint                    fields;
2319         xfs_icdinode_t          *dicp;
2320         int                     need_free = 0;
2321
2322         if (pass == XLOG_RECOVER_PASS1) {
2323                 return 0;
2324         }
2325
2326         if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
2327                 in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr;
2328         } else {
2329                 in_f = (xfs_inode_log_format_t *)kmem_alloc(
2330                         sizeof(xfs_inode_log_format_t), KM_SLEEP);
2331                 need_free = 1;
2332                 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
2333                 if (error)
2334                         goto error;
2335         }
2336         ino = in_f->ilf_ino;
2337         mp = log->l_mp;
2338
2339         /*
2340          * Inode buffers can be freed, look out for it,
2341          * and do not replay the inode.
2342          */
2343         if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
2344                                         in_f->ilf_len, 0)) {
2345                 error = 0;
2346                 trace_xfs_log_recover_inode_cancel(log, in_f);
2347                 goto error;
2348         }
2349         trace_xfs_log_recover_inode_recover(log, in_f);
2350
2351         bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
2352                           XBF_LOCK);
2353         if (XFS_BUF_ISERROR(bp)) {
2354                 xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
2355                                   bp, in_f->ilf_blkno);
2356                 error = XFS_BUF_GETERROR(bp);
2357                 xfs_buf_relse(bp);
2358                 goto error;
2359         }
2360         error = 0;
2361         ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
2362         dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
2363
2364         /*
2365          * Make sure the place we're flushing out to really looks
2366          * like an inode!
2367          */
2368         if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) {
2369                 xfs_buf_relse(bp);
2370                 xfs_fs_cmn_err(CE_ALERT, mp,
2371                         "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
2372                         dip, bp, ino);
2373                 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)",
2374                                  XFS_ERRLEVEL_LOW, mp);
2375                 error = EFSCORRUPTED;
2376                 goto error;
2377         }
2378         dicp = (xfs_icdinode_t *)(item->ri_buf[1].i_addr);
2379         if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2380                 xfs_buf_relse(bp);
2381                 xfs_fs_cmn_err(CE_ALERT, mp,
2382                         "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld",
2383                         item, ino);
2384                 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)",
2385                                  XFS_ERRLEVEL_LOW, mp);
2386                 error = EFSCORRUPTED;
2387                 goto error;
2388         }
2389
2390         /* Skip replay when the on disk inode is newer than the log one */
2391         if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
2392                 /*
2393                  * Deal with the wrap case, DI_MAX_FLUSH is less
2394                  * than smaller numbers
2395                  */
2396                 if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
2397                     dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
2398                         /* do nothing */
2399                 } else {
2400                         xfs_buf_relse(bp);
2401                         trace_xfs_log_recover_inode_skip(log, in_f);
2402                         error = 0;
2403                         goto error;
2404                 }
2405         }
2406         /* Take the opportunity to reset the flush iteration count */
2407         dicp->di_flushiter = 0;
2408
2409         if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
2410                 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2411                     (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2412                         XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)",
2413                                          XFS_ERRLEVEL_LOW, mp, dicp);
2414                         xfs_buf_relse(bp);
2415                         xfs_fs_cmn_err(CE_ALERT, mp,
2416                                 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2417                                 item, dip, bp, ino);
2418                         error = EFSCORRUPTED;
2419                         goto error;
2420                 }
2421         } else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) {
2422                 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2423                     (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
2424                     (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2425                         XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)",
2426                                              XFS_ERRLEVEL_LOW, mp, dicp);
2427                         xfs_buf_relse(bp);
2428                         xfs_fs_cmn_err(CE_ALERT, mp,
2429                                 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2430                                 item, dip, bp, ino);
2431                         error = EFSCORRUPTED;
2432                         goto error;
2433                 }
2434         }
2435         if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
2436                 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)",
2437                                      XFS_ERRLEVEL_LOW, mp, dicp);
2438                 xfs_buf_relse(bp);
2439                 xfs_fs_cmn_err(CE_ALERT, mp,
2440                         "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2441                         item, dip, bp, ino,
2442                         dicp->di_nextents + dicp->di_anextents,
2443                         dicp->di_nblocks);
2444                 error = EFSCORRUPTED;
2445                 goto error;
2446         }
2447         if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2448                 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)",
2449                                      XFS_ERRLEVEL_LOW, mp, dicp);
2450                 xfs_buf_relse(bp);
2451                 xfs_fs_cmn_err(CE_ALERT, mp,
2452                         "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
2453                         item, dip, bp, ino, dicp->di_forkoff);
2454                 error = EFSCORRUPTED;
2455                 goto error;
2456         }
2457         if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
2458                 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)",
2459                                      XFS_ERRLEVEL_LOW, mp, dicp);
2460                 xfs_buf_relse(bp);
2461                 xfs_fs_cmn_err(CE_ALERT, mp,
2462                         "xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p",
2463                         item->ri_buf[1].i_len, item);
2464                 error = EFSCORRUPTED;
2465                 goto error;
2466         }
2467
2468         /* The core is in in-core format */
2469         xfs_dinode_to_disk(dip, (xfs_icdinode_t *)item->ri_buf[1].i_addr);
2470
2471         /* the rest is in on-disk format */
2472         if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) {
2473                 memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode),
2474                         item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode),
2475                         item->ri_buf[1].i_len  - sizeof(struct xfs_icdinode));
2476         }
2477
2478         fields = in_f->ilf_fields;
2479         switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
2480         case XFS_ILOG_DEV:
2481                 xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
2482                 break;
2483         case XFS_ILOG_UUID:
2484                 memcpy(XFS_DFORK_DPTR(dip),
2485                        &in_f->ilf_u.ilfu_uuid,
2486                        sizeof(uuid_t));
2487                 break;
2488         }
2489
2490         if (in_f->ilf_size == 2)
2491                 goto write_inode_buffer;
2492         len = item->ri_buf[2].i_len;
2493         src = item->ri_buf[2].i_addr;
2494         ASSERT(in_f->ilf_size <= 4);
2495         ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
2496         ASSERT(!(fields & XFS_ILOG_DFORK) ||
2497                (len == in_f->ilf_dsize));
2498
2499         switch (fields & XFS_ILOG_DFORK) {
2500         case XFS_ILOG_DDATA:
2501         case XFS_ILOG_DEXT:
2502                 memcpy(XFS_DFORK_DPTR(dip), src, len);
2503                 break;
2504
2505         case XFS_ILOG_DBROOT:
2506                 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
2507                                  (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip),
2508                                  XFS_DFORK_DSIZE(dip, mp));
2509                 break;
2510
2511         default:
2512                 /*
2513                  * There are no data fork flags set.
2514                  */
2515                 ASSERT((fields & XFS_ILOG_DFORK) == 0);
2516                 break;
2517         }
2518
2519         /*
2520          * If we logged any attribute data, recover it.  There may or
2521          * may not have been any other non-core data logged in this
2522          * transaction.
2523          */
2524         if (in_f->ilf_fields & XFS_ILOG_AFORK) {
2525                 if (in_f->ilf_fields & XFS_ILOG_DFORK) {
2526                         attr_index = 3;
2527                 } else {
2528                         attr_index = 2;
2529                 }
2530                 len = item->ri_buf[attr_index].i_len;
2531                 src = item->ri_buf[attr_index].i_addr;
2532                 ASSERT(len == in_f->ilf_asize);
2533
2534                 switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
2535                 case XFS_ILOG_ADATA:
2536                 case XFS_ILOG_AEXT:
2537                         dest = XFS_DFORK_APTR(dip);
2538                         ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
2539                         memcpy(dest, src, len);
2540                         break;
2541
2542                 case XFS_ILOG_ABROOT:
2543                         dest = XFS_DFORK_APTR(dip);
2544                         xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
2545                                          len, (xfs_bmdr_block_t*)dest,
2546                                          XFS_DFORK_ASIZE(dip, mp));
2547                         break;
2548
2549                 default:
2550                         xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag");
2551                         ASSERT(0);
2552                         xfs_buf_relse(bp);
2553                         error = EIO;
2554                         goto error;
2555                 }
2556         }
2557
2558 write_inode_buffer:
2559         ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2560         bp->b_mount = mp;
2561         XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2562         xfs_bdwrite(mp, bp);
2563 error:
2564         if (need_free)
2565                 kmem_free(in_f);
2566         return XFS_ERROR(error);
2567 }
2568
2569 /*
2570  * Recover QUOTAOFF records. We simply make a note of it in the xlog_t
2571  * structure, so that we know not to do any dquot item or dquot buffer recovery,
2572  * of that type.
2573  */
2574 STATIC int
2575 xlog_recover_do_quotaoff_trans(
2576         xlog_t                  *log,
2577         xlog_recover_item_t     *item,
2578         int                     pass)
2579 {
2580         xfs_qoff_logformat_t    *qoff_f;
2581
2582         if (pass == XLOG_RECOVER_PASS2) {
2583                 return (0);
2584         }
2585
2586         qoff_f = (xfs_qoff_logformat_t *)item->ri_buf[0].i_addr;
2587         ASSERT(qoff_f);
2588
2589         /*
2590          * The logitem format's flag tells us if this was user quotaoff,
2591          * group/project quotaoff or both.
2592          */
2593         if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
2594                 log->l_quotaoffs_flag |= XFS_DQ_USER;
2595         if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
2596                 log->l_quotaoffs_flag |= XFS_DQ_PROJ;
2597         if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
2598                 log->l_quotaoffs_flag |= XFS_DQ_GROUP;
2599
2600         return (0);
2601 }
2602
2603 /*
2604  * Recover a dquot record
2605  */
2606 STATIC int
2607 xlog_recover_do_dquot_trans(
2608         xlog_t                  *log,
2609         xlog_recover_item_t     *item,
2610         int                     pass)
2611 {
2612         xfs_mount_t             *mp;
2613         xfs_buf_t               *bp;
2614         struct xfs_disk_dquot   *ddq, *recddq;
2615         int                     error;
2616         xfs_dq_logformat_t      *dq_f;
2617         uint                    type;
2618
2619         if (pass == XLOG_RECOVER_PASS1) {
2620                 return 0;
2621         }
2622         mp = log->l_mp;
2623
2624         /*
2625          * Filesystems are required to send in quota flags at mount time.
2626          */
2627         if (mp->m_qflags == 0)
2628                 return (0);
2629
2630         recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr;
2631
2632         if (item->ri_buf[1].i_addr == NULL) {
2633                 cmn_err(CE_ALERT,
2634                         "XFS: NULL dquot in %s.", __func__);
2635                 return XFS_ERROR(EIO);
2636         }
2637         if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
2638                 cmn_err(CE_ALERT,
2639                         "XFS: dquot too small (%d) in %s.",
2640                         item->ri_buf[1].i_len, __func__);
2641                 return XFS_ERROR(EIO);
2642         }
2643
2644         /*
2645          * This type of quotas was turned off, so ignore this record.
2646          */
2647         type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
2648         ASSERT(type);
2649         if (log->l_quotaoffs_flag & type)
2650                 return (0);
2651
2652         /*
2653          * At this point we know that quota was _not_ turned off.
2654          * Since the mount flags are not indicating to us otherwise, this
2655          * must mean that quota is on, and the dquot needs to be replayed.
2656          * Remember that we may not have fully recovered the superblock yet,
2657          * so we can't do the usual trick of looking at the SB quota bits.
2658          *
2659          * The other possibility, of course, is that the quota subsystem was
2660          * removed since the last mount - ENOSYS.
2661          */
2662         dq_f = (xfs_dq_logformat_t *)item->ri_buf[0].i_addr;
2663         ASSERT(dq_f);
2664         if ((error = xfs_qm_dqcheck(recddq,
2665                            dq_f->qlf_id,
2666                            0, XFS_QMOPT_DOWARN,
2667                            "xlog_recover_do_dquot_trans (log copy)"))) {
2668                 return XFS_ERROR(EIO);
2669         }
2670         ASSERT(dq_f->qlf_len == 1);
2671
2672         error = xfs_read_buf(mp, mp->m_ddev_targp,
2673                              dq_f->qlf_blkno,
2674                              XFS_FSB_TO_BB(mp, dq_f->qlf_len),
2675                              0, &bp);
2676         if (error) {
2677                 xfs_ioerror_alert("xlog_recover_do..(read#3)", mp,
2678                                   bp, dq_f->qlf_blkno);
2679                 return error;
2680         }
2681         ASSERT(bp);
2682         ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
2683
2684         /*
2685          * At least the magic num portion should be on disk because this
2686          * was among a chunk of dquots created earlier, and we did some
2687          * minimal initialization then.
2688          */
2689         if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2690                            "xlog_recover_do_dquot_trans")) {
2691                 xfs_buf_relse(bp);
2692                 return XFS_ERROR(EIO);
2693         }
2694
2695         memcpy(ddq, recddq, item->ri_buf[1].i_len);
2696
2697         ASSERT(dq_f->qlf_size == 2);
2698         ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2699         bp->b_mount = mp;
2700         XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2701         xfs_bdwrite(mp, bp);
2702
2703         return (0);
2704 }
2705
2706 /*
2707  * This routine is called to create an in-core extent free intent
2708  * item from the efi format structure which was logged on disk.
2709  * It allocates an in-core efi, copies the extents from the format
2710  * structure into it, and adds the efi to the AIL with the given
2711  * LSN.
2712  */
2713 STATIC int
2714 xlog_recover_do_efi_trans(
2715         xlog_t                  *log,
2716         xlog_recover_item_t     *item,
2717         xfs_lsn_t               lsn,
2718         int                     pass)
2719 {
2720         int                     error;
2721         xfs_mount_t             *mp;
2722         xfs_efi_log_item_t      *efip;
2723         xfs_efi_log_format_t    *efi_formatp;
2724
2725         if (pass == XLOG_RECOVER_PASS1) {
2726                 return 0;
2727         }
2728
2729         efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr;
2730
2731         mp = log->l_mp;
2732         efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
2733         if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
2734                                          &(efip->efi_format)))) {
2735                 xfs_efi_item_free(efip);
2736                 return error;
2737         }
2738         efip->efi_next_extent = efi_formatp->efi_nextents;
2739         efip->efi_flags |= XFS_EFI_COMMITTED;
2740
2741         spin_lock(&log->l_ailp->xa_lock);
2742         /*
2743          * xfs_trans_ail_update() drops the AIL lock.
2744          */
2745         xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn);
2746         return 0;
2747 }
2748
2749
2750 /*
2751  * This routine is called when an efd format structure is found in
2752  * a committed transaction in the log.  It's purpose is to cancel
2753  * the corresponding efi if it was still in the log.  To do this
2754  * it searches the AIL for the efi with an id equal to that in the
2755  * efd format structure.  If we find it, we remove the efi from the
2756  * AIL and free it.
2757  */
2758 STATIC void
2759 xlog_recover_do_efd_trans(
2760         xlog_t                  *log,
2761         xlog_recover_item_t     *item,
2762         int                     pass)
2763 {
2764         xfs_efd_log_format_t    *efd_formatp;
2765         xfs_efi_log_item_t      *efip = NULL;
2766         xfs_log_item_t          *lip;
2767         __uint64_t              efi_id;
2768         struct xfs_ail_cursor   cur;
2769         struct xfs_ail          *ailp = log->l_ailp;
2770
2771         if (pass == XLOG_RECOVER_PASS1) {
2772                 return;
2773         }
2774
2775         efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr;
2776         ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
2777                 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
2778                (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
2779                 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
2780         efi_id = efd_formatp->efd_efi_id;
2781
2782         /*
2783          * Search for the efi with the id in the efd format structure
2784          * in the AIL.
2785          */
2786         spin_lock(&ailp->xa_lock);
2787         lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2788         while (lip != NULL) {
2789                 if (lip->li_type == XFS_LI_EFI) {
2790                         efip = (xfs_efi_log_item_t *)lip;
2791                         if (efip->efi_format.efi_id == efi_id) {
2792                                 /*
2793                                  * xfs_trans_ail_delete() drops the
2794                                  * AIL lock.
2795                                  */
2796                                 xfs_trans_ail_delete(ailp, lip);
2797                                 xfs_efi_item_free(efip);
2798                                 spin_lock(&ailp->xa_lock);
2799                                 break;
2800                         }
2801                 }
2802                 lip = xfs_trans_ail_cursor_next(ailp, &cur);
2803         }
2804         xfs_trans_ail_cursor_done(ailp, &cur);
2805         spin_unlock(&ailp->xa_lock);
2806 }
2807
2808 /*
2809  * Perform the transaction
2810  *
2811  * If the transaction modifies a buffer or inode, do it now.  Otherwise,
2812  * EFIs and EFDs get queued up by adding entries into the AIL for them.
2813  */
2814 STATIC int
2815 xlog_recover_do_trans(
2816         xlog_t                  *log,
2817         xlog_recover_t          *trans,
2818         int                     pass)
2819 {
2820         int                     error = 0;
2821         xlog_recover_item_t     *item;
2822
2823         error = xlog_recover_reorder_trans(log, trans, pass);
2824         if (error)
2825                 return error;
2826
2827         list_for_each_entry(item, &trans->r_itemq, ri_list) {
2828                 trace_xfs_log_recover_item_recover(log, trans, item, pass);
2829                 switch (ITEM_TYPE(item)) {
2830                 case XFS_LI_BUF:
2831                         error = xlog_recover_do_buffer_trans(log, item, pass);
2832                         break;
2833                 case XFS_LI_INODE:
2834                         error = xlog_recover_do_inode_trans(log, item, pass);
2835                         break;
2836                 case XFS_LI_EFI:
2837                         error = xlog_recover_do_efi_trans(log, item,
2838                                                           trans->r_lsn, pass);
2839                         break;
2840                 case XFS_LI_EFD:
2841                         xlog_recover_do_efd_trans(log, item, pass);
2842                         error = 0;
2843                         break;
2844                 case XFS_LI_DQUOT:
2845                         error = xlog_recover_do_dquot_trans(log, item, pass);
2846                         break;
2847                 case XFS_LI_QUOTAOFF:
2848                         error = xlog_recover_do_quotaoff_trans(log, item,
2849                                                                pass);
2850                         break;
2851                 default:
2852                         xlog_warn(
2853         "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item));
2854                         ASSERT(0);
2855                         error = XFS_ERROR(EIO);
2856                         break;
2857                 }
2858
2859                 if (error)
2860                         return error;
2861         }
2862
2863         return 0;
2864 }
2865
2866 /*
2867  * Free up any resources allocated by the transaction
2868  *
2869  * Remember that EFIs, EFDs, and IUNLINKs are handled later.
2870  */
2871 STATIC void
2872 xlog_recover_free_trans(
2873         xlog_recover_t          *trans)
2874 {
2875         xlog_recover_item_t     *item, *n;
2876         int                     i;
2877
2878         list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
2879                 /* Free the regions in the item. */
2880                 list_del(&item->ri_list);
2881                 for (i = 0; i < item->ri_cnt; i++)
2882                         kmem_free(item->ri_buf[i].i_addr);
2883                 /* Free the item itself */
2884                 kmem_free(item->ri_buf);
2885                 kmem_free(item);
2886         }
2887         /* Free the transaction recover structure */
2888         kmem_free(trans);
2889 }
2890
2891 STATIC int
2892 xlog_recover_commit_trans(
2893         xlog_t                  *log,
2894         xlog_recover_t          *trans,
2895         int                     pass)
2896 {
2897         int                     error;
2898
2899         hlist_del(&trans->r_list);
2900         if ((error = xlog_recover_do_trans(log, trans, pass)))
2901                 return error;
2902         xlog_recover_free_trans(trans);                 /* no error */
2903         return 0;
2904 }
2905
2906 STATIC int
2907 xlog_recover_unmount_trans(
2908         xlog_recover_t          *trans)
2909 {
2910         /* Do nothing now */
2911         xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR");
2912         return 0;
2913 }
2914
2915 /*
2916  * There are two valid states of the r_state field.  0 indicates that the
2917  * transaction structure is in a normal state.  We have either seen the
2918  * start of the transaction or the last operation we added was not a partial
2919  * operation.  If the last operation we added to the transaction was a
2920  * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
2921  *
2922  * NOTE: skip LRs with 0 data length.
2923  */
2924 STATIC int
2925 xlog_recover_process_data(
2926         xlog_t                  *log,
2927         struct hlist_head       rhash[],
2928         xlog_rec_header_t       *rhead,
2929         xfs_caddr_t             dp,
2930         int                     pass)
2931 {
2932         xfs_caddr_t             lp;
2933         int                     num_logops;
2934         xlog_op_header_t        *ohead;
2935         xlog_recover_t          *trans;
2936         xlog_tid_t              tid;
2937         int                     error;
2938         unsigned long           hash;
2939         uint                    flags;
2940
2941         lp = dp + be32_to_cpu(rhead->h_len);
2942         num_logops = be32_to_cpu(rhead->h_num_logops);
2943
2944         /* check the log format matches our own - else we can't recover */
2945         if (xlog_header_check_recover(log->l_mp, rhead))
2946                 return (XFS_ERROR(EIO));
2947
2948         while ((dp < lp) && num_logops) {
2949                 ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
2950                 ohead = (xlog_op_header_t *)dp;
2951                 dp += sizeof(xlog_op_header_t);
2952                 if (ohead->oh_clientid != XFS_TRANSACTION &&
2953                     ohead->oh_clientid != XFS_LOG) {
2954                         xlog_warn(
2955                 "XFS: xlog_recover_process_data: bad clientid");
2956                         ASSERT(0);
2957                         return (XFS_ERROR(EIO));
2958                 }
2959                 tid = be32_to_cpu(ohead->oh_tid);
2960                 hash = XLOG_RHASH(tid);
2961                 trans = xlog_recover_find_tid(&rhash[hash], tid);
2962                 if (trans == NULL) {               /* not found; add new tid */
2963                         if (ohead->oh_flags & XLOG_START_TRANS)
2964                                 xlog_recover_new_tid(&rhash[hash], tid,
2965                                         be64_to_cpu(rhead->h_lsn));
2966                 } else {
2967                         if (dp + be32_to_cpu(ohead->oh_len) > lp) {
2968                                 xlog_warn(
2969                         "XFS: xlog_recover_process_data: bad length");
2970                                 WARN_ON(1);
2971                                 return (XFS_ERROR(EIO));
2972                         }
2973                         flags = ohead->oh_flags & ~XLOG_END_TRANS;
2974                         if (flags & XLOG_WAS_CONT_TRANS)
2975                                 flags &= ~XLOG_CONTINUE_TRANS;
2976                         switch (flags) {
2977                         case XLOG_COMMIT_TRANS:
2978                                 error = xlog_recover_commit_trans(log,
2979                                                                 trans, pass);
2980                                 break;
2981                         case XLOG_UNMOUNT_TRANS:
2982                                 error = xlog_recover_unmount_trans(trans);
2983                                 break;
2984                         case XLOG_WAS_CONT_TRANS:
2985                                 error = xlog_recover_add_to_cont_trans(log,
2986                                                 trans, dp,
2987                                                 be32_to_cpu(ohead->oh_len));
2988                                 break;
2989                         case XLOG_START_TRANS:
2990                                 xlog_warn(
2991                         "XFS: xlog_recover_process_data: bad transaction");
2992                                 ASSERT(0);
2993                                 error = XFS_ERROR(EIO);
2994                                 break;
2995                         case 0:
2996                         case XLOG_CONTINUE_TRANS:
2997                                 error = xlog_recover_add_to_trans(log, trans,
2998                                                 dp, be32_to_cpu(ohead->oh_len));
2999                                 break;
3000                         default:
3001                                 xlog_warn(
3002                         "XFS: xlog_recover_process_data: bad flag");
3003                                 ASSERT(0);
3004                                 error = XFS_ERROR(EIO);
3005                                 break;
3006                         }
3007                         if (error)
3008                                 return error;
3009                 }
3010                 dp += be32_to_cpu(ohead->oh_len);
3011                 num_logops--;
3012         }
3013         return 0;
3014 }
3015
3016 /*
3017  * Process an extent free intent item that was recovered from
3018  * the log.  We need to free the extents that it describes.
3019  */
3020 STATIC int
3021 xlog_recover_process_efi(
3022         xfs_mount_t             *mp,
3023         xfs_efi_log_item_t      *efip)
3024 {
3025         xfs_efd_log_item_t      *efdp;
3026         xfs_trans_t             *tp;
3027         int                     i;
3028         int                     error = 0;
3029         xfs_extent_t            *extp;
3030         xfs_fsblock_t           startblock_fsb;
3031
3032         ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED));
3033
3034         /*
3035          * First check the validity of the extents described by the
3036          * EFI.  If any are bad, then assume that all are bad and
3037          * just toss the EFI.
3038          */
3039         for (i = 0; i < efip->efi_format.efi_nextents; i++) {
3040                 extp = &(efip->efi_format.efi_extents[i]);
3041                 startblock_fsb = XFS_BB_TO_FSB(mp,
3042                                    XFS_FSB_TO_DADDR(mp, extp->ext_start));
3043                 if ((startblock_fsb == 0) ||
3044                     (extp->ext_len == 0) ||
3045                     (startblock_fsb >= mp->m_sb.sb_dblocks) ||
3046                     (extp->ext_len >= mp->m_sb.sb_agblocks)) {
3047                         /*
3048                          * This will pull the EFI from the AIL and
3049                          * free the memory associated with it.
3050                          */
3051                         xfs_efi_release(efip, efip->efi_format.efi_nextents);
3052                         return XFS_ERROR(EIO);
3053                 }
3054         }
3055
3056         tp = xfs_trans_alloc(mp, 0);
3057         error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
3058         if (error)
3059                 goto abort_error;
3060         efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
3061
3062         for (i = 0; i < efip->efi_format.efi_nextents; i++) {
3063                 extp = &(efip->efi_format.efi_extents[i]);
3064                 error = xfs_free_extent(tp, extp->ext_start, extp->ext_len);
3065                 if (error)
3066                         goto abort_error;
3067                 xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
3068                                          extp->ext_len);
3069         }
3070
3071         efip->efi_flags |= XFS_EFI_RECOVERED;
3072         error = xfs_trans_commit(tp, 0);
3073         return error;
3074
3075 abort_error:
3076         xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3077         return error;
3078 }
3079
3080 /*
3081  * When this is called, all of the EFIs which did not have
3082  * corresponding EFDs should be in the AIL.  What we do now
3083  * is free the extents associated with each one.
3084  *
3085  * Since we process the EFIs in normal transactions, they
3086  * will be removed at some point after the commit.  This prevents
3087  * us from just walking down the list processing each one.
3088  * We'll use a flag in the EFI to skip those that we've already
3089  * processed and use the AIL iteration mechanism's generation
3090  * count to try to speed this up at least a bit.
3091  *
3092  * When we start, we know that the EFIs are the only things in
3093  * the AIL.  As we process them, however, other items are added
3094  * to the AIL.  Since everything added to the AIL must come after
3095  * everything already in the AIL, we stop processing as soon as
3096  * we see something other than an EFI in the AIL.
3097  */
3098 STATIC int
3099 xlog_recover_process_efis(
3100         xlog_t                  *log)
3101 {
3102         xfs_log_item_t          *lip;
3103         xfs_efi_log_item_t      *efip;
3104         int                     error = 0;
3105         struct xfs_ail_cursor   cur;
3106         struct xfs_ail          *ailp;
3107
3108         ailp = log->l_ailp;
3109         spin_lock(&ailp->xa_lock);
3110         lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3111         while (lip != NULL) {
3112                 /*
3113                  * We're done when we see something other than an EFI.
3114                  * There should be no EFIs left in the AIL now.
3115                  */
3116                 if (lip->li_type != XFS_LI_EFI) {
3117 #ifdef DEBUG
3118                         for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
3119                                 ASSERT(lip->li_type != XFS_LI_EFI);
3120 #endif
3121                         break;
3122                 }
3123
3124                 /*
3125                  * Skip EFIs that we've already processed.
3126                  */
3127                 efip = (xfs_efi_log_item_t *)lip;
3128                 if (efip->efi_flags & XFS_EFI_RECOVERED) {
3129                         lip = xfs_trans_ail_cursor_next(ailp, &cur);
3130                         continue;
3131                 }
3132
3133                 spin_unlock(&ailp->xa_lock);
3134                 error = xlog_recover_process_efi(log->l_mp, efip);
3135                 spin_lock(&ailp->xa_lock);
3136                 if (error)
3137                         goto out;
3138                 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3139         }
3140 out:
3141         xfs_trans_ail_cursor_done(ailp, &cur);
3142         spin_unlock(&ailp->xa_lock);
3143         return error;
3144 }
3145
3146 /*
3147  * This routine performs a transaction to null out a bad inode pointer
3148  * in an agi unlinked inode hash bucket.
3149  */
3150 STATIC void
3151 xlog_recover_clear_agi_bucket(
3152         xfs_mount_t     *mp,
3153         xfs_agnumber_t  agno,
3154         int             bucket)
3155 {
3156         xfs_trans_t     *tp;
3157         xfs_agi_t       *agi;
3158         xfs_buf_t       *agibp;
3159         int             offset;
3160         int             error;
3161
3162         tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
3163         error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp),
3164                                   0, 0, 0);
3165         if (error)
3166                 goto out_abort;
3167
3168         error = xfs_read_agi(mp, tp, agno, &agibp);
3169         if (error)
3170                 goto out_abort;
3171
3172         agi = XFS_BUF_TO_AGI(agibp);
3173         agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
3174         offset = offsetof(xfs_agi_t, agi_unlinked) +
3175                  (sizeof(xfs_agino_t) * bucket);
3176         xfs_trans_log_buf(tp, agibp, offset,
3177                           (offset + sizeof(xfs_agino_t) - 1));
3178
3179         error = xfs_trans_commit(tp, 0);
3180         if (error)
3181                 goto out_error;
3182         return;
3183
3184 out_abort:
3185         xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3186 out_error:
3187         xfs_fs_cmn_err(CE_WARN, mp, "xlog_recover_clear_agi_bucket: "
3188                         "failed to clear agi %d. Continuing.", agno);
3189         return;
3190 }
3191
3192 STATIC xfs_agino_t
3193 xlog_recover_process_one_iunlink(
3194         struct xfs_mount                *mp,
3195         xfs_agnumber_t                  agno,
3196         xfs_agino_t                     agino,
3197         int                             bucket)
3198 {
3199         struct xfs_buf                  *ibp;
3200         struct xfs_dinode               *dip;
3201         struct xfs_inode                *ip;
3202         xfs_ino_t                       ino;
3203         int                             error;
3204
3205         ino = XFS_AGINO_TO_INO(mp, agno, agino);
3206         error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0);
3207         if (error)
3208                 goto fail;
3209
3210         /*
3211          * Get the on disk inode to find the next inode in the bucket.
3212          */
3213         error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XBF_LOCK);
3214         if (error)
3215                 goto fail_iput;
3216
3217         ASSERT(ip->i_d.di_nlink == 0);
3218         ASSERT(ip->i_d.di_mode != 0);
3219
3220         /* setup for the next pass */
3221         agino = be32_to_cpu(dip->di_next_unlinked);
3222         xfs_buf_relse(ibp);
3223
3224         /*
3225          * Prevent any DMAPI event from being sent when the reference on
3226          * the inode is dropped.
3227          */
3228         ip->i_d.di_dmevmask = 0;
3229
3230         IRELE(ip);
3231         return agino;
3232
3233  fail_iput:
3234         IRELE(ip);
3235  fail:
3236         /*
3237          * We can't read in the inode this bucket points to, or this inode
3238          * is messed up.  Just ditch this bucket of inodes.  We will lose
3239          * some inodes and space, but at least we won't hang.
3240          *
3241          * Call xlog_recover_clear_agi_bucket() to perform a transaction to
3242          * clear the inode pointer in the bucket.
3243          */
3244         xlog_recover_clear_agi_bucket(mp, agno, bucket);
3245         return NULLAGINO;
3246 }
3247
3248 /*
3249  * xlog_iunlink_recover
3250  *
3251  * This is called during recovery to process any inodes which
3252  * we unlinked but not freed when the system crashed.  These
3253  * inodes will be on the lists in the AGI blocks.  What we do
3254  * here is scan all the AGIs and fully truncate and free any
3255  * inodes found on the lists.  Each inode is removed from the
3256  * lists when it has been fully truncated and is freed.  The
3257  * freeing of the inode and its removal from the list must be
3258  * atomic.
3259  */
3260 STATIC void
3261 xlog_recover_process_iunlinks(
3262         xlog_t          *log)
3263 {
3264         xfs_mount_t     *mp;
3265         xfs_agnumber_t  agno;
3266         xfs_agi_t       *agi;
3267         xfs_buf_t       *agibp;
3268         xfs_agino_t     agino;
3269         int             bucket;
3270         int             error;
3271         uint            mp_dmevmask;
3272
3273         mp = log->l_mp;
3274
3275         /*
3276          * Prevent any DMAPI event from being sent while in this function.
3277          */
3278         mp_dmevmask = mp->m_dmevmask;
3279         mp->m_dmevmask = 0;
3280
3281         for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3282                 /*
3283                  * Find the agi for this ag.
3284                  */
3285                 error = xfs_read_agi(mp, NULL, agno, &agibp);
3286                 if (error) {
3287                         /*
3288                          * AGI is b0rked. Don't process it.
3289                          *
3290                          * We should probably mark the filesystem as corrupt
3291                          * after we've recovered all the ag's we can....
3292                          */
3293                         continue;
3294                 }
3295                 agi = XFS_BUF_TO_AGI(agibp);
3296
3297                 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
3298                         agino = be32_to_cpu(agi->agi_unlinked[bucket]);
3299                         while (agino != NULLAGINO) {
3300                                 /*
3301                                  * Release the agi buffer so that it can
3302                                  * be acquired in the normal course of the
3303                                  * transaction to truncate and free the inode.
3304                                  */
3305                                 xfs_buf_relse(agibp);
3306
3307                                 agino = xlog_recover_process_one_iunlink(mp,
3308                                                         agno, agino, bucket);
3309
3310                                 /*
3311                                  * Reacquire the agibuffer and continue around
3312                                  * the loop. This should never fail as we know
3313                                  * the buffer was good earlier on.
3314                                  */
3315                                 error = xfs_read_agi(mp, NULL, agno, &agibp);
3316                                 ASSERT(error == 0);
3317                                 agi = XFS_BUF_TO_AGI(agibp);
3318                         }
3319                 }
3320
3321                 /*
3322                  * Release the buffer for the current agi so we can
3323                  * go on to the next one.
3324                  */
3325                 xfs_buf_relse(agibp);
3326         }
3327
3328         mp->m_dmevmask = mp_dmevmask;
3329 }
3330
3331
#ifdef DEBUG
/*
 * Compute an XOR checksum over the first 'size' bytes of the iclog
 * data area, taken as big-endian 32-bit words, and store it in the
 * iclog header.  Only built on DEBUG kernels; otherwise the call
 * expands to nothing.
 */
STATIC void
xlog_pack_data_checksum(
        xlog_t          *log,
        xlog_in_core_t  *iclog,
        int             size)
{
        __be32          *wordp = (__be32 *)iclog->ic_datap;
        int             nwords = size >> 2;     /* bytes -> 32-bit words */
        uint            sum = 0;

        for (; nwords > 0; nwords--, wordp++)
                sum ^= be32_to_cpu(*wordp);

        iclog->ic_header.h_chksum = cpu_to_be32(sum);
}
#else
#define xlog_pack_data_checksum(log, iclog, size)
#endif
3354
3355 /*
3356  * Stamp cycle number in every block
3357  */
3358 void
3359 xlog_pack_data(
3360         xlog_t                  *log,
3361         xlog_in_core_t          *iclog,
3362         int                     roundoff)
3363 {
3364         int                     i, j, k;
3365         int                     size = iclog->ic_offset + roundoff;
3366         __be32                  cycle_lsn;
3367         xfs_caddr_t             dp;
3368
3369         xlog_pack_data_checksum(log, iclog, size);
3370
3371         cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
3372
3373         dp = iclog->ic_datap;
3374         for (i = 0; i < BTOBB(size) &&
3375                 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3376                 iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
3377                 *(__be32 *)dp = cycle_lsn;
3378                 dp += BBSIZE;
3379         }
3380
3381         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3382                 xlog_in_core_2_t *xhdr = iclog->ic_data;
3383
3384                 for ( ; i < BTOBB(size); i++) {
3385                         j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3386                         k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3387                         xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
3388                         *(__be32 *)dp = cycle_lsn;
3389                         dp += BBSIZE;
3390                 }
3391
3392                 for (i = 1; i < log->l_iclog_heads; i++) {
3393                         xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
3394                 }
3395         }
3396 }
3397
#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
/*
 * Recompute the XOR checksum over the h_len bytes of record data at
 * 'dp' and compare it with the value stored in the record header.
 *
 * A mismatch is reported when the header carries a non-zero checksum,
 * or when no mismatch has been flagged on this log yet; the log is
 * then marked with XLOG_CHKSUM_MISMATCH.  Only built when both DEBUG
 * and XFS_LOUD_RECOVERY are defined.
 */
STATIC void
xlog_unpack_data_checksum(
        xlog_rec_header_t       *rhead,
        xfs_caddr_t             dp,
        xlog_t                  *log)
{
        __be32                  *up = (__be32 *)dp;
        uint                    nwords = be32_to_cpu(rhead->h_len) >> 2;
        uint                    chksum = 0;
        int                     i;

        /* length was divided by 4 above to get # words */
        for (i = 0; i < nwords; i++, up++)
                chksum ^= be32_to_cpu(*up);

        if (chksum == be32_to_cpu(rhead->h_chksum))
                return;

        /* Zero stored checksum and we already complained once: stay quiet. */
        if (!rhead->h_chksum && (log->l_flags & XLOG_CHKSUM_MISMATCH))
                return;

        cmn_err(CE_DEBUG,
                "XFS: LogR chksum mismatch: was (0x%x) is (0x%x)\n",
                be32_to_cpu(rhead->h_chksum), chksum);
        cmn_err(CE_DEBUG,
"XFS: Disregard message if filesystem was created with non-DEBUG kernel");
        if (xfs_sb_version_haslogv2(&log->l_mp->m_sb))
                cmn_err(CE_DEBUG,
                        "XFS: LogR this is a LogV2 filesystem\n");
        log->l_flags |= XLOG_CHKSUM_MISMATCH;
}
#else
#define xlog_unpack_data_checksum(rhead, dp, log)
#endif
3433
3434 STATIC void
3435 xlog_unpack_data(
3436         xlog_rec_header_t       *rhead,
3437         xfs_caddr_t             dp,
3438         xlog_t                  *log)
3439 {
3440         int                     i, j, k;
3441
3442         for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
3443                   i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3444                 *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
3445                 dp += BBSIZE;
3446         }
3447
3448         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3449                 xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
3450                 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
3451                         j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3452                         k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3453                         *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
3454                         dp += BBSIZE;
3455                 }
3456         }
3457
3458         xlog_unpack_data_checksum(rhead, dp, log);
3459 }
3460
3461 STATIC int
3462 xlog_valid_rec_header(
3463         xlog_t                  *log,
3464         xlog_rec_header_t       *rhead,
3465         xfs_daddr_t             blkno)
3466 {
3467         int                     hlen;
3468
3469         if (unlikely(be32_to_cpu(rhead->h_magicno) != XLOG_HEADER_MAGIC_NUM)) {
3470                 XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
3471                                 XFS_ERRLEVEL_LOW, log->l_mp);
3472                 return XFS_ERROR(EFSCORRUPTED);
3473         }
3474         if (unlikely(
3475             (!rhead->h_version ||
3476             (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
3477                 xlog_warn("XFS: %s: unrecognised log version (%d).",
3478                         __func__, be32_to_cpu(rhead->h_version));
3479                 return XFS_ERROR(EIO);
3480         }
3481
3482         /* LR body must have data or it wouldn't have been written */
3483         hlen = be32_to_cpu(rhead->h_len);
3484         if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
3485                 XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
3486                                 XFS_ERRLEVEL_LOW, log->l_mp);
3487                 return XFS_ERROR(EFSCORRUPTED);
3488         }
3489         if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
3490                 XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
3491                                 XFS_ERRLEVEL_LOW, log->l_mp);
3492                 return XFS_ERROR(EFSCORRUPTED);
3493         }
3494         return 0;
3495 }
3496
3497 /*
3498  * Read the log from tail to head and process the log records found.
3499  * Handle the two cases where the tail and head are in the same cycle
3500  * and where the active portion of the log wraps around the end of
3501  * the physical log separately.  The pass parameter is passed through
3502  * to the routines called to process the data and is not looked at
3503  * here.
3504  */
3505 STATIC int
3506 xlog_do_recovery_pass(
3507         xlog_t                  *log,
3508         xfs_daddr_t             head_blk,
3509         xfs_daddr_t             tail_blk,
3510         int                     pass)
3511 {
3512         xlog_rec_header_t       *rhead;
3513         xfs_daddr_t             blk_no;
3514         xfs_caddr_t             offset;
3515         xfs_buf_t               *hbp, *dbp;
3516         int                     error = 0, h_size;
3517         int                     bblks, split_bblks;
3518         int                     hblks, split_hblks, wrapped_hblks;
3519         struct hlist_head       rhash[XLOG_RHASH_SIZE];
3520
3521         ASSERT(head_blk != tail_blk);
3522
3523         /*
3524          * Read the header of the tail block and get the iclog buffer size from
3525          * h_size.  Use this to tell how many sectors make up the log header.
3526          */
3527         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3528                 /*
3529                  * When using variable length iclogs, read first sector of
3530                  * iclog header and extract the header size from it.  Get a
3531                  * new hbp that is the correct size.
3532                  */
3533                 hbp = xlog_get_bp(log, 1);
3534                 if (!hbp)
3535                         return ENOMEM;
3536
3537                 error = xlog_bread(log, tail_blk, 1, hbp, &offset);
3538                 if (error)
3539                         goto bread_err1;
3540
3541                 rhead = (xlog_rec_header_t *)offset;
3542                 error = xlog_valid_rec_header(log, rhead, tail_blk);
3543                 if (error)
3544                         goto bread_err1;
3545                 h_size = be32_to_cpu(rhead->h_size);
3546                 if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
3547                     (h_size > XLOG_HEADER_CYCLE_SIZE)) {
3548                         hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
3549                         if (h_size % XLOG_HEADER_CYCLE_SIZE)
3550                                 hblks++;
3551                         xlog_put_bp(hbp);
3552                         hbp = xlog_get_bp(log, hblks);
3553                 } else {
3554                         hblks = 1;
3555                 }
3556         } else {
3557                 ASSERT(log->l_sectbb_log == 0);
3558                 hblks = 1;
3559                 hbp = xlog_get_bp(log, 1);
3560                 h_size = XLOG_BIG_RECORD_BSIZE;
3561         }
3562
3563         if (!hbp)
3564                 return ENOMEM;
3565         dbp = xlog_get_bp(log, BTOBB(h_size));
3566         if (!dbp) {
3567                 xlog_put_bp(hbp);
3568                 return ENOMEM;
3569         }
3570
3571         memset(rhash, 0, sizeof(rhash));
3572         if (tail_blk <= head_blk) {
3573                 for (blk_no = tail_blk; blk_no < head_blk; ) {
3574                         error = xlog_bread(log, blk_no, hblks, hbp, &offset);
3575                         if (error)
3576                                 goto bread_err2;
3577
3578                         rhead = (xlog_rec_header_t *)offset;
3579                         error = xlog_valid_rec_header(log, rhead, blk_no);
3580                         if (error)
3581                                 goto bread_err2;
3582
3583                         /* blocks in data section */
3584                         bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3585                         error = xlog_bread(log, blk_no + hblks, bblks, dbp,
3586                                            &offset);
3587                         if (error)
3588                                 goto bread_err2;
3589
3590                         xlog_unpack_data(rhead, offset, log);
3591                         if ((error = xlog_recover_process_data(log,
3592                                                 rhash, rhead, offset, pass)))
3593                                 goto bread_err2;
3594                         blk_no += bblks + hblks;
3595                 }
3596         } else {
3597                 /*
3598                  * Perform recovery around the end of the physical log.
3599                  * When the head is not on the same cycle number as the tail,
3600                  * we can't do a sequential recovery as above.
3601                  */
3602                 blk_no = tail_blk;
3603                 while (blk_no < log->l_logBBsize) {
3604                         /*
3605                          * Check for header wrapping around physical end-of-log
3606                          */
3607                         offset = XFS_BUF_PTR(hbp);
3608                         split_hblks = 0;
3609                         wrapped_hblks = 0;
3610                         if (blk_no + hblks <= log->l_logBBsize) {
3611                                 /* Read header in one read */
3612                                 error = xlog_bread(log, blk_no, hblks, hbp,
3613                                                    &offset);
3614                                 if (error)
3615                                         goto bread_err2;
3616                         } else {
3617                                 /* This LR is split across physical log end */
3618                                 if (blk_no != log->l_logBBsize) {
3619                                         /* some data before physical log end */
3620                                         ASSERT(blk_no <= INT_MAX);
3621                                         split_hblks = log->l_logBBsize - (int)blk_no;
3622                                         ASSERT(split_hblks > 0);
3623                                         error = xlog_bread(log, blk_no,
3624                                                            split_hblks, hbp,
3625                                                            &offset);
3626                                         if (error)
3627                                                 goto bread_err2;
3628                                 }
3629
3630                                 /*
3631                                  * Note: this black magic still works with
3632                                  * large sector sizes (non-512) only because:
3633                                  * - we increased the buffer size originally
3634                                  *   by 1 sector giving us enough extra space
3635                                  *   for the second read;
3636                                  * - the log start is guaranteed to be sector
3637                                  *   aligned;
3638                                  * - we read the log end (LR header start)
3639                                  *   _first_, then the log start (LR header end)
3640                                  *   - order is important.
3641                                  */
3642                                 wrapped_hblks = hblks - split_hblks;
3643                                 error = XFS_BUF_SET_PTR(hbp,
3644                                                 offset + BBTOB(split_hblks),
3645                                                 BBTOB(hblks - split_hblks));
3646                                 if (error)
3647                                         goto bread_err2;
3648
3649                                 error = xlog_bread_noalign(log, 0,
3650                                                            wrapped_hblks, hbp);
3651                                 if (error)
3652                                         goto bread_err2;
3653
3654                                 error = XFS_BUF_SET_PTR(hbp, offset,
3655                                                         BBTOB(hblks));
3656                                 if (error)
3657                                         goto bread_err2;
3658                         }
3659                         rhead = (xlog_rec_header_t *)offset;
3660                         error = xlog_valid_rec_header(log, rhead,
3661                                                 split_hblks ? blk_no : 0);
3662                         if (error)
3663                                 goto bread_err2;
3664
3665                         bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3666                         blk_no += hblks;
3667
3668                         /* Read in data for log record */
3669                         if (blk_no + bblks <= log->l_logBBsize) {
3670                                 error = xlog_bread(log, blk_no, bblks, dbp,
3671                                                    &offset);
3672                                 if (error)
3673                                         goto bread_err2;
3674                         } else {
3675                                 /* This log record is split across the
3676                                  * physical end of log */
3677                                 offset = XFS_BUF_PTR(dbp);
3678                                 split_bblks = 0;
3679                                 if (blk_no != log->l_logBBsize) {
3680                                         /* some data is before the physical
3681                                          * end of log */
3682                                         ASSERT(!wrapped_hblks);
3683                                         ASSERT(blk_no <= INT_MAX);
3684                                         split_bblks =
3685                                                 log->l_logBBsize - (int)blk_no;
3686                                         ASSERT(split_bblks > 0);
3687                                         error = xlog_bread(log, blk_no,
3688                                                         split_bblks, dbp,
3689                                                         &offset);
3690                                         if (error)
3691                                                 goto bread_err2;
3692                                 }
3693
3694                                 /*
3695                                  * Note: this black magic still works with
3696                                  * large sector sizes (non-512) only because:
3697                                  * - we increased the buffer size originally
3698                                  *   by 1 sector giving us enough extra space
3699                                  *   for the second read;
3700                                  * - the log start is guaranteed to be sector
3701                                  *   aligned;
3702                                  * - we read the log end (LR header start)
3703                                  *   _first_, then the log start (LR header end)
3704                                  *   - order is important.
3705                                  */
3706                                 error = XFS_BUF_SET_PTR(dbp,
3707                                                 offset + BBTOB(split_bblks),
3708                                                 BBTOB(bblks - split_bblks));
3709                                 if (error)
3710                                         goto bread_err2;
3711
3712                                 error = xlog_bread_noalign(log, wrapped_hblks,
3713                                                 bblks - split_bblks,
3714                                                 dbp);
3715                                 if (error)
3716                                         goto bread_err2;
3717
3718                                 error = XFS_BUF_SET_PTR(dbp, offset, h_size);
3719                                 if (error)
3720                                         goto bread_err2;
3721                         }
3722                         xlog_unpack_data(rhead, offset, log);
3723                         if ((error = xlog_recover_process_data(log, rhash,
3724                                                         rhead, offset, pass)))
3725                                 goto bread_err2;
3726                         blk_no += bblks;
3727                 }
3728
3729                 ASSERT(blk_no >= log->l_logBBsize);
3730                 blk_no -= log->l_logBBsize;
3731
3732                 /* read first part of physical log */
3733                 while (blk_no < head_blk) {
3734                         error = xlog_bread(log, blk_no, hblks, hbp, &offset);
3735                         if (error)
3736                                 goto bread_err2;
3737
3738                         rhead = (xlog_rec_header_t *)offset;
3739                         error = xlog_valid_rec_header(log, rhead, blk_no);
3740                         if (error)
3741                                 goto bread_err2;
3742
3743                         bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3744                         error = xlog_bread(log, blk_no+hblks, bblks, dbp,
3745                                            &offset);
3746                         if (error)
3747                                 goto bread_err2;
3748
3749                         xlog_unpack_data(rhead, offset, log);
3750                         if ((error = xlog_recover_process_data(log, rhash,
3751                                                         rhead, offset, pass)))
3752                                 goto bread_err2;
3753                         blk_no += bblks + hblks;
3754                 }
3755         }
3756
3757  bread_err2:
3758         xlog_put_bp(dbp);
3759  bread_err1:
3760         xlog_put_bp(hbp);
3761         return error;
3762 }
3763
3764 /*
3765  * Do the recovery of the log.  We actually do this in two phases.
3766  * The two passes are necessary in order to implement the function
3767  * of cancelling a record written into the log.  The first pass
3768  * determines those things which have been cancelled, and the
3769  * second pass replays log items normally except for those which
3770  * have been cancelled.  The handling of the replay and cancellations
3771  * takes place in the log item type specific routines.
3772  *
3773  * The table of items which have cancel records in the log is allocated
3774  * and freed at this level, since only here do we know when all of
3775  * the log recovery has been completed.
3776  */
3777 STATIC int
3778 xlog_do_log_recovery(
3779         xlog_t          *log,
3780         xfs_daddr_t     head_blk,
3781         xfs_daddr_t     tail_blk)
3782 {
3783         int             error;
3784
3785         ASSERT(head_blk != tail_blk);
3786
3787         /*
3788          * First do a pass to find all of the cancelled buf log items.
3789          * Store them in the buf_cancel_table for use in the second pass.
3790          */
3791         log->l_buf_cancel_table =
3792                 (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE *
3793                                                  sizeof(xfs_buf_cancel_t*),
3794                                                  KM_SLEEP);
3795         error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3796                                       XLOG_RECOVER_PASS1);
3797         if (error != 0) {
3798                 kmem_free(log->l_buf_cancel_table);
3799                 log->l_buf_cancel_table = NULL;
3800                 return error;
3801         }
3802         /*
3803          * Then do a second pass to actually recover the items in the log.
3804          * When it is complete free the table of buf cancel items.
3805          */
3806         error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3807                                       XLOG_RECOVER_PASS2);
3808 #ifdef DEBUG
3809         if (!error) {
3810                 int     i;
3811
3812                 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3813                         ASSERT(log->l_buf_cancel_table[i] == NULL);
3814         }
3815 #endif  /* DEBUG */
3816
3817         kmem_free(log->l_buf_cancel_table);
3818         log->l_buf_cancel_table = NULL;
3819
3820         return error;
3821 }
3822
3823 /*
3824  * Do the actual recovery
3825  */
3826 STATIC int
3827 xlog_do_recover(
3828         xlog_t          *log,
3829         xfs_daddr_t     head_blk,
3830         xfs_daddr_t     tail_blk)
3831 {
3832         int             error;
3833         xfs_buf_t       *bp;
3834         xfs_sb_t        *sbp;
3835
3836         /*
3837          * First replay the images in the log.
3838          */
3839         error = xlog_do_log_recovery(log, head_blk, tail_blk);
3840         if (error) {
3841                 return error;
3842         }
3843
3844         XFS_bflush(log->l_mp->m_ddev_targp);
3845
3846         /*
3847          * If IO errors happened during recovery, bail out.
3848          */
3849         if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
3850                 return (EIO);
3851         }
3852
3853         /*
3854          * We now update the tail_lsn since much of the recovery has completed
3855          * and there may be space available to use.  If there were no extent
3856          * or iunlinks, we can free up the entire log and set the tail_lsn to
3857          * be the last_sync_lsn.  This was set in xlog_find_tail to be the
3858          * lsn of the last known good LR on disk.  If there are extent frees
3859          * or iunlinks they will have some entries in the AIL; so we look at
3860          * the AIL to determine how to set the tail_lsn.
3861          */
3862         xlog_assign_tail_lsn(log->l_mp);
3863
3864         /*
3865          * Now that we've finished replaying all buffer and inode
3866          * updates, re-read in the superblock.
3867          */
3868         bp = xfs_getsb(log->l_mp, 0);
3869         XFS_BUF_UNDONE(bp);
3870         ASSERT(!(XFS_BUF_ISWRITE(bp)));
3871         ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
3872         XFS_BUF_READ(bp);
3873         XFS_BUF_UNASYNC(bp);
3874         xfsbdstrat(log->l_mp, bp);
3875         error = xfs_iowait(bp);
3876         if (error) {
3877                 xfs_ioerror_alert("xlog_do_recover",
3878                                   log->l_mp, bp, XFS_BUF_ADDR(bp));
3879                 ASSERT(0);
3880                 xfs_buf_relse(bp);
3881                 return error;
3882         }
3883
3884         /* Convert superblock from on-disk format */
3885         sbp = &log->l_mp->m_sb;
3886         xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
3887         ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
3888         ASSERT(xfs_sb_good_version(sbp));
3889         xfs_buf_relse(bp);
3890
3891         /* We've re-read the superblock so re-initialize per-cpu counters */
3892         xfs_icsb_reinit_counters(log->l_mp);
3893
3894         xlog_recover_check_summary(log);
3895
3896         /* Normal transactions can now occur */
3897         log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
3898         return 0;
3899 }
3900
3901 /*
3902  * Perform recovery and re-initialize some log variables in xlog_find_tail.
3903  *
3904  * Return error or zero.
3905  */
3906 int
3907 xlog_recover(
3908         xlog_t          *log)
3909 {
3910         xfs_daddr_t     head_blk, tail_blk;
3911         int             error;
3912
3913         /* find the tail of the log */
3914         if ((error = xlog_find_tail(log, &head_blk, &tail_blk)))
3915                 return error;
3916
3917         if (tail_blk != head_blk) {
3918                 /* There used to be a comment here:
3919                  *
3920                  * disallow recovery on read-only mounts.  note -- mount
3921                  * checks for ENOSPC and turns it into an intelligent
3922                  * error message.
3923                  * ...but this is no longer true.  Now, unless you specify
3924                  * NORECOVERY (in which case this function would never be
3925                  * called), we just go ahead and recover.  We do this all
3926                  * under the vfs layer, so we can get away with it unless
3927                  * the device itself is read-only, in which case we fail.
3928                  */
3929                 if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
3930                         return error;
3931                 }
3932
3933                 cmn_err(CE_NOTE,
3934                         "Starting XFS recovery on filesystem: %s (logdev: %s)",
3935                         log->l_mp->m_fsname, log->l_mp->m_logname ?
3936                         log->l_mp->m_logname : "internal");
3937
3938                 error = xlog_do_recover(log, head_blk, tail_blk);
3939                 log->l_flags |= XLOG_RECOVERY_NEEDED;
3940         }
3941         return error;
3942 }
3943
3944 /*
3945  * In the first part of recovery we replay inodes and buffers and build
3946  * up the list of extent free items which need to be processed.  Here
3947  * we process the extent free items and clean up the on disk unlinked
3948  * inode lists.  This is separated from the first part of recovery so
3949  * that the root and real-time bitmap inodes can be read in from disk in
3950  * between the two stages.  This is necessary so that we can free space
3951  * in the real-time portion of the file system.
3952  */
3953 int
3954 xlog_recover_finish(
3955         xlog_t          *log)
3956 {
3957         /*
3958          * Now we're ready to do the transactions needed for the
3959          * rest of recovery.  Start with completing all the extent
3960          * free intent records and then process the unlinked inode
3961          * lists.  At this point, we essentially run in normal mode
3962          * except that we're still performing recovery actions
3963          * rather than accepting new requests.
3964          */
3965         if (log->l_flags & XLOG_RECOVERY_NEEDED) {
3966                 int     error;
3967                 error = xlog_recover_process_efis(log);
3968                 if (error) {
3969                         cmn_err(CE_ALERT,
3970                                 "Failed to recover EFIs on filesystem: %s",
3971                                 log->l_mp->m_fsname);
3972                         return error;
3973                 }
3974                 /*
3975                  * Sync the log to get all the EFIs out of the AIL.
3976                  * This isn't absolutely necessary, but it helps in
3977                  * case the unlink transactions would have problems
3978                  * pushing the EFIs out of the way.
3979                  */
3980                 xfs_log_force(log->l_mp, XFS_LOG_SYNC);
3981
3982                 xlog_recover_process_iunlinks(log);
3983
3984                 xlog_recover_check_summary(log);
3985
3986                 cmn_err(CE_NOTE,
3987                         "Ending XFS recovery on filesystem: %s (logdev: %s)",
3988                         log->l_mp->m_fsname, log->l_mp->m_logname ?
3989                         log->l_mp->m_logname : "internal");
3990                 log->l_flags &= ~XLOG_RECOVERY_NEEDED;
3991         } else {
3992                 cmn_err(CE_DEBUG,
3993                         "!Ending clean XFS mount for filesystem: %s\n",
3994                         log->l_mp->m_fsname);
3995         }
3996         return 0;
3997 }
3998
3999
4000 #if defined(DEBUG)
4001 /*
4002  * Read all of the agf and agi counters and check that they
4003  * are consistent with the superblock counters.
4004  */
4005 void
4006 xlog_recover_check_summary(
4007         xlog_t          *log)
4008 {
4009         xfs_mount_t     *mp;
4010         xfs_agf_t       *agfp;
4011         xfs_buf_t       *agfbp;
4012         xfs_buf_t       *agibp;
4013         xfs_buf_t       *sbbp;
4014 #ifdef XFS_LOUD_RECOVERY
4015         xfs_sb_t        *sbp;
4016 #endif
4017         xfs_agnumber_t  agno;
4018         __uint64_t      freeblks;
4019         __uint64_t      itotal;
4020         __uint64_t      ifree;
4021         int             error;
4022
4023         mp = log->l_mp;
4024
4025         freeblks = 0LL;
4026         itotal = 0LL;
4027         ifree = 0LL;
4028         for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
4029                 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
4030                 if (error) {
4031                         xfs_fs_cmn_err(CE_ALERT, mp,
4032                                         "xlog_recover_check_summary(agf)"
4033                                         "agf read failed agno %d error %d",
4034                                                         agno, error);
4035                 } else {
4036                         agfp = XFS_BUF_TO_AGF(agfbp);
4037                         freeblks += be32_to_cpu(agfp->agf_freeblks) +
4038                                     be32_to_cpu(agfp->agf_flcount);
4039                         xfs_buf_relse(agfbp);
4040                 }
4041
4042                 error = xfs_read_agi(mp, NULL, agno, &agibp);
4043                 if (!error) {
4044                         struct xfs_agi  *agi = XFS_BUF_TO_AGI(agibp);
4045
4046                         itotal += be32_to_cpu(agi->agi_count);
4047                         ifree += be32_to_cpu(agi->agi_freecount);
4048                         xfs_buf_relse(agibp);
4049                 }
4050         }
4051
4052         sbbp = xfs_getsb(mp, 0);
4053 #ifdef XFS_LOUD_RECOVERY
4054         sbp = &mp->m_sb;
4055         xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(sbbp));
4056         cmn_err(CE_NOTE,
4057                 "xlog_recover_check_summary: sb_icount %Lu itotal %Lu",
4058                 sbp->sb_icount, itotal);
4059         cmn_err(CE_NOTE,
4060                 "xlog_recover_check_summary: sb_ifree %Lu itotal %Lu",
4061                 sbp->sb_ifree, ifree);
4062         cmn_err(CE_NOTE,
4063                 "xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu",
4064                 sbp->sb_fdblocks, freeblks);
4065 #if 0
4066         /*
4067          * This is turned off until I account for the allocation
4068          * btree blocks which live in free space.
4069          */
4070         ASSERT(sbp->sb_icount == itotal);
4071         ASSERT(sbp->sb_ifree == ifree);
4072         ASSERT(sbp->sb_fdblocks == freeblks);
4073 #endif
4074 #endif
4075         xfs_buf_relse(sbbp);
4076 }
4077 #endif /* DEBUG */