Merge branch 'for-linus' of git://oss.sgi.com/xfs/xfs
Linus Torvalds [Mon, 23 May 2011 22:19:16 +0000 (15:19 -0700)]
* 'for-linus' of git://oss.sgi.com/xfs/xfs:
  xfs: obey minleft values during extent allocation correctly
  xfs: reset buffer pointers before freeing them
  xfs: avoid getting stuck during async inode flushes
  xfs: fix xfs_itruncate_start tracing
  xfs: fix duplicate workqueue initialisation
  xfs: kill off xfs_printk()
  xfs: fix race condition in AIL push trigger
  xfs: make AIL target updates and compares 32bit safe.
  xfs: always push the AIL to the target
  xfs: exit AIL push work correctly when AIL is empty
  xfs: ensure reclaim cursor is reset correctly at end of AG
  xfs: add an x86 compat handler for XFS_IOC_ZERO_RANGE
  xfs: fix compiler warning in xfs_trace.h
  xfs: cleanup duplicate initializations
  xfs: reduce the number of pagb_lock roundtrips in xfs_alloc_clear_busy
  xfs: exact busy extent tracking
  xfs: do not immediately reuse busy extent ranges
  xfs: optimize AGFL refills

25 files changed:
fs/xfs/linux-2.6/xfs_buf.c
fs/xfs/linux-2.6/xfs_buf.h
fs/xfs/linux-2.6/xfs_ioctl32.c
fs/xfs/linux-2.6/xfs_ioctl32.h
fs/xfs/linux-2.6/xfs_linux.h
fs/xfs/linux-2.6/xfs_message.c
fs/xfs/linux-2.6/xfs_message.h
fs/xfs/linux-2.6/xfs_super.c
fs/xfs/linux-2.6/xfs_sync.c
fs/xfs/linux-2.6/xfs_trace.h
fs/xfs/xfs_ag.h
fs/xfs/xfs_alloc.c
fs/xfs/xfs_alloc.h
fs/xfs/xfs_alloc_btree.c
fs/xfs/xfs_dfrag.c
fs/xfs/xfs_inode.c
fs/xfs/xfs_inode_item.c
fs/xfs/xfs_log.c
fs/xfs/xfs_log.h
fs/xfs/xfs_log_cil.c
fs/xfs/xfs_log_priv.h
fs/xfs/xfs_log_recover.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_trans.c
fs/xfs/xfs_types.h

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 9ef9ed2..52b2b5d 100644
@@ -33,7 +33,6 @@
 #include <linux/migrate.h>
 #include <linux/backing-dev.h>
 #include <linux/freezer.h>
-#include <linux/list_sort.h>
 
 #include "xfs_sb.h"
 #include "xfs_inum.h"
@@ -709,6 +708,27 @@ xfs_buf_get_empty(
        return bp;
 }
 
+/*
+ * Return a buffer allocated as an empty buffer and associated with external
+ * memory via xfs_buf_associate_memory() back to its empty state.
+ */
+void
+xfs_buf_set_empty(
+       struct xfs_buf          *bp,
+       size_t                  len)
+{
+       if (bp->b_pages)
+               _xfs_buf_free_pages(bp);
+
+       bp->b_pages = NULL;
+       bp->b_page_count = 0;
+       bp->b_addr = NULL;
+       bp->b_file_offset = 0;
+       bp->b_buffer_length = bp->b_count_desired = len;
+       bp->b_bn = XFS_BUF_DADDR_NULL;
+       bp->b_flags &= ~XBF_MAPPED;
+}
+
 static inline struct page *
 mem_to_page(
        void                    *addr)
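
A minimal usage sketch for the new xfs_buf_set_empty() helper; the surrounding
teardown function and its arguments are hypothetical and only illustrate the
intended pairing with xfs_buf_get_empty()/xfs_buf_associate_memory(), they are
not code from this series:

	/*
	 * Sketch only: 'bp' was obtained with xfs_buf_get_empty() and later
	 * bound to externally owned memory via xfs_buf_associate_memory().
	 * Reset it first so xfs_buf_free() does not touch pages the buffer
	 * does not own.
	 */
	static void example_buf_teardown(struct xfs_buf *bp, size_t len)
	{
		xfs_buf_set_empty(bp, len);	/* reset pointers, clear XBF_MAPPED */
		xfs_buf_free(bp);		/* now safe to free the buffer itself */
	}
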
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index a9a1c45..50a7d5f 100644
@@ -178,6 +178,7 @@ extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t,
                                xfs_buf_flags_t);
 
 extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
+extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len);
 extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int);
 extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
 extern void xfs_buf_hold(xfs_buf_t *);
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index b3486df..54e623b 100644
@@ -586,7 +586,8 @@ xfs_file_compat_ioctl(
        case XFS_IOC_RESVSP_32:
        case XFS_IOC_UNRESVSP_32:
        case XFS_IOC_RESVSP64_32:
-       case XFS_IOC_UNRESVSP64_32: {
+       case XFS_IOC_UNRESVSP64_32:
+       case XFS_IOC_ZERO_RANGE_32: {
                struct xfs_flock64      bf;
 
                if (xfs_compat_flock64_copyin(&bf, arg))
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h
index 08b6057..80f4060 100644
@@ -184,6 +184,7 @@ typedef struct compat_xfs_flock64 {
 #define XFS_IOC_UNRESVSP_32    _IOW('X', 41, struct compat_xfs_flock64)
 #define XFS_IOC_RESVSP64_32    _IOW('X', 42, struct compat_xfs_flock64)
 #define XFS_IOC_UNRESVSP64_32  _IOW('X', 43, struct compat_xfs_flock64)
+#define XFS_IOC_ZERO_RANGE_32  _IOW('X', 57, struct compat_xfs_flock64)
 
 typedef struct compat_xfs_fsop_geom_v1 {
        __u32           blocksize;      /* filesystem (data) block size */
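
For reference, the native ioctl this compat entry mirrors uses the same command
number; the line below is the assumed counterpart from fs/xfs/xfs_fs.h (not part
of this diff) and differs from XFS_IOC_ZERO_RANGE_32 only in using the natively
aligned xfs_flock64 layout:

	/* assumed native definition, shown for comparison only */
	#define XFS_IOC_ZERO_RANGE	_IOW ('X', 57, struct xfs_flock64)

With both defined on command number 57, the compat handler above can treat
XFS_IOC_ZERO_RANGE_32 exactly like the other preallocation ioctls: copy in the
32-bit flock64 and fall through to the common space-manipulation path.
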
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 244be9c..8633521 100644
@@ -70,6 +70,7 @@
 #include <linux/ctype.h>
 #include <linux/writeback.h>
 #include <linux/capability.h>
+#include <linux/list_sort.h>
 
 #include <asm/page.h>
 #include <asm/div64.h>
diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c
index 9f76cce..bd672de 100644
@@ -41,23 +41,6 @@ __xfs_printk(
        printk("%sXFS: %pV\n", level, vaf);
 }
 
-void xfs_printk(
-       const char              *level,
-       const struct xfs_mount  *mp,
-       const char              *fmt, ...)
-{
-       struct va_format        vaf;
-       va_list                 args;
-
-       va_start(args, fmt);
-
-       vaf.fmt = fmt;
-       vaf.va = &args;
-
-       __xfs_printk(level, mp, &vaf);
-       va_end(args);
-}
-
 #define define_xfs_printk_level(func, kern_level)              \
 void func(const struct xfs_mount *mp, const char *fmt, ...)    \
 {                                                              \
@@ -95,8 +78,7 @@ xfs_alert_tag(
        int                     do_panic = 0;
 
        if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
-               xfs_printk(KERN_ALERT, mp,
-                       "XFS: Transforming an alert into a BUG.");
+               xfs_alert(mp, "Transforming an alert into a BUG.");
                do_panic = 1;
        }
 
diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/linux-2.6/xfs_message.h
index f1b3fc1..7fb7ea0 100644
@@ -3,9 +3,6 @@
 
 struct xfs_mount;
 
-extern void xfs_printk(const char *level, const struct xfs_mount *mp,
-                      const char *fmt, ...)
-        __attribute__ ((format (printf, 3, 4)));
 extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...)
         __attribute__ ((format (printf, 2, 3)));
 extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...)
@@ -28,7 +25,9 @@ extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...)
 extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
         __attribute__ ((format (printf, 2, 3)));
 #else
-static inline void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
+static inline void
+__attribute__ ((format (printf, 2, 3)))
+xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
 {
 }
 #endif
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index b38e58d..b0aa59e 100644
@@ -1787,10 +1787,6 @@ init_xfs_fs(void)
        if (error)
                goto out_cleanup_procfs;
 
-       error = xfs_init_workqueues();
-       if (error)
-               goto out_sysctl_unregister;
-
        vfs_initquota();
 
        error = register_filesystem(&xfs_fs_type);
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 3e898a4..cb1bb20 100644
@@ -267,6 +267,16 @@ xfs_sync_inode_attr(
 
        error = xfs_iflush(ip, flags);
 
+       /*
+        * We don't want to try again on non-blocking flushes that can't run
+        * again immediately. If an inode really must be written, then that's
+        * what the SYNC_WAIT flag is for.
+        */
+       if (error == EAGAIN) {
+               ASSERT(!(flags & SYNC_WAIT));
+               error = 0;
+       }
+
  out_unlock:
        xfs_iunlock(ip, XFS_ILOCK_SHARED);
        return error;
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 2d0bcb4..d48b7a5 100644
@@ -1151,44 +1151,7 @@ TRACE_EVENT(xfs_bunmap,
 
 );
 
-#define XFS_BUSY_SYNC \
-       { 0,    "async" }, \
-       { 1,    "sync" }
-
-TRACE_EVENT(xfs_alloc_busy,
-       TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno,
-                xfs_agblock_t agbno, xfs_extlen_t len, int sync),
-       TP_ARGS(trans, agno, agbno, len, sync),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(struct xfs_trans *, tp)
-               __field(int, tid)
-               __field(xfs_agnumber_t, agno)
-               __field(xfs_agblock_t, agbno)
-               __field(xfs_extlen_t, len)
-               __field(int, sync)
-       ),
-       TP_fast_assign(
-               __entry->dev = trans->t_mountp->m_super->s_dev;
-               __entry->tp = trans;
-               __entry->tid = trans->t_ticket->t_tid;
-               __entry->agno = agno;
-               __entry->agbno = agbno;
-               __entry->len = len;
-               __entry->sync = sync;
-       ),
-       TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->tp,
-                 __entry->tid,
-                 __entry->agno,
-                 __entry->agbno,
-                 __entry->len,
-                 __print_symbolic(__entry->sync, XFS_BUSY_SYNC))
-
-);
-
-TRACE_EVENT(xfs_alloc_unbusy,
+DECLARE_EVENT_CLASS(xfs_busy_class,
        TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
                 xfs_agblock_t agbno, xfs_extlen_t len),
        TP_ARGS(mp, agno, agbno, len),
@@ -1210,35 +1173,45 @@ TRACE_EVENT(xfs_alloc_unbusy,
                  __entry->agbno,
                  __entry->len)
 );
+#define DEFINE_BUSY_EVENT(name) \
+DEFINE_EVENT(xfs_busy_class, name, \
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+                xfs_agblock_t agbno, xfs_extlen_t len), \
+       TP_ARGS(mp, agno, agbno, len))
+DEFINE_BUSY_EVENT(xfs_alloc_busy);
+DEFINE_BUSY_EVENT(xfs_alloc_busy_enomem);
+DEFINE_BUSY_EVENT(xfs_alloc_busy_force);
+DEFINE_BUSY_EVENT(xfs_alloc_busy_reuse);
+DEFINE_BUSY_EVENT(xfs_alloc_busy_clear);
 
-#define XFS_BUSY_STATES \
-       { 0,    "missing" }, \
-       { 1,    "found" }
-
-TRACE_EVENT(xfs_alloc_busysearch,
+TRACE_EVENT(xfs_alloc_busy_trim,
        TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
-                xfs_agblock_t agbno, xfs_extlen_t len, int found),
-       TP_ARGS(mp, agno, agbno, len, found),
+                xfs_agblock_t agbno, xfs_extlen_t len,
+                xfs_agblock_t tbno, xfs_extlen_t tlen),
+       TP_ARGS(mp, agno, agbno, len, tbno, tlen),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(xfs_agnumber_t, agno)
                __field(xfs_agblock_t, agbno)
                __field(xfs_extlen_t, len)
-               __field(int, found)
+               __field(xfs_agblock_t, tbno)
+               __field(xfs_extlen_t, tlen)
        ),
        TP_fast_assign(
                __entry->dev = mp->m_super->s_dev;
                __entry->agno = agno;
                __entry->agbno = agbno;
                __entry->len = len;
-               __entry->found = found;
+               __entry->tbno = tbno;
+               __entry->tlen = tlen;
        ),
-       TP_printk("dev %d:%d agno %u agbno %u len %u %s",
+       TP_printk("dev %d:%d agno %u agbno %u len %u tbno %u tlen %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->agno,
                  __entry->agbno,
                  __entry->len,
-                 __print_symbolic(__entry->found, XFS_BUSY_STATES))
+                 __entry->tbno,
+                 __entry->tlen)
 );
 
 TRACE_EVENT(xfs_trans_commit_lsn,
@@ -1418,7 +1391,7 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
                  __entry->wasfromfl,
                  __entry->isfl,
                  __entry->userdata,
-                 __entry->firstblock)
+                 (unsigned long long)__entry->firstblock)
 )
 
 #define DEFINE_ALLOC_EVENT(name) \
@@ -1433,11 +1406,14 @@ DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
 DEFINE_ALLOC_EVENT(xfs_alloc_near_greater);
 DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser);
 DEFINE_ALLOC_EVENT(xfs_alloc_near_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_busy);
 DEFINE_ALLOC_EVENT(xfs_alloc_size_neither);
 DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry);
 DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft);
 DEFINE_ALLOC_EVENT(xfs_alloc_size_done);
 DEFINE_ALLOC_EVENT(xfs_alloc_size_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_busy);
 DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist);
 DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough);
 DEFINE_ALLOC_EVENT(xfs_alloc_small_done);
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 58632cc..da0a561 100644
@@ -187,7 +187,6 @@ struct xfs_busy_extent {
        xfs_agnumber_t  agno;
        xfs_agblock_t   bno;
        xfs_extlen_t    length;
-       xlog_tid_t      tid;            /* transaction that created this */
 };
 
 /*
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 27d64d7..acdced8 100644
 #define        XFSA_FIXUP_BNO_OK       1
 #define        XFSA_FIXUP_CNT_OK       2
 
-/*
- * Prototypes for per-ag allocation routines
- */
-
 STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
 STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
 STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
 STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
-       xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
-
-/*
- * Internal functions.
- */
+               xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
+STATIC void xfs_alloc_busy_trim(struct xfs_alloc_arg *,
+               xfs_agblock_t, xfs_extlen_t, xfs_agblock_t *, xfs_extlen_t *);
 
 /*
  * Lookup the record equal to [bno, len] in the btree given by cur.
@@ -154,19 +148,21 @@ xfs_alloc_compute_aligned(
        xfs_extlen_t    *reslen)        /* result length */
 {
        xfs_agblock_t   bno;
-       xfs_extlen_t    diff;
        xfs_extlen_t    len;
 
-       if (args->alignment > 1 && foundlen >= args->minlen) {
-               bno = roundup(foundbno, args->alignment);
-               diff = bno - foundbno;
-               len = diff >= foundlen ? 0 : foundlen - diff;
+       /* Trim busy sections out of found extent */
+       xfs_alloc_busy_trim(args, foundbno, foundlen, &bno, &len);
+
+       if (args->alignment > 1 && len >= args->minlen) {
+               xfs_agblock_t   aligned_bno = roundup(bno, args->alignment);
+               xfs_extlen_t    diff = aligned_bno - bno;
+
+               *resbno = aligned_bno;
+               *reslen = diff >= len ? 0 : len - diff;
        } else {
-               bno = foundbno;
-               len = foundlen;
+               *resbno = bno;
+               *reslen = len;
        }
-       *resbno = bno;
-       *reslen = len;
 }
 
 /*
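
A short worked example of the reordered trim-then-align logic above (the numbers
are illustrative only): for a userdata allocation with args->alignment = 4 and
args->minlen = 4, a found extent starting at agbno 12 with length 10 whose first
two blocks are busy is first trimmed by xfs_alloc_busy_trim() to bno 14, len 8;
roundup(14, 4) then gives an aligned_bno of 16 with diff 2, so the result is
*resbno = 16 and *reslen = 6.
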
@@ -280,7 +276,6 @@ xfs_alloc_fix_minleft(
                return 1;
        agf = XFS_BUF_TO_AGF(args->agbp);
        diff = be32_to_cpu(agf->agf_freeblks)
-               + be32_to_cpu(agf->agf_flcount)
                - args->len - args->minleft;
        if (diff >= 0)
                return 1;
@@ -541,16 +536,8 @@ xfs_alloc_ag_vextent(
                if (error)
                        return error;
 
-               /*
-                * Search the busylist for these blocks and mark the
-                * transaction as synchronous if blocks are found. This
-                * avoids the need to block due to a synchronous log
-                * force to ensure correct ordering as the synchronous
-                * transaction will guarantee that for us.
-                */
-               if (xfs_alloc_busy_search(args->mp, args->agno,
-                                       args->agbno, args->len))
-                       xfs_trans_set_sync(args->tp);
+               ASSERT(!xfs_alloc_busy_search(args->mp, args->agno,
+                                             args->agbno, args->len));
        }
 
        if (!args->isfl) {
@@ -577,14 +564,14 @@ xfs_alloc_ag_vextent_exact(
 {
        xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */
        xfs_btree_cur_t *cnt_cur;/* by count btree cursor */
-       xfs_agblock_t   end;    /* end of allocated extent */
        int             error;
        xfs_agblock_t   fbno;   /* start block of found extent */
-       xfs_agblock_t   fend;   /* end block of found extent */
        xfs_extlen_t    flen;   /* length of found extent */
+       xfs_agblock_t   tbno;   /* start block of trimmed extent */
+       xfs_extlen_t    tlen;   /* length of trimmed extent */
+       xfs_agblock_t   tend;   /* end block of trimmed extent */
+       xfs_agblock_t   end;    /* end of allocated extent */
        int             i;      /* success/failure of operation */
-       xfs_agblock_t   maxend; /* end of maximal extent */
-       xfs_agblock_t   minend; /* end of minimal extent */
        xfs_extlen_t    rlen;   /* length of returned extent */
 
        ASSERT(args->alignment == 1);
@@ -614,14 +601,22 @@ xfs_alloc_ag_vextent_exact(
                goto error0;
        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
        ASSERT(fbno <= args->agbno);
-       minend = args->agbno + args->minlen;
-       maxend = args->agbno + args->maxlen;
-       fend = fbno + flen;
 
        /*
-        * Give up if the freespace isn't long enough for the minimum request.
+        * Check for overlapping busy extents.
+        */
+       xfs_alloc_busy_trim(args, fbno, flen, &tbno, &tlen);
+
+       /*
+        * Give up if the start of the extent is busy, or the freespace isn't
+        * long enough for the minimum request.
         */
-       if (fend < minend)
+       if (tbno > args->agbno)
+               goto not_found;
+       if (tlen < args->minlen)
+               goto not_found;
+       tend = tbno + tlen;
+       if (tend < args->agbno + args->minlen)
                goto not_found;
 
        /*
@@ -630,14 +625,14 @@ xfs_alloc_ag_vextent_exact(
         *
         * Fix the length according to mod and prod if given.
         */
-       end = XFS_AGBLOCK_MIN(fend, maxend);
+       end = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen);
        args->len = end - args->agbno;
        xfs_alloc_fix_len(args);
        if (!xfs_alloc_fix_minleft(args))
                goto not_found;
 
        rlen = args->len;
-       ASSERT(args->agbno + rlen <= fend);
+       ASSERT(args->agbno + rlen <= tend);
        end = args->agbno + rlen;
 
        /*
@@ -686,11 +681,11 @@ xfs_alloc_find_best_extent(
        struct xfs_btree_cur    **scur, /* searching cursor */
        xfs_agblock_t           gdiff,  /* difference for search comparison */
        xfs_agblock_t           *sbno,  /* extent found by search */
-       xfs_extlen_t            *slen,
-       xfs_extlen_t            *slena, /* aligned length */
+       xfs_extlen_t            *slen,  /* extent length */
+       xfs_agblock_t           *sbnoa, /* aligned extent found by search */
+       xfs_extlen_t            *slena, /* aligned extent length */
        int                     dir)    /* 0 = search right, 1 = search left */
 {
-       xfs_agblock_t           bno;
        xfs_agblock_t           new;
        xfs_agblock_t           sdiff;
        int                     error;
@@ -708,16 +703,16 @@ xfs_alloc_find_best_extent(
                if (error)
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               xfs_alloc_compute_aligned(args, *sbno, *slen, &bno, slena);
+               xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
 
                /*
                 * The good extent is closer than this one.
                 */
                if (!dir) {
-                       if (bno >= args->agbno + gdiff)
+                       if (*sbnoa >= args->agbno + gdiff)
                                goto out_use_good;
                } else {
-                       if (bno <= args->agbno - gdiff)
+                       if (*sbnoa <= args->agbno - gdiff)
                                goto out_use_good;
                }
 
@@ -729,8 +724,8 @@ xfs_alloc_find_best_extent(
                        xfs_alloc_fix_len(args);
 
                        sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-                                                      args->alignment, *sbno,
-                                                      *slen, &new);
+                                                      args->alignment, *sbnoa,
+                                                      *slena, &new);
 
                        /*
                         * Choose closer size and invalidate other cursor.
@@ -780,7 +775,7 @@ xfs_alloc_ag_vextent_near(
        xfs_agblock_t   gtbnoa;         /* aligned ... */
        xfs_extlen_t    gtdiff;         /* difference to right side entry */
        xfs_extlen_t    gtlen;          /* length of right side entry */
-       xfs_extlen_t    gtlena = 0;     /* aligned ... */
+       xfs_extlen_t    gtlena;         /* aligned ... */
        xfs_agblock_t   gtnew;          /* useful start bno of right side */
        int             error;          /* error code */
        int             i;              /* result code, temporary */
@@ -789,9 +784,10 @@ xfs_alloc_ag_vextent_near(
        xfs_agblock_t   ltbnoa;         /* aligned ... */
        xfs_extlen_t    ltdiff;         /* difference to left side entry */
        xfs_extlen_t    ltlen;          /* length of left side entry */
-       xfs_extlen_t    ltlena = 0;     /* aligned ... */
+       xfs_extlen_t    ltlena;         /* aligned ... */
        xfs_agblock_t   ltnew;          /* useful start bno of left side */
        xfs_extlen_t    rlen;           /* length of returned extent */
+       int             forced = 0;
 #if defined(DEBUG) && defined(__KERNEL__)
        /*
         * Randomly don't execute the first algorithm.
@@ -800,13 +796,20 @@ xfs_alloc_ag_vextent_near(
 
        dofirst = random32() & 1;
 #endif
+
+restart:
+       bno_cur_lt = NULL;
+       bno_cur_gt = NULL;
+       ltlen = 0;
+       gtlena = 0;
+       ltlena = 0;
+
        /*
         * Get a cursor for the by-size btree.
         */
        cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
                args->agno, XFS_BTNUM_CNT);
-       ltlen = 0;
-       bno_cur_lt = bno_cur_gt = NULL;
+
        /*
         * See if there are any free extents as big as maxlen.
         */
@@ -822,11 +825,13 @@ xfs_alloc_ag_vextent_near(
                        goto error0;
                if (i == 0 || ltlen == 0) {
                        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+                       trace_xfs_alloc_near_noentry(args);
                        return 0;
                }
                ASSERT(i == 1);
        }
        args->wasfromfl = 0;
+
        /*
         * First algorithm.
         * If the requested extent is large wrt the freespaces available
@@ -890,7 +895,7 @@ xfs_alloc_ag_vextent_near(
                        if (args->len < blen)
                                continue;
                        ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-                               args->alignment, ltbno, ltlen, &ltnew);
+                               args->alignment, ltbnoa, ltlena, &ltnew);
                        if (ltnew != NULLAGBLOCK &&
                            (args->len > blen || ltdiff < bdiff)) {
                                bdiff = ltdiff;
@@ -1042,11 +1047,12 @@ xfs_alloc_ag_vextent_near(
                        args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
                        xfs_alloc_fix_len(args);
                        ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-                               args->alignment, ltbno, ltlen, &ltnew);
+                               args->alignment, ltbnoa, ltlena, &ltnew);
 
                        error = xfs_alloc_find_best_extent(args,
                                                &bno_cur_lt, &bno_cur_gt,
-                                               ltdiff, &gtbno, &gtlen, &gtlena,
+                                               ltdiff, &gtbno, &gtlen,
+                                               &gtbnoa, &gtlena,
                                                0 /* search right */);
                } else {
                        ASSERT(gtlena >= args->minlen);
@@ -1057,11 +1063,12 @@ xfs_alloc_ag_vextent_near(
                        args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
                        xfs_alloc_fix_len(args);
                        gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-                               args->alignment, gtbno, gtlen, &gtnew);
+                               args->alignment, gtbnoa, gtlena, &gtnew);
 
                        error = xfs_alloc_find_best_extent(args,
                                                &bno_cur_gt, &bno_cur_lt,
-                                               gtdiff, &ltbno, &ltlen, &ltlena,
+                                               gtdiff, &ltbno, &ltlen,
+                                               &ltbnoa, &ltlena,
                                                1 /* search left */);
                }
 
@@ -1073,6 +1080,12 @@ xfs_alloc_ag_vextent_near(
         * If we couldn't get anything, give up.
         */
        if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
+               if (!forced++) {
+                       trace_xfs_alloc_near_busy(args);
+                       xfs_log_force(args->mp, XFS_LOG_SYNC);
+                       goto restart;
+               }
+
                trace_xfs_alloc_size_neither(args);
                args->agbno = NULLAGBLOCK;
                return 0;
@@ -1107,12 +1120,13 @@ xfs_alloc_ag_vextent_near(
                return 0;
        }
        rlen = args->len;
-       (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno,
-               ltlen, &ltnew);
+       (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
+                                    ltbnoa, ltlena, &ltnew);
        ASSERT(ltnew >= ltbno);
-       ASSERT(ltnew + rlen <= ltbno + ltlen);
+       ASSERT(ltnew + rlen <= ltbnoa + ltlena);
        ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
        args->agbno = ltnew;
+
        if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
                        ltnew, rlen, XFSA_FIXUP_BNO_OK)))
                goto error0;
@@ -1155,26 +1169,35 @@ xfs_alloc_ag_vextent_size(
        int             i;              /* temp status variable */
        xfs_agblock_t   rbno;           /* returned block number */
        xfs_extlen_t    rlen;           /* length of returned extent */
+       int             forced = 0;
 
+restart:
        /*
         * Allocate and initialize a cursor for the by-size btree.
         */
        cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
                args->agno, XFS_BTNUM_CNT);
        bno_cur = NULL;
+
        /*
         * Look for an entry >= maxlen+alignment-1 blocks.
         */
        if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
                        args->maxlen + args->alignment - 1, &i)))
                goto error0;
+
        /*
-        * If none, then pick up the last entry in the tree unless the
-        * tree is empty.
+        * If none are found, or there are busy extents we cannot allocate from,
+        * we have to settle for a smaller extent. In the case that there are
+        * no large extents, this will return the last entry in the tree unless
+        * the tree is empty. In the case that there are only busy large
+        * extents, this will return the largest small extent unless there
+        * are no smaller extents available.
         */
-       if (!i) {
-               if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &fbno,
-                               &flen, &i)))
+       if (!i || forced > 1) {
+               error = xfs_alloc_ag_vextent_small(args, cnt_cur,
+                                                  &fbno, &flen, &i);
+               if (error)
                        goto error0;
                if (i == 0 || flen == 0) {
                        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
@@ -1182,22 +1205,56 @@ xfs_alloc_ag_vextent_size(
                        return 0;
                }
                ASSERT(i == 1);
+               xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
+       } else {
+               /*
+                * Search for a non-busy extent that is large enough.
+                * If we are at low space, don't check, or if we fall off
+                * the end of the btree, turn off the busy check and
+                * restart.
+                */
+               for (;;) {
+                       error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
+                       if (error)
+                               goto error0;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+                       xfs_alloc_compute_aligned(args, fbno, flen,
+                                                 &rbno, &rlen);
+
+                       if (rlen >= args->maxlen)
+                               break;
+
+                       error = xfs_btree_increment(cnt_cur, 0, &i);
+                       if (error)
+                               goto error0;
+                       if (i == 0) {
+                               /*
+                                * Our only valid extents must have been busy.
+                                * Make them unbusy by forcing the log out and
+                                * retrying. If we've been here before, forcing
+                                * the log isn't making the extents available,
+                                * which means they have probably been freed in
+                                * this transaction.  In that case, we have to
+                                * give up on them and we'll attempt a minlen
+                                * allocation the next time around.
+                                */
+                               xfs_btree_del_cursor(cnt_cur,
+                                                    XFS_BTREE_NOERROR);
+                               trace_xfs_alloc_size_busy(args);
+                               if (!forced++)
+                                       xfs_log_force(args->mp, XFS_LOG_SYNC);
+                               goto restart;
+                       }
+               }
        }
-       /*
-        * There's a freespace as big as maxlen+alignment-1, get it.
-        */
-       else {
-               if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i)))
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-       }
+
        /*
         * In the first case above, we got the last entry in the
         * by-size btree.  Now we check to see if the space hits maxlen
         * once aligned; if not, we search left for something better.
         * This can't happen in the second case above.
         */
-       xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
        rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
        XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
                        (rlen <= flen && rbno + rlen <= fbno + flen), error0);
@@ -1251,13 +1308,19 @@ xfs_alloc_ag_vextent_size(
         * Fix up the length.
         */
        args->len = rlen;
-       xfs_alloc_fix_len(args);
-       if (rlen < args->minlen || !xfs_alloc_fix_minleft(args)) {
-               xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-               trace_xfs_alloc_size_nominleft(args);
-               args->agbno = NULLAGBLOCK;
-               return 0;
+       if (rlen < args->minlen) {
+               if (!forced++) {
+                       xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+                       trace_xfs_alloc_size_busy(args);
+                       xfs_log_force(args->mp, XFS_LOG_SYNC);
+                       goto restart;
+               }
+               goto out_nominleft;
        }
+       xfs_alloc_fix_len(args);
+
+       if (!xfs_alloc_fix_minleft(args))
+               goto out_nominleft;
        rlen = args->len;
        XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0);
        /*
@@ -1287,6 +1350,12 @@ error0:
        if (bno_cur)
                xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
        return error;
+
+out_nominleft:
+       xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+       trace_xfs_alloc_size_nominleft(args);
+       args->agbno = NULLAGBLOCK;
+       return 0;
 }
 
 /*
@@ -1326,6 +1395,9 @@ xfs_alloc_ag_vextent_small(
                if (error)
                        goto error0;
                if (fbno != NULLAGBLOCK) {
+                       xfs_alloc_busy_reuse(args->mp, args->agno, fbno, 1,
+                                            args->userdata);
+
                        if (args->userdata) {
                                xfs_buf_t       *bp;
 
@@ -1617,18 +1689,6 @@ xfs_free_ag_extent(
 
        trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
 
-       /*
-        * Since blocks move to the free list without the coordination
-        * used in xfs_bmap_finish, we can't allow block to be available
-        * for reallocation and non-transaction writing (user data)
-        * until we know that the transaction that moved it to the free
-        * list is permanently on disk.  We track the blocks by declaring
-        * these blocks as "busy"; the busy list is maintained on a per-ag
-        * basis and each transaction records which entries should be removed
-        * when the iclog commits to disk.  If a busy block is allocated,
-        * the iclog is pushed up to the LSN that freed the block.
-        */
-       xfs_alloc_busy_insert(tp, agno, bno, len);
        return 0;
 
  error0:
@@ -1923,21 +1983,6 @@ xfs_alloc_get_freelist(
        xfs_alloc_log_agf(tp, agbp, logflags);
        *bnop = bno;
 
-       /*
-        * As blocks are freed, they are added to the per-ag busy list and
-        * remain there until the freeing transaction is committed to disk.
-        * Now that we have allocated blocks, this list must be searched to see
-        * if a block is being reused.  If one is, then the freeing transaction
-        * must be pushed to disk before this transaction.
-        *
-        * We do this by setting the current transaction to a sync transaction
-        * which guarantees that the freeing transaction is on disk before this
-        * transaction. This is done instead of a synchronous log force here so
-        * that we don't sit and wait with the AGF locked in the transaction
-        * during the log force.
-        */
-       if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1))
-               xfs_trans_set_sync(tp);
        return 0;
 }
 
@@ -2423,105 +2468,13 @@ xfs_free_extent(
        }
 
        error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
+       if (!error)
+               xfs_alloc_busy_insert(tp, args.agno, args.agbno, len);
 error0:
        xfs_perag_put(args.pag);
        return error;
 }
 
-
-/*
- * AG Busy list management
- * The busy list contains block ranges that have been freed but whose
- * transactions have not yet hit disk.  If any block listed in a busy
- * list is reused, the transaction that freed it must be forced to disk
- * before continuing to use the block.
- *
- * xfs_alloc_busy_insert - add to the per-ag busy list
- * xfs_alloc_busy_clear - remove an item from the per-ag busy list
- * xfs_alloc_busy_search - search for a busy extent
- */
-
-/*
- * Insert a new extent into the busy tree.
- *
- * The busy extent tree is indexed by the start block of the busy extent.
- * there can be multiple overlapping ranges in the busy extent tree but only
- * ever one entry at a given start block. The reason for this is that
- * multi-block extents can be freed, then smaller chunks of that extent
- * allocated and freed again before the first transaction commit is on disk.
- * If the exact same start block is freed a second time, we have to wait for
- * that busy extent to pass out of the tree before the new extent is inserted.
- * There are two main cases we have to handle here.
- *
- * The first case is a transaction that triggers a "free - allocate - free"
- * cycle. This can occur during btree manipulations as a btree block is freed
- * to the freelist, then allocated from the free list, then freed again. In
- * this case, the second extxpnet free is what triggers the duplicate and as
- * such the transaction IDs should match. Because the extent was allocated in
- * this transaction, the transaction must be marked as synchronous. This is
- * true for all cases where the free/alloc/free occurs in the one transaction,
- * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case.
- * This serves to catch violations of the second case quite effectively.
- *
- * The second case is where the free/alloc/free occur in different
- * transactions. In this case, the thread freeing the extent the second time
- * can't mark the extent busy immediately because it is already tracked in a
- * transaction that may be committing.  When the log commit for the existing
- * busy extent completes, the busy extent will be removed from the tree. If we
- * allow the second busy insert to continue using that busy extent structure,
- * it can be freed before this transaction is safely in the log.  Hence our
- * only option in this case is to force the log to remove the existing busy
- * extent from the list before we insert the new one with the current
- * transaction ID.
- *
- * The problem we are trying to avoid in the free-alloc-free in separate
- * transactions is most easily described with a timeline:
- *
- *      Thread 1       Thread 2        Thread 3        xfslogd
- *     xact alloc
- *     free X
- *     mark busy
- *     commit xact
- *     free xact
- *                     xact alloc
- *                     alloc X
- *                     busy search
- *                     mark xact sync
- *                     commit xact
- *                     free xact
- *                     force log
- *                     checkpoint starts
- *                     ....
- *                                     xact alloc
- *                                     free X
- *                                     mark busy
- *                                     finds match
- *                                     *** KABOOM! ***
- *                                     ....
- *                                                     log IO completes
- *                                                     unbusy X
- *                     checkpoint completes
- *
- * By issuing a log force in thread 3 @ "KABOOM", the thread will block until
- * the checkpoint completes, and the busy extent it matched will have been
- * removed from the tree when it is woken. Hence it can then continue safely.
- *
- * However, to ensure this matching process is robust, we need to use the
- * transaction ID for identifying transaction, as delayed logging results in
- * the busy extent and transaction lifecycles being different. i.e. the busy
- * extent is active for a lot longer than the transaction.  Hence the
- * transaction structure can be freed and reallocated, then mark the same
- * extent busy again in the new transaction. In this case the new transaction
- * will have a different tid but can have the same address, and hence we need
- * to check against the tid.
- *
- * Future: for delayed logging, we could avoid the log force if the extent was
- * first freed in the current checkpoint sequence. This, however, requires the
- * ability to pin the current checkpoint in memory until this transaction
- * commits to ensure that both the original free and the current one combine
- * logically into the one checkpoint. If the checkpoint sequences are
- * different, however, we still need to wait on a log force.
- */
 void
 xfs_alloc_busy_insert(
        struct xfs_trans        *tp,
@@ -2533,9 +2486,7 @@ xfs_alloc_busy_insert(
        struct xfs_busy_extent  *busyp;
        struct xfs_perag        *pag;
        struct rb_node          **rbp;
-       struct rb_node          *parent;
-       int                     match;
-
+       struct rb_node          *parent = NULL;
 
        new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
        if (!new) {
@@ -2544,7 +2495,7 @@ xfs_alloc_busy_insert(
                 * block, make this a synchronous transaction to insure that
                 * the block is not reused before this transaction commits.
                 */
-               trace_xfs_alloc_busy(tp, agno, bno, len, 1);
+               trace_xfs_alloc_busy_enomem(tp->t_mountp, agno, bno, len);
                xfs_trans_set_sync(tp);
                return;
        }
@@ -2552,66 +2503,28 @@ xfs_alloc_busy_insert(
        new->agno = agno;
        new->bno = bno;
        new->length = len;
-       new->tid = xfs_log_get_trans_ident(tp);
-
        INIT_LIST_HEAD(&new->list);
 
        /* trace before insert to be able to see failed inserts */
-       trace_xfs_alloc_busy(tp, agno, bno, len, 0);
+       trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len);
 
        pag = xfs_perag_get(tp->t_mountp, new->agno);
-restart:
        spin_lock(&pag->pagb_lock);
        rbp = &pag->pagb_tree.rb_node;
-       parent = NULL;
-       busyp = NULL;
-       match = 0;
-       while (*rbp && match >= 0) {
+       while (*rbp) {
                parent = *rbp;
                busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
 
                if (new->bno < busyp->bno) {
-                       /* may overlap, but exact start block is lower */
                        rbp = &(*rbp)->rb_left;
-                       if (new->bno + new->length > busyp->bno)
-                               match = busyp->tid == new->tid ? 1 : -1;
+                       ASSERT(new->bno + new->length <= busyp->bno);
                } else if (new->bno > busyp->bno) {
-                       /* may overlap, but exact start block is higher */
                        rbp = &(*rbp)->rb_right;
-                       if (bno < busyp->bno + busyp->length)
-                               match = busyp->tid == new->tid ? 1 : -1;
+                       ASSERT(bno >= busyp->bno + busyp->length);
                } else {
-                       match = busyp->tid == new->tid ? 1 : -1;
-                       break;
+                       ASSERT(0);
                }
        }
-       if (match < 0) {
-               /* overlap marked busy in different transaction */
-               spin_unlock(&pag->pagb_lock);
-               xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
-               goto restart;
-       }
-       if (match > 0) {
-               /*
-                * overlap marked busy in same transaction. Update if exact
-                * start block match, otherwise combine the busy extents into
-                * a single range.
-                */
-               if (busyp->bno == new->bno) {
-                       busyp->length = max(busyp->length, new->length);
-                       spin_unlock(&pag->pagb_lock);
-                       ASSERT(tp->t_flags & XFS_TRANS_SYNC);
-                       xfs_perag_put(pag);
-                       kmem_free(new);
-                       return;
-               }
-               rb_erase(&busyp->rb_node, &pag->pagb_tree);
-               new->length = max(busyp->bno + busyp->length,
-                                       new->bno + new->length) -
-                               min(busyp->bno, new->bno);
-               new->bno = min(busyp->bno, new->bno);
-       } else
-               busyp = NULL;
 
        rb_link_node(&new->rb_node, parent, rbp);
        rb_insert_color(&new->rb_node, &pag->pagb_tree);
@@ -2619,7 +2532,6 @@ restart:
        list_add(&new->list, &tp->t_busy);
        spin_unlock(&pag->pagb_lock);
        xfs_perag_put(pag);
-       kmem_free(busyp);
 }
 
 /*
@@ -2668,31 +2580,443 @@ xfs_alloc_busy_search(
                }
        }
        spin_unlock(&pag->pagb_lock);
-       trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match);
        xfs_perag_put(pag);
        return match;
 }
 
+/*
+ * The found free extent [fbno, fend] overlaps part or all of the given busy
+ * extent.  If the overlap covers the beginning, the end, or all of the busy
+ * extent, the overlapping portion can be made unbusy and used for the
+ * allocation.  We can't split a busy extent because we can't modify a
+ * transaction/CIL context busy list, but we can update an entry's block
+ * number or length.
+ *
+ * Returns true if the extent can safely be reused, or false if the search
+ * needs to be restarted.
+ */
+STATIC bool
+xfs_alloc_busy_update_extent(
+       struct xfs_mount        *mp,
+       struct xfs_perag        *pag,
+       struct xfs_busy_extent  *busyp,
+       xfs_agblock_t           fbno,
+       xfs_extlen_t            flen,
+       bool                    userdata)
+{
+       xfs_agblock_t           fend = fbno + flen;
+       xfs_agblock_t           bbno = busyp->bno;
+       xfs_agblock_t           bend = bbno + busyp->length;
+
+       /*
+        * If there is a busy extent overlapping a user allocation, we have
+        * no choice but to force the log and retry the search.
+        *
+        * Fortunately this does not happen during normal operation, but
+        * only if the filesystem is very low on space and has to dip into
+        * the AGFL for normal allocations.
+        */
+       if (userdata)
+               goto out_force_log;
+
+       if (bbno < fbno && bend > fend) {
+               /*
+                * Case 1:
+                *    bbno           bend
+                *    +BBBBBBBBBBBBBBBBB+
+                *        +---------+
+                *        fbno   fend
+                */
+
+               /*
+                * We would have to split the busy extent to be able to track
+                * it correctly, which we cannot do because we would have to
+                * modify the list of busy extents attached to the transaction
+                * or CIL context, which is immutable.
+                *
+                * Force out the log to clear the busy extent and retry the
+                * search.
+                */
+               goto out_force_log;
+       } else if (bbno >= fbno && bend <= fend) {
+               /*
+                * Case 2:
+                *    bbno           bend
+                *    +BBBBBBBBBBBBBBBBB+
+                *    +-----------------+
+                *    fbno           fend
+                *
+                * Case 3:
+                *    bbno           bend
+                *    +BBBBBBBBBBBBBBBBB+
+                *    +--------------------------+
+                *    fbno                    fend
+                *
+                * Case 4:
+                *             bbno           bend
+                *             +BBBBBBBBBBBBBBBBB+
+                *    +--------------------------+
+                *    fbno                    fend
+                *
+                * Case 5:
+                *             bbno           bend
+                *             +BBBBBBBBBBBBBBBBB+
+                *    +-----------------------------------+
+                *    fbno                             fend
+                *
+                */
+
+               /*
+                * The busy extent is fully covered by the extent we are
+                * allocating, and can simply be removed from the rbtree.
+                * However we cannot remove it from the immutable list
+                * tracking busy extents in the transaction or CIL context,
+                * so set the length to zero to mark it invalid.
+                *
+                * We also need to restart the busy extent search from the
+                * tree root, because erasing the node can rearrange the
+                * tree topology.
+                */
+               rb_erase(&busyp->rb_node, &pag->pagb_tree);
+               busyp->length = 0;
+               return false;
+       } else if (fend < bend) {
+               /*
+                * Case 6:
+                *              bbno           bend
+                *             +BBBBBBBBBBBBBBBBB+
+                *             +---------+
+                *             fbno   fend
+                *
+                * Case 7:
+                *             bbno           bend
+                *             +BBBBBBBBBBBBBBBBB+
+                *    +------------------+
+                *    fbno            fend
+                *
+                */
+               busyp->bno = fend;
+       } else if (bbno < fbno) {
+               /*
+                * Case 8:
+                *    bbno           bend
+                *    +BBBBBBBBBBBBBBBBB+
+                *        +-------------+
+                *        fbno       fend
+                *
+                * Case 9:
+                *    bbno           bend
+                *    +BBBBBBBBBBBBBBBBB+
+                *        +----------------------+
+                *        fbno                fend
+                */
+               busyp->length = fbno - busyp->bno;
+       } else {
+               ASSERT(0);
+       }
+
+       trace_xfs_alloc_busy_reuse(mp, pag->pag_agno, fbno, flen);
+       return true;
+
+out_force_log:
+       spin_unlock(&pag->pagb_lock);
+       xfs_log_force(mp, XFS_LOG_SYNC);
+       trace_xfs_alloc_busy_force(mp, pag->pag_agno, fbno, flen);
+       spin_lock(&pag->pagb_lock);
+       return false;
+}
+
+
+/*
+ * For a given extent [fbno, flen], make sure we can reuse it safely.
+ */
 void
-xfs_alloc_busy_clear(
+xfs_alloc_busy_reuse(
        struct xfs_mount        *mp,
-       struct xfs_busy_extent  *busyp)
+       xfs_agnumber_t          agno,
+       xfs_agblock_t           fbno,
+       xfs_extlen_t            flen,
+       bool                    userdata)
 {
        struct xfs_perag        *pag;
+       struct rb_node          *rbp;
 
-       trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno,
-                                               busyp->length);
+       ASSERT(flen > 0);
 
-       ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno,
-                                               busyp->length) == 1);
+       pag = xfs_perag_get(mp, agno);
+       spin_lock(&pag->pagb_lock);
+restart:
+       rbp = pag->pagb_tree.rb_node;
+       while (rbp) {
+               struct xfs_busy_extent *busyp =
+                       rb_entry(rbp, struct xfs_busy_extent, rb_node);
+               xfs_agblock_t   bbno = busyp->bno;
+               xfs_agblock_t   bend = bbno + busyp->length;
 
-       list_del_init(&busyp->list);
+               if (fbno + flen <= bbno) {
+                       rbp = rbp->rb_left;
+                       continue;
+               } else if (fbno >= bend) {
+                       rbp = rbp->rb_right;
+                       continue;
+               }
 
-       pag = xfs_perag_get(mp, busyp->agno);
-       spin_lock(&pag->pagb_lock);
-       rb_erase(&busyp->rb_node, &pag->pagb_tree);
+               if (!xfs_alloc_busy_update_extent(mp, pag, busyp, fbno, flen,
+                                                 userdata))
+                       goto restart;
+       }
        spin_unlock(&pag->pagb_lock);
        xfs_perag_put(pag);
+}
+
+/*
+ * For a given extent [fbno, flen], search the busy extent list to find a
+ * subset of the extent that is not busy.  If *rlen is smaller than
+ * args->minlen no suitable extent could be found, and the higher level
+ * code needs to force out the log and retry the allocation.
+ */
+STATIC void
+xfs_alloc_busy_trim(
+       struct xfs_alloc_arg    *args,
+       xfs_agblock_t           bno,
+       xfs_extlen_t            len,
+       xfs_agblock_t           *rbno,
+       xfs_extlen_t            *rlen)
+{
+       xfs_agblock_t           fbno;
+       xfs_extlen_t            flen;
+       struct rb_node          *rbp;
+
+       ASSERT(len > 0);
 
+       spin_lock(&args->pag->pagb_lock);
+restart:
+       fbno = bno;
+       flen = len;
+       rbp = args->pag->pagb_tree.rb_node;
+       while (rbp && flen >= args->minlen) {
+               struct xfs_busy_extent *busyp =
+                       rb_entry(rbp, struct xfs_busy_extent, rb_node);
+               xfs_agblock_t   fend = fbno + flen;
+               xfs_agblock_t   bbno = busyp->bno;
+               xfs_agblock_t   bend = bbno + busyp->length;
+
+               if (fend <= bbno) {
+                       rbp = rbp->rb_left;
+                       continue;
+               } else if (fbno >= bend) {
+                       rbp = rbp->rb_right;
+                       continue;
+               }
+
+               /*
+                * If this is a metadata allocation, try to reuse the busy
+                * extent instead of trimming the allocation.
+                */
+               if (!args->userdata) {
+                       if (!xfs_alloc_busy_update_extent(args->mp, args->pag,
+                                                         busyp, fbno, flen,
+                                                         false))
+                               goto restart;
+                       continue;
+               }
+
+               if (bbno <= fbno) {
+                       /* start overlap */
+
+                       /*
+                        * Case 1:
+                        *    bbno           bend
+                        *    +BBBBBBBBBBBBBBBBB+
+                        *        +---------+
+                        *        fbno   fend
+                        *
+                        * Case 2:
+                        *    bbno           bend
+                        *    +BBBBBBBBBBBBBBBBB+
+                        *    +-------------+
+                        *    fbno       fend
+                        *
+                        * Case 3:
+                        *    bbno           bend
+                        *    +BBBBBBBBBBBBBBBBB+
+                        *        +-------------+
+                        *        fbno       fend
+                        *
+                        * Case 4:
+                        *    bbno           bend
+                        *    +BBBBBBBBBBBBBBBBB+
+                        *    +-----------------+
+                        *    fbno           fend
+                        *
+                        * No unbusy region in extent, return failure.
+                        */
+                       if (fend <= bend)
+                               goto fail;
+
+                       /*
+                        * Case 5:
+                        *    bbno           bend
+                        *    +BBBBBBBBBBBBBBBBB+
+                        *        +----------------------+
+                        *        fbno                fend
+                        *
+                        * Case 6:
+                        *    bbno           bend
+                        *    +BBBBBBBBBBBBBBBBB+
+                        *    +--------------------------+
+                        *    fbno                    fend
+                        *
+                        * Needs to be trimmed to:
+                        *                       +-------+
+                        *                       fbno fend
+                        */
+                       fbno = bend;
+               } else if (bend >= fend) {
+                       /* end overlap */
+
+                       /*
+                        * Case 7:
+                        *             bbno           bend
+                        *             +BBBBBBBBBBBBBBBBB+
+                        *    +------------------+
+                        *    fbno            fend
+                        *
+                        * Case 8:
+                        *             bbno           bend
+                        *             +BBBBBBBBBBBBBBBBB+
+                        *    +--------------------------+
+                        *    fbno                    fend
+                        *
+                        * Needs to be trimmed to:
+                        *    +-------+
+                        *    fbno fend
+                        */
+                       fend = bbno;
+               } else {
+                       /* middle overlap */
+
+                       /*
+                        * Case 9:
+                        *             bbno           bend
+                        *             +BBBBBBBBBBBBBBBBB+
+                        *    +-----------------------------------+
+                        *    fbno                             fend
+                        *
+                        * Can be trimmed to:
+                        *    +-------+        OR         +-------+
+                        *    fbno fend                   fbno fend
+                        *
+                        * Backward allocation leads to significant
+                        * fragmentation of directories, which degrades
+                        * directory performance, so we always want to
+                        * choose the option that produces forward allocation
+                        * patterns.
+                        * Preferring the lower bno extent will make the next
+                        * request use "fend" as the start of the next
+                        * allocation;  if the segment is no longer busy at
+                        * that point, we'll get a contiguous allocation, but
+                        * even if it is still busy, we will get a forward
+                        * allocation.
+                        * We try to avoid choosing the segment at "bend",
+                        * because that can lead to the next allocation
+                        * taking the segment at "fbno", which would be a
+                        * backward allocation.  We only use the segment at
+                        * "bend" if it is much larger than the current
+                        * requested size, because in that case there's a
+                        * good chance subsequent allocations will be
+                        * contiguous.
+                        */
+                       if (bbno - fbno >= args->maxlen) {
+                               /* left candidate fits perfectly */
+                               fend = bbno;
+                       } else if (fend - bend >= args->maxlen * 4) {
+                               /* right candidate has enough free space */
+                               fbno = bend;
+                       } else if (bbno - fbno >= args->minlen) {
+                               /* left candidate fits minimum requirement */
+                               fend = bbno;
+                       } else {
+                               goto fail;
+                       }
+               }
+
+               flen = fend - fbno;
+       }
+       spin_unlock(&args->pag->pagb_lock);
+
+       if (fbno != bno || flen != len) {
+               trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len,
+                                         fbno, flen);
+       }
+       *rbno = fbno;
+       *rlen = flen;
+       return;
+fail:
+       /*
+        * Return a zero extent length as a failure indication.  All callers
+        * re-check if the trimmed extent satisfies the minlen requirement.
+        */
+       spin_unlock(&args->pag->pagb_lock);
+       trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
+       *rbno = fbno;
+       *rlen = 0;
+}
+
+static void
+xfs_alloc_busy_clear_one(
+       struct xfs_mount        *mp,
+       struct xfs_perag        *pag,
+       struct xfs_busy_extent  *busyp)
+{
+       if (busyp->length) {
+               trace_xfs_alloc_busy_clear(mp, busyp->agno, busyp->bno,
+                                               busyp->length);
+               rb_erase(&busyp->rb_node, &pag->pagb_tree);
+       }
+
+       list_del_init(&busyp->list);
        kmem_free(busyp);
 }
+
+void
+xfs_alloc_busy_clear(
+       struct xfs_mount        *mp,
+       struct list_head        *list)
+{
+       struct xfs_busy_extent  *busyp, *n;
+       struct xfs_perag        *pag = NULL;
+       xfs_agnumber_t          agno = NULLAGNUMBER;
+
+       list_for_each_entry_safe(busyp, n, list, list) {
+               if (busyp->agno != agno) {
+                       if (pag) {
+                               spin_unlock(&pag->pagb_lock);
+                               xfs_perag_put(pag);
+                       }
+                       pag = xfs_perag_get(mp, busyp->agno);
+                       spin_lock(&pag->pagb_lock);
+                       agno = busyp->agno;
+               }
+
+               xfs_alloc_busy_clear_one(mp, pag, busyp);
+       }
+
+       if (pag) {
+               spin_unlock(&pag->pagb_lock);
+               xfs_perag_put(pag);
+       }
+}
+
+/*
+ * Callback for list_sort to sort busy extents by the AG they reside in.
+ */
+int
+xfs_busy_extent_ag_cmp(
+       void                    *priv,
+       struct list_head        *a,
+       struct list_head        *b)
+{
+       return container_of(a, struct xfs_busy_extent, list)->agno -
+               container_of(b, struct xfs_busy_extent, list)->agno;
+}
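
The middle-overlap trimming above (case 9) packs three decisions into one
if/else chain: prefer the low part of the free extent, fall back to the high
part only when it is much larger than the request, and fail when neither part
satisfies minlen.  A rough userspace sketch of the same heuristic follows;
trim_middle_overlap() and the plain unsigned block numbers are illustrative
stand-ins, not XFS code.

/*
 * Standalone model of the "middle overlap" trim decision; the thresholds
 * mirror the logic above but nothing here is the real allocator.
 */
#include <stdbool.h>
#include <stdio.h>

/* Trim free range [*fbno, *fend) around busy range [bbno, bend). */
static bool
trim_middle_overlap(unsigned *fbno, unsigned *fend, unsigned bbno,
                    unsigned bend, unsigned minlen, unsigned maxlen)
{
        if (bbno - *fbno >= maxlen) {
                *fend = bbno;           /* low part fits the full request */
        } else if (*fend - bend >= maxlen * 4) {
                *fbno = bend;           /* high part is much larger; go forward */
        } else if (bbno - *fbno >= minlen) {
                *fend = bbno;           /* low part still satisfies minlen */
        } else {
                return false;           /* no usable unbusy region */
        }
        return true;
}

int main(void)
{
        unsigned fbno = 100, fend = 200;

        /* busy [120, 140), request of 8..16 blocks: low part [100, 120) wins */
        if (trim_middle_overlap(&fbno, &fend, 120, 140, 8, 16))
                printf("trimmed to [%u, %u)\n", fbno, fend);
        return 0;
}
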
index d0b3bc7..240ad28 100644 (file)
@@ -140,11 +140,24 @@ xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
        xfs_agblock_t bno, xfs_extlen_t len);
 
 void
-xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
+xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list);
 
 int
 xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
        xfs_agblock_t bno, xfs_extlen_t len);
+
+void
+xfs_alloc_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno,
+       xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata);
+
+int
+xfs_busy_extent_ag_cmp(void *priv, struct list_head *a, struct list_head *b);
+
+static inline void xfs_alloc_busy_sort(struct list_head *list)
+{
+       list_sort(NULL, list, xfs_busy_extent_ag_cmp);
+}
+
 #endif /* __KERNEL__ */
 
 /*
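
The xfs_alloc_busy_sort() wrapper added above exists so that
xfs_alloc_busy_clear() only has to drop and re-take pagb_lock when the AG
number changes while it walks the busy list.  The standalone sketch below
models that effect with an array, qsort() and a counter standing in for
list_sort() and the real per-AG lock; all names in it are invented for
illustration.

/*
 * Sort busy records by AG, then walk them; the "lock" is taken once per
 * run of same-AG entries rather than once per entry.
 */
#include <stdio.h>
#include <stdlib.h>

struct busy { unsigned agno, bno; };

static int cmp_agno(const void *a, const void *b)
{
        return (int)((const struct busy *)a)->agno -
               (int)((const struct busy *)b)->agno;
}

int main(void)
{
        struct busy list[] = {
                { 2, 10 }, { 0, 7 }, { 2, 99 }, { 1, 3 }, { 0, 42 },
        };
        size_t n = sizeof(list) / sizeof(list[0]);
        size_t i;
        unsigned cur_agno = (unsigned)-1;       /* stand-in for NULLAGNUMBER */
        unsigned lock_roundtrips = 0;

        qsort(list, n, sizeof(list[0]), cmp_agno);

        for (i = 0; i < n; i++) {
                if (list[i].agno != cur_agno) {
                        /* unlock the old AG, lock the new one */
                        cur_agno = list[i].agno;
                        lock_roundtrips++;
                }
                printf("clear agno=%u bno=%u\n", list[i].agno, list[i].bno);
        }
        printf("%u lock roundtrips for %zu extents\n", lock_roundtrips, n);
        return 0;
}
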
index 3916925..8b469d5 100644 (file)
@@ -95,6 +95,8 @@ xfs_allocbt_alloc_block(
                return 0;
        }
 
+       xfs_alloc_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false);
+
        xfs_trans_agbtree_delta(cur->bc_tp, 1);
        new->s = cpu_to_be32(bno);
 
@@ -118,17 +120,6 @@ xfs_allocbt_free_block(
        if (error)
                return error;
 
-       /*
-        * Since blocks move to the free list without the coordination used in
-        * xfs_bmap_finish, we can't allow block to be available for
-        * reallocation and non-transaction writing (user data) until we know
-        * that the transaction that moved it to the free list is permanently
-        * on disk. We track the blocks by declaring these blocks as "busy";
-        * the busy list is maintained on a per-ag basis and each transaction
-        * records which entries should be removed when the iclog commits to
-        * disk. If a busy block is allocated, the iclog is pushed up to the
-        * LSN that freed the block.
-        */
        xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
        xfs_trans_agbtree_delta(cur->bc_tp, -1);
        return 0;
index be62867..9a84a85 100644 (file)
@@ -202,7 +202,7 @@ xfs_swap_extents(
        xfs_inode_t     *tip,   /* tmp inode */
        xfs_swapext_t   *sxp)
 {
-       xfs_mount_t     *mp;
+       xfs_mount_t     *mp = ip->i_mount;
        xfs_trans_t     *tp;
        xfs_bstat_t     *sbp = &sxp->sx_stat;
        xfs_ifork_t     *tempifp, *ifp, *tifp;
@@ -212,16 +212,12 @@ xfs_swap_extents(
        int             taforkblks = 0;
        __uint64_t      tmp;
 
-       mp = ip->i_mount;
-
        tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
        if (!tempifp) {
                error = XFS_ERROR(ENOMEM);
                goto out;
        }
 
-       sbp = &sxp->sx_stat;
-
        /*
         * we have to do two separate lock calls here to keep lockdep
         * happy. If we try to get all the locks in one call, lock will
index d11ce61..c8e3349 100644 (file)
@@ -1354,7 +1354,7 @@ xfs_itruncate_start(
                return 0;
        }
        last_byte = xfs_file_last_byte(ip);
-       trace_xfs_itruncate_start(ip, flags, new_size, toss_start, last_byte);
+       trace_xfs_itruncate_start(ip, new_size, flags, toss_start, last_byte);
        if (last_byte > toss_start) {
                if (flags & XFS_ITRUNC_DEFINITE) {
                        xfs_tosspages(ip, toss_start,
index 576fdfe..09983a3 100644 (file)
@@ -970,7 +970,6 @@ xfs_iflush_abort(
 {
        xfs_inode_log_item_t    *iip = ip->i_itemp;
 
-       iip = ip->i_itemp;
        if (iip) {
                struct xfs_ail  *ailp = iip->ili_item.li_ailp;
                if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
index b612ce4..2119302 100644 (file)
@@ -1449,6 +1449,13 @@ xlog_dealloc_log(xlog_t *log)
 
        xlog_cil_destroy(log);
 
+       /*
+        * always need to ensure that the extra buffer does not point to memory
+        * owned by another log buffer before we free it.
+        */
+       xfs_buf_set_empty(log->l_xbuf, log->l_iclog_size);
+       xfs_buf_free(log->l_xbuf);
+
        iclog = log->l_iclog;
        for (i=0; i<log->l_iclog_bufs; i++) {
                xfs_buf_free(iclog->ic_bp);
@@ -1458,7 +1465,6 @@ xlog_dealloc_log(xlog_t *log)
        }
        spinlock_destroy(&log->l_icloglock);
 
-       xfs_buf_free(log->l_xbuf);
        log->l_mp->m_log = NULL;
        kmem_free(log);
 }      /* xlog_dealloc_log */
@@ -3248,13 +3254,6 @@ xfs_log_ticket_get(
        return ticket;
 }
 
-xlog_tid_t
-xfs_log_get_trans_ident(
-       struct xfs_trans        *tp)
-{
-       return tp->t_ticket->t_tid;
-}
-
 /*
  * Allocate and initialise a new log ticket.
  */
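
The xlog_dealloc_log() change above depends on the new xfs_buf_set_empty():
l_xbuf is still associated with memory owned by an iclog buffer, so it has to
be detached before it is freed.  The userspace model below shows the same
ordering rule with invented buf_* helpers; it is only an illustration of the
hazard, not the XFS buffer code.

/*
 * A wrapper buffer that borrows another buffer's storage must drop the
 * reference before teardown, or the owner's memory is freed twice.
 */
#include <stdlib.h>

struct buf {
        void    *addr;          /* may alias memory owned by another buffer */
        size_t  len;
};

static void buf_set_empty(struct buf *bp, size_t len)
{
        bp->addr = NULL;        /* drop the alias so teardown won't touch it */
        bp->len = len;
}

static void buf_free(struct buf *bp)
{
        free(bp->addr);         /* harmless when addr is NULL */
        free(bp);
}

int main(void)
{
        struct buf *owner = calloc(1, sizeof(*owner));
        struct buf *extra = calloc(1, sizeof(*extra));

        owner->addr = malloc(4096);
        owner->len = 4096;

        /* the extra buffer temporarily borrows the owner's memory */
        extra->addr = owner->addr;
        extra->len = owner->len;

        buf_set_empty(extra, 4096);     /* without this, the next two frees */
        buf_free(extra);                /* would release owner->addr twice  */
        buf_free(owner);
        return 0;
}
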
index 3bd3291..78c9039 100644 (file)
@@ -189,8 +189,6 @@ void          xlog_iodone(struct xfs_buf *);
 struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
 void     xfs_log_ticket_put(struct xlog_ticket *ticket);
 
-xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
-
 void   xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
                                struct xfs_log_vec *log_vector,
                                xfs_lsn_t *commit_lsn, int flags);
index 9ca59be..7d56e88 100644 (file)
@@ -361,13 +361,12 @@ xlog_cil_committed(
        int     abort)
 {
        struct xfs_cil_ctx      *ctx = args;
-       struct xfs_busy_extent  *busyp, *n;
 
        xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
                                        ctx->start_lsn, abort);
 
-       list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
-               xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
+       xfs_alloc_busy_sort(&ctx->busy_extents);
+       xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, &ctx->busy_extents);
 
        spin_lock(&ctx->cil->xc_cil_lock);
        list_del(&ctx->committing);
index 5864850..2d3b6a4 100644 (file)
@@ -146,6 +146,8 @@ static inline uint xlog_get_client_id(__be32 i)
                                           shutdown */
 #define XLOG_TAIL_WARN         0x10    /* log tail verify warning issued */
 
+typedef __uint32_t xlog_tid_t;
+
 #ifdef __KERNEL__
 /*
  * Below are states for covering allocation transactions.
index 5cc464a..04142ca 100644 (file)
@@ -205,6 +205,35 @@ xlog_bread(
 }
 
 /*
+ * Read at an offset into the buffer. Returns with the buffer in its original
+ * state regardless of the result of the read.
+ */
+STATIC int
+xlog_bread_offset(
+       xlog_t          *log,
+       xfs_daddr_t     blk_no,         /* block to read from */
+       int             nbblks,         /* blocks to read */
+       xfs_buf_t       *bp,
+       xfs_caddr_t     offset)
+{
+       xfs_caddr_t     orig_offset = XFS_BUF_PTR(bp);
+       int             orig_len = bp->b_buffer_length;
+       int             error, error2;
+
+       error = XFS_BUF_SET_PTR(bp, offset, BBTOB(nbblks));
+       if (error)
+               return error;
+
+       error = xlog_bread_noalign(log, blk_no, nbblks, bp);
+
+       /* must reset buffer pointer even on error */
+       error2 = XFS_BUF_SET_PTR(bp, orig_offset, orig_len);
+       if (error)
+               return error;
+       return error2;
+}
+
+/*
  * Write out the buffer at the given block for the given number of blocks.
  * The buffer is kept locked across the write and is returned locked.
  * This can only be used for synchronous log writes.
@@ -1229,20 +1258,12 @@ xlog_write_log_records(
                 */
                ealign = round_down(end_block, sectbb);
                if (j == 0 && (start_block + endcount > ealign)) {
-                       offset = XFS_BUF_PTR(bp);
-                       balign = BBTOB(ealign - start_block);
-                       error = XFS_BUF_SET_PTR(bp, offset + balign,
-                                               BBTOB(sectbb));
+                       offset = XFS_BUF_PTR(bp) + BBTOB(ealign - start_block);
+                       error = xlog_bread_offset(log, ealign, sectbb,
+                                                       bp, offset);
                        if (error)
                                break;
 
-                       error = xlog_bread_noalign(log, ealign, sectbb, bp);
-                       if (error)
-                               break;
-
-                       error = XFS_BUF_SET_PTR(bp, offset, bufblks);
-                       if (error)
-                               break;
                }
 
                offset = xlog_align(log, start_block, endcount, bp);
@@ -3448,19 +3469,9 @@ xlog_do_recovery_pass(
                                 *   - order is important.
                                 */
                                wrapped_hblks = hblks - split_hblks;
-                               error = XFS_BUF_SET_PTR(hbp,
-                                               offset + BBTOB(split_hblks),
-                                               BBTOB(hblks - split_hblks));
-                               if (error)
-                                       goto bread_err2;
-
-                               error = xlog_bread_noalign(log, 0,
-                                                          wrapped_hblks, hbp);
-                               if (error)
-                                       goto bread_err2;
-
-                               error = XFS_BUF_SET_PTR(hbp, offset,
-                                                       BBTOB(hblks));
+                               error = xlog_bread_offset(log, 0,
+                                               wrapped_hblks, hbp,
+                                               offset + BBTOB(split_hblks));
                                if (error)
                                        goto bread_err2;
                        }
@@ -3511,19 +3522,9 @@ xlog_do_recovery_pass(
                                 *   _first_, then the log start (LR header end)
                                 *   - order is important.
                                 */
-                               error = XFS_BUF_SET_PTR(dbp,
-                                               offset + BBTOB(split_bblks),
-                                               BBTOB(bblks - split_bblks));
-                               if (error)
-                                       goto bread_err2;
-
-                               error = xlog_bread_noalign(log, wrapped_hblks,
-                                               bblks - split_bblks,
-                                               dbp);
-                               if (error)
-                                       goto bread_err2;
-
-                               error = XFS_BUF_SET_PTR(dbp, offset, h_size);
+                               error = xlog_bread_offset(log, 0,
+                                               bblks - split_bblks, dbp,
+                                               offset + BBTOB(split_bblks));
                                if (error)
                                        goto bread_err2;
                        }
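
The xlog_bread_offset() helper added above factors out the
save/repoint/read/restore sequence the recovery paths used to open-code around
XFS_BUF_SET_PTR().  The sketch below shows the same pattern in plain userspace
C; read_blocks(), struct rbuf and read_at_offset() are stand-ins, and the real
helper also reports a failure from restoring the pointer when the read itself
succeeded.

/*
 * Temporarily point a buffer at a different region, do the read, and always
 * put the original pointer back, even when the read fails.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>

struct rbuf {
        char    *ptr;
        size_t  len;
};

static int read_blocks(struct rbuf *bp, unsigned blk_no, size_t nbytes)
{
        (void)blk_no;                   /* a real reader would seek here */
        if (nbytes > bp->len)
                return EIO;
        memset(bp->ptr, 0, nbytes);     /* pretend we filled the buffer */
        return 0;
}

static int read_at_offset(struct rbuf *bp, unsigned blk_no, size_t nbytes,
                          char *offset)
{
        char    *orig_ptr = bp->ptr;
        size_t  orig_len = bp->len;
        int     error;

        bp->ptr = offset;
        bp->len = nbytes;

        error = read_blocks(bp, blk_no, nbytes);

        /* must restore the buffer pointer even on error */
        bp->ptr = orig_ptr;
        bp->len = orig_len;
        return error;
}

int main(void)
{
        char            data[8192];
        struct rbuf     bp = { data, sizeof(data) };

        printf("read_at_offset -> %d\n",
               read_at_offset(&bp, 0, 512, data + 4096));
        return 0;
}
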
index bb3f9a7..b49b823 100644 (file)
@@ -1900,7 +1900,7 @@ xfs_mod_incore_sb_batch(
        uint                    nmsb,
        int                     rsvd)
 {
-       xfs_mod_sb_t            *msbp = &msb[0];
+       xfs_mod_sb_t            *msbp;
        int                     error = 0;
 
        /*
@@ -1910,7 +1910,7 @@ xfs_mod_incore_sb_batch(
         * changes will be atomic.
         */
        spin_lock(&mp->m_sb_lock);
-       for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) {
+       for (msbp = msb; msbp < (msb + nmsb); msbp++) {
                ASSERT(msbp->msb_field < XFS_SBS_ICOUNT ||
                       msbp->msb_field > XFS_SBS_FDBLOCKS);
 
index 7692279..d1f2485 100644 (file)
@@ -608,10 +608,8 @@ STATIC void
 xfs_trans_free(
        struct xfs_trans        *tp)
 {
-       struct xfs_busy_extent  *busyp, *n;
-
-       list_for_each_entry_safe(busyp, n, &tp->t_busy, list)
-               xfs_alloc_busy_clear(tp->t_mountp, busyp);
+       xfs_alloc_busy_sort(&tp->t_busy);
+       xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy);
 
        atomic_dec(&tp->t_mountp->m_active_trans);
        xfs_trans_free_dqinfo(tp);
index 26d1867..65584b5 100644 (file)
@@ -73,8 +73,6 @@ typedef       __int32_t       xfs_tid_t;      /* transaction identifier */
 typedef        __uint32_t      xfs_dablk_t;    /* dir/attr block number (in file) */
 typedef        __uint32_t      xfs_dahash_t;   /* dir/attr hash value */
 
-typedef __uint32_t     xlog_tid_t;     /* transaction ID type */
-
 /*
  * These types are 64 bits on disk but are either 32 or 64 bits in memory.
  * Disk based types: