block: make gendisk hold a reference to its queue
[linux-2.6.git] / fs / xfs / xfs_iomap.c
index 2edd676..091d82b 100644 (file)
 /*
- * Copyright (c) 2000-2004 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
  * published by the Free Software Foundation.
  *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
  *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like.         Any license provided herein, whether implied or
- * otherwise, applies only to this software file.  Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA  94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
-
 #include "xfs.h"
-
 #include "xfs_fs.h"
-#include "xfs_inum.h"
+#include "xfs_bit.h"
 #include "xfs_log.h"
+#include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
 #include "xfs_alloc.h"
-#include "xfs_dmapi.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
-#include "xfs_alloc_btree.h"
 #include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_btree.h"
 #include "xfs_bmap.h"
-#include "xfs_bit.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_itable.h"
 #include "xfs_rw.h"
-#include "xfs_acl.h"
-#include "xfs_cap.h"
-#include "xfs_mac.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_space.h"
 #include "xfs_utils.h"
 #include "xfs_iomap.h"
+#include "xfs_trace.h"
 
-#if defined(XFS_RW_TRACE)
-void
-xfs_iomap_enter_trace(
-       int             tag,
-       xfs_iocore_t    *io,
-       xfs_off_t       offset,
-       ssize_t         count)
-{
-       xfs_inode_t     *ip = XFS_IO_INODE(io);
-
-       if (!ip->i_rwtrace)
-               return;
-
-       ktrace_enter(ip->i_rwtrace,
-               (void *)((unsigned long)tag),
-               (void *)ip,
-               (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
-               (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
-               (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
-               (void *)((unsigned long)(offset & 0xffffffff)),
-               (void *)((unsigned long)count),
-               (void *)((unsigned long)((io->io_new_size >> 32) & 0xffffffff)),
-               (void *)((unsigned long)(io->io_new_size & 0xffffffff)),
-               (void *)NULL,
-               (void *)NULL,
-               (void *)NULL,
-               (void *)NULL,
-               (void *)NULL,
-               (void *)NULL,
-               (void *)NULL);
-}
-
-void
-xfs_iomap_map_trace(
-       int             tag,
-       xfs_iocore_t    *io,
-       xfs_off_t       offset,
-       ssize_t         count,
-       xfs_iomap_t     *iomapp,
-       xfs_bmbt_irec_t *imapp,
-       int             flags)
-{
-       xfs_inode_t     *ip = XFS_IO_INODE(io);
-
-       if (!ip->i_rwtrace)
-               return;
-
-       ktrace_enter(ip->i_rwtrace,
-               (void *)((unsigned long)tag),
-               (void *)ip,
-               (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
-               (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
-               (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
-               (void *)((unsigned long)(offset & 0xffffffff)),
-               (void *)((unsigned long)count),
-               (void *)((unsigned long)flags),
-               (void *)((unsigned long)((iomapp->iomap_offset >> 32) & 0xffffffff)),
-               (void *)((unsigned long)(iomapp->iomap_offset & 0xffffffff)),
-               (void *)((unsigned long)(iomapp->iomap_delta)),
-               (void *)((unsigned long)(iomapp->iomap_bsize)),
-               (void *)((unsigned long)(iomapp->iomap_bn)),
-               (void *)(__psint_t)(imapp->br_startoff),
-               (void *)((unsigned long)(imapp->br_blockcount)),
-               (void *)(__psint_t)(imapp->br_startblock));
-}
-#else
-#define xfs_iomap_enter_trace(tag, io, offset, count)
-#define xfs_iomap_map_trace(tag, io, offset, count, iomapp, imapp, flags)
-#endif
 
 #define XFS_WRITEIO_ALIGN(mp,off)      (((off) >> mp->m_writeio_log) \
                                                << mp->m_writeio_log)
-#define XFS_STRAT_WRITE_IMAPS  2
 #define XFS_WRITE_IMAPS                XFS_BMAP_MAX_NMAP
 
 STATIC int
-xfs_imap_to_bmap(
-       xfs_iocore_t    *io,
-       xfs_off_t       offset,
-       xfs_bmbt_irec_t *imap,
-       xfs_iomap_t     *iomapp,
-       int             imaps,                  /* Number of imap entries */
-       int             iomaps,                 /* Number of iomap entries */
-       int             flags)
-{
-       xfs_mount_t     *mp;
-       xfs_fsize_t     nisize;
-       int             pbm;
-       xfs_fsblock_t   start_block;
-
-       mp = io->io_mount;
-       nisize = XFS_SIZE(mp, io);
-       if (io->io_new_size > nisize)
-               nisize = io->io_new_size;
-
-       for (pbm = 0; imaps && pbm < iomaps; imaps--, iomapp++, imap++, pbm++) {
-               iomapp->iomap_offset = XFS_FSB_TO_B(mp, imap->br_startoff);
-               iomapp->iomap_delta = offset - iomapp->iomap_offset;
-               iomapp->iomap_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount);
-               iomapp->iomap_flags = flags;
-
-               if (io->io_flags & XFS_IOCORE_RT) {
-                       iomapp->iomap_flags |= IOMAP_REALTIME;
-                       iomapp->iomap_target = mp->m_rtdev_targp;
-               } else {
-                       iomapp->iomap_target = mp->m_ddev_targp;
-               }
-               start_block = imap->br_startblock;
-               if (start_block == HOLESTARTBLOCK) {
-                       iomapp->iomap_bn = IOMAP_DADDR_NULL;
-                       iomapp->iomap_flags |= IOMAP_HOLE;
-               } else if (start_block == DELAYSTARTBLOCK) {
-                       iomapp->iomap_bn = IOMAP_DADDR_NULL;
-                       iomapp->iomap_flags |= IOMAP_DELAY;
-               } else {
-                       iomapp->iomap_bn = XFS_FSB_TO_DB_IO(io, start_block);
-                       if (ISUNWRITTEN(imap))
-                               iomapp->iomap_flags |= IOMAP_UNWRITTEN;
-               }
-
-               if ((iomapp->iomap_offset + iomapp->iomap_bsize) >= nisize) {
-                       iomapp->iomap_flags |= IOMAP_EOF;
-               }
-
-               offset += iomapp->iomap_bsize - iomapp->iomap_delta;
-       }
-       return pbm;     /* Return the number filled */
-}
-
-int
-xfs_iomap(
-       xfs_iocore_t    *io,
-       xfs_off_t       offset,
-       ssize_t         count,
-       int             flags,
-       xfs_iomap_t     *iomapp,
-       int             *niomaps)
+xfs_iomap_eof_align_last_fsb(
+       xfs_mount_t     *mp,
+       xfs_inode_t     *ip,
+       xfs_extlen_t    extsize,
+       xfs_fileoff_t   *last_fsb)
 {
-       xfs_mount_t     *mp = io->io_mount;
-       xfs_fileoff_t   offset_fsb, end_fsb;
-       int             error = 0;
-       int             lockmode = 0;
-       xfs_bmbt_irec_t imap;
-       int             nimaps = 1;
-       int             bmapi_flags = 0;
-       int             iomap_flags = 0;
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
-
-       switch (flags &
-               (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE |
-                BMAPI_UNWRITTEN | BMAPI_DEVICE)) {
-       case BMAPI_READ:
-               xfs_iomap_enter_trace(XFS_IOMAP_READ_ENTER, io, offset, count);
-               lockmode = XFS_LCK_MAP_SHARED(mp, io);
-               bmapi_flags = XFS_BMAPI_ENTIRE;
-               if (flags & BMAPI_IGNSTATE)
-                       bmapi_flags |= XFS_BMAPI_IGSTATE;
-               break;
-       case BMAPI_WRITE:
-               xfs_iomap_enter_trace(XFS_IOMAP_WRITE_ENTER, io, offset, count);
-               lockmode = XFS_ILOCK_EXCL|XFS_EXTSIZE_WR;
-               bmapi_flags = 0;
-               XFS_ILOCK(mp, io, lockmode);
-               break;
-       case BMAPI_ALLOCATE:
-               xfs_iomap_enter_trace(XFS_IOMAP_ALLOC_ENTER, io, offset, count);
-               lockmode = XFS_ILOCK_SHARED|XFS_EXTSIZE_RD;
-               bmapi_flags = XFS_BMAPI_ENTIRE;
-               /* Attempt non-blocking lock */
-               if (flags & BMAPI_TRYLOCK) {
-                       if (!XFS_ILOCK_NOWAIT(mp, io, lockmode))
-                               return XFS_ERROR(EAGAIN);
-               } else {
-                       XFS_ILOCK(mp, io, lockmode);
-               }
-               break;
-       case BMAPI_UNWRITTEN:
-               goto phase2;
-       case BMAPI_DEVICE:
-               lockmode = XFS_LCK_MAP_SHARED(mp, io);
-               iomapp->iomap_target = io->io_flags & XFS_IOCORE_RT ?
-                       mp->m_rtdev_targp : mp->m_ddev_targp;
-               error = 0;
-               *niomaps = 1;
-               goto out;
-       default:
-               BUG();
-       }
-
-       ASSERT(offset <= mp->m_maxioffset);
-       if ((xfs_fsize_t)offset + count > mp->m_maxioffset)
-               count = mp->m_maxioffset - offset;
-       end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
-       offset_fsb = XFS_B_TO_FSBT(mp, offset);
-
-       error = XFS_BMAPI(mp, NULL, io, offset_fsb,
-                       (xfs_filblks_t)(end_fsb - offset_fsb),
-                       bmapi_flags,  NULL, 0, &imap,
-                       &nimaps, NULL);
+       xfs_fileoff_t   new_last_fsb = 0;
+       xfs_extlen_t    align;
+       int             eof, error;
 
-       if (error)
-               goto out;
-
-phase2:
-       switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE|BMAPI_UNWRITTEN)) {
-       case BMAPI_WRITE:
-               /* If we found an extent, return it */
-               if (nimaps &&
-                   (imap.br_startblock != HOLESTARTBLOCK) && 
-                   (imap.br_startblock != DELAYSTARTBLOCK)) {
-                       xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, io,
-                                       offset, count, iomapp, &imap, flags);
-                       break;
-               }
-
-               if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) {
-                       error = XFS_IOMAP_WRITE_DIRECT(mp, io, offset,
-                                       count, flags, &imap, &nimaps, nimaps);
-               } else {
-                       error = XFS_IOMAP_WRITE_DELAY(mp, io, offset, count,
-                                       flags, &imap, &nimaps);
-               }
-               if (!error) {
-                       xfs_iomap_map_trace(XFS_IOMAP_ALLOC_MAP, io,
-                                       offset, count, iomapp, &imap, flags);
-               }
-               iomap_flags = IOMAP_NEW;
-               break;
-       case BMAPI_ALLOCATE:
-               /* If we found an extent, return it */
-               XFS_IUNLOCK(mp, io, lockmode);
-               lockmode = 0;
-
-               if (nimaps && !ISNULLSTARTBLOCK(imap.br_startblock)) {
-                       xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, io,
-                                       offset, count, iomapp, &imap, flags);
-                       break;
-               }
+       if (XFS_IS_REALTIME_INODE(ip))
+               ;
+       /*
+        * If mounted with the "-o swalloc" option, round up the allocation
+        * request to a stripe width boundary if the file size is >=
+        * stripe width and we are allocating past the allocation eof.
+        */
+       else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) &&
+               (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_swidth)))
+               new_last_fsb = roundup_64(*last_fsb, mp->m_swidth);
+       /*
+        * Round up the allocation request to a stripe unit (m_dalign) boundary
+        * if the file size is >= stripe unit size, and we are allocating past
+        * the allocation eof.
+        */
+       else if (mp->m_dalign && (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_dalign)))
+               new_last_fsb = roundup_64(*last_fsb, mp->m_dalign);
 
-               error = XFS_IOMAP_WRITE_ALLOCATE(mp, io, offset, count,
-                                                &imap, &nimaps);
-               break;
-       case BMAPI_UNWRITTEN:
-               lockmode = 0;
-               error = XFS_IOMAP_WRITE_UNWRITTEN(mp, io, offset, count);
-               nimaps = 0;
-               break;
+       /*
+        * Always round up the allocation request to an extent boundary
+        * (when file on a real-time subvolume or has di_extsize hint).
+        */
+       if (extsize) {
+               if (new_last_fsb)
+                       align = roundup_64(new_last_fsb, extsize);
+               else
+                       align = extsize;
+               new_last_fsb = roundup_64(*last_fsb, align);
        }
 
-       if (nimaps) {
-               *niomaps = xfs_imap_to_bmap(io, offset, &imap,
-                                           iomapp, nimaps, *niomaps, iomap_flags);
-       } else if (niomaps) {
-               *niomaps = 0;
+       if (new_last_fsb) {
+               error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof);
+               if (error)
+                       return error;
+               if (eof)
+                       *last_fsb = new_last_fsb;
        }
-
-out:
-       if (lockmode)
-               XFS_IUNLOCK(mp, io, lockmode);
-       return XFS_ERROR(error);
+       return 0;
 }
 
 STATIC int
-xfs_flush_space(
+xfs_alert_fsblock_zero(
        xfs_inode_t     *ip,
-       int             *fsynced,
-       int             *ioflags)
+       xfs_bmbt_irec_t *imap)
 {
-       switch (*fsynced) {
-       case 0:
-               if (ip->i_delayed_blks) {
-                       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                       xfs_flush_inode(ip);
-                       xfs_ilock(ip, XFS_ILOCK_EXCL);
-                       *fsynced = 1;
-               } else {
-                       *ioflags |= BMAPI_SYNC;
-                       *fsynced = 2;
-               }
-               return 0;
-       case 1:
-               *fsynced = 2;
-               *ioflags |= BMAPI_SYNC;
-               return 0;
-       case 2:
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               xfs_flush_device(ip);
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-               *fsynced = 3;
-               return 0;
-       }
-       return 1;
+       xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
+                       "Access to block zero in inode %llu "
+                       "start_block: %llx start_off: %llx "
+                       "blkcnt: %llx extent-state: %x\n",
+               (unsigned long long)ip->i_ino,
+               (unsigned long long)imap->br_startblock,
+               (unsigned long long)imap->br_startoff,
+               (unsigned long long)imap->br_blockcount,
+               imap->br_state);
+       return EFSCORRUPTED;
 }
 
 int
@@ -370,78 +122,68 @@ xfs_iomap_write_direct(
        xfs_inode_t     *ip,
        xfs_off_t       offset,
        size_t          count,
-       int             flags,
-       xfs_bmbt_irec_t *ret_imap,
-       int             *nmaps,
-       int             found)
+       xfs_bmbt_irec_t *imap,
+       int             nmaps)
 {
        xfs_mount_t     *mp = ip->i_mount;
-       xfs_iocore_t    *io = &ip->i_iocore;
        xfs_fileoff_t   offset_fsb;
        xfs_fileoff_t   last_fsb;
-       xfs_filblks_t   count_fsb;
-       xfs_fsize_t     isize;
+       xfs_filblks_t   count_fsb, resaligned;
        xfs_fsblock_t   firstfsb;
-       int             nimaps, maps;
-       int             error;
+       xfs_extlen_t    extsz, temp;
+       int             nimaps;
        int             bmapi_flag;
        int             quota_flag;
        int             rt;
        xfs_trans_t     *tp;
-       xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS], *imapp;
        xfs_bmap_free_t free_list;
-       int             aeof;
-       xfs_filblks_t   datablocks, qblocks, resblks;
+       uint            qblocks, resblks, resrtextents;
        int             committed;
-       int             numrtextents;
+       int             error;
 
        /*
         * Make sure that the dquots are there. This doesn't hold
         * the ilock across a disk read.
         */
-       error = XFS_QM_DQATTACH(ip->i_mount, ip, XFS_QMOPT_ILOCKED);
+       error = xfs_qm_dqattach_locked(ip, 0);
        if (error)
                return XFS_ERROR(error);
 
-       maps = min(XFS_WRITE_IMAPS, *nmaps);
-       nimaps = maps;
-
-       isize = ip->i_d.di_size;
-       aeof = (offset + count) > isize;
-
-       if (io->io_new_size > isize)
-               isize = io->io_new_size;
+       rt = XFS_IS_REALTIME_INODE(ip);
+       extsz = xfs_get_extsz_hint(ip);
 
        offset_fsb = XFS_B_TO_FSBT(mp, offset);
        last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
+       if ((offset + count) > ip->i_size) {
+               error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
+               if (error)
+                       goto error_out;
+       } else {
+               if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
+                       last_fsb = MIN(last_fsb, (xfs_fileoff_t)
+                                       imap->br_blockcount +
+                                       imap->br_startoff);
+       }
        count_fsb = last_fsb - offset_fsb;
-       if (found && (ret_imap->br_startblock == HOLESTARTBLOCK)) {
-               xfs_fileoff_t   map_last_fsb;
-
-               map_last_fsb = ret_imap->br_blockcount + ret_imap->br_startoff;
-               if (map_last_fsb < last_fsb) {
-                       last_fsb = map_last_fsb;
-                       count_fsb = last_fsb - offset_fsb;
-               }
-               ASSERT(count_fsb > 0);
+       ASSERT(count_fsb > 0);
+
+       resaligned = count_fsb;
+       if (unlikely(extsz)) {
+               if ((temp = do_mod(offset_fsb, extsz)))
+                       resaligned += temp;
+               if ((temp = do_mod(resaligned, extsz)))
+                       resaligned += extsz - temp;
        }
 
-       /*
-        * Determine if reserving space on the data or realtime partition.
-        */
-       if ((rt = XFS_IS_REALTIME_INODE(ip))) {
-               xfs_extlen_t    extsz;
-
-               if (!(extsz = ip->i_d.di_extsize))
-                       extsz = mp->m_sb.sb_rextsize;
-               numrtextents = qblocks = (count_fsb + extsz - 1);
-               do_div(numrtextents, mp->m_sb.sb_rextsize);
+       if (unlikely(rt)) {
+               resrtextents = qblocks = resaligned;
+               resrtextents /= mp->m_sb.sb_rextsize;
+               resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
                quota_flag = XFS_QMOPT_RES_RTBLKS;
-               datablocks = 0;
        } else {
-               datablocks = qblocks = count_fsb;
+               resrtextents = 0;
+               resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
                quota_flag = XFS_QMOPT_RES_REGBLKS;
-               numrtextents = 0;
        }
 
        /*
@@ -449,12 +191,10 @@ xfs_iomap_write_direct(
         */
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-       resblks = XFS_DIOSTRAT_SPACE_RES(mp, datablocks);
        error = xfs_trans_reserve(tp, resblks,
-                       XFS_WRITE_LOG_RES(mp), numrtextents,
+                       XFS_WRITE_LOG_RES(mp), resrtextents,
                        XFS_TRANS_PERM_LOG_RES,
                        XFS_WRITE_LOG_COUNT);
-
        /*
         * Check for running out of space, note: need lock to return
         */
@@ -464,36 +204,36 @@ xfs_iomap_write_direct(
        if (error)
                goto error_out;
 
-       if (XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag)) {
-               error = (EDQUOT);
+       error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
+       if (error)
                goto error1;
-       }
 
-       bmapi_flag = XFS_BMAPI_WRITE;
-       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-       xfs_trans_ihold(tp, ip);
+       xfs_trans_ijoin(tp, ip);
 
-       if (!(flags & BMAPI_MMAP) && (offset < ip->i_d.di_size || rt))
+       bmapi_flag = XFS_BMAPI_WRITE;
+       if (offset < ip->i_size || extsz)
                bmapi_flag |= XFS_BMAPI_PREALLOC;
 
        /*
-        * Issue the bmapi() call to allocate the blocks
+        * Issue the xfs_bmapi() call to allocate the blocks.
+        *
+        * From this point onwards we overwrite the imap pointer that the
+        * caller gave to us.
         */
-       XFS_BMAP_INIT(&free_list, &firstfsb);
+       xfs_bmap_init(&free_list, &firstfsb);
        nimaps = 1;
-       imapp = &imap[0];
-       error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
-               bmapi_flag, &firstfsb, 0, imapp, &nimaps, &free_list);
+       error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag,
+               &firstfsb, 0, imap, &nimaps, &free_list);
        if (error)
                goto error0;
 
        /*
         * Complete the transaction
         */
-       error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
+       error = xfs_bmap_finish(&tp, &free_list, &committed);
        if (error)
                goto error0;
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
        if (error)
                goto error_out;
 
@@ -501,230 +241,232 @@ xfs_iomap_write_direct(
         * Copy any maps to caller's array and return any error.
         */
        if (nimaps == 0) {
-               error = (ENOSPC);
+               error = ENOSPC;
+               goto error_out;
+       }
+
+       if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) {
+               error = xfs_alert_fsblock_zero(ip, imap);
                goto error_out;
        }
 
-       *ret_imap = imap[0];
-       *nmaps = 1;
-       if ( !(io->io_flags & XFS_IOCORE_RT)  && !ret_imap->br_startblock) {
-                cmn_err(CE_PANIC,"Access to block zero:  fs <%s> inode: %lld "
-                        "start_block : %llx start_off : %llx blkcnt : %llx "
-                        "extent-state : %x \n",
-                        (ip->i_mount)->m_fsname,
-                        (long long)ip->i_ino,
-                        ret_imap->br_startblock, ret_imap->br_startoff,
-                        ret_imap->br_blockcount,ret_imap->br_state);
-        }
        return 0;
 
 error0:        /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
        xfs_bmap_cancel(&free_list);
-       XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag);
+       xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
 
 error1:        /* Just cancel transaction */
        xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
-       *nmaps = 0;     /* nothing set-up here */
 
 error_out:
        return XFS_ERROR(error);
 }
 
+/*
+ * If the caller is doing a write at the end of the file, then extend the
+ * allocation out to the file system's write iosize.  We clean up any extra
+ * space left over when the file is closed in xfs_inactive().
+ *
+ * If we find we already have delalloc preallocation beyond EOF, don't do more
+ * preallocation as it is not needed.
+ *
+ * The answer is returned through *prealloc, which is cleared on entry and
+ * set to 1 only when the write ends beyond ip->i_size and every extent
+ * returned by the scan below is a hole.  Any real (allocated) extent ends
+ * the scan at once with *prealloc left at 0; a delalloc extent lets the
+ * scan continue but also leaves *prealloc at 0.
+ *
+ * The caller's imap array (nimaps entries) is used as scratch space for the
+ * xfs_bmapi() lookups and is overwritten.
+ *
+ * Returns 0, or the error reported by xfs_bmapi().
+ */
+STATIC int
+xfs_iomap_eof_want_preallocate(
+       xfs_mount_t     *mp,
+       xfs_inode_t     *ip,
+       xfs_off_t       offset,
+       size_t          count,
+       xfs_bmbt_irec_t *imap,
+       int             nimaps,
+       int             *prealloc)
+{
+       xfs_fileoff_t   start_fsb;
+       xfs_filblks_t   count_fsb;
+       xfs_fsblock_t   firstblock;
+       int             n, error, imaps;
+       int             found_delalloc = 0;
+
+       *prealloc = 0;          /* default answer: do not preallocate */
+       if ((offset + count) <= ip->i_size)
+               return 0;       /* write ends at or before ip->i_size */
+
+       /*
+        * If there are any real blocks past eof, then don't
+        * do any speculative allocation.
+        *
+        * The scan starts at the block holding the last byte of the
+        * write and covers XFS_B_TO_FSB(XFS_MAXIOFFSET) blocks, fetching
+        * at most nimaps extent records per xfs_bmapi() call.
+        */
+       start_fsb = XFS_B_TO_FSBT(mp, ((xfs_ufsize_t)(offset + count - 1)));
+       count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
+       while (count_fsb > 0) {
+               imaps = nimaps; /* in: array capacity; read back below as the record count */
+               firstblock = NULLFSBLOCK;
+               error = xfs_bmapi(NULL, ip, start_fsb, count_fsb, 0,
+                                 &firstblock, 0, imap, &imaps, NULL);
+               if (error)
+                       return error;
+               for (n = 0; n < imaps; n++) {
+                       if ((imap[n].br_startblock != HOLESTARTBLOCK) &&
+                           (imap[n].br_startblock != DELAYSTARTBLOCK))
+                               return 0;       /* real extent: *prealloc stays 0 */
+                       start_fsb += imap[n].br_blockcount;     /* step past this extent */
+                       count_fsb -= imap[n].br_blockcount;
+
+                       if (imap[n].br_startblock == DELAYSTARTBLOCK)
+                               found_delalloc = 1;
+               }
+       }
+       if (!found_delalloc)
+               *prealloc = 1;  /* nothing but holes were found */
+       return 0;
+}
+
+/*
+ * If we don't have a user specified preallocation size, dynamically increase
+ * the preallocation size as the size of the file grows. Cap the maximum size
+ * at a single extent or less if the filesystem is near full. The closer the
+ * filesystem is to full, the smaller the maximum preallocation.
+ *
+ * Returns the preallocation size in filesystem blocks.  When the
+ * XFS_MOUNT_DFLT_IOSIZE mount flag is set the result is simply
+ * mp->m_writeio_blocks.  Otherwise it starts from the file size in blocks
+ * plus one, rounded down to a power of two and capped at MAXEXTLEN, and is
+ * then shifted right by 2 to 6 bits as sb_fdblocks falls below successive
+ * mp->m_low_space[] thresholds (5% down to 1%, going by the index names).
+ * The result is never smaller than mp->m_writeio_blocks.
+ */
+STATIC xfs_fsblock_t
+xfs_iomap_prealloc_size(
+       struct xfs_mount        *mp,
+       struct xfs_inode        *ip)
+{
+       xfs_fsblock_t           alloc_blocks = 0;
+
+       if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
+               int shift = 0;
+               int64_t freesp;
+
+               /*
+                * rounddown_pow_of_two() returns an undefined result
+                * if we pass in alloc_blocks = 0. Hence the "+ 1" to
+                * ensure we always pass in a non-zero value.
+                */
+               alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1;
+               alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
+                                       rounddown_pow_of_two(alloc_blocks));
+
+               xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
+               freesp = mp->m_sb.sb_fdblocks;  /* read after the counter sync above */
+               if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
+                       shift = 2;      /* first threshold crossed: divide by 4 */
+                       if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
+                               shift++;        /* each further threshold halves again */
+                       if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
+                               shift++;
+                       if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
+                               shift++;
+                       if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
+                               shift++;
+               }
+               if (shift)
+                       alloc_blocks >>= shift;
+       }
+
+       if (alloc_blocks < mp->m_writeio_blocks)
+               alloc_blocks = mp->m_writeio_blocks;    /* enforce the minimum size */
+
+       return alloc_blocks;
+}
+
 int
 xfs_iomap_write_delay(
        xfs_inode_t     *ip,
        xfs_off_t       offset,
        size_t          count,
-       int             ioflag,
-       xfs_bmbt_irec_t *ret_imap,
-       int             *nmaps)
+       xfs_bmbt_irec_t *ret_imap)
 {
        xfs_mount_t     *mp = ip->i_mount;
-       xfs_iocore_t    *io = &ip->i_iocore;
        xfs_fileoff_t   offset_fsb;
        xfs_fileoff_t   last_fsb;
-       xfs_fsize_t     isize;
+       xfs_off_t       aligned_offset;
+       xfs_fileoff_t   ioalign;
        xfs_fsblock_t   firstblock;
+       xfs_extlen_t    extsz;
        int             nimaps;
-       int             error;
        xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
-       int             aeof;
-       int             fsynced = 0;
+       int             prealloc, flushed = 0;
+       int             error;
 
-       ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0);
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
        /*
         * Make sure that the dquots are there. This doesn't hold
         * the ilock across a disk read.
         */
-
-       error = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED);
+       error = xfs_qm_dqattach_locked(ip, 0);
        if (error)
                return XFS_ERROR(error);
 
-retry:
-       isize = ip->i_d.di_size;
-       if (io->io_new_size > isize) {
-               isize = io->io_new_size;
-       }
-
-       aeof = 0;
+       extsz = xfs_get_extsz_hint(ip);
        offset_fsb = XFS_B_TO_FSBT(mp, offset);
-       last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
-       /*
-        * If the caller is doing a write at the end of the file,
-        * then extend the allocation (and the buffer used for the write)
-        * out to the file system's write iosize.  We clean up any extra
-        * space left over when the file is closed in xfs_inactive().
-        *
-        * For sync writes, we are flushing delayed allocate space to
-        * try to make additional space available for allocation near
-        * the filesystem full boundary - preallocation hurts in that
-        * situation, of course.
-        */
-       if (!(ioflag & BMAPI_SYNC) && ((offset + count) > ip->i_d.di_size)) {
-               xfs_off_t       aligned_offset;
-               xfs_filblks_t   count_fsb;
-               unsigned int    iosize;
-               xfs_fileoff_t   ioalign;
-               int             n;
-               xfs_fileoff_t   start_fsb;
 
-               /*
-                * If there are any real blocks past eof, then don't
-                * do any speculative allocation.
-                */
-               start_fsb = XFS_B_TO_FSBT(mp,
-                                       ((xfs_ufsize_t)(offset + count - 1)));
-               count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
-               while (count_fsb > 0) {
-                       nimaps = XFS_WRITE_IMAPS;
-                       error = XFS_BMAPI(mp, NULL, io, start_fsb, count_fsb,
-                                       0, &firstblock, 0, imap, &nimaps, NULL);
-                       if (error) {
-                               return error;
-                       }
-                       for (n = 0; n < nimaps; n++) {
-                               if ( !(io->io_flags & XFS_IOCORE_RT)  && 
-                                       !imap[n].br_startblock) {
-                                       cmn_err(CE_PANIC,"Access to block "
-                                               "zero:  fs <%s> inode: %lld "
-                                               "start_block : %llx start_off "
-                                               ": %llx blkcnt : %llx "
-                                               "extent-state : %x \n",
-                                               (ip->i_mount)->m_fsname,
-                                               (long long)ip->i_ino,
-                                               imap[n].br_startblock,
-                                               imap[n].br_startoff,
-                                               imap[n].br_blockcount,
-                                               imap[n].br_state);
-                               }
-                               if ((imap[n].br_startblock != HOLESTARTBLOCK) &&
-                                   (imap[n].br_startblock != DELAYSTARTBLOCK)) {
-                                       goto write_map;
-                               }
-                               start_fsb += imap[n].br_blockcount;
-                               count_fsb -= imap[n].br_blockcount;
-                       }
-               }
-               iosize = mp->m_writeio_blocks;
+
+       error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
+                               imap, XFS_WRITE_IMAPS, &prealloc);
+       if (error)
+               return error;
+
+retry:
+       if (prealloc) {
+               xfs_fsblock_t   alloc_blocks = xfs_iomap_prealloc_size(mp, ip);
+
                aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
                ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
-               last_fsb = ioalign + iosize;
-               aeof = 1;
+               last_fsb = ioalign + alloc_blocks;
+       } else {
+               last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
        }
-write_map:
-       nimaps = XFS_WRITE_IMAPS;
-       firstblock = NULLFSBLOCK;
 
-       /*
-        * If mounted with the "-o swalloc" option, roundup the allocation
-        * request to a stripe width boundary if the file size is >=
-        * stripe width and we are allocating past the allocation eof.
-        */
-       if (!(io->io_flags & XFS_IOCORE_RT) && mp->m_swidth 
-           && (mp->m_flags & XFS_MOUNT_SWALLOC)
-           && (isize >= XFS_FSB_TO_B(mp, mp->m_swidth)) && aeof) {
-               int eof;
-               xfs_fileoff_t new_last_fsb;
-
-               new_last_fsb = roundup_64(last_fsb, mp->m_swidth);
-               error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof);
-               if (error) {
-                       return error;
-               }
-               if (eof) {
-                       last_fsb = new_last_fsb;
-               }
-       /*
-        * Roundup the allocation request to a stripe unit (m_dalign) boundary
-        * if the file size is >= stripe unit size, and we are allocating past
-        * the allocation eof.
-        */
-       } else if (!(io->io_flags & XFS_IOCORE_RT) && mp->m_dalign &&
-                  (isize >= XFS_FSB_TO_B(mp, mp->m_dalign)) && aeof) {
-               int eof;
-               xfs_fileoff_t new_last_fsb;
-               new_last_fsb = roundup_64(last_fsb, mp->m_dalign);
-               error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof);
-               if (error) {
-                       return error;
-               }
-               if (eof) {
-                       last_fsb = new_last_fsb;
-               }
-       /*
-        * Round up the allocation request to a real-time extent boundary
-        * if the file is on the real-time subvolume.
-        */
-       } else if (io->io_flags & XFS_IOCORE_RT && aeof) {
-               int eof;
-               xfs_fileoff_t new_last_fsb;
-
-               new_last_fsb = roundup_64(last_fsb, mp->m_sb.sb_rextsize);
-               error = XFS_BMAP_EOF(mp, io, new_last_fsb, XFS_DATA_FORK, &eof);
-               if (error) {
+       if (prealloc || extsz) {
+               error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
+               if (error)
                        return error;
-               }
-               if (eof)
-                       last_fsb = new_last_fsb;
        }
+
+       nimaps = XFS_WRITE_IMAPS;
+       firstblock = NULLFSBLOCK;
        error = xfs_bmapi(NULL, ip, offset_fsb,
                          (xfs_filblks_t)(last_fsb - offset_fsb),
                          XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
                          XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
                          &nimaps, NULL);
-       /*
-        * This can be EDQUOT, if nimaps == 0
-        */
-       if (error && (error != ENOSPC)) {
+       switch (error) {
+       case 0:
+       case ENOSPC:
+       case EDQUOT:
+               break;
+       default:
                return XFS_ERROR(error);
        }
+
        /*
-        * If bmapi returned us nothing, and if we didn't get back EDQUOT,
-        * then we must have run out of space.
+        * If bmapi returned us nothing, we got either ENOSPC or EDQUOT.  For
+        * ENOSPC, flush all other inodes with delalloc blocks to free up
+        * some of the excess reserved metadata space. For both cases, retry
+        * without EOF preallocation.
         */
        if (nimaps == 0) {
-               xfs_iomap_enter_trace(XFS_IOMAP_WRITE_NOSPACE,
-                                       io, offset, count);
-               if (xfs_flush_space(ip, &fsynced, &ioflag))
-                       return XFS_ERROR(ENOSPC);
+               trace_xfs_delalloc_enospc(ip, offset, count);
+               if (flushed)
+                       return XFS_ERROR(error ? error : ENOSPC);
 
+               if (error == ENOSPC) {
+                       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+                       xfs_flush_inodes(ip);
+                       xfs_ilock(ip, XFS_ILOCK_EXCL);
+               }
+
+               flushed = 1;
                error = 0;
+               prealloc = 0;
                goto retry;
        }
 
+       if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
+               return xfs_alert_fsblock_zero(ip, &imap[0]);
+
        *ret_imap = imap[0];
-       *nmaps = 1;
-       if ( !(io->io_flags & XFS_IOCORE_RT)  && !ret_imap->br_startblock) {
-               cmn_err(CE_PANIC,"Access to block zero:  fs <%s> inode: %lld "
-                        "start_block : %llx start_off : %llx blkcnt : %llx "
-                        "extent-state : %x \n",
-                        (ip->i_mount)->m_fsname,
-                        (long long)ip->i_ino,
-                        ret_imap->br_startblock, ret_imap->br_startoff,
-                        ret_imap->br_blockcount,ret_imap->br_state);
-       }
        return 0;
 }
 
@@ -734,39 +476,38 @@ write_map:
  * the originating callers request.
  *
  * Called without a lock on the inode.
+ *
+ * We no longer bother to look at the incoming map - all we have to
+ * guarantee is that whatever we allocate fills the required range.
  */
 int
 xfs_iomap_write_allocate(
        xfs_inode_t     *ip,
        xfs_off_t       offset,
        size_t          count,
-       xfs_bmbt_irec_t *map,
-       int             *retmap)
+       xfs_bmbt_irec_t *imap)
 {
        xfs_mount_t     *mp = ip->i_mount;
-       xfs_iocore_t    *io = &ip->i_iocore;
        xfs_fileoff_t   offset_fsb, last_block;
        xfs_fileoff_t   end_fsb, map_start_fsb;
        xfs_fsblock_t   first_block;
        xfs_bmap_free_t free_list;
        xfs_filblks_t   count_fsb;
-       xfs_bmbt_irec_t imap[XFS_STRAT_WRITE_IMAPS];
        xfs_trans_t     *tp;
-       int             i, nimaps, committed;
+       int             nimaps, committed;
        int             error = 0;
        int             nres;
 
-       *retmap = 0;
-
        /*
         * Make sure that the dquots are there.
         */
-       if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
+       error = xfs_qm_dqattach(ip, 0);
+       if (error)
                return XFS_ERROR(error);
 
        offset_fsb = XFS_B_TO_FSBT(mp, offset);
-       count_fsb = map->br_blockcount;
-       map_start_fsb = map->br_startoff;
+       count_fsb = imap->br_blockcount;
+       map_start_fsb = imap->br_startoff;
 
        XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
 
@@ -783,38 +524,59 @@ xfs_iomap_write_allocate(
                nimaps = 0;
                while (nimaps == 0) {
                        tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
+                       tp->t_flags |= XFS_TRANS_RESERVE;
                        nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
                        error = xfs_trans_reserve(tp, nres,
                                        XFS_WRITE_LOG_RES(mp),
                                        0, XFS_TRANS_PERM_LOG_RES,
                                        XFS_WRITE_LOG_COUNT);
-                       if (error == ENOSPC) {
-                               error = xfs_trans_reserve(tp, 0,
-                                               XFS_WRITE_LOG_RES(mp),
-                                               0,
-                                               XFS_TRANS_PERM_LOG_RES,
-                                               XFS_WRITE_LOG_COUNT);
-                       }
                        if (error) {
                                xfs_trans_cancel(tp, 0);
                                return XFS_ERROR(error);
                        }
                        xfs_ilock(ip, XFS_ILOCK_EXCL);
-                       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-                       xfs_trans_ihold(tp, ip);
+                       xfs_trans_ijoin(tp, ip);
 
-                       XFS_BMAP_INIT(&free_list, &first_block);
+                       xfs_bmap_init(&free_list, &first_block);
 
-                       nimaps = XFS_STRAT_WRITE_IMAPS;
                        /*
-                        * Ensure we don't go beyond eof - it is possible
-                        * the extents changed since we did the read call,
-                        * we dropped the ilock in the interim.
+                        * it is possible that the extents have changed since
+                        * we did the read call as we dropped the ilock for a
+                        * while. We have to be careful about truncates or hole
+                        * punches here - we are not allowed to allocate
+                        * non-delalloc blocks here.
+                        *
+                        * The only protection against truncation is the pages
+                        * for the range we are being asked to convert are
+                        * locked and hence a truncate will block on them
+                        * first.
+                        *
+                        * As a result, if we go beyond the range we really
+                        * need and hit a delalloc extent boundary followed by
+                        * a hole while we have excess blocks in the map, we
+                        * will fill the hole incorrectly and overrun the
+                        * transaction reservation.
+                        *
+                        * Using a single map prevents this as we are forced to
+                        * check each map we look up for overlap with the desired
+                        * range and abort as soon as we find it. Also, given
+                        * that we only return a single map, having one beyond
+                        * what we can return is probably a bit silly.
+                        *
+                        * We also need to check that we don't go beyond EOF;
+                        * this is a truncate optimisation as a truncate sets
+                        * the new file size before blocking on the pages we
+                        * currently have locked under writeback. Because they
+                        * are about to be tossed, we don't need to write them
+                        * back....
                         */
+                       nimaps = 1;
+                       end_fsb = XFS_B_TO_FSB(mp, ip->i_size);
+                       error = xfs_bmap_last_offset(NULL, ip, &last_block,
+                                                       XFS_DATA_FORK);
+                       if (error)
+                               goto trans_cancel;
 
-                       end_fsb = XFS_B_TO_FSB(mp, ip->i_d.di_size);
-                       xfs_bmap_last_offset(NULL, ip, &last_block,
-                               XFS_DATA_FORK);
                        last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
                        if ((map_start_fsb + count_fsb) > last_block) {
                                count_fsb = last_block - map_start_fsb;
@@ -824,20 +586,23 @@ xfs_iomap_write_allocate(
                                }
                        }
 
-                       /* Go get the actual blocks */
+                       /*
+                        * Go get the actual blocks.
+                        *
+                        * From this point onwards we overwrite the imap
+                        * pointer that the caller gave to us.
+                        */
                        error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb,
                                        XFS_BMAPI_WRITE, &first_block, 1,
                                        imap, &nimaps, &free_list);
                        if (error)
                                goto trans_cancel;
 
-                       error = xfs_bmap_finish(&tp, &free_list,
-                                       first_block, &committed);
+                       error = xfs_bmap_finish(&tp, &free_list, &committed);
                        if (error)
                                goto trans_cancel;
 
-                       error = xfs_trans_commit(tp,
-                                       XFS_TRANS_RELEASE_LOG_RES, NULL);
+                       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
                        if (error)
                                goto error0;
 
@@ -848,37 +613,22 @@ xfs_iomap_write_allocate(
                 * See if we were able to allocate an extent that
                 * covers at least part of the callers request
                 */
-
-               for (i = 0; i < nimaps; i++) {
-                       if ( !(io->io_flags & XFS_IOCORE_RT)  && 
-                               !imap[i].br_startblock) {
-                               cmn_err(CE_PANIC,"Access to block zero:  "
-                                       "fs <%s> inode: %lld "
-                                       "start_block : %llx start_off : %llx " 
-                                       "blkcnt : %llx extent-state : %x \n",
-                                       (ip->i_mount)->m_fsname,
-                                       (long long)ip->i_ino,
-                                       imap[i].br_startblock,
-                                       imap[i].br_startoff,
-                                       imap[i].br_blockcount,imap[i].br_state);
-                        }
-                       if ((offset_fsb >= imap[i].br_startoff) &&
-                           (offset_fsb < (imap[i].br_startoff +
-                                          imap[i].br_blockcount))) {
-                               *map = imap[i];
-                               *retmap = 1;
-                               XFS_STATS_INC(xs_xstrat_quick);
-                               return 0;
-                       }
-                       count_fsb -= imap[i].br_blockcount;
+               if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
+                       return xfs_alert_fsblock_zero(ip, imap);
+
+               if ((offset_fsb >= imap->br_startoff) &&
+                   (offset_fsb < (imap->br_startoff +
+                                  imap->br_blockcount))) {
+                       XFS_STATS_INC(xs_xstrat_quick);
+                       return 0;
                }
 
-               /* So far we have not mapped the requested part of the
+               /*
+                * So far we have not mapped the requested part of the
                 * file, just surrounding data, try again.
                 */
-               nimaps--;
-               map_start_fsb = imap[nimaps].br_startoff +
-                               imap[nimaps].br_blockcount;
+               count_fsb -= imap->br_blockcount;
+               map_start_fsb = imap->br_startoff + imap->br_blockcount;
        }
 
 trans_cancel:
@@ -896,79 +646,85 @@ xfs_iomap_write_unwritten(
        size_t          count)
 {
        xfs_mount_t     *mp = ip->i_mount;
-       xfs_iocore_t    *io = &ip->i_iocore;
-       xfs_trans_t     *tp;
        xfs_fileoff_t   offset_fsb;
        xfs_filblks_t   count_fsb;
        xfs_filblks_t   numblks_fsb;
-       xfs_bmbt_irec_t imap;
+       xfs_fsblock_t   firstfsb;
+       int             nimaps;
+       xfs_trans_t     *tp;
+       xfs_bmbt_irec_t imap;
+       xfs_bmap_free_t free_list;
+       uint            resblks;
        int             committed;
        int             error;
-       int             nres;
-       int             nimaps;
-       xfs_fsblock_t   firstfsb;
-       xfs_bmap_free_t free_list;
 
-       xfs_iomap_enter_trace(XFS_IOMAP_UNWRITTEN,
-                               &ip->i_iocore, offset, count);
+       trace_xfs_unwritten_convert(ip, offset, count);
 
        offset_fsb = XFS_B_TO_FSBT(mp, offset);
        count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
        count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb);
 
-       do {
-               nres = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+       /*
+        * Reserve enough blocks in this transaction for two complete extent
+        * btree splits.  We may be converting the middle part of an unwritten
+        * extent and in this case we will insert two new extents in the btree
+        * each of which could cause a full split.
+        *
+        * This reservation amount will be used in the first call to
+        * xfs_bmbt_split() to select an AG with enough space to satisfy the
+        * rest of the operation.
+        */
+       resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
 
+       do {
                /*
                 * set up a transaction to convert the range of extents
                 * from unwritten to real. Do allocations in a loop until
                 * we have covered the range passed in.
+                *
+                * Note that we open code the transaction allocation here
+                * to pass KM_NOFS--we can't risk recursing back into
+                * the filesystem here as we might be asked to write out
+                * the same inode that we complete here and might deadlock
+                * on the iolock.
                 */
-
-               tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
-               error = xfs_trans_reserve(tp, nres,
+               xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);
+               tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
+               tp->t_flags |= XFS_TRANS_RESERVE;
+               error = xfs_trans_reserve(tp, resblks,
                                XFS_WRITE_LOG_RES(mp), 0,
                                XFS_TRANS_PERM_LOG_RES,
                                XFS_WRITE_LOG_COUNT);
                if (error) {
                        xfs_trans_cancel(tp, 0);
-                       goto error0;
+                       return XFS_ERROR(error);
                }
 
                xfs_ilock(ip, XFS_ILOCK_EXCL);
-               xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-               xfs_trans_ihold(tp, ip);
+               xfs_trans_ijoin(tp, ip);
 
                /*
                 * Modify the unwritten extent state of the buffer.
                 */
-               XFS_BMAP_INIT(&free_list, &firstfsb);
+               xfs_bmap_init(&free_list, &firstfsb);
                nimaps = 1;
                error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
-                                 XFS_BMAPI_WRITE, &firstfsb,
+                                 XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb,
                                  1, &imap, &nimaps, &free_list);
                if (error)
                        goto error_on_bmapi_transaction;
 
-               error = xfs_bmap_finish(&(tp), &(free_list),
-                               firstfsb, &committed);
+               error = xfs_bmap_finish(&(tp), &(free_list), &committed);
                if (error)
                        goto error_on_bmapi_transaction;
 
-               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                if (error)
-                       goto error0;
-               
-               if ( !(io->io_flags & XFS_IOCORE_RT)  && !imap.br_startblock) {
-                       cmn_err(CE_PANIC,"Access to block zero:  fs <%s> "
-                               "inode: %lld start_block : %llx start_off : "
-                               "%llx blkcnt : %llx extent-state : %x \n",
-                               (ip->i_mount)->m_fsname,
-                               (long long)ip->i_ino,
-                               imap.br_startblock,imap.br_startoff,
-                               imap.br_blockcount,imap.br_state);
-               }
+                       return XFS_ERROR(error);
+
+               if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip)))
+                       return xfs_alert_fsblock_zero(ip, &imap);
 
                if ((numblks_fsb = imap.br_blockcount) == 0) {
                        /*
@@ -988,6 +744,5 @@ error_on_bmapi_transaction:
        xfs_bmap_cancel(&free_list);
        xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT));
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-error0:
        return XFS_ERROR(error);
 }