Merge branch 'next' into upstream-merge
[linux-2.6.git] / fs / ext4 / inode.c
index a0ab375..2d6c6c8 100644 (file)
@@ -60,6 +60,12 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
 }
 
 static void ext4_invalidatepage(struct page *page, unsigned long offset);
+static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
+                                  struct buffer_head *bh_result, int create);
+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
+static int __ext4_journalled_writepage(struct page *page, unsigned int len);
+static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
 
 /*
  * Test whether an inode is a fast symlink.
@@ -167,11 +173,16 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
 /*
  * Called at the last iput() if i_nlink is zero.
  */
-void ext4_delete_inode(struct inode *inode)
+void ext4_evict_inode(struct inode *inode)
 {
        handle_t *handle;
        int err;
 
+       if (inode->i_nlink) {
+               truncate_inode_pages(&inode->i_data, 0);
+               goto no_delete;
+       }
+
        if (!is_bad_inode(inode))
                dquot_initialize(inode);
 
@@ -246,13 +257,13 @@ void ext4_delete_inode(struct inode *inode)
         */
        if (ext4_mark_inode_dirty(handle, inode))
                /* If that failed, just do the required in-core inode clear. */
-               clear_inode(inode);
+               ext4_clear_inode(inode);
        else
                ext4_free_inode(handle, inode);
        ext4_journal_stop(handle);
        return;
 no_delete:
-       clear_inode(inode);     /* We must guarantee clearing of inode... */
+       ext4_clear_inode(inode);        /* We must guarantee clearing of inode... */
 }
 
 typedef struct {
@@ -750,6 +761,11 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
                 * parent to disk.
                 */
                bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+               if (unlikely(!bh)) {
+                       err = -EIO;
+                       goto failed;
+               }
+
                branch[n].bh = bh;
                lock_buffer(bh);
                BUFFER_TRACE(bh, "call get_create_access");
@@ -1202,8 +1218,10 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
                                break;
                        idx++;
                        num++;
-                       if (num >= max_pages)
+                       if (num >= max_pages) {
+                               done = 1;
                                break;
+                       }
                }
                pagevec_release(&pvec);
        }
@@ -1533,10 +1551,10 @@ static int do_journal_get_write_access(handle_t *handle,
        if (!buffer_mapped(bh) || buffer_freed(bh))
                return 0;
        /*
-        * __block_prepare_write() could have dirtied some buffers. Clean
+        * __block_write_begin() could have dirtied some buffers. Clean
         * the dirty bit as jbd2_journal_get_write_access() could complain
         * otherwise about fs integrity issues. Setting of the dirty bit
-        * by __block_prepare_write() isn't a real problem here as we clear
+        * by __block_write_begin() isn't a real problem here as we clear
         * the bit before releasing a page lock and thus writeback cannot
         * ever write the buffer.
         */
@@ -1602,11 +1620,9 @@ retry:
        *pagep = page;
 
        if (ext4_should_dioread_nolock(inode))
-               ret = block_write_begin(file, mapping, pos, len, flags, pagep,
-                               fsdata, ext4_get_block_write);
+               ret = __block_write_begin(page, pos, len, ext4_get_block_write);
        else
-               ret = block_write_begin(file, mapping, pos, len, flags, pagep,
-                               fsdata, ext4_get_block);
+               ret = __block_write_begin(page, pos, len, ext4_get_block);
 
        if (!ret && ext4_should_journal_data(inode)) {
                ret = walk_page_buffers(handle, page_buffers(page),
@@ -1617,7 +1633,7 @@ retry:
                unlock_page(page);
                page_cache_release(page);
                /*
-                * block_write_begin may have instantiated a few blocks
+                * __block_write_begin may have instantiated a few blocks
                 * outside i_size.  Trim these off again. Don't need
                 * i_size_read because we hold i_mutex.
                 *
@@ -1992,16 +2008,23 @@ static void ext4_da_page_release_reservation(struct page *page,
  *
  * As pages are already locked by write_cache_pages(), we can't use it
  */
-static int mpage_da_submit_io(struct mpage_da_data *mpd)
+static int mpage_da_submit_io(struct mpage_da_data *mpd,
+                             struct ext4_map_blocks *map)
 {
-       long pages_skipped;
        struct pagevec pvec;
        unsigned long index, end;
        int ret = 0, err, nr_pages, i;
        struct inode *inode = mpd->inode;
        struct address_space *mapping = inode->i_mapping;
+       loff_t size = i_size_read(inode);
+       unsigned int len, block_start;
+       struct buffer_head *bh, *page_bufs = NULL;
+       int journal_data = ext4_should_journal_data(inode);
+       sector_t pblock = 0, cur_logical = 0;
+       struct ext4_io_submit io_submit;
 
        BUG_ON(mpd->next_page <= mpd->first_page);
+       memset(&io_submit, 0, sizeof(io_submit));
        /*
         * We need to start from the first_page to the next_page - 1
         * to make sure we also write the mapped dirty buffer_heads.
@@ -2017,122 +2040,108 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
                if (nr_pages == 0)
                        break;
                for (i = 0; i < nr_pages; i++) {
+                       int commit_write = 0, redirty_page = 0;
                        struct page *page = pvec.pages[i];
 
                        index = page->index;
                        if (index > end)
                                break;
+
+                       if (index == size >> PAGE_CACHE_SHIFT)
+                               len = size & ~PAGE_CACHE_MASK;
+                       else
+                               len = PAGE_CACHE_SIZE;
+                       if (map) {
+                               cur_logical = index << (PAGE_CACHE_SHIFT -
+                                                       inode->i_blkbits);
+                               pblock = map->m_pblk + (cur_logical -
+                                                       map->m_lblk);
+                       }
                        index++;
 
                        BUG_ON(!PageLocked(page));
                        BUG_ON(PageWriteback(page));
 
-                       pages_skipped = mpd->wbc->pages_skipped;
-                       err = mapping->a_ops->writepage(page, mpd->wbc);
-                       if (!err && (pages_skipped == mpd->wbc->pages_skipped))
-                               /*
-                                * have successfully written the page
-                                * without skipping the same
-                                */
-                               mpd->pages_written++;
                        /*
-                        * In error case, we have to continue because
-                        * remaining pages are still locked
-                        * XXX: unlock and re-dirty them?
+                        * If the page does not have buffers (for
+                        * whatever reason), try to create them using
+                        * __block_write_begin.  If this fails,
+                        * redirty the page and move on.
                         */
-                       if (ret == 0)
-                               ret = err;
-               }
-               pagevec_release(&pvec);
-       }
-       return ret;
-}
-
-/*
- * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
- *
- * the function goes through all passed space and put actual disk
- * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
- */
-static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
-                                struct ext4_map_blocks *map)
-{
-       struct inode *inode = mpd->inode;
-       struct address_space *mapping = inode->i_mapping;
-       int blocks = map->m_len;
-       sector_t pblock = map->m_pblk, cur_logical;
-       struct buffer_head *head, *bh;
-       pgoff_t index, end;
-       struct pagevec pvec;
-       int nr_pages, i;
-
-       index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-       end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-       cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-
-       pagevec_init(&pvec, 0);
-
-       while (index <= end) {
-               /* XXX: optimize tail */
-               nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
-               if (nr_pages == 0)
-                       break;
-               for (i = 0; i < nr_pages; i++) {
-                       struct page *page = pvec.pages[i];
-
-                       index = page->index;
-                       if (index > end)
-                               break;
-                       index++;
-
-                       BUG_ON(!PageLocked(page));
-                       BUG_ON(PageWriteback(page));
-                       BUG_ON(!page_has_buffers(page));
-
-                       bh = page_buffers(page);
-                       head = bh;
-
-                       /* skip blocks out of the range */
-                       do {
-                               if (cur_logical >= map->m_lblk)
-                                       break;
-                               cur_logical++;
-                       } while ((bh = bh->b_this_page) != head);
+                       if (!page_has_buffers(page)) {
+                               if (__block_write_begin(page, 0, len,
+                                               noalloc_get_block_write)) {
+                               redirty_page:
+                                       redirty_page_for_writepage(mpd->wbc,
+                                                                  page);
+                                       unlock_page(page);
+                                       continue;
+                               }
+                               commit_write = 1;
+                       }
 
+                       bh = page_bufs = page_buffers(page);
+                       block_start = 0;
                        do {
-                               if (cur_logical >= map->m_lblk + blocks)
-                                       break;
-
-                               if (buffer_delay(bh) || buffer_unwritten(bh)) {
-
-                                       BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
-
+                               if (!bh)
+                                       goto redirty_page;
+                               if (map && (cur_logical >= map->m_lblk) &&
+                                   (cur_logical <= (map->m_lblk +
+                                                    (map->m_len - 1)))) {
                                        if (buffer_delay(bh)) {
                                                clear_buffer_delay(bh);
                                                bh->b_blocknr = pblock;
-                                       } else {
-                                               /*
-                                                * unwritten already should have
-                                                * blocknr assigned. Verify that
-                                                */
-                                               clear_buffer_unwritten(bh);
-                                               BUG_ON(bh->b_blocknr != pblock);
                                        }
+                                       if (buffer_unwritten(bh) ||
+                                           buffer_mapped(bh))
+                                               BUG_ON(bh->b_blocknr != pblock);
+                                       if (map->m_flags & EXT4_MAP_UNINIT)
+                                               set_buffer_uninit(bh);
+                                       clear_buffer_unwritten(bh);
+                               }
 
-                               } else if (buffer_mapped(bh))
-                                       BUG_ON(bh->b_blocknr != pblock);
-
-                               if (map->m_flags & EXT4_MAP_UNINIT)
-                                       set_buffer_uninit(bh);
+                               /* redirty page if block allocation undone */
+                               if (buffer_delay(bh) || buffer_unwritten(bh))
+                                       redirty_page = 1;
+                               bh = bh->b_this_page;
+                               block_start += bh->b_size;
                                cur_logical++;
                                pblock++;
-                       } while ((bh = bh->b_this_page) != head);
+                       } while (bh != page_bufs);
+
+                       if (redirty_page)
+                               goto redirty_page;
+
+                       if (commit_write)
+                               /* mark the buffer_heads as dirty & uptodate */
+                               block_commit_write(page, 0, len);
+
+                       /*
+                        * Delalloc doesn't support data journalling,
+                        * but eventually maybe we'll lift this
+                        * restriction.
+                        */
+                       if (unlikely(journal_data && PageChecked(page)))
+                               err = __ext4_journalled_writepage(page, len);
+                       else
+                               err = ext4_bio_write_page(&io_submit, page,
+                                                         len, mpd->wbc);
+
+                       if (!err)
+                               mpd->pages_written++;
+                       /*
+                        * In error case, we have to continue because
+                        * remaining pages are still locked
+                        */
+                       if (ret == 0)
+                               ret = err;
                }
                pagevec_release(&pvec);
        }
+       ext4_io_submit(&io_submit);
+       return ret;
 }
 
-
 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
                                        sector_t logical, long blk_cnt)
 {
@@ -2184,35 +2193,32 @@ static void ext4_print_free_blocks(struct inode *inode)
 }
 
 /*
- * mpage_da_map_blocks - go through given space
+ * mpage_da_map_and_submit - go through given space, map them
+ *       if necessary, and then submit them for I/O
  *
  * @mpd - bh describing space
  *
  * The function skips space we know is already mapped to disk blocks.
  *
  */
-static int mpage_da_map_blocks(struct mpage_da_data *mpd)
+static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
 {
        int err, blks, get_blocks_flags;
-       struct ext4_map_blocks map;
+       struct ext4_map_blocks map, *mapp = NULL;
        sector_t next = mpd->b_blocknr;
        unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
        loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
        handle_t *handle = NULL;
 
        /*
-        * We consider only non-mapped and non-allocated blocks
-        */
-       if ((mpd->b_state  & (1 << BH_Mapped)) &&
-               !(mpd->b_state & (1 << BH_Delay)) &&
-               !(mpd->b_state & (1 << BH_Unwritten)))
-               return 0;
-
-       /*
-        * If we didn't accumulate anything to write simply return
+        * If the blocks are mapped already, or we couldn't accumulate
+        * any blocks, then proceed immediately to the submission stage.
         */
-       if (!mpd->b_size)
-               return 0;
+       if ((mpd->b_size == 0) ||
+           ((mpd->b_state  & (1 << BH_Mapped)) &&
+            !(mpd->b_state & (1 << BH_Delay)) &&
+            !(mpd->b_state & (1 << BH_Unwritten))))
+               goto submit_io;
 
        handle = ext4_journal_current_handle();
        BUG_ON(!handle);
@@ -2249,17 +2255,18 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 
                err = blks;
                /*
-                * If get block returns with error we simply
-                * return. Later writepage will redirty the page and
-                * writepages will find the dirty page again
+                * If get block returns EAGAIN or ENOSPC and there
+                * appear to be free blocks, we will call
+                * ext4_writepage() for all of the pages, which will
+                * just redirty the pages.
                 */
                if (err == -EAGAIN)
-                       return 0;
+                       goto submit_io;
 
                if (err == -ENOSPC &&
                    ext4_count_free_blocks(sb)) {
                        mpd->retval = err;
-                       return 0;
+                       goto submit_io;
                }
 
                /*
@@ -2284,10 +2291,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
                /* invalidate all the pages */
                ext4_da_block_invalidatepages(mpd, next,
                                mpd->b_size >> mpd->inode->i_blkbits);
-               return err;
+               return;
        }
        BUG_ON(blks == 0);
 
+       mapp = &map;
        if (map.m_flags & EXT4_MAP_NEW) {
                struct block_device *bdev = mpd->inode->i_sb->s_bdev;
                int i;
@@ -2296,18 +2304,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
                        unmap_underlying_metadata(bdev, map.m_pblk + i);
        }
 
-       /*
-        * If blocks are delayed marked, we need to
-        * put actual blocknr and drop delayed bit
-        */
-       if ((mpd->b_state & (1 << BH_Delay)) ||
-           (mpd->b_state & (1 << BH_Unwritten)))
-               mpage_put_bnr_to_bhs(mpd, &map);
-
        if (ext4_should_order_data(mpd->inode)) {
                err = ext4_jbd2_file_inode(handle, mpd->inode);
                if (err)
-                       return err;
+                       /* This only happens if the journal is aborted */
+                       return;
        }
 
        /*
@@ -2318,10 +2319,16 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
                disksize = i_size_read(mpd->inode);
        if (disksize > EXT4_I(mpd->inode)->i_disksize) {
                ext4_update_i_disksize(mpd->inode, disksize);
-               return ext4_mark_inode_dirty(handle, mpd->inode);
+               err = ext4_mark_inode_dirty(handle, mpd->inode);
+               if (err)
+                       ext4_error(mpd->inode->i_sb,
+                                  "Failed to mark inode %lu dirty",
+                                  mpd->inode->i_ino);
        }
 
-       return 0;
+submit_io:
+       mpage_da_submit_io(mpd, mapp);
+       mpd->io_done = 1;
 }
 
 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
@@ -2398,9 +2405,7 @@ flush_it:
         * We couldn't merge the block to our extent, so we
         * need to flush current  extent and start new one
         */
-       if (mpage_da_map_blocks(mpd) == 0)
-               mpage_da_submit_io(mpd);
-       mpd->io_done = 1;
+       mpage_da_map_and_submit(mpd);
        return;
 }
 
@@ -2419,9 +2424,9 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
  * The function finds extents of pages and scan them for all blocks.
  */
 static int __mpage_da_writepage(struct page *page,
-                               struct writeback_control *wbc, void *data)
+                               struct writeback_control *wbc,
+                               struct mpage_da_data *mpd)
 {
-       struct mpage_da_data *mpd = data;
        struct inode *inode = mpd->inode;
        struct buffer_head *bh, *head;
        sector_t logical;
@@ -2432,15 +2437,13 @@ static int __mpage_da_writepage(struct page *page,
        if (mpd->next_page != page->index) {
                /*
                 * Nope, we can't. So, we map non-allocated blocks
-                * and start IO on them using writepage()
+                * and start IO on them
                 */
                if (mpd->next_page != mpd->first_page) {
-                       if (mpage_da_map_blocks(mpd) == 0)
-                               mpage_da_submit_io(mpd);
+                       mpage_da_map_and_submit(mpd);
                        /*
                         * skip rest of the page in the page_vec
                         */
-                       mpd->io_done = 1;
                        redirty_page_for_writepage(wbc, page);
                        unlock_page(page);
                        return MPAGE_DA_EXTENT_TAIL;
@@ -2547,8 +2550,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                if (buffer_delay(bh))
                        return 0; /* Not sure this could or should happen */
                /*
-                * XXX: __block_prepare_write() unmaps passed block,
-                * is it OK?
+                * XXX: __block_write_begin() unmaps passed block, is it OK?
                 */
                ret = ext4_da_reserve_space(inode, iblock);
                if (ret)
@@ -2580,7 +2582,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 /*
  * This function is used as a standard get_block_t calback function
  * when there is no desire to allocate any blocks.  It is used as a
- * callback function for block_prepare_write() and block_write_full_page().
+ * callback function for block_write_begin() and block_write_full_page().
  * These functions should only try to map a single block at a time.
  *
  * Since this function doesn't do block allocations even if the caller
@@ -2620,6 +2622,7 @@ static int __ext4_journalled_writepage(struct page *page,
        int ret = 0;
        int err;
 
+       ClearPageChecked(page);
        page_bufs = page_buffers(page);
        BUG_ON(!page_bufs);
        walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
@@ -2697,7 +2700,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
 static int ext4_writepage(struct page *page,
                          struct writeback_control *wbc)
 {
-       int ret = 0;
+       int ret = 0, commit_write = 0;
        loff_t size;
        unsigned int len;
        struct buffer_head *page_bufs = NULL;
@@ -2710,71 +2713,46 @@ static int ext4_writepage(struct page *page,
        else
                len = PAGE_CACHE_SIZE;
 
-       if (page_has_buffers(page)) {
-               page_bufs = page_buffers(page);
-               if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-                                       ext4_bh_delay_or_unwritten)) {
-                       /*
-                        * We don't want to do  block allocation
-                        * So redirty the page and return
-                        * We may reach here when we do a journal commit
-                        * via journal_submit_inode_data_buffers.
-                        * If we don't have mapping block we just ignore
-                        * them. We can also reach here via shrink_page_list
-                        */
+       /*
+        * If the page does not have buffers (for whatever reason),
+        * try to create them using __block_write_begin.  If this
+        * fails, redirty the page and move on.
+        */
+       if (!page_buffers(page)) {
+               if (__block_write_begin(page, 0, len,
+                                       noalloc_get_block_write)) {
+               redirty_page:
                        redirty_page_for_writepage(wbc, page);
                        unlock_page(page);
                        return 0;
                }
-       } else {
+               commit_write = 1;
+       }
+       page_bufs = page_buffers(page);
+       if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+                             ext4_bh_delay_or_unwritten)) {
                /*
-                * The test for page_has_buffers() is subtle:
-                * We know the page is dirty but it lost buffers. That means
-                * that at some moment in time after write_begin()/write_end()
-                * has been called all buffers have been clean and thus they
-                * must have been written at least once. So they are all
-                * mapped and we can happily proceed with mapping them
-                * and writing the page.
-                *
-                * Try to initialize the buffer_heads and check whether
-                * all are mapped and non delay. We don't want to
-                * do block allocation here.
+                * We don't want to do block allocation, so redirty the
+                * page and return.  We may reach here when we do a
+                * journal commit via
+                * journal_submit_inode_data_buffers.  If we don't
+                * have mapping block we just ignore them.  We can also
+                * reach here via shrink_page_list.
                 */
-               ret = block_prepare_write(page, 0, len,
-                                         noalloc_get_block_write);
-               if (!ret) {
-                       page_bufs = page_buffers(page);
-                       /* check whether all are mapped and non delay */
-                       if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-                                               ext4_bh_delay_or_unwritten)) {
-                               redirty_page_for_writepage(wbc, page);
-                               unlock_page(page);
-                               return 0;
-                       }
-               } else {
-                       /*
-                        * We can't do block allocation here
-                        * so just redity the page and unlock
-                        * and return
-                        */
-                       redirty_page_for_writepage(wbc, page);
-                       unlock_page(page);
-                       return 0;
-               }
+               goto redirty_page;
+       }
+       if (commit_write)
                /* now mark the buffer_heads as dirty and uptodate */
                block_commit_write(page, 0, len);
-       }
 
-       if (PageChecked(page) && ext4_should_journal_data(inode)) {
+       if (PageChecked(page) && ext4_should_journal_data(inode))
                /*
                 * It's mmapped pagecache.  Add buffers and journal it.  There
                 * doesn't seem much point in redirtying the page here.
                 */
-               ClearPageChecked(page);
                return __ext4_journalled_writepage(page, len);
-       }
 
-       if (page_bufs && buffer_uninit(page_bufs)) {
+       if (buffer_uninit(page_bufs)) {
                ext4_set_bh_endio(page_bufs, inode);
                ret = block_write_full_page_endio(page, noalloc_get_block_write,
                                            wbc, ext4_end_io_buffer_write);
@@ -2821,25 +2799,32 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
  */
 static int write_cache_pages_da(struct address_space *mapping,
                                struct writeback_control *wbc,
-                               struct mpage_da_data *mpd)
+                               struct mpage_da_data *mpd,
+                               pgoff_t *done_index)
 {
        int ret = 0;
        int done = 0;
        struct pagevec pvec;
-       int nr_pages;
+       unsigned nr_pages;
        pgoff_t index;
        pgoff_t end;            /* Inclusive */
        long nr_to_write = wbc->nr_to_write;
+       int tag;
 
        pagevec_init(&pvec, 0);
        index = wbc->range_start >> PAGE_CACHE_SHIFT;
        end = wbc->range_end >> PAGE_CACHE_SHIFT;
 
+       if (wbc->sync_mode == WB_SYNC_ALL)
+               tag = PAGECACHE_TAG_TOWRITE;
+       else
+               tag = PAGECACHE_TAG_DIRTY;
+
+       *done_index = index;
        while (!done && (index <= end)) {
                int i;
 
-               nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-                             PAGECACHE_TAG_DIRTY,
+               nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
                              min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
                if (nr_pages == 0)
                        break;
@@ -2859,6 +2844,8 @@ static int write_cache_pages_da(struct address_space *mapping,
                                break;
                        }
 
+                       *done_index = page->index + 1;
+
                        lock_page(page);
 
                        /*
@@ -2944,6 +2931,8 @@ static int ext4_da_writepages(struct address_space *mapping,
        long desired_nr_to_write, nr_to_writebump = 0;
        loff_t range_start = wbc->range_start;
        struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+       pgoff_t done_index = 0;
+       pgoff_t end;
 
        trace_ext4_da_writepages(inode, wbc);
 
@@ -2979,8 +2968,11 @@ static int ext4_da_writepages(struct address_space *mapping,
                wbc->range_start = index << PAGE_CACHE_SHIFT;
                wbc->range_end  = LLONG_MAX;
                wbc->range_cyclic = 0;
-       } else
+               end = -1;
+       } else {
                index = wbc->range_start >> PAGE_CACHE_SHIFT;
+               end = wbc->range_end >> PAGE_CACHE_SHIFT;
+       }
 
        /*
         * This works around two forms of stupidity.  The first is in
@@ -2999,9 +2991,12 @@ static int ext4_da_writepages(struct address_space *mapping,
         * sbi->max_writeback_mb_bump whichever is smaller.
         */
        max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
-       if (!range_cyclic && range_whole)
-               desired_nr_to_write = wbc->nr_to_write * 8;
-       else
+       if (!range_cyclic && range_whole) {
+               if (wbc->nr_to_write == LONG_MAX)
+                       desired_nr_to_write = wbc->nr_to_write;
+               else
+                       desired_nr_to_write = wbc->nr_to_write * 8;
+       } else
                desired_nr_to_write = ext4_num_dirty_pages(inode, index,
                                                           max_pages);
        if (desired_nr_to_write > max_pages)
@@ -3018,6 +3013,9 @@ static int ext4_da_writepages(struct address_space *mapping,
        pages_skipped = wbc->pages_skipped;
 
 retry:
+       if (wbc->sync_mode == WB_SYNC_ALL)
+               tag_pages_for_writeback(mapping, index, end);
+
        while (!ret && wbc->nr_to_write > 0) {
 
                /*
@@ -3056,16 +3054,14 @@ retry:
                mpd.io_done = 0;
                mpd.pages_written = 0;
                mpd.retval = 0;
-               ret = write_cache_pages_da(mapping, wbc, &mpd);
+               ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
                /*
                 * If we have a contiguous extent of pages and we
                 * haven't done the I/O yet, map the blocks and submit
                 * them for I/O.
                 */
                if (!mpd.io_done && mpd.next_page != mpd.first_page) {
-                       if (mpage_da_map_blocks(&mpd) == 0)
-                               mpage_da_submit_io(&mpd);
-                       mpd.io_done = 1;
+                       mpage_da_map_and_submit(&mpd);
                        ret = MPAGE_DA_EXTENT_TAIL;
                }
                trace_ext4_da_write_pages(inode, &mpd);
@@ -3112,14 +3108,13 @@ retry:
                         __func__, wbc->nr_to_write, ret);
 
        /* Update index */
-       index += pages_written;
        wbc->range_cyclic = range_cyclic;
        if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
                /*
                 * set the writeback_index so that range_cyclic
                 * mode will write it back later
                 */
-               mapping->writeback_index = index;
+               mapping->writeback_index = done_index;
 
 out_writepages:
        wbc->nr_to_write -= nr_to_writebump;
@@ -3205,8 +3200,7 @@ retry:
        }
        *pagep = page;
 
-       ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-                               ext4_da_get_block_prep);
+       ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
        if (ret < 0) {
                unlock_page(page);
                ext4_journal_stop(handle);
@@ -3455,15 +3449,6 @@ ext4_readpages(struct file *file, struct address_space *mapping,
        return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
 }
 
-static void ext4_free_io_end(ext4_io_end_t *io)
-{
-       BUG_ON(!io);
-       if (io->page)
-               put_page(io->page);
-       iput(io->inode);
-       kfree(io);
-}
-
 static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
 {
        struct buffer_head *head, *bh;
@@ -3565,15 +3550,24 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 
 retry:
        if (rw == READ && ext4_should_dioread_nolock(inode))
-               ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
+               ret = __blockdev_direct_IO(rw, iocb, inode,
                                 inode->i_sb->s_bdev, iov,
                                 offset, nr_segs,
-                                ext4_get_block, NULL);
-       else
+                                ext4_get_block, NULL, NULL, 0);
+       else {
                ret = blockdev_direct_IO(rw, iocb, inode,
                                 inode->i_sb->s_bdev, iov,
                                 offset, nr_segs,
                                 ext4_get_block, NULL);
+
+               if (unlikely((rw & WRITE) && ret < 0)) {
+                       loff_t isize = i_size_read(inode);
+                       loff_t end = offset + iov_length(iov, nr_segs);
+
+                       if (end > isize)
+                               vmtruncate(inode, isize);
+               }
+       }
        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
                goto retry;
 
@@ -3631,173 +3625,6 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
                               EXT4_GET_BLOCKS_IO_CREATE_EXT);
 }
 
-static void dump_completed_IO(struct inode * inode)
-{
-#ifdef EXT4_DEBUG
-       struct list_head *cur, *before, *after;
-       ext4_io_end_t *io, *io0, *io1;
-       unsigned long flags;
-
-       if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
-               ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
-               return;
-       }
-
-       ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
-       spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
-       list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
-               cur = &io->list;
-               before = cur->prev;
-               io0 = container_of(before, ext4_io_end_t, list);
-               after = cur->next;
-               io1 = container_of(after, ext4_io_end_t, list);
-
-               ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
-                           io, inode->i_ino, io0, io1);
-       }
-       spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-#endif
-}
-
-/*
- * check a range of space and convert unwritten extents to written.
- */
-static int ext4_end_io_nolock(ext4_io_end_t *io)
-{
-       struct inode *inode = io->inode;
-       loff_t offset = io->offset;
-       ssize_t size = io->size;
-       int ret = 0;
-
-       ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
-                  "list->prev 0x%p\n",
-                  io, inode->i_ino, io->list.next, io->list.prev);
-
-       if (list_empty(&io->list))
-               return ret;
-
-       if (io->flag != EXT4_IO_UNWRITTEN)
-               return ret;
-
-       ret = ext4_convert_unwritten_extents(inode, offset, size);
-       if (ret < 0) {
-               printk(KERN_EMERG "%s: failed to convert unwritten"
-                       "extents to written extents, error is %d"
-                       " io is still on inode %lu aio dio list\n",
-                       __func__, ret, inode->i_ino);
-               return ret;
-       }
-
-       if (io->iocb)
-               aio_complete(io->iocb, io->result, 0);
-       /* clear the DIO AIO unwritten flag */
-       io->flag = 0;
-       return ret;
-}
-
-/*
- * work on completed aio dio IO, to convert unwritten extents to extents
- */
-static void ext4_end_io_work(struct work_struct *work)
-{
-       ext4_io_end_t           *io = container_of(work, ext4_io_end_t, work);
-       struct inode            *inode = io->inode;
-       struct ext4_inode_info  *ei = EXT4_I(inode);
-       unsigned long           flags;
-       int                     ret;
-
-       mutex_lock(&inode->i_mutex);
-       ret = ext4_end_io_nolock(io);
-       if (ret < 0) {
-               mutex_unlock(&inode->i_mutex);
-               return;
-       }
-
-       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-       if (!list_empty(&io->list))
-               list_del_init(&io->list);
-       spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-       mutex_unlock(&inode->i_mutex);
-       ext4_free_io_end(io);
-}
-
-/*
- * This function is called from ext4_sync_file().
- *
- * When IO is completed, the work to convert unwritten extents to
- * written is queued on workqueue but may not get immediately
- * scheduled. When fsync is called, we need to ensure the
- * conversion is complete before fsync returns.
- * The inode keeps track of a list of pending/completed IO that
- * might needs to do the conversion. This function walks through
- * the list and convert the related unwritten extents for completed IO
- * to written.
- * The function return the number of pending IOs on success.
- */
-int flush_completed_IO(struct inode *inode)
-{
-       ext4_io_end_t *io;
-       struct ext4_inode_info *ei = EXT4_I(inode);
-       unsigned long flags;
-       int ret = 0;
-       int ret2 = 0;
-
-       if (list_empty(&ei->i_completed_io_list))
-               return ret;
-
-       dump_completed_IO(inode);
-       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-       while (!list_empty(&ei->i_completed_io_list)){
-               io = list_entry(ei->i_completed_io_list.next,
-                               ext4_io_end_t, list);
-               /*
-                * Calling ext4_end_io_nolock() to convert completed
-                * IO to written.
-                *
-                * When ext4_sync_file() is called, run_queue() may already
-                * about to flush the work corresponding to this io structure.
-                * It will be upset if it founds the io structure related
-                * to the work-to-be schedule is freed.
-                *
-                * Thus we need to keep the io structure still valid here after
-                * convertion finished. The io structure has a flag to
-                * avoid double converting from both fsync and background work
-                * queue work.
-                */
-               spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-               ret = ext4_end_io_nolock(io);
-               spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-               if (ret < 0)
-                       ret2 = ret;
-               else
-                       list_del_init(&io->list);
-       }
-       spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-       return (ret2 < 0) ? ret2 : 0;
-}
-
-static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
-{
-       ext4_io_end_t *io = NULL;
-
-       io = kmalloc(sizeof(*io), flags);
-
-       if (io) {
-               igrab(inode);
-               io->inode = inode;
-               io->flag = 0;
-               io->offset = 0;
-               io->size = 0;
-               io->page = NULL;
-               io->iocb = NULL;
-               io->result = 0;
-               INIT_WORK(&io->work, ext4_end_io_work);
-               INIT_LIST_HEAD(&io->list);
-       }
-
-       return io;
-}
-
 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
                            ssize_t size, void *private, int ret,
                            bool is_async)
@@ -3817,7 +3644,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
                  size);
 
        /* if not aio dio with unwritten extents, just free io and return */
-       if (io_end->flag != EXT4_IO_UNWRITTEN){
+       if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
                ext4_free_io_end(io_end);
                iocb->private = NULL;
 out:
@@ -3834,14 +3661,14 @@ out:
        }
        wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
 
-       /* queue the work to convert unwritten extents to written */
-       queue_work(wq, &io_end->work);
-
        /* Add the io_end to per-inode completed aio dio list*/
        ei = EXT4_I(io_end->inode);
        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
        list_add_tail(&io_end->list, &ei->i_completed_io_list);
        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
+       /* queue the work to convert unwritten extents to written */
+       queue_work(wq, &io_end->work);
        iocb->private = NULL;
 }
 
@@ -3862,7 +3689,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
                goto out;
        }
 
-       io_end->flag = EXT4_IO_UNWRITTEN;
+       io_end->flag = EXT4_IO_END_UNWRITTEN;
        inode = io_end->inode;
 
        /* Add the io_end to per-inode completed io list*/
@@ -5453,6 +5280,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 {
        struct inode *inode = dentry->d_inode;
        int error, rc = 0;
+       int orphan = 0;
        const unsigned int ia_valid = attr->ia_valid;
 
        error = inode_change_ok(inode, attr);
@@ -5508,8 +5336,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
                        error = PTR_ERR(handle);
                        goto err_out;
                }
-
-               error = ext4_orphan_add(handle, inode);
+               if (ext4_handle_valid(handle)) {
+                       error = ext4_orphan_add(handle, inode);
+                       orphan = 1;
+               }
                EXT4_I(inode)->i_disksize = attr->ia_size;
                rc = ext4_mark_inode_dirty(handle, inode);
                if (!error)
@@ -5527,6 +5357,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
                                        goto err_out;
                                }
                                ext4_orphan_del(handle, inode);
+                               orphan = 0;
                                ext4_journal_stop(handle);
                                goto err_out;
                        }
@@ -5536,12 +5367,20 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
                        ext4_truncate(inode);
        }
 
-       rc = inode_setattr(inode, attr);
+       if ((attr->ia_valid & ATTR_SIZE) &&
+           attr->ia_size != i_size_read(inode))
+               rc = vmtruncate(inode, attr->ia_size);
 
-       /* If inode_setattr's call to ext4_truncate failed to get a
-        * transaction handle at all, we need to clean up the in-core
-        * orphan list manually. */
-       if (inode->i_nlink)
+       if (!rc) {
+               setattr_copy(inode, attr);
+               mark_inode_dirty(inode);
+       }
+
+       /*
+        * If the call to ext4_truncate failed to get a transaction handle at
+        * all, we need to clean up the in-core orphan list manually.
+        */
+       if (orphan && inode->i_nlink)
                ext4_orphan_del(NULL, inode);
 
        if (!rc && (ia_valid & ATTR_MODE))
@@ -5624,7 +5463,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
  *
  * Also account for superblock, inode, quota and xattr blocks
  */
-int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 {
        ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
        int gdpblocks;