Btrfs: fix space leak when we fail to reserve metadata space
[linux-2.6.git] / fs / ocfs2 / file.c
index 2b10b36..7602783 100644 (file)
@@ -36,8 +36,8 @@
 #include <linux/writeback.h>
 #include <linux/falloc.h>
 #include <linux/quotaops.h>
+#include <linux/blkdev.h>
 
-#define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
 #include "acl.h"
 #include "quota.h"
 #include "refcounttree.h"
+#include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
 
-static int ocfs2_sync_inode(struct inode *inode)
-{
-       filemap_fdatawrite(inode->i_mapping);
-       return sync_mapping_buffers(inode->i_mapping);
-}
-
 static int ocfs2_init_file_private(struct inode *inode, struct file *file)
 {
        struct ocfs2_file_private *fp;
@@ -104,8 +99,10 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
        int mode = file->f_flags;
        struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
-       mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
-                  file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name);
+       trace_ocfs2_file_open(inode, file, file->f_path.dentry,
+                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                             file->f_path.dentry->d_name.len,
+                             file->f_path.dentry->d_name.name, mode);
 
        if (file->f_mode & FMODE_WRITE)
                dquot_initialize(inode);
@@ -140,7 +137,6 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
        }
 
 leave:
-       mlog_exit(status);
        return status;
 }
 
@@ -148,19 +144,19 @@ static int ocfs2_file_release(struct inode *inode, struct file *file)
 {
        struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
-       mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
-                      file->f_path.dentry->d_name.len,
-                      file->f_path.dentry->d_name.name);
-
        spin_lock(&oi->ip_lock);
        if (!--oi->ip_open_count)
                oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
+
+       trace_ocfs2_file_release(inode, file, file->f_path.dentry,
+                                oi->ip_blkno,
+                                file->f_path.dentry->d_name.len,
+                                file->f_path.dentry->d_name.name,
+                                oi->ip_open_count);
        spin_unlock(&oi->ip_lock);
 
        ocfs2_free_file_private(inode, file);
 
-       mlog_exit(0);
-
        return 0;
 }
 
@@ -175,29 +171,47 @@ static int ocfs2_dir_release(struct inode *inode, struct file *file)
        return 0;
 }
 
-static int ocfs2_sync_file(struct file *file, int datasync)
+static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
+                          int datasync)
 {
        int err = 0;
        journal_t *journal;
-       struct dentry *dentry = file->f_path.dentry;
        struct inode *inode = file->f_mapping->host;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-       mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
-                  dentry->d_name.len, dentry->d_name.name);
+       trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
+                             OCFS2_I(inode)->ip_blkno,
+                             file->f_path.dentry->d_name.len,
+                             file->f_path.dentry->d_name.name,
+                             (unsigned long long)datasync);
 
-       err = ocfs2_sync_inode(dentry->d_inode);
+       err = filemap_write_and_wait_range(inode->i_mapping, start, end);
        if (err)
-               goto bail;
+               return err;
 
-       if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+       /*
+        * The i_mutex is probably not needed here at all; it is taken only
+        * to stay consistent with how fsync used to be called. Someone more
+        * familiar with the fs could possibly remove it.
+        */
+       mutex_lock(&inode->i_mutex);
+       if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
+               /*
+                * We still have to flush the drive's caches to get the
+                * data to the platter.
+                */
+               if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
+                       blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
                goto bail;
+       }
 
        journal = osb->journal->j_journal;
        err = jbd2_journal_force_commit(journal);
 
 bail:
-       mlog_exit(err);
+       if (err)
+               mlog_errno(err);
+       mutex_unlock(&inode->i_mutex);
 
        return (err < 0) ? -EIO : 0;
 }
@@ -253,8 +267,6 @@ int ocfs2_update_inode_atime(struct inode *inode,
        handle_t *handle;
        struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data;
 
-       mlog_entry_void();
-
        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
@@ -282,7 +294,6 @@ int ocfs2_update_inode_atime(struct inode *inode,
 out_commit:
        ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
-       mlog_exit(ret);
        return ret;
 }
 
@@ -293,7 +304,6 @@ static int ocfs2_set_inode_size(handle_t *handle,
 {
        int status;
 
-       mlog_entry_void();
        i_size_write(inode, new_i_size);
        inode->i_blocks = ocfs2_inode_sector_count(inode);
        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@@ -305,7 +315,6 @@ static int ocfs2_set_inode_size(handle_t *handle,
        }
 
 bail:
-       mlog_exit(status);
        return status;
 }
 
@@ -361,7 +370,7 @@ static int ocfs2_cow_file_pos(struct inode *inode,
        if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
                goto out;
 
-       return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1);
+       return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1);
 
 out:
        return status;
@@ -377,8 +386,6 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
        struct ocfs2_dinode *di;
        u64 cluster_bytes;
 
-       mlog_entry_void();
-
        /*
         * We need to CoW the cluster contains the offset if it is reflinked
         * since we will call ocfs2_zero_range_for_truncate later which will
@@ -431,8 +438,6 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 out_commit:
        ocfs2_commit_trans(osb, handle);
 out:
-
-       mlog_exit(status);
        return status;
 }
 
@@ -444,14 +449,14 @@ static int ocfs2_truncate_file(struct inode *inode,
        struct ocfs2_dinode *fe = NULL;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-       mlog_entry("(inode = %llu, new_i_size = %llu\n",
-                  (unsigned long long)OCFS2_I(inode)->ip_blkno,
-                  (unsigned long long)new_i_size);
-
        /* We trust di_bh because it comes from ocfs2_inode_lock(), which
         * already validated it */
        fe = (struct ocfs2_dinode *) di_bh->b_data;
 
+       trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno,
+                                 (unsigned long long)le64_to_cpu(fe->i_size),
+                                 (unsigned long long)new_i_size);
+
        mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
                        "Inode %llu, inode i_size = %lld != di "
                        "i_size = %llu, i_flags = 0x%x\n",
@@ -461,19 +466,14 @@ static int ocfs2_truncate_file(struct inode *inode,
                        le32_to_cpu(fe->i_flags));
 
        if (new_i_size > le64_to_cpu(fe->i_size)) {
-               mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n",
-                    (unsigned long long)le64_to_cpu(fe->i_size),
-                    (unsigned long long)new_i_size);
+               trace_ocfs2_truncate_file_error(
+                       (unsigned long long)le64_to_cpu(fe->i_size),
+                       (unsigned long long)new_i_size);
                status = -EINVAL;
                mlog_errno(status);
                goto bail;
        }
 
-       mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n",
-            (unsigned long long)le64_to_cpu(fe->i_blkno),
-            (unsigned long long)le64_to_cpu(fe->i_size),
-            (unsigned long long)new_i_size);
-
        /* lets handle the simple truncate cases before doing any more
         * cluster locking. */
        if (new_i_size == le64_to_cpu(fe->i_size))
@@ -527,7 +527,6 @@ bail:
        if (!status && OCFS2_I(inode)->ip_clusters == 0)
                status = ocfs2_try_remove_refcount_tree(inode, di_bh);
 
-       mlog_exit(status);
        return status;
 }
 
@@ -580,8 +579,6 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
        struct ocfs2_extent_tree et;
        int did_quota = 0;
 
-       mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
-
        /*
         * This function only exists for file systems which don't
         * support holes.
@@ -598,11 +595,6 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 restart_all:
        BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
 
-       mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
-            "clusters_to_add = %u\n",
-            (unsigned long long)OCFS2_I(inode)->ip_blkno,
-            (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
-            clusters_to_add);
        ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
        status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
                                       &data_ac, &meta_ac);
@@ -622,6 +614,12 @@ restart_all:
        }
 
 restarted_transaction:
+       trace_ocfs2_extend_allocation(
+               (unsigned long long)OCFS2_I(inode)->ip_blkno,
+               (unsigned long long)i_size_read(inode),
+               le32_to_cpu(fe->i_clusters), clusters_to_add,
+               why, restart_func);
+
        status = dquot_alloc_space_nodirty(inode,
                        ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
        if (status)
@@ -668,13 +666,11 @@ restarted_transaction:
 
        if (why != RESTART_NONE && clusters_to_add) {
                if (why == RESTART_META) {
-                       mlog(0, "restarting function.\n");
                        restart_func = 1;
                        status = 0;
                } else {
                        BUG_ON(why != RESTART_TRANS);
 
-                       mlog(0, "restarting transaction.\n");
                        /* TODO: This can be more intelligent. */
                        credits = ocfs2_calc_extend_credits(osb->sb,
                                                            &fe->id2.i_list,
@@ -691,11 +687,11 @@ restarted_transaction:
                }
        }
 
-       mlog(0, "fe: i_clusters = %u, i_size=%llu\n",
+       trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno,
             le32_to_cpu(fe->i_clusters),
-            (unsigned long long)le64_to_cpu(fe->i_size));
-       mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
-            OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
+            (unsigned long long)le64_to_cpu(fe->i_size),
+            OCFS2_I(inode)->ip_clusters,
+            (unsigned long long)i_size_read(inode));
 
 leave:
        if (status < 0 && did_quota)
@@ -720,7 +716,6 @@ leave:
        brelse(bh);
        bh = NULL;
 
-       mlog_exit(status);
        return status;
 }
 
@@ -774,7 +769,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
        BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
        BUG_ON(abs_from & (inode->i_blkbits - 1));
 
-       page = grab_cache_page(mapping, index);
+       page = find_or_create_page(mapping, index, GFP_NOFS);
        if (!page) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -787,10 +782,11 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
        if (!zero_to)
                zero_to = PAGE_CACHE_SIZE;
 
-       mlog(0,
-            "abs_from = %llu, abs_to = %llu, index = %lu, zero_from = %u, zero_to = %u\n",
-            (unsigned long long)abs_from, (unsigned long long)abs_to,
-            index, zero_from, zero_to);
+       trace_ocfs2_write_zero_page(
+                       (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                       (unsigned long long)abs_from,
+                       (unsigned long long)abs_to,
+                       index, zero_from, zero_to);
 
        /* We know that zero_from is block aligned */
        for (block_start = zero_from; block_start < zero_to;
@@ -798,13 +794,12 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
                block_end = block_start + (1 << inode->i_blkbits);
 
                /*
-                * block_start is block-aligned.  Bump it by one to
-                * force ocfs2_{prepare,commit}_write() to zero the
+                * block_start is block-aligned.  Bump it by one to force
+                * __block_write_begin and block_commit_write to zero the
                 * whole block.
                 */
-               ret = ocfs2_prepare_write_nolock(inode, page,
-                                                block_start + 1,
-                                                block_start + 1);
+               ret = __block_write_begin(page, block_start + 1, 0,
+                                         ocfs2_get_block);
                if (ret < 0) {
                        mlog_errno(ret);
                        goto out_unlock;
@@ -904,8 +899,8 @@ static int ocfs2_zero_extend_get_range(struct inode *inode,
                zero_clusters = last_cpos - zero_cpos;
 
        if (needs_cow) {
-               rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters,
-                                       UINT_MAX);
+               rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos,
+                                       zero_clusters, UINT_MAX);
                if (rc) {
                        mlog_errno(rc);
                        goto out;
@@ -931,9 +926,10 @@ static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
        u64 next_pos;
        u64 zero_pos = range_start;
 
-       mlog(0, "range_start = %llu, range_end = %llu\n",
-            (unsigned long long)range_start,
-            (unsigned long long)range_end);
+       trace_ocfs2_zero_extend_range(
+                       (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                       (unsigned long long)range_start,
+                       (unsigned long long)range_end);
        BUG_ON(range_start >= range_end);
 
        while (zero_pos < range_end) {
@@ -965,9 +961,9 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
        struct super_block *sb = inode->i_sb;
 
        zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
-       mlog(0, "zero_start %llu for i_size %llu\n",
-            (unsigned long long)zero_start,
-            (unsigned long long)i_size_read(inode));
+       trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno,
+                               (unsigned long long)zero_start,
+                               (unsigned long long)i_size_read(inode));
        while (zero_start < zero_to_size) {
                ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
                                                  zero_to_size,
@@ -1116,30 +1112,20 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
        struct dquot *transfer_to[MAXQUOTAS] = { };
        int qtype;
 
-       mlog_entry("(0x%p, '%.*s')\n", dentry,
-                  dentry->d_name.len, dentry->d_name.name);
+       trace_ocfs2_setattr(inode, dentry,
+                           (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                           dentry->d_name.len, dentry->d_name.name,
+                           attr->ia_valid, attr->ia_mode,
+                           attr->ia_uid, attr->ia_gid);
 
        /* ensuring we don't even attempt to truncate a symlink */
        if (S_ISLNK(inode->i_mode))
                attr->ia_valid &= ~ATTR_SIZE;
 
-       if (attr->ia_valid & ATTR_MODE)
-               mlog(0, "mode change: %d\n", attr->ia_mode);
-       if (attr->ia_valid & ATTR_UID)
-               mlog(0, "uid change: %d\n", attr->ia_uid);
-       if (attr->ia_valid & ATTR_GID)
-               mlog(0, "gid change: %d\n", attr->ia_gid);
-       if (attr->ia_valid & ATTR_SIZE)
-               mlog(0, "size change...\n");
-       if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
-               mlog(0, "time change...\n");
-
 #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
                           | ATTR_GID | ATTR_UID | ATTR_MODE)
-       if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
-               mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
+       if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
                return 0;
-       }
 
        status = inode_change_ok(inode, attr);
        if (status)
@@ -1168,6 +1154,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
                if (status)
                        goto bail_unlock;
 
+               inode_dio_wait(inode);
+
                if (i_size_read(inode) > attr->ia_size) {
                        if (ocfs2_should_order_data(inode)) {
                                status = ocfs2_begin_ordered_truncate(inode,
@@ -1233,18 +1221,26 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
        }
 
        /*
-        * This will intentionally not wind up calling simple_setsize(),
+        * This will intentionally not wind up calling truncate_setsize(),
         * since all the work for a size change has been done above.
         * Otherwise, we could get into problems with truncate as
         * ip_alloc_sem is used there to protect against i_size
         * changes.
+        *
+        * XXX: this means the conditional below can probably be removed.
         */
-       status = inode_setattr(inode, attr);
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail_commit;
+       if ((attr->ia_valid & ATTR_SIZE) &&
+           attr->ia_size != i_size_read(inode)) {
+               status = vmtruncate(inode, attr->ia_size);
+               if (status) {
+                       mlog_errno(status);
+                       goto bail_commit;
+               }
        }
 
+       setattr_copy(inode, attr);
+       mark_inode_dirty(inode);
+
        status = ocfs2_mark_inode_dirty(handle, inode, bh);
        if (status < 0)
                mlog_errno(status);
@@ -1269,7 +1265,6 @@ bail:
                        mlog_errno(status);
        }
 
-       mlog_exit(status);
        return status;
 }
 
@@ -1282,8 +1277,6 @@ int ocfs2_getattr(struct vfsmount *mnt,
        struct ocfs2_super *osb = sb->s_fs_info;
        int err;
 
-       mlog_entry_void();
-
        err = ocfs2_inode_revalidate(dentry);
        if (err) {
                if (err != -ENOENT)
@@ -1297,8 +1290,6 @@ int ocfs2_getattr(struct vfsmount *mnt,
        stat->blksize = osb->s_clustersize;
 
 bail:
-       mlog_exit(err);
-
        return err;
 }
 
@@ -1306,7 +1297,8 @@ int ocfs2_permission(struct inode *inode, int mask)
 {
        int ret;
 
-       mlog_entry_void();
+       if (mask & MAY_NOT_BLOCK)
+               return -ECHILD;
 
        ret = ocfs2_inode_lock(inode, NULL, 0);
        if (ret) {
@@ -1315,11 +1307,10 @@ int ocfs2_permission(struct inode *inode, int mask)
                goto out;
        }
 
-       ret = generic_permission(inode, mask, ocfs2_check_acl);
+       ret = generic_permission(inode, mask);
 
        ocfs2_inode_unlock(inode, 0);
 out:
-       mlog_exit(ret);
        return ret;
 }
 
@@ -1331,8 +1322,9 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_dinode *di;
 
-       mlog_entry("(Inode %llu, mode 0%o)\n",
-                  (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode);
+       trace_ocfs2_write_remove_suid(
+                       (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                       inode->i_mode);
 
        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
        if (IS_ERR(handle)) {
@@ -1360,7 +1352,6 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
 out_trans:
        ocfs2_commit_trans(osb, handle);
 out:
-       mlog_exit(ret);
        return ret;
 }
 
@@ -1539,8 +1530,9 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
         * partial clusters here. There's no need to worry about
         * physical allocation - the zeroing code knows to skip holes.
         */
-       mlog(0, "byte start: %llu, end: %llu\n",
-            (unsigned long long)start, (unsigned long long)end);
+       trace_ocfs2_zero_partial_clusters(
+               (unsigned long long)OCFS2_I(inode)->ip_blkno,
+               (unsigned long long)start, (unsigned long long)end);
 
        /*
         * If both edges are on a cluster boundary then there's no
@@ -1564,8 +1556,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
        if (tmpend > end)
                tmpend = end;
 
-       mlog(0, "1st range: start: %llu, tmpend: %llu\n",
-            (unsigned long long)start, (unsigned long long)tmpend);
+       trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start,
+                                                (unsigned long long)tmpend);
 
        ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
        if (ret)
@@ -1579,8 +1571,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
                 */
                start = end & ~(osb->s_clustersize - 1);
 
-               mlog(0, "2nd range: start: %llu, end: %llu\n",
-                    (unsigned long long)start, (unsigned long long)end);
+               trace_ocfs2_zero_partial_clusters_range2(
+                       (unsigned long long)start, (unsigned long long)end);
 
                ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
                if (ret)
@@ -1629,6 +1621,9 @@ static void ocfs2_calc_trunc_pos(struct inode *inode,
        range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
 
        if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
+               /*
+                * remove an entire extent record.
+                */
                *trunc_cpos = le32_to_cpu(rec->e_cpos);
                /*
                 * Skip holes if any.
@@ -1639,7 +1634,16 @@ static void ocfs2_calc_trunc_pos(struct inode *inode,
                *blkno = le64_to_cpu(rec->e_blkno);
                *trunc_end = le32_to_cpu(rec->e_cpos);
        } else if (range > trunc_start) {
+               /*
+                * remove a partial extent record, which means we're
+                * removing the last extent record.
+                */
                *trunc_cpos = trunc_start;
+               /*
+                * skip hole if any.
+                */
+               if (range < *trunc_end)
+                       *trunc_end = range;
                *trunc_len = *trunc_end - trunc_start;
                coff = trunc_start - le32_to_cpu(rec->e_cpos);
                *blkno = le64_to_cpu(rec->e_blkno) +
@@ -1680,6 +1684,11 @@ static int ocfs2_remove_inode_range(struct inode *inode,
        ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
        ocfs2_init_dealloc_ctxt(&dealloc);
 
+       trace_ocfs2_remove_inode_range(
+                       (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                       (unsigned long long)byte_start,
+                       (unsigned long long)byte_len);
+
        if (byte_len == 0)
                return 0;
 
@@ -1726,11 +1735,6 @@ static int ocfs2_remove_inode_range(struct inode *inode,
        trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
        cluster_in_el = trunc_end;
 
-       mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, cend: %u\n",
-            (unsigned long long)OCFS2_I(inode)->ip_blkno,
-            (unsigned long long)byte_start,
-            (unsigned long long)byte_len, trunc_start, trunc_end);
-
        ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
        if (ret) {
                mlog_errno(ret);
@@ -1946,6 +1950,9 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
        if (ret < 0)
                mlog_errno(ret);
 
+       if (file && (file->f_flags & O_SYNC))
+               handle->h_sync = 1;
+
        ocfs2_commit_trans(osb, handle);
 
 out_inode_unlock:
@@ -1981,28 +1988,32 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
        return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
 }
 
-static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset,
+static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
                            loff_t len)
 {
+       struct inode *inode = file->f_path.dentry->d_inode;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_space_resv sr;
        int change_size = 1;
+       int cmd = OCFS2_IOC_RESVSP64;
 
+       if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+               return -EOPNOTSUPP;
        if (!ocfs2_writes_unwritten_extents(osb))
                return -EOPNOTSUPP;
 
-       if (S_ISDIR(inode->i_mode))
-               return -ENODEV;
-
        if (mode & FALLOC_FL_KEEP_SIZE)
                change_size = 0;
 
+       if (mode & FALLOC_FL_PUNCH_HOLE)
+               cmd = OCFS2_IOC_UNRESVSP64;
+
        sr.l_whence = 0;
        sr.l_start = (s64)offset;
        sr.l_len = (s64)len;
 
-       return __ocfs2_change_file_space(NULL, inode, offset,
-                                        OCFS2_IOC_RESVSP64, &sr, change_size);
+       return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr,
+                                        change_size);
 }
 
 int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
@@ -2044,7 +2055,25 @@ out:
        return ret;
 }
 
+static void ocfs2_aiodio_wait(struct inode *inode)
+{
+       wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
+
+       wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0));
+}
+
+static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
+{
+       int blockmask = inode->i_sb->s_blocksize - 1;
+       loff_t final_size = pos + count;
+
+       if ((pos & blockmask) || (final_size & blockmask))
+               return 1;
+       return 0;
+}
+
 static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
+                                           struct file *file,
                                            loff_t pos, size_t count,
                                            int *meta_level)
 {
@@ -2062,7 +2091,7 @@ static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
 
        *meta_level = 1;
 
-       ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
+       ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX);
        if (ret)
                mlog_errno(ret);
 out:
@@ -2070,7 +2099,7 @@ out:
        return ret;
 }
 
-static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
+static int ocfs2_prepare_inode_for_write(struct file *file,
                                         loff_t *ppos,
                                         size_t count,
                                         int appending,
@@ -2078,8 +2107,9 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
                                         int *has_refcount)
 {
        int ret = 0, meta_level = 0;
+       struct dentry *dentry = file->f_path.dentry;
        struct inode *inode = dentry->d_inode;
-       loff_t saved_pos, end;
+       loff_t saved_pos = 0, end;
 
        /*
         * We start with a read level meta lock and only jump to an ex
@@ -2098,7 +2128,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
                 * remove_suid() calls ->setattr without any hint that
                 * we may have already done our cluster locking. Since
                 * ocfs2_setattr() *must* take cluster locks to
-                * proceeed, this will lead us to recursively lock the
+                * proceed, this will lead us to recursively lock the
                 * inode. There's also the dinode i_size state which
                 * can be lost via setattr during extending writes (we
                 * set inode->i_size at the end of a write. */
@@ -2118,12 +2148,10 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 
                /* work on a copy of ppos until we're sure that we won't have
                 * to recalculate it due to relocking. */
-               if (appending) {
+               if (appending)
                        saved_pos = i_size_read(inode);
-                       mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
-               } else {
+               else
                        saved_pos = *ppos;
-               }
 
                end = saved_pos + count;
 
@@ -2133,6 +2161,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
                        meta_level = -1;
 
                        ret = ocfs2_prepare_inode_for_refcount(inode,
+                                                              file,
                                                               saved_pos,
                                                               count,
                                                               &meta_level);
@@ -2193,6 +2222,10 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
                *ppos = saved_pos;
 
 out_unlock:
+       trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
+                                           saved_pos, appending, count,
+                                           direct_io, has_refcount);
+
        if (meta_level >= 0)
                ocfs2_inode_unlock(inode, meta_level);
 
@@ -2215,11 +2248,15 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_path.dentry->d_inode;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       int full_coherency = !(osb->s_mount_opt &
+                              OCFS2_MOUNT_COHERENCY_BUFFERED);
+       int unaligned_dio = 0;
 
-       mlog_entry("(0x%p, %u, '%.*s')\n", file,
-                  (unsigned int)nr_segs,
-                  file->f_path.dentry->d_name.len,
-                  file->f_path.dentry->d_name.name);
+       trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
+               (unsigned long long)OCFS2_I(inode)->ip_blkno,
+               file->f_path.dentry->d_name.len,
+               file->f_path.dentry->d_name.name,
+               (unsigned int)nr_segs);
 
        if (iocb->ki_left == 0)
                return 0;
@@ -2231,23 +2268,49 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 
        mutex_lock(&inode->i_mutex);
 
+       ocfs2_iocb_clear_sem_locked(iocb);
+
 relock:
-       /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
+       /* to match setattr's i_mutex -> rw_lock ordering */
        if (direct_io) {
-               down_read(&inode->i_alloc_sem);
                have_alloc_sem = 1;
+               /* communicate with ocfs2_dio_end_io */
+               ocfs2_iocb_set_sem_locked(iocb);
        }
 
-       /* concurrent O_DIRECT writes are allowed */
-       rw_level = !direct_io;
+       /*
+        * Concurrent O_DIRECT writes are allowed with
+        * mount_option "coherency=buffered".
+        */
+       rw_level = (!direct_io || full_coherency);
+
        ret = ocfs2_rw_lock(inode, rw_level);
        if (ret < 0) {
                mlog_errno(ret);
                goto out_sems;
        }
 
+       /*
+        * O_DIRECT writes with "coherency=full" need to take EX cluster
+        * inode_lock to guarantee coherency.
+        */
+       if (direct_io && full_coherency) {
+               /*
+                * We need to take and drop the inode lock to force
+                * other nodes to drop their caches.  Buffered I/O
+                * already does this in write_begin().
+                */
+               ret = ocfs2_inode_lock(inode, NULL, 1);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       goto out_sems;
+               }
+
+               ocfs2_inode_unlock(inode, 1);
+       }
+
        can_do_direct = direct_io;
-       ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
+       ret = ocfs2_prepare_inode_for_write(file, ppos,
                                            iocb->ki_left, appending,
                                            &can_do_direct, &has_refcount);
        if (ret < 0) {
@@ -2255,13 +2318,16 @@ relock:
                goto out;
        }
 
+       if (direct_io && !is_sync_kiocb(iocb))
+               unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left,
+                                                     *ppos);
+
        /*
         * We can't complete the direct I/O as requested, fall back to
         * buffered I/O.
         */
        if (direct_io && !can_do_direct) {
                ocfs2_rw_unlock(inode, rw_level);
-               up_read(&inode->i_alloc_sem);
 
                have_alloc_sem = 0;
                rw_level = -1;
@@ -2270,6 +2336,18 @@ relock:
                goto relock;
        }
 
+       if (unaligned_dio) {
+               /*
+                * Wait on previous unaligned aio to complete before
+                * proceeding.
+                */
+               ocfs2_aiodio_wait(inode);
+
+               /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
+               atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
+               ocfs2_iocb_set_unaligned_aio(iocb);
+       }
+
        /*
         * To later detect whether a journal commit for sync writes is
         * necessary, we sample i_size, and cluster count here.
@@ -2295,17 +2373,6 @@ relock:
                written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
                                                    ppos, count, ocount);
                if (written < 0) {
-                       /*
-                        * direct write may have instantiated a few
-                        * blocks outside i_size. Trim these off again.
-                        * Don't need i_size_read because we hold i_mutex.
-                        *
-                        * XXX(hch): this looks buggy because ocfs2 did not
-                        * actually implement ->truncate.  Take a look at
-                        * the new truncate sequence and update this accordingly
-                        */
-                       if (*ppos + count > inode->i_size)
-                               simple_setsize(inode, inode->i_size);
                        ret = written;
                        goto out_dio;
                }
@@ -2321,7 +2388,7 @@ out_dio:
        BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
 
        if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
-           ((file->f_flags & O_DIRECT) && has_refcount)) {
+           ((file->f_flags & O_DIRECT) && !direct_io)) {
                ret = filemap_fdatawrite_range(file->f_mapping, pos,
                                               pos + count - 1);
                if (ret < 0)
@@ -2343,8 +2410,7 @@ out_dio:
        /*
         * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
         * function pointer which is called when o_direct io completes so that
-        * it can unlock our rw lock.  (it's the clustered equivalent of
-        * i_alloc_sem; protects truncate from racing with pending ios).
+        * it can unlock our rw lock.
         * Unfortunately there are error cases which call end_io and others
         * that don't.  so we don't have to unlock the rw_lock if either an
         * async dio is going to do it in the future or an end_io after an
@@ -2353,6 +2419,12 @@ out_dio:
        if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
                rw_level = -1;
                have_alloc_sem = 0;
+               unaligned_dio = 0;
+       }
+
+       if (unaligned_dio) {
+               ocfs2_iocb_clear_unaligned_aio(iocb);
+               atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio);
        }
 
 out:
@@ -2361,13 +2433,12 @@ out:
 
 out_sems:
        if (have_alloc_sem)
-               up_read(&inode->i_alloc_sem);
+               ocfs2_iocb_clear_sem_locked(iocb);
 
        mutex_unlock(&inode->i_mutex);
 
        if (written)
                ret = written;
-       mlog_exit(ret);
        return ret;
 }
 
@@ -2377,7 +2448,7 @@ static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
 {
        int ret;
 
-       ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos,
+       ret = ocfs2_prepare_inode_for_write(out, &sd->pos,
                                            sd->total_len, 0, NULL, NULL);
        if (ret < 0) {
                mlog_errno(ret);
@@ -2403,10 +2474,11 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
                .u.file = out,
        };
 
-       mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe,
-                  (unsigned int)len,
-                  out->f_path.dentry->d_name.len,
-                  out->f_path.dentry->d_name.name);
+
+       trace_ocfs2_file_splice_write(inode, out, out->f_path.dentry,
+                       (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                       out->f_path.dentry->d_name.len,
+                       out->f_path.dentry->d_name.name, len);
 
        if (pipe->inode)
                mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT);
@@ -2450,7 +2522,6 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
                balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
        }
 
-       mlog_exit(ret);
        return ret;
 }
 
@@ -2463,10 +2534,10 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
        int ret = 0, lock_level = 0;
        struct inode *inode = in->f_path.dentry->d_inode;
 
-       mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe,
-                  (unsigned int)len,
-                  in->f_path.dentry->d_name.len,
-                  in->f_path.dentry->d_name.name);
+       trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry,
+                       (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                       in->f_path.dentry->d_name.len,
+                       in->f_path.dentry->d_name.name, len);
 
        /*
         * See the comment in ocfs2_file_aio_read()
@@ -2481,7 +2552,6 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
        ret = generic_file_splice_read(in, ppos, pipe, len, flags);
 
 bail:
-       mlog_exit(ret);
        return ret;
 }
 
@@ -2494,10 +2564,11 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
        struct file *filp = iocb->ki_filp;
        struct inode *inode = filp->f_path.dentry->d_inode;
 
-       mlog_entry("(0x%p, %u, '%.*s')\n", filp,
-                  (unsigned int)nr_segs,
-                  filp->f_path.dentry->d_name.len,
-                  filp->f_path.dentry->d_name.name);
+       trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
+                       (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                       filp->f_path.dentry->d_name.len,
+                       filp->f_path.dentry->d_name.name, nr_segs);
+
 
        if (!inode) {
                ret = -EINVAL;
@@ -2505,13 +2576,15 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
                goto bail;
        }
 
+       ocfs2_iocb_clear_sem_locked(iocb);
+
        /*
         * buffered reads protect themselves in ->readpage().  O_DIRECT reads
         * need locks to protect pending reads from racing with truncate.
         */
        if (filp->f_flags & O_DIRECT) {
-               down_read(&inode->i_alloc_sem);
                have_alloc_sem = 1;
+               ocfs2_iocb_set_sem_locked(iocb);
 
                ret = ocfs2_rw_lock(inode, 0);
                if (ret < 0) {
@@ -2540,8 +2613,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
        ocfs2_inode_unlock(inode, lock_level);
 
        ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
-       if (ret == -EINVAL)
-               mlog(0, "generic_file_aio_read returned -EINVAL\n");
+       trace_generic_file_aio_read_ret(ret);
 
        /* buffered aio wouldn't have proper lock coverage today */
        BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
@@ -2554,14 +2626,65 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
 
 bail:
        if (have_alloc_sem)
-               up_read(&inode->i_alloc_sem);
+               ocfs2_iocb_clear_sem_locked(iocb);
+
        if (rw_level != -1)
                ocfs2_rw_unlock(inode, rw_level);
-       mlog_exit(ret);
 
        return ret;
 }
 
+/* llseek for ocfs2 files; see generic_file_llseek_unlocked(). Holds i_mutex. */
+static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin)
+{
+       struct inode *inode = file->f_mapping->host;
+       int ret = 0; /* 0 on success, else the error code returned to the caller */
+
+       mutex_lock(&inode->i_mutex);
+
+       switch (origin) {
+       case SEEK_SET: /* offset is used as given */
+               break;
+       case SEEK_END: /* NOTE(review): i_size is read with no cluster inode lock taken here; confirm it cannot be stale */
+               offset += inode->i_size;
+               break;
+       case SEEK_CUR:
+               if (offset == 0) { /* position query: report f_pos and skip the checks and update below */
+                       offset = file->f_pos;
+                       goto out;
+               }
+               offset += file->f_pos;
+               break;
+       case SEEK_DATA:
+       case SEEK_HOLE:
+               ret = ocfs2_seek_data_hole_offset(file, &offset, origin); /* presumably rewrites offset in place; verify in helper */
+               if (ret)
+                       goto out;
+               break;
+       default: /* unknown origin */
+               ret = -EINVAL;
+               goto out;
+       }
+
+       if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) /* negative result allowed only with FMODE_UNSIGNED_OFFSET */
+               ret = -EINVAL;
+       if (!ret && offset > inode->i_sb->s_maxbytes) /* beyond the superblock's s_maxbytes limit */
+               ret = -EINVAL;
+       if (ret)
+               goto out;
+
+       if (offset != file->f_pos) { /* only touch the file when the position really changes */
+               file->f_pos = offset;
+               file->f_version = 0;
+       }
+
+out:
+       mutex_unlock(&inode->i_mutex);
+       if (ret) /* an error takes precedence over the computed offset */
+               return ret;
+       return offset;
+}
+
 const struct inode_operations ocfs2_file_iops = {
        .setattr        = ocfs2_setattr,
        .getattr        = ocfs2_getattr,
@@ -2570,14 +2693,15 @@ const struct inode_operations ocfs2_file_iops = {
        .getxattr       = generic_getxattr,
        .listxattr      = ocfs2_listxattr,
        .removexattr    = generic_removexattr,
-       .fallocate      = ocfs2_fallocate,
        .fiemap         = ocfs2_fiemap,
+       .get_acl        = ocfs2_iop_get_acl,
 };
 
 const struct inode_operations ocfs2_special_file_iops = {
        .setattr        = ocfs2_setattr,
        .getattr        = ocfs2_getattr,
        .permission     = ocfs2_permission,
+       .get_acl        = ocfs2_iop_get_acl,
 };
 
 /*
@@ -2585,7 +2709,7 @@ const struct inode_operations ocfs2_special_file_iops = {
  * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
  */
 const struct file_operations ocfs2_fops = {
-       .llseek         = generic_file_llseek,
+       .llseek         = ocfs2_file_llseek,
        .read           = do_sync_read,
        .write          = do_sync_write,
        .mmap           = ocfs2_mmap,
@@ -2602,6 +2726,7 @@ const struct file_operations ocfs2_fops = {
        .flock          = ocfs2_flock,
        .splice_read    = ocfs2_file_splice_read,
        .splice_write   = ocfs2_file_splice_write,
+       .fallocate      = ocfs2_fallocate,
 };
 
 const struct file_operations ocfs2_dops = {
@@ -2632,7 +2757,7 @@ const struct file_operations ocfs2_dops = {
  * the cluster.
  */
 const struct file_operations ocfs2_fops_no_plocks = {
-       .llseek         = generic_file_llseek,
+       .llseek         = ocfs2_file_llseek,
        .read           = do_sync_read,
        .write          = do_sync_write,
        .mmap           = ocfs2_mmap,
@@ -2648,6 +2773,7 @@ const struct file_operations ocfs2_fops_no_plocks = {
        .flock          = ocfs2_flock,
        .splice_read    = ocfs2_file_splice_read,
        .splice_write   = ocfs2_file_splice_write,
+       .fallocate      = ocfs2_fallocate,
 };
 
 const struct file_operations ocfs2_dops_no_plocks = {