Btrfs: fix space leak when we fail to reserve metadata space
[linux-2.6.git] / fs / ocfs2 / file.c
index 1e1a93a..7602783 100644 (file)
@@ -38,7 +38,6 @@
 #include <linux/quotaops.h>
 #include <linux/blkdev.h>
 
-#define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -61,6 +60,7 @@
 #include "acl.h"
 #include "quota.h"
 #include "refcounttree.h"
+#include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
 
@@ -99,8 +99,10 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
        int mode = file->f_flags;
        struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
-       mlog(0, "(0x%p, 0x%p, '%.*s')\n", inode, file,
-            file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name);
+       trace_ocfs2_file_open(inode, file, file->f_path.dentry,
+                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                             file->f_path.dentry->d_name.len,
+                             file->f_path.dentry->d_name.name, mode);
 
        if (file->f_mode & FMODE_WRITE)
                dquot_initialize(inode);
@@ -142,13 +144,15 @@ static int ocfs2_file_release(struct inode *inode, struct file *file)
 {
        struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
-       mlog(0, "(0x%p, 0x%p, '%.*s')\n", inode, file,
-            file->f_path.dentry->d_name.len,
-            file->f_path.dentry->d_name.name);
-
        spin_lock(&oi->ip_lock);
        if (!--oi->ip_open_count)
                oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
+
+       trace_ocfs2_file_release(inode, file, file->f_path.dentry,
+                                oi->ip_blkno,
+                                file->f_path.dentry->d_name.len,
+                                file->f_path.dentry->d_name.name,
+                                oi->ip_open_count);
        spin_unlock(&oi->ip_lock);
 
        ocfs2_free_file_private(inode, file);
@@ -167,17 +171,30 @@ static int ocfs2_dir_release(struct inode *inode, struct file *file)
        return 0;
 }
 
-static int ocfs2_sync_file(struct file *file, int datasync)
+static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
+                          int datasync)
 {
        int err = 0;
        journal_t *journal;
        struct inode *inode = file->f_mapping->host;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-       mlog(0, "(0x%p, %d, 0x%p, '%.*s')\n", file, datasync,
-            file->f_path.dentry, file->f_path.dentry->d_name.len,
-            file->f_path.dentry->d_name.name);
+       trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
+                             OCFS2_I(inode)->ip_blkno,
+                             file->f_path.dentry->d_name.len,
+                             file->f_path.dentry->d_name.name,
+                             (unsigned long long)datasync);
+
+       err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+       if (err)
+               return err;
 
+       /*
+        * Probably don't need the i_mutex at all in here, just putting it here
+        * to be consistent with how fsync used to be called, someone more
+        * familiar with the fs could possibly remove it.
+        */
+       mutex_lock(&inode->i_mutex);
        if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
                /*
                 * We still have to flush drive's caches to get data to the
@@ -194,6 +211,7 @@ static int ocfs2_sync_file(struct file *file, int datasync)
 bail:
        if (err)
                mlog_errno(err);
+       mutex_unlock(&inode->i_mutex);
 
        return (err < 0) ? -EIO : 0;
 }
@@ -431,14 +449,14 @@ static int ocfs2_truncate_file(struct inode *inode,
        struct ocfs2_dinode *fe = NULL;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-       mlog(0, "(inode = %llu, new_i_size = %llu\n",
-            (unsigned long long)OCFS2_I(inode)->ip_blkno,
-            (unsigned long long)new_i_size);
-
        /* We trust di_bh because it comes from ocfs2_inode_lock(), which
         * already validated it */
        fe = (struct ocfs2_dinode *) di_bh->b_data;
 
+       trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno,
+                                 (unsigned long long)le64_to_cpu(fe->i_size),
+                                 (unsigned long long)new_i_size);
+
        mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
                        "Inode %llu, inode i_size = %lld != di "
                        "i_size = %llu, i_flags = 0x%x\n",
@@ -448,19 +466,14 @@ static int ocfs2_truncate_file(struct inode *inode,
                        le32_to_cpu(fe->i_flags));
 
        if (new_i_size > le64_to_cpu(fe->i_size)) {
-               mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n",
-                    (unsigned long long)le64_to_cpu(fe->i_size),
-                    (unsigned long long)new_i_size);
+               trace_ocfs2_truncate_file_error(
+                       (unsigned long long)le64_to_cpu(fe->i_size),
+                       (unsigned long long)new_i_size);
                status = -EINVAL;
                mlog_errno(status);
                goto bail;
        }
 
-       mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n",
-            (unsigned long long)le64_to_cpu(fe->i_blkno),
-            (unsigned long long)le64_to_cpu(fe->i_size),
-            (unsigned long long)new_i_size);
-
        /* lets handle the simple truncate cases before doing any more
         * cluster locking. */
        if (new_i_size == le64_to_cpu(fe->i_size))
@@ -566,8 +579,6 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
        struct ocfs2_extent_tree et;
        int did_quota = 0;
 
-       mlog(0, "(clusters_to_add = %u)\n", clusters_to_add);
-
        /*
         * This function only exists for file systems which don't
         * support holes.
@@ -584,11 +595,6 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 restart_all:
        BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
 
-       mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
-            "clusters_to_add = %u\n",
-            (unsigned long long)OCFS2_I(inode)->ip_blkno,
-            (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
-            clusters_to_add);
        ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
        status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
                                       &data_ac, &meta_ac);
@@ -608,6 +614,12 @@ restart_all:
        }
 
 restarted_transaction:
+       trace_ocfs2_extend_allocation(
+               (unsigned long long)OCFS2_I(inode)->ip_blkno,
+               (unsigned long long)i_size_read(inode),
+               le32_to_cpu(fe->i_clusters), clusters_to_add,
+               why, restart_func);
+
        status = dquot_alloc_space_nodirty(inode,
                        ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
        if (status)
@@ -654,13 +666,11 @@ restarted_transaction:
 
        if (why != RESTART_NONE && clusters_to_add) {
                if (why == RESTART_META) {
-                       mlog(0, "restarting function.\n");
                        restart_func = 1;
                        status = 0;
                } else {
                        BUG_ON(why != RESTART_TRANS);
 
-                       mlog(0, "restarting transaction.\n");
                        /* TODO: This can be more intelligent. */
                        credits = ocfs2_calc_extend_credits(osb->sb,
                                                            &fe->id2.i_list,
@@ -677,11 +687,11 @@ restarted_transaction:
                }
        }
 
-       mlog(0, "fe: i_clusters = %u, i_size=%llu\n",
+       trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno,
             le32_to_cpu(fe->i_clusters),
-            (unsigned long long)le64_to_cpu(fe->i_size));
-       mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
-            OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
+            (unsigned long long)le64_to_cpu(fe->i_size),
+            OCFS2_I(inode)->ip_clusters,
+            (unsigned long long)i_size_read(inode));
 
 leave:
        if (status < 0 && did_quota)
@@ -772,10 +782,11 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
        if (!zero_to)
                zero_to = PAGE_CACHE_SIZE;
 
-       mlog(0,
-            "abs_from = %llu, abs_to = %llu, index = %lu, zero_from = %u, zero_to = %u\n",
-            (unsigned long long)abs_from, (unsigned long long)abs_to,
-            index, zero_from, zero_to);
+       trace_ocfs2_write_zero_page(
+                       (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                       (unsigned long long)abs_from,
+                       (unsigned long long)abs_to,
+                       index, zero_from, zero_to);
 
        /* We know that zero_from is block aligned */
        for (block_start = zero_from; block_start < zero_to;
@@ -915,9 +926,10 @@ static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
        u64 next_pos;
        u64 zero_pos = range_start;
 
-       mlog(0, "range_start = %llu, range_end = %llu\n",
-            (unsigned long long)range_start,
-            (unsigned long long)range_end);
+       trace_ocfs2_zero_extend_range(
+                       (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                       (unsigned long long)range_start,
+                       (unsigned long long)range_end);
        BUG_ON(range_start >= range_end);
 
        while (zero_pos < range_end) {
@@ -949,9 +961,9 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
        struct super_block *sb = inode->i_sb;
 
        zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
-       mlog(0, "zero_start %llu for i_size %llu\n",
-            (unsigned long long)zero_start,
-            (unsigned long long)i_size_read(inode));
+       trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno,
+                               (unsigned long long)zero_start,
+                               (unsigned long long)i_size_read(inode));
        while (zero_start < zero_to_size) {
                ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
                                                  zero_to_size,
@@ -1100,30 +1112,20 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
        struct dquot *transfer_to[MAXQUOTAS] = { };
        int qtype;
 
-       mlog(0, "(0x%p, '%.*s')\n", dentry,
-            dentry->d_name.len, dentry->d_name.name);
+       trace_ocfs2_setattr(inode, dentry,
+                           (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                           dentry->d_name.len, dentry->d_name.name,
+                           attr->ia_valid, attr->ia_mode,
+                           attr->ia_uid, attr->ia_gid);
 
        /* ensuring we don't even attempt to truncate a symlink */
        if (S_ISLNK(inode->i_mode))
                attr->ia_valid &= ~ATTR_SIZE;
 
-       if (attr->ia_valid & ATTR_MODE)
-               mlog(0, "mode change: %d\n", attr->ia_mode);
-       if (attr->ia_valid & ATTR_UID)
-               mlog(0, "uid change: %d\n", attr->ia_uid);
-       if (attr->ia_valid & ATTR_GID)
-               mlog(0, "gid change: %d\n", attr->ia_gid);
-       if (attr->ia_valid & ATTR_SIZE)
-               mlog(0, "size change...\n");
-       if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
-               mlog(0, "time change...\n");
-
 #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
                           | ATTR_GID | ATTR_UID | ATTR_MODE)
-       if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
-               mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
+       if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
                return 0;
-       }
 
        status = inode_change_ok(inode, attr);
        if (status)
@@ -1152,6 +1154,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
                if (status)
                        goto bail_unlock;
 
+               inode_dio_wait(inode);
+
                if (i_size_read(inode) > attr->ia_size) {
                        if (ocfs2_should_order_data(inode)) {
                                status = ocfs2_begin_ordered_truncate(inode,
@@ -1289,11 +1293,11 @@ bail:
        return err;
 }
 
-int ocfs2_permission(struct inode *inode, int mask, unsigned int flags)
+int ocfs2_permission(struct inode *inode, int mask)
 {
        int ret;
 
-       if (flags & IPERM_FLAG_RCU)
+       if (mask & MAY_NOT_BLOCK)
                return -ECHILD;
 
        ret = ocfs2_inode_lock(inode, NULL, 0);
@@ -1303,7 +1307,7 @@ int ocfs2_permission(struct inode *inode, int mask, unsigned int flags)
                goto out;
        }
 
-       ret = generic_permission(inode, mask, flags, ocfs2_check_acl);
+       ret = generic_permission(inode, mask);
 
        ocfs2_inode_unlock(inode, 0);
 out:
@@ -1318,8 +1322,9 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_dinode *di;
 
-       mlog(0, "(Inode %llu, mode 0%o)\n",
-            (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode);
+       trace_ocfs2_write_remove_suid(
+                       (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                       inode->i_mode);
 
        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
        if (IS_ERR(handle)) {
@@ -1525,8 +1530,9 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
         * partial clusters here. There's no need to worry about
         * physical allocation - the zeroing code knows to skip holes.
         */
-       mlog(0, "byte start: %llu, end: %llu\n",
-            (unsigned long long)start, (unsigned long long)end);
+       trace_ocfs2_zero_partial_clusters(
+               (unsigned long long)OCFS2_I(inode)->ip_blkno,
+               (unsigned long long)start, (unsigned long long)end);
 
        /*
         * If both edges are on a cluster boundary then there's no
@@ -1550,8 +1556,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
        if (tmpend > end)
                tmpend = end;
 
-       mlog(0, "1st range: start: %llu, tmpend: %llu\n",
-            (unsigned long long)start, (unsigned long long)tmpend);
+       trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start,
+                                                (unsigned long long)tmpend);
 
        ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
        if (ret)
@@ -1565,8 +1571,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
                 */
                start = end & ~(osb->s_clustersize - 1);
 
-               mlog(0, "2nd range: start: %llu, end: %llu\n",
-                    (unsigned long long)start, (unsigned long long)end);
+               trace_ocfs2_zero_partial_clusters_range2(
+                       (unsigned long long)start, (unsigned long long)end);
 
                ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
                if (ret)
@@ -1615,6 +1621,9 @@ static void ocfs2_calc_trunc_pos(struct inode *inode,
        range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
 
        if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
+               /*
+                * remove an entire extent record.
+                */
                *trunc_cpos = le32_to_cpu(rec->e_cpos);
                /*
                 * Skip holes if any.
@@ -1625,7 +1634,16 @@ static void ocfs2_calc_trunc_pos(struct inode *inode,
                *blkno = le64_to_cpu(rec->e_blkno);
                *trunc_end = le32_to_cpu(rec->e_cpos);
        } else if (range > trunc_start) {
+               /*
+                * remove a partial extent record, which means we're
+                * removing the last extent record.
+                */
                *trunc_cpos = trunc_start;
+               /*
+                * skip hole if any.
+                */
+               if (range < *trunc_end)
+                       *trunc_end = range;
                *trunc_len = *trunc_end - trunc_start;
                coff = trunc_start - le32_to_cpu(rec->e_cpos);
                *blkno = le64_to_cpu(rec->e_blkno) +
@@ -1666,6 +1684,11 @@ static int ocfs2_remove_inode_range(struct inode *inode,
        ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
        ocfs2_init_dealloc_ctxt(&dealloc);
 
+       trace_ocfs2_remove_inode_range(
+                       (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                       (unsigned long long)byte_start,
+                       (unsigned long long)byte_len);
+
        if (byte_len == 0)
                return 0;
 
@@ -1712,11 +1735,6 @@ static int ocfs2_remove_inode_range(struct inode *inode,
        trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
        cluster_in_el = trunc_end;
 
-       mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, cend: %u\n",
-            (unsigned long long)OCFS2_I(inode)->ip_blkno,
-            (unsigned long long)byte_start,
-            (unsigned long long)byte_len, trunc_start, trunc_end);
-
        ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
        if (ret) {
                mlog_errno(ret);
@@ -1932,6 +1950,9 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
        if (ret < 0)
                mlog_errno(ret);
 
+       if (file && (file->f_flags & O_SYNC))
+               handle->h_sync = 1;
+
        ocfs2_commit_trans(osb, handle);
 
 out_inode_unlock:
@@ -2034,6 +2055,23 @@ out:
        return ret;
 }
 
+static void ocfs2_aiodio_wait(struct inode *inode)
+{
+       wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
+       /* Block until this inode's ip_unaligned_aio counter reads zero. */
+       wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0));
+}
+
+static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
+{
+       int blockmask = inode->i_sb->s_blocksize - 1;
+       loff_t final_size = pos + count;
+       /* Returns 1 if pos or pos + count has bits set under blockmask, else 0. */
+       if ((pos & blockmask) || (final_size & blockmask))
+               return 1;
+       return 0;
+}
+
 static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
                                            struct file *file,
                                            loff_t pos, size_t count,
@@ -2071,7 +2109,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
        int ret = 0, meta_level = 0;
        struct dentry *dentry = file->f_path.dentry;
        struct inode *inode = dentry->d_inode;
-       loff_t saved_pos, end;
+       loff_t saved_pos = 0, end;
 
        /*
         * We start with a read level meta lock and only jump to an ex
@@ -2090,7 +2128,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
                 * remove_suid() calls ->setattr without any hint that
                 * we may have already done our cluster locking. Since
                 * ocfs2_setattr() *must* take cluster locks to
-                * proceeed, this will lead us to recursively lock the
+                * proceed, this will lead us to recursively lock the
                 * inode. There's also the dinode i_size state which
                 * can be lost via setattr during extending writes (we
                 * set inode->i_size at the end of a write. */
@@ -2110,12 +2148,10 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
 
                /* work on a copy of ppos until we're sure that we won't have
                 * to recalculate it due to relocking. */
-               if (appending) {
+               if (appending)
                        saved_pos = i_size_read(inode);
-                       mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
-               } else {
+               else
                        saved_pos = *ppos;
-               }
 
                end = saved_pos + count;
 
@@ -2186,6 +2222,10 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
                *ppos = saved_pos;
 
 out_unlock:
+       trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
+                                           saved_pos, appending, count,
+                                           direct_io, has_refcount);
+
        if (meta_level >= 0)
                ocfs2_inode_unlock(inode, meta_level);
 
@@ -2210,11 +2250,13 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        int full_coherency = !(osb->s_mount_opt &
                               OCFS2_MOUNT_COHERENCY_BUFFERED);
+       int unaligned_dio = 0;
 
-       mlog(0, "(0x%p, %u, '%.*s')\n", file,
-            (unsigned int)nr_segs,
-            file->f_path.dentry->d_name.len,
-            file->f_path.dentry->d_name.name);
+       trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
+               (unsigned long long)OCFS2_I(inode)->ip_blkno,
+               file->f_path.dentry->d_name.len,
+               file->f_path.dentry->d_name.name,
+               (unsigned int)nr_segs);
 
        if (iocb->ki_left == 0)
                return 0;
@@ -2229,9 +2271,8 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
        ocfs2_iocb_clear_sem_locked(iocb);
 
 relock:
-       /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
+       /* to match setattr's i_mutex -> rw_lock ordering */
        if (direct_io) {
-               down_read(&inode->i_alloc_sem);
                have_alloc_sem = 1;
                /* communicate with ocfs2_dio_end_io */
                ocfs2_iocb_set_sem_locked(iocb);
@@ -2277,13 +2318,16 @@ relock:
                goto out;
        }
 
+       if (direct_io && !is_sync_kiocb(iocb))
+               unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left,
+                                                     *ppos);
+
        /*
         * We can't complete the direct I/O as requested, fall back to
         * buffered I/O.
         */
        if (direct_io && !can_do_direct) {
                ocfs2_rw_unlock(inode, rw_level);
-               up_read(&inode->i_alloc_sem);
 
                have_alloc_sem = 0;
                rw_level = -1;
@@ -2292,6 +2336,18 @@ relock:
                goto relock;
        }
 
+       if (unaligned_dio) {
+               /*
+                * Wait on previous unaligned aio to complete before
+                * proceeding.
+                */
+               ocfs2_aiodio_wait(inode);
+
+               /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
+               atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
+               ocfs2_iocb_set_unaligned_aio(iocb);
+       }
+
        /*
         * To later detect whether a journal commit for sync writes is
         * necessary, we sample i_size, and cluster count here.
@@ -2354,8 +2410,7 @@ out_dio:
        /*
         * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
         * function pointer which is called when o_direct io completes so that
-        * it can unlock our rw lock.  (it's the clustered equivalent of
-        * i_alloc_sem; protects truncate from racing with pending ios).
+        * it can unlock our rw lock.
         * Unfortunately there are error cases which call end_io and others
         * that don't.  so we don't have to unlock the rw_lock if either an
         * async dio is going to do it in the future or an end_io after an
@@ -2364,6 +2419,12 @@ out_dio:
        if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
                rw_level = -1;
                have_alloc_sem = 0;
+               unaligned_dio = 0;
+       }
+
+       if (unaligned_dio) {
+               ocfs2_iocb_clear_unaligned_aio(iocb);
+               atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio);
        }
 
 out:
@@ -2371,10 +2432,8 @@ out:
                ocfs2_rw_unlock(inode, rw_level);
 
 out_sems:
-       if (have_alloc_sem) {
-               up_read(&inode->i_alloc_sem);
+       if (have_alloc_sem)
                ocfs2_iocb_clear_sem_locked(iocb);
-       }
 
        mutex_unlock(&inode->i_mutex);
 
@@ -2415,10 +2474,11 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
                .u.file = out,
        };
 
-       mlog(0, "(0x%p, 0x%p, %u, '%.*s')\n", out, pipe,
-            (unsigned int)len,
-            out->f_path.dentry->d_name.len,
-            out->f_path.dentry->d_name.name);
+
+       trace_ocfs2_file_splice_write(inode, out, out->f_path.dentry,
+                       (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                       out->f_path.dentry->d_name.len,
+                       out->f_path.dentry->d_name.name, len);
 
        if (pipe->inode)
                mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT);
@@ -2474,10 +2534,10 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
        int ret = 0, lock_level = 0;
        struct inode *inode = in->f_path.dentry->d_inode;
 
-       mlog(0, "(0x%p, 0x%p, %u, '%.*s')\n", in, pipe,
-            (unsigned int)len,
-            in->f_path.dentry->d_name.len,
-            in->f_path.dentry->d_name.name);
+       trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry,
+                       (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                       in->f_path.dentry->d_name.len,
+                       in->f_path.dentry->d_name.name, len);
 
        /*
         * See the comment in ocfs2_file_aio_read()
@@ -2504,10 +2564,11 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
        struct file *filp = iocb->ki_filp;
        struct inode *inode = filp->f_path.dentry->d_inode;
 
-       mlog(0, "(0x%p, %u, '%.*s')\n", filp,
-            (unsigned int)nr_segs,
-            filp->f_path.dentry->d_name.len,
-            filp->f_path.dentry->d_name.name);
+       trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
+                       (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                       filp->f_path.dentry->d_name.len,
+                       filp->f_path.dentry->d_name.name, nr_segs);
+
 
        if (!inode) {
                ret = -EINVAL;
@@ -2522,7 +2583,6 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
         * need locks to protect pending reads from racing with truncate.
         */
        if (filp->f_flags & O_DIRECT) {
-               down_read(&inode->i_alloc_sem);
                have_alloc_sem = 1;
                ocfs2_iocb_set_sem_locked(iocb);
 
@@ -2553,8 +2613,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
        ocfs2_inode_unlock(inode, lock_level);
 
        ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
-       if (ret == -EINVAL)
-               mlog(0, "generic_file_aio_read returned -EINVAL\n");
+       trace_generic_file_aio_read_ret(ret);
 
        /* buffered aio wouldn't have proper lock coverage today */
        BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
@@ -2566,16 +2625,66 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
        }
 
 bail:
-       if (have_alloc_sem) {
-               up_read(&inode->i_alloc_sem);
+       if (have_alloc_sem)
                ocfs2_iocb_clear_sem_locked(iocb);
-       }
+
        if (rw_level != -1)
                ocfs2_rw_unlock(inode, rw_level);
 
        return ret;
 }
 
+/* Refer to generic_file_llseek_unlocked(); also handles SEEK_DATA/SEEK_HOLE. */
+static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin)
+{
+       struct inode *inode = file->f_mapping->host;
+       int ret = 0;
+
+       mutex_lock(&inode->i_mutex);
+
+       switch (origin) {
+       case SEEK_SET:
+               break;
+       case SEEK_END:
+               offset += inode->i_size;
+               break;
+       case SEEK_CUR:
+               if (offset == 0) {
+                       offset = file->f_pos;
+                       goto out;
+               }
+               offset += file->f_pos;
+               break;
+       case SEEK_DATA:
+       case SEEK_HOLE:
+               ret = ocfs2_seek_data_hole_offset(file, &offset, origin);
+               if (ret)
+                       goto out;
+               break;
+       default:
+               ret = -EINVAL;
+               goto out;
+       }
+       /* Validate the resulting offset before committing it to f_pos. */
+       if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
+               ret = -EINVAL;
+       if (!ret && offset > inode->i_sb->s_maxbytes)
+               ret = -EINVAL;
+       if (ret)
+               goto out;
+
+       if (offset != file->f_pos) {
+               file->f_pos = offset;
+               file->f_version = 0;
+       }
+
+out:
+       mutex_unlock(&inode->i_mutex);
+       if (ret)
+               return ret;
+       return offset;
+}
+
 const struct inode_operations ocfs2_file_iops = {
        .setattr        = ocfs2_setattr,
        .getattr        = ocfs2_getattr,
@@ -2585,12 +2694,14 @@ const struct inode_operations ocfs2_file_iops = {
        .listxattr      = ocfs2_listxattr,
        .removexattr    = generic_removexattr,
        .fiemap         = ocfs2_fiemap,
+       .get_acl        = ocfs2_iop_get_acl,
 };
 
 const struct inode_operations ocfs2_special_file_iops = {
        .setattr        = ocfs2_setattr,
        .getattr        = ocfs2_getattr,
        .permission     = ocfs2_permission,
+       .get_acl        = ocfs2_iop_get_acl,
 };
 
 /*
@@ -2598,7 +2709,7 @@ const struct inode_operations ocfs2_special_file_iops = {
  * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
  */
 const struct file_operations ocfs2_fops = {
-       .llseek         = generic_file_llseek,
+       .llseek         = ocfs2_file_llseek,
        .read           = do_sync_read,
        .write          = do_sync_write,
        .mmap           = ocfs2_mmap,
@@ -2646,7 +2757,7 @@ const struct file_operations ocfs2_dops = {
  * the cluster.
  */
 const struct file_operations ocfs2_fops_no_plocks = {
-       .llseek         = generic_file_llseek,
+       .llseek         = ocfs2_file_llseek,
        .read           = do_sync_read,
        .write          = do_sync_write,
        .mmap           = ocfs2_mmap,
@@ -2662,6 +2773,7 @@ const struct file_operations ocfs2_fops_no_plocks = {
        .flock          = ocfs2_flock,
        .splice_read    = ocfs2_file_splice_read,
        .splice_write   = ocfs2_file_splice_write,
+       .fallocate      = ocfs2_fallocate,
 };
 
 const struct file_operations ocfs2_dops_no_plocks = {