vfs: do (nearly) lockless generic_file_llseek
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index aa99647..fe6bc02 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -19,6 +19,8 @@
 #include <linux/fs.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/ext2_fs.h>
+#include <linux/falloc.h>
+#include <linux/swap.h>
 #include <linux/crc32.h>
 #include <linux/writeback.h>
 #include <asm/uaccess.h>
@@ -61,11 +63,11 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
                error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
                                           &i_gh);
                if (!error) {
-                       error = generic_file_llseek_unlocked(file, offset, origin);
+                       error = generic_file_llseek(file, offset, origin);
                        gfs2_glock_dq_uninit(&i_gh);
                }
        } else
-               error = generic_file_llseek_unlocked(file, offset, origin);
+               error = generic_file_llseek(file, offset, origin);
 
        return error;
 }
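
The hunk above only switches the caller; the VFS-side change named in the subject line is what makes generic_file_llseek() safe to call here without i_mutex. As a rough, hedged sketch of the idea only (the real VFS code also handles SEEK_CUR serialisation, f_lock and error cases):

static loff_t lockless_llseek_sketch(struct file *file, loff_t offset,
				     int origin)
{
	struct inode *inode = file->f_mapping->host;

	switch (origin) {
	case SEEK_END:
		offset += i_size_read(inode);	/* lockless i_size sample */
		break;
	case SEEK_CUR:
		offset += file->f_pos;
		break;
	}
	if (offset < 0 || offset > inode->i_sb->s_maxbytes)
		return -EINVAL;
	if (offset != file->f_pos) {
		file->f_pos = offset;
		file->f_version = 0;
	}
	return offset;
}
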
@@ -172,7 +174,9 @@ void gfs2_set_inode_flags(struct inode *inode)
        struct gfs2_inode *ip = GFS2_I(inode);
        unsigned int flags = inode->i_flags;
 
-       flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+       flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_NOSEC);
+       if ((ip->i_eattr == 0) && !is_sxid(inode->i_mode))
+               inode->i_flags |= S_NOSEC;
        if (ip->i_diskflags & GFS2_DIF_IMMUTABLE)
                flags |= S_IMMUTABLE;
        if (ip->i_diskflags & GFS2_DIF_APPENDONLY)
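
The S_NOSEC handling added above lets the VFS write path skip stripping suid/sgid and security attributes when the inode has no extended attributes and no sxid bits. Illustrative sketch only, not GFS2 code (the real check lives inside file_remove_suid()):

static int strip_suid_if_needed(struct file *file)
{
	struct inode *inode = file->f_path.dentry->d_inode;

	if (IS_NOSEC(inode))		/* set by gfs2_set_inode_flags() above */
		return 0;		/* fast path: nothing to strip */
	return file_remove_suid(file);	/* may clear setuid/setgid bits */
}
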
@@ -219,7 +223,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
                goto out_drop_write;
 
        error = -EACCES;
-       if (!is_owner_or_cap(inode))
+       if (!inode_owner_or_capable(inode))
                goto out;
 
        error = 0;
@@ -446,15 +450,20 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
 {
        struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
 
-       if (!(file->f_flags & O_NOATIME)) {
+       if (!(file->f_flags & O_NOATIME) &&
+           !IS_NOATIME(&ip->i_inode)) {
                struct gfs2_holder i_gh;
                int error;
 
-               gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+               gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
                error = gfs2_glock_nq(&i_gh);
-               file_accessed(file);
-               if (error == 0)
-                       gfs2_glock_dq_uninit(&i_gh);
+               if (error == 0) {
+                       file_accessed(file);
+                       gfs2_glock_dq(&i_gh);
+               }
+               gfs2_holder_uninit(&i_gh);
+               if (error)
+                       return error;
        }
        vma->vm_ops = &gfs2_vm_ops;
        vma->vm_flags |= VM_CAN_NONLINEAR;
@@ -537,43 +546,44 @@ static int gfs2_close(struct inode *inode, struct file *file)
 
 /**
  * gfs2_fsync - sync the dirty data for a file (across the cluster)
- * @file: the file that points to the dentry (we ignore this)
- * @dentry: the dentry that points to the inode to sync
- *
- * The VFS will flush "normal" data for us. We only need to worry
- * about metadata here. For journaled data, we just do a log flush
- * as we can't avoid it. Otherwise we can just bale out if datasync
- * is set. For stuffed inodes we must flush the log in order to
- * ensure that all data is on disk.
+ * @file: the file that points to the dentry
+ * @start: the start position in the file to sync
+ * @end: the end position in the file to sync
+ * @datasync: set if we can ignore timestamp changes
  *
- * The call to write_inode_now() is there to write back metadata and
- * the inode itself. It does also try and write the data, but thats
- * (hopefully) a no-op due to the VFS having already called filemap_fdatawrite()
- * for us.
+ * The VFS will flush data for us. We only need to worry
+ * about metadata here.
  *
  * Returns: errno
  */
 
-static int gfs2_fsync(struct file *file, int datasync)
+static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
+                     int datasync)
 {
        struct inode *inode = file->f_mapping->host;
        int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
-       int ret = 0;
+       struct gfs2_inode *ip = GFS2_I(inode);
+       int ret;
 
-       if (gfs2_is_jdata(GFS2_I(inode))) {
-               gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl);
-               return 0;
-       }
+       ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+       if (ret)
+               return ret;
+       mutex_lock(&inode->i_mutex);
 
-       if (sync_state != 0) {
-               if (!datasync)
-                       ret = write_inode_now(inode, 0);
+       if (datasync)
+               sync_state &= ~I_DIRTY_SYNC;
 
-               if (gfs2_is_stuffed(GFS2_I(inode)))
-                       gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl);
+       if (sync_state) {
+               ret = sync_inode_metadata(inode, 1);
+               if (ret) {
+                       mutex_unlock(&inode->i_mutex);
+                       return ret;
+               }
+               gfs2_ail_flush(ip->i_gl);
        }
 
-       return ret;
+       mutex_unlock(&inode->i_mutex);
+       return 0;
 }
 
 /**
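
The new ->fsync prototype takes a byte range, and the implementation above writes and waits on that range itself before syncing metadata. A minimal sketch of the same shape for a hypothetical filesystem without cluster locking (example_fsync is a made-up name, not GFS2 API):

static int example_fsync(struct file *file, loff_t start, loff_t end,
			 int datasync)
{
	struct inode *inode = file->f_mapping->host;
	int ret;

	/* Flush and wait on the requested data range first. */
	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
	if (ret)
		return ret;

	/* For datasync, timestamp-only dirt (I_DIRTY_SYNC) can be ignored. */
	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
		return 0;

	return sync_inode_metadata(inode, 1);
}
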
@@ -610,6 +620,310 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
        return generic_file_aio_write(iocb, iov, nr_segs, pos);
 }
 
+static int empty_write_end(struct page *page, unsigned from,
+                          unsigned to, int mode)
+{
+       struct inode *inode = page->mapping->host;
+       struct gfs2_inode *ip = GFS2_I(inode);
+       struct buffer_head *bh;
+       unsigned offset, blksize = 1 << inode->i_blkbits;
+       pgoff_t end_index = i_size_read(inode) >> PAGE_CACHE_SHIFT;
+
+       zero_user(page, from, to-from);
+       mark_page_accessed(page);
+
+       if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) {
+               if (!gfs2_is_writeback(ip))
+                       gfs2_page_add_databufs(ip, page, from, to);
+
+               block_commit_write(page, from, to);
+               return 0;
+       }
+
+       offset = 0;
+       bh = page_buffers(page);
+       while (offset < to) {
+               if (offset >= from) {
+                       set_buffer_uptodate(bh);
+                       mark_buffer_dirty(bh);
+                       clear_buffer_new(bh);
+                       write_dirty_buffer(bh, WRITE);
+               }
+               offset += blksize;
+               bh = bh->b_this_page;
+       }
+
+       offset = 0;
+       bh = page_buffers(page);
+       while (offset < to) {
+               if (offset >= from) {
+                       wait_on_buffer(bh);
+                       if (!buffer_uptodate(bh))
+                               return -EIO;
+               }
+               offset += blksize;
+               bh = bh->b_this_page;
+       }
+       return 0;
+}
+
+static int needs_empty_write(sector_t block, struct inode *inode)
+{
+       int error;
+       struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
+
+       bh_map.b_size = 1 << inode->i_blkbits;
+       error = gfs2_block_map(inode, block, &bh_map, 0);
+       if (unlikely(error))
+               return error;
+       return !buffer_mapped(&bh_map);
+}
+
+static int write_empty_blocks(struct page *page, unsigned from, unsigned to,
+                             int mode)
+{
+       struct inode *inode = page->mapping->host;
+       unsigned start, end, next, blksize;
+       sector_t block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+       int ret;
+
+       blksize = 1 << inode->i_blkbits;
+       next = end = 0;
+       while (next < from) {
+               next += blksize;
+               block++;
+       }
+       start = next;
+       do {
+               next += blksize;
+               ret = needs_empty_write(block, inode);
+               if (unlikely(ret < 0))
+                       return ret;
+               if (ret == 0) {
+                       if (end) {
+                               ret = __block_write_begin(page, start, end - start,
+                                                         gfs2_block_map);
+                               if (unlikely(ret))
+                                       return ret;
+                               ret = empty_write_end(page, start, end, mode);
+                               if (unlikely(ret))
+                                       return ret;
+                               end = 0;
+                       }
+                       start = next;
+               }
+               else
+                       end = next;
+               block++;
+       } while (next < to);
+
+       if (end) {
+               ret = __block_write_begin(page, start, end - start, gfs2_block_map);
+               if (unlikely(ret))
+                       return ret;
+               ret = empty_write_end(page, start, end, mode);
+               if (unlikely(ret))
+                       return ret;
+       }
+
+       return 0;
+}
+
+static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
+                          int mode)
+{
+       struct gfs2_inode *ip = GFS2_I(inode);
+       struct buffer_head *dibh;
+       int error;
+       u64 start = offset >> PAGE_CACHE_SHIFT;
+       unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
+       u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+       pgoff_t curr;
+       struct page *page;
+       unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
+       unsigned int from, to;
+
+       if (!end_offset)
+               end_offset = PAGE_CACHE_SIZE;
+
+       error = gfs2_meta_inode_buffer(ip, &dibh);
+       if (unlikely(error))
+               goto out;
+
+       gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+
+       if (gfs2_is_stuffed(ip)) {
+               error = gfs2_unstuff_dinode(ip, NULL);
+               if (unlikely(error))
+                       goto out;
+       }
+
+       curr = start;
+       offset = start << PAGE_CACHE_SHIFT;
+       from = start_offset;
+       to = PAGE_CACHE_SIZE;
+       while (curr <= end) {
+               page = grab_cache_page_write_begin(inode->i_mapping, curr,
+                                                  AOP_FLAG_NOFS);
+               if (unlikely(!page)) {
+                       error = -ENOMEM;
+                       goto out;
+               }
+
+               if (curr == end)
+                       to = end_offset;
+               error = write_empty_blocks(page, from, to, mode);
+               if (!error && offset + to > inode->i_size &&
+                   !(mode & FALLOC_FL_KEEP_SIZE)) {
+                       i_size_write(inode, offset + to);
+               }
+               unlock_page(page);
+               page_cache_release(page);
+               if (error)
+                       goto out;
+               curr++;
+               offset += PAGE_CACHE_SIZE;
+               from = 0;
+       }
+
+       gfs2_dinode_out(ip, dibh->b_data);
+       mark_inode_dirty(inode);
+
+       brelse(dibh);
+
+out:
+       return error;
+}
+
+static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
+                           unsigned int *data_blocks, unsigned int *ind_blocks)
+{
+       const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+       unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
+       unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
+
+       for (tmp = max_data; tmp > sdp->sd_diptrs;) {
+               tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
+               max_data -= tmp;
+       }
+       /* This calculation isn't the exact reverse of gfs2_write_calc_reserve,
+          so it might end up with fewer data blocks */
+       if (max_data <= *data_blocks)
+               return;
+       *data_blocks = max_data;
+       *ind_blocks = max_blocks - max_data;
+       *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
+       if (*len > max) {
+               *len = max;
+               gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
+       }
+}
+
+static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
+                          loff_t len)
+{
+       struct inode *inode = file->f_path.dentry->d_inode;
+       struct gfs2_sbd *sdp = GFS2_SB(inode);
+       struct gfs2_inode *ip = GFS2_I(inode);
+       unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
+       loff_t bytes, max_bytes;
+       struct gfs2_alloc *al;
+       int error;
+       loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1);
+       loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
+       next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
+
+       /* We only support the FALLOC_FL_KEEP_SIZE mode */
+       if (mode & ~FALLOC_FL_KEEP_SIZE)
+               return -EOPNOTSUPP;
+
+       offset &= bsize_mask;
+
+       len = next - offset;
+       bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
+       if (!bytes)
+               bytes = UINT_MAX;
+       bytes &= bsize_mask;
+       if (bytes == 0)
+               bytes = sdp->sd_sb.sb_bsize;
+
+       gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
+       error = gfs2_glock_nq(&ip->i_gh);
+       if (unlikely(error))
+               goto out_uninit;
+
+       if (!gfs2_write_alloc_required(ip, offset, len))
+               goto out_unlock;
+
+       while (len > 0) {
+               if (len < bytes)
+                       bytes = len;
+               al = gfs2_alloc_get(ip);
+               if (!al) {
+                       error = -ENOMEM;
+                       goto out_unlock;
+               }
+
+               error = gfs2_quota_lock_check(ip);
+               if (error)
+                       goto out_alloc_put;
+
+retry:
+               gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
+
+               al->al_requested = data_blocks + ind_blocks;
+               error = gfs2_inplace_reserve(ip);
+               if (error) {
+                       if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
+                               bytes >>= 1;
+                               bytes &= bsize_mask;
+                               if (bytes == 0)
+                                       bytes = sdp->sd_sb.sb_bsize;
+                               goto retry;
+                       }
+                       goto out_qunlock;
+               }
+               max_bytes = bytes;
+               calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
+               al->al_requested = data_blocks + ind_blocks;
+
+               rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
+                         RES_RG_HDR + gfs2_rg_blocks(al);
+               if (gfs2_is_jdata(ip))
+                       rblocks += data_blocks ? data_blocks : 1;
+
+               error = gfs2_trans_begin(sdp, rblocks,
+                                        PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
+               if (error)
+                       goto out_trans_fail;
+
+               error = fallocate_chunk(inode, offset, max_bytes, mode);
+               gfs2_trans_end(sdp);
+
+               if (error)
+                       goto out_trans_fail;
+
+               len -= max_bytes;
+               offset += max_bytes;
+               gfs2_inplace_release(ip);
+               gfs2_quota_unlock(ip);
+               gfs2_alloc_put(ip);
+       }
+       goto out_unlock;
+
+out_trans_fail:
+       gfs2_inplace_release(ip);
+out_qunlock:
+       gfs2_quota_unlock(ip);
+out_alloc_put:
+       gfs2_alloc_put(ip);
+out_unlock:
+       gfs2_glock_dq(&ip->i_gh);
+out_uninit:
+       gfs2_holder_uninit(&ip->i_gh);
+       return error;
+}
+
 #ifdef CONFIG_GFS2_FS_LOCKING_DLM
 
 /**
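
gfs2_fallocate() above rejects every mode except FALLOC_FL_KEEP_SIZE, so callers can only preallocate blocks without growing i_size. A small userspace usage sketch (the path and size are made up):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Hypothetical file on a GFS2 mount. */
	int fd = open("/mnt/gfs2/prealloc.dat", O_RDWR | O_CREAT, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Preallocate 16 MiB without changing the file size; any other
	 * mode bits get -EOPNOTSUPP from gfs2_fallocate(). */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 << 20) < 0)
		perror("fallocate");
	close(fd);
	return 0;
}
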
@@ -720,8 +1034,10 @@ static void do_unflock(struct file *file, struct file_lock *fl)
 
        mutex_lock(&fp->f_fl_mutex);
        flock_lock_file_wait(file, fl);
-       if (fl_gh->gh_gl)
-               gfs2_glock_dq_uninit(fl_gh);
+       if (fl_gh->gh_gl) {
+               gfs2_glock_dq_wait(fl_gh);
+               gfs2_holder_uninit(fl_gh);
+       }
        mutex_unlock(&fp->f_fl_mutex);
 }
 
@@ -765,6 +1081,7 @@ const struct file_operations gfs2_file_fops = {
        .splice_read    = generic_file_splice_read,
        .splice_write   = generic_file_splice_write,
        .setlease       = gfs2_setlease,
+       .fallocate      = gfs2_fallocate,
 };
 
 const struct file_operations gfs2_dir_fops = {
@@ -794,6 +1111,7 @@ const struct file_operations gfs2_file_fops_nolock = {
        .splice_read    = generic_file_splice_read,
        .splice_write   = generic_file_splice_write,
        .setlease       = generic_setlease,
+       .fallocate      = gfs2_fallocate,
 };
 
 const struct file_operations gfs2_dir_fops_nolock = {