nfsd41: Deny new lock before RECLAIM_COMPLETE done
[linux-2.6.git] / fs / nfsd / vfs.c
index bd584bc..fd0acca 100644 (file)
@@ -1,7 +1,4 @@
-#define MSNFS  /* HACK HACK */
 /*
- * linux/fs/nfsd/vfs.c
- *
  * File operations used by nfsd. Some of these have been ripped from
  * other parts of the kernel because they weren't exported, others
  * are partial duplicates with added or changed functionality.
  * Zerocpy NFS support (C) 2002 Hirokazu Takahashi <taka@valinux.co.jp>
  */
 
-#include <linux/string.h>
-#include <linux/time.h>
-#include <linux/errno.h>
 #include <linux/fs.h>
 #include <linux/file.h>
-#include <linux/mount.h>
-#include <linux/major.h>
 #include <linux/splice.h>
-#include <linux/proc_fs.h>
-#include <linux/stat.h>
 #include <linux/fcntl.h>
-#include <linux/net.h>
-#include <linux/unistd.h>
-#include <linux/slab.h>
-#include <linux/pagemap.h>
-#include <linux/in.h>
-#include <linux/module.h>
 #include <linux/namei.h>
-#include <linux/vfs.h>
 #include <linux/delay.h>
-#include <linux/sunrpc/svc.h>
-#include <linux/nfsd/nfsd.h>
-#ifdef CONFIG_NFSD_V3
-#include <linux/nfs3.h>
-#include <linux/nfsd/xdr3.h>
-#endif /* CONFIG_NFSD_V3 */
-#include <linux/nfsd/nfsfh.h>
-#include <linux/quotaops.h>
 #include <linux/fsnotify.h>
-#include <linux/posix_acl.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/xattr.h>
-#ifdef CONFIG_NFSD_V4
-#include <linux/nfs4.h>
-#include <linux/nfs4_acl.h>
-#include <linux/nfsd_idmap.h>
-#include <linux/security.h>
-#endif /* CONFIG_NFSD_V4 */
 #include <linux/jhash.h>
 #include <linux/ima.h>
-
+#include <linux/slab.h>
 #include <asm/uaccess.h>
+#include <linux/exportfs.h>
+#include <linux/writeback.h>
+
+#ifdef CONFIG_NFSD_V3
+#include "xdr3.h"
+#endif /* CONFIG_NFSD_V3 */
+
+#ifdef CONFIG_NFSD_V4
+#include "acl.h"
+#include "idmap.h"
+#endif /* CONFIG_NFSD_V4 */
+
+#include "nfsd.h"
+#include "vfs.h"
 
 #define NFSDDBG_FACILITY               NFSDDBG_FILEOP
 
@@ -101,40 +83,96 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
 {
        struct svc_export *exp = *expp, *exp2 = NULL;
        struct dentry *dentry = *dpp;
-       struct vfsmount *mnt = mntget(exp->ex_path.mnt);
-       struct dentry *mounts = dget(dentry);
+       struct path path = {.mnt = mntget(exp->ex_path.mnt),
+                           .dentry = dget(dentry)};
        int err = 0;
 
-       while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts));
+       err = follow_down(&path);
+       if (err < 0)
+               goto out;
 
-       exp2 = rqst_exp_get_by_name(rqstp, mnt, mounts);
+       exp2 = rqst_exp_get_by_name(rqstp, &path);
        if (IS_ERR(exp2)) {
-               if (PTR_ERR(exp2) != -ENOENT)
-                       err = PTR_ERR(exp2);
-               dput(mounts);
-               mntput(mnt);
+               err = PTR_ERR(exp2);
+               /*
+                * We normally allow NFS clients to continue
+                * "underneath" a mountpoint that is not exported.
+                * The exception is V4ROOT, where no traversal is ever
+                * allowed without an explicit export of the new
+                * directory.
+                */
+               if (err == -ENOENT && !(exp->ex_flags & NFSEXP_V4ROOT))
+                       err = 0;
+               path_put(&path);
                goto out;
        }
-       if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) {
+       if (nfsd_v4client(rqstp) ||
+               (exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) {
                /* successfully crossed mount point */
                /*
-                * This is subtle: dentry is *not* under mnt at this point.
-                * The only reason we are safe is that original mnt is pinned
-                * down by exp, so we should dput before putting exp.
+                * This is subtle: path.dentry is *not* on path.mnt
+                * at this point.  The only reason we are safe is that
+                * original mnt is pinned down by exp, so we should
+                * put path *before* putting exp
                 */
-               dput(dentry);
-               *dpp = mounts;
-               exp_put(exp);
+               *dpp = path.dentry;
+               path.dentry = dentry;
                *expp = exp2;
-       } else {
-               exp_put(exp2);
-               dput(mounts);
+               exp2 = exp;
        }
-       mntput(mnt);
+       path_put(&path);
+       exp_put(exp2);
 out:
        return err;
 }
 
+static void follow_to_parent(struct path *path)
+{
+       struct dentry *dp;
+
+       while (path->dentry == path->mnt->mnt_root && follow_up(path))
+               ;
+       dp = dget_parent(path->dentry);
+       dput(path->dentry);
+       path->dentry = dp;
+}
+
+static int nfsd_lookup_parent(struct svc_rqst *rqstp, struct dentry *dparent, struct svc_export **exp, struct dentry **dentryp)
+{
+       struct svc_export *exp2;
+       struct path path = {.mnt = mntget((*exp)->ex_path.mnt),
+                           .dentry = dget(dparent)};
+
+       follow_to_parent(&path);
+
+       exp2 = rqst_exp_parent(rqstp, &path);
+       if (PTR_ERR(exp2) == -ENOENT) {
+               *dentryp = dget(dparent);
+       } else if (IS_ERR(exp2)) {
+               path_put(&path);
+               return PTR_ERR(exp2);
+       } else {
+               *dentryp = dget(path.dentry);
+               exp_put(*exp);
+               *exp = exp2;
+       }
+       path_put(&path);
+       return 0;
+}
+
+/*
+ * For nfsd purposes, we treat V4ROOT exports as though there was an
+ * export at *every* directory.
+ */
+int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp)
+{
+       if (d_mountpoint(dentry))
+               return 1;
+       if (!(exp->ex_flags & NFSEXP_V4ROOT))
+               return 0;
+       return dentry->d_inode != NULL;
+}
+
 __be32
 nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
                   const char *name, unsigned int len,
@@ -143,16 +181,10 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
        struct svc_export       *exp;
        struct dentry           *dparent;
        struct dentry           *dentry;
-       __be32                  err;
        int                     host_err;
 
        dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name);
 
-       /* Obtain dentry and export. */
-       err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
-       if (err)
-               return err;
-
        dparent = fhp->fh_dentry;
        exp  = fhp->fh_export;
        exp_get(exp);
@@ -163,34 +195,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
                        dentry = dget(dparent);
                else if (dparent != exp->ex_path.dentry)
                        dentry = dget_parent(dparent);
-               else if (!EX_NOHIDE(exp))
+               else if (!EX_NOHIDE(exp) && !nfsd_v4client(rqstp))
                        dentry = dget(dparent); /* .. == . just like at / */
                else {
                        /* checking mountpoint crossing is very different when stepping up */
-                       struct svc_export *exp2 = NULL;
-                       struct dentry *dp;
-                       struct vfsmount *mnt = mntget(exp->ex_path.mnt);
-                       dentry = dget(dparent);
-                       while(dentry == mnt->mnt_root && follow_up(&mnt, &dentry))
-                               ;
-                       dp = dget_parent(dentry);
-                       dput(dentry);
-                       dentry = dp;
-
-                       exp2 = rqst_exp_parent(rqstp, mnt, dentry);
-                       if (PTR_ERR(exp2) == -ENOENT) {
-                               dput(dentry);
-                               dentry = dget(dparent);
-                       } else if (IS_ERR(exp2)) {
-                               host_err = PTR_ERR(exp2);
-                               dput(dentry);
-                               mntput(mnt);
+                       host_err = nfsd_lookup_parent(rqstp, dparent, &exp, &dentry);
+                       if (host_err)
                                goto out_nfserr;
-                       } else {
-                               exp_put(exp);
-                               exp = exp2;
-                       }
-                       mntput(mnt);
                }
        } else {
                fh_lock(fhp);
@@ -201,7 +212,7 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
                /*
                 * check if we have crossed a mount point ...
                 */
-               if (d_mountpoint(dentry)) {
+               if (nfsd_mountpoint(dentry, exp)) {
                        if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
                                dput(dentry);
                                goto out_nfserr;
@@ -237,6 +248,9 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
        struct dentry           *dentry;
        __be32 err;
 
+       err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
+       if (err)
+               return err;
        err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry);
        if (err)
                return err;
@@ -256,6 +270,29 @@ out:
        return err;
 }
 
+static int nfsd_break_lease(struct inode *inode)
+{
+       if (!S_ISREG(inode->i_mode))
+               return 0;
+       return break_lease(inode, O_WRONLY | O_NONBLOCK);
+}
+
+/*
+ * Commit metadata changes to stable storage.
+ */
+static int
+commit_metadata(struct svc_fh *fhp)
+{
+       struct inode *inode = fhp->fh_dentry->d_inode;
+       const struct export_operations *export_ops = inode->i_sb->s_export_op;
+
+       if (!EX_ISSYNC(fhp->fh_export))
+               return 0;
+
+       if (export_ops->commit_metadata)
+               return export_ops->commit_metadata(inode);
+       return sync_inode_metadata(inode, 1);
+}
 
 /*
  * Set various file attributes.
@@ -342,16 +379,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
                                goto out;
                }
 
-               /*
-                * If we are changing the size of the file, then
-                * we need to break all leases.
-                */
-               host_err = break_lease(inode, FMODE_WRITE | O_NONBLOCK);
-               if (host_err == -EWOULDBLOCK)
-                       host_err = -ETIMEDOUT;
-               if (host_err) /* ENOMEM or EWOULDBLOCK */
-                       goto out_nfserr;
-
                host_err = get_write_access(inode);
                if (host_err)
                        goto out_nfserr;
@@ -362,7 +389,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
                        put_write_access(inode);
                        goto out_nfserr;
                }
-               vfs_dq_init(inode);
        }
 
        /* sanitize the mode change */
@@ -393,7 +419,11 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 
        err = nfserr_notsync;
        if (!check_guard || guardtime == inode->i_ctime.tv_sec) {
+               host_err = nfsd_break_lease(inode);
+               if (host_err)
+                       goto out_nfserr;
                fh_lock(fhp);
+
                host_err = notify_change(dentry, iap);
                err = nfserrno(host_err);
                fh_unlock(fhp);
@@ -401,8 +431,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
        if (size_change)
                put_write_access(inode);
        if (!err)
-               if (EX_ISSYNC(fhp->fh_export))
-                       write_inode_now(inode, 1);
+               commit_metadata(fhp);
 out:
        return err;
 
@@ -563,7 +592,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac
        return error;
 }
 
-#endif /* defined(CONFIG_NFS_V4) */
+#endif /* defined(CONFIG_NFSD_V4) */
 
 #ifdef CONFIG_NFSD_V3
 /*
@@ -667,7 +696,15 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor
 }
 #endif /* CONFIG_NFSD_V3 */
 
+static int nfsd_open_break_lease(struct inode *inode, int access)
+{
+       unsigned int mode;
 
+       if (access & NFSD_MAY_NOT_BREAK_LEASE)
+               return 0;
+       mode = (access & NFSD_MAY_WRITE) ? O_WRONLY : O_RDONLY;
+       return break_lease(inode, mode | O_NONBLOCK);
+}
 
 /*
  * Open an existing file or directory.
@@ -678,12 +715,13 @@ __be32
 nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
                        int access, struct file **filp)
 {
-       const struct cred *cred = current_cred();
        struct dentry   *dentry;
        struct inode    *inode;
        int             flags = O_RDONLY|O_LARGEFILE;
        __be32          err;
-       int             host_err;
+       int             host_err = 0;
+
+       validate_process_creds();
 
        /*
         * If we get here, then the client has already done an "open",
@@ -714,13 +752,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
        if (!inode->i_fop)
                goto out;
 
-       /*
-        * Check to see if there are any leases on this file.
-        * This may block while leases are broken.
-        */
-       host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? FMODE_WRITE : 0));
-       if (host_err == -EWOULDBLOCK)
-               host_err = -ETIMEDOUT;
+       host_err = nfsd_open_break_lease(inode, access);
        if (host_err) /* NOMEM or WOULDBLOCK */
                goto out_nfserr;
 
@@ -729,18 +761,17 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
                        flags = O_RDWR|O_LARGEFILE;
                else
                        flags = O_WRONLY|O_LARGEFILE;
-
-               vfs_dq_init(inode);
        }
        *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt),
-                           flags, cred);
+                           flags, current_cred());
        if (IS_ERR(*filp))
                host_err = PTR_ERR(*filp);
        else
-               ima_counts_get(*filp);
+               host_err = ima_file_check(*filp, access);
 out_nfserr:
        err = nfserrno(host_err);
 out:
+       validate_process_creds();
        return err;
 }
 
@@ -754,46 +785,6 @@ nfsd_close(struct file *filp)
 }
 
 /*
- * Sync a file
- * As this calls fsync (not fdatasync) there is no need for a write_inode
- * after it.
- */
-static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
-                             const struct file_operations *fop)
-{
-       struct inode *inode = dp->d_inode;
-       int (*fsync) (struct file *, struct dentry *, int);
-       int err;
-
-       err = filemap_fdatawrite(inode->i_mapping);
-       if (err == 0 && fop && (fsync = fop->fsync))
-               err = fsync(filp, dp, 0);
-       if (err == 0)
-               err = filemap_fdatawait(inode->i_mapping);
-
-       return err;
-}
-
-static int
-nfsd_sync(struct file *filp)
-{
-        int err;
-       struct inode *inode = filp->f_path.dentry->d_inode;
-       dprintk("nfsd: sync file %s\n", filp->f_path.dentry->d_name.name);
-       mutex_lock(&inode->i_mutex);
-       err=nfsd_dosync(filp, filp->f_path.dentry, filp->f_op);
-       mutex_unlock(&inode->i_mutex);
-
-       return err;
-}
-
-int
-nfsd_sync_dir(struct dentry *dp)
-{
-       return nfsd_dosync(NULL, dp, dp->d_inode->i_fop);
-}
-
-/*
  * Obtain the readahead parameters for the file
  * specified by (dev, ino).
  */
@@ -817,7 +808,7 @@ nfsd_get_raparms(dev_t dev, ino_t ino)
                if (ra->p_count == 0)
                        frap = rap;
        }
-       depth = nfsdstats.ra_size*11/10;
+       depth = nfsdstats.ra_size;
        if (!frap) {    
                spin_unlock(&rab->pb_lock);
                return NULL;
@@ -853,11 +844,6 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
        struct page **pp = rqstp->rq_respages + rqstp->rq_resused;
        struct page *page = buf->page;
        size_t size;
-       int ret;
-
-       ret = buf->ops->confirm(pipe, buf);
-       if (unlikely(ret))
-               return ret;
 
        size = sd->len;
 
@@ -887,36 +873,15 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe,
        return __splice_from_pipe(pipe, sd, nfsd_splice_actor);
 }
 
-static inline int svc_msnfs(struct svc_fh *ffhp)
-{
-#ifdef MSNFS
-       return (ffhp->fh_export->ex_flags & NFSEXP_MSNFS);
-#else
-       return 0;
-#endif
-}
-
 static __be32
 nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
               loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
 {
-       struct inode *inode;
-       struct raparms  *ra;
        mm_segment_t    oldfs;
        __be32          err;
        int             host_err;
 
        err = nfserr_perm;
-       inode = file->f_path.dentry->d_inode;
-
-       if (svc_msnfs(fhp) && !lock_may_read(inode, offset, *count))
-               goto out;
-
-       /* Get readahead parameters */
-       ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
-
-       if (ra && ra->p_set)
-               file->f_ra = ra->p_ra;
 
        if (file->f_op->splice_read && rqstp->rq_splice_ok) {
                struct splice_desc sd = {
@@ -935,24 +900,13 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
                set_fs(oldfs);
        }
 
-       /* Write back readahead params */
-       if (ra) {
-               struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
-               spin_lock(&rab->pb_lock);
-               ra->p_ra = file->f_ra;
-               ra->p_set = 1;
-               ra->p_count--;
-               spin_unlock(&rab->pb_lock);
-       }
-
        if (host_err >= 0) {
                nfsdstats.io_read += host_err;
                *count = host_err;
                err = 0;
-               fsnotify_access(file->f_path.dentry);
+               fsnotify_access(file);
        } else 
                err = nfserrno(host_err);
-out:
        return err;
 }
 
@@ -966,6 +920,43 @@ static void kill_suid(struct dentry *dentry)
        mutex_unlock(&dentry->d_inode->i_mutex);
 }
 
+/*
+ * Gathered writes: If another process is currently writing to the file,
+ * there's a high chance this is another nfsd (triggered by a bulk write
+ * from a client's biod). Rather than syncing the file with each write
+ * request, we sleep for 10 msec.
+ *
+ * I don't know if this roughly approximates C. Juszak's idea of
+ * gathered writes, but it's a nice and simple solution (IMHO), and it
+ * seems to work:-)
+ *
+ * Note: we do this only in the NFSv2 case, since v3 and higher have a
+ * better tool (separate unstable writes and commits) for solving this
+ * problem.
+ */
+static int wait_for_concurrent_writes(struct file *file)
+{
+       struct inode *inode = file->f_path.dentry->d_inode;
+       static ino_t last_ino;
+       static dev_t last_dev;
+       int err = 0;
+
+       if (atomic_read(&inode->i_writecount) > 1
+           || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) {
+               dprintk("nfsd: write defer %d\n", task_pid_nr(current));
+               msleep(10);
+               dprintk("nfsd: write resume %d\n", task_pid_nr(current));
+       }
+
+       if (inode->i_state & I_DIRTY) {
+               dprintk("nfsd: write sync %d\n", task_pid_nr(current));
+               err = vfs_fsync(file, 0);
+       }
+       last_ino = inode->i_ino;
+       last_dev = inode->i_sb->s_dev;
+       return err;
+}
+
 static __be32
 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
                                loff_t offset, struct kvec *vec, int vlen,
@@ -978,14 +969,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
        __be32                  err = 0;
        int                     host_err;
        int                     stable = *stablep;
-
-#ifdef MSNFS
-       err = nfserr_perm;
-
-       if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
-               (!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt)))
-               goto out;
-#endif
+       int                     use_wgather;
 
        dentry = file->f_path.dentry;
        inode = dentry->d_inode;
@@ -996,9 +980,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
         *  -   the sync export option has been set, or
         *  -   the client requested O_SYNC behavior (NFSv3 feature).
         *  -   The file system doesn't support fsync().
-        * When gathered writes have been configured for this volume,
+        * When NFSv2 gathered writes have been configured for this volume,
         * flushing the data to disk is handled separately below.
         */
+       use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp);
 
        if (!file->f_op->fsync) {/* COMMIT3 cannot work */
               stable = 2;
@@ -1007,7 +992,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 
        if (!EX_ISSYNC(exp))
                stable = 0;
-       if (stable && !EX_WGATHER(exp)) {
+       if (stable && !use_wgather) {
                spin_lock(&file->f_lock);
                file->f_flags |= O_SYNC;
                spin_unlock(&file->f_lock);
@@ -1017,58 +1002,25 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
        oldfs = get_fs(); set_fs(KERNEL_DS);
        host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);
        set_fs(oldfs);
-       if (host_err >= 0) {
-               *cnt = host_err;
-               nfsdstats.io_write += host_err;
-               fsnotify_modify(file->f_path.dentry);
-       }
+       if (host_err < 0)
+               goto out_nfserr;
+       *cnt = host_err;
+       nfsdstats.io_write += host_err;
+       fsnotify_modify(file);
 
        /* clear setuid/setgid flag after write */
-       if (host_err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID)))
+       if (inode->i_mode & (S_ISUID | S_ISGID))
                kill_suid(dentry);
 
-       if (host_err >= 0 && stable) {
-               static ino_t    last_ino;
-               static dev_t    last_dev;
-
-               /*
-                * Gathered writes: If another process is currently
-                * writing to the file, there's a high chance
-                * this is another nfsd (triggered by a bulk write
-                * from a client's biod). Rather than syncing the
-                * file with each write request, we sleep for 10 msec.
-                *
-                * I don't know if this roughly approximates
-                * C. Juszak's idea of gathered writes, but it's a
-                * nice and simple solution (IMHO), and it seems to
-                * work:-)
-                */
-               if (EX_WGATHER(exp)) {
-                       if (atomic_read(&inode->i_writecount) > 1
-                           || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) {
-                               dprintk("nfsd: write defer %d\n", task_pid_nr(current));
-                               msleep(10);
-                               dprintk("nfsd: write resume %d\n", task_pid_nr(current));
-                       }
-
-                       if (inode->i_state & I_DIRTY) {
-                               dprintk("nfsd: write sync %d\n", task_pid_nr(current));
-                               host_err=nfsd_sync(file);
-                       }
-#if 0
-                       wake_up(&inode->i_wait);
-#endif
-               }
-               last_ino = inode->i_ino;
-               last_dev = inode->i_sb->s_dev;
-       }
+       if (stable && use_wgather)
+               host_err = wait_for_concurrent_writes(file);
 
+out_nfserr:
        dprintk("nfsd: write complete host_err=%d\n", host_err);
        if (host_err >= 0)
                err = 0;
        else
                err = nfserrno(host_err);
-out:
        return err;
 }
 
@@ -1077,8 +1029,45 @@ out:
  * on entry. On return, *count contains the number of bytes actually read.
  * N.B. After this call fhp needs an fh_put
  */
+__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
+       loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
+{
+       struct file *file;
+       struct inode *inode;
+       struct raparms  *ra;
+       __be32 err;
+
+       err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
+       if (err)
+               return err;
+
+       inode = file->f_path.dentry->d_inode;
+
+       /* Get readahead parameters */
+       ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
+
+       if (ra && ra->p_set)
+               file->f_ra = ra->p_ra;
+
+       err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
+
+       /* Write back readahead params */
+       if (ra) {
+               struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
+               spin_lock(&rab->pb_lock);
+               ra->p_ra = file->f_ra;
+               ra->p_set = 1;
+               ra->p_count--;
+               spin_unlock(&rab->pb_lock);
+       }
+
+       nfsd_close(file);
+       return err;
+}
+
+/* As above, but use the provided file descriptor. */
 __be32
-nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
+nfsd_read_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
                loff_t offset, struct kvec *vec, int vlen,
                unsigned long *count)
 {
@@ -1090,13 +1079,8 @@ nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
                if (err)
                        goto out;
                err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
-       } else {
-               err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
-               if (err)
-                       goto out;
-               err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
-               nfsd_close(file);
-       }
+       } else /* Note file may still be NULL in NFSv4 special stateid case: */
+               err = nfsd_read(rqstp, fhp, offset, vec, vlen, count);
 out:
        return err;
 }
@@ -1137,8 +1121,9 @@ out:
 #ifdef CONFIG_NFSD_V3
 /*
  * Commit all pending writes to stable storage.
- * Strictly speaking, we could sync just the indicated file region here,
- * but there's currently no way we can ask the VFS to do so.
+ *
+ * Note: we only guarantee that data that lies within the range specified
+ * by the 'offset' and 'count' parameters will be synced.
  *
  * Unfortunately we cannot lock the file to make sure we return full WCC
  * data to the client, as locking happens lower down in the filesystem.
@@ -1148,23 +1133,32 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
                loff_t offset, unsigned long count)
 {
        struct file     *file;
-       __be32          err;
+       loff_t          end = LLONG_MAX;
+       __be32          err = nfserr_inval;
 
-       if ((u64)count > ~(u64)offset)
-               return nfserr_inval;
+       if (offset < 0)
+               goto out;
+       if (count != 0) {
+               end = offset + (loff_t)count - 1;
+               if (end < offset)
+                       goto out;
+       }
 
-       err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
+       err = nfsd_open(rqstp, fhp, S_IFREG,
+                       NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &file);
        if (err)
-               return err;
+               goto out;
        if (EX_ISSYNC(fhp->fh_export)) {
-               if (file->f_op && file->f_op->fsync) {
-                       err = nfserrno(nfsd_sync(file));
-               } else {
+               int err2 = vfs_fsync_range(file, offset, end, 0);
+
+               if (err2 != -EINVAL)
+                       err = nfserrno(err2);
+               else
                        err = nfserr_notsupp;
-               }
        }
 
        nfsd_close(file);
+out:
        return err;
 }
 #endif /* CONFIG_NFSD_V3 */
@@ -1317,12 +1311,14 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
                goto out_nfserr;
        }
 
-       if (EX_ISSYNC(fhp->fh_export)) {
-               err = nfserrno(nfsd_sync_dir(dentry));
-               write_inode_now(dchild->d_inode, 1);
-       }
+       err = nfsd_create_setattr(rqstp, resfhp, iap);
 
-       err2 = nfsd_create_setattr(rqstp, resfhp, iap);
+       /*
+        * nfsd_setattr already committed the child.  Transactional filesystems
+        * had a chance to commit changes for both parent and child
+        * simultaneously making the following commit_metadata a noop.
+        */
+       err2 = nfserrno(commit_metadata(fhp));
        if (err2)
                err = err2;
        mnt_drop_write(fhp->fh_export->ex_path.mnt);
@@ -1342,11 +1338,18 @@ out_nfserr:
 }
 
 #ifdef CONFIG_NFSD_V3
+
+static inline int nfsd_create_is_exclusive(int createmode)
+{
+       return createmode == NFS3_CREATE_EXCLUSIVE
+              || createmode == NFS4_CREATE_EXCLUSIVE4_1;
+}
+
 /*
- * NFSv3 version of nfsd_create
+ * NFSv3 and NFSv4 version of nfsd_create
  */
 __be32
-nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
+do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
                char *fname, int flen, struct iattr *iap,
                struct svc_fh *resfhp, int createmode, u32 *verifier,
                int *truncp, int *created)
@@ -1354,7 +1357,6 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
        struct dentry   *dentry, *dchild = NULL;
        struct inode    *dirp;
        __be32          err;
-       __be32          err2;
        int             host_err;
        __u32           v_mtime=0, v_atime=0;
 
@@ -1366,7 +1368,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
                goto out;
        if (!(iap->ia_valid & ATTR_MODE))
                iap->ia_mode = 0;
-       err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
+       err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
        if (err)
                goto out;
 
@@ -1388,11 +1390,18 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
        if (IS_ERR(dchild))
                goto out_nfserr;
 
+       /* If file doesn't exist, check for permissions to create one */
+       if (!dchild->d_inode) {
+               err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
+               if (err)
+                       goto out;
+       }
+
        err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
        if (err)
                goto out;
 
-       if (createmode == NFS3_CREATE_EXCLUSIVE) {
+       if (nfsd_create_is_exclusive(createmode)) {
                /* solaris7 gets confused (bugid 4218508) if these have
                 * the high bit set, so just clear the high bits. If this is
                 * ever changed to use different attrs for storing the
@@ -1433,6 +1442,11 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
                            && dchild->d_inode->i_atime.tv_sec == v_atime
                            && dchild->d_inode->i_size  == 0 )
                                break;
+               case NFS4_CREATE_EXCLUSIVE4_1:
+                       if (   dchild->d_inode->i_mtime.tv_sec == v_mtime
+                           && dchild->d_inode->i_atime.tv_sec == v_atime
+                           && dchild->d_inode->i_size  == 0 )
+                               goto set_attr;
                         /* fallthru */
                case NFS3_CREATE_GUARDED:
                        err = nfserr_exist;
@@ -1449,14 +1463,9 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
        if (created)
                *created = 1;
 
-       if (EX_ISSYNC(fhp->fh_export)) {
-               err = nfserrno(nfsd_sync_dir(dentry));
-               /* setattr will sync the child (or not) */
-       }
-
        nfsd_check_ignore_resizing(iap);
 
-       if (createmode == NFS3_CREATE_EXCLUSIVE) {
+       if (nfsd_create_is_exclusive(createmode)) {
                /* Cram the verifier into atime/mtime */
                iap->ia_valid = ATTR_MTIME|ATTR_ATIME
                        | ATTR_MTIME_SET|ATTR_ATIME_SET;
@@ -1468,9 +1477,13 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
        }
 
  set_attr:
-       err2 = nfsd_create_setattr(rqstp, resfhp, iap);
-       if (err2)
-               err = err2;
+       err = nfsd_create_setattr(rqstp, resfhp, iap);
+
+       /*
+        * nfsd_setattr already committed the child (and possibly also the parent).
+        */
+       if (!err)
+               err = nfserrno(commit_metadata(fhp));
 
        mnt_drop_write(fhp->fh_export->ex_path.mnt);
        /*
@@ -1585,12 +1598,9 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
                }
        } else
                host_err = vfs_symlink(dentry->d_inode, dnew, path);
-
-       if (!host_err) {
-               if (EX_ISSYNC(fhp->fh_export))
-                       host_err = nfsd_sync_dir(dentry);
-       }
        err = nfserrno(host_err);
+       if (!err)
+               err = nfserrno(commit_metadata(fhp));
        fh_unlock(fhp);
 
        mnt_drop_write(fhp->fh_export->ex_path.mnt);
@@ -1615,7 +1625,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
                                char *name, int len, struct svc_fh *tfhp)
 {
        struct dentry   *ddir, *dnew, *dold;
-       struct inode    *dirp, *dest;
+       struct inode    *dirp;
        __be32          err;
        int             host_err;
 
@@ -1643,26 +1653,32 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
                goto out_nfserr;
 
        dold = tfhp->fh_dentry;
-       dest = dold->d_inode;
 
        host_err = mnt_want_write(tfhp->fh_export->ex_path.mnt);
        if (host_err) {
                err = nfserrno(host_err);
                goto out_dput;
        }
+       err = nfserr_noent;
+       if (!dold->d_inode)
+               goto out_drop_write;
+       host_err = nfsd_break_lease(dold->d_inode);
+       if (host_err) {
+               err = nfserrno(host_err);
+               goto out_drop_write;
+       }
        host_err = vfs_link(dold, dirp, dnew);
        if (!host_err) {
-               if (EX_ISSYNC(ffhp->fh_export)) {
-                       err = nfserrno(nfsd_sync_dir(ddir));
-                       write_inode_now(dest, 1);
-               }
-               err = 0;
+               err = nfserrno(commit_metadata(ffhp));
+               if (!err)
+                       err = nfserrno(commit_metadata(tfhp));
        } else {
                if (host_err == -EXDEV && rqstp->rq_vers == 2)
                        err = nfserr_acces;
                else
                        err = nfserrno(host_err);
        }
+out_drop_write:
        mnt_drop_write(tfhp->fh_export->ex_path.mnt);
 out_dput:
        dput(dnew);
@@ -1737,13 +1753,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
        if (ndentry == trap)
                goto out_dput_new;
 
-       if (svc_msnfs(ffhp) &&
-               ((atomic_read(&odentry->d_count) > 1)
-                || (atomic_read(&ndentry->d_count) > 1))) {
-                       host_err = -EPERM;
-                       goto out_dput_new;
-       }
-
        host_err = -EXDEV;
        if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
                goto out_dput_new;
@@ -1751,15 +1760,22 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
        if (host_err)
                goto out_dput_new;
 
+       host_err = nfsd_break_lease(odentry->d_inode);
+       if (host_err)
+               goto out_drop_write;
+       if (ndentry->d_inode) {
+               host_err = nfsd_break_lease(ndentry->d_inode);
+               if (host_err)
+                       goto out_drop_write;
+       }
        host_err = vfs_rename(fdir, odentry, tdir, ndentry);
-       if (!host_err && EX_ISSYNC(tfhp->fh_export)) {
-               host_err = nfsd_sync_dir(tdentry);
+       if (!host_err) {
+               host_err = commit_metadata(tfhp);
                if (!host_err)
-                       host_err = nfsd_sync_dir(fdentry);
+                       host_err = commit_metadata(ffhp);
        }
-
+out_drop_write:
        mnt_drop_write(ffhp->fh_export->ex_path.mnt);
-
  out_dput_new:
        dput(ndentry);
  out_dput_old:
@@ -1820,29 +1836,22 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 
        host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
        if (host_err)
-               goto out_nfserr;
+               goto out_put;
 
-       if (type != S_IFDIR) { /* It's UNLINK */
-#ifdef MSNFS
-               if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
-                       (atomic_read(&rdentry->d_count) > 1)) {
-                       host_err = -EPERM;
-               } else
-#endif
+       host_err = nfsd_break_lease(rdentry->d_inode);
+       if (host_err)
+               goto out_drop_write;
+       if (type != S_IFDIR)
                host_err = vfs_unlink(dirp, rdentry);
-       } else { /* It's RMDIR */
+       else
                host_err = vfs_rmdir(dirp, rdentry);
-       }
-
+       if (!host_err)
+               host_err = commit_metadata(fhp);
+out_drop_write:
+       mnt_drop_write(fhp->fh_export->ex_path.mnt);
+out_put:
        dput(rdentry);
 
-       if (host_err)
-               goto out_drop;
-       if (EX_ISSYNC(fhp->fh_export))
-               host_err = nfsd_sync_dir(dentry);
-
-out_drop:
-       mnt_drop_write(fhp->fh_export->ex_path.mnt);
 out_nfserr:
        err = nfserrno(host_err);
 out:
@@ -2008,9 +2017,17 @@ out:
 __be32
 nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access)
 {
-       __be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
-       if (!err && vfs_statfs(fhp->fh_dentry,stat))
-               err = nfserr_io;
+       __be32 err;
+
+       err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
+       if (!err) {
+               struct path path = {
+                       .mnt    = fhp->fh_export->ex_path.mnt,
+                       .dentry = fhp->fh_dentry,
+               };
+               if (vfs_statfs(&path, stat))
+                       err = nfserr_io;
+       }
        return err;
 }
 
@@ -2027,10 +2044,9 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
                                        struct dentry *dentry, int acc)
 {
        struct inode    *inode = dentry->d_inode;
-       struct path     path;
        int             err;
 
-       if (acc == NFSD_MAY_NOP)
+       if ((acc & NFSD_MAY_MASK) == NFSD_MAY_NOP)
                return 0;
 #if 0
        dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n",
@@ -2100,17 +2116,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
        if (err == -EACCES && S_ISREG(inode->i_mode) &&
            acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE))
                err = inode_permission(inode, MAY_EXEC);
-       if (err)
-               goto nfsd_out;
 
-       /* Do integrity (permission) checking now, but defer incrementing
-        * IMA counts to the actual file open.
-        */
-       path.mnt = exp->ex_path.mnt;
-       path.dentry = dentry;
-       err = ima_path_check(&path, acc & (MAY_READ | MAY_WRITE | MAY_EXEC),
-                            IMA_COUNT_LEAVE);
-nfsd_out:
        return err? nfserrno(err) : 0;
 }