Merge git://git.linux-nfs.org/pub/linux/nfs-2.6
Linus Torvalds [Mon, 15 Oct 2007 17:46:05 +0000 (10:46 -0700)]
* git://git.linux-nfs.org/pub/linux/nfs-2.6: (131 commits)
  NFSv4: Fix a typo in nfs_inode_reclaim_delegation
  NFS: Add a boot parameter to disable 64 bit inode numbers
  NFS: nfs_refresh_inode should clear cache_validity flags on success
  NFS: Fix a connectathon regression in NFSv3 and NFSv4
  NFS: Use nfs_refresh_inode() in ops that aren't expected to change the inode
  SUNRPC: Don't call xprt_release in call refresh
  SUNRPC: Don't call xprt_release() if call_allocate fails
  SUNRPC: Fix buggy UDP transmission
  [23/37] Clean up duplicate includes in
  [2.6 patch] net/sunrpc/rpcb_clnt.c: make struct rpcb_program static
  SUNRPC: Use correct type in buffer length calculations
  SUNRPC: Fix default hostname created in rpc_create()
  nfs: add server port to rpc_pipe info file
  NFS: Get rid of some obsolete macros
  NFS: Simplify filehandle revalidation
  NFS: Ensure that nfs_link() returns a hashed dentry
  NFS: Be strict about dentry revalidation when doing exclusive create
  NFS: Don't zap the readdir caches upon error
  NFS: Remove the redundant nfs_reval_fsid()
  NFSv3: Always use directory post-op attributes in nfs3_proc_lookup
  ...

Manually fix up a trivial conflict in net/sunrpc/xprtsock.c caused by the
sock_owned_by_user() cleanup.

57 files changed:
Documentation/kernel-parameters.txt
fs/Kconfig
fs/lockd/mon.c
fs/lockd/xdr.c
fs/lockd/xdr4.c
fs/nfs/Makefile
fs/nfs/client.c
fs/nfs/delegation.c
fs/nfs/dir.c
fs/nfs/direct.c
fs/nfs/file.c
fs/nfs/inode.c
fs/nfs/internal.h
fs/nfs/nfs2xdr.c
fs/nfs/nfs3acl.c
fs/nfs/nfs3proc.c
fs/nfs/nfs3xdr.c
fs/nfs/nfs4proc.c
fs/nfs/nfs4state.c
fs/nfs/nfs4xdr.c
fs/nfs/nfsroot.c
fs/nfs/proc.c
fs/nfs/read.c
fs/nfs/super.c
fs/nfs/unlink.c
fs/nfs/write.c
fs/nfsd/nfs4xdr.c
include/linux/jiffies.h
include/linux/nfs_fs.h
include/linux/nfs_page.h
include/linux/nfs_xdr.h
include/linux/sunrpc/clnt.h
include/linux/sunrpc/debug.h
include/linux/sunrpc/msg_prot.h
include/linux/sunrpc/rpc_rdma.h [new file with mode: 0644]
include/linux/sunrpc/xdr.h
include/linux/sunrpc/xprt.h
include/linux/sunrpc/xprtrdma.h [new file with mode: 0644]
include/linux/sunrpc/xprtsock.h [new file with mode: 0644]
include/linux/writeback.h
kernel/auditsc.c
net/sunrpc/Makefile
net/sunrpc/auth_gss/gss_krb5_wrap.c
net/sunrpc/clnt.c
net/sunrpc/rpc_pipe.c
net/sunrpc/rpcb_clnt.c
net/sunrpc/sched.c
net/sunrpc/socklib.c
net/sunrpc/sunrpc_syms.c
net/sunrpc/timer.c
net/sunrpc/xprt.c
net/sunrpc/xprtrdma/Makefile [new file with mode: 0644]
net/sunrpc/xprtrdma/rpc_rdma.c [new file with mode: 0644]
net/sunrpc/xprtrdma/transport.c [new file with mode: 0644]
net/sunrpc/xprtrdma/verbs.c [new file with mode: 0644]
net/sunrpc/xprtrdma/xprt_rdma.h [new file with mode: 0644]
net/sunrpc/xprtsock.c

index c323778..fdd6dbc 100644 (file)
@@ -1083,6 +1083,13 @@ and is between 256 and 4096 characters. It is defined in the file
                        [NFS] set the maximum lifetime for idmapper cache
                        entries.
 
+       nfs.enable_ino64=
+                       [NFS] enable 64-bit inode numbers.
+                       If zero, the NFS client will fake up a 32-bit inode
+                       number for the readdir() and stat() syscalls instead
+                       of returning the full 64-bit number.
+                       The default is to return 64-bit inode numbers.
+
        nmi_watchdog=   [KNL,BUGS=X86-32] Debugging features for SMP kernels
 
        no387           [BUGS=X86-32] Tells the kernel to use the 387 maths
index bb02b39..815d201 100644 (file)
@@ -1755,6 +1755,14 @@ config SUNRPC
 config SUNRPC_GSS
        tristate
 
+config SUNRPC_XPRT_RDMA
+       tristate "RDMA transport for sunrpc (EXPERIMENTAL)"
+       depends on SUNRPC && INFINIBAND && EXPERIMENTAL
+       default m
+       help
+         Adds a client RPC transport for supporting kernel NFS over RDMA
+         mounts, including Infiniband and iWARP. Experimental.
+
 config SUNRPC_BIND34
        bool "Support for rpcbind versions 3 & 4 (EXPERIMENTAL)"
        depends on SUNRPC && EXPERIMENTAL
index 3353ed8..908b23f 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/utsname.h>
 #include <linux/kernel.h>
 #include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/xprtsock.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/sm_inter.h>
@@ -132,7 +133,7 @@ nsm_create(void)
                .sin_port       = 0,
        };
        struct rpc_create_args args = {
-               .protocol       = IPPROTO_UDP,
+               .protocol       = XPRT_TRANSPORT_UDP,
                .address        = (struct sockaddr *)&sin,
                .addrsize       = sizeof(sin),
                .servername     = "localhost",
index 5316e30..633653b 100644 (file)
@@ -62,8 +62,9 @@ static __be32 *nlm_decode_cookie(__be32 *p, struct nlm_cookie *c)
        }
        else 
        {
-               printk(KERN_NOTICE
-                       "lockd: bad cookie size %d (only cookies under %d bytes are supported.)\n", len, NLM_MAXCOOKIELEN);
+               dprintk("lockd: bad cookie size %d (only cookies under "
+                       "%d bytes are supported.)\n",
+                               len, NLM_MAXCOOKIELEN);
                return NULL;
        }
        return p;
@@ -84,8 +85,7 @@ nlm_decode_fh(__be32 *p, struct nfs_fh *f)
        unsigned int    len;
 
        if ((len = ntohl(*p++)) != NFS2_FHSIZE) {
-               printk(KERN_NOTICE
-                       "lockd: bad fhandle size %d (should be %d)\n",
+               dprintk("lockd: bad fhandle size %d (should be %d)\n",
                        len, NFS2_FHSIZE);
                return NULL;
        }
index 846fc1d..43ff939 100644 (file)
@@ -64,8 +64,9 @@ nlm4_decode_cookie(__be32 *p, struct nlm_cookie *c)
        }
        else 
        {
-               printk(KERN_NOTICE
-                       "lockd: bad cookie size %d (only cookies under %d bytes are supported.)\n", len, NLM_MAXCOOKIELEN);
+               dprintk("lockd: bad cookie size %d (only cookies under "
+                       "%d bytes are supported.)\n",
+                               len, NLM_MAXCOOKIELEN);
                return NULL;
        }
        return p;
@@ -86,8 +87,7 @@ nlm4_decode_fh(__be32 *p, struct nfs_fh *f)
        memset(f->data, 0, sizeof(f->data));
        f->size = ntohl(*p++);
        if (f->size > NFS_MAXFHSIZE) {
-               printk(KERN_NOTICE
-                       "lockd: bad fhandle size %d (should be <=%d)\n",
+               dprintk("lockd: bad fhandle size %d (should be <=%d)\n",
                        f->size, NFS_MAXFHSIZE);
                return NULL;
        }
index b55cb23..df0f41e 100644 (file)
@@ -16,4 +16,3 @@ nfs-$(CONFIG_NFS_V4)  += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
                           nfs4namespace.o
 nfs-$(CONFIG_NFS_DIRECTIO) += direct.o
 nfs-$(CONFIG_SYSCTL) += sysctl.o
-nfs-objs               := $(nfs-y)
index a204484..a532ee1 100644 (file)
@@ -23,6 +23,8 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/stats.h>
 #include <linux/sunrpc/metrics.h>
+#include <linux/sunrpc/xprtsock.h>
+#include <linux/sunrpc/xprtrdma.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_mount.h>
 #include <linux/nfs4_mount.h>
@@ -340,7 +342,8 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
                to->to_retries = 2;
 
        switch (proto) {
-       case IPPROTO_TCP:
+       case XPRT_TRANSPORT_TCP:
+       case XPRT_TRANSPORT_RDMA:
                if (!to->to_initval)
                        to->to_initval = 60 * HZ;
                if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
@@ -349,7 +352,7 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
                to->to_maxval = to->to_initval + (to->to_increment * to->to_retries);
                to->to_exponential = 0;
                break;
-       case IPPROTO_UDP:
+       case XPRT_TRANSPORT_UDP:
        default:
                if (!to->to_initval)
                        to->to_initval = 11 * HZ / 10;
@@ -501,9 +504,9 @@ static int nfs_init_server_rpcclient(struct nfs_server *server, rpc_authflavor_t
 /*
  * Initialise an NFS2 or NFS3 client
  */
-static int nfs_init_client(struct nfs_client *clp, const struct nfs_mount_data *data)
+static int nfs_init_client(struct nfs_client *clp,
+                          const struct nfs_parsed_mount_data *data)
 {
-       int proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
        int error;
 
        if (clp->cl_cons_state == NFS_CS_READY) {
@@ -522,8 +525,8 @@ static int nfs_init_client(struct nfs_client *clp, const struct nfs_mount_data *
         * Create a client RPC handle for doing FSSTAT with UNIX auth only
         * - RFC 2623, sec 2.3.2
         */
-       error = nfs_create_rpc_client(clp, proto, data->timeo, data->retrans,
-                                       RPC_AUTH_UNIX, 0);
+       error = nfs_create_rpc_client(clp, data->nfs_server.protocol,
+                               data->timeo, data->retrans, RPC_AUTH_UNIX, 0);
        if (error < 0)
                goto error;
        nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -538,7 +541,8 @@ error:
 /*
  * Create a version 2 or 3 client
  */
-static int nfs_init_server(struct nfs_server *server, const struct nfs_mount_data *data)
+static int nfs_init_server(struct nfs_server *server,
+                          const struct nfs_parsed_mount_data *data)
 {
        struct nfs_client *clp;
        int error, nfsvers = 2;
@@ -551,7 +555,8 @@ static int nfs_init_server(struct nfs_server *server, const struct nfs_mount_dat
 #endif
 
        /* Allocate or find a client reference we can use */
-       clp = nfs_get_client(data->hostname, &data->addr, nfsvers);
+       clp = nfs_get_client(data->nfs_server.hostname,
+                               &data->nfs_server.address, nfsvers);
        if (IS_ERR(clp)) {
                dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
                return PTR_ERR(clp);
@@ -581,7 +586,7 @@ static int nfs_init_server(struct nfs_server *server, const struct nfs_mount_dat
        if (error < 0)
                goto error;
 
-       error = nfs_init_server_rpcclient(server, data->pseudoflavor);
+       error = nfs_init_server_rpcclient(server, data->auth_flavors[0]);
        if (error < 0)
                goto error;
 
@@ -760,7 +765,7 @@ void nfs_free_server(struct nfs_server *server)
  * Create a version 2 or 3 volume record
  * - keyed on server and FSID
  */
-struct nfs_server *nfs_create_server(const struct nfs_mount_data *data,
+struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
                                     struct nfs_fh *mntfh)
 {
        struct nfs_server *server;
@@ -906,7 +911,7 @@ error:
  * Create a version 4 volume record
  */
 static int nfs4_init_server(struct nfs_server *server,
-               const struct nfs4_mount_data *data, rpc_authflavor_t authflavour)
+               const struct nfs_parsed_mount_data *data)
 {
        int error;
 
@@ -926,7 +931,7 @@ static int nfs4_init_server(struct nfs_server *server,
        server->acdirmin = data->acdirmin * HZ;
        server->acdirmax = data->acdirmax * HZ;
 
-       error = nfs_init_server_rpcclient(server, authflavour);
+       error = nfs_init_server_rpcclient(server, data->auth_flavors[0]);
 
        /* Done */
        dprintk("<-- nfs4_init_server() = %d\n", error);
@@ -937,12 +942,7 @@ static int nfs4_init_server(struct nfs_server *server,
  * Create a version 4 volume record
  * - keyed on server and FSID
  */
-struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *data,
-                                     const char *hostname,
-                                     const struct sockaddr_in *addr,
-                                     const char *mntpath,
-                                     const char *ip_addr,
-                                     rpc_authflavor_t authflavour,
+struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
                                      struct nfs_fh *mntfh)
 {
        struct nfs_fattr fattr;
@@ -956,13 +956,18 @@ struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *data,
                return ERR_PTR(-ENOMEM);
 
        /* Get a client record */
-       error = nfs4_set_client(server, hostname, addr, ip_addr, authflavour,
-                       data->proto, data->timeo, data->retrans);
+       error = nfs4_set_client(server,
+                       data->nfs_server.hostname,
+                       &data->nfs_server.address,
+                       data->client_address,
+                       data->auth_flavors[0],
+                       data->nfs_server.protocol,
+                       data->timeo, data->retrans);
        if (error < 0)
                goto error;
 
        /* set up the general RPC client */
-       error = nfs4_init_server(server, data, authflavour);
+       error = nfs4_init_server(server, data);
        if (error < 0)
                goto error;
 
@@ -971,7 +976,7 @@ struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *data,
        BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
 
        /* Probe the root fh to retrieve its FSID */
-       error = nfs4_path_walk(server, mntfh, mntpath);
+       error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path);
        if (error < 0)
                goto error;
 
index c55a761..af8b235 100644 (file)
@@ -52,7 +52,7 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
        for (fl = inode->i_flock; fl != 0; fl = fl->fl_next) {
                if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
                        continue;
-               if ((struct nfs_open_context *)fl->fl_file->private_data != ctx)
+               if (nfs_file_open_context(fl->fl_file) != ctx)
                        continue;
                status = nfs4_lock_delegation_recall(state, fl);
                if (status >= 0)
@@ -109,6 +109,7 @@ again:
 void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)
 {
        struct nfs_delegation *delegation = NFS_I(inode)->delegation;
+       struct rpc_cred *oldcred;
 
        if (delegation == NULL)
                return;
@@ -116,11 +117,12 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, st
                        sizeof(delegation->stateid.data));
        delegation->type = res->delegation_type;
        delegation->maxsize = res->maxsize;
-       put_rpccred(cred);
+       oldcred = delegation->cred;
        delegation->cred = get_rpccred(cred);
        delegation->flags &= ~NFS_DELEGATION_NEED_RECLAIM;
        NFS_I(inode)->delegation_state = delegation->type;
        smp_wmb();
+       put_rpccred(oldcred);
 }
 
 /*
index e4a04d1..8ec7fbd 100644 (file)
@@ -200,9 +200,6 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
        desc->timestamp = timestamp;
        desc->timestamp_valid = 1;
        SetPageUptodate(page);
-       spin_lock(&inode->i_lock);
-       NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
-       spin_unlock(&inode->i_lock);
        /* Ensure consistent page alignment of the data.
         * Note: assumes we have exclusive access to this mapping either
         *       through inode->i_mutex or some other mechanism.
@@ -214,9 +211,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
        unlock_page(page);
        return 0;
  error:
-       SetPageError(page);
        unlock_page(page);
-       nfs_zap_caches(inode);
        desc->error = error;
        return -EIO;
 }
@@ -407,7 +402,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
        struct file     *file = desc->file;
        struct nfs_entry *entry = desc->entry;
        struct dentry   *dentry = NULL;
-       unsigned long   fileid;
+       u64             fileid;
        int             loop_count = 0,
                        res;
 
@@ -418,7 +413,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
                unsigned d_type = DT_UNKNOWN;
                /* Note: entry->prev_cookie contains the cookie for
                 *       retrieving the current dirent on the server */
-               fileid = nfs_fileid_to_ino_t(entry->ino);
+               fileid = entry->ino;
 
                /* Get a dentry if we have one */
                if (dentry != NULL)
@@ -428,11 +423,12 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
                /* Use readdirplus info */
                if (dentry != NULL && dentry->d_inode != NULL) {
                        d_type = dt_type(dentry->d_inode);
-                       fileid = dentry->d_inode->i_ino;
+                       fileid = NFS_FILEID(dentry->d_inode);
                }
 
                res = filldir(dirent, entry->name, entry->len, 
-                             file->f_pos, fileid, d_type);
+                             file->f_pos, nfs_compat_user_ino64(fileid),
+                             d_type);
                if (res < 0)
                        break;
                file->f_pos++;
@@ -490,9 +486,6 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
                                                page,
                                                NFS_SERVER(inode)->dtsize,
                                                desc->plus);
-       spin_lock(&inode->i_lock);
-       NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
-       spin_unlock(&inode->i_lock);
        desc->page = page;
        desc->ptr = kmap(page);         /* matching kunmap in nfs_do_filldir */
        if (desc->error >= 0) {
@@ -558,7 +551,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
        memset(desc, 0, sizeof(*desc));
 
        desc->file = filp;
-       desc->dir_cookie = &((struct nfs_open_context *)filp->private_data)->dir_cookie;
+       desc->dir_cookie = &nfs_file_open_context(filp)->dir_cookie;
        desc->decode = NFS_PROTO(inode)->decode_dirent;
        desc->plus = NFS_USE_READDIRPLUS(inode);
 
@@ -623,7 +616,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
        }
        if (offset != filp->f_pos) {
                filp->f_pos = offset;
-               ((struct nfs_open_context *)filp->private_data)->dir_cookie = 0;
+               nfs_file_open_context(filp)->dir_cookie = 0;
        }
 out:
        mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex);
@@ -650,36 +643,18 @@ static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync)
  */
 static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
 {
-       unsigned long verf;
-
        if (IS_ROOT(dentry))
                return 1;
-       verf = dentry->d_time;
-       if (nfs_caches_unstable(dir)
-                       || verf != NFS_I(dir)->cache_change_attribute)
+       if (!nfs_verify_change_attribute(dir, dentry->d_time))
+               return 0;
+       /* Revalidate nfsi->cache_change_attribute before we declare a match */
+       if (nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0)
+               return 0;
+       if (!nfs_verify_change_attribute(dir, dentry->d_time))
                return 0;
        return 1;
 }
 
-static inline void nfs_set_verifier(struct dentry * dentry, unsigned long verf)
-{
-       dentry->d_time = verf;
-}
-
-static void nfs_refresh_verifier(struct dentry * dentry, unsigned long verf)
-{
-       nfs_set_verifier(dentry, verf);
-}
-
-/*
- * Whenever an NFS operation succeeds, we know that the dentry
- * is valid, so we update the revalidation timestamp.
- */
-static inline void nfs_renew_times(struct dentry * dentry)
-{
-       dentry->d_time = jiffies;
-}
-
 /*
  * Return the intent data that applies to this particular path component
  *
@@ -695,6 +670,19 @@ static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd, unsigne
 }
 
 /*
+ * Use intent information to check whether or not we're going to do
+ * an O_EXCL create using this path component.
+ */
+static int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd)
+{
+       if (NFS_PROTO(dir)->version == 2)
+               return 0;
+       if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_CREATE) == 0)
+               return 0;
+       return (nd->intent.open.flags & O_EXCL) != 0;
+}
+
+/*
  * Inode and filehandle revalidation for lookups.
  *
  * We force revalidation in the cases where the VFS sets LOOKUP_REVAL,
@@ -717,6 +705,7 @@ int nfs_lookup_verify_inode(struct inode *inode, struct nameidata *nd)
                                (S_ISREG(inode->i_mode) ||
                                 S_ISDIR(inode->i_mode)))
                        goto out_force;
+               return 0;
        }
        return nfs_revalidate_inode(server, inode);
 out_force:
@@ -759,7 +748,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
        int error;
        struct nfs_fh fhandle;
        struct nfs_fattr fattr;
-       unsigned long verifier;
 
        parent = dget_parent(dentry);
        lock_kernel();
@@ -767,10 +755,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
        nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
        inode = dentry->d_inode;
 
-       /* Revalidate parent directory attribute cache */
-       if (nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0)
-               goto out_zap_parent;
-
        if (!inode) {
                if (nfs_neg_need_reval(dir, dentry, nd))
                        goto out_bad;
@@ -785,7 +769,7 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
        }
 
        /* Force a full look up iff the parent directory has changed */
-       if (nfs_check_verifier(dir, dentry)) {
+       if (!nfs_is_exclusive_create(dir, nd) && nfs_check_verifier(dir, dentry)) {
                if (nfs_lookup_verify_inode(inode, nd))
                        goto out_zap_parent;
                goto out_valid;
@@ -794,7 +778,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
        if (NFS_STALE(inode))
                goto out_bad;
 
-       verifier = nfs_save_change_attribute(dir);
        error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr);
        if (error)
                goto out_bad;
@@ -803,8 +786,7 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
        if ((error = nfs_refresh_inode(inode, &fattr)) != 0)
                goto out_bad;
 
-       nfs_renew_times(dentry);
-       nfs_refresh_verifier(dentry, verifier);
+       nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
  out_valid:
        unlock_kernel();
        dput(parent);
@@ -815,7 +797,7 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
 out_zap_parent:
        nfs_zap_caches(dir);
  out_bad:
-       NFS_CACHEINV(dir);
+       nfs_mark_for_revalidate(dir);
        if (inode && S_ISDIR(inode->i_mode)) {
                /* Purge readdir caches. */
                nfs_zap_caches(inode);
@@ -872,8 +854,6 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
                nfs_complete_unlink(dentry, inode);
                unlock_kernel();
        }
-       /* When creating a negative dentry, we want to renew d_time */
-       nfs_renew_times(dentry);
        iput(inode);
 }
 
@@ -883,30 +863,6 @@ struct dentry_operations nfs_dentry_operations = {
        .d_iput         = nfs_dentry_iput,
 };
 
-/*
- * Use intent information to check whether or not we're going to do
- * an O_EXCL create using this path component.
- */
-static inline
-int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd)
-{
-       if (NFS_PROTO(dir)->version == 2)
-               return 0;
-       if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_CREATE) == 0)
-               return 0;
-       return (nd->intent.open.flags & O_EXCL) != 0;
-}
-
-static inline int nfs_reval_fsid(struct inode *dir, const struct nfs_fattr *fattr)
-{
-       struct nfs_server *server = NFS_SERVER(dir);
-
-       if (!nfs_fsid_equal(&server->fsid, &fattr->fsid))
-               /* Revalidate fsid using the parent directory */
-               return __nfs_revalidate_inode(server, dir);
-       return 0;
-}
-
 static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
 {
        struct dentry *res;
@@ -945,11 +901,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
                res = ERR_PTR(error);
                goto out_unlock;
        }
-       error = nfs_reval_fsid(dir, &fattr);
-       if (error < 0) {
-               res = ERR_PTR(error);
-               goto out_unlock;
-       }
        inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr);
        res = (struct dentry *)inode;
        if (IS_ERR(res))
@@ -958,17 +909,10 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
 no_entry:
        res = d_materialise_unique(dentry, inode);
        if (res != NULL) {
-               struct dentry *parent;
                if (IS_ERR(res))
                        goto out_unlock;
-               /* Was a directory renamed! */
-               parent = dget_parent(res);
-               if (!IS_ROOT(parent))
-                       nfs_mark_for_revalidate(parent->d_inode);
-               dput(parent);
                dentry = res;
        }
-       nfs_renew_times(dentry);
        nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 out_unlock:
        unlock_kernel();
@@ -1020,28 +964,16 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
        }
        dentry->d_op = NFS_PROTO(dir)->dentry_ops;
 
-       /* Let vfs_create() deal with O_EXCL */
+       /* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash
+        * the dentry. */
        if (nd->intent.open.flags & O_EXCL) {
-               d_add(dentry, NULL);
+               d_instantiate(dentry, NULL);
                goto out;
        }
 
        /* Open the file on the server */
        lock_kernel();
-       /* Revalidate parent directory attribute cache */
-       error = nfs_revalidate_inode(NFS_SERVER(dir), dir);
-       if (error < 0) {
-               res = ERR_PTR(error);
-               unlock_kernel();
-               goto out;
-       }
-
-       if (nd->intent.open.flags & O_CREAT) {
-               nfs_begin_data_update(dir);
-               res = nfs4_atomic_open(dir, dentry, nd);
-               nfs_end_data_update(dir);
-       } else
-               res = nfs4_atomic_open(dir, dentry, nd);
+       res = nfs4_atomic_open(dir, dentry, nd);
        unlock_kernel();
        if (IS_ERR(res)) {
                error = PTR_ERR(res);
@@ -1063,8 +995,6 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
                }
        } else if (res != NULL)
                dentry = res;
-       nfs_renew_times(dentry);
-       nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 out:
        return res;
 no_open:
@@ -1076,7 +1006,6 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
        struct dentry *parent = NULL;
        struct inode *inode = dentry->d_inode;
        struct inode *dir;
-       unsigned long verifier;
        int openflags, ret = 0;
 
        parent = dget_parent(dentry);
@@ -1086,8 +1015,12 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
        /* We can't create new files in nfs_open_revalidate(), so we
         * optimize away revalidation of negative dentries.
         */
-       if (inode == NULL)
+       if (inode == NULL) {
+               if (!nfs_neg_need_reval(dir, dentry, nd))
+                       ret = 1;
                goto out;
+       }
+
        /* NFS only supports OPEN on regular files */
        if (!S_ISREG(inode->i_mode))
                goto no_open;
@@ -1104,10 +1037,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
         * change attribute *before* we do the RPC call.
         */
        lock_kernel();
-       verifier = nfs_save_change_attribute(dir);
        ret = nfs4_open_revalidate(dir, dentry, openflags, nd);
-       if (!ret)
-               nfs_refresh_verifier(dentry, verifier);
        unlock_kernel();
 out:
        dput(parent);
@@ -1133,6 +1063,7 @@ static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc)
                .len = entry->len,
        };
        struct inode *inode;
+       unsigned long verf = nfs_save_change_attribute(dir);
 
        switch (name.len) {
                case 2:
@@ -1143,6 +1074,14 @@ static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc)
                        if (name.name[0] == '.')
                                return dget(parent);
        }
+
+       spin_lock(&dir->i_lock);
+       if (NFS_I(dir)->cache_validity & NFS_INO_INVALID_DATA) {
+               spin_unlock(&dir->i_lock);
+               return NULL;
+       }
+       spin_unlock(&dir->i_lock);
+
        name.hash = full_name_hash(name.name, name.len);
        dentry = d_lookup(parent, &name);
        if (dentry != NULL) {
@@ -1183,12 +1122,8 @@ static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc)
                dentry = alias;
        }
 
-       nfs_renew_times(dentry);
-       nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-       return dentry;
 out_renew:
-       nfs_renew_times(dentry);
-       nfs_refresh_verifier(dentry, nfs_save_change_attribute(dir));
+       nfs_set_verifier(dentry, verf);
        return dentry;
 }
 
@@ -1198,32 +1133,40 @@ out_renew:
 int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
                                struct nfs_fattr *fattr)
 {
+       struct dentry *parent = dget_parent(dentry);
+       struct inode *dir = parent->d_inode;
        struct inode *inode;
        int error = -EACCES;
 
+       d_drop(dentry);
+
        /* We may have been initialized further down */
        if (dentry->d_inode)
-               return 0;
+               goto out;
        if (fhandle->size == 0) {
-               struct inode *dir = dentry->d_parent->d_inode;
                error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
                if (error)
-                       return error;
+                       goto out_error;
        }
+       nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
        if (!(fattr->valid & NFS_ATTR_FATTR)) {
                struct nfs_server *server = NFS_SB(dentry->d_sb);
                error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr);
                if (error < 0)
-                       return error;
+                       goto out_error;
        }
        inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
        error = PTR_ERR(inode);
        if (IS_ERR(inode))
-               return error;
-       d_instantiate(dentry, inode);
-       if (d_unhashed(dentry))
-               d_rehash(dentry);
+               goto out_error;
+       d_add(dentry, inode);
+out:
+       dput(parent);
        return 0;
+out_error:
+       nfs_mark_for_revalidate(dir);
+       dput(parent);
+       return error;
 }
 
 /*
@@ -1249,13 +1192,9 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
                open_flags = nd->intent.open.flags;
 
        lock_kernel();
-       nfs_begin_data_update(dir);
        error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd);
-       nfs_end_data_update(dir);
        if (error != 0)
                goto out_err;
-       nfs_renew_times(dentry);
-       nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
        unlock_kernel();
        return 0;
 out_err:
@@ -1283,13 +1222,9 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
        attr.ia_valid = ATTR_MODE;
 
        lock_kernel();
-       nfs_begin_data_update(dir);
        status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev);
-       nfs_end_data_update(dir);
        if (status != 0)
                goto out_err;
-       nfs_renew_times(dentry);
-       nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
        unlock_kernel();
        return 0;
 out_err:
@@ -1313,13 +1248,9 @@ static int nfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        attr.ia_mode = mode | S_IFDIR;
 
        lock_kernel();
-       nfs_begin_data_update(dir);
        error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr);
-       nfs_end_data_update(dir);
        if (error != 0)
                goto out_err;
-       nfs_renew_times(dentry);
-       nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
        unlock_kernel();
        return 0;
 out_err:
@@ -1336,12 +1267,10 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
                        dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
 
        lock_kernel();
-       nfs_begin_data_update(dir);
        error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
        /* Ensure the VFS deletes this inode */
        if (error == 0 && dentry->d_inode != NULL)
                clear_nlink(dentry->d_inode);
-       nfs_end_data_update(dir);
        unlock_kernel();
 
        return error;
@@ -1350,9 +1279,9 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
 static int nfs_sillyrename(struct inode *dir, struct dentry *dentry)
 {
        static unsigned int sillycounter;
-       const int      i_inosize  = sizeof(dir->i_ino)*2;
+       const int      fileidsize  = sizeof(NFS_FILEID(dentry->d_inode))*2;
        const int      countersize = sizeof(sillycounter)*2;
-       const int      slen       = sizeof(".nfs") + i_inosize + countersize - 1;
+       const int      slen        = sizeof(".nfs")+fileidsize+countersize-1;
        char           silly[slen+1];
        struct qstr    qsilly;
        struct dentry *sdentry;
@@ -1370,8 +1299,9 @@ static int nfs_sillyrename(struct inode *dir, struct dentry *dentry)
        if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
                goto out;
 
-       sprintf(silly, ".nfs%*.*lx",
-               i_inosize, i_inosize, dentry->d_inode->i_ino);
+       sprintf(silly, ".nfs%*.*Lx",
+               fileidsize, fileidsize,
+               (unsigned long long)NFS_FILEID(dentry->d_inode));
 
        /* Return delegation in anticipation of the rename */
        nfs_inode_return_delegation(dentry->d_inode);
@@ -1398,19 +1328,14 @@ static int nfs_sillyrename(struct inode *dir, struct dentry *dentry)
 
        qsilly.name = silly;
        qsilly.len  = strlen(silly);
-       nfs_begin_data_update(dir);
        if (dentry->d_inode) {
-               nfs_begin_data_update(dentry->d_inode);
                error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
                                dir, &qsilly);
                nfs_mark_for_revalidate(dentry->d_inode);
-               nfs_end_data_update(dentry->d_inode);
        } else
                error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
                                dir, &qsilly);
-       nfs_end_data_update(dir);
        if (!error) {
-               nfs_renew_times(dentry);
                nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
                d_move(dentry, sdentry);
                error = nfs_async_unlink(dir, dentry);
@@ -1443,19 +1368,15 @@ static int nfs_safe_remove(struct dentry *dentry)
                goto out;
        }
 
-       nfs_begin_data_update(dir);
        if (inode != NULL) {
                nfs_inode_return_delegation(inode);
-               nfs_begin_data_update(inode);
                error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
                /* The VFS may want to delete this inode */
                if (error == 0)
                        drop_nlink(inode);
                nfs_mark_for_revalidate(inode);
-               nfs_end_data_update(inode);
        } else
                error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
-       nfs_end_data_update(dir);
 out:
        return error;
 }
@@ -1493,7 +1414,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
        spin_unlock(&dcache_lock);
        error = nfs_safe_remove(dentry);
        if (!error) {
-               nfs_renew_times(dentry);
                nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
        } else if (need_rehash)
                d_rehash(dentry);
@@ -1548,9 +1468,7 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
                memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen);
        kunmap_atomic(kaddr, KM_USER0);
 
-       nfs_begin_data_update(dir);
        error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);
-       nfs_end_data_update(dir);
        if (error != 0) {
                dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s) error %d\n",
                        dir->i_sb->s_id, dir->i_ino,
@@ -1590,15 +1508,12 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
                dentry->d_parent->d_name.name, dentry->d_name.name);
 
        lock_kernel();
-       nfs_begin_data_update(dir);
-       nfs_begin_data_update(inode);
+       d_drop(dentry);
        error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
        if (error == 0) {
                atomic_inc(&inode->i_count);
-               d_instantiate(dentry, inode);
+               d_add(dentry, inode);
        }
-       nfs_end_data_update(inode);
-       nfs_end_data_update(dir);
        unlock_kernel();
        return error;
 }
@@ -1701,22 +1616,16 @@ go_ahead:
                d_delete(new_dentry);
        }
 
-       nfs_begin_data_update(old_dir);
-       nfs_begin_data_update(new_dir);
-       nfs_begin_data_update(old_inode);
        error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name,
                                           new_dir, &new_dentry->d_name);
        nfs_mark_for_revalidate(old_inode);
-       nfs_end_data_update(old_inode);
-       nfs_end_data_update(new_dir);
-       nfs_end_data_update(old_dir);
 out:
        if (rehash)
                d_rehash(rehash);
        if (!error) {
                d_move(old_dentry, new_dentry);
-               nfs_renew_times(new_dentry);
-               nfs_refresh_verifier(new_dentry, nfs_save_change_attribute(new_dir));
+               nfs_set_verifier(new_dentry,
+                                       nfs_save_change_attribute(new_dir));
        }
 
        /* new dentry created? */
@@ -1842,7 +1751,7 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, st
        return NULL;
 }
 
-int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
+static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
 {
        struct nfs_inode *nfsi = NFS_I(inode);
        struct nfs_access_entry *cache;
@@ -1854,7 +1763,7 @@ int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs
        cache = nfs_access_search_rbtree(inode, cred);
        if (cache == NULL)
                goto out;
-       if (time_after(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode)))
+       if (!time_in_range(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
                goto out_stale;
        res->jiffies = cache->jiffies;
        res->cred = cache->cred;
@@ -1909,7 +1818,7 @@ found:
        nfs_access_free_entry(entry);
 }
 
-void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
+static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
 {
        struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL);
        if (cache == NULL)
@@ -1957,6 +1866,24 @@ out:
        return -EACCES;
 }
 
+static int nfs_open_permission_mask(int openflags)
+{
+       int mask = 0;
+
+       if (openflags & FMODE_READ)
+               mask |= MAY_READ;
+       if (openflags & FMODE_WRITE)
+               mask |= MAY_WRITE;
+       if (openflags & FMODE_EXEC)
+               mask |= MAY_EXEC;
+       return mask;
+}
+
+int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags)
+{
+       return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags));
+}
+
 int nfs_permission(struct inode *inode, int mask, struct nameidata *nd)
 {
        struct rpc_cred *cred;
index fcf4d38..32fe972 100644 (file)
@@ -368,7 +368,7 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
                return -ENOMEM;
 
        dreq->inode = inode;
-       dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
+       dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
        if (!is_sync_kiocb(iocb))
                dreq->iocb = iocb;
 
@@ -510,7 +510,6 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
                        nfs_direct_write_reschedule(dreq);
                        break;
                default:
-                       nfs_end_data_update(inode);
                        if (dreq->commit_data != NULL)
                                nfs_commit_free(dreq->commit_data);
                        nfs_direct_free_writedata(dreq);
@@ -533,7 +532,6 @@ static inline void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
 
 static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
 {
-       nfs_end_data_update(inode);
        nfs_direct_free_writedata(dreq);
        nfs_zap_mapping(inode, inode->i_mapping);
        nfs_direct_complete(dreq);
@@ -718,14 +716,12 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
                sync = FLUSH_STABLE;
 
        dreq->inode = inode;
-       dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
+       dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
        if (!is_sync_kiocb(iocb))
                dreq->iocb = iocb;
 
        nfs_add_stats(inode, NFSIOS_DIRECTWRITTENBYTES, count);
 
-       nfs_begin_data_update(inode);
-
        rpc_clnt_sigmask(clnt, &oldset);
        result = nfs_direct_write_schedule(dreq, user_addr, count, pos, sync);
        if (!result)
index 579cf8a..c664bb9 100644 (file)
@@ -33,6 +33,7 @@
 #include <asm/system.h>
 
 #include "delegation.h"
+#include "internal.h"
 #include "iostat.h"
 
 #define NFSDBG_FACILITY                NFSDBG_FILE
@@ -55,6 +56,8 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
 static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
 static int nfs_setlease(struct file *file, long arg, struct file_lock **fl);
 
+static struct vm_operations_struct nfs_file_vm_ops;
+
 const struct file_operations nfs_file_operations = {
        .llseek         = nfs_file_llseek,
        .read           = do_sync_read,
@@ -174,13 +177,38 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
 }
 
 /*
+ * Helper for nfs_file_flush() and nfs_fsync()
+ *
+ * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to
+ * disk, but it retrieves and clears ctx->error after synching, despite
+ * the two being set at the same time in nfs_context_set_write_error().
+ * This is because the former is used to notify the _next_ call to
+ * nfs_file_write() that a write error occurred, and hence cause it to
+ * fall back to doing a synchronous write.
+ */
+static int nfs_do_fsync(struct nfs_open_context *ctx, struct inode *inode)
+{
+       int have_error, status;
+       int ret = 0;
+
+       have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
+       status = nfs_wb_all(inode);
+       have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
+       if (have_error)
+               ret = xchg(&ctx->error, 0);
+       if (!ret)
+               ret = status;
+       return ret;
+}
+
+/*
  * Flush all dirty pages, and check for write errors.
  *
  */
 static int
 nfs_file_flush(struct file *file, fl_owner_t id)
 {
-       struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
+       struct nfs_open_context *ctx = nfs_file_open_context(file);
        struct inode    *inode = file->f_path.dentry->d_inode;
        int             status;
 
@@ -189,16 +217,11 @@ nfs_file_flush(struct file *file, fl_owner_t id)
        if ((file->f_mode & FMODE_WRITE) == 0)
                return 0;
        nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
-       lock_kernel();
+
        /* Ensure that data+attribute caches are up to date after close() */
-       status = nfs_wb_all(inode);
-       if (!status) {
-               status = ctx->error;
-               ctx->error = 0;
-               if (!status)
-                       nfs_revalidate_inode(NFS_SERVER(inode), inode);
-       }
-       unlock_kernel();
+       status = nfs_do_fsync(ctx, inode);
+       if (!status)
+               nfs_revalidate_inode(NFS_SERVER(inode), inode);
        return status;
 }
 
@@ -257,8 +280,11 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
                dentry->d_parent->d_name.name, dentry->d_name.name);
 
        status = nfs_revalidate_mapping(inode, file->f_mapping);
-       if (!status)
-               status = generic_file_mmap(file, vma);
+       if (!status) {
+               vma->vm_ops = &nfs_file_vm_ops;
+               vma->vm_flags |= VM_CAN_NONLINEAR;
+               file_accessed(file);
+       }
        return status;
 }
 
@@ -270,21 +296,13 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
 static int
 nfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 {
-       struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
+       struct nfs_open_context *ctx = nfs_file_open_context(file);
        struct inode *inode = dentry->d_inode;
-       int status;
 
        dfprintk(VFS, "nfs: fsync(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino);
 
        nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
-       lock_kernel();
-       status = nfs_wb_all(inode);
-       if (!status) {
-               status = ctx->error;
-               ctx->error = 0;
-       }
-       unlock_kernel();
-       return status;
+       return nfs_do_fsync(ctx, inode);
 }
 
 /*
@@ -333,7 +351,7 @@ static int nfs_launder_page(struct page *page)
 const struct address_space_operations nfs_file_aops = {
        .readpage = nfs_readpage,
        .readpages = nfs_readpages,
-       .set_page_dirty = nfs_set_page_dirty,
+       .set_page_dirty = __set_page_dirty_nobuffers,
        .writepage = nfs_writepage,
        .writepages = nfs_writepages,
        .prepare_write = nfs_prepare_write,
@@ -346,6 +364,43 @@ const struct address_space_operations nfs_file_aops = {
        .launder_page = nfs_launder_page,
 };
 
+static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+       struct file *filp = vma->vm_file;
+       unsigned pagelen;
+       int ret = -EINVAL;
+
+       lock_page(page);
+       if (page->mapping != vma->vm_file->f_path.dentry->d_inode->i_mapping)
+               goto out_unlock;
+       pagelen = nfs_page_length(page);
+       if (pagelen == 0)
+               goto out_unlock;
+       ret = nfs_prepare_write(filp, page, 0, pagelen);
+       if (!ret)
+               ret = nfs_commit_write(filp, page, 0, pagelen);
+out_unlock:
+       unlock_page(page);
+       return ret;
+}
+
+static struct vm_operations_struct nfs_file_vm_ops = {
+       .fault = filemap_fault,
+       .page_mkwrite = nfs_vm_page_mkwrite,
+};
+
+static int nfs_need_sync_write(struct file *filp, struct inode *inode)
+{
+       struct nfs_open_context *ctx;
+
+       if (IS_SYNC(inode) || (filp->f_flags & O_SYNC))
+               return 1;
+       ctx = nfs_file_open_context(filp);
+       if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags))
+               return 1;
+       return 0;
+}
+
 static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
                                unsigned long nr_segs, loff_t pos)
 {
@@ -382,8 +437,8 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
        nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
        result = generic_file_aio_write(iocb, iov, nr_segs, pos);
        /* Return error values for O_SYNC and IS_SYNC() */
-       if (result >= 0 && (IS_SYNC(inode) || (iocb->ki_filp->f_flags & O_SYNC))) {
-               int err = nfs_fsync(iocb->ki_filp, dentry, 1);
+       if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
+               int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode);
                if (err < 0)
                        result = err;
        }
index 71a49c3..035c769 100644 (file)
 
 #define NFSDBG_FACILITY                NFSDBG_VFS
 
+#define NFS_64_BIT_INODE_NUMBERS_ENABLED       1
+
+/* Default is to see 64-bit inode numbers */
+static int enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED;
+
 static void nfs_invalidate_inode(struct inode *);
 static int nfs_update_inode(struct inode *, struct nfs_fattr *);
 
@@ -62,6 +67,25 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
        return nfs_fileid_to_ino_t(fattr->fileid);
 }
 
+/**
+ * nfs_compat_user_ino64 - returns the user-visible inode number
+ * @fileid: 64-bit fileid
+ *
+ * This function returns a 32-bit inode number if the boot parameter
+ * nfs.enable_ino64 is zero.
+ */
+u64 nfs_compat_user_ino64(u64 fileid)
+{
+       int ino;
+
+       if (enable_ino64)
+               return fileid;
+       ino = fileid;
+       if (sizeof(ino) < sizeof(fileid))
+               ino ^= fileid >> (sizeof(fileid)-sizeof(ino)) * 8;
+       return ino;
+}
+
 int nfs_write_inode(struct inode *inode, int sync)
 {
        int ret;
@@ -85,7 +109,6 @@ void nfs_clear_inode(struct inode *inode)
         */
        BUG_ON(nfs_have_writebacks(inode));
        BUG_ON(!list_empty(&NFS_I(inode)->open_files));
-       BUG_ON(atomic_read(&NFS_I(inode)->data_updates) != 0);
        nfs_zap_acl_cache(inode);
        nfs_access_zap_cache(inode);
 }
@@ -118,8 +141,8 @@ static void nfs_zap_caches_locked(struct inode *inode)
 
        nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
 
-       NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode);
-       NFS_ATTRTIMEO_UPDATE(inode) = jiffies;
+       nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
+       nfsi->attrtimeo_timestamp = jiffies;
 
        memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode)));
        if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
@@ -156,6 +179,13 @@ static void nfs_zap_acl_cache(struct inode *inode)
        spin_unlock(&inode->i_lock);
 }
 
+void nfs_invalidate_atime(struct inode *inode)
+{
+       spin_lock(&inode->i_lock);
+       NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
+       spin_unlock(&inode->i_lock);
+}
+
 /*
  * Invalidate, but do not unhash, the inode.
  * NB: must be called with inode->i_lock held!
@@ -338,7 +368,6 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
                return 0;
 
        lock_kernel();
-       nfs_begin_data_update(inode);
        /* Write all dirty data */
        if (S_ISREG(inode->i_mode)) {
                filemap_write_and_wait(inode->i_mapping);
@@ -352,7 +381,6 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
        error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr);
        if (error == 0)
                nfs_refresh_inode(inode, &fattr);
-       nfs_end_data_update(inode);
        unlock_kernel();
        return error;
 }
@@ -431,7 +459,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 
        /* Flush out writes to the server in order to update c/mtime */
        if (S_ISREG(inode->i_mode))
-               nfs_sync_mapping_range(inode->i_mapping, 0, 0, FLUSH_NOCOMMIT);
+               nfs_wb_nocommit(inode);
 
        /*
         * We may force a getattr if the user cares about atime.
@@ -450,8 +478,10 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
                err = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
        else
                err = nfs_revalidate_inode(NFS_SERVER(inode), inode);
-       if (!err)
+       if (!err) {
                generic_fillattr(inode, stat);
+               stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
+       }
        return err;
 }
 
@@ -536,7 +566,7 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
 static void nfs_file_clear_open_context(struct file *filp)
 {
        struct inode *inode = filp->f_path.dentry->d_inode;
-       struct nfs_open_context *ctx = (struct nfs_open_context *)filp->private_data;
+       struct nfs_open_context *ctx = nfs_file_open_context(filp);
 
        if (ctx) {
                filp->private_data = NULL;
@@ -598,16 +628,10 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
        status = nfs_wait_on_inode(inode);
        if (status < 0)
                goto out;
-       if (NFS_STALE(inode)) {
-               status = -ESTALE;
-               /* Do we trust the cached ESTALE? */
-               if (NFS_ATTRTIMEO(inode) != 0) {
-                       if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME)) {
-                               /* no */
-                       } else
-                               goto out;
-               }
-       }
+
+       status = -ESTALE;
+       if (NFS_STALE(inode))
+               goto out;
 
        status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr);
        if (status != 0) {
@@ -654,7 +678,7 @@ int nfs_attribute_timeout(struct inode *inode)
 
        if (nfs_have_delegation(inode, FMODE_READ))
                return 0;
-       return time_after(jiffies, nfsi->read_cache_jiffies+nfsi->attrtimeo);
+       return !time_in_range(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
 }
 
 /**
@@ -683,11 +707,8 @@ static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_spa
        }
        spin_lock(&inode->i_lock);
        nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
-       if (S_ISDIR(inode->i_mode)) {
+       if (S_ISDIR(inode->i_mode))
                memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
-               /* This ensures we revalidate child dentries */
-               nfsi->cache_change_attribute = jiffies;
-       }
        spin_unlock(&inode->i_lock);
        nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
        dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
@@ -756,56 +777,27 @@ out:
        return ret;
 }
 
-/**
- * nfs_begin_data_update
- * @inode - pointer to inode
- * Declare that a set of operations will update file data on the server
- */
-void nfs_begin_data_update(struct inode *inode)
-{
-       atomic_inc(&NFS_I(inode)->data_updates);
-}
-
-/**
- * nfs_end_data_update
- * @inode - pointer to inode
- * Declare end of the operations that will update file data
- * This will mark the inode as immediately needing revalidation
- * of its attribute cache.
- */
-void nfs_end_data_update(struct inode *inode)
-{
-       struct nfs_inode *nfsi = NFS_I(inode);
-
-       /* Directories: invalidate page cache */
-       if (S_ISDIR(inode->i_mode)) {
-               spin_lock(&inode->i_lock);
-               nfsi->cache_validity |= NFS_INO_INVALID_DATA;
-               spin_unlock(&inode->i_lock);
-       }
-       nfsi->cache_change_attribute = jiffies;
-       atomic_dec(&nfsi->data_updates);
-}
-
 static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
        struct nfs_inode *nfsi = NFS_I(inode);
-       unsigned long now = jiffies;
 
+       if ((fattr->valid & NFS_ATTR_WCC_V4) != 0 &&
+                       nfsi->change_attr == fattr->pre_change_attr) {
+               nfsi->change_attr = fattr->change_attr;
+               if (S_ISDIR(inode->i_mode))
+                       nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+       }
        /* If we have atomic WCC data, we may update some attributes */
        if ((fattr->valid & NFS_ATTR_WCC) != 0) {
-               if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) {
+               if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime))
                        memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
-                       nfsi->cache_change_attribute = now;
-               }
                if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
                        memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
-                       nfsi->cache_change_attribute = now;
+                       if (S_ISDIR(inode->i_mode))
+                               nfsi->cache_validity |= NFS_INO_INVALID_DATA;
                }
-               if (inode->i_size == fattr->pre_size && nfsi->npages == 0) {
+               if (inode->i_size == fattr->pre_size && nfsi->npages == 0)
                        inode->i_size = fattr->size;
-                       nfsi->cache_change_attribute = now;
-               }
        }
 }
 
@@ -822,7 +814,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 {
        struct nfs_inode *nfsi = NFS_I(inode);
        loff_t cur_size, new_isize;
-       int data_unstable;
+       unsigned long invalid = 0;
 
 
        /* Has the inode gone and changed behind our back? */
@@ -831,37 +823,41 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
                return -EIO;
        }
 
-       /* Are we in the process of updating data on the server? */
-       data_unstable = nfs_caches_unstable(inode);
-
        /* Do atomic weak cache consistency updates */
        nfs_wcc_update_inode(inode, fattr);
 
        if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
                        nfsi->change_attr != fattr->change_attr)
-               nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+               invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
 
        /* Verify a few of the more important attributes */
        if (!timespec_equal(&inode->i_mtime, &fattr->mtime))
-               nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+               invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
 
        cur_size = i_size_read(inode);
        new_isize = nfs_size_to_loff_t(fattr->size);
        if (cur_size != new_isize && nfsi->npages == 0)
-               nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+               invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
 
        /* Have any file permissions changed? */
        if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)
                        || inode->i_uid != fattr->uid
                        || inode->i_gid != fattr->gid)
-               nfsi->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
+               invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
 
        /* Has the link count changed? */
        if (inode->i_nlink != fattr->nlink)
-               nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
+               invalid |= NFS_INO_INVALID_ATTR;
 
        if (!timespec_equal(&inode->i_atime, &fattr->atime))
-               nfsi->cache_validity |= NFS_INO_INVALID_ATIME;
+               invalid |= NFS_INO_INVALID_ATIME;
+
+       if (invalid != 0)
+               nfsi->cache_validity |= invalid;
+       else
+               nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
+                               | NFS_INO_INVALID_ATIME
+                               | NFS_INO_REVAL_PAGECACHE);
 
        nfsi->read_cache_jiffies = fattr->time_start;
        return 0;
@@ -911,17 +907,41 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
 int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
        struct nfs_inode *nfsi = NFS_I(inode);
-       int status = 0;
 
        spin_lock(&inode->i_lock);
-       if (unlikely((fattr->valid & NFS_ATTR_FATTR) == 0)) {
-               nfsi->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
-               goto out;
-       }
-       status = nfs_update_inode(inode, fattr);
-out:
+       nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+       if (S_ISDIR(inode->i_mode))
+               nfsi->cache_validity |= NFS_INO_INVALID_DATA;
        spin_unlock(&inode->i_lock);
-       return status;
+       return nfs_refresh_inode(inode, fattr);
+}
+
+/**
+ * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache
+ * @inode - pointer to inode
+ * @fattr - updated attributes
+ *
+ * After an operation that has changed the inode metadata, mark the
+ * attribute cache as being invalid, then try to update it. Fake up
+ * weak cache consistency data, if none exist.
+ *
+ * This function is mainly designed to be used by the ->write_done() functions.
+ */
+int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr)
+{
+       if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
+                       (fattr->valid & NFS_ATTR_WCC_V4) == 0) {
+               fattr->pre_change_attr = NFS_I(inode)->change_attr;
+               fattr->valid |= NFS_ATTR_WCC_V4;
+       }
+       if ((fattr->valid & NFS_ATTR_FATTR) != 0 &&
+                       (fattr->valid & NFS_ATTR_WCC) == 0) {
+               memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime));
+               memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime));
+               fattr->pre_size = inode->i_size;
+               fattr->valid |= NFS_ATTR_WCC;
+       }
+       return nfs_post_op_update_inode(inode, fattr);
 }
 
 /*
@@ -941,9 +961,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
        struct nfs_server *server;
        struct nfs_inode *nfsi = NFS_I(inode);
        loff_t cur_isize, new_isize;
-       unsigned int    invalid = 0;
+       unsigned long invalid = 0;
        unsigned long now = jiffies;
-       int data_stable;
 
        dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n",
                        __FUNCTION__, inode->i_sb->s_id, inode->i_ino,
@@ -968,57 +987,51 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
         * Update the read time so we don't revalidate too often.
         */
        nfsi->read_cache_jiffies = fattr->time_start;
-       nfsi->last_updated = now;
 
-       /* Fix a wraparound issue with nfsi->cache_change_attribute */
-       if (time_before(now, nfsi->cache_change_attribute))
-               nfsi->cache_change_attribute = now - 600*HZ;
-
-       /* Are we racing with known updates of the metadata on the server? */
-       data_stable = nfs_verify_change_attribute(inode, fattr->time_start);
-       if (data_stable)
-               nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_ATIME);
+       nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME
+                       | NFS_INO_REVAL_PAGECACHE);
 
        /* Do atomic weak cache consistency updates */
        nfs_wcc_update_inode(inode, fattr);
 
+       /* More cache consistency checks */
+       if (!(fattr->valid & NFS_ATTR_FATTR_V4)) {
+               /* NFSv2/v3: Check if the mtime agrees */
+               if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) {
+                       dprintk("NFS: mtime change on server for file %s/%ld\n",
+                                       inode->i_sb->s_id, inode->i_ino);
+                       invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+                       nfsi->cache_change_attribute = now;
+               }
+               /* If ctime has changed we should definitely clear access+acl caches */
+               if (!timespec_equal(&inode->i_ctime, &fattr->ctime))
+                       invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+       } else if (nfsi->change_attr != fattr->change_attr) {
+               dprintk("NFS: change_attr change on server for file %s/%ld\n",
+                               inode->i_sb->s_id, inode->i_ino);
+               invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+               nfsi->cache_change_attribute = now;
+       }
+
        /* Check if our cached file size is stale */
        new_isize = nfs_size_to_loff_t(fattr->size);
        cur_isize = i_size_read(inode);
        if (new_isize != cur_isize) {
-               /* Do we perhaps have any outstanding writes? */
-               if (nfsi->npages == 0) {
-                       /* No, but did we race with nfs_end_data_update()? */
-                       if (data_stable) {
-                               inode->i_size = new_isize;
-                               invalid |= NFS_INO_INVALID_DATA;
-                       }
-                       invalid |= NFS_INO_INVALID_ATTR;
-               } else if (new_isize > cur_isize) {
+               /* Accept the server's size if we have no outstanding
+                * writes, or if the file has grown beyond our cached size */
+               if (nfsi->npages == 0 || new_isize > cur_isize) {
                        inode->i_size = new_isize;
                        invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
                }
-               nfsi->cache_change_attribute = now;
                dprintk("NFS: isize change on server for file %s/%ld\n",
                                inode->i_sb->s_id, inode->i_ino);
        }
 
-       /* Check if the mtime agrees */
-       if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) {
-               memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
-               dprintk("NFS: mtime change on server for file %s/%ld\n",
-                               inode->i_sb->s_id, inode->i_ino);
-               invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
-               nfsi->cache_change_attribute = now;
-       }
 
-       /* If ctime has changed we should definitely clear access+acl caches */
-       if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) {
-               invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
-               memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
-               nfsi->cache_change_attribute = now;
-       }
+       memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
+       memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
        memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
+       nfsi->change_attr = fattr->change_attr;
 
        if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) ||
            inode->i_uid != fattr->uid ||
@@ -1039,31 +1052,29 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                inode->i_blocks = fattr->du.nfs2.blocks;
        }
 
-       if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
-                       nfsi->change_attr != fattr->change_attr) {
-               dprintk("NFS: change_attr change on server for file %s/%ld\n",
-                               inode->i_sb->s_id, inode->i_ino);
-               nfsi->change_attr = fattr->change_attr;
-               invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
-               nfsi->cache_change_attribute = now;
-       }
-
        /* Update attrtimeo value if we're out of the unstable period */
        if (invalid & NFS_INO_INVALID_ATTR) {
                nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
                nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
                nfsi->attrtimeo_timestamp = now;
-       } else if (time_after(now, nfsi->attrtimeo_timestamp+nfsi->attrtimeo)) {
-               if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode))
-                       nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
-               nfsi->attrtimeo_timestamp = now;
+               nfsi->last_updated = now;
+       } else {
+               if (!time_in_range(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
+                       if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode))
+                               nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
+                       nfsi->attrtimeo_timestamp = now;
+               }
+               /*
+                * Avoid jiffy wraparound issues with nfsi->last_updated
+                */
+               if (!time_in_range(nfsi->last_updated, nfsi->read_cache_jiffies, now))
+                       nfsi->last_updated = nfsi->read_cache_jiffies;
        }
+       invalid &= ~NFS_INO_INVALID_ATTR;
        /* Don't invalidate the data if we were to blame */
        if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
                                || S_ISLNK(inode->i_mode)))
                invalid &= ~NFS_INO_INVALID_DATA;
-       if (data_stable)
-               invalid &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME|NFS_INO_REVAL_PAGECACHE);
        if (!nfs_have_delegation(inode, FMODE_READ) ||
                        (nfsi->cache_validity & NFS_INO_REVAL_FORCED))
                nfsi->cache_validity |= invalid;
@@ -1152,7 +1163,6 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
        INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
        INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
        INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
-       atomic_set(&nfsi->data_updates, 0);
        nfsi->ncommit = 0;
        nfsi->npages = 0;
        nfs4_init_once(nfsi);
@@ -1249,6 +1259,7 @@ static void __exit exit_nfs_fs(void)
 /* Not quite true; I just maintain it */
 MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
 MODULE_LICENSE("GPL");
+module_param(enable_ino64, bool, 0644);
 
 module_init(init_nfs_fs)
 module_exit(exit_nfs_fs)
index 76cf55d..f3acf48 100644 (file)
@@ -5,8 +5,6 @@
 #include <linux/mount.h>
 
 struct nfs_string;
-struct nfs_mount_data;
-struct nfs4_mount_data;
 
 /* Maximum number of readahead requests
  * FIXME: this should really be a sysctl so that users may tune it to suit
@@ -27,20 +25,50 @@ struct nfs_clone_mount {
        rpc_authflavor_t authflavor;
 };
 
+/*
+ * In-kernel mount arguments (input to nfs_create_server/nfs4_create_server)
+ */
+struct nfs_parsed_mount_data {
+       int                     flags;
+       int                     rsize, wsize;
+       int                     timeo, retrans;
+       int                     acregmin, acregmax,
+                               acdirmin, acdirmax;
+       int                     namlen;
+       unsigned int            bsize;
+       unsigned int            auth_flavor_len;
+       rpc_authflavor_t        auth_flavors[1];       /* array holds exactly one entry */
+       char                    *client_address;
+
+       struct {
+               struct sockaddr_in      address;
+               char                    *hostname;
+               unsigned int            program;
+               unsigned int            version;
+               unsigned short          port;
+               int                     protocol;
+       } mount_server;        /* NOTE(review): presumably the MOUNT service endpoint -- confirm */
+
+       struct {
+               struct sockaddr_in      address;
+               char                    *hostname;
+               char                    *export_path;
+               unsigned int            program;
+               int                     protocol;
+       } nfs_server;  /* NOTE(review): presumably the NFS service endpoint -- confirm */
+};
+
 /* client.c */
 extern struct rpc_program nfs_program;
 
 extern void nfs_put_client(struct nfs_client *);
 extern struct nfs_client *nfs_find_client(const struct sockaddr_in *, int);
-extern struct nfs_server *nfs_create_server(const struct nfs_mount_data *,
-                                           struct nfs_fh *);
-extern struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *,
-                                            const char *,
-                                            const struct sockaddr_in *,
-                                            const char *,
-                                            const char *,
-                                            rpc_authflavor_t,
-                                            struct nfs_fh *);
+extern struct nfs_server *nfs_create_server(
+                                       const struct nfs_parsed_mount_data *,
+                                       struct nfs_fh *);
+extern struct nfs_server *nfs4_create_server(
+                                       const struct nfs_parsed_mount_data *,
+                                       struct nfs_fh *);
 extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *,
                                                      struct nfs_fh *);
 extern void nfs_free_server(struct nfs_server *server);
index c5fce75..668ab96 100644 (file)
@@ -251,6 +251,7 @@ nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
        replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readres_sz) << 2;
        xdr_inline_pages(&req->rq_rcv_buf, replen,
                         args->pages, args->pgbase, count);
+       req->rq_rcv_buf.flags |= XDRBUF_READ;
        return 0;
 }
 
@@ -271,7 +272,7 @@ nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
        res->eof = 0;
        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
        if (iov->iov_len < hdrlen) {
-               printk(KERN_WARNING "NFS: READ reply header overflowed:"
+               dprintk("NFS: READ reply header overflowed:"
                                "length %d > %Zu\n", hdrlen, iov->iov_len);
                return -errno_NFSERR_IO;
        } else if (iov->iov_len != hdrlen) {
@@ -281,7 +282,7 @@ nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
 
        recvd = req->rq_rcv_buf.len - hdrlen;
        if (count > recvd) {
-               printk(KERN_WARNING "NFS: server cheating in read reply: "
+               dprintk("NFS: server cheating in read reply: "
                        "count %d > recvd %d\n", count, recvd);
                count = recvd;
        }
@@ -313,6 +314,7 @@ nfs_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
 
        /* Copy the page array */
        xdr_encode_pages(sndbuf, args->pages, args->pgbase, count);
+       sndbuf->flags |= XDRBUF_WRITE;
        return 0;
 }
 
@@ -431,7 +433,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
 
        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
        if (iov->iov_len < hdrlen) {
-               printk(KERN_WARNING "NFS: READDIR reply header overflowed:"
+               dprintk("NFS: READDIR reply header overflowed:"
                                "length %d > %Zu\n", hdrlen, iov->iov_len);
                return -errno_NFSERR_IO;
        } else if (iov->iov_len != hdrlen) {
@@ -454,7 +456,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
                len = ntohl(*p++);
                p += XDR_QUADLEN(len) + 1;      /* name plus cookie */
                if (len > NFS2_MAXNAMLEN) {
-                       printk(KERN_WARNING "NFS: giant filename in readdir (len 0x%x)!\n",
+                       dprintk("NFS: giant filename in readdir (len 0x%x)!\n",
                                                len);
                        goto err_unmap;
                }
@@ -471,7 +473,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
        entry[0] = entry[1] = 0;
        /* truncate listing ? */
        if (!nr) {
-               printk(KERN_NOTICE "NFS: readdir reply truncated!\n");
+               dprintk("NFS: readdir reply truncated!\n");
                entry[1] = 1;
        }
        goto out;
@@ -583,12 +585,12 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
        /* Convert length of symlink */
        len = ntohl(*p++);
        if (len >= rcvbuf->page_len || len <= 0) {
-               dprintk(KERN_WARNING "nfs: server returned giant symlink!\n");
+               dprintk("nfs: server returned giant symlink!\n");
                return -ENAMETOOLONG;
        }
        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
        if (iov->iov_len < hdrlen) {
-               printk(KERN_WARNING "NFS: READLINK reply header overflowed:"
+               dprintk("NFS: READLINK reply header overflowed:"
                                "length %d > %Zu\n", hdrlen, iov->iov_len);
                return -errno_NFSERR_IO;
        } else if (iov->iov_len != hdrlen) {
@@ -597,7 +599,7 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
        }
        recvd = req->rq_rcv_buf.len - hdrlen;
        if (recvd < len) {
-               printk(KERN_WARNING "NFS: server cheating in readlink reply: "
+               dprintk("NFS: server cheating in readlink reply: "
                                "count %u > recvd %u\n", len, recvd);
                return -EIO;
        }
@@ -695,7 +697,7 @@ nfs_stat_to_errno(int stat)
                if (nfs_errtbl[i].stat == stat)
                        return nfs_errtbl[i].errno;
        }
-       printk(KERN_ERR "nfs_stat_to_errno: bad nfs status return value: %d\n", stat);
+       dprintk("nfs_stat_to_errno: bad nfs status return value: %d\n", stat);
        return nfs_errtbl[i].errno;
 }
 
index 7322da4..9b73625 100644 (file)
@@ -317,13 +317,11 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
        }
 
        dprintk("NFS call setacl\n");
-       nfs_begin_data_update(inode);
        msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL];
        status = rpc_call_sync(server->client_acl, &msg, 0);
        spin_lock(&inode->i_lock);
        NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS;
        spin_unlock(&inode->i_lock);
-       nfs_end_data_update(inode);
        dprintk("NFS reply setacl: %d\n", status);
 
        /* pages may have been allocated at the xdr layer. */
index c7ca5d7..4cdc236 100644 (file)
@@ -166,6 +166,7 @@ nfs3_proc_lookup(struct inode *dir, struct qstr *name,
        nfs_fattr_init(&dir_attr);
        nfs_fattr_init(fattr);
        status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+       nfs_refresh_inode(dir, &dir_attr);
        if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) {
                msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR];
                msg.rpc_argp = fhandle;
@@ -173,8 +174,6 @@ nfs3_proc_lookup(struct inode *dir, struct qstr *name,
                status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
        }
        dprintk("NFS reply lookup: %d\n", status);
-       if (status >= 0)
-               status = nfs_refresh_inode(dir, &dir_attr);
        return status;
 }
 
@@ -607,6 +606,9 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 
        nfs_fattr_init(&dir_attr);
        status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+
+       nfs_invalidate_atime(dir);
+
        nfs_refresh_inode(dir, &dir_attr);
        dprintk("NFS reply readdir: %d\n", status);
        return status;
@@ -724,9 +726,9 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data)
 {
        if (nfs3_async_handle_jukebox(task, data->inode))
                return -EAGAIN;
-       /* Call back common NFS readpage processing */
-       if (task->tk_status >= 0)
-               nfs_refresh_inode(data->inode, &data->fattr);
+
+       nfs_invalidate_atime(data->inode);
+       nfs_refresh_inode(data->inode, &data->fattr);
        return 0;
 }
 
@@ -747,7 +749,7 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data)
        if (nfs3_async_handle_jukebox(task, data->inode))
                return -EAGAIN;
        if (task->tk_status >= 0)
-               nfs_post_op_update_inode(data->inode, data->res.fattr);
+               nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr);
        return 0;
 }
 
@@ -775,8 +777,7 @@ static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data)
 {
        if (nfs3_async_handle_jukebox(task, data->inode))
                return -EAGAIN;
-       if (task->tk_status >= 0)
-               nfs_post_op_update_inode(data->inode, data->res.fattr);
+       nfs_refresh_inode(data->inode, data->res.fattr);
        return 0;
 }
 
index d9e08f0..616d326 100644 (file)
@@ -346,6 +346,7 @@ nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
        replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readres_sz) << 2;
        xdr_inline_pages(&req->rq_rcv_buf, replen,
                         args->pages, args->pgbase, count);
+       req->rq_rcv_buf.flags |= XDRBUF_READ;
        return 0;
 }
 
@@ -367,6 +368,7 @@ nfs3_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
 
        /* Copy the page array */
        xdr_encode_pages(sndbuf, args->pages, args->pgbase, count);
+       sndbuf->flags |= XDRBUF_WRITE;
        return 0;
 }
 
@@ -524,7 +526,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
 
        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
        if (iov->iov_len < hdrlen) {
-               printk(KERN_WARNING "NFS: READDIR reply header overflowed:"
+               dprintk("NFS: READDIR reply header overflowed:"
                                "length %d > %Zu\n", hdrlen, iov->iov_len);
                return -errno_NFSERR_IO;
        } else if (iov->iov_len != hdrlen) {
@@ -547,7 +549,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
                len = ntohl(*p++);              /* string length */
                p += XDR_QUADLEN(len) + 2;      /* name + cookie */
                if (len > NFS3_MAXNAMLEN) {
-                       printk(KERN_WARNING "NFS: giant filename in readdir (len %x)!\n",
+                       dprintk("NFS: giant filename in readdir (len %x)!\n",
                                                len);
                        goto err_unmap;
                }
@@ -567,7 +569,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
                                        goto short_pkt;
                                len = ntohl(*p++);
                                if (len > NFS3_FHSIZE) {
-                                       printk(KERN_WARNING "NFS: giant filehandle in "
+                                       dprintk("NFS: giant filehandle in "
                                                "readdir (len %x)!\n", len);
                                        goto err_unmap;
                                }
@@ -588,7 +590,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
        entry[0] = entry[1] = 0;
        /* truncate listing ? */
        if (!nr) {
-               printk(KERN_NOTICE "NFS: readdir reply truncated!\n");
+               dprintk("NFS: readdir reply truncated!\n");
                entry[1] = 1;
        }
        goto out;
@@ -826,22 +828,23 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
        /* Convert length of symlink */
        len = ntohl(*p++);
        if (len >= rcvbuf->page_len || len <= 0) {
-               dprintk(KERN_WARNING "nfs: server returned giant symlink!\n");
+               dprintk("nfs: server returned giant symlink!\n");
                return -ENAMETOOLONG;
        }
 
        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
        if (iov->iov_len < hdrlen) {
-               printk(KERN_WARNING "NFS: READLINK reply header overflowed:"
+               dprintk("NFS: READLINK reply header overflowed:"
                                "length %d > %Zu\n", hdrlen, iov->iov_len);
                return -errno_NFSERR_IO;
        } else if (iov->iov_len != hdrlen) {
-               dprintk("NFS: READLINK header is short. iovec will be shifted.\n");
+               dprintk("NFS: READLINK header is short. "
+                       "iovec will be shifted.\n");
                xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
        }
        recvd = req->rq_rcv_buf.len - hdrlen;
        if (recvd < len) {
-               printk(KERN_WARNING "NFS: server cheating in readlink reply: "
+               dprintk("NFS: server cheating in readlink reply: "
                                "count %u > recvd %u\n", len, recvd);
                return -EIO;
        }
@@ -876,13 +879,13 @@ nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
        ocount   = ntohl(*p++);
 
        if (ocount != count) {
-               printk(KERN_WARNING "NFS: READ count doesn't match RPC opaque count.\n");
+               dprintk("NFS: READ count doesn't match RPC opaque count.\n");
                return -errno_NFSERR_IO;
        }
 
        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
        if (iov->iov_len < hdrlen) {
-               printk(KERN_WARNING "NFS: READ reply header overflowed:"
+               dprintk("NFS: READ reply header overflowed:"
                                "length %d > %Zu\n", hdrlen, iov->iov_len);
                        return -errno_NFSERR_IO;
        } else if (iov->iov_len != hdrlen) {
@@ -892,7 +895,7 @@ nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
 
        recvd = req->rq_rcv_buf.len - hdrlen;
        if (count > recvd) {
-               printk(KERN_WARNING "NFS: server cheating in read reply: "
+               dprintk("NFS: server cheating in read reply: "
                        "count %d > recvd %d\n", count, recvd);
                count = recvd;
                res->eof = 0;
index 4b90e17..cb99fd9 100644 (file)
@@ -62,10 +62,8 @@ struct nfs4_opendata;
 static int _nfs4_proc_open(struct nfs4_opendata *data);
 static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
 static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *);
-static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry);
 static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception);
 static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp);
-static int _nfs4_do_access(struct inode *inode, struct rpc_cred *cred, int openflags);
 static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
 static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
 
@@ -177,7 +175,7 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
                *p++ = xdr_one;                         /* bitmap length */
                *p++ = htonl(FATTR4_WORD0_FILEID);             /* bitmap */
                *p++ = htonl(8);              /* attribute buffer length */
-               p = xdr_encode_hyper(p, dentry->d_inode->i_ino);
+               p = xdr_encode_hyper(p, NFS_FILEID(dentry->d_inode));
        }
        
        *p++ = xdr_one;                                  /* next */
@@ -189,7 +187,7 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
        *p++ = xdr_one;                         /* bitmap length */
        *p++ = htonl(FATTR4_WORD0_FILEID);             /* bitmap */
        *p++ = htonl(8);              /* attribute buffer length */
-       p = xdr_encode_hyper(p, dentry->d_parent->d_inode->i_ino);
+       p = xdr_encode_hyper(p, NFS_FILEID(dentry->d_parent->d_inode));
 
        readdir->pgbase = (char *)p - (char *)start;
        readdir->count -= readdir->pgbase;
@@ -211,8 +209,9 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
 
        spin_lock(&dir->i_lock);
        nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
-       if (cinfo->before == nfsi->change_attr && cinfo->atomic)
-               nfsi->change_attr = cinfo->after;
+       if (!cinfo->atomic || cinfo->before != nfsi->change_attr)
+               nfsi->cache_change_attribute = jiffies;
+       nfsi->change_attr = cinfo->after;
        spin_unlock(&dir->i_lock);
 }
 
@@ -454,7 +453,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
                memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data));
                rcu_read_unlock();
                lock_kernel();
-               ret = _nfs4_do_access(state->inode, state->owner->so_cred, open_mode);
+               ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode);
                unlock_kernel();
                if (ret != 0)
                        goto out;
@@ -948,36 +947,6 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
        return 0;
 }
 
-static int _nfs4_do_access(struct inode *inode, struct rpc_cred *cred, int openflags)
-{
-       struct nfs_access_entry cache;
-       int mask = 0;
-       int status;
-
-       if (openflags & FMODE_READ)
-               mask |= MAY_READ;
-       if (openflags & FMODE_WRITE)
-               mask |= MAY_WRITE;
-       if (openflags & FMODE_EXEC)
-               mask |= MAY_EXEC;
-       status = nfs_access_get_cached(inode, cred, &cache);
-       if (status == 0)
-               goto out;
-
-       /* Be clever: ask server to check for all possible rights */
-       cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ;
-       cache.cred = cred;
-       cache.jiffies = jiffies;
-       status = _nfs4_proc_access(inode, &cache);
-       if (status != 0)
-               return status;
-       nfs_access_add_cache(inode, &cache);
-out:
-       if ((cache.mask & mask) == mask)
-               return 0;
-       return -EACCES;
-}
-
 static int nfs4_recover_expired_lease(struct nfs_server *server)
 {
        struct nfs_client *clp = server->nfs_client;
@@ -1381,7 +1350,7 @@ static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct
 
        /* If the open_intent is for execute, we have an extra check to make */
        if (nd->intent.open.flags & FMODE_EXEC) {
-               ret = _nfs4_do_access(state->inode,
+               ret = nfs_may_open(state->inode,
                                state->owner->so_cred,
                                nd->intent.open.flags);
                if (ret < 0)
@@ -1390,7 +1359,7 @@ static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct
        filp = lookup_instantiate_filp(nd, path->dentry, NULL);
        if (!IS_ERR(filp)) {
                struct nfs_open_context *ctx;
-               ctx = (struct nfs_open_context *)filp->private_data;
+               ctx = nfs_file_open_context(filp);
                ctx->state = state;
                return 0;
        }
@@ -1428,13 +1397,16 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
        state = nfs4_do_open(dir, &path, nd->intent.open.flags, &attr, cred);
        put_rpccred(cred);
        if (IS_ERR(state)) {
-               if (PTR_ERR(state) == -ENOENT)
+               if (PTR_ERR(state) == -ENOENT) {
                        d_add(dentry, NULL);
+                       nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+               }
                return (struct dentry *)state;
        }
        res = d_add_unique(dentry, igrab(state->inode));
        if (res != NULL)
                path.dentry = res;
+       nfs_set_verifier(path.dentry, nfs_save_change_attribute(dir));
        nfs4_intent_set_file(nd, &path, state);
        return res;
 }
@@ -1468,6 +1440,7 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
                }
        }
        if (state->inode == dentry->d_inode) {
+               nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
                nfs4_intent_set_file(nd, &path, state);
                return 1;
        }
@@ -1757,10 +1730,16 @@ static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh
 
 static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)
 {
+       struct nfs_server *server = NFS_SERVER(inode);
+       struct nfs_fattr fattr;
        struct nfs4_accessargs args = {
                .fh = NFS_FH(inode),
+               .bitmask = server->attr_bitmask,
+       };
+       struct nfs4_accessres res = {
+               .server = server,
+               .fattr = &fattr,
        };
-       struct nfs4_accessres res = { 0 };
        struct rpc_message msg = {
                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS],
                .rpc_argp = &args,
@@ -1786,6 +1765,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
                if (mode & MAY_EXEC)
                        args.access |= NFS4_ACCESS_EXECUTE;
        }
+       nfs_fattr_init(&fattr);
        status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
        if (!status) {
                entry->mask = 0;
@@ -1795,6 +1775,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
                        entry->mask |= MAY_WRITE;
                if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE))
                        entry->mask |= MAY_EXEC;
+               nfs_refresh_inode(inode, &fattr);
        }
        return status;
 }
@@ -1900,11 +1881,13 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
        }
        state = nfs4_do_open(dir, &path, flags, sattr, cred);
        put_rpccred(cred);
+       d_drop(dentry);
        if (IS_ERR(state)) {
                status = PTR_ERR(state);
                goto out;
        }
-       d_instantiate(dentry, igrab(state->inode));
+       d_add(dentry, igrab(state->inode));
+       nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
        if (flags & O_EXCL) {
                struct nfs_fattr fattr;
                status = nfs4_do_setattr(state->inode, &fattr, sattr, state);
@@ -2218,6 +2201,9 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
        status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
        if (status == 0)
                memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE);
+
+       nfs_invalidate_atime(dir);
+
        dprintk("%s: returns %d\n", __FUNCTION__, status);
        return status;
 }
@@ -2414,6 +2400,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
                rpc_restart_call(task);
                return -EAGAIN;
        }
+
+       nfs_invalidate_atime(data->inode);
        if (task->tk_status > 0)
                renew_lease(server, data->timestamp);
        return 0;
@@ -2443,7 +2431,7 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
        }
        if (task->tk_status >= 0) {
                renew_lease(NFS_SERVER(inode), data->timestamp);
-               nfs_post_op_update_inode(inode, data->res.fattr);
+               nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
        }
        return 0;
 }
@@ -2485,8 +2473,7 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
                rpc_restart_call(task);
                return -EAGAIN;
        }
-       if (task->tk_status >= 0)
-               nfs_post_op_update_inode(inode, data->res.fattr);
+       nfs_refresh_inode(inode, data->res.fattr);
        return 0;
 }
 
@@ -3056,7 +3043,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
        if (status == 0) {
                status = data->rpc_status;
                if (status == 0)
-                       nfs_post_op_update_inode(inode, &data->fattr);
+                       nfs_refresh_inode(inode, &data->fattr);
        }
        rpc_put_task(task);
        return status;
@@ -3303,7 +3290,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
        status = -ENOMEM;
        if (seqid == NULL)
                goto out;
-       task = nfs4_do_unlck(request, request->fl_file->private_data, lsp, seqid);
+       task = nfs4_do_unlck(request, nfs_file_open_context(request->fl_file), lsp, seqid);
        status = PTR_ERR(task);
        if (IS_ERR(task))
                goto out;
@@ -3447,7 +3434,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
        int ret;
 
        dprintk("%s: begin!\n", __FUNCTION__);
-       data = nfs4_alloc_lockdata(fl, fl->fl_file->private_data,
+       data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file),
                        fl->fl_u.nfs4_fl.owner);
        if (data == NULL)
                return -ENOMEM;
@@ -3573,7 +3560,7 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
        int status;
 
        /* verify open state */
-       ctx = (struct nfs_open_context *)filp->private_data;
+       ctx = nfs_file_open_context(filp);
        state = ctx->state;
 
        if (request->fl_start < 0 || request->fl_end < 0)
index 3e4adf8..bfb3626 100644 (file)
@@ -774,7 +774,7 @@ static int nfs4_reclaim_locks(struct nfs4_state_recovery_ops *ops, struct nfs4_s
        for (fl = inode->i_flock; fl != 0; fl = fl->fl_next) {
                if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
                        continue;
-               if (((struct nfs_open_context *)fl->fl_file->private_data)->state != state)
+               if (nfs_file_open_context(fl->fl_file)->state != state)
                        continue;
                status = ops->recover_lock(state, fl);
                if (status >= 0)
index badd73b..51dd380 100644 (file)
@@ -376,10 +376,12 @@ static int nfs4_stat_to_errno(int);
                                decode_locku_maxsz)
 #define NFS4_enc_access_sz     (compound_encode_hdr_maxsz + \
                                encode_putfh_maxsz + \
-                               encode_access_maxsz)
+                               encode_access_maxsz + \
+                               encode_getattr_maxsz)
 #define NFS4_dec_access_sz     (compound_decode_hdr_maxsz + \
                                decode_putfh_maxsz + \
-                               decode_access_maxsz)
+                               decode_access_maxsz + \
+                               decode_getattr_maxsz)
 #define NFS4_enc_getattr_sz    (compound_encode_hdr_maxsz + \
                                encode_putfh_maxsz + \
                                encode_getattr_maxsz)
@@ -562,7 +564,6 @@ struct compound_hdr {
 
 #define RESERVE_SPACE(nbytes)  do {                            \
        p = xdr_reserve_space(xdr, nbytes);                     \
-       if (!p) printk("RESERVE_SPACE(%d) failed in function %s\n", (int) (nbytes), __FUNCTION__); \
        BUG_ON(!p);                                             \
 } while (0)
 
@@ -628,8 +629,8 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
        if (iap->ia_valid & ATTR_UID) {
                owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name);
                if (owner_namelen < 0) {
-                       printk(KERN_WARNING "nfs: couldn't resolve uid %d to string\n",
-                              iap->ia_uid);
+                       dprintk("nfs: couldn't resolve uid %d to string\n",
+                                       iap->ia_uid);
                        /* XXX */
                        strcpy(owner_name, "nobody");
                        owner_namelen = sizeof("nobody") - 1;
@@ -640,8 +641,8 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
        if (iap->ia_valid & ATTR_GID) {
                owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group);
                if (owner_grouplen < 0) {
-                       printk(KERN_WARNING "nfs4: couldn't resolve gid %d to string\n",
-                              iap->ia_gid);
+                       dprintk("nfs: couldn't resolve gid %d to string\n",
+                                       iap->ia_gid);
                        strcpy(owner_group, "nobody");
                        owner_grouplen = sizeof("nobody") - 1;
                        /* goto out; */
@@ -711,7 +712,7 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
         * Now we backfill the bitmap and the attribute buffer length.
         */
        if (len != ((char *)p - (char *)q) + 4) {
-               printk ("encode_attr: Attr length calculation error! %u != %Zu\n",
+               printk(KERN_ERR "nfs: Attr length error, %u != %Zu\n",
                                len, ((char *)p - (char *)q) + 4);
                BUG();
        }
@@ -1376,14 +1377,20 @@ static int nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-               .nops = 2,
+               .nops = 3,
        };
        int status;
 
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
        encode_compound_hdr(&xdr, &hdr);
-       if ((status = encode_putfh(&xdr, args->fh)) == 0)
-               status = encode_access(&xdr, args->access);
+       status = encode_putfh(&xdr, args->fh);
+       if (status != 0)
+               goto out;
+       status = encode_access(&xdr, args->access);
+       if (status != 0)
+               goto out;
+       status = encode_getfattr(&xdr, args->bitmask);
+out:
        return status;
 }
 
@@ -1857,6 +1864,7 @@ static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readarg
        replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_read_sz) << 2;
        xdr_inline_pages(&req->rq_rcv_buf, replen,
                         args->pages, args->pgbase, args->count);
+       req->rq_rcv_buf.flags |= XDRBUF_READ;
 out:
        return status;
 }
@@ -1933,6 +1941,7 @@ static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writea
        status = encode_write(&xdr, args);
        if (status)
                goto out;
+       req->rq_snd_buf.flags |= XDRBUF_WRITE;
        status = encode_getfattr(&xdr, args->bitmask);
 out:
        return status;
@@ -2180,9 +2189,9 @@ out:
 #define READ_BUF(nbytes)  do { \
        p = xdr_inline_decode(xdr, nbytes); \
        if (unlikely(!p)) { \
-               printk(KERN_INFO "%s: prematurely hit end of receive" \
+               dprintk("nfs: %s: prematurely hit end of receive" \
                                " buffer\n", __FUNCTION__); \
-               printk(KERN_INFO "%s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \
+               dprintk("nfs: %s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \
                                __FUNCTION__, xdr->p, nbytes, xdr->end); \
                return -EIO; \
        } \
@@ -2223,9 +2232,8 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
        READ_BUF(8);
        READ32(opnum);
        if (opnum != expected) {
-               printk(KERN_NOTICE
-                               "nfs4_decode_op_hdr: Server returned operation"
-                               " %d but we issued a request for %d\n",
+               dprintk("nfs: Server returned operation"
+                       " %d but we issued a request for %d\n",
                                opnum, expected);
                return -EIO;
        }
@@ -2758,7 +2766,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
                                dprintk("%s: nfs_map_name_to_uid failed!\n",
                                                __FUNCTION__);
                } else
-                       printk(KERN_WARNING "%s: name too long (%u)!\n",
+                       dprintk("%s: name too long (%u)!\n",
                                        __FUNCTION__, len);
                bitmap[1] &= ~FATTR4_WORD1_OWNER;
        }
@@ -2783,7 +2791,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
                                dprintk("%s: nfs_map_group_to_gid failed!\n",
                                                __FUNCTION__);
                } else
-                       printk(KERN_WARNING "%s: name too long (%u)!\n",
+                       dprintk("%s: name too long (%u)!\n",
                                        __FUNCTION__, len);
                bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP;
        }
@@ -2950,7 +2958,8 @@ static int verify_attr_len(struct xdr_stream *xdr, __be32 *savep, uint32_t attrl
        unsigned int nwords = xdr->p - savep;
 
        if (unlikely(attrwords != nwords)) {
-               printk(KERN_WARNING "%s: server returned incorrect attribute length: %u %c %u\n",
+               dprintk("%s: server returned incorrect attribute length: "
+                       "%u %c %u\n",
                                __FUNCTION__,
                                attrwords << 2,
                                (attrwords < nwords) ? '<' : '>',
@@ -3451,7 +3460,7 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
        recvd = req->rq_rcv_buf.len - hdrlen;
        if (count > recvd) {
-               printk(KERN_WARNING "NFS: server cheating in read reply: "
+               dprintk("NFS: server cheating in read reply: "
                                "count %u > recvd %u\n", count, recvd);
                count = recvd;
                eof = 0;
@@ -3500,7 +3509,8 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
                p += 2;                 /* cookie */
                len = ntohl(*p++);      /* filename length */
                if (len > NFS4_MAXNAMLEN) {
-                       printk(KERN_WARNING "NFS: giant filename in readdir (len 0x%x)\n", len);
+                       dprintk("NFS: giant filename in readdir (len 0x%x)\n",
+                                       len);
                        goto err_unmap;
                }
                xlen = XDR_QUADLEN(len);
@@ -3528,7 +3538,7 @@ short_pkt:
        entry[0] = entry[1] = 0;
        /* truncate listing ? */
        if (!nr) {
-               printk(KERN_NOTICE "NFS: readdir reply truncated!\n");
+               dprintk("NFS: readdir reply truncated!\n");
                entry[1] = 1;
        }
        goto out;
@@ -3554,13 +3564,13 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
        READ_BUF(4);
        READ32(len);
        if (len >= rcvbuf->page_len || len <= 0) {
-               dprintk(KERN_WARNING "nfs: server returned giant symlink!\n");
+               dprintk("nfs: server returned giant symlink!\n");
                return -ENAMETOOLONG;
        }
        hdrlen = (char *) xdr->p - (char *) iov->iov_base;
        recvd = req->rq_rcv_buf.len - hdrlen;
        if (recvd < len) {
-               printk(KERN_WARNING "NFS: server cheating in readlink reply: "
+               dprintk("NFS: server cheating in readlink reply: "
                                "count %u > recvd %u\n", len, recvd);
                return -EIO;
        }
@@ -3643,7 +3653,7 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
                hdrlen = (u8 *)xdr->p - (u8 *)iov->iov_base;
                recvd = req->rq_rcv_buf.len - hdrlen;
                if (attrlen > recvd) {
-                       printk(KERN_WARNING "NFS: server cheating in getattr"
+                       dprintk("NFS: server cheating in getattr"
                                        " acl reply: attrlen %u > recvd %u\n",
                                        attrlen, recvd);
                        return -EINVAL;
@@ -3688,8 +3698,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
        READ_BUF(8);
        READ32(opnum);
        if (opnum != OP_SETCLIENTID) {
-               printk(KERN_NOTICE
-                               "nfs4_decode_setclientid: Server returned operation"
+               dprintk("nfs: decode_setclientid: Server returned operation"
                                " %d\n", opnum);
                return -EIO;
        }
@@ -3783,8 +3792,13 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_ac
        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
        if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
                goto out;
-       if ((status = decode_putfh(&xdr)) == 0)
-               status = decode_access(&xdr, res);
+       status = decode_putfh(&xdr);
+       if (status != 0)
+               goto out;
+       status = decode_access(&xdr, res);
+       if (status != 0)
+               goto out;
+       decode_getfattr(&xdr, res->fattr, res->server);
 out:
        return status;
 }
index 3490322..e87b44e 100644 (file)
@@ -76,6 +76,7 @@
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/xprtsock.h>
 #include <linux/nfs.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_mount.h>
@@ -491,7 +492,7 @@ static int __init root_nfs_get_handle(void)
        struct sockaddr_in sin;
        int status;
        int protocol = (nfs_data.flags & NFS_MOUNT_TCP) ?
-                                       IPPROTO_TCP : IPPROTO_UDP;
+                                       XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP;
        int version = (nfs_data.flags & NFS_MOUNT_VER3) ?
                                        NFS_MNT3_VERSION : NFS_MNT_VERSION;
 
index 845cdde..97669ed 100644 (file)
@@ -476,6 +476,8 @@ nfs_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
        dprintk("NFS call  readdir %d\n", (unsigned int)cookie);
        status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 
+       nfs_invalidate_atime(dir);
+
        dprintk("NFS reply readdir: %d\n", status);
        return status;
 }
@@ -550,6 +552,7 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 
 static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)
 {
+       nfs_invalidate_atime(data->inode);
        if (task->tk_status >= 0) {
                nfs_refresh_inode(data->inode, data->res.fattr);
                /* Emulate the eof flag, which isn't normally needed in NFSv2
@@ -576,7 +579,7 @@ static void nfs_proc_read_setup(struct nfs_read_data *data)
 static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
 {
        if (task->tk_status >= 0)
-               nfs_post_op_update_inode(data->inode, data->res.fattr);
+               nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr);
        return 0;
 }
 
index 19e0563..4587a86 100644 (file)
@@ -341,9 +341,6 @@ int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data)
                set_bit(NFS_INO_STALE, &NFS_FLAGS(data->inode));
                nfs_mark_for_revalidate(data->inode);
        }
-       spin_lock(&data->inode->i_lock);
-       NFS_I(data->inode)->cache_validity |= NFS_INO_INVALID_ATIME;
-       spin_unlock(&data->inode->i_lock);
        return 0;
 }
 
@@ -497,8 +494,7 @@ int nfs_readpage(struct file *file, struct page *page)
                if (ctx == NULL)
                        goto out_unlock;
        } else
-               ctx = get_nfs_open_context((struct nfs_open_context *)
-                               file->private_data);
+               ctx = get_nfs_open_context(nfs_file_open_context(file));
 
        error = nfs_readpage_async(ctx, inode, page);
 
@@ -576,8 +572,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
                if (desc.ctx == NULL)
                        return -EBADF;
        } else
-               desc.ctx = get_nfs_open_context((struct nfs_open_context *)
-                               filp->private_data);
+               desc.ctx = get_nfs_open_context(nfs_file_open_context(filp));
        if (rsize < PAGE_CACHE_SIZE)
                nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
        else
index b878528..fa517ae 100644 (file)
@@ -33,6 +33,8 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/stats.h>
 #include <linux/sunrpc/metrics.h>
+#include <linux/sunrpc/xprtsock.h>
+#include <linux/sunrpc/xprtrdma.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_mount.h>
 #include <linux/nfs4_mount.h>
 
 #define NFSDBG_FACILITY                NFSDBG_VFS
 
-
-struct nfs_parsed_mount_data {
-       int                     flags;
-       int                     rsize, wsize;
-       int                     timeo, retrans;
-       int                     acregmin, acregmax,
-                               acdirmin, acdirmax;
-       int                     namlen;
-       unsigned int            bsize;
-       unsigned int            auth_flavor_len;
-       rpc_authflavor_t        auth_flavors[1];
-       char                    *client_address;
-
-       struct {
-               struct sockaddr_in      address;
-               unsigned int            program;
-               unsigned int            version;
-               unsigned short          port;
-               int                     protocol;
-       } mount_server;
-
-       struct {
-               struct sockaddr_in      address;
-               char                    *hostname;
-               char                    *export_path;
-               unsigned int            program;
-               int                     protocol;
-       } nfs_server;
-};
-
 enum {
        /* Mount options that take no arguments */
        Opt_soft, Opt_hard,
@@ -97,7 +69,7 @@ enum {
        Opt_ac, Opt_noac,
        Opt_lock, Opt_nolock,
        Opt_v2, Opt_v3,
-       Opt_udp, Opt_tcp,
+       Opt_udp, Opt_tcp, Opt_rdma,
        Opt_acl, Opt_noacl,
        Opt_rdirplus, Opt_nordirplus,
        Opt_sharecache, Opt_nosharecache,
@@ -116,7 +88,7 @@ enum {
 
        /* Mount options that take string arguments */
        Opt_sec, Opt_proto, Opt_mountproto,
-       Opt_addr, Opt_mounthost, Opt_clientaddr,
+       Opt_addr, Opt_mountaddr, Opt_clientaddr,
 
        /* Mount options that are ignored */
        Opt_userspace, Opt_deprecated,
@@ -143,6 +115,7 @@ static match_table_t nfs_mount_option_tokens = {
        { Opt_v3, "v3" },
        { Opt_udp, "udp" },
        { Opt_tcp, "tcp" },
+       { Opt_rdma, "rdma" },
        { Opt_acl, "acl" },
        { Opt_noacl, "noacl" },
        { Opt_rdirplus, "rdirplus" },
@@ -175,13 +148,14 @@ static match_table_t nfs_mount_option_tokens = {
        { Opt_mountproto, "mountproto=%s" },
        { Opt_addr, "addr=%s" },
        { Opt_clientaddr, "clientaddr=%s" },
-       { Opt_mounthost, "mounthost=%s" },
+       { Opt_userspace, "mounthost=%s" },
+       { Opt_mountaddr, "mountaddr=%s" },
 
        { Opt_err, NULL }
 };
 
 enum {
-       Opt_xprt_udp, Opt_xprt_tcp,
+       Opt_xprt_udp, Opt_xprt_tcp, Opt_xprt_rdma,
 
        Opt_xprt_err
 };
@@ -189,6 +163,7 @@ enum {
 static match_table_t nfs_xprt_protocol_tokens = {
        { Opt_xprt_udp, "udp" },
        { Opt_xprt_tcp, "tcp" },
+       { Opt_xprt_rdma, "rdma" },
 
        { Opt_xprt_err, NULL }
 };
@@ -449,7 +424,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
                const char *nostr;
        } nfs_info[] = {
                { NFS_MOUNT_SOFT, ",soft", ",hard" },
-               { NFS_MOUNT_INTR, ",intr", "" },
+               { NFS_MOUNT_INTR, ",intr", ",nointr" },
                { NFS_MOUNT_NOCTO, ",nocto", "" },
                { NFS_MOUNT_NOAC, ",noac", "" },
                { NFS_MOUNT_NONLM, ",nolock", "" },
@@ -460,8 +435,6 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
        };
        const struct proc_nfs_info *nfs_infop;
        struct nfs_client *clp = nfss->nfs_client;
-       char buf[12];
-       const char *proto;
 
        seq_printf(m, ",vers=%d", clp->rpc_ops->version);
        seq_printf(m, ",rsize=%d", nfss->rsize);
@@ -480,18 +453,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
                else
                        seq_puts(m, nfs_infop->nostr);
        }
-       switch (nfss->client->cl_xprt->prot) {
-               case IPPROTO_TCP:
-                       proto = "tcp";
-                       break;
-               case IPPROTO_UDP:
-                       proto = "udp";
-                       break;
-               default:
-                       snprintf(buf, sizeof(buf), "%u", nfss->client->cl_xprt->prot);
-                       proto = buf;
-       }
-       seq_printf(m, ",proto=%s", proto);
+       seq_printf(m, ",proto=%s",
+                  rpc_peeraddr2str(nfss->client, RPC_DISPLAY_PROTO));
        seq_printf(m, ",timeo=%lu", 10U * clp->retrans_timeo / HZ);
        seq_printf(m, ",retrans=%u", clp->retrans_count);
        seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor));
@@ -506,8 +469,8 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
 
        nfs_show_mount_options(m, nfss, 0);
 
-       seq_puts(m, ",addr=");
-       seq_escape(m, nfss->nfs_client->cl_hostname, " \t\n\\");
+       seq_printf(m, ",addr="NIPQUAD_FMT,
+               NIPQUAD(nfss->nfs_client->cl_addr.sin_addr));
 
        return 0;
 }
@@ -698,13 +661,19 @@ static int nfs_parse_mount_options(char *raw,
                        break;
                case Opt_udp:
                        mnt->flags &= ~NFS_MOUNT_TCP;
-                       mnt->nfs_server.protocol = IPPROTO_UDP;
+                       mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
                        mnt->timeo = 7;
                        mnt->retrans = 5;
                        break;
                case Opt_tcp:
                        mnt->flags |= NFS_MOUNT_TCP;
-                       mnt->nfs_server.protocol = IPPROTO_TCP;
+                       mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+                       mnt->timeo = 600;
+                       mnt->retrans = 2;
+                       break;
+               case Opt_rdma:
+                       mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */
+                       mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
                        mnt->timeo = 600;
                        mnt->retrans = 2;
                        break;
@@ -913,13 +882,20 @@ static int nfs_parse_mount_options(char *raw,
                        switch (token) {
                        case Opt_xprt_udp:
                                mnt->flags &= ~NFS_MOUNT_TCP;
-                               mnt->nfs_server.protocol = IPPROTO_UDP;
+                               mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
                                mnt->timeo = 7;
                                mnt->retrans = 5;
                                break;
                        case Opt_xprt_tcp:
                                mnt->flags |= NFS_MOUNT_TCP;
-                               mnt->nfs_server.protocol = IPPROTO_TCP;
+                               mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+                               mnt->timeo = 600;
+                               mnt->retrans = 2;
+                               break;
+                       case Opt_xprt_rdma:
+                               /* vector side protocols to TCP */
+                               mnt->flags |= NFS_MOUNT_TCP;
+                               mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
                                mnt->timeo = 600;
                                mnt->retrans = 2;
                                break;
@@ -937,11 +913,12 @@ static int nfs_parse_mount_options(char *raw,
 
                        switch (token) {
                        case Opt_xprt_udp:
-                               mnt->mount_server.protocol = IPPROTO_UDP;
+                               mnt->mount_server.protocol = XPRT_TRANSPORT_UDP;
                                break;
                        case Opt_xprt_tcp:
-                               mnt->mount_server.protocol = IPPROTO_TCP;
+                               mnt->mount_server.protocol = XPRT_TRANSPORT_TCP;
                                break;
+                       case Opt_xprt_rdma: /* not used for side protocols */
                        default:
                                goto out_unrec_xprt;
                        }
@@ -961,7 +938,7 @@ static int nfs_parse_mount_options(char *raw,
                                goto out_nomem;
                        mnt->client_address = string;
                        break;
-               case Opt_mounthost:
+               case Opt_mountaddr:
                        string = match_strdup(args);
                        if (string == NULL)
                                goto out_nomem;
@@ -1027,16 +1004,10 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
                sin = args->mount_server.address;
        else
                sin = args->nfs_server.address;
-       if (args->mount_server.port == 0) {
-               status = rpcb_getport_sync(&sin,
-                                          args->mount_server.program,
-                                          args->mount_server.version,
-                                          args->mount_server.protocol);
-               if (status < 0)
-                       goto out_err;
-               sin.sin_port = htons(status);
-       } else
-               sin.sin_port = htons(args->mount_server.port);
+       /*
+        * autobind will be used if mount_server.port == 0
+        */
+       sin.sin_port = htons(args->mount_server.port);
 
        /*
         * Now ask the mount server to map our export path
@@ -1049,14 +1020,11 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
                           args->mount_server.version,
                           args->mount_server.protocol,
                           root_fh);
-       if (status < 0)
-               goto out_err;
-
-       return status;
+       if (status == 0)
+               return 0;
 
-out_err:
-       dfprintk(MOUNT, "NFS: unable to contact server on host "
-                NIPQUAD_FMT "\n", NIPQUAD(sin.sin_addr.s_addr));
+       dfprintk(MOUNT, "NFS: unable to mount server " NIPQUAD_FMT
+                       ", error %d\n", NIPQUAD(sin.sin_addr.s_addr), status);
        return status;
 }
 
@@ -1079,15 +1047,31 @@ out_err:
  * XXX: as far as I can tell, changing the NFS program number is not
  *      supported in the NFS client.
  */
-static int nfs_validate_mount_data(struct nfs_mount_data **options,
+static int nfs_validate_mount_data(void *options,
+                                  struct nfs_parsed_mount_data *args,
                                   struct nfs_fh *mntfh,
                                   const char *dev_name)
 {
-       struct nfs_mount_data *data = *options;
+       struct nfs_mount_data *data = (struct nfs_mount_data *)options;
 
        if (data == NULL)
                goto out_no_data;
 
+       memset(args, 0, sizeof(*args));
+       args->flags             = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP);
+       args->rsize             = NFS_MAX_FILE_IO_SIZE;
+       args->wsize             = NFS_MAX_FILE_IO_SIZE;
+       args->timeo             = 600;
+       args->retrans           = 2;
+       args->acregmin          = 3;
+       args->acregmax          = 60;
+       args->acdirmin          = 30;
+       args->acdirmax          = 60;
+       args->mount_server.protocol = XPRT_TRANSPORT_UDP;
+       args->mount_server.program = NFS_MNT_PROGRAM;
+       args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+       args->nfs_server.program = NFS_PROGRAM;
+
        switch (data->version) {
        case 1:
                data->namlen = 0;
@@ -1116,92 +1100,73 @@ static int nfs_validate_mount_data(struct nfs_mount_data **options,
                if (mntfh->size < sizeof(mntfh->data))
                        memset(mntfh->data + mntfh->size, 0,
                               sizeof(mntfh->data) - mntfh->size);
+
+               if (!nfs_verify_server_address((struct sockaddr *) &data->addr))
+                       goto out_no_address;
+
+               /*
+                * Translate to nfs_parsed_mount_data, which nfs_fill_super
+                * can deal with.
+                */
+               args->flags             = data->flags;
+               args->rsize             = data->rsize;
+               args->wsize             = data->wsize;
+               args->flags             = data->flags;
+               args->timeo             = data->timeo;
+               args->retrans           = data->retrans;
+               args->acregmin          = data->acregmin;
+               args->acregmax          = data->acregmax;
+               args->acdirmin          = data->acdirmin;
+               args->acdirmax          = data->acdirmax;
+               args->nfs_server.address = data->addr;
+               if (!(data->flags & NFS_MOUNT_TCP))
+                       args->nfs_server.protocol = XPRT_TRANSPORT_UDP;
+               /* N.B. caller will free nfs_server.hostname in all cases */
+               args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL);
+               args->namlen            = data->namlen;
+               args->bsize             = data->bsize;
+               args->auth_flavors[0]   = data->pseudoflavor;
                break;
        default: {
                unsigned int len;
                char *c;
                int status;
-               struct nfs_parsed_mount_data args = {
-                       .flags          = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP),
-                       .rsize          = NFS_MAX_FILE_IO_SIZE,
-                       .wsize          = NFS_MAX_FILE_IO_SIZE,
-                       .timeo          = 600,
-                       .retrans        = 2,
-                       .acregmin       = 3,
-                       .acregmax       = 60,
-                       .acdirmin       = 30,
-                       .acdirmax       = 60,
-                       .mount_server.protocol = IPPROTO_UDP,
-                       .mount_server.program = NFS_MNT_PROGRAM,
-                       .nfs_server.protocol = IPPROTO_TCP,
-                       .nfs_server.program = NFS_PROGRAM,
-               };
-
-               if (nfs_parse_mount_options((char *) *options, &args) == 0)
-                       return -EINVAL;
 
-               data = kzalloc(sizeof(*data), GFP_KERNEL);
-               if (data == NULL)
-                       return -ENOMEM;
+               if (nfs_parse_mount_options((char *)options, args) == 0)
+                       return -EINVAL;
 
-               /*
-                * NB: after this point, caller will free "data"
-                * if we return an error
-                */
-               *options = data;
+               if (!nfs_verify_server_address((struct sockaddr *)
+                                               &args->nfs_server.address))
+                       goto out_no_address;
 
                c = strchr(dev_name, ':');
                if (c == NULL)
                        return -EINVAL;
                len = c - dev_name;
-               if (len > sizeof(data->hostname))
-                       return -ENAMETOOLONG;
-               strncpy(data->hostname, dev_name, len);
-               args.nfs_server.hostname = data->hostname;
+               /* N.B. caller will free nfs_server.hostname in all cases */
+               args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL);
 
                c++;
                if (strlen(c) > NFS_MAXPATHLEN)
                        return -ENAMETOOLONG;
-               args.nfs_server.export_path = c;
+               args->nfs_server.export_path = c;
 
-               status = nfs_try_mount(&args, mntfh);
+               status = nfs_try_mount(args, mntfh);
                if (status)
                        return status;
 
-               /*
-                * Translate to nfs_mount_data, which nfs_fill_super
-                * can deal with.
-                */
-               data->version           = 6;
-               data->flags             = args.flags;
-               data->rsize             = args.rsize;
-               data->wsize             = args.wsize;
-               data->timeo             = args.timeo;
-               data->retrans           = args.retrans;
-               data->acregmin          = args.acregmin;
-               data->acregmax          = args.acregmax;
-               data->acdirmin          = args.acdirmin;
-               data->acdirmax          = args.acdirmax;
-               data->addr              = args.nfs_server.address;
-               data->namlen            = args.namlen;
-               data->bsize             = args.bsize;
-               data->pseudoflavor      = args.auth_flavors[0];
-
                break;
                }
        }
 
-       if (!(data->flags & NFS_MOUNT_SECFLAVOUR))
-               data->pseudoflavor = RPC_AUTH_UNIX;
+       if (!(args->flags & NFS_MOUNT_SECFLAVOUR))
+               args->auth_flavors[0] = RPC_AUTH_UNIX;
 
 #ifndef CONFIG_NFS_V3
-       if (data->flags & NFS_MOUNT_VER3)
+       if (args->flags & NFS_MOUNT_VER3)
                goto out_v3_not_compiled;
 #endif /* !CONFIG_NFS_V3 */
 
-       if (!nfs_verify_server_address((struct sockaddr *) &data->addr))
-               goto out_no_address;
-
        return 0;
 
 out_no_data:
@@ -1258,7 +1223,8 @@ static inline void nfs_initialise_sb(struct super_block *sb)
 /*
  * Finish setting up an NFS2/3 superblock
  */
-static void nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data)
+static void nfs_fill_super(struct super_block *sb,
+                          struct nfs_parsed_mount_data *data)
 {
        struct nfs_server *server = NFS_SB(sb);
 
@@ -1379,7 +1345,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
        struct nfs_server *server = NULL;
        struct super_block *s;
        struct nfs_fh mntfh;
-       struct nfs_mount_data *data = raw_data;
+       struct nfs_parsed_mount_data data;
        struct dentry *mntroot;
        int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
        struct nfs_sb_mountdata sb_mntdata = {
@@ -1388,12 +1354,12 @@ static int nfs_get_sb(struct file_system_type *fs_type,
        int error;
 
        /* Validate the mount data */
-       error = nfs_validate_mount_data(&data, &mntfh, dev_name);
+       error = nfs_validate_mount_data(raw_data, &data, &mntfh, dev_name);
        if (error < 0)
                goto out;
 
        /* Get a volume representation */
-       server = nfs_create_server(data, &mntfh);
+       server = nfs_create_server(&data, &mntfh);
        if (IS_ERR(server)) {
                error = PTR_ERR(server);
                goto out;
@@ -1417,7 +1383,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 
        if (!s->s_root) {
                /* initial superblock/root creation */
-               nfs_fill_super(s, data);
+               nfs_fill_super(s, &data);
        }
 
        mntroot = nfs_get_root(s, &mntfh);
@@ -1432,8 +1398,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
        error = 0;
 
 out:
-       if (data != raw_data)
-               kfree(data);
+       kfree(data.nfs_server.hostname);
        return error;
 
 out_err_nosb:
@@ -1559,38 +1524,49 @@ static void nfs4_fill_super(struct super_block *sb)
 /*
  * Validate NFSv4 mount options
  */
-static int nfs4_validate_mount_data(struct nfs4_mount_data **options,
-                                   const char *dev_name,
-                                   struct sockaddr_in *addr,
-                                   rpc_authflavor_t *authflavour,
-                                   char **hostname,
-                                   char **mntpath,
-                                   char **ip_addr)
+static int nfs4_validate_mount_data(void *options,
+                                   struct nfs_parsed_mount_data *args,
+                                   const char *dev_name)
 {
-       struct nfs4_mount_data *data = *options;
+       struct nfs4_mount_data *data = (struct nfs4_mount_data *)options;
        char *c;
 
        if (data == NULL)
                goto out_no_data;
 
+       memset(args, 0, sizeof(*args));
+       args->rsize             = NFS_MAX_FILE_IO_SIZE;
+       args->wsize             = NFS_MAX_FILE_IO_SIZE;
+       args->timeo             = 600;
+       args->retrans           = 2;
+       args->acregmin          = 3;
+       args->acregmax          = 60;
+       args->acdirmin          = 30;
+       args->acdirmax          = 60;
+       args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+
        switch (data->version) {
        case 1:
-               if (data->host_addrlen != sizeof(*addr))
+               if (data->host_addrlen != sizeof(args->nfs_server.address))
                        goto out_no_address;
-               if (copy_from_user(addr, data->host_addr, sizeof(*addr)))
+               if (copy_from_user(&args->nfs_server.address,
+                                  data->host_addr,
+                                  sizeof(args->nfs_server.address)))
                        return -EFAULT;
-               if (addr->sin_port == 0)
-                       addr->sin_port = htons(NFS_PORT);
-               if (!nfs_verify_server_address((struct sockaddr *) addr))
+               if (args->nfs_server.address.sin_port == 0)
+                       args->nfs_server.address.sin_port = htons(NFS_PORT);
+               if (!nfs_verify_server_address((struct sockaddr *)
+                                               &args->nfs_server.address))
                        goto out_no_address;
 
                switch (data->auth_flavourlen) {
                case 0:
-                       *authflavour = RPC_AUTH_UNIX;
+                       args->auth_flavors[0] = RPC_AUTH_UNIX;
                        break;
                case 1:
-                       if (copy_from_user(authflavour, data->auth_flavours,
-                                          sizeof(*authflavour)))
+                       if (copy_from_user(&args->auth_flavors[0],
+                                          data->auth_flavours,
+                                          sizeof(args->auth_flavors[0])))
                                return -EFAULT;
                        break;
                default:
@@ -1600,75 +1576,57 @@ static int nfs4_validate_mount_data(struct nfs4_mount_data **options,
                c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN);
                if (IS_ERR(c))
                        return PTR_ERR(c);
-               *hostname = c;
+               args->nfs_server.hostname = c;
 
                c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN);
                if (IS_ERR(c))
                        return PTR_ERR(c);
-               *mntpath = c;
-               dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *mntpath);
+               args->nfs_server.export_path = c;
+               dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", c);
 
                c = strndup_user(data->client_addr.data, 16);
                if (IS_ERR(c))
                        return PTR_ERR(c);
-               *ip_addr = c;
+               args->client_address = c;
+
+               /*
+                * Translate to nfs_parsed_mount_data, which nfs4_fill_super
+                * can deal with.
+                */
+
+               args->flags     = data->flags & NFS4_MOUNT_FLAGMASK;
+               args->rsize     = data->rsize;
+               args->wsize     = data->wsize;
+               args->timeo     = data->timeo;
+               args->retrans   = data->retrans;
+               args->acregmin  = data->acregmin;
+               args->acregmax  = data->acregmax;
+               args->acdirmin  = data->acdirmin;
+               args->acdirmax  = data->acdirmax;
+               args->nfs_server.protocol = data->proto;
 
                break;
        default: {
                unsigned int len;
-               struct nfs_parsed_mount_data args = {
-                       .rsize          = NFS_MAX_FILE_IO_SIZE,
-                       .wsize          = NFS_MAX_FILE_IO_SIZE,
-                       .timeo          = 600,
-                       .retrans        = 2,
-                       .acregmin       = 3,
-                       .acregmax       = 60,
-                       .acdirmin       = 30,
-                       .acdirmax       = 60,
-                       .nfs_server.protocol = IPPROTO_TCP,
-               };
-
-               if (nfs_parse_mount_options((char *) *options, &args) == 0)
+
+               if (nfs_parse_mount_options((char *)options, args) == 0)
                        return -EINVAL;
 
                if (!nfs_verify_server_address((struct sockaddr *)
-                                               &args.nfs_server.address))
+                                               &args->nfs_server.address))
                        return -EINVAL;
-               *addr = args.nfs_server.address;
 
-               switch (args.auth_flavor_len) {
+               switch (args->auth_flavor_len) {
                case 0:
-                       *authflavour = RPC_AUTH_UNIX;
+                       args->auth_flavors[0] = RPC_AUTH_UNIX;
                        break;
                case 1:
-                       *authflavour = (rpc_authflavor_t) args.auth_flavors[0];
                        break;
                default:
                        goto out_inval_auth;
                }
 
                /*
-                * Translate to nfs4_mount_data, which nfs4_fill_super
-                * can deal with.
-                */
-               data = kzalloc(sizeof(*data), GFP_KERNEL);
-               if (data == NULL)
-                       return -ENOMEM;
-               *options = data;
-
-               data->version   = 1;
-               data->flags     = args.flags & NFS4_MOUNT_FLAGMASK;
-               data->rsize     = args.rsize;
-               data->wsize     = args.wsize;
-               data->timeo     = args.timeo;
-               data->retrans   = args.retrans;
-               data->acregmin  = args.acregmin;
-               data->acregmax  = args.acregmax;
-               data->acdirmin  = args.acdirmin;
-               data->acdirmax  = args.acdirmax;
-               data->proto     = args.nfs_server.protocol;
-
-               /*
                 * Split "dev_name" into "hostname:mntpath".
                 */
                c = strchr(dev_name, ':');
@@ -1678,27 +1636,25 @@ static int nfs4_validate_mount_data(struct nfs4_mount_data **options,
                len = c - dev_name;
                if (len > NFS4_MAXNAMLEN)
                        return -ENAMETOOLONG;
-               *hostname = kzalloc(len, GFP_KERNEL);
-               if (*hostname == NULL)
+               args->nfs_server.hostname = kzalloc(len, GFP_KERNEL);
+               if (args->nfs_server.hostname == NULL)
                        return -ENOMEM;
-               strncpy(*hostname, dev_name, len - 1);
+               strncpy(args->nfs_server.hostname, dev_name, len - 1);
 
                c++;                    /* step over the ':' */
                len = strlen(c);
                if (len > NFS4_MAXPATHLEN)
                        return -ENAMETOOLONG;
-               *mntpath = kzalloc(len + 1, GFP_KERNEL);
-               if (*mntpath == NULL)
+               args->nfs_server.export_path = kzalloc(len + 1, GFP_KERNEL);
+               if (args->nfs_server.export_path == NULL)
                        return -ENOMEM;
-               strncpy(*mntpath, c, len);
+               strncpy(args->nfs_server.export_path, c, len);
 
-               dprintk("MNTPATH: %s\n", *mntpath);
+               dprintk("MNTPATH: %s\n", args->nfs_server.export_path);
 
-               if (args.client_address == NULL)
+               if (args->client_address == NULL)
                        goto out_no_client_address;
 
-               *ip_addr = args.client_address;
-
                break;
                }
        }
@@ -1729,14 +1685,11 @@ out_no_client_address:
 static int nfs4_get_sb(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
 {
-       struct nfs4_mount_data *data = raw_data;
+       struct nfs_parsed_mount_data data;
        struct super_block *s;
        struct nfs_server *server;
-       struct sockaddr_in addr;
-       rpc_authflavor_t authflavour;
        struct nfs_fh mntfh;
        struct dentry *mntroot;
-       char *mntpath = NULL, *hostname = NULL, *ip_addr = NULL;
        int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
        struct nfs_sb_mountdata sb_mntdata = {
                .mntflags = flags,
@@ -1744,14 +1697,12 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
        int error;
 
        /* Validate the mount data */
-       error = nfs4_validate_mount_data(&data, dev_name, &addr, &authflavour,
-                                        &hostname, &mntpath, &ip_addr);
+       error = nfs4_validate_mount_data(raw_data, &data, dev_name);
        if (error < 0)
                goto out;
 
        /* Get a volume representation */
-       server = nfs4_create_server(data, hostname, &addr, mntpath, ip_addr,
-                                   authflavour, &mntfh);
+       server = nfs4_create_server(&data, &mntfh);
        if (IS_ERR(server)) {
                error = PTR_ERR(server);
                goto out;
@@ -1790,9 +1741,9 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
        error = 0;
 
 out:
-       kfree(ip_addr);
-       kfree(mntpath);
-       kfree(hostname);
+       kfree(data.client_address);
+       kfree(data.nfs_server.export_path);
+       kfree(data.nfs_server.hostname);
        return error;
 
 out_free:
index 045ab80..1aed850 100644 (file)
@@ -66,7 +66,6 @@ static void nfs_async_unlink_init(struct rpc_task *task, void *calldata)
                .rpc_cred = data->cred,
        };
 
-       nfs_begin_data_update(dir);
        NFS_PROTO(dir)->unlink_setup(&msg, dir);
        rpc_call_setup(task, &msg, 0);
 }
@@ -84,8 +83,6 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
 
        if (!NFS_PROTO(dir)->unlink_done(task, dir))
                rpc_restart_call(task);
-       else
-               nfs_end_data_update(dir);
 }
 
 /**
index 0d7a77c..e2bb66c 100644 (file)
@@ -110,6 +110,13 @@ void nfs_writedata_release(void *wdata)
        nfs_writedata_free(wdata);
 }
 
+static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
+{
+       ctx->error = error;
+       smp_wmb();
+       set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
+}
+
 static struct nfs_page *nfs_page_find_request_locked(struct page *page)
 {
        struct nfs_page *req = NULL;
@@ -243,10 +250,7 @@ static void nfs_end_page_writeback(struct page *page)
 
 /*
  * Find an associated nfs write request, and prepare to flush it out
- * Returns 1 if there was no write request, or if the request was
- * already tagged by nfs_set_page_dirty.Returns 0 if the request
- * was not tagged.
- * May also return an error if the user signalled nfs_wait_on_request().
+ * May return an error if the user signalled nfs_wait_on_request().
  */
 static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
                                struct page *page)
@@ -261,7 +265,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
                req = nfs_page_find_request_locked(page);
                if (req == NULL) {
                        spin_unlock(&inode->i_lock);
-                       return 1;
+                       return 0;
                }
                if (nfs_lock_request_dontget(req))
                        break;
@@ -282,7 +286,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
                spin_unlock(&inode->i_lock);
                nfs_unlock_request(req);
                nfs_pageio_complete(pgio);
-               return 1;
+               return 0;
        }
        if (nfs_set_page_writeback(page) != 0) {
                spin_unlock(&inode->i_lock);
@@ -290,70 +294,56 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
        }
        radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
                        NFS_PAGE_TAG_LOCKED);
-       ret = test_bit(PG_NEED_FLUSH, &req->wb_flags);
        spin_unlock(&inode->i_lock);
        nfs_pageio_add_request(pgio, req);
-       return ret;
+       return 0;
 }
 
-/*
- * Write an mmapped page to the server.
- */
-static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc)
+static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
 {
-       struct nfs_pageio_descriptor mypgio, *pgio;
-       struct nfs_open_context *ctx;
        struct inode *inode = page->mapping->host;
-       unsigned offset;
-       int err;
 
        nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
        nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
 
-       if (wbc->for_writepages)
-               pgio = wbc->fs_private;
-       else {
-               nfs_pageio_init_write(&mypgio, inode, wb_priority(wbc));
-               pgio = &mypgio;
-       }
-
        nfs_pageio_cond_complete(pgio, page->index);
+       return nfs_page_async_flush(pgio, page);
+}
 
-       err = nfs_page_async_flush(pgio, page);
-       if (err <= 0)
-               goto out;
-       err = 0;
-       offset = nfs_page_length(page);
-       if (!offset)
-               goto out;
-
-       nfs_pageio_cond_complete(pgio, page->index);
+/*
+ * Write an mmapped page to the server.
+ */
+static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc)
+{
+       struct nfs_pageio_descriptor pgio;
+       int err;
 
-       ctx = nfs_find_open_context(inode, NULL, FMODE_WRITE);
-       if (ctx == NULL) {
-               err = -EBADF;
-               goto out;
-       }
-       err = nfs_writepage_setup(ctx, page, 0, offset);
-       put_nfs_open_context(ctx);
-       if (err != 0)
-               goto out;
-       err = nfs_page_async_flush(pgio, page);
-       if (err > 0)
-               err = 0;
-out:
-       if (!wbc->for_writepages)
-               nfs_pageio_complete(pgio);
-       return err;
+       nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc));
+       err = nfs_do_writepage(page, wbc, &pgio);
+       nfs_pageio_complete(&pgio);
+       if (err < 0)
+               return err;
+       if (pgio.pg_error < 0)
+               return pgio.pg_error;
+       return 0;
 }
 
 int nfs_writepage(struct page *page, struct writeback_control *wbc)
 {
-       int err;
+       int ret;
+
+       ret = nfs_writepage_locked(page, wbc);
+       unlock_page(page);
+       return ret;
+}
+
+static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data)
+{
+       int ret;
 
-       err = nfs_writepage_locked(page, wbc);
+       ret = nfs_do_writepage(page, wbc, data);
        unlock_page(page);
-       return err; 
+       return ret;
 }
 
 int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
@@ -365,12 +355,11 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
        nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
 
        nfs_pageio_init_write(&pgio, inode, wb_priority(wbc));
-       wbc->fs_private = &pgio;
-       err = generic_writepages(mapping, wbc);
+       err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
        nfs_pageio_complete(&pgio);
-       if (err)
+       if (err < 0)
                return err;
-       if (pgio.pg_error)
+       if (pgio.pg_error < 0)
                return pgio.pg_error;
        return 0;
 }
@@ -389,14 +378,11 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
                return error;
        if (!nfsi->npages) {
                igrab(inode);
-               nfs_begin_data_update(inode);
                if (nfs_have_delegation(inode, FMODE_WRITE))
                        nfsi->change_attr++;
        }
        SetPagePrivate(req->wb_page);
        set_page_private(req->wb_page, (unsigned long)req);
-       if (PageDirty(req->wb_page))
-               set_bit(PG_NEED_FLUSH, &req->wb_flags);
        nfsi->npages++;
        kref_get(&req->wb_kref);
        return 0;
@@ -416,12 +402,9 @@ static void nfs_inode_remove_request(struct nfs_page *req)
        set_page_private(req->wb_page, 0);
        ClearPagePrivate(req->wb_page);
        radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
-       if (test_and_clear_bit(PG_NEED_FLUSH, &req->wb_flags))
-               __set_page_dirty_nobuffers(req->wb_page);
        nfsi->npages--;
        if (!nfsi->npages) {
                spin_unlock(&inode->i_lock);
-               nfs_end_data_update(inode);
                iput(inode);
        } else
                spin_unlock(&inode->i_lock);
@@ -682,7 +665,7 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
 
 int nfs_flush_incompatible(struct file *file, struct page *page)
 {
-       struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
+       struct nfs_open_context *ctx = nfs_file_open_context(file);
        struct nfs_page *req;
        int do_flush, status;
        /*
@@ -716,7 +699,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
 int nfs_updatepage(struct file *file, struct page *page,
                unsigned int offset, unsigned int count)
 {
-       struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
+       struct nfs_open_context *ctx = nfs_file_open_context(file);
        struct inode    *inode = page->mapping->host;
        int             status = 0;
 
@@ -967,7 +950,7 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
 
        if (task->tk_status < 0) {
                nfs_set_pageerror(page);
-               req->wb_context->error = task->tk_status;
+               nfs_context_set_write_error(req->wb_context, task->tk_status);
                dprintk(", error = %d\n", task->tk_status);
                goto out;
        }
@@ -1030,7 +1013,7 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata)
 
                if (task->tk_status < 0) {
                        nfs_set_pageerror(page);
-                       req->wb_context->error = task->tk_status;
+                       nfs_context_set_write_error(req->wb_context, task->tk_status);
                        dprintk(", error = %d\n", task->tk_status);
                        goto remove_request;
                }
@@ -1244,7 +1227,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
                        req->wb_bytes,
                        (long long)req_offset(req));
                if (task->tk_status < 0) {
-                       req->wb_context->error = task->tk_status;
+                       nfs_context_set_write_error(req->wb_context, task->tk_status);
                        nfs_inode_remove_request(req);
                        dprintk(", error = %d\n", task->tk_status);
                        goto next;
@@ -1347,53 +1330,52 @@ long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_contr
        return ret;
 }
 
-/*
- * flush the inode to disk.
- */
-int nfs_wb_all(struct inode *inode)
+static int __nfs_write_mapping(struct address_space *mapping, struct writeback_control *wbc, int how)
 {
-       struct address_space *mapping = inode->i_mapping;
-       struct writeback_control wbc = {
-               .bdi = mapping->backing_dev_info,
-               .sync_mode = WB_SYNC_ALL,
-               .nr_to_write = LONG_MAX,
-               .for_writepages = 1,
-               .range_cyclic = 1,
-       };
        int ret;
 
-       ret = nfs_writepages(mapping, &wbc);
+       ret = nfs_writepages(mapping, wbc);
        if (ret < 0)
                goto out;
-       ret = nfs_sync_mapping_wait(mapping, &wbc, 0);
-       if (ret >= 0)
-               return 0;
+       ret = nfs_sync_mapping_wait(mapping, wbc, how);
+       if (ret < 0)
+               goto out;
+       return 0;
 out:
        __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
        return ret;
 }
 
-int nfs_sync_mapping_range(struct address_space *mapping, loff_t range_start, loff_t range_end, int how)
+/* Two pass sync: first using WB_SYNC_NONE, then WB_SYNC_ALL */
+static int nfs_write_mapping(struct address_space *mapping, int how)
 {
        struct writeback_control wbc = {
                .bdi = mapping->backing_dev_info,
-               .sync_mode = WB_SYNC_ALL,
+               .sync_mode = WB_SYNC_NONE,
                .nr_to_write = LONG_MAX,
-               .range_start = range_start,
-               .range_end = range_end,
                .for_writepages = 1,
+               .range_cyclic = 1,
        };
        int ret;
 
-       ret = nfs_writepages(mapping, &wbc);
+       ret = __nfs_write_mapping(mapping, &wbc, how);
        if (ret < 0)
-               goto out;
-       ret = nfs_sync_mapping_wait(mapping, &wbc, how);
-       if (ret >= 0)
-               return 0;
-out:
-       __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
-       return ret;
+               return ret;
+       wbc.sync_mode = WB_SYNC_ALL;
+       return __nfs_write_mapping(mapping, &wbc, how);
+}
+
+/*
+ * flush the inode to disk.
+ */
+int nfs_wb_all(struct inode *inode)
+{
+       return nfs_write_mapping(inode->i_mapping, 0);
+}
+
+int nfs_wb_nocommit(struct inode *inode)
+{
+       return nfs_write_mapping(inode->i_mapping, FLUSH_NOCOMMIT);
 }
 
 int nfs_wb_page_cancel(struct inode *inode, struct page *page)
@@ -1477,35 +1459,6 @@ int nfs_wb_page(struct inode *inode, struct page* page)
        return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
 }
 
-int nfs_set_page_dirty(struct page *page)
-{
-       struct address_space *mapping = page->mapping;
-       struct inode *inode;
-       struct nfs_page *req;
-       int ret;
-
-       if (!mapping)
-               goto out_raced;
-       inode = mapping->host;
-       if (!inode)
-               goto out_raced;
-       spin_lock(&inode->i_lock);
-       req = nfs_page_find_request_locked(page);
-       if (req != NULL) {
-               /* Mark any existing write requests for flushing */
-               ret = !test_and_set_bit(PG_NEED_FLUSH, &req->wb_flags);
-               spin_unlock(&inode->i_lock);
-               nfs_release_request(req);
-               return ret;
-       }
-       ret = __set_page_dirty_nobuffers(page);
-       spin_unlock(&inode->i_lock);
-       return ret;
-out_raced:
-       return !TestSetPageDirty(page);
-}
-
-
 int __init nfs_init_writepagecache(void)
 {
        nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
index e15f2cf..5733394 100644 (file)
@@ -102,7 +102,8 @@ check_filename(char *str, int len, __be32 err)
 out:                                           \
        return status;                          \
 xdr_error:                                     \
-       printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__); \
+       dprintk("NFSD: xdr error (%s:%d)\n",    \
+                       __FILE__, __LINE__);    \
        status = nfserr_bad_xdr;                \
        goto out
 
@@ -124,7 +125,8 @@ xdr_error:                                  \
        if (!(x = (p==argp->tmp || p == argp->tmpp) ? \
                savemem(argp, p, nbytes) :      \
                (char *)p)) {                   \
-               printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__); \
+               dprintk("NFSD: xdr error (%s:%d)\n", \
+                               __FILE__, __LINE__); \
                goto xdr_error;                 \
                }                               \
        p += XDR_QUADLEN(nbytes);               \
@@ -140,7 +142,8 @@ xdr_error:                                  \
                p = argp->p;                    \
                argp->p += XDR_QUADLEN(nbytes); \
        } else if (!(p = read_buf(argp, nbytes))) { \
-               printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__); \
+               dprintk("NFSD: xdr error (%s:%d)\n", \
+                               __FILE__, __LINE__); \
                goto xdr_error;                 \
        }                                       \
 } while (0)
@@ -948,7 +951,8 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
         */
        avail = (char*)argp->end - (char*)argp->p;
        if (avail + argp->pagelen < write->wr_buflen) {
-               printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__); 
+               dprintk("NFSD: xdr error (%s:%d)\n",
+                               __FILE__, __LINE__);
                goto xdr_error;
        }
        argp->rqstp->rq_vec[0].iov_base = p;
@@ -1019,7 +1023,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
                argp->ops = kmalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL);
                if (!argp->ops) {
                        argp->ops = argp->iops;
-                       printk(KERN_INFO "nfsd: couldn't allocate room for COMPOUND\n");
+                       dprintk("nfsd: couldn't allocate room for COMPOUND\n");
                        goto xdr_error;
                }
        }
@@ -1326,7 +1330,7 @@ static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *
        path = exp->ex_path;
 
        if (strncmp(path, rootpath, strlen(rootpath))) {
-               printk("nfsd: fs_locations failed;"
+               dprintk("nfsd: fs_locations failed;"
                        "%s is not contained in %s\n", path, rootpath);
                *stat = nfserr_notsupp;
                return NULL;
index d7a5e03..e757a74 100644 (file)
@@ -109,6 +109,10 @@ static inline u64 get_jiffies_64(void)
         ((long)(a) - (long)(b) >= 0))
 #define time_before_eq(a,b)    time_after_eq(b,a)
 
+#define time_in_range(a,b,c) \
+       (time_after_eq(a,b) && \
+        time_before_eq(a,c))
+
 /* Same as above, but does so with platform independent 64bit types.
  * These must be used when utilizing jiffies_64 (i.e. return value of
  * get_jiffies_64() */
index 7250eea..c5164c2 100644 (file)
 #include <linux/nfs3.h>
 #include <linux/nfs4.h>
 #include <linux/nfs_xdr.h>
-
 #include <linux/nfs_fs_sb.h>
 
-#include <linux/rwsem.h>
 #include <linux/mempool.h>
 
 /*
@@ -77,6 +75,9 @@ struct nfs_open_context {
        struct nfs4_state *state;
        fl_owner_t lockowner;
        int mode;
+
+       unsigned long flags;
+#define NFS_CONTEXT_ERROR_WRITE                (0)
        int error;
 
        struct list_head list;
@@ -133,11 +134,6 @@ struct nfs_inode {
         * server.
         */
        unsigned long           cache_change_attribute;
-       /*
-        * Counter indicating the number of outstanding requests that
-        * will cause a file data update.
-        */
-       atomic_t                data_updates;
 
        struct rb_root          access_cache;
        struct list_head        access_cache_entry_lru;
@@ -205,27 +201,18 @@ static inline struct nfs_inode *NFS_I(struct inode *inode)
 #define NFS_CLIENT(inode)              (NFS_SERVER(inode)->client)
 #define NFS_PROTO(inode)               (NFS_SERVER(inode)->nfs_client->rpc_ops)
 #define NFS_COOKIEVERF(inode)          (NFS_I(inode)->cookieverf)
-#define NFS_READTIME(inode)            (NFS_I(inode)->read_cache_jiffies)
-#define NFS_CHANGE_ATTR(inode)         (NFS_I(inode)->change_attr)
-#define NFS_ATTRTIMEO(inode)           (NFS_I(inode)->attrtimeo)
 #define NFS_MINATTRTIMEO(inode) \
        (S_ISDIR(inode->i_mode)? NFS_SERVER(inode)->acdirmin \
                               : NFS_SERVER(inode)->acregmin)
 #define NFS_MAXATTRTIMEO(inode) \
        (S_ISDIR(inode->i_mode)? NFS_SERVER(inode)->acdirmax \
                               : NFS_SERVER(inode)->acregmax)
-#define NFS_ATTRTIMEO_UPDATE(inode)    (NFS_I(inode)->attrtimeo_timestamp)
 
 #define NFS_FLAGS(inode)               (NFS_I(inode)->flags)
 #define NFS_STALE(inode)               (test_bit(NFS_INO_STALE, &NFS_FLAGS(inode)))
 
 #define NFS_FILEID(inode)              (NFS_I(inode)->fileid)
 
-static inline int nfs_caches_unstable(struct inode *inode)
-{
-       return atomic_read(&NFS_I(inode)->data_updates) != 0;
-}
-
 static inline void nfs_mark_for_revalidate(struct inode *inode)
 {
        struct nfs_inode *nfsi = NFS_I(inode);
@@ -237,12 +224,6 @@ static inline void nfs_mark_for_revalidate(struct inode *inode)
        spin_unlock(&inode->i_lock);
 }
 
-static inline void NFS_CACHEINV(struct inode *inode)
-{
-       if (!nfs_caches_unstable(inode))
-               nfs_mark_for_revalidate(inode);
-}
-
 static inline int nfs_server_capable(struct inode *inode, int cap)
 {
        return NFS_SERVER(inode)->caps & cap;
@@ -253,28 +234,33 @@ static inline int NFS_USE_READDIRPLUS(struct inode *inode)
        return test_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
 }
 
+static inline void nfs_set_verifier(struct dentry * dentry, unsigned long verf)
+{
+       dentry->d_time = verf;
+}
+
 /**
  * nfs_save_change_attribute - Returns the inode attribute change cookie
- * @inode - pointer to inode
+ * @dir - pointer to parent directory inode
  * The "change attribute" is updated every time we finish an operation
  * that will result in a metadata change on the server.
  */
-static inline long nfs_save_change_attribute(struct inode *inode)
+static inline unsigned long nfs_save_change_attribute(struct inode *dir)
 {
-       return NFS_I(inode)->cache_change_attribute;
+       return NFS_I(dir)->cache_change_attribute;
 }
 
 /**
- * nfs_verify_change_attribute - Detects NFS inode cache updates
- * @inode - pointer to inode
+ * nfs_verify_change_attribute - Detects NFS remote directory changes
+ * @dir - pointer to parent directory inode
  * @chattr - previously saved change attribute
- * Return "false" if metadata has been updated (or is in the process of
- * being updated) since the change attribute was saved.
+ * Return "false" if the verifier doesn't match the change attribute.
+ * This would usually indicate that the directory contents have changed on
+ * the server, and that any dentries need revalidating.
  */
-static inline int nfs_verify_change_attribute(struct inode *inode, unsigned long chattr)
+static inline int nfs_verify_change_attribute(struct inode *dir, unsigned long chattr)
 {
-       return !nfs_caches_unstable(inode)
-               && time_after_eq(chattr, NFS_I(inode)->cache_change_attribute);
+       return chattr == NFS_I(dir)->cache_change_attribute;
 }
 
 /*
@@ -283,15 +269,14 @@ static inline int nfs_verify_change_attribute(struct inode *inode, unsigned long
 extern int nfs_sync_mapping(struct address_space *mapping);
 extern void nfs_zap_mapping(struct inode *inode, struct address_space *mapping);
 extern void nfs_zap_caches(struct inode *);
+extern void nfs_invalidate_atime(struct inode *);
 extern struct inode *nfs_fhget(struct super_block *, struct nfs_fh *,
                                struct nfs_fattr *);
 extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *);
 extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr);
+extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr);
 extern int nfs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 extern int nfs_permission(struct inode *, int, struct nameidata *);
-extern int nfs_access_get_cached(struct inode *, struct rpc_cred *, struct nfs_access_entry *);
-extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *);
-extern void nfs_access_zap_cache(struct inode *inode);
 extern int nfs_open(struct inode *, struct file *);
 extern int nfs_release(struct inode *, struct file *);
 extern int nfs_attribute_timeout(struct inode *inode);
@@ -301,13 +286,10 @@ extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *map
 extern int nfs_revalidate_mapping_nolock(struct inode *inode, struct address_space *mapping);
 extern int nfs_setattr(struct dentry *, struct iattr *);
 extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr);
-extern void nfs_begin_attr_update(struct inode *);
-extern void nfs_end_attr_update(struct inode *);
-extern void nfs_begin_data_update(struct inode *);
-extern void nfs_end_data_update(struct inode *);
 extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx);
 extern void put_nfs_open_context(struct nfs_open_context *ctx);
 extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, int mode);
+extern u64 nfs_compat_user_ino64(u64 fileid);
 
 /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. */
 extern __be32 root_nfs_parse_addr(char *name); /*__init*/
@@ -328,14 +310,15 @@ extern const struct inode_operations nfs3_file_inode_operations;
 extern const struct file_operations nfs_file_operations;
 extern const struct address_space_operations nfs_file_aops;
 
-static inline struct rpc_cred *nfs_file_cred(struct file *file)
+static inline struct nfs_open_context *nfs_file_open_context(struct file *filp)
 {
-       if (file != NULL) {
-               struct nfs_open_context *ctx;
+       return filp->private_data;
+}
 
-               ctx = (struct nfs_open_context*)file->private_data;
-               return ctx->cred;
-       }
+static inline struct rpc_cred *nfs_file_cred(struct file *file)
+{
+       if (file != NULL)
+               return nfs_file_open_context(file)->cred;
        return NULL;
 }
 
@@ -378,6 +361,8 @@ extern const struct file_operations nfs_dir_operations;
 extern struct dentry_operations nfs_dentry_operations;
 
 extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr);
+extern int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags);
+extern void nfs_access_zap_cache(struct inode *inode);
 
 /*
  * linux/fs/nfs/symlink.c
@@ -420,15 +405,14 @@ extern int  nfs_flush_incompatible(struct file *file, struct page *page);
 extern int  nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int);
 extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
 extern void nfs_writedata_release(void *);
-extern int nfs_set_page_dirty(struct page *);
 
 /*
  * Try to write back everything synchronously (but check the
  * return value!)
  */
 extern long nfs_sync_mapping_wait(struct address_space *, struct writeback_control *, int);
-extern int nfs_sync_mapping_range(struct address_space *, loff_t, loff_t, int);
 extern int nfs_wb_all(struct inode *inode);
+extern int nfs_wb_nocommit(struct inode *inode);
 extern int nfs_wb_page(struct inode *inode, struct page* page);
 extern int nfs_wb_page_priority(struct inode *inode, struct page* page, int how);
 extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
index 78e6079..30dbcc1 100644 (file)
@@ -30,7 +30,6 @@
 #define PG_BUSY                        0
 #define PG_NEED_COMMIT         1
 #define PG_NEED_RESCHED                2
-#define PG_NEED_FLUSH          3
 
 struct nfs_inode;
 struct nfs_page {
index cf74a4d..daab252 100644 (file)
@@ -62,7 +62,8 @@ struct nfs_fattr {
 #define NFS_ATTR_FATTR         0x0002          /* post-op attributes */
 #define NFS_ATTR_FATTR_V3      0x0004          /* NFSv3 attributes */
 #define NFS_ATTR_FATTR_V4      0x0008          /* NFSv4 change attribute */
-#define NFS_ATTR_FATTR_V4_REFERRAL     0x0010          /* NFSv4 referral */
+#define NFS_ATTR_WCC_V4                0x0010          /* pre-op change attribute */
+#define NFS_ATTR_FATTR_V4_REFERRAL     0x0020          /* NFSv4 referral */
 
 /*
  * Info on the file system
@@ -538,10 +539,13 @@ typedef u64 clientid4;
 
 struct nfs4_accessargs {
        const struct nfs_fh *           fh;
+       const u32 *                     bitmask;
        u32                             access;
 };
 
 struct nfs4_accessres {
+       const struct nfs_server *       server;
+       struct nfs_fattr *              fattr;
        u32                             supported;
        u32                             access;
 };
index c0d9d14..d9d5c5a 100644 (file)
@@ -117,7 +117,7 @@ struct rpc_create_args {
 
 struct rpc_clnt *rpc_create(struct rpc_create_args *args);
 struct rpc_clnt        *rpc_bind_new_program(struct rpc_clnt *,
-                               struct rpc_program *, int);
+                               struct rpc_program *, u32);
 struct rpc_clnt *rpc_clone_client(struct rpc_clnt *);
 void           rpc_shutdown_client(struct rpc_clnt *);
 void           rpc_release_client(struct rpc_clnt *);
index 3912cf1..3347c72 100644 (file)
@@ -88,6 +88,11 @@ enum {
        CTL_SLOTTABLE_TCP,
        CTL_MIN_RESVPORT,
        CTL_MAX_RESVPORT,
+       CTL_SLOTTABLE_RDMA,
+       CTL_RDMA_MAXINLINEREAD,
+       CTL_RDMA_MAXINLINEWRITE,
+       CTL_RDMA_WRITEPADDING,
+       CTL_RDMA_MEMREG,
 };
 
 #endif /* _LINUX_SUNRPC_DEBUG_H_ */
index 784d4c3..c4beb57 100644 (file)
@@ -138,6 +138,19 @@ typedef __be32     rpc_fraghdr;
 #define RPC_MAX_HEADER_WITH_AUTH \
        (RPC_CALLHDRSIZE + 2*(2+RPC_MAX_AUTH_SIZE/4))
 
+/*
+ * RFC1833/RFC3530 rpcbind (v3+) well-known netids.
+ */
+#define RPCBIND_NETID_UDP      "udp"
+#define RPCBIND_NETID_TCP      "tcp"
+#define RPCBIND_NETID_UDP6     "udp6"
+#define RPCBIND_NETID_TCP6     "tcp6"
+
+/*
+ * Note that RFC 1833 does not put any size restrictions on the
+ * netid string, but all currently defined netids fit in 4 bytes.
+ */
+#define RPCBIND_MAXNETIDLEN    (4u)
 
 #endif /* __KERNEL__ */
 #endif /* _LINUX_SUNRPC_MSGPROT_H_ */
diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h
new file mode 100644 (file)
index 0000000..0013a0d
--- /dev/null
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_SUNRPC_RPC_RDMA_H
+#define _LINUX_SUNRPC_RPC_RDMA_H
+
+struct rpcrdma_segment {
+       uint32_t rs_handle;     /* Registered memory handle */
+       uint32_t rs_length;     /* Length of the chunk in bytes */
+       uint64_t rs_offset;     /* Chunk virtual address or offset */
+};
+
+/*
+ * read chunk(s), encoded as a linked list.
+ */
+struct rpcrdma_read_chunk {
+       uint32_t rc_discrim;    /* 1 indicates presence */
+       uint32_t rc_position;   /* Position in XDR stream */
+       struct rpcrdma_segment rc_target;
+};
+
+/*
+ * write chunk, and reply chunk.
+ */
+struct rpcrdma_write_chunk {
+       struct rpcrdma_segment wc_target;
+};
+
+/*
+ * write chunk(s), encoded as a counted array.
+ */
+struct rpcrdma_write_array {
+       uint32_t wc_discrim;    /* 1 indicates presence */
+       uint32_t wc_nchunks;    /* Array count */
+       struct rpcrdma_write_chunk wc_array[0];
+};
+
+struct rpcrdma_msg {
+       uint32_t rm_xid;        /* Mirrors the RPC header xid */
+       uint32_t rm_vers;       /* Version of this protocol */
+       uint32_t rm_credit;     /* Buffers requested/granted */
+       uint32_t rm_type;       /* Type of message (enum rpcrdma_proc) */
+       union {
+
+               struct {                        /* no chunks */
+                       uint32_t rm_empty[3];   /* 3 empty chunk lists */
+               } rm_nochunks;
+
+               struct {                        /* no chunks and padded */
+                       uint32_t rm_align;      /* Padding alignment */
+                       uint32_t rm_thresh;     /* Padding threshold */
+                       uint32_t rm_pempty[3];  /* 3 empty chunk lists */
+               } rm_padded;
+
+               uint32_t rm_chunks[0];  /* read, write and reply chunks */
+
+       } rm_body;
+};
+
+#define RPCRDMA_HDRLEN_MIN     28
+
+enum rpcrdma_errcode {
+       ERR_VERS = 1,
+       ERR_CHUNK = 2
+};
+
+struct rpcrdma_err_vers {
+       uint32_t rdma_vers_low; /* Version range supported by peer */
+       uint32_t rdma_vers_high;
+};
+
+enum rpcrdma_proc {
+       RDMA_MSG = 0,           /* An RPC call or reply msg */
+       RDMA_NOMSG = 1,         /* An RPC call or reply msg - separate body */
+       RDMA_MSGP = 2,          /* An RPC call or reply msg with padding */
+       RDMA_DONE = 3,          /* Client signals reply completion */
+       RDMA_ERROR = 4          /* An RPC RDMA encoding error */
+};
+
+#endif                         /* _LINUX_SUNRPC_RPC_RDMA_H */
index c6b53d1..0751c94 100644 (file)
@@ -70,7 +70,10 @@ struct xdr_buf {
 
        struct page **  pages;          /* Array of contiguous pages */
        unsigned int    page_base,      /* Start of page data */
-                       page_len;       /* Length of page data */
+                       page_len,       /* Length of page data */
+                       flags;          /* Flags for data disposition */
+#define XDRBUF_READ            0x01            /* target of file read */
+#define XDRBUF_WRITE           0x02            /* source of file write */
 
        unsigned int    buflen,         /* Total length of storage buffer */
                        len;            /* Length of XDR encoded message */
index d11cedd..30b17b3 100644 (file)
 
 #ifdef __KERNEL__
 
-extern unsigned int xprt_udp_slot_table_entries;
-extern unsigned int xprt_tcp_slot_table_entries;
-
 #define RPC_MIN_SLOT_TABLE     (2U)
 #define RPC_DEF_SLOT_TABLE     (16U)
 #define RPC_MAX_SLOT_TABLE     (128U)
 
 /*
- * Parameters for choosing a free port
- */
-extern unsigned int xprt_min_resvport;
-extern unsigned int xprt_max_resvport;
-
-#define RPC_MIN_RESVPORT       (1U)
-#define RPC_MAX_RESVPORT       (65535U)
-#define RPC_DEF_MIN_RESVPORT   (665U)
-#define RPC_DEF_MAX_RESVPORT   (1023U)
-
-/*
  * This describes a timeout strategy
  */
 struct rpc_timeout {
@@ -53,6 +39,10 @@ enum rpc_display_format_t {
        RPC_DISPLAY_PORT,
        RPC_DISPLAY_PROTO,
        RPC_DISPLAY_ALL,
+       RPC_DISPLAY_HEX_ADDR,
+       RPC_DISPLAY_HEX_PORT,
+       RPC_DISPLAY_UNIVERSAL_ADDR,
+       RPC_DISPLAY_NETID,
        RPC_DISPLAY_MAX,
 };
 
@@ -196,14 +186,22 @@ struct rpc_xprt {
        char *                  address_strings[RPC_DISPLAY_MAX];
 };
 
-struct rpc_xprtsock_create {
-       int                     proto;          /* IPPROTO_UDP or IPPROTO_TCP */
+struct xprt_create {
+       int                     ident;          /* XPRT_TRANSPORT identifier */
        struct sockaddr *       srcaddr;        /* optional local address */
        struct sockaddr *       dstaddr;        /* remote peer address */
        size_t                  addrlen;
        struct rpc_timeout *    timeout;        /* optional timeout parameters */
 };
 
+struct xprt_class {
+       struct list_head        list;
+       int                     ident;          /* XPRT_TRANSPORT identifier */
+       struct rpc_xprt *       (*setup)(struct xprt_create *);
+       struct module           *owner;
+       char                    name[32];
+};
+
 /*
  * Transport operations used by ULPs
  */
@@ -212,7 +210,7 @@ void                        xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long
 /*
  * Generic internal transport functions
  */
-struct rpc_xprt *      xprt_create_transport(struct rpc_xprtsock_create *args);
+struct rpc_xprt                *xprt_create_transport(struct xprt_create *args);
 void                   xprt_connect(struct rpc_task *task);
 void                   xprt_reserve(struct rpc_task *task);
 int                    xprt_reserve_xprt(struct rpc_task *task);
@@ -235,6 +233,8 @@ static inline __be32 *xprt_skip_transport_header(struct rpc_xprt *xprt, __be32 *
 /*
  * Transport switch helper functions
  */
+int                    xprt_register_transport(struct xprt_class *type);
+int                    xprt_unregister_transport(struct xprt_class *type);
 void                   xprt_set_retrans_timeout_def(struct rpc_task *task);
 void                   xprt_set_retrans_timeout_rtt(struct rpc_task *task);
 void                   xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status);
@@ -248,14 +248,6 @@ void                       xprt_release_rqst_cong(struct rpc_task *task);
 void                   xprt_disconnect(struct rpc_xprt *xprt);
 
 /*
- * Socket transport setup operations
- */
-struct rpc_xprt *      xs_setup_udp(struct rpc_xprtsock_create *args);
-struct rpc_xprt *      xs_setup_tcp(struct rpc_xprtsock_create *args);
-int                    init_socket_xprt(void);
-void                   cleanup_socket_xprt(void);
-
-/*
  * Reserved bit positions in xprt->state
  */
 #define XPRT_LOCKED            (0)
diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h
new file mode 100644 (file)
index 0000000..4de56b1
--- /dev/null
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_SUNRPC_XPRTRDMA_H
+#define _LINUX_SUNRPC_XPRTRDMA_H
+
+/*
+ * RPC transport identifier for RDMA
+ */
+#define XPRT_TRANSPORT_RDMA    256
+
+/*
+ * rpcbind (v3+) RDMA netid.
+ */
+#define RPCBIND_NETID_RDMA     "rdma"
+
+/*
+ * Constants. Max RPC/NFS header is big enough to account for
+ * additional marshaling buffers passed down by Linux client.
+ *
+ * RDMA header is currently fixed max size, and is big enough for a
+ * fully-chunked NFS message (read chunks are the largest). Note only
+ * a single chunk type per message is supported currently.
+ */
+#define RPCRDMA_MIN_SLOT_TABLE (2U)
+#define RPCRDMA_DEF_SLOT_TABLE (32U)
+#define RPCRDMA_MAX_SLOT_TABLE (256U)
+
+#define RPCRDMA_DEF_INLINE  (1024)     /* default inline max */
+
+#define RPCRDMA_INLINE_PAD_THRESH  (512)/* payload threshold to pad (bytes) */
+
+#define RDMA_RESOLVE_TIMEOUT   (5*HZ)  /* TBD 5 seconds */
+#define RDMA_CONNECT_RETRY_MAX (2)     /* retries if no listener backlog */
+
+/* memory registration strategies */
+#define RPCRDMA_PERSISTENT_REGISTRATION (1)
+
+enum rpcrdma_memreg {
+       RPCRDMA_BOUNCEBUFFERS = 0,
+       RPCRDMA_REGISTER,
+       RPCRDMA_MEMWINDOWS,
+       RPCRDMA_MEMWINDOWS_ASYNC,
+       RPCRDMA_MTHCAFMR,
+       RPCRDMA_ALLPHYSICAL,
+       RPCRDMA_LAST
+};
+
+#endif /* _LINUX_SUNRPC_XPRTRDMA_H */
diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h
new file mode 100644 (file)
index 0000000..2c6c2c2
--- /dev/null
@@ -0,0 +1,51 @@
+/*
+ *  linux/include/linux/sunrpc/xprtsock.h
+ *
+ *  Declarations for the RPC transport socket provider.
+ */
+
+#ifndef _LINUX_SUNRPC_XPRTSOCK_H
+#define _LINUX_SUNRPC_XPRTSOCK_H
+
+#ifdef __KERNEL__
+
+/*
+ * Socket transport setup operations
+ */
+struct rpc_xprt *xs_setup_udp(struct xprt_create *args);
+struct rpc_xprt *xs_setup_tcp(struct xprt_create *args);
+
+int            init_socket_xprt(void);
+void           cleanup_socket_xprt(void);
+
+/*
+ * RPC transport identifiers for UDP, TCP
+ *
+ * To preserve compatibility with the historical use of raw IP protocol
+ * id's for transport selection, these are specified with the previous
+ * values. No such restriction exists for new transports, except that
+ * they may not collide with these values (17 and 6, respectively).
+ */
+#define XPRT_TRANSPORT_UDP     IPPROTO_UDP
+#define XPRT_TRANSPORT_TCP     IPPROTO_TCP
+
+/*
+ * RPC slot table sizes for UDP, TCP transports
+ */
+extern unsigned int xprt_udp_slot_table_entries;
+extern unsigned int xprt_tcp_slot_table_entries;
+
+/*
+ * Parameters for choosing a free port
+ */
+extern unsigned int xprt_min_resvport;
+extern unsigned int xprt_max_resvport;
+
+#define RPC_MIN_RESVPORT       (1U)
+#define RPC_MAX_RESVPORT       (65535U)
+#define RPC_DEF_MIN_RESVPORT   (665U)
+#define RPC_DEF_MAX_RESVPORT   (1023U)
+
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_SUNRPC_XPRTSOCK_H */
index c7c3337..d1321a8 100644 (file)
@@ -62,8 +62,6 @@ struct writeback_control {
        unsigned for_reclaim:1;         /* Invoked from the page allocator */
        unsigned for_writepages:1;      /* This is a writepages() call */
        unsigned range_cyclic:1;        /* range_start is cyclic */
-
-       void *fs_private;               /* For use by ->writepages() */
 };
 
 /*
index 04f3ffb..0ae703c 100644 (file)
@@ -1525,6 +1525,7 @@ add_names:
                        context->names[idx].ino = (unsigned long)-1;
        }
 }
+EXPORT_SYMBOL_GPL(__audit_inode_child);
 
 /**
  * auditsc_get_stamp - get local copies of audit_context values
index 8ebfc4d..5c69a72 100644 (file)
@@ -5,6 +5,7 @@
 
 obj-$(CONFIG_SUNRPC) += sunrpc.o
 obj-$(CONFIG_SUNRPC_GSS) += auth_gss/
+obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma/
 
 sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
            auth.o auth_null.o auth_unix.o \
index 42b3220..8bd074d 100644 (file)
@@ -42,7 +42,7 @@ gss_krb5_remove_padding(struct xdr_buf *buf, int blocksize)
 {
        u8 *ptr;
        u8 pad;
-       int len = buf->len;
+       size_t len = buf->len;
 
        if (len <= buf->head[0].iov_len) {
                pad = *(u8 *)(buf->head[0].iov_base + len - 1);
@@ -53,9 +53,9 @@ gss_krb5_remove_padding(struct xdr_buf *buf, int blocksize)
        } else
                len -= buf->head[0].iov_len;
        if (len <= buf->page_len) {
-               int last = (buf->page_base + len - 1)
+               unsigned int last = (buf->page_base + len - 1)
                                        >>PAGE_CACHE_SHIFT;
-               int offset = (buf->page_base + len - 1)
+               unsigned int offset = (buf->page_base + len - 1)
                                        & (PAGE_CACHE_SIZE - 1);
                ptr = kmap_atomic(buf->pages[last], KM_USER0);
                pad = *(ptr + offset);
index 52429b1..76be83e 100644 (file)
@@ -127,7 +127,14 @@ static struct rpc_clnt * rpc_new_client(struct rpc_xprt *xprt, char *servname, s
        struct rpc_clnt         *clnt = NULL;
        struct rpc_auth         *auth;
        int err;
-       int len;
+       size_t len;
+
+       /* sanity check the name before trying to print it */
+       err = -EINVAL;
+       len = strlen(servname);
+       if (len > RPC_MAXNETNAMELEN)
+               goto out_no_rpciod;
+       len++;
 
        dprintk("RPC:       creating %s client for %s (xprt %p)\n",
                        program->name, servname, xprt);
@@ -148,7 +155,6 @@ static struct rpc_clnt * rpc_new_client(struct rpc_xprt *xprt, char *servname, s
        clnt->cl_parent = clnt;
 
        clnt->cl_server = clnt->cl_inline_name;
-       len = strlen(servname) + 1;
        if (len > sizeof(clnt->cl_inline_name)) {
                char *buf = kmalloc(len, GFP_KERNEL);
                if (buf != 0)
@@ -234,8 +240,8 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
 {
        struct rpc_xprt *xprt;
        struct rpc_clnt *clnt;
-       struct rpc_xprtsock_create xprtargs = {
-               .proto = args->protocol,
+       struct xprt_create xprtargs = {
+               .ident = args->protocol,
                .srcaddr = args->saddress,
                .dstaddr = args->address,
                .addrlen = args->addrsize,
@@ -253,7 +259,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
         */
        if (args->servername == NULL) {
                struct sockaddr_in *addr =
-                                       (struct sockaddr_in *) &args->address;
+                                       (struct sockaddr_in *) args->address;
                snprintf(servername, sizeof(servername), NIPQUAD_FMT,
                        NIPQUAD(addr->sin_addr.s_addr));
                args->servername = servername;
@@ -269,9 +275,6 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
        if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT)
                xprt->resvport = 0;
 
-       dprintk("RPC:       creating %s client for %s (xprt %p)\n",
-                       args->program->name, args->servername, xprt);
-
        clnt = rpc_new_client(xprt, args->servername, args->program,
                                args->version, args->authflavor);
        if (IS_ERR(clnt))
@@ -439,7 +442,7 @@ rpc_release_client(struct rpc_clnt *clnt)
  */
 struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old,
                                      struct rpc_program *program,
-                                     int vers)
+                                     u32 vers)
 {
        struct rpc_clnt *clnt;
        struct rpc_version *version;
@@ -843,8 +846,7 @@ call_allocate(struct rpc_task *task)
        dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid);
 
        if (RPC_IS_ASYNC(task) || !signalled()) {
-               xprt_release(task);
-               task->tk_action = call_reserve;
+               task->tk_action = call_allocate;
                rpc_delay(task, HZ>>4);
                return;
        }
@@ -871,6 +873,7 @@ rpc_xdr_buf_init(struct xdr_buf *buf, void *start, size_t len)
        buf->head[0].iov_len = len;
        buf->tail[0].iov_len = 0;
        buf->page_len = 0;
+       buf->flags = 0;
        buf->len = 0;
        buf->buflen = len;
 }
@@ -937,7 +940,7 @@ call_bind(struct rpc_task *task)
 static void
 call_bind_status(struct rpc_task *task)
 {
-       int status = -EACCES;
+       int status = -EIO;
 
        if (task->tk_status >= 0) {
                dprint_status(task);
@@ -947,9 +950,20 @@ call_bind_status(struct rpc_task *task)
        }
 
        switch (task->tk_status) {
+       case -EAGAIN:
+               dprintk("RPC: %5u rpcbind waiting for another request "
+                               "to finish\n", task->tk_pid);
+               /* avoid busy-waiting here -- could be a network outage. */
+               rpc_delay(task, 5*HZ);
+               goto retry_timeout;
        case -EACCES:
                dprintk("RPC: %5u remote rpcbind: RPC program/version "
                                "unavailable\n", task->tk_pid);
+               /* fail immediately if this is an RPC ping */
+               if (task->tk_msg.rpc_proc->p_proc == 0) {
+                       status = -EOPNOTSUPP;
+                       break;
+               }
                rpc_delay(task, 3*HZ);
                goto retry_timeout;
        case -ETIMEDOUT:
@@ -957,6 +971,7 @@ call_bind_status(struct rpc_task *task)
                                task->tk_pid);
                goto retry_timeout;
        case -EPFNOSUPPORT:
+               /* server doesn't support any rpcbind version we know of */
                dprintk("RPC: %5u remote rpcbind service unavailable\n",
                                task->tk_pid);
                break;
@@ -969,7 +984,6 @@ call_bind_status(struct rpc_task *task)
        default:
                dprintk("RPC: %5u unrecognized rpcbind error (%d)\n",
                                task->tk_pid, -task->tk_status);
-               status = -EIO;
        }
 
        rpc_exit(task, status);
@@ -1257,7 +1271,6 @@ call_refresh(struct rpc_task *task)
 {
        dprint_status(task);
 
-       xprt_release(task);     /* Must do to obtain new XID */
        task->tk_action = call_refreshresult;
        task->tk_status = 0;
        task->tk_client->cl_stats->rpcauthrefresh++;
@@ -1375,6 +1388,8 @@ call_verify(struct rpc_task *task)
                        dprintk("RPC: %5u %s: retry stale creds\n",
                                        task->tk_pid, __FUNCTION__);
                        rpcauth_invalcred(task);
+                       /* Ensure we obtain a new XID! */
+                       xprt_release(task);
                        task->tk_action = call_refresh;
                        goto out_retry;
                case RPC_AUTH_BADCRED:
@@ -1523,13 +1538,18 @@ void rpc_show_tasks(void)
                spin_lock(&clnt->cl_lock);
                list_for_each_entry(t, &clnt->cl_tasks, tk_task) {
                        const char *rpc_waitq = "none";
+                       int proc;
+
+                       if (t->tk_msg.rpc_proc)
+                               proc = t->tk_msg.rpc_proc->p_proc;
+                       else
+                               proc = -1;
 
                        if (RPC_IS_QUEUED(t))
                                rpc_waitq = rpc_qname(t->u.tk_wait.rpc_waitq);
 
                        printk("%5u %04d %04x %6d %8p %6d %8p %8ld %8s %8p %8p\n",
-                               t->tk_pid,
-                               (t->tk_msg.rpc_proc ? t->tk_msg.rpc_proc->p_proc : -1),
+                               t->tk_pid, proc,
                                t->tk_flags, t->tk_status,
                                t->tk_client,
                                (t->tk_client ? t->tk_client->cl_prog : 0),
index 669e12a..c8433e8 100644 (file)
@@ -14,7 +14,7 @@
 #include <linux/pagemap.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
-#include <linux/dnotify.h>
+#include <linux/fsnotify.h>
 #include <linux/kernel.h>
 
 #include <asm/ioctls.h>
@@ -329,6 +329,7 @@ rpc_show_info(struct seq_file *m, void *v)
                        clnt->cl_prog, clnt->cl_vers);
        seq_printf(m, "address: %s\n", rpc_peeraddr2str(clnt, RPC_DISPLAY_ADDR));
        seq_printf(m, "protocol: %s\n", rpc_peeraddr2str(clnt, RPC_DISPLAY_PROTO));
+       seq_printf(m, "port: %s\n", rpc_peeraddr2str(clnt, RPC_DISPLAY_PORT));
        return 0;
 }
 
@@ -585,6 +586,7 @@ rpc_populate(struct dentry *parent,
                if (S_ISDIR(mode))
                        inc_nlink(dir);
                d_add(dentry, inode);
+               fsnotify_create(dir, dentry);
        }
        mutex_unlock(&dir->i_mutex);
        return 0;
@@ -606,7 +608,7 @@ __rpc_mkdir(struct inode *dir, struct dentry *dentry)
        inode->i_ino = iunique(dir->i_sb, 100);
        d_instantiate(dentry, inode);
        inc_nlink(dir);
-       inode_dir_notify(dir, DN_CREATE);
+       fsnotify_mkdir(dir, dentry);
        return 0;
 out_err:
        printk(KERN_WARNING "%s: %s failed to allocate inode for dentry %s\n",
@@ -748,7 +750,7 @@ rpc_mkpipe(struct dentry *parent, const char *name, void *private, struct rpc_pi
        rpci->flags = flags;
        rpci->ops = ops;
        rpci->nkern_readwriters = 1;
-       inode_dir_notify(dir, DN_CREATE);
+       fsnotify_create(dir, dentry);
        dget(dentry);
 out:
        mutex_unlock(&dir->i_mutex);
index d1740db..a05493a 100644 (file)
 
 #include <linux/types.h>
 #include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/in6.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
 
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/xprtsock.h>
 
 #ifdef RPC_DEBUG
 # define RPCDBG_FACILITY       RPCDBG_BIND
@@ -91,26 +94,6 @@ enum {
 #define RPCB_MAXADDRLEN                (128u)
 
 /*
- * r_netid
- *
- * Quoting RFC 3530, section 2.2:
- *
- * For TCP over IPv4 the value of r_netid is the string "tcp".  For UDP
- * over IPv4 the value of r_netid is the string "udp".
- *
- * ...
- *
- * For TCP over IPv6 the value of r_netid is the string "tcp6".  For UDP
- * over IPv6 the value of r_netid is the string "udp6".
- */
-#define RPCB_NETID_UDP "\165\144\160"          /* "udp" */
-#define RPCB_NETID_TCP "\164\143\160"          /* "tcp" */
-#define RPCB_NETID_UDP6        "\165\144\160\066"      /* "udp6" */
-#define RPCB_NETID_TCP6        "\164\143\160\066"      /* "tcp6" */
-
-#define RPCB_MAXNETIDLEN       (4u)
-
-/*
  * r_owner
  *
  * The "owner" is allowed to unset a service in the rpcbind database.
@@ -120,7 +103,7 @@ enum {
 #define RPCB_MAXOWNERLEN       sizeof(RPCB_OWNER_STRING)
 
 static void                    rpcb_getport_done(struct rpc_task *, void *);
-extern struct rpc_program      rpcb_program;
+static struct rpc_program      rpcb_program;
 
 struct rpcbind_args {
        struct rpc_xprt *       r_xprt;
@@ -137,10 +120,13 @@ struct rpcbind_args {
 static struct rpc_procinfo rpcb_procedures2[];
 static struct rpc_procinfo rpcb_procedures3[];
 
-static struct rpcb_info {
+struct rpcb_info {
        int                     rpc_vers;
        struct rpc_procinfo *   rpc_proc;
-} rpcb_next_version[];
+};
+
+static struct rpcb_info rpcb_next_version[];
+static struct rpcb_info rpcb_next_version6[];
 
 static void rpcb_getport_prepare(struct rpc_task *task, void *calldata)
 {
@@ -190,7 +176,17 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr,
                                   RPC_CLNT_CREATE_INTR),
        };
 
-       ((struct sockaddr_in *)srvaddr)->sin_port = htons(RPCBIND_PORT);
+       switch (srvaddr->sa_family) {   /* point the request at the rpcbind port */
+       case AF_INET:
+               ((struct sockaddr_in *)srvaddr)->sin_port = htons(RPCBIND_PORT);
+               break;
+       case AF_INET6:
+               ((struct sockaddr_in6 *)srvaddr)->sin6_port = htons(RPCBIND_PORT);
+               break;
+       default:        /* callers only test IS_ERR(); NULL would slip past them */
+               return ERR_PTR(-EAFNOSUPPORT);
+       }
+
        if (!privileged)
                args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
        return rpc_create(&args);
@@ -234,7 +230,7 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay)
                        prog, vers, prot, port);
 
        rpcb_clnt = rpcb_create("localhost", (struct sockaddr *) &sin,
-                                       IPPROTO_UDP, 2, 1);
+                                       XPRT_TRANSPORT_UDP, 2, 1);
        if (IS_ERR(rpcb_clnt))
                return PTR_ERR(rpcb_clnt);
 
@@ -316,6 +312,7 @@ void rpcb_getport_async(struct rpc_task *task)
        struct rpc_task *child;
        struct sockaddr addr;
        int status;
+       struct rpcb_info *info;
 
        dprintk("RPC: %5u %s(%s, %u, %u, %d)\n",
                task->tk_pid, __FUNCTION__,
@@ -325,7 +322,7 @@ void rpcb_getport_async(struct rpc_task *task)
        BUG_ON(clnt->cl_parent != clnt);
 
        if (xprt_test_and_set_binding(xprt)) {
-               status = -EACCES;               /* tell caller to check again */
+               status = -EAGAIN;       /* tell caller to check again */
                dprintk("RPC: %5u %s: waiting for another binder\n",
                        task->tk_pid, __FUNCTION__);
                goto bailout_nowake;
@@ -343,18 +340,43 @@ void rpcb_getport_async(struct rpc_task *task)
                goto bailout_nofree;
        }
 
-       if (rpcb_next_version[xprt->bind_index].rpc_proc == NULL) {
+       rpc_peeraddr(clnt, (void *)&addr, sizeof(addr));
+
+       /* Don't ever use rpcbind v2 for AF_INET6 requests */
+       switch (addr.sa_family) {
+       case AF_INET:
+               info = rpcb_next_version;
+               break;
+       case AF_INET6:
+               info = rpcb_next_version6;
+               break;
+       default:
+               status = -EAFNOSUPPORT;
+               dprintk("RPC: %5u %s: bad address family\n",
+                               task->tk_pid, __FUNCTION__);
+               goto bailout_nofree;
+       }
+       if (info[xprt->bind_index].rpc_proc == NULL) {
                xprt->bind_index = 0;
-               status = -EACCES;       /* tell caller to try again later */
+               status = -EPFNOSUPPORT;
                dprintk("RPC: %5u %s: no more getport versions available\n",
                        task->tk_pid, __FUNCTION__);
                goto bailout_nofree;
        }
-       bind_version = rpcb_next_version[xprt->bind_index].rpc_vers;
+       bind_version = info[xprt->bind_index].rpc_vers;
 
        dprintk("RPC: %5u %s: trying rpcbind version %u\n",
                task->tk_pid, __FUNCTION__, bind_version);
 
+       rpcb_clnt = rpcb_create(clnt->cl_server, &addr, xprt->prot,
+                               bind_version, 0);
+       if (IS_ERR(rpcb_clnt)) {
+               status = PTR_ERR(rpcb_clnt);
+               dprintk("RPC: %5u %s: rpcb_create failed, error %ld\n",
+                       task->tk_pid, __FUNCTION__, PTR_ERR(rpcb_clnt));
+               goto bailout_nofree;
+       }
+
        map = kzalloc(sizeof(struct rpcbind_args), GFP_ATOMIC);
        if (!map) {
                status = -ENOMEM;
@@ -367,28 +389,19 @@ void rpcb_getport_async(struct rpc_task *task)
        map->r_prot = xprt->prot;
        map->r_port = 0;
        map->r_xprt = xprt_get(xprt);
-       map->r_netid = (xprt->prot == IPPROTO_TCP) ? RPCB_NETID_TCP :
-                                                  RPCB_NETID_UDP;
-       memcpy(&map->r_addr, rpc_peeraddr2str(clnt, RPC_DISPLAY_ADDR),
-                       sizeof(map->r_addr));
+       map->r_netid = rpc_peeraddr2str(clnt, RPC_DISPLAY_NETID);
+       memcpy(map->r_addr,
+              rpc_peeraddr2str(rpcb_clnt, RPC_DISPLAY_UNIVERSAL_ADDR),
+              sizeof(map->r_addr));
        map->r_owner = RPCB_OWNER_STRING;       /* ignored for GETADDR */
 
-       rpc_peeraddr(clnt, (void *)&addr, sizeof(addr));
-       rpcb_clnt = rpcb_create(clnt->cl_server, &addr, xprt->prot, bind_version, 0);
-       if (IS_ERR(rpcb_clnt)) {
-               status = PTR_ERR(rpcb_clnt);
-               dprintk("RPC: %5u %s: rpcb_create failed, error %ld\n",
-                       task->tk_pid, __FUNCTION__, PTR_ERR(rpcb_clnt));
-               goto bailout;
-       }
-
        child = rpc_run_task(rpcb_clnt, RPC_TASK_ASYNC, &rpcb_getport_ops, map);
        rpc_release_client(rpcb_clnt);
        if (IS_ERR(child)) {
                status = -EIO;
                dprintk("RPC: %5u %s: rpc_run_task failed\n",
                        task->tk_pid, __FUNCTION__);
-               goto bailout_nofree;
+               goto bailout;
        }
        rpc_put_task(child);
 
@@ -403,6 +416,7 @@ bailout_nofree:
 bailout_nowake:
        task->tk_status = status;
 }
+EXPORT_SYMBOL_GPL(rpcb_getport_async);
 
 /*
  * Rpcbind child task calls this callback via tk_exit.
@@ -413,6 +427,10 @@ static void rpcb_getport_done(struct rpc_task *child, void *data)
        struct rpc_xprt *xprt = map->r_xprt;
        int status = child->tk_status;
 
+       /* Garbage reply: retry with a lesser rpcbind version */
+       if (status == -EIO)
+               status = -EPROTONOSUPPORT;
+
        /* rpcbind server doesn't support this rpcbind protocol version */
        if (status == -EPROTONOSUPPORT)
                xprt->bind_index++;
@@ -490,16 +508,24 @@ static int rpcb_decode_getaddr(struct rpc_rqst *req, __be32 *p,
                               unsigned short *portp)
 {
        char *addr;
-       int addr_len, c, i, f, first, val;
+       u32 addr_len;
+       int c, i, f, first, val;
 
        *portp = 0;
-       addr_len = (unsigned int) ntohl(*p++);
-       if (addr_len > RPCB_MAXADDRLEN)                 /* sanity */
-               return -EINVAL;
-
-       dprintk("RPC:       rpcb_decode_getaddr returned string: '%s'\n",
-                       (char *) p);
-
+       addr_len = ntohl(*p++);
+
+       /*
+        * Simple sanity check.  The smallest possible universal
+        * address is an IPv4 address string containing 11 bytes.
+        */
+       if (addr_len < 11 || addr_len > RPCB_MAXADDRLEN)
+               goto out_err;
+
+       /*
+        * Start at the end and walk backwards until the first dot
+        * is encountered.  When the second dot is found, we have
+        * both parts of the port number.
+        */
        addr = (char *)p;
        val = 0;
        first = 1;
@@ -521,8 +547,19 @@ static int rpcb_decode_getaddr(struct rpc_rqst *req, __be32 *p,
                }
        }
 
+       /*
+        * Simple sanity check.  If we never saw a dot in the reply,
+        * then this was probably just garbage.
+        */
+       if (first)
+               goto out_err;
+
        dprintk("RPC:       rpcb_decode_getaddr port=%u\n", *portp);
        return 0;
+
+out_err:
+       dprintk("RPC:       rpcbind server returned malformed reply\n");
+       return -EIO;
 }
 
 #define RPCB_program_sz                (1u)
@@ -531,7 +568,7 @@ static int rpcb_decode_getaddr(struct rpc_rqst *req, __be32 *p,
 #define RPCB_port_sz           (1u)
 #define RPCB_boolean_sz                (1u)
 
-#define RPCB_netid_sz          (1+XDR_QUADLEN(RPCB_MAXNETIDLEN))
+#define RPCB_netid_sz          (1+XDR_QUADLEN(RPCBIND_MAXNETIDLEN))
 #define RPCB_addr_sz           (1+XDR_QUADLEN(RPCB_MAXADDRLEN))
 #define RPCB_ownerstring_sz    (1+XDR_QUADLEN(RPCB_MAXOWNERLEN))
 
@@ -593,6 +630,14 @@ static struct rpcb_info rpcb_next_version[] = {
        { 0, NULL },
 };
 
+static struct rpcb_info rpcb_next_version6[] = {       /* AF_INET6 list: no v2 entry */
+#ifdef CONFIG_SUNRPC_BIND34
+       { 4, &rpcb_procedures4[RPCBPROC_GETVERSADDR] },
+       { 3, &rpcb_procedures3[RPCBPROC_GETADDR] },
+#endif
+       { 0, NULL },    /* terminator: a NULL rpc_proc marks the end of the list */
+};
+
 static struct rpc_version rpcb_version2 = {
        .number         = 2,
        .nrprocs        = RPCB_HIGHPROC_2,
@@ -621,7 +666,7 @@ static struct rpc_version *rpcb_version[] = {
 
 static struct rpc_stat rpcb_stats;
 
-struct rpc_program rpcb_program = {
+static struct rpc_program rpcb_program = {
        .name           = "rpcbind",
        .number         = RPCBIND_PROGRAM,
        .nrvers         = ARRAY_SIZE(rpcb_version),
index 954d7ec..3c773c5 100644 (file)
@@ -777,6 +777,7 @@ void *rpc_malloc(struct rpc_task *task, size_t size)
                        task->tk_pid, size, buf);
        return &buf->data;
 }
+EXPORT_SYMBOL_GPL(rpc_malloc);
 
 /**
  * rpc_free - free buffer allocated via rpc_malloc
@@ -802,6 +803,7 @@ void rpc_free(void *buffer)
        else
                kfree(buf);
 }
+EXPORT_SYMBOL_GPL(rpc_free);
 
 /*
  * Creation and deletion of RPC task structures
index 1d377d1..97ac45f 100644 (file)
@@ -34,6 +34,7 @@ size_t xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len)
        desc->offset += len;
        return len;
 }
+EXPORT_SYMBOL_GPL(xdr_skb_read_bits);
 
 /**
  * xdr_skb_read_and_csum_bits - copy and checksum from skb to buffer
@@ -137,6 +138,7 @@ copy_tail:
 out:
        return copied;
 }
+EXPORT_SYMBOL_GPL(xdr_partial_copy_from_skb);
 
 /**
  * csum_partial_copy_to_xdr - checksum and copy data
@@ -179,3 +181,4 @@ no_checksum:
                return -1;
        return 0;
 }
+EXPORT_SYMBOL_GPL(csum_partial_copy_to_xdr);
index 384c4ad..33d89e8 100644 (file)
@@ -20,7 +20,7 @@
 #include <linux/sunrpc/auth.h>
 #include <linux/workqueue.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
-
+#include <linux/sunrpc/xprtsock.h>
 
 /* RPC scheduler */
 EXPORT_SYMBOL(rpc_execute);
index 8142fdb..31becbf 100644 (file)
@@ -17,6 +17,7 @@
 
 #include <linux/types.h>
 #include <linux/unistd.h>
+#include <linux/module.h>
 
 #include <linux/sunrpc/clnt.h>
 
@@ -40,6 +41,7 @@ rpc_init_rtt(struct rpc_rtt *rt, unsigned long timeo)
                rt->ntimeouts[i] = 0;
        }
 }
+EXPORT_SYMBOL_GPL(rpc_init_rtt);
 
 /*
  * NB: When computing the smoothed RTT and standard deviation,
@@ -75,6 +77,7 @@ rpc_update_rtt(struct rpc_rtt *rt, unsigned timer, long m)
        if (*sdrtt < RPC_RTO_MIN)
                *sdrtt = RPC_RTO_MIN;
 }
+EXPORT_SYMBOL_GPL(rpc_update_rtt);
 
 /*
  * Estimate rto for an nfs rpc sent via. an unreliable datagram.
@@ -103,3 +106,4 @@ rpc_calc_rto(struct rpc_rtt *rt, unsigned timer)
 
        return res;
 }
+EXPORT_SYMBOL_GPL(rpc_calc_rto);
index c8c2edc..282a9a2 100644 (file)
@@ -62,6 +62,9 @@ static inline void    do_xprt_reserve(struct rpc_task *);
 static void    xprt_connect_status(struct rpc_task *task);
 static int      __xprt_get_cong(struct rpc_xprt *, struct rpc_task *);
 
+static DEFINE_SPINLOCK(xprt_list_lock);        /* guards xprt_list */
+static LIST_HEAD(xprt_list);                   /* registered struct xprt_class entries */
+
 /*
  * The transport code maintains an estimate on the maximum number of out-
  * standing RPC requests, using a smoothed version of the congestion
@@ -81,6 +84,78 @@ static int      __xprt_get_cong(struct rpc_xprt *, struct rpc_task *);
 #define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd)
 
 /**
+ * xprt_register_transport - register a transport implementation
+ * @transport: transport class to add to the list of known transports
+ *
+ * If a transport implementation is loaded as a kernel module, it can
+ * call this interface to make itself known to the RPC client.
+ *
+ * Returns:
+ * 0:          transport successfully registered
+ * -EEXIST:    a transport with the same ident is already registered
+ * -EINVAL:    try_module_get(THIS_MODULE) failed (module being unloaded)
+ */
+int xprt_register_transport(struct xprt_class *transport)
+{
+       struct xprt_class *t;
+       int result;
+
+       result = -EEXIST;
+       spin_lock(&xprt_list_lock);
+       list_for_each_entry(t, &xprt_list, list) {
+               /* don't register the same transport class (same ident) twice */
+               if (t->ident == transport->ident)
+                       goto out;
+       }
+
+       result = -EINVAL;
+       if (try_module_get(THIS_MODULE)) {
+               list_add_tail(&transport->list, &xprt_list);
+               printk(KERN_INFO "RPC: Registered %s transport module.\n",
+                       transport->name);
+               result = 0;
+       }
+
+out:
+       spin_unlock(&xprt_list_lock);
+       return result;
+}
+EXPORT_SYMBOL_GPL(xprt_register_transport);
+
+/**
+ * xprt_unregister_transport - unregister a transport implementation
+ * @transport: transport to unregister
+ *
+ * Returns:
+ * 0:          transport successfully unregistered
+ * -ENOENT:    transport never registered
+ */
+int xprt_unregister_transport(struct xprt_class *transport)
+{
+       struct xprt_class *t;
+       int result;
+
+       result = 0;
+       spin_lock(&xprt_list_lock);
+       list_for_each_entry(t, &xprt_list, list) {
+               if (t == transport) {   /* match by pointer, not by ident */
+                       printk(KERN_INFO
+                               "RPC: Unregistered %s transport module.\n",
+                               transport->name);
+                       list_del_init(&transport->list);
+                       module_put(THIS_MODULE);
+                       goto out;
+               }
+       }
+       result = -ENOENT;       /* walked the whole list without a match */
+
+out:
+       spin_unlock(&xprt_list_lock);
+       return result;
+}
+EXPORT_SYMBOL_GPL(xprt_unregister_transport);
+
+/**
  * xprt_reserve_xprt - serialize write access to transports
  * @task: task that is requesting access to the transport
  *
@@ -118,6 +193,7 @@ out_sleep:
                rpc_sleep_on(&xprt->sending, task, NULL, NULL);
        return 0;
 }
+EXPORT_SYMBOL_GPL(xprt_reserve_xprt);
 
 static void xprt_clear_locked(struct rpc_xprt *xprt)
 {
@@ -167,6 +243,7 @@ out_sleep:
                rpc_sleep_on(&xprt->sending, task, NULL, NULL);
        return 0;
 }
+EXPORT_SYMBOL_GPL(xprt_reserve_xprt_cong);
 
 static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task)
 {
@@ -246,6 +323,7 @@ void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
                __xprt_lock_write_next(xprt);
        }
 }
+EXPORT_SYMBOL_GPL(xprt_release_xprt);
 
 /**
  * xprt_release_xprt_cong - allow other requests to use a transport
@@ -262,6 +340,7 @@ void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
                __xprt_lock_write_next_cong(xprt);
        }
 }
+EXPORT_SYMBOL_GPL(xprt_release_xprt_cong);
 
 static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task)
 {
@@ -314,6 +393,7 @@ void xprt_release_rqst_cong(struct rpc_task *task)
 {
        __xprt_put_cong(task->tk_xprt, task->tk_rqstp);
 }
+EXPORT_SYMBOL_GPL(xprt_release_rqst_cong);
 
 /**
  * xprt_adjust_cwnd - adjust transport congestion window
@@ -345,6 +425,7 @@ void xprt_adjust_cwnd(struct rpc_task *task, int result)
        xprt->cwnd = cwnd;
        __xprt_put_cong(xprt, req);
 }
+EXPORT_SYMBOL_GPL(xprt_adjust_cwnd);
 
 /**
  * xprt_wake_pending_tasks - wake all tasks on a transport's pending queue
@@ -359,6 +440,7 @@ void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status)
        else
                rpc_wake_up(&xprt->pending);
 }
+EXPORT_SYMBOL_GPL(xprt_wake_pending_tasks);
 
 /**
  * xprt_wait_for_buffer_space - wait for transport output buffer to clear
@@ -373,6 +455,7 @@ void xprt_wait_for_buffer_space(struct rpc_task *task)
        task->tk_timeout = req->rq_timeout;
        rpc_sleep_on(&xprt->pending, task, NULL, NULL);
 }
+EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space);
 
 /**
  * xprt_write_space - wake the task waiting for transport output buffer space
@@ -393,6 +476,7 @@ void xprt_write_space(struct rpc_xprt *xprt)
        }
        spin_unlock_bh(&xprt->transport_lock);
 }
+EXPORT_SYMBOL_GPL(xprt_write_space);
 
 /**
  * xprt_set_retrans_timeout_def - set a request's retransmit timeout
@@ -406,6 +490,7 @@ void xprt_set_retrans_timeout_def(struct rpc_task *task)
 {
        task->tk_timeout = task->tk_rqstp->rq_timeout;
 }
+EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_def);
 
 /*
  * xprt_set_retrans_timeout_rtt - set a request's retransmit timeout
@@ -425,6 +510,7 @@ void xprt_set_retrans_timeout_rtt(struct rpc_task *task)
        if (task->tk_timeout > max_timeout || task->tk_timeout == 0)
                task->tk_timeout = max_timeout;
 }
+EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_rtt);
 
 static void xprt_reset_majortimeo(struct rpc_rqst *req)
 {
@@ -500,6 +586,7 @@ void xprt_disconnect(struct rpc_xprt *xprt)
        xprt_wake_pending_tasks(xprt, -ENOTCONN);
        spin_unlock_bh(&xprt->transport_lock);
 }
+EXPORT_SYMBOL_GPL(xprt_disconnect);
 
 static void
 xprt_init_autodisconnect(unsigned long data)
@@ -610,6 +697,7 @@ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid)
        xprt->stat.bad_xids++;
        return NULL;
 }
+EXPORT_SYMBOL_GPL(xprt_lookup_rqst);
 
 /**
  * xprt_update_rtt - update an RPC client's RTT state after receiving a reply
@@ -629,6 +717,7 @@ void xprt_update_rtt(struct rpc_task *task)
                rpc_set_timeo(rtt, timer, req->rq_ntrans - 1);
        }
 }
+EXPORT_SYMBOL_GPL(xprt_update_rtt);
 
 /**
  * xprt_complete_rqst - called when reply processing is complete
@@ -653,6 +742,7 @@ void xprt_complete_rqst(struct rpc_task *task, int copied)
        req->rq_received = req->rq_private_buf.len = copied;
        rpc_wake_up_task(task);
 }
+EXPORT_SYMBOL_GPL(xprt_complete_rqst);
 
 static void xprt_timer(struct rpc_task *task)
 {
@@ -889,23 +979,25 @@ void xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long i
  * @args: rpc transport creation arguments
  *
  */
-struct rpc_xprt *xprt_create_transport(struct rpc_xprtsock_create *args)
+struct rpc_xprt *xprt_create_transport(struct xprt_create *args)
 {
        struct rpc_xprt *xprt;
        struct rpc_rqst *req;
+       struct xprt_class *t;
 
-       switch (args->proto) {
-       case IPPROTO_UDP:
-               xprt = xs_setup_udp(args);
-               break;
-       case IPPROTO_TCP:
-               xprt = xs_setup_tcp(args);
-               break;
-       default:
-               printk(KERN_ERR "RPC: unrecognized transport protocol: %d\n",
-                               args->proto);
-               return ERR_PTR(-EIO);
+       spin_lock(&xprt_list_lock);
+       list_for_each_entry(t, &xprt_list, list) {
+               if (t->ident == args->ident) {
+                       spin_unlock(&xprt_list_lock);
+                       goto found;
+               }
        }
+       spin_unlock(&xprt_list_lock);
+       printk(KERN_ERR "RPC: transport (%d) not supported\n", args->ident);
+       return ERR_PTR(-EIO);
+
+found:
+       xprt = t->setup(args);
        if (IS_ERR(xprt)) {
                dprintk("RPC:       xprt_create_transport: failed, %ld\n",
                                -PTR_ERR(xprt));
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
new file mode 100644 (file)
index 0000000..264f0fe
--- /dev/null
@@ -0,0 +1,3 @@
+obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o
+
+xprtrdma-y := transport.o rpc_rdma.o verbs.o
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
new file mode 100644 (file)
index 0000000..12db635
--- /dev/null
@@ -0,0 +1,868 @@
+/*
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * rpc_rdma.c
+ *
+ * This file contains the guts of the RPC RDMA protocol, and
+ * does marshaling/unmarshaling, etc. It is also where interfacing
+ * to the Linux RPC framework lives.
+ */
+
+#include "xprt_rdma.h"
+
+#include <linux/highmem.h>
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY       RPCDBG_TRANS
+#endif
+
+enum rpcrdma_chunktype {       /* how a request's data is transferred */
+       rpcrdma_noch = 0,       /* pure inline, no chunks */
+       rpcrdma_readch,         /* some argument via rdma read */
+       rpcrdma_areadch,        /* entire request via rdma read */
+       rpcrdma_writech,        /* some result via rdma write */
+       rpcrdma_replych         /* entire reply via rdma write */
+};
+
+#ifdef RPC_DEBUG
+static const char transfertypes[][12] = {       /* debug names, indexed by enum rpcrdma_chunktype */
+       "pure inline",  /* no chunks */
+       " read chunk",  /* some argument via rdma read */
+       "*read chunk",  /* entire request via rdma read */
+       "write chunk",  /* some result via rdma write */
+       "reply chunk"   /* entire reply via rdma write */
+};
+#endif /* RPC_DEBUG */
+
+/*
+ * Chunk assembly from upper layer xdr_buf.
+ *
+ * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk
+ * elements. Segments are then coalesced when registered, if possible
+ * within the selected memreg mode.
+ *
+ * Note, this routine is never called if the connection's memory
+ * registration strategy is 0 (bounce buffers).
+ *
+ * @xdrbuf: buffer to describe (head iovec, page list, tail iovec)
+ * @pos:    byte position at which the chunk starts; the head iovec is
+ *          emitted as a segment only when this is 0
+ * @type:   chunk type being built (not referenced in the body)
+ * @seg:    output array of segments to fill in
+ * @nsegs:  capacity of @seg
+ *
+ * The head and tail become one segment each, described by address
+ * (mr_page NULL, mr_offset set to iov_base). The page list becomes one
+ * segment per page: the first starts at page_base and is limited to
+ * the rest of that page, later ones start at offset 0 and cover at
+ * most PAGE_SIZE. The tail is emitted only while pos is still short
+ * of xdrbuf->len.
+ *
+ * Returns the number of segments filled in, which is 0 when there was
+ * nothing to describe, and also 0 when @seg is too small.
+ */
+
+static int
+rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, int pos,
+       enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs)
+{
+       int len, n = 0, p;
+
+       if (pos == 0 && xdrbuf->head[0].iov_len) {
+               seg[n].mr_page = NULL;
+               seg[n].mr_offset = xdrbuf->head[0].iov_base;
+               seg[n].mr_len = xdrbuf->head[0].iov_len;
+               pos += xdrbuf->head[0].iov_len;
+               ++n;
+       }
+
+       if (xdrbuf->page_len && (xdrbuf->pages[0] != NULL)) {
+               if (n == nsegs)
+                       return 0;
+               seg[n].mr_page = xdrbuf->pages[0];
+               seg[n].mr_offset = (void *)(unsigned long) xdrbuf->page_base;
+               seg[n].mr_len = min_t(u32,
+                       PAGE_SIZE - xdrbuf->page_base, xdrbuf->page_len);
+               len = xdrbuf->page_len - seg[n].mr_len;
+               pos += len;     /* NOTE(review): leaves out the first segment's mr_len -- confirm intended */
+               ++n;
+               p = 1;
+               while (len > 0) {
+                       if (n == nsegs)
+                               return 0;
+                       seg[n].mr_page = xdrbuf->pages[p];
+                       seg[n].mr_offset = NULL;
+                       seg[n].mr_len = min_t(u32, PAGE_SIZE, len);
+                       len -= seg[n].mr_len;
+                       ++n;
+                       ++p;
+               }
+       }
+
+       if (pos < xdrbuf->len && xdrbuf->tail[0].iov_len) {
+               if (n == nsegs)
+                       return 0;
+               seg[n].mr_page = NULL;
+               seg[n].mr_offset = xdrbuf->tail[0].iov_base;
+               seg[n].mr_len = xdrbuf->tail[0].iov_len;
+               pos += xdrbuf->tail[0].iov_len;
+               ++n;
+       }
+
+       if (pos < xdrbuf->len)  /* diagnostic only; n is still returned */
+               dprintk("RPC:       %s: marshaled only %d of %d\n",
+                               __func__, pos, xdrbuf->len);
+
+       return n;
+}
+
+/*
+ * Create read/write chunk lists, and reply chunks, for RDMA
+ *
+ *   Assume check against THRESHOLD has been done, and chunks are required.
+ *   Assume only encoding one list entry for read|write chunks. The NFSv3
+ *     protocol is simple enough to allow this as it only has a single "bulk
+ *     result" in each procedure - complicated NFSv4 COMPOUNDs are not. (The
+ *     RDMA/Sessions NFSv4 proposal addresses this for future v4 revs.)
+ *
+ * When used for a single reply chunk (which is a special write
+ * chunk used for the entire reply, rather than just the data), it
+ * is used primarily for READDIR and READLINK which would otherwise
+ * be severely size-limited by a small rdma inline read max. The server
+ * response will come back as an RDMA Write, followed by a message
+ * of type RDMA_NOMSG carrying the xid and length. As a result, reply
+ * chunks do not provide data alignment, however they do not require
+ * "fixup" (moving the response to the upper layer buffer) either.
+ *
+ * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
+ *
+ *  Read chunklist (a linked list):
+ *   N elements, position P (same P for all chunks of same arg!):
+ *    1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
+ *
+ *  Write chunklist (a list of (one) counted array):
+ *   N elements:
+ *    1 - N - HLOO - HLOO - ... - HLOO - 0
+ *
+ *  Reply chunk (a counted array):
+ *   N elements:
+ *    1 - N - HLOO - HLOO - ... - HLOO
+ *
+ * @rqst:    request whose rl_segments array receives the segments
+ * @target:  xdr_buf to expose to the server
+ * @headerp: RPC/RDMA header; the lists are written at rm_body.rm_chunks
+ * @type:    which kind of chunk to encode
+ *
+ * Returns the header length in bytes, measured from @headerp to the
+ * end of the encoded lists, and records the chunk count in rl_nchunks.
+ * Returns 0 if the buffer cannot be converted to segments or if
+ * registration fails; in the latter case every chunk registered so far
+ * is deregistered first.
+ */
+
+static unsigned int
+rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
+               struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type)
+{
+       struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+       struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_task->tk_xprt);
+       int nsegs, nchunks = 0;
+       int pos;
+       struct rpcrdma_mr_seg *seg = req->rl_segments;
+       struct rpcrdma_read_chunk *cur_rchunk = NULL;
+       struct rpcrdma_write_array *warray = NULL;
+       struct rpcrdma_write_chunk *cur_wchunk = NULL;
+       u32 *iptr = headerp->rm_body.rm_chunks;
+
+       if (type == rpcrdma_readch || type == rpcrdma_areadch) {
+               /* a read chunk - server will RDMA Read our memory */
+               cur_rchunk = (struct rpcrdma_read_chunk *) iptr;
+       } else {
+               /* a write or reply chunk - server will RDMA Write our memory */
+               *iptr++ = xdr_zero;     /* encode a NULL read chunk list */
+               if (type == rpcrdma_replych)
+                       *iptr++ = xdr_zero;     /* a NULL write chunk list */
+               warray = (struct rpcrdma_write_array *) iptr;
+               cur_wchunk = (struct rpcrdma_write_chunk *) (warray + 1);
+       }
+
+       if (type == rpcrdma_replych || type == rpcrdma_areadch)
+               pos = 0;        /* chunk covers the buffer from its head */
+       else
+               pos = target->head[0].iov_len;  /* chunk starts after the head */
+
+       nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS);
+       if (nsegs == 0)
+               return 0;
+
+       do {
+               /* bind/register the memory, then build chunk from result. */
+               int n = rpcrdma_register_external(seg, nsegs,
+                                               cur_wchunk != NULL, r_xprt);
+               if (n <= 0)
+                       goto out;
+               if (cur_rchunk) {       /* read */
+                       cur_rchunk->rc_discrim = xdr_one;
+                       /* all read chunks have the same "position" */
+                       cur_rchunk->rc_position = htonl(pos);
+                       cur_rchunk->rc_target.rs_handle = htonl(seg->mr_rkey);
+                       cur_rchunk->rc_target.rs_length = htonl(seg->mr_len);
+                       xdr_encode_hyper(
+                                       (u32 *)&cur_rchunk->rc_target.rs_offset,
+                                       seg->mr_base);
+                       dprintk("RPC:       %s: read chunk "
+                               "elem %d@0x%llx:0x%x pos %d (%s)\n", __func__,
+                               seg->mr_len, seg->mr_base, seg->mr_rkey, pos,
+                               n < nsegs ? "more" : "last");
+                       cur_rchunk++;
+                       r_xprt->rx_stats.read_chunk_count++;
+               } else {                /* write/reply */
+                       cur_wchunk->wc_target.rs_handle = htonl(seg->mr_rkey);
+                       cur_wchunk->wc_target.rs_length = htonl(seg->mr_len);
+                       xdr_encode_hyper(
+                                       (u32 *)&cur_wchunk->wc_target.rs_offset,
+                                       seg->mr_base);
+                       dprintk("RPC:       %s: %s chunk "
+                               "elem %d@0x%llx:0x%x (%s)\n", __func__,
+                               (type == rpcrdma_replych) ? "reply" : "write",
+                               seg->mr_len, seg->mr_base, seg->mr_rkey,
+                               n < nsegs ? "more" : "last");
+                       cur_wchunk++;
+                       if (type == rpcrdma_replych)
+                               r_xprt->rx_stats.reply_chunk_count++;
+                       else
+                               r_xprt->rx_stats.write_chunk_count++;
+                       r_xprt->rx_stats.total_rdma_request += seg->mr_len;
+               }
+               nchunks++;
+               seg   += n;     /* skip the n segments this chunk took */
+               nsegs -= n;
+       } while (nsegs);
+
+       /* success. all failures return above */
+       req->rl_nchunks = nchunks;
+
+       BUG_ON(nchunks == 0);
+
+       /*
+        * finish off header. If write, marshal discrim and nchunks.
+        */
+       if (cur_rchunk) {
+               iptr = (u32 *) cur_rchunk;
+               *iptr++ = xdr_zero;     /* finish the read chunk list */
+               *iptr++ = xdr_zero;     /* encode a NULL write chunk list */
+               *iptr++ = xdr_zero;     /* encode a NULL reply chunk */
+       } else {
+               warray->wc_discrim = xdr_one;
+               warray->wc_nchunks = htonl(nchunks);
+               iptr = (u32 *) cur_wchunk;
+               if (type == rpcrdma_writech) {
+                       *iptr++ = xdr_zero; /* finish the write chunk list */
+                       *iptr++ = xdr_zero; /* encode a NULL reply chunk */
+               }
+       }
+
+       /*
+        * Return header size.
+        */
+       return (unsigned char *)iptr - (unsigned char *)headerp;
+
+out:
+       for (pos = 0; nchunks--;)       /* pos is reused as an index into rl_segments */
+               pos += rpcrdma_deregister_external(
+                               &req->rl_segments[pos], r_xprt, NULL);
+       return 0;
+}
+
+/*
+ * Copy write data inline.
+ * This function is used for "small" requests. Data which is passed
+ * to RPC via iovecs (or page list) is copied directly into the
+ * pre-registered memory buffer for this request. For small amounts
+ * of data, this is efficient. The cutoff value is tunable.
+ *
+ * @rqst: request whose page list and tail are appended to rq_svec[0]
+ * @pad:  requested alignment for the data, or 0 for none
+ *
+ * The tail is moved to its final position (just past where the page
+ * data will end) before the pages are copied. The tail may already sit
+ * in this same buffer right behind the head, in which case copying the
+ * pages first would overwrite it before it could be relocated. memmove
+ * is used because the old and new tail locations may overlap.
+ *
+ * Returns the pad amount to insert, or 0 if this request should not
+ * be padded. rq_svec[0].iov_len grows by the number of bytes pulled up.
+ */
+static int
+rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
+{
+       int i, npages, curlen;
+       int copy_len;
+       unsigned char *srcp, *destp;
+       struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
+
+       destp = rqst->rq_svec[0].iov_base;
+       curlen = rqst->rq_svec[0].iov_len;
+       destp += curlen;
+       /*
+        * Do optional padding where it makes sense. Alignment of write
+        * payload can help the server, if our setting is accurate.
+        */
+       pad -= (curlen + 36/*sizeof(struct rpcrdma_msg_padded)*/);
+       if (pad < 0 || rqst->rq_slen - curlen < RPCRDMA_INLINE_PAD_THRESH)
+               pad = 0;        /* don't pad this request */
+
+       dprintk("RPC:       %s: pad %d destp 0x%p len %d hdrlen %d\n",
+               __func__, pad, destp, rqst->rq_slen, curlen);
+
+       copy_len = rqst->rq_snd_buf.page_len;
+
+       /*
+        * Relocate the tail first: its current home may be exactly where
+        * the page data is about to land.
+        */
+       if (rqst->rq_snd_buf.tail[0].iov_len) {
+               curlen = rqst->rq_snd_buf.tail[0].iov_len;
+               if (destp + copy_len != rqst->rq_snd_buf.tail[0].iov_base) {
+                       memmove(destp + copy_len,
+                               rqst->rq_snd_buf.tail[0].iov_base, curlen);
+                       r_xprt->rx_stats.pullup_copy_count += curlen;
+               }
+               dprintk("RPC:       %s: tail destp 0x%p len %d curlen %d\n",
+                       __func__, destp + copy_len, copy_len, curlen);
+               rqst->rq_svec[0].iov_len += curlen;
+       }
+
+       /* now fill the gap between head and tail from the page list */
+       r_xprt->rx_stats.pullup_copy_count += copy_len;
+       npages = PAGE_ALIGN(rqst->rq_snd_buf.page_base+copy_len) >> PAGE_SHIFT;
+       for (i = 0; copy_len && i < npages; i++) {
+               /* only the first page starts at page_base */
+               if (i == 0)
+                       curlen = PAGE_SIZE - rqst->rq_snd_buf.page_base;
+               else
+                       curlen = PAGE_SIZE;
+               if (curlen > copy_len)
+                       curlen = copy_len;
+               dprintk("RPC:       %s: page %d destp 0x%p len %d curlen %d\n",
+                       __func__, i, destp, copy_len, curlen);
+               srcp = kmap_atomic(rqst->rq_snd_buf.pages[i],
+                                       KM_SKB_SUNRPC_DATA);
+               if (i == 0)
+                       memcpy(destp, srcp+rqst->rq_snd_buf.page_base, curlen);
+               else
+                       memcpy(destp, srcp, curlen);
+               kunmap_atomic(srcp, KM_SKB_SUNRPC_DATA);
+               rqst->rq_svec[0].iov_len += curlen;
+               destp += curlen;
+               copy_len -= curlen;
+       }
+       /* header now contains entire send message */
+       return pad;
+}
+
+/*
+ * Marshal a request: the primary job of this routine is to choose
+ * the transfer modes. See comments below.
+ *
+ * Uses multiple RDMA IOVs for a request:
+ *  [0] -- RPC RDMA header, which uses memory from the *start* of the
+ *         preregistered buffer that already holds the RPC data in
+ *         its middle.
+ *  [1] -- the RPC header/data, marshaled by RPC and the NFS protocol.
+ *  [2] -- optional padding.
+ *  [3] -- if padded, header only in [1] and data here.
+ *
+ * @rqst: request to marshal; its RPC/RDMA header is built at rl_base
+ *
+ * Returns 0 on success, with rl_send_iov[] and rl_niovs (2, or 4 when
+ * padded) filled in. Returns -1 if chunks are needed but the memory
+ * registration strategy is bounce buffers, or if chunk creation
+ * reports a zero header length.
+ */
+
+int
+rpcrdma_marshal_req(struct rpc_rqst *rqst)
+{
+       struct rpc_xprt *xprt = rqst->rq_task->tk_xprt;
+       struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+       struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+       char *base;
+       size_t hdrlen, rpclen, padlen;
+       enum rpcrdma_chunktype rtype, wtype;
+       struct rpcrdma_msg *headerp;
+
+       /*
+        * rpclen gets amount of data in first buffer, which is the
+        * pre-registered buffer.
+        */
+       base = rqst->rq_svec[0].iov_base;
+       rpclen = rqst->rq_svec[0].iov_len;
+
+       /* build RDMA header in private area at front */
+       headerp = (struct rpcrdma_msg *) req->rl_base;
+       /* don't htonl XID, it's already done in request */
+       headerp->rm_xid = rqst->rq_xid;
+       headerp->rm_vers = xdr_one;
+       headerp->rm_credit = htonl(r_xprt->rx_buf.rb_max_requests);
+       headerp->rm_type = __constant_htonl(RDMA_MSG);
+
+       /*
+        * Chunks needed for results?
+        *
+        * o If the expected result is under the inline threshold, all ops
+        *   return as inline (but see later).
+        * o Large non-read ops return as a single reply chunk.
+        * o Large read ops return data as write chunk(s), header as inline.
+        *
+        * Note: the NFS code sending down multiple result segments implies
+        * the op is one of read, readdir[plus], readlink or NFSv4 getacl.
+        */
+
+       /*
+        * This code can handle read chunks, write chunks OR reply
+        * chunks -- only one type. If the request is too big to fit
+        * inline, then we will choose read chunks. If the request is
+        * a READ, then use write chunks to separate the file data
+        * into pages; otherwise use reply chunks.
+        */
+       if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
+               wtype = rpcrdma_noch;
+       else if (rqst->rq_rcv_buf.page_len == 0)
+               wtype = rpcrdma_replych;
+       else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
+               wtype = rpcrdma_writech;
+       else
+               wtype = rpcrdma_replych;
+
+       /*
+        * Chunks needed for arguments?
+        *
+        * o If the total request is under the inline threshold, all ops
+        *   are sent as inline.
+        * o Large non-write ops are sent with the entire message as a
+        *   single read chunk (protocol 0-position special case).
+        * o Large write ops transmit data as read chunk(s), header as
+        *   inline.
+        *
+        * Note: the NFS code sending down multiple argument segments
+        * implies the op is a write.
+        * TBD check NFSv4 setacl
+        */
+       if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
+               rtype = rpcrdma_noch;
+       else if (rqst->rq_snd_buf.page_len == 0)
+               rtype = rpcrdma_areadch;
+       else
+               rtype = rpcrdma_readch;
+
+       /* The following simplification is not true forever */
+       if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
+               wtype = rpcrdma_noch;
+       BUG_ON(rtype != rpcrdma_noch && wtype != rpcrdma_noch);
+
+       if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS &&
+           (rtype != rpcrdma_noch || wtype != rpcrdma_noch)) {
+               /* forced to "pure inline"? */
+               dprintk("RPC:       %s: too much data (%d/%d) for inline\n",
+                       __func__, rqst->rq_rcv_buf.len, rqst->rq_snd_buf.len);
+               return -1;
+       }
+
+       hdrlen = 28; /*sizeof *headerp;*/
+       padlen = 0;
+
+       /*
+        * Pull up any extra send data into the preregistered buffer.
+        * When padding is in use and applies to the transfer, insert
+        * it and change the message type.
+        */
+       if (rtype == rpcrdma_noch) {
+
+               padlen = rpcrdma_inline_pullup(rqst,
+                                               RPCRDMA_INLINE_PAD_VALUE(rqst));
+
+               if (padlen) {
+                       headerp->rm_type = __constant_htonl(RDMA_MSGP);
+                       headerp->rm_body.rm_padded.rm_align =
+                               htonl(RPCRDMA_INLINE_PAD_VALUE(rqst));
+                       headerp->rm_body.rm_padded.rm_thresh =
+                               __constant_htonl(RPCRDMA_INLINE_PAD_THRESH);
+                       headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero;
+                       headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
+                       headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
+                       hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
+                       BUG_ON(wtype != rpcrdma_noch);
+
+               } else {
+                       headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
+                       headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
+                       headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
+                       /* new length after pullup */
+                       rpclen = rqst->rq_svec[0].iov_len;
+                       /*
+                        * Currently we try to not actually use read inline.
+                        * Reply chunks have the desirable property that
+                        * they land, packed, directly in the target buffers
+                        * without headers, so they require no fixup. The
+                        * additional RDMA Write op sends the same amount
+                        * of data, streams on-the-wire and adds no overhead
+                        * on receive. Therefore, we request a reply chunk
+                        * for non-writes wherever feasible and efficient.
+                        */
+                       if (wtype == rpcrdma_noch &&
+                           r_xprt->rx_ia.ri_memreg_strategy > RPCRDMA_REGISTER)
+                               wtype = rpcrdma_replych;
+               }
+       }
+
+       /*
+        * Marshal chunks. This routine will return the header length
+        * consumed by marshaling.
+        */
+       if (rtype != rpcrdma_noch) {
+               hdrlen = rpcrdma_create_chunks(rqst,
+                                       &rqst->rq_snd_buf, headerp, rtype);
+               wtype = rtype;  /* simplify dprintk */
+
+       } else if (wtype != rpcrdma_noch) {
+               hdrlen = rpcrdma_create_chunks(rqst,
+                                       &rqst->rq_rcv_buf, headerp, wtype);
+       }
+
+       if (hdrlen == 0)        /* rpcrdma_create_chunks() failed */
+               return -1;
+
+       dprintk("RPC:       %s: %s: hdrlen %zd rpclen %zd padlen %zd\n"
+               "                   headerp 0x%p base 0x%p lkey 0x%x\n",
+               __func__, transfertypes[wtype], hdrlen, rpclen, padlen,
+               headerp, base, req->rl_iov.lkey);
+
+       /*
+        * initialize send_iov's - normally only two: rdma chunk header and
+        * single preregistered RPC header buffer, but if padding is present,
+        * then use a preregistered (and zeroed) pad buffer between the RPC
+        * header and any write data. In all non-rdma cases, any following
+        * data has been copied into the RPC header buffer.
+        */
+       req->rl_send_iov[0].addr = req->rl_iov.addr;
+       req->rl_send_iov[0].length = hdrlen;
+       req->rl_send_iov[0].lkey = req->rl_iov.lkey;
+
+       req->rl_send_iov[1].addr = req->rl_iov.addr + (base - req->rl_base);
+       req->rl_send_iov[1].length = rpclen;
+       req->rl_send_iov[1].lkey = req->rl_iov.lkey;
+
+       req->rl_niovs = 2;
+
+       if (padlen) {
+               struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+
+               req->rl_send_iov[2].addr = ep->rep_pad.addr;
+               req->rl_send_iov[2].length = padlen;
+               req->rl_send_iov[2].lkey = ep->rep_pad.lkey;
+
+               req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen;
+               req->rl_send_iov[3].length = rqst->rq_slen - rpclen;
+               req->rl_send_iov[3].lkey = req->rl_iov.lkey;
+
+               req->rl_niovs = 4;
+       }
+
+       return 0;
+}
+
+/*
+ * Chase down a received write or reply chunklist to get length
+ * RDMA'd by server. See map at rpcrdma_create_chunks()! :-)
+ *
+ * @rep:     received reply; rr_base and rr_len bound the valid bytes
+ * @max:     largest segment count the caller will accept
+ * @wrchunk: nonzero if the array belongs to a write chunk list, which
+ *           must be followed by a zero terminator word
+ * @iptrp:   in: points at the array count; out (on success only):
+ *           points just past the array and, for write chunks, its
+ *           terminator
+ *
+ * The count word, the whole array and the terminator are checked
+ * against the end of the received data before any of them is read,
+ * so a truncated reply is rejected without parsing bytes the server
+ * never sent.
+ *
+ * Returns the sum of the segment lengths, or -1 if the array is
+ * oversized, truncated, unterminated, or its lengths wrap when summed.
+ */
+static int
+rpcrdma_count_chunks(struct rpcrdma_rep *rep, int max, int wrchunk, u32 **iptrp)
+{
+       unsigned int i, total_len;
+       struct rpcrdma_write_chunk *cur_wchunk;
+       char *array_end;
+
+       /* the count word itself must lie within the received data */
+       if ((char *) (*iptrp + 1) > rep->rr_base + rep->rr_len)
+               return -1;
+       i = ntohl(**iptrp);     /* get array count */
+       if (max < 0 || i > (unsigned int) max)
+               return -1;
+       cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1);
+
+       /* i is bounded by max here, so this pointer math cannot wrap */
+       array_end = (char *) (cur_wchunk + i);
+       if (wrchunk)
+               array_end += sizeof(u32);       /* room for the terminator */
+       if (array_end > rep->rr_base + rep->rr_len)
+               return -1;
+
+       total_len = 0;
+       while (i--) {
+               struct rpcrdma_segment *seg = &cur_wchunk->wc_target;
+               unsigned int seglen = ntohl(seg->rs_length);
+
+               ifdebug(FACILITY) {
+                       u64 off;
+                       xdr_decode_hyper((u32 *)&seg->rs_offset, &off);
+                       dprintk("RPC:       %s: chunk %d@0x%llx:0x%x\n",
+                               __func__,
+                               ntohl(seg->rs_length),
+                               off,
+                               ntohl(seg->rs_handle));
+               }
+               /* a wrapped sum would masquerade as a small valid length */
+               if (total_len + seglen < total_len)
+                       return -1;
+               total_len += seglen;
+               ++cur_wchunk;
+       }
+       /* check and adjust for properly terminated write chunk */
+       if (wrchunk) {
+               u32 *w = (u32 *) cur_wchunk;
+               if (*w++ != xdr_zero)
+                       return -1;
+               cur_wchunk = (struct rpcrdma_write_chunk *) w;
+       }
+
+       *iptrp = (u32 *) cur_wchunk;
+       return total_len;
+}
+
+/*
+ * Scatter inline received data back into provided iov's.
+ *
+ * @rqst:     request whose rq_rcv_buf describes the destinations
+ * @srcp:     start of the inline data
+ * @copy_len: number of inline bytes available at @srcp
+ *
+ * The head iovec is not copied: its iov_base is repointed at @srcp,
+ * and its iov_len is trimmed if fewer bytes arrived. The remaining
+ * bytes are copied into the page list and then into the tail iovec;
+ * page_len and the tail's iov_len are set to the amounts actually
+ * filled (0 when unused). Bytes that fit nowhere are only reported
+ * via dprintk. Finally rq_private_buf is refreshed from rq_rcv_buf.
+ */
+static void
+rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len)
+{
+       int i, npages, curlen, olen;
+       char *destp;
+
+       curlen = rqst->rq_rcv_buf.head[0].iov_len;
+       if (curlen > copy_len) {        /* write chunk header fixup */
+               curlen = copy_len;
+               rqst->rq_rcv_buf.head[0].iov_len = curlen;
+       }
+
+       dprintk("RPC:       %s: srcp 0x%p len %d hdrlen %d\n",
+               __func__, srcp, copy_len, curlen);
+
+       /* Shift pointer for first receive segment only */
+       rqst->rq_rcv_buf.head[0].iov_base = srcp;
+       srcp += curlen;
+       copy_len -= curlen;
+
+       olen = copy_len;        /* bytes left after the head, for stats and page_len */
+       i = 0;
+       rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen;
+       if (copy_len && rqst->rq_rcv_buf.page_len) {
+               npages = PAGE_ALIGN(rqst->rq_rcv_buf.page_base +
+                       rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT;
+               for (; i < npages; i++) {
+                       if (i == 0)
+                               curlen = PAGE_SIZE - rqst->rq_rcv_buf.page_base;
+                       else
+                               curlen = PAGE_SIZE;
+                       if (curlen > copy_len)
+                               curlen = copy_len;
+                       dprintk("RPC:       %s: page %d"
+                               " srcp 0x%p len %d curlen %d\n",
+                               __func__, i, srcp, copy_len, curlen);
+                       destp = kmap_atomic(rqst->rq_rcv_buf.pages[i],
+                                               KM_SKB_SUNRPC_DATA);
+                       if (i == 0)
+                               memcpy(destp + rqst->rq_rcv_buf.page_base,
+                                               srcp, curlen);
+                       else
+                               memcpy(destp, srcp, curlen);
+                       flush_dcache_page(rqst->rq_rcv_buf.pages[i]);
+                       kunmap_atomic(destp, KM_SKB_SUNRPC_DATA);
+                       srcp += curlen;
+                       copy_len -= curlen;
+                       if (copy_len == 0)
+                               break;
+               }
+               rqst->rq_rcv_buf.page_len = olen - copy_len;    /* bytes placed in pages */
+       } else
+               rqst->rq_rcv_buf.page_len = 0;
+
+       if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) {
+               curlen = copy_len;
+               if (curlen > rqst->rq_rcv_buf.tail[0].iov_len)
+                       curlen = rqst->rq_rcv_buf.tail[0].iov_len;
+               if (rqst->rq_rcv_buf.tail[0].iov_base != srcp)
+                       memcpy(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen);
+               dprintk("RPC:       %s: tail srcp 0x%p len %d curlen %d\n",
+                       __func__, srcp, copy_len, curlen);
+               rqst->rq_rcv_buf.tail[0].iov_len = curlen;
+               copy_len -= curlen; ++i;
+       } else
+               rqst->rq_rcv_buf.tail[0].iov_len = 0;
+
+       if (copy_len)   /* data left over that no iov could take */
+               dprintk("RPC:       %s: %d bytes in"
+                       " %d extra segments (%d lost)\n",
+                       __func__, olen, i, copy_len);
+
+       /* TBD avoid a warning from call_decode() */
+       rqst->rq_private_buf = rqst->rq_rcv_buf;
+}
+
+/*
+ * Connection state change upcall.
+ *
+ * Invoked when an asynchronous event alters the state of the
+ * connection. Under the transport lock, the xprt's connected flag is
+ * brought in line with ep->rep_connected, and pending tasks are woken
+ * only when the flag really flips: with status 0 on the way up, with
+ * the rep_connected value on the way down. The rpc timers take care
+ * of everything else.
+ */
+void
+rpcrdma_conn_func(struct rpcrdma_ep *ep)
+{
+       struct rpc_xprt *xprt = ep->rep_xprt;
+
+       spin_lock_bh(&xprt->transport_lock);
+       if (ep->rep_connected <= 0) {
+               /* going down: report the reason to whoever is waiting */
+               if (xprt_test_and_clear_connected(xprt))
+                       xprt_wake_pending_tasks(xprt, ep->rep_connected);
+       } else if (!xprt_test_and_set_connected(xprt)) {
+               /* coming up: nothing to do if we were connected already */
+               xprt_wake_pending_tasks(xprt, 0);
+       }
+       spin_unlock_bh(&xprt->transport_lock);
+}
+
+/*
+ * This function is called when memory window unbind which we are waiting
+ * for completes. Just use rr_func (zeroed by upcall) to signal completion.
+ *
+ * @rep: reply buffer whose rr_unbind wait queue is woken
+ */
+static void
+rpcrdma_unbind_func(struct rpcrdma_rep *rep)
+{
+       wake_up(&rep->rr_unbind);
+}
+
+/*
+ * Called as a tasklet to do req/reply match and complete a request
+ * Errors must result in the RPC task either being awakened, or
+ * allowed to timeout, to discover the errors at that time.
+ */
+void
+rpcrdma_reply_handler(struct rpcrdma_rep *rep)
+{
+       struct rpcrdma_msg *headerp;
+       struct rpcrdma_req *req;
+       struct rpc_rqst *rqst;
+       struct rpc_xprt *xprt = rep->rr_xprt;
+       struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+       u32 *iptr;
+       int i, rdmalen, status;
+
+       /* Check status. If bad, signal disconnect and return rep to pool */
+       if (rep->rr_len == ~0U) {
+               rpcrdma_recv_buffer_put(rep);
+               if (r_xprt->rx_ep.rep_connected == 1) {
+                       r_xprt->rx_ep.rep_connected = -EIO;
+                       rpcrdma_conn_func(&r_xprt->rx_ep);
+               }
+               return;
+       }
+       if (rep->rr_len < 28) {
+               dprintk("RPC:       %s: short/invalid reply\n", __func__);
+               goto repost;
+       }
+       headerp = (struct rpcrdma_msg *) rep->rr_base;
+       if (headerp->rm_vers != xdr_one) {
+               dprintk("RPC:       %s: invalid version %d\n",
+                       __func__, ntohl(headerp->rm_vers));
+               goto repost;
+       }
+
+       /* Get XID and try for a match. */
+       spin_lock(&xprt->transport_lock);
+       rqst = xprt_lookup_rqst(xprt, headerp->rm_xid);
+       if (rqst == NULL) {
+               spin_unlock(&xprt->transport_lock);
+               dprintk("RPC:       %s: reply 0x%p failed "
+                       "to match any request xid 0x%08x len %d\n",
+                       __func__, rep, headerp->rm_xid, rep->rr_len);
+repost:
+               r_xprt->rx_stats.bad_reply_count++;
+               rep->rr_func = rpcrdma_reply_handler;
+               if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
+                       rpcrdma_recv_buffer_put(rep);
+
+               return;
+       }
+
+       /* get request object */
+       req = rpcr_to_rdmar(rqst);
+
+       dprintk("RPC:       %s: reply 0x%p completes request 0x%p\n"
+               "                   RPC request 0x%p xid 0x%08x\n",
+                       __func__, rep, req, rqst, headerp->rm_xid);
+
+       BUG_ON(!req || req->rl_reply);
+
+       /* from here on, the reply is no longer an orphan */
+       req->rl_reply = rep;
+
+       /* check for expected message types */
+       /* The order of some of these tests is important. */
+       switch (headerp->rm_type) {
+       case __constant_htonl(RDMA_MSG):
+               /* never expect read chunks */
+               /* never expect reply chunks (two ways to check) */
+               /* never expect write chunks without having offered RDMA */
+               if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
+                   (headerp->rm_body.rm_chunks[1] == xdr_zero &&
+                    headerp->rm_body.rm_chunks[2] != xdr_zero) ||
+                   (headerp->rm_body.rm_chunks[1] != xdr_zero &&
+                    req->rl_nchunks == 0))
+                       goto badheader;
+               if (headerp->rm_body.rm_chunks[1] != xdr_zero) {
+                       /* count any expected write chunks in read reply */
+                       /* start at write chunk array count */
+                       iptr = &headerp->rm_body.rm_chunks[2];
+                       rdmalen = rpcrdma_count_chunks(rep,
+                                               req->rl_nchunks, 1, &iptr);
+                       /* check for validity, and no reply chunk after */
+                       if (rdmalen < 0 || *iptr++ != xdr_zero)
+                               goto badheader;
+                       rep->rr_len -=
+                           ((unsigned char *)iptr - (unsigned char *)headerp);
+                       status = rep->rr_len + rdmalen;
+                       r_xprt->rx_stats.total_rdma_reply += rdmalen;
+               } else {
+                       /* else ordinary inline */
+                       iptr = (u32 *)((unsigned char *)headerp + 28);
+                       rep->rr_len -= 28; /*sizeof *headerp;*/
+                       status = rep->rr_len;
+               }
+               /* Fix up the rpc results for upper layer */
+               rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len);
+               break;
+
+       case __constant_htonl(RDMA_NOMSG):
+               /* never expect read or write chunks, always reply chunks */
+               if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
+                   headerp->rm_body.rm_chunks[1] != xdr_zero ||
+                   headerp->rm_body.rm_chunks[2] != xdr_one ||
+                   req->rl_nchunks == 0)
+                       goto badheader;
+               iptr = (u32 *)((unsigned char *)headerp + 28);
+               rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr);
+               if (rdmalen < 0)
+                       goto badheader;
+               r_xprt->rx_stats.total_rdma_reply += rdmalen;
+               /* Reply chunk buffer already is the reply vector - no fixup. */
+               status = rdmalen;
+               break;
+
+badheader:
+       default:
+               dprintk("%s: invalid rpcrdma reply header (type %d):"
+                               " chunks[012] == %d %d %d"
+                               " expected chunks <= %d\n",
+                               __func__, ntohl(headerp->rm_type),
+                               headerp->rm_body.rm_chunks[0],
+                               headerp->rm_body.rm_chunks[1],
+                               headerp->rm_body.rm_chunks[2],
+                               req->rl_nchunks);
+               status = -EIO;
+               r_xprt->rx_stats.bad_reply_count++;
+               break;
+       }
+
+       /* If using mw bind, start the deregister process now. */
+       /* (Note: if mr_free(), cannot perform it here, in tasklet context) */
+       if (req->rl_nchunks) switch (r_xprt->rx_ia.ri_memreg_strategy) {
+       case RPCRDMA_MEMWINDOWS:
+               for (i = 0; req->rl_nchunks-- > 1;)
+                       i += rpcrdma_deregister_external(
+                               &req->rl_segments[i], r_xprt, NULL);
+               /* Optionally wait (not here) for unbinds to complete */
+               rep->rr_func = rpcrdma_unbind_func;
+               (void) rpcrdma_deregister_external(&req->rl_segments[i],
+                                                  r_xprt, rep);
+               break;
+       case RPCRDMA_MEMWINDOWS_ASYNC:
+               for (i = 0; req->rl_nchunks--;)
+                       i += rpcrdma_deregister_external(&req->rl_segments[i],
+                                                        r_xprt, NULL);
+               break;
+       default:
+               break;
+       }
+
+       dprintk("RPC:       %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
+                       __func__, xprt, rqst, status);
+       xprt_complete_rqst(rqst->rq_task, status);
+       spin_unlock(&xprt->transport_lock);
+}
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
new file mode 100644 (file)
index 0000000..dc55cc9
--- /dev/null
@@ -0,0 +1,800 @@
+/*
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * transport.c
+ *
+ * This file contains the top-level implementation of an RPC RDMA
+ * transport.
+ *
+ * Naming convention: functions beginning with xprt_ are part of the
+ * transport switch. All others are RPC RDMA internal.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/seq_file.h>
+
+#include "xprt_rdma.h"
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY       RPCDBG_TRANS
+#endif
+
+MODULE_LICENSE("Dual BSD/GPL");
+
+MODULE_DESCRIPTION("RPC/RDMA Transport for Linux kernel NFS");
+MODULE_AUTHOR("Network Appliance, Inc.");
+
+/*
+ * tunables (defaults below; sysctl names apply only when RPC_DEBUG is set)
+ */
+
+static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; /* sysctl rdma_slot_table_entries */
+static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; /* sysctl rdma_max_inline_read */
+static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; /* sysctl rdma_max_inline_write */
+static unsigned int xprt_rdma_inline_write_padding; /* no initializer: starts at 0 */
+#if !RPCRDMA_PERSISTENT_REGISTRATION
+static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_REGISTER; /* FMR? */
+#else /* RPCRDMA_PERSISTENT_REGISTRATION */
+static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_ALLPHYSICAL;
+#endif /* !RPCRDMA_PERSISTENT_REGISTRATION */
+
+#ifdef RPC_DEBUG
+
+static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE; /* slot table sysctl .extra1 */
+static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE; /* slot table sysctl .extra2 */
+static unsigned int zero; /* NOTE(review): use not visible here - presumably a sysctl bound */
+static unsigned int max_padding = PAGE_SIZE; /* NOTE(review): presumably padding limit - confirm */
+static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS; /* NOTE(review): presumably memreg floor - confirm */
+static unsigned int max_memreg = RPCRDMA_LAST - 1; /* NOTE(review): presumably memreg ceiling - confirm */
+
+static struct ctl_table_header *sunrpc_table_header; /* NOTE(review): presumably sysctl handle - confirm */
+
+static ctl_table xr_tunables_table[] = {
+       {
+               .ctl_name       = CTL_SLOTTABLE_RDMA,
+               .procname       = "rdma_slot_table_entries",
+               .data           = &xprt_rdma_slot_table_entries,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec_minmax,
+               .strategy       = &sysctl_intvec,
+               .extra1         = &min_slot_table_size,
+               .extra2         = &max_slot_table_size
+       },
+       {
+               .ctl_name       = CTL_RDMA_MAXINLINEREAD,
+               .procname       = "rdma_max_inline_read",
+               .data           = &xprt_rdma_max_inline_read,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+               .strategy       = &sysctl_intvec,
+       },
+       {
+               .ctl_name       = CTL_RDMA_MAXINLINEWRITE,
+               .procname       = "rdma_max_inline_write",
+               .data           = &xprt_rdma_max_inline_write,
+               .maxlen         = sizeof(unsigned int),
+               .mode    &n