NFSv4.1: layoutcommit
Andy Adamson [Wed, 23 Mar 2011 13:27:54 +0000 (13:27 +0000)]
The filelayout driver sends LAYOUTCOMMIT only when COMMIT goes to
the data server (as opposed to the MDS) and the data server WRITE
is not NFS_FILE_SYNC.

Only whole file layout support means that there is only one IOMODE_RW layout
segment.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Alexandros Batsakis <batsakis@netapp.com>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Dean Hildebrand <dhildeb@us.ibm.com>
Signed-off-by: Fred Isaman <iisaman@citi.umich.edu>
Signed-off-by: Mingyang Guo <guomingyang@nrchpc.ac.cn>
Signed-off-by: Tao Guo <guotao@nrchpc.ac.cn>
Signed-off-by: Zhang Jingwang <zhangjingwang@nrchpc.ac.cn>
Tested-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

fs/nfs/file.c
fs/nfs/nfs4_fs.h
fs/nfs/nfs4filelayout.c
fs/nfs/nfs4proc.c
fs/nfs/nfs4xdr.c
fs/nfs/pnfs.c
fs/nfs/pnfs.h
fs/nfs/write.c
include/linux/nfs4.h
include/linux/nfs_fs.h
include/linux/nfs_xdr.h

index d85a534..85cb95d 100644 (file)
@@ -326,6 +326,9 @@ nfs_file_fsync(struct file *file, int datasync)
                ret = xchg(&ctx->error, 0);
        if (!ret && status < 0)
                ret = status;
+       if (!ret && !datasync)
+               /* application has asked for meta-data sync */
+               ret = pnfs_layoutcommit_inode(inode, 1);
        return ret;
 }
 
index c64be1c..1e612d1 100644 (file)
@@ -262,6 +262,8 @@ extern int nfs4_proc_destroy_session(struct nfs4_session *);
 extern int nfs4_init_session(struct nfs_server *server);
 extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
                struct nfs_fsinfo *fsinfo);
+extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data,
+                                 int sync);
 
 static inline bool
 is_ds_only_client(struct nfs_client *clp)
index 97e75a2..fc1a0e9 100644 (file)
@@ -154,6 +154,23 @@ static int filelayout_read_done_cb(struct rpc_task *task,
 }
 
 /*
+ * We reference the rpc_cred of the first WRITE that triggers the need for
+ * a LAYOUTCOMMIT, and use it to send the layoutcommit compound.
+ * rfc5661 is not clear about which credential should be used.
+ */
+static void
+filelayout_set_layoutcommit(struct nfs_write_data *wdata)
+{
+       if (FILELAYOUT_LSEG(wdata->lseg)->commit_through_mds ||
+           wdata->res.verf->committed == NFS_FILE_SYNC)
+               return;
+
+       pnfs_set_layoutcommit(wdata);
+       dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, wdata->inode->i_ino,
+               (unsigned long) wdata->lseg->pls_end_pos);
+}
+
+/*
  * Call ops for the async read/write cases
  * In the case of dense layouts, the offset needs to be reset to its
  * original value.
@@ -210,6 +227,7 @@ static int filelayout_write_done_cb(struct rpc_task *task,
                return -EAGAIN;
        }
 
+       filelayout_set_layoutcommit(data);
        return 0;
 }
 
index 5d61ccc..6f2f402 100644 (file)
@@ -5616,6 +5616,100 @@ int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
 }
 EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo);
 
+static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata)
+{
+       struct nfs4_layoutcommit_data *data = calldata;
+       struct nfs_server *server = NFS_SERVER(data->args.inode);
+
+       if (nfs4_setup_sequence(server, &data->args.seq_args,
+                               &data->res.seq_res, 1, task))
+               return;
+       rpc_call_start(task);
+}
+
+static void
+nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
+{
+       struct nfs4_layoutcommit_data *data = calldata;
+       struct nfs_server *server = NFS_SERVER(data->args.inode);
+
+       if (!nfs4_sequence_done(task, &data->res.seq_res))
+               return;
+
+       switch (task->tk_status) { /* Just ignore these failures */
+       case NFS4ERR_DELEG_REVOKED: /* layout was recalled */
+       case NFS4ERR_BADIOMODE:     /* no IOMODE_RW layout for range */
+       case NFS4ERR_BADLAYOUT:     /* no layout */
+       case NFS4ERR_GRACE:         /* loca_recalim always false */
+               task->tk_status = 0;
+       }
+
+       if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
+               nfs_restart_rpc(task, server->nfs_client);
+               return;
+       }
+
+       if (task->tk_status == 0)
+               nfs_post_op_update_inode_force_wcc(data->args.inode,
+                                                  data->res.fattr);
+}
+
+static void nfs4_layoutcommit_release(void *calldata)
+{
+       struct nfs4_layoutcommit_data *data = calldata;
+
+       /* Matched by references in pnfs_set_layoutcommit */
+       put_lseg(data->lseg);
+       put_rpccred(data->cred);
+       kfree(data);
+}
+
+static const struct rpc_call_ops nfs4_layoutcommit_ops = {
+       .rpc_call_prepare = nfs4_layoutcommit_prepare,
+       .rpc_call_done = nfs4_layoutcommit_done,
+       .rpc_release = nfs4_layoutcommit_release,
+};
+
+int
+nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, int sync)
+{
+       struct rpc_message msg = {
+               .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT],
+               .rpc_argp = &data->args,
+               .rpc_resp = &data->res,
+               .rpc_cred = data->cred,
+       };
+       struct rpc_task_setup task_setup_data = {
+               .task = &data->task,
+               .rpc_client = NFS_CLIENT(data->args.inode),
+               .rpc_message = &msg,
+               .callback_ops = &nfs4_layoutcommit_ops,
+               .callback_data = data,
+               .flags = RPC_TASK_ASYNC,
+       };
+       struct rpc_task *task;
+       int status = 0;
+
+       dprintk("NFS: %4d initiating layoutcommit call. sync %d "
+               "lbw: %llu inode %lu\n",
+               data->task.tk_pid, sync,
+               data->args.lastbytewritten,
+               data->args.inode->i_ino);
+
+       task = rpc_run_task(&task_setup_data);
+       if (IS_ERR(task))
+               return PTR_ERR(task);
+       if (!sync)
+               goto out;
+       status = nfs4_wait_for_completion_rpc_task(task);
+       if (status != 0)
+               goto out;
+       status = task->tk_status;
+out:
+       dprintk("%s: status %d\n", __func__, status);
+       rpc_put_task(task);
+       return status;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
index 07cdf92..207d399 100644 (file)
@@ -324,6 +324,18 @@ static int nfs4_stat_to_errno(int);
 #define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \
                                decode_stateid_maxsz + \
                                XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE))
+#define encode_layoutcommit_maxsz (op_encode_hdr_maxsz +          \
+                               2 /* offset */ + \
+                               2 /* length */ + \
+                               1 /* reclaim */ + \
+                               encode_stateid_maxsz + \
+                               1 /* new offset (true) */ + \
+                               2 /* last byte written */ + \
+                               1 /* nt_timechanged (false) */ + \
+                               1 /* layoutupdate4 layout type */ + \
+                               1 /* NULL filelayout layoutupdate4 payload */)
+#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
+
 #else /* CONFIG_NFS_V4_1 */
 #define encode_sequence_maxsz  0
 #define decode_sequence_maxsz  0
@@ -727,6 +739,17 @@ static int nfs4_stat_to_errno(int);
                                decode_sequence_maxsz + \
                                decode_putfh_maxsz +        \
                                decode_layoutget_maxsz)
+#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \
+                               encode_sequence_maxsz +\
+                               encode_putfh_maxsz + \
+                               encode_layoutcommit_maxsz + \
+                               encode_getattr_maxsz)
+#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \
+                               decode_sequence_maxsz + \
+                               decode_putfh_maxsz + \
+                               decode_layoutcommit_maxsz + \
+                               decode_getattr_maxsz)
+
 
 const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
                                      compound_encode_hdr_maxsz +
@@ -1816,6 +1839,34 @@ encode_layoutget(struct xdr_stream *xdr,
        hdr->nops++;
        hdr->replen += decode_layoutget_maxsz;
 }
+
+static int
+encode_layoutcommit(struct xdr_stream *xdr,
+                   const struct nfs4_layoutcommit_args *args,
+                   struct compound_hdr *hdr)
+{
+       __be32 *p;
+
+       dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten,
+               NFS_SERVER(args->inode)->pnfs_curr_ld->id);
+
+       p = reserve_space(xdr, 48 + NFS4_STATEID_SIZE);
+       *p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
+       /* Only whole file layouts */
+       p = xdr_encode_hyper(p, 0); /* offset */
+       p = xdr_encode_hyper(p, NFS4_MAX_UINT64); /* length */
+       *p++ = cpu_to_be32(0); /* reclaim */
+       p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE);
+       *p++ = cpu_to_be32(1); /* newoffset = TRUE */
+       p = xdr_encode_hyper(p, args->lastbytewritten);
+       *p++ = cpu_to_be32(0); /* Never send time_modify_changed */
+       *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
+       *p++ = cpu_to_be32(0); /* no file layout payload */
+
+       hdr->nops++;
+       hdr->replen += decode_layoutcommit_maxsz;
+       return 0;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 /*
@@ -2607,6 +2658,26 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
        encode_layoutget(xdr, args, &hdr);
        encode_nops(&hdr);
 }
+
+/*
+ *  Encode LAYOUTCOMMIT request
+ */
+static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
+                                    struct xdr_stream *xdr,
+                                    struct nfs4_layoutcommit_args *args)
+{
+       struct compound_hdr hdr = {
+               .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+       };
+
+       encode_compound_hdr(xdr, req, &hdr);
+       encode_sequence(xdr, &args->seq_args, &hdr);
+       encode_putfh(xdr, NFS_FH(args->inode), &hdr);
+       encode_layoutcommit(xdr, args, &hdr);
+       encode_getfattr(xdr, args->bitmask, &hdr);
+       encode_nops(&hdr);
+       return 0;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
@@ -5007,6 +5078,35 @@ out_overflow:
        print_overflow_msg(__func__, xdr);
        return -EIO;
 }
+
+static int decode_layoutcommit(struct xdr_stream *xdr,
+                              struct rpc_rqst *req,
+                              struct nfs4_layoutcommit_res *res)
+{
+       __be32 *p;
+       __u32 sizechanged;
+       int status;
+
+       status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT);
+       if (status)
+               return status;
+
+       p = xdr_inline_decode(xdr, 4);
+       if (unlikely(!p))
+               goto out_overflow;
+       sizechanged = be32_to_cpup(p);
+
+       if (sizechanged) {
+               /* throw away new size */
+               p = xdr_inline_decode(xdr, 8);
+               if (unlikely(!p))
+                       goto out_overflow;
+       }
+       return 0;
+out_overflow:
+       print_overflow_msg(__func__, xdr);
+       return -EIO;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 /*
@@ -6068,6 +6168,34 @@ static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp,
 out:
        return status;
 }
+
+/*
+ * Decode LAYOUTCOMMIT response
+ */
+static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp,
+                                    struct xdr_stream *xdr,
+                                    struct nfs4_layoutcommit_res *res)
+{
+       struct compound_hdr hdr;
+       int status;
+
+       status = decode_compound_hdr(xdr, &hdr);
+       if (status)
+               goto out;
+       status = decode_sequence(xdr, &res->seq_res, rqstp);
+       if (status)
+               goto out;
+       status = decode_putfh(xdr);
+       if (status)
+               goto out;
+       status = decode_layoutcommit(xdr, rqstp, res);
+       if (status)
+               goto out;
+       decode_getfattr(xdr, res->fattr, res->server,
+                       !RPC_IS_ASYNC(rqstp->rq_task));
+out:
+       return status;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 /**
@@ -6269,6 +6397,7 @@ struct rpc_procinfo       nfs4_procedures[] = {
        PROC(RECLAIM_COMPLETE,  enc_reclaim_complete,   dec_reclaim_complete),
        PROC(GETDEVICEINFO,     enc_getdeviceinfo,      dec_getdeviceinfo),
        PROC(LAYOUTGET,         enc_layoutget,          dec_layoutget),
+       PROC(LAYOUTCOMMIT,      enc_layoutcommit,       dec_layoutcommit),
 #endif /* CONFIG_NFS_V4_1 */
 };
 
index c675659..2a08ca0 100644 (file)
@@ -946,3 +946,97 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata,
        dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
        return trypnfs;
 }
+
+/*
+ * Currently there is only one (whole file) write lseg.
+ */
+static struct pnfs_layout_segment *pnfs_list_write_lseg(struct inode *inode)
+{
+       struct pnfs_layout_segment *lseg, *rv = NULL;
+
+       list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list)
+               if (lseg->pls_range.iomode == IOMODE_RW)
+                       rv = lseg;
+       return rv;
+}
+
+void
+pnfs_set_layoutcommit(struct nfs_write_data *wdata)
+{
+       struct nfs_inode *nfsi = NFS_I(wdata->inode);
+       loff_t end_pos = wdata->args.offset + wdata->res.count;
+
+       spin_lock(&nfsi->vfs_inode.i_lock);
+       if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
+               /* references matched in nfs4_layoutcommit_release */
+               get_lseg(wdata->lseg);
+               wdata->lseg->pls_lc_cred =
+                       get_rpccred(wdata->args.context->state->owner->so_cred);
+               mark_inode_dirty_sync(wdata->inode);
+               dprintk("%s: Set layoutcommit for inode %lu ",
+                       __func__, wdata->inode->i_ino);
+       }
+       if (end_pos > wdata->lseg->pls_end_pos)
+               wdata->lseg->pls_end_pos = end_pos;
+       spin_unlock(&nfsi->vfs_inode.i_lock);
+}
+EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
+
+int
+pnfs_layoutcommit_inode(struct inode *inode, int sync)
+{
+       struct nfs4_layoutcommit_data *data;
+       struct nfs_inode *nfsi = NFS_I(inode);
+       struct pnfs_layout_segment *lseg;
+       struct rpc_cred *cred;
+       loff_t end_pos;
+       int status = 0;
+
+       dprintk("--> %s inode %lu\n", __func__, inode->i_ino);
+
+       /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
+       data = kzalloc(sizeof(*data), GFP_NOFS);
+       spin_lock(&inode->i_lock);
+
+       if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
+               spin_unlock(&inode->i_lock);
+               kfree(data);
+               goto out;
+       }
+       /*
+        * Currently only one (whole file) write lseg which is referenced
+        * in pnfs_set_layoutcommit and will be found.
+        */
+       lseg = pnfs_list_write_lseg(inode);
+
+       end_pos = lseg->pls_end_pos;
+       cred = lseg->pls_lc_cred;
+       lseg->pls_end_pos = 0;
+       lseg->pls_lc_cred = NULL;
+
+       if (!data) {
+               put_lseg(lseg);
+               spin_unlock(&inode->i_lock);
+               put_rpccred(cred);
+               status = -ENOMEM;
+               goto out;
+       } else {
+               memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data,
+                       sizeof(nfsi->layout->plh_stateid.data));
+       }
+       spin_unlock(&inode->i_lock);
+
+       data->args.inode = inode;
+       data->lseg = lseg;
+       data->cred = cred;
+       nfs_fattr_init(&data->fattr);
+       data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
+       data->res.fattr = &data->fattr;
+       data->args.lastbytewritten = end_pos - 1;
+       data->res.server = NFS_SERVER(inode);
+
+       status = nfs4_proc_layoutcommit(data, sync);
+out:
+       dprintk("<-- %s status %d\n", __func__, status);
+       return status;
+}
index 5370f1b..0806c77 100644 (file)
@@ -43,6 +43,8 @@ struct pnfs_layout_segment {
        atomic_t pls_refcount;
        unsigned long pls_flags;
        struct pnfs_layout_hdr *pls_layout;
+       struct rpc_cred *pls_lc_cred; /* LAYOUTCOMMIT credential */
+       loff_t pls_end_pos; /* LAYOUTCOMMIT write end */
 };
 
 enum pnfs_try_status {
@@ -152,7 +154,8 @@ bool pnfs_roc(struct inode *ino);
 void pnfs_roc_release(struct inode *ino);
 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
 bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
-
+void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
+int pnfs_layoutcommit_inode(struct inode *inode, int sync);
 
 static inline int lo_fail_bit(u32 iomode)
 {
@@ -325,6 +328,10 @@ static inline void pnfs_clear_request_commit(struct nfs_page *req)
 {
 }
 
+static inline int pnfs_layoutcommit_inode(struct inode *inode, int sync)
+{
+       return 0;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 #endif /* FS_NFS_PNFS_H */
index e7aeda0..a03c11f 100644 (file)
@@ -1562,7 +1562,20 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr
 
 int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
-       return nfs_commit_unstable_pages(inode, wbc);
+       int ret;
+
+       ret = nfs_commit_unstable_pages(inode, wbc);
+       if (ret >= 0 && test_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags)) {
+               int status, sync = wbc->sync_mode;
+
+               if (wbc->nonblocking || wbc->for_background)
+                               sync = 0;
+
+               status = pnfs_layoutcommit_inode(inode, sync);
+               if (status < 0)
+                       return status;
+       }
+       return ret;
 }
 
 /*
index 134716e..bf22cfa 100644 (file)
@@ -560,6 +560,7 @@ enum {
        NFSPROC4_CLNT_RECLAIM_COMPLETE,
        NFSPROC4_CLNT_LAYOUTGET,
        NFSPROC4_CLNT_GETDEVICEINFO,
+       NFSPROC4_CLNT_LAYOUTCOMMIT,
 };
 
 /* nfs41 types */
index 807e07c..1b93b9c 100644 (file)
@@ -228,6 +228,7 @@ struct nfs_inode {
 #define NFS_INO_FSCACHE_LOCK   (6)             /* FS-Cache cookie management lock */
 #define NFS_INO_COMMIT         (7)             /* inode is committing unstable writes */
 #define NFS_INO_PNFS_COMMIT    (8)             /* use pnfs code for commit */
+#define NFS_INO_LAYOUTCOMMIT   (9)             /* layoutcommit required */
 
 static inline struct nfs_inode *NFS_I(const struct inode *inode)
 {
index ac0c0e5..84f3585 100644 (file)
@@ -236,6 +236,29 @@ struct nfs4_getdeviceinfo_res {
        struct nfs4_sequence_res seq_res;
 };
 
+struct nfs4_layoutcommit_args {
+       nfs4_stateid stateid;
+       __u64 lastbytewritten;
+       struct inode *inode;
+       const u32 *bitmask;
+       struct nfs4_sequence_args seq_args;
+};
+
+struct nfs4_layoutcommit_res {
+       struct nfs_fattr *fattr;
+       const struct nfs_server *server;
+       struct nfs4_sequence_res seq_res;
+};
+
+struct nfs4_layoutcommit_data {
+       struct rpc_task task;
+       struct nfs_fattr fattr;
+       struct pnfs_layout_segment *lseg;
+       struct rpc_cred *cred;
+       struct nfs4_layoutcommit_args args;
+       struct nfs4_layoutcommit_res res;
+};
+
 /*
  * Arguments to the open call.
  */