RDMA/core: Add memory management extensions support
Steve Wise [Tue, 15 Jul 2008 06:48:45 +0000 (23:48 -0700)]
This patch adds support for the IB "base memory management extension"
(BMME) and the equivalent iWARP operations (which the iWARP verbs
mandates all devices must implement).  The new operations are:

 - Allocate an ib_mr for use in fast register work requests.

 - Allocate/free a physical buffer lists for use in fast register work
   requests.  This allows device drivers to allocate this memory as
   needed for use in posting send requests (eg via dma_alloc_coherent).

 - New send queue work requests:
   * send with remote invalidate
   * fast register memory region
   * local invalidate memory region
   * RDMA read with invalidate local memory region (iWARP only)

Consumer interface details:

 - A new device capability flag IB_DEVICE_MEM_MGT_EXTENSIONS is added
   to indicate device support for these features.

 - New send work request opcodes IB_WR_FAST_REG_MR, IB_WR_LOCAL_INV,
   IB_WR_RDMA_READ_WITH_INV are added.

 - A new consumer API function, ib_alloc_mr() is added to allocate
   fast register memory regions.

 - New consumer API functions, ib_alloc_fast_reg_page_list() and
   ib_free_fast_reg_page_list() are added to allocate and free
   device-specific memory for fast registration page lists.

 - A new consumer API function, ib_update_fast_reg_key(), is added to
   allow the key portion of the R_Key and L_Key of a fast registration
   MR to be updated.  Consumers call this if desired before posting
   a IB_WR_FAST_REG_MR work request.

Consumers can use this as follows:

 - MR is allocated with ib_alloc_mr().

 - Page list memory is allocated with ib_alloc_fast_reg_page_list().

 - MR R_Key/L_Key "key" field is updated with ib_update_fast_reg_key().

 - MR made VALID and bound to a specific page list via
   ib_post_send(IB_WR_FAST_REG_MR)

 - MR made INVALID via ib_post_send(IB_WR_LOCAL_INV),
   ib_post_send(IB_WR_RDMA_READ_WITH_INV) or an incoming send with
   invalidate operation.

 - MR is deallocated with ib_dereg_mr()

 - page lists dealloced via ib_free_fast_reg_page_list().

Applications can allocate a fast register MR once, and then can
repeatedly bind the MR to different physical block lists (PBLs) via
posting work requests to a send queue (SQ).  For each outstanding
MR-to-PBL binding in the SQ pipe, a fast_reg_page_list needs to be
allocated (the fast_reg_page_list is owned by the low-level driver
from the consumer posting a work request until the request completes).
Thus pipelining can be achieved while still allowing device-specific
page_list processing.

The 32-bit fast register memory key/STag is composed of a 24-bit index
and an 8-bit key.  The application can change the key each time it
fast registers thus allowing more control over the peer's use of the
key/STag (ie it can effectively be changed each time the rkey is
rebound to a page list).

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>

12 files changed:
drivers/infiniband/core/uverbs_cmd.c
drivers/infiniband/core/verbs.c
drivers/infiniband/hw/ehca/ehca_reqs.c
drivers/infiniband/hw/ipath/ipath_cq.c
drivers/infiniband/hw/ipath/ipath_rc.c
drivers/infiniband/hw/ipath/ipath_ruc.c
drivers/infiniband/hw/ipath/ipath_uc.c
drivers/infiniband/hw/ipath/ipath_ud.c
drivers/infiniband/hw/mlx4/cq.c
drivers/infiniband/hw/mthca/mthca_cq.c
include/rdma/ib_user_verbs.h
include/rdma/ib_verbs.h

index 112b37c..56feab6 100644 (file)
@@ -917,7 +917,7 @@ ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file,
                resp->wc[i].opcode         = wc[i].opcode;
                resp->wc[i].vendor_err     = wc[i].vendor_err;
                resp->wc[i].byte_len       = wc[i].byte_len;
-               resp->wc[i].imm_data       = (__u32 __force) wc[i].imm_data;
+               resp->wc[i].ex.imm_data    = (__u32 __force) wc[i].ex.imm_data;
                resp->wc[i].qp_num         = wc[i].qp->qp_num;
                resp->wc[i].src_qp         = wc[i].src_qp;
                resp->wc[i].wc_flags       = wc[i].wc_flags;
index 9f399d3..e0fbe59 100644 (file)
@@ -753,6 +753,52 @@ int ib_dereg_mr(struct ib_mr *mr)
 }
 EXPORT_SYMBOL(ib_dereg_mr);
 
+struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
+{
+       struct ib_mr *mr;
+
+       if (!pd->device->alloc_fast_reg_mr)
+               return ERR_PTR(-ENOSYS);
+
+       mr = pd->device->alloc_fast_reg_mr(pd, max_page_list_len);
+
+       if (!IS_ERR(mr)) {
+               mr->device  = pd->device;
+               mr->pd      = pd;
+               mr->uobject = NULL;
+               atomic_inc(&pd->usecnt);
+               atomic_set(&mr->usecnt, 0);
+       }
+
+       return mr;
+}
+EXPORT_SYMBOL(ib_alloc_fast_reg_mr);
+
+struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(struct ib_device *device,
+                                                         int max_page_list_len)
+{
+       struct ib_fast_reg_page_list *page_list;
+
+       if (!device->alloc_fast_reg_page_list)
+               return ERR_PTR(-ENOSYS);
+
+       page_list = device->alloc_fast_reg_page_list(device, max_page_list_len);
+
+       if (!IS_ERR(page_list)) {
+               page_list->device = device;
+               page_list->max_page_list_len = max_page_list_len;
+       }
+
+       return page_list;
+}
+EXPORT_SYMBOL(ib_alloc_fast_reg_page_list);
+
+void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
+{
+       page_list->device->free_fast_reg_page_list(page_list);
+}
+EXPORT_SYMBOL(ib_free_fast_reg_page_list);
+
 /* Memory windows */
 
 struct ib_mw *ib_alloc_mw(struct ib_pd *pd)
index f093b00..b799b27 100644 (file)
@@ -681,7 +681,7 @@ poll_cq_one_read_cqe:
        wc->dlid_path_bits = cqe->dlid;
        wc->src_qp = cqe->remote_qp_number;
        wc->wc_flags = cqe->w_completion_flags;
-       wc->imm_data = cpu_to_be32(cqe->immediate_data);
+       wc->ex.imm_data = cpu_to_be32(cqe->immediate_data);
        wc->sl = cqe->service_level;
 
 poll_cq_one_exit0:
index a03bd28..d385e41 100644 (file)
@@ -82,7 +82,7 @@ void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited)
                wc->uqueue[head].opcode = entry->opcode;
                wc->uqueue[head].vendor_err = entry->vendor_err;
                wc->uqueue[head].byte_len = entry->byte_len;
-               wc->uqueue[head].imm_data = (__u32 __force)entry->imm_data;
+               wc->uqueue[head].ex.imm_data = (__u32 __force) entry->ex.imm_data;
                wc->uqueue[head].qp_num = entry->qp->qp_num;
                wc->uqueue[head].src_qp = entry->src_qp;
                wc->uqueue[head].wc_flags = entry->wc_flags;
index 108df66..9771052 100644 (file)
@@ -1703,11 +1703,11 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
        case OP(SEND_LAST_WITH_IMMEDIATE):
        send_last_imm:
                if (header_in_data) {
-                       wc.imm_data = *(__be32 *) data;
+                       wc.ex.imm_data = *(__be32 *) data;
                        data += sizeof(__be32);
                } else {
                        /* Immediate data comes after BTH */
-                       wc.imm_data = ohdr->u.imm_data;
+                       wc.ex.imm_data = ohdr->u.imm_data;
                }
                hdrsize += 4;
                wc.wc_flags = IB_WC_WITH_IMM;
index a4b5521..af051f7 100644 (file)
@@ -331,7 +331,7 @@ again:
        switch (wqe->wr.opcode) {
        case IB_WR_SEND_WITH_IMM:
                wc.wc_flags = IB_WC_WITH_IMM;
-               wc.imm_data = wqe->wr.ex.imm_data;
+               wc.ex.imm_data = wqe->wr.ex.imm_data;
                /* FALLTHROUGH */
        case IB_WR_SEND:
                if (!ipath_get_rwqe(qp, 0))
@@ -342,7 +342,7 @@ again:
                if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
                        goto inv_err;
                wc.wc_flags = IB_WC_WITH_IMM;
-               wc.imm_data = wqe->wr.ex.imm_data;
+               wc.ex.imm_data = wqe->wr.ex.imm_data;
                if (!ipath_get_rwqe(qp, 1))
                        goto rnr_nak;
                /* FALLTHROUGH */
index 0596ec1..82cc588 100644 (file)
@@ -379,11 +379,11 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
        case OP(SEND_LAST_WITH_IMMEDIATE):
        send_last_imm:
                if (header_in_data) {
-                       wc.imm_data = *(__be32 *) data;
+                       wc.ex.imm_data = *(__be32 *) data;
                        data += sizeof(__be32);
                } else {
                        /* Immediate data comes after BTH */
-                       wc.imm_data = ohdr->u.imm_data;
+                       wc.ex.imm_data = ohdr->u.imm_data;
                }
                hdrsize += 4;
                wc.wc_flags = IB_WC_WITH_IMM;
@@ -483,11 +483,11 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
        case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
        rdma_last_imm:
                if (header_in_data) {
-                       wc.imm_data = *(__be32 *) data;
+                       wc.ex.imm_data = *(__be32 *) data;
                        data += sizeof(__be32);
                } else {
                        /* Immediate data comes after BTH */
-                       wc.imm_data = ohdr->u.imm_data;
+                       wc.ex.imm_data = ohdr->u.imm_data;
                }
                hdrsize += 4;
                wc.wc_flags = IB_WC_WITH_IMM;
index 77ca8ca..36aa242 100644 (file)
@@ -96,7 +96,7 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe)
 
        if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
                wc.wc_flags = IB_WC_WITH_IMM;
-               wc.imm_data = swqe->wr.ex.imm_data;
+               wc.ex.imm_data = swqe->wr.ex.imm_data;
        }
 
        /*
@@ -492,14 +492,14 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
        if (qp->ibqp.qp_num > 1 &&
            opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
                if (header_in_data) {
-                       wc.imm_data = *(__be32 *) data;
+                       wc.ex.imm_data = *(__be32 *) data;
                        data += sizeof(__be32);
                } else
-                       wc.imm_data = ohdr->u.ud.imm_data;
+                       wc.ex.imm_data = ohdr->u.ud.imm_data;
                wc.wc_flags = IB_WC_WITH_IMM;
                hdrsize += sizeof(u32);
        } else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
-               wc.imm_data = 0;
+               wc.ex.imm_data = 0;
                wc.wc_flags = 0;
        } else {
                dev->n_pkt_drops++;
index 4521319..299f208 100644 (file)
@@ -663,18 +663,18 @@ repoll:
 
                switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
                case MLX4_RECV_OPCODE_RDMA_WRITE_IMM:
-                       wc->opcode   = IB_WC_RECV_RDMA_WITH_IMM;
-                       wc->wc_flags = IB_WC_WITH_IMM;
-                       wc->imm_data = cqe->immed_rss_invalid;
+                       wc->opcode      = IB_WC_RECV_RDMA_WITH_IMM;
+                       wc->wc_flags    = IB_WC_WITH_IMM;
+                       wc->ex.imm_data = cqe->immed_rss_invalid;
                        break;
                case MLX4_RECV_OPCODE_SEND:
                        wc->opcode   = IB_WC_RECV;
                        wc->wc_flags = 0;
                        break;
                case MLX4_RECV_OPCODE_SEND_IMM:
-                       wc->opcode   = IB_WC_RECV;
-                       wc->wc_flags = IB_WC_WITH_IMM;
-                       wc->imm_data = cqe->immed_rss_invalid;
+                       wc->opcode      = IB_WC_RECV;
+                       wc->wc_flags    = IB_WC_WITH_IMM;
+                       wc->ex.imm_data = cqe->immed_rss_invalid;
                        break;
                }
 
index f788fce..d9f4735 100644 (file)
@@ -620,13 +620,13 @@ static inline int mthca_poll_one(struct mthca_dev *dev,
                case IB_OPCODE_SEND_LAST_WITH_IMMEDIATE:
                case IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE:
                        entry->wc_flags = IB_WC_WITH_IMM;
-                       entry->imm_data = cqe->imm_etype_pkey_eec;
+                       entry->ex.imm_data = cqe->imm_etype_pkey_eec;
                        entry->opcode = IB_WC_RECV;
                        break;
                case IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE:
                case IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE:
                        entry->wc_flags = IB_WC_WITH_IMM;
-                       entry->imm_data = cqe->imm_etype_pkey_eec;
+                       entry->ex.imm_data = cqe->imm_etype_pkey_eec;
                        entry->opcode = IB_WC_RECV_RDMA_WITH_IMM;
                        break;
                default:
index 885254f..a17f771 100644 (file)
@@ -289,7 +289,10 @@ struct ib_uverbs_wc {
        __u32 opcode;
        __u32 vendor_err;
        __u32 byte_len;
-       __u32 imm_data;
+       union {
+               __u32 imm_data;
+               __u32 invalidate_rkey;
+       } ex;
        __u32 qp_num;
        __u32 src_qp;
        __u32 wc_flags;
index 5f5621b..74c24b9 100644 (file)
@@ -103,6 +103,7 @@ enum ib_device_cap_flags {
         */
        IB_DEVICE_UD_IP_CSUM            = (1<<18),
        IB_DEVICE_UD_TSO                = (1<<19),
+       IB_DEVICE_MEM_MGT_EXTENSIONS    = (1<<21),
 };
 
 enum ib_atomic_cap {
@@ -148,6 +149,7 @@ struct ib_device_attr {
        int                     max_srq;
        int                     max_srq_wr;
        int                     max_srq_sge;
+       unsigned int            max_fast_reg_page_list_len;
        u16                     max_pkeys;
        u8                      local_ca_ack_delay;
 };
@@ -411,6 +413,8 @@ enum ib_wc_opcode {
        IB_WC_FETCH_ADD,
        IB_WC_BIND_MW,
        IB_WC_LSO,
+       IB_WC_LOCAL_INV,
+       IB_WC_FAST_REG_MR,
 /*
  * Set value of IB_WC_RECV so consumers can test if a completion is a
  * receive by testing (opcode & IB_WC_RECV).
@@ -421,7 +425,8 @@ enum ib_wc_opcode {
 
 enum ib_wc_flags {
        IB_WC_GRH               = 1,
-       IB_WC_WITH_IMM          = (1<<1)
+       IB_WC_WITH_IMM          = (1<<1),
+       IB_WC_WITH_INVALIDATE   = (1<<2),
 };
 
 struct ib_wc {
@@ -431,7 +436,10 @@ struct ib_wc {
        u32                     vendor_err;
        u32                     byte_len;
        struct ib_qp           *qp;
-       __be32                  imm_data;
+       union {
+               __be32          imm_data;
+               u32             invalidate_rkey;
+       } ex;
        u32                     src_qp;
        int                     wc_flags;
        u16                     pkey_index;
@@ -625,6 +633,9 @@ enum ib_wr_opcode {
        IB_WR_ATOMIC_FETCH_AND_ADD,
        IB_WR_LSO,
        IB_WR_SEND_WITH_INV,
+       IB_WR_RDMA_READ_WITH_INV,
+       IB_WR_LOCAL_INV,
+       IB_WR_FAST_REG_MR,
 };
 
 enum ib_send_flags {
@@ -641,6 +652,12 @@ struct ib_sge {
        u32     lkey;
 };
 
+struct ib_fast_reg_page_list {
+       struct ib_device       *device;
+       u64                    *page_list;
+       unsigned int            max_page_list_len;
+};
+
 struct ib_send_wr {
        struct ib_send_wr      *next;
        u64                     wr_id;
@@ -673,6 +690,15 @@ struct ib_send_wr {
                        u16     pkey_index; /* valid for GSI only */
                        u8      port_num;   /* valid for DR SMPs on switch only */
                } ud;
+               struct {
+                       u64                             iova_start;
+                       struct ib_fast_reg_page_list   *page_list;
+                       unsigned int                    page_shift;
+                       unsigned int                    page_list_len;
+                       u32                             length;
+                       int                             access_flags;
+                       u32                             rkey;
+               } fast_reg;
        } wr;
 };
 
@@ -1011,6 +1037,11 @@ struct ib_device {
        int                        (*query_mr)(struct ib_mr *mr,
                                               struct ib_mr_attr *mr_attr);
        int                        (*dereg_mr)(struct ib_mr *mr);
+       struct ib_mr *             (*alloc_fast_reg_mr)(struct ib_pd *pd,
+                                              int max_page_list_len);
+       struct ib_fast_reg_page_list * (*alloc_fast_reg_page_list)(struct ib_device *device,
+                                                                  int page_list_len);
+       void                       (*free_fast_reg_page_list)(struct ib_fast_reg_page_list *page_list);
        int                        (*rereg_phys_mr)(struct ib_mr *mr,
                                                    int mr_rereg_mask,
                                                    struct ib_pd *pd,
@@ -1805,6 +1836,54 @@ int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);
 int ib_dereg_mr(struct ib_mr *mr);
 
 /**
+ * ib_alloc_fast_reg_mr - Allocates memory region usable with the
+ *   IB_WR_FAST_REG_MR send work request.
+ * @pd: The protection domain associated with the region.
+ * @max_page_list_len: requested max physical buffer list length to be
+ *   used with fast register work requests for this MR.
+ */
+struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len);
+
+/**
+ * ib_alloc_fast_reg_page_list - Allocates a page list array
+ * @device - ib device pointer.
+ * @page_list_len - size of the page list array to be allocated.
+ *
+ * This allocates and returns a struct ib_fast_reg_page_list * and a
+ * page_list array that is at least page_list_len in size.  The actual
+ * size is returned in max_page_list_len.  The caller is responsible
+ * for initializing the contents of the page_list array before posting
+ * a send work request with the IB_WC_FAST_REG_MR opcode.
+ *
+ * The page_list array entries must be translated using one of the
+ * ib_dma_*() functions just like the addresses passed to
+ * ib_map_phys_fmr().  Once the ib_post_send() is issued, the struct
+ * ib_fast_reg_page_list must not be modified by the caller until the
+ * IB_WC_FAST_REG_MR work request completes.
+ */
+struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(
+                               struct ib_device *device, int page_list_len);
+
+/**
+ * ib_free_fast_reg_page_list - Deallocates a previously allocated
+ *   page list array.
+ * @page_list - struct ib_fast_reg_page_list pointer to be deallocated.
+ */
+void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list);
+
+/**
+ * ib_update_fast_reg_key - updates the key portion of the fast_reg MR
+ *   R_Key and L_Key.
+ * @mr - struct ib_mr pointer to be updated.
+ * @newkey - new key to be used.
+ */
+static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey)
+{
+       mr->lkey = (mr->lkey & 0xffffff00) | newkey;
+       mr->rkey = (mr->rkey & 0xffffff00) | newkey;
+}
+
+/**
  * ib_alloc_mw - Allocates a memory window.
  * @pd: The protection domain associated with the memory window.
  */