IB/mlx4: Add a driver for Mellanox ConnectX InfiniBand adapters
Roland Dreier [Wed, 9 May 2007 01:00:38 +0000 (18:00 -0700)]
Add an InfiniBand driver for Mellanox ConnectX adapters.  Because
these adapters can also be used as ethernet NICs and Fibre Channel
HBAs, the driver is split into two modules:

  mlx4_core: Handles low-level things like device initialization and
    processing firmware commands.  Also controls resource allocation
    so that the InfiniBand, ethernet and FC functions can share a
    device without stepping on each other.

  mlx4_ib: Handles InfiniBand-specific things; plugs into the
    InfiniBand midlayer.

Signed-off-by: Roland Dreier <rolandd@cisco.com>

43 files changed:
drivers/infiniband/Kconfig
drivers/infiniband/Makefile
drivers/infiniband/hw/mlx4/Kconfig [new file with mode: 0644]
drivers/infiniband/hw/mlx4/Makefile [new file with mode: 0644]
drivers/infiniband/hw/mlx4/ah.c [new file with mode: 0644]
drivers/infiniband/hw/mlx4/cq.c [new file with mode: 0644]
drivers/infiniband/hw/mlx4/doorbell.c [new file with mode: 0644]
drivers/infiniband/hw/mlx4/mad.c [new file with mode: 0644]
drivers/infiniband/hw/mlx4/main.c [new file with mode: 0644]
drivers/infiniband/hw/mlx4/mlx4_ib.h [new file with mode: 0644]
drivers/infiniband/hw/mlx4/mr.c [new file with mode: 0644]
drivers/infiniband/hw/mlx4/qp.c [new file with mode: 0644]
drivers/infiniband/hw/mlx4/srq.c [new file with mode: 0644]
drivers/infiniband/hw/mlx4/user.h [new file with mode: 0644]
drivers/net/Kconfig
drivers/net/Makefile
drivers/net/mlx4/Makefile [new file with mode: 0644]
drivers/net/mlx4/alloc.c [new file with mode: 0644]
drivers/net/mlx4/catas.c [new file with mode: 0644]
drivers/net/mlx4/cmd.c [new file with mode: 0644]
drivers/net/mlx4/cq.c [new file with mode: 0644]
drivers/net/mlx4/eq.c [new file with mode: 0644]
drivers/net/mlx4/fw.c [new file with mode: 0644]
drivers/net/mlx4/fw.h [new file with mode: 0644]
drivers/net/mlx4/icm.c [new file with mode: 0644]
drivers/net/mlx4/icm.h [new file with mode: 0644]
drivers/net/mlx4/intf.c [new file with mode: 0644]
drivers/net/mlx4/main.c [new file with mode: 0644]
drivers/net/mlx4/mcg.c [new file with mode: 0644]
drivers/net/mlx4/mlx4.h [new file with mode: 0644]
drivers/net/mlx4/mr.c [new file with mode: 0644]
drivers/net/mlx4/pd.c [new file with mode: 0644]
drivers/net/mlx4/profile.c [new file with mode: 0644]
drivers/net/mlx4/qp.c [new file with mode: 0644]
drivers/net/mlx4/reset.c [new file with mode: 0644]
drivers/net/mlx4/srq.c [new file with mode: 0644]
include/linux/mlx4/cmd.h [new file with mode: 0644]
include/linux/mlx4/cq.h [new file with mode: 0644]
include/linux/mlx4/device.h [new file with mode: 0644]
include/linux/mlx4/doorbell.h [new file with mode: 0644]
include/linux/mlx4/driver.h [new file with mode: 0644]
include/linux/mlx4/qp.h [new file with mode: 0644]
include/linux/mlx4/srq.h [new file with mode: 0644]

index 82afba5..37deaae 100644 (file)
@@ -45,6 +45,8 @@ source "drivers/infiniband/hw/ehca/Kconfig"
 source "drivers/infiniband/hw/amso1100/Kconfig"
 source "drivers/infiniband/hw/cxgb3/Kconfig"
 
+source "drivers/infiniband/hw/mlx4/Kconfig"
+
 source "drivers/infiniband/ulp/ipoib/Kconfig"
 
 source "drivers/infiniband/ulp/srp/Kconfig"
index da2066c..75f325e 100644 (file)
@@ -4,6 +4,7 @@ obj-$(CONFIG_INFINIBAND_IPATH)          += hw/ipath/
 obj-$(CONFIG_INFINIBAND_EHCA)          += hw/ehca/
 obj-$(CONFIG_INFINIBAND_AMSO1100)      += hw/amso1100/
 obj-$(CONFIG_INFINIBAND_CXGB3)         += hw/cxgb3/
+obj-$(CONFIG_MLX4_INFINIBAND)          += hw/mlx4/
 obj-$(CONFIG_INFINIBAND_IPOIB)         += ulp/ipoib/
 obj-$(CONFIG_INFINIBAND_SRP)           += ulp/srp/
 obj-$(CONFIG_INFINIBAND_ISER)          += ulp/iser/
diff --git a/drivers/infiniband/hw/mlx4/Kconfig b/drivers/infiniband/hw/mlx4/Kconfig
new file mode 100644 (file)
index 0000000..b8912cd
--- /dev/null
@@ -0,0 +1,9 @@
+config MLX4_INFINIBAND
+       tristate "Mellanox ConnectX HCA support"
+       depends on INFINIBAND
+       select MLX4_CORE
+       ---help---
+         This driver provides low-level InfiniBand support for
+         Mellanox ConnectX PCI Express host channel adapters (HCAs).
+         This is required to use InfiniBand protocols such as
+         IP-over-IB or SRP with these devices.
diff --git a/drivers/infiniband/hw/mlx4/Makefile b/drivers/infiniband/hw/mlx4/Makefile
new file mode 100644 (file)
index 0000000..70f09c7
--- /dev/null
@@ -0,0 +1,3 @@
+obj-$(CONFIG_MLX4_INFINIBAND)  += mlx4_ib.o
+
+mlx4_ib-y :=   ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o
diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c
new file mode 100644 (file)
index 0000000..c75ac94
--- /dev/null
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "mlx4_ib.h"
+
+/*
+ * Allocate an address handle and fill in its address vector from
+ * ah_attr.  Returns the embedded ib_ah on success or
+ * ERR_PTR(-ENOMEM) if the allocation fails.
+ */
+struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+       struct mlx4_dev *mdev = to_mdev(pd->device)->dev;
+       struct mlx4_ib_ah *new_ah;
+
+       new_ah = kmalloc(sizeof *new_ah, GFP_ATOMIC);
+       if (new_ah == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       /* Only the address vector is cleared; the rest is left as allocated. */
+       memset(&new_ah->av, 0, sizeof new_ah->av);
+
+       new_ah->av.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24));
+       new_ah->av.g_slid  = ah_attr->src_path_bits;
+       new_ah->av.dlid    = cpu_to_be16(ah_attr->dlid);
+       new_ah->av.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28);
+
+       if (ah_attr->static_rate) {
+               /*
+                * Lower the encoded rate until its bit is set in the
+                * device's stat_rate_support mask, but never below the
+                * 2.5 Gbps encoding.
+                */
+               new_ah->av.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET;
+               while (new_ah->av.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
+                      !(1 << new_ah->av.stat_rate & mdev->caps.stat_rate_support))
+                       --new_ah->av.stat_rate;
+       }
+
+       if (!(ah_attr->ah_flags & IB_AH_GRH))
+               return &new_ah->ibah;
+
+       /* GRH requested: flag it in g_slid and copy the global routing fields. */
+       new_ah->av.g_slid   |= 0x80;
+       new_ah->av.gid_index = ah_attr->grh.sgid_index;
+       new_ah->av.hop_limit = ah_attr->grh.hop_limit;
+       new_ah->av.sl_tclass_flowlabel |=
+               cpu_to_be32((ah_attr->grh.traffic_class << 20) |
+                           ah_attr->grh.flow_label);
+       memcpy(new_ah->av.dgid, ah_attr->grh.dgid.raw, 16);
+
+       return &new_ah->ibah;
+}
+
+/*
+ * Decode the address vector stored in an address handle back into
+ * *ah_attr.  ah_attr is zeroed first; always returns 0.
+ */
+int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
+{
+       struct mlx4_ib_ah *mah = to_mah(ibah);
+       u32 sl_tclass_fl = be32_to_cpu(mah->av.sl_tclass_flowlabel);
+
+       memset(ah_attr, 0, sizeof *ah_attr);
+
+       ah_attr->dlid          = be16_to_cpu(mah->av.dlid);
+       ah_attr->sl            = sl_tclass_fl >> 28;
+       ah_attr->port_num      = be32_to_cpu(mah->av.port_pd) >> 24;
+       ah_attr->src_path_bits = mah->av.g_slid & 0x7F;
+
+       /* A zero stat_rate is left as zero rather than un-offset. */
+       if (mah->av.stat_rate)
+               ah_attr->static_rate = mah->av.stat_rate - MLX4_STAT_RATE_OFFSET;
+
+       if (!mlx4_ib_ah_grh_present(mah))
+               return 0;
+
+       ah_attr->ah_flags          = IB_AH_GRH;
+       ah_attr->grh.traffic_class = sl_tclass_fl >> 20;
+       ah_attr->grh.flow_label    = sl_tclass_fl & 0xfffff;
+       ah_attr->grh.hop_limit     = mah->av.hop_limit;
+       ah_attr->grh.sgid_index    = mah->av.gid_index;
+       memcpy(ah_attr->grh.dgid.raw, mah->av.dgid, 16);
+
+       return 0;
+}
+
+/* Free the driver structure that embeds this address handle; always returns 0. */
+int mlx4_ib_destroy_ah(struct ib_ah *ah)
+{
+       struct mlx4_ib_ah *mah = to_mah(ah);
+
+       kfree(mah);
+
+       return 0;
+}
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
new file mode 100644 (file)
index 0000000..b2a290c
--- /dev/null
@@ -0,0 +1,525 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx4/cq.h>
+#include <linux/mlx4/qp.h>
+
+#include "mlx4_ib.h"
+#include "user.h"
+
+/* Completion callback: forward to the comp_handler registered on the ib_cq. */
+static void mlx4_ib_cq_comp(struct mlx4_cq *cq)
+{
+       struct ib_cq *verbs_cq = &to_mibcq(cq)->ibcq;
+
+       verbs_cq->comp_handler(verbs_cq, verbs_cq->cq_context);
+}
+
+/*
+ * Asynchronous event callback for a CQ.  Only CQ error events are
+ * expected; they are reported as IB_EVENT_CQ_ERR to the ib_cq's
+ * event_handler if one is set.  Any other type is logged and dropped.
+ */
+static void mlx4_ib_cq_event(struct mlx4_cq *cq, enum mlx4_event type)
+{
+       struct ib_cq *verbs_cq;
+       struct ib_event ev;
+
+       if (type != MLX4_EVENT_TYPE_CQ_ERROR) {
+               printk(KERN_WARNING "mlx4_ib: Unexpected event type %d "
+                      "on CQ %06x\n", type, cq->cqn);
+               return;
+       }
+
+       verbs_cq = &to_mibcq(cq)->ibcq;
+       if (!verbs_cq->event_handler)
+               return;
+
+       ev.device     = verbs_cq->device;
+       ev.event      = IB_EVENT_CQ_ERR;
+       ev.element.cq = verbs_cq;
+       verbs_cq->event_handler(&ev, verbs_cq->cq_context);
+}
+
+/*
+ * Return the address of CQE number n inside buf, which is either one
+ * direct buffer (nbufs == 1) or a list of page-sized buffers.
+ */
+static void *get_cqe_from_buf(struct mlx4_ib_cq_buf *buf, int n)
+{
+       int byte_off = n * sizeof (struct mlx4_cqe);
+
+       if (buf->buf.nbufs != 1)
+               return buf->buf.u.page_list[byte_off >> PAGE_SHIFT].buf +
+                       (byte_off & (PAGE_SIZE - 1));
+
+       return buf->buf.u.direct.buf + byte_off;
+}
+
+/* Return the address of CQE number n in this CQ's own buffer. */
+static void *get_cqe(struct mlx4_ib_cq *cq, int n)
+{
+       struct mlx4_ib_cq_buf *cq_buf = &cq->buf;
+
+       return get_cqe_from_buf(cq_buf, n);
+}
+
+/*
+ * Look at the CQE for running index n.  The entry is returned only
+ * when its owner bit equals the value expected for this pass over the
+ * ring (bit "cqe + 1" of n); otherwise NULL is returned.
+ */
+static void *get_sw_cqe(struct mlx4_ib_cq *cq, int n)
+{
+       struct mlx4_cqe *entry = get_cqe(cq, n & cq->ibcq.cqe);
+       int owner_set  = !!(entry->owner_sr_opcode & MLX4_CQE_OWNER_MASK);
+       int wrap_phase = !!(n & (cq->ibcq.cqe + 1));
+
+       if (owner_set != wrap_phase)
+               return NULL;
+
+       return entry;
+}
+
+/* Return the CQE at the current consumer index, or NULL if get_sw_cqe() rejects it. */
+static struct mlx4_cqe *next_cqe_sw(struct mlx4_ib_cq *cq)
+{
+       struct mlx4_cqe *next = get_sw_cqe(cq, cq->mcq.cons_index);
+
+       return next;
+}
+
+/*
+ * Create a completion queue.
+ *
+ * The requested size is bumped by one and rounded up to a power of
+ * two; ibcq.cqe is set to that size minus one.  For a userspace CQ
+ * (context != NULL) the buffer and doorbell addresses come from the
+ * command copied in through udata, and the CQ number is copied back
+ * out.  For a kernel CQ the doorbell record and buffer are allocated
+ * here.
+ *
+ * Returns the new ib_cq, or an ERR_PTR() on failure with everything
+ * acquired so far released again.
+ */
+struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector,
+                               struct ib_ucontext *context,
+                               struct ib_udata *udata)
+{
+       struct mlx4_ib_dev *dev = to_mdev(ibdev);
+       struct mlx4_ib_cq *cq;
+       struct mlx4_uar *uar;
+       int buf_size;
+       int err;
+
+       if (entries < 1 || entries > dev->dev->caps.max_cqes)
+               return ERR_PTR(-EINVAL);
+
+       cq = kmalloc(sizeof *cq, GFP_KERNEL);
+       if (!cq)
+               return ERR_PTR(-ENOMEM);
+
+       entries      = roundup_pow_of_two(entries + 1);
+       cq->ibcq.cqe = entries - 1;
+       buf_size     = entries * sizeof (struct mlx4_cqe);
+       spin_lock_init(&cq->lock);
+
+       if (context) {
+               struct mlx4_ib_create_cq ucmd;
+
+               if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+                       err = -EFAULT;
+                       goto err_cq;
+               }
+
+               cq->umem = ib_umem_get(context, ucmd.buf_addr, buf_size,
+                                      IB_ACCESS_LOCAL_WRITE);
+               if (IS_ERR(cq->umem)) {
+                       err = PTR_ERR(cq->umem);
+                       goto err_cq;
+               }
+
+               err = mlx4_mtt_init(dev->dev, ib_umem_page_count(cq->umem),
+                                   ilog2(cq->umem->page_size), &cq->buf.mtt);
+               if (err)
+                       goto err_buf;
+
+               err = mlx4_ib_umem_write_mtt(dev, &cq->buf.mtt, cq->umem);
+               if (err)
+                       goto err_mtt;
+
+               err = mlx4_ib_db_map_user(to_mucontext(context), ucmd.db_addr,
+                                         &cq->db);
+               if (err)
+                       goto err_mtt;
+
+               uar = &to_mucontext(context)->uar;
+       } else {
+               err = mlx4_ib_db_alloc(dev, &cq->db, 1);
+               if (err)
+                       goto err_cq;
+
+               /* The order-1 doorbell gives two words: consumer index and arm. */
+               cq->mcq.set_ci_db  = cq->db.db;
+               cq->mcq.arm_db     = cq->db.db + 1;
+               *cq->mcq.set_ci_db = 0;
+               *cq->mcq.arm_db    = 0;
+
+               if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &cq->buf.buf)) {
+                       err = -ENOMEM;
+                       goto err_db;
+               }
+
+               err = mlx4_mtt_init(dev->dev, cq->buf.buf.npages, cq->buf.buf.page_shift,
+                                   &cq->buf.mtt);
+               if (err)
+                       goto err_buf;
+
+               err = mlx4_buf_write_mtt(dev->dev, &cq->buf.mtt, &cq->buf.buf);
+               if (err)
+                       goto err_mtt;
+
+               uar = &dev->priv_uar;
+       }
+
+       err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar,
+                           cq->db.dma, &cq->mcq);
+       if (err)
+               goto err_dbmap;
+
+       cq->mcq.comp  = mlx4_ib_cq_comp;
+       cq->mcq.event = mlx4_ib_cq_event;
+
+       if (context)
+               if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof (__u32))) {
+                       err = -EFAULT;
+                       /* The CQ was already allocated, so it must be freed too. */
+                       goto err_cq_free;
+               }
+
+       return &cq->ibcq;
+
+err_cq_free:
+       mlx4_cq_free(dev->dev, &cq->mcq);
+
+err_dbmap:
+       if (context)
+               mlx4_ib_db_unmap_user(to_mucontext(context), &cq->db);
+
+err_mtt:
+       mlx4_mtt_cleanup(dev->dev, &cq->buf.mtt);
+
+err_buf:
+       if (context)
+               ib_umem_release(cq->umem);
+       else
+               mlx4_buf_free(dev->dev, entries * sizeof (struct mlx4_cqe),
+                             &cq->buf.buf);
+
+err_db:
+       if (!context)
+               mlx4_ib_db_free(dev, &cq->db);
+
+err_cq:
+       kfree(cq);
+
+       return ERR_PTR(err);
+}
+
+/*
+ * Tear down a CQ: free the CQ and its MTT, then release either the
+ * user mappings (when the CQ has a uobject) or the kernel buffer and
+ * doorbell record.  Always returns 0.
+ */
+int mlx4_ib_destroy_cq(struct ib_cq *cq)
+{
+       struct mlx4_ib_dev *dev = to_mdev(cq->device);
+       struct mlx4_ib_cq *mcq = to_mcq(cq);
+
+       mlx4_cq_free(dev->dev, &mcq->mcq);
+       mlx4_mtt_cleanup(dev->dev, &mcq->buf.mtt);
+
+       if (!cq->uobject) {
+               int buf_size = (cq->cqe + 1) * sizeof (struct mlx4_cqe);
+
+               mlx4_buf_free(dev->dev, buf_size, &mcq->buf.buf);
+               mlx4_ib_db_free(dev, &mcq->db);
+       } else {
+               mlx4_ib_db_unmap_user(to_mucontext(cq->uobject->context), &mcq->db);
+               ib_umem_release(mcq->umem);
+       }
+
+       kfree(mcq);
+
+       return 0;
+}
+
+/* Log the first eight big-endian 32-bit words of a CQE at debug level. */
+static void dump_cqe(void *cqe)
+{
+       __be32 *w = cqe;
+
+       printk(KERN_DEBUG "CQE contents %08x %08x %08x %08x %08x %08x %08x %08x\n",
+              be32_to_cpu(w[0]), be32_to_cpu(w[1]),
+              be32_to_cpu(w[2]), be32_to_cpu(w[3]),
+              be32_to_cpu(w[4]), be32_to_cpu(w[5]),
+              be32_to_cpu(w[6]), be32_to_cpu(w[7]));
+}
+
+/*
+ * Translate the syndrome of an error CQE into wc->status and copy the
+ * vendor syndrome into wc->vendor_err.  Unknown syndromes map to
+ * IB_WC_GENERAL_ERR.  A local QP operation error is additionally
+ * logged and dumped at debug level.
+ */
+static void mlx4_ib_handle_error_cqe(struct mlx4_err_cqe *cqe,
+                                    struct ib_wc *wc)
+{
+       switch (cqe->syndrome) {
+       case MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR:
+               printk(KERN_DEBUG "local QP operation err "
+                      "(QPN %06x, WQE index %x, vendor syndrome %02x, "
+                      "opcode = %02x)\n",
+                      be32_to_cpu(cqe->my_qpn), be16_to_cpu(cqe->wqe_index),
+                      cqe->vendor_err_syndrome,
+                      cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK);
+               dump_cqe(cqe);
+               wc->status = IB_WC_LOC_QP_OP_ERR;
+               break;
+       case MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR:
+               wc->status = IB_WC_LOC_LEN_ERR;
+               break;
+       case MLX4_CQE_SYNDROME_LOCAL_PROT_ERR:
+               wc->status = IB_WC_LOC_PROT_ERR;
+               break;
+       case MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR:
+               wc->status = IB_WC_LOC_ACCESS_ERR;
+               break;
+       case MLX4_CQE_SYNDROME_WR_FLUSH_ERR:
+               wc->status = IB_WC_WR_FLUSH_ERR;
+               break;
+       case MLX4_CQE_SYNDROME_MW_BIND_ERR:
+               wc->status = IB_WC_MW_BIND_ERR;
+               break;
+       case MLX4_CQE_SYNDROME_BAD_RESP_ERR:
+               wc->status = IB_WC_BAD_RESP_ERR;
+               break;
+       case MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR:
+               wc->status = IB_WC_REM_INV_REQ_ERR;
+               break;
+       case MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR:
+               wc->status = IB_WC_REM_ACCESS_ERR;
+               break;
+       case MLX4_CQE_SYNDROME_REMOTE_OP_ERR:
+               wc->status = IB_WC_REM_OP_ERR;
+               break;
+       case MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR:
+               wc->status = IB_WC_REM_ABORT_ERR;
+               break;
+       case MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR:
+               wc->status = IB_WC_RETRY_EXC_ERR;
+               break;
+       case MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR:
+               wc->status = IB_WC_RNR_RETRY_EXC_ERR;
+               break;
+       default:
+               wc->status = IB_WC_GENERAL_ERR;
+               break;
+       }
+
+       wc->vendor_err = cqe->vendor_err_syndrome;
+}
+
+/*
+ * Consume one CQE from the CQ and translate it into *wc.
+ *
+ * *cur_qp caches the QP of the previous completion so that a run of
+ * completions for the same QP needs only one table lookup; it is
+ * updated whenever the CQE names a different QP.
+ *
+ * Returns 0 when *wc was filled in, -EAGAIN when no CQE is available,
+ * and -EINVAL when the CQE refers to a QPN that cannot be found.
+ */
+static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
+                           struct mlx4_ib_qp **cur_qp,
+                           struct ib_wc *wc)
+{
+       struct mlx4_cqe *cqe;
+       struct mlx4_qp *mqp;
+       struct mlx4_ib_wq *wq;
+       struct mlx4_ib_srq *srq;
+       int is_send;
+       int is_error;
+       u16 wqe_ctr;
+
+       cqe = next_cqe_sw(cq);
+       if (!cqe)
+               return -EAGAIN;
+
+       ++cq->mcq.cons_index;
+
+       /*
+        * Make sure we read CQ entry contents after we've checked the
+        * ownership bit.
+        */
+       rmb();
+
+       is_send  = cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK;
+       is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
+               MLX4_CQE_OPCODE_ERROR;
+
+       if (!*cur_qp ||
+           (be32_to_cpu(cqe->my_qpn) & 0xffffff) != (*cur_qp)->mqp.qpn) {
+               /*
+                * We do not have to take the QP table lock here,
+                * because CQs will be locked while QPs are removed
+                * from the table.
+                */
+               mqp = __mlx4_qp_lookup(to_mdev(cq->ibcq.device)->dev,
+                                      be32_to_cpu(cqe->my_qpn));
+               if (unlikely(!mqp)) {
+                       printk(KERN_WARNING "CQ %06x with entry for unknown QPN %06x\n",
+                              cq->mcq.cqn, be32_to_cpu(cqe->my_qpn) & 0xffffff);
+                       return -EINVAL;
+               }
+
+               *cur_qp = to_mibqp(mqp);
+       }
+
+       wc->qp = &(*cur_qp)->ibqp;
+
+       if (is_send) {
+               wq = &(*cur_qp)->sq;
+               wqe_ctr = be16_to_cpu(cqe->wqe_index);
+               /*
+                * The CQE only carries a 16-bit WQE counter.  Reduce
+                * the distance modulo 2^16 so that tail still moves
+                * forward when the counter wraps; without the cast the
+                * difference is a negative int and tail moves backwards.
+                */
+               wq->tail += (u16) (wqe_ctr - (u16) wq->tail);
+               wc->wr_id = wq->wrid[wq->tail & (wq->max - 1)];
+               ++wq->tail;
+       } else if ((*cur_qp)->ibqp.srq) {
+               srq = to_msrq((*cur_qp)->ibqp.srq);
+               wqe_ctr = be16_to_cpu(cqe->wqe_index);
+               wc->wr_id = srq->wrid[wqe_ctr];
+               mlx4_ib_free_srq_wqe(srq, wqe_ctr);
+       } else {
+               wq        = &(*cur_qp)->rq;
+               wc->wr_id = wq->wrid[wq->tail & (wq->max - 1)];
+               ++wq->tail;
+       }
+
+       if (unlikely(is_error)) {
+               mlx4_ib_handle_error_cqe((struct mlx4_err_cqe *) cqe, wc);
+               return 0;
+       }
+
+       wc->status = IB_WC_SUCCESS;
+
+       if (is_send) {
+               wc->wc_flags = 0;
+               switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+               case MLX4_OPCODE_RDMA_WRITE_IMM:
+                       wc->wc_flags |= IB_WC_WITH_IMM;
+                       /* fall through */
+               case MLX4_OPCODE_RDMA_WRITE:
+                       wc->opcode    = IB_WC_RDMA_WRITE;
+                       break;
+               case MLX4_OPCODE_SEND_IMM:
+                       wc->wc_flags |= IB_WC_WITH_IMM;
+                       /* fall through */
+               case MLX4_OPCODE_SEND:
+                       wc->opcode    = IB_WC_SEND;
+                       break;
+               case MLX4_OPCODE_RDMA_READ:
+                       /* An RDMA read must complete as a read, not a send. */
+                       wc->opcode    = IB_WC_RDMA_READ;
+                       wc->byte_len  = be32_to_cpu(cqe->byte_cnt);
+                       break;
+               case MLX4_OPCODE_ATOMIC_CS:
+                       wc->opcode    = IB_WC_COMP_SWAP;
+                       wc->byte_len  = 8;
+                       break;
+               case MLX4_OPCODE_ATOMIC_FA:
+                       wc->opcode    = IB_WC_FETCH_ADD;
+                       wc->byte_len  = 8;
+                       break;
+               case MLX4_OPCODE_BIND_MW:
+                       wc->opcode    = IB_WC_BIND_MW;
+                       break;
+               }
+       } else {
+               wc->byte_len = be32_to_cpu(cqe->byte_cnt);
+
+               switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+               case MLX4_RECV_OPCODE_RDMA_WRITE_IMM:
+                       wc->opcode   = IB_WC_RECV_RDMA_WITH_IMM;
+                       wc->wc_flags = IB_WC_WITH_IMM;
+                       wc->imm_data = cqe->immed_rss_invalid;
+                       break;
+               case MLX4_RECV_OPCODE_SEND:
+                       wc->opcode   = IB_WC_RECV;
+                       wc->wc_flags = 0;
+                       break;
+               case MLX4_RECV_OPCODE_SEND_IMM:
+                       wc->opcode   = IB_WC_RECV;
+                       wc->wc_flags = IB_WC_WITH_IMM;
+                       wc->imm_data = cqe->immed_rss_invalid;
+                       break;
+               }
+
+               wc->slid           = be16_to_cpu(cqe->rlid);
+               wc->sl             = cqe->sl >> 4;
+               wc->src_qp         = be32_to_cpu(cqe->g_mlpath_rqpn) & 0xffffff;
+               wc->dlid_path_bits = (be32_to_cpu(cqe->g_mlpath_rqpn) >> 24) & 0x7f;
+               wc->wc_flags      |= be32_to_cpu(cqe->g_mlpath_rqpn) & 0x80000000 ?
+                       IB_WC_GRH : 0;
+               wc->pkey_index     = be32_to_cpu(cqe->immed_rss_invalid) >> 16;
+       }
+
+       return 0;
+}
+
+/*
+ * Poll up to num_entries completions into wc[] under the CQ lock.
+ * Returns the number of completions written; a hard error from
+ * mlx4_ib_poll_one() (anything other than -EAGAIN) is returned as is.
+ */
+int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
+{
+       struct mlx4_ib_cq *cq = to_mcq(ibcq);
+       struct mlx4_ib_qp *cur_qp = NULL;
+       unsigned long flags;
+       int npolled = 0;
+       int err = 0;
+
+       spin_lock_irqsave(&cq->lock, flags);
+
+       while (npolled < num_entries) {
+               err = mlx4_ib_poll_one(cq, &cur_qp, &wc[npolled]);
+               if (err)
+                       break;
+               ++npolled;
+       }
+
+       /* Publish the new consumer index only if something was consumed. */
+       if (npolled)
+               mlx4_cq_set_ci(&cq->mcq);
+
+       spin_unlock_irqrestore(&cq->lock, flags);
+
+       if (err && err != -EAGAIN)
+               return err;
+
+       return npolled;
+}
+
+/*
+ * Arm the CQ for notification: solicited-only when the solicited
+ * bits of flags equal IB_CQ_SOLICITED, any completion otherwise.
+ * Always returns 0.
+ */
+int mlx4_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
+{
+       struct mlx4_ib_dev *dev = to_mdev(ibcq->device);
+
+       mlx4_cq_arm(&to_mcq(ibcq)->mcq,
+                   (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ?
+                   MLX4_CQ_DB_REQ_NOT_SOL : MLX4_CQ_DB_REQ_NOT,
+                   dev->uar_map,
+                   MLX4_GET_DOORBELL_LOCK(&dev->uar_lock));
+
+       return 0;
+}
+
+/*
+ * Remove every CQE that belongs to QP number qpn from the CQ.  If srq
+ * is given, the SRQ WQE of each removed receive completion is freed.
+ *
+ * This variant takes no lock itself; mlx4_ib_cq_clean() is the
+ * wrapper that calls it with cq->lock held.
+ */
+void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq)
+{
+       u32 prod_index;
+       int nfreed = 0;
+       struct mlx4_cqe *cqe, *dest;
+       int owner_bit;
+
+       /*
+        * First we need to find the current producer index, so we
+        * know where to start cleaning from.  It doesn't matter if HW
+        * adds new entries after this loop -- the QP we're worried
+        * about is already in RESET, so the new entries won't come
+        * from our QP and therefore don't need to be checked.
+        */
+       for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); ++prod_index)
+               if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe)
+                       break;
+
+       /*
+        * Now sweep backwards through the CQ, removing CQ entries
+        * that match our QP by copying older entries on top of them.
+        */
+       while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) {
+               cqe = get_cqe(cq, prod_index & cq->ibcq.cqe);
+               if ((be32_to_cpu(cqe->my_qpn) & 0xffffff) == qpn) {
+                       if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK))
+                               mlx4_ib_free_srq_wqe(srq, be16_to_cpu(cqe->wqe_index));
+                       ++nfreed;
+               } else if (nfreed) {
+                       /*
+                        * The expected owner bit depends on the slot's
+                        * position in the ring, not on the entry stored
+                        * there.  Keep the destination's bit so the moved
+                        * entry is still accepted by get_sw_cqe() when the
+                        * move crosses a wrap of the ring.
+                        */
+                       dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe);
+                       owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK;
+                       memcpy(dest, cqe, sizeof *cqe);
+                       dest->owner_sr_opcode = owner_bit |
+                               (dest->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK);
+               }
+       }
+
+       if (nfreed) {
+               cq->mcq.cons_index += nfreed;
+               /*
+                * Make sure update of buffer contents is done before
+                * updating consumer index.
+                */
+               wmb();
+               mlx4_cq_set_ci(&cq->mcq);
+       }
+}
+
+/*
+ * Locked wrapper around __mlx4_ib_cq_clean(): takes cq->lock with
+ * interrupts disabled for the duration of the clean.
+ */
+void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq)
+{
+       spin_lock_irq(&cq->lock);
+       __mlx4_ib_cq_clean(cq, qpn, srq);
+       spin_unlock_irq(&cq->lock);
+}
diff --git a/drivers/infiniband/hw/mlx4/doorbell.c b/drivers/infiniband/hw/mlx4/doorbell.c
new file mode 100644 (file)
index 0000000..1c36087
--- /dev/null
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+
+#include "mlx4_ib.h"
+
+/*
+ * One page of doorbell records plus the bitmaps that track which
+ * records in it are free.  A set bit means "free": the allocator
+ * searches for a set bit and clears it when handing a record out.
+ */
+struct mlx4_ib_db_pgdir {
+       /* Link in the device's pgdir_list. */
+       struct list_head        list;
+       /* Free single (order-0) records; starts out empty. */
+       DECLARE_BITMAP(order0, MLX4_IB_DB_PER_PAGE);
+       /* Free pairs (order-1) of records; starts out full. */
+       DECLARE_BITMAP(order1, MLX4_IB_DB_PER_PAGE / 2);
+       /* bits[o] points at the bitmap for order o. */
+       unsigned long          *bits[2];
+       /* The page of records, from dma_alloc_coherent(), and its DMA address. */
+       __be32                 *db_page;
+       dma_addr_t              db_dma;
+};
+
+/*
+ * Allocate a new doorbell page directory with one coherent DMA page,
+ * with every order-1 pair marked free.  Returns NULL if either
+ * allocation fails.
+ */
+static struct mlx4_ib_db_pgdir *mlx4_ib_alloc_db_pgdir(struct mlx4_ib_dev *dev)
+{
+       struct mlx4_ib_db_pgdir *dir = kzalloc(sizeof *dir, GFP_KERNEL);
+
+       if (!dir)
+               return NULL;
+
+       dir->db_page = dma_alloc_coherent(dev->ib_dev.dma_device,
+                                         PAGE_SIZE, &dir->db_dma,
+                                         GFP_KERNEL);
+       if (!dir->db_page) {
+               kfree(dir);
+               return NULL;
+       }
+
+       /* order0 stays all-zero from kzalloc(); only whole pairs are free. */
+       bitmap_fill(dir->order1, MLX4_IB_DB_PER_PAGE / 2);
+       dir->bits[0] = dir->order0;
+       dir->bits[1] = dir->order1;
+
+       return dir;
+}
+
+/*
+ * Try to carve a doorbell record of the given order out of pgdir and
+ * describe it in *db.  Looks for a free slot at the requested order
+ * first and then at order 1; when an order-1 pair is split for an
+ * order-0 request, the unused half is marked free at order 0.
+ *
+ * Returns 0 on success or -ENOMEM if the page has no suitable slot.
+ */
+static int mlx4_ib_alloc_db_from_pgdir(struct mlx4_ib_db_pgdir *pgdir,
+                                      struct mlx4_ib_db *db, int order)
+{
+       int lvl;
+       int idx;
+
+       for (lvl = order; lvl <= 1; ++lvl) {
+               idx = find_first_bit(pgdir->bits[lvl], MLX4_IB_DB_PER_PAGE >> lvl);
+               if (idx < MLX4_IB_DB_PER_PAGE >> lvl)
+                       break;
+       }
+
+       /* Ran off the end of the orders without finding a free bit. */
+       if (lvl > 1)
+               return -ENOMEM;
+
+       clear_bit(idx, pgdir->bits[lvl]);
+
+       /* Convert the bitmap position into an index in units of records. */
+       idx <<= lvl;
+
+       if (lvl > order)
+               set_bit(idx ^ 1, pgdir->bits[order]);
+
+       db->u.pgdir = pgdir;
+       db->index   = idx;
+       db->db      = pgdir->db_page + db->index;
+       db->dma     = pgdir->db_dma  + db->index * 4;
+       db->order   = order;
+
+       return 0;
+}
+
+/*
+ * Allocate a kernel doorbell record of the given order.  Existing
+ * page directories are tried first; if none has room, a new one is
+ * allocated and added to the device's list.
+ *
+ * Returns 0 on success or -ENOMEM if a new page directory is needed
+ * and cannot be allocated.
+ */
+int mlx4_ib_db_alloc(struct mlx4_ib_dev *dev, struct mlx4_ib_db *db, int order)
+{
+       struct mlx4_ib_db_pgdir *pgdir;
+       int ret = 0;
+
+       mutex_lock(&dev->pgdir_mutex);
+
+       list_for_each_entry(pgdir, &dev->pgdir_list, list) {
+               if (mlx4_ib_alloc_db_from_pgdir(pgdir, db, order) == 0)
+                       goto unlock;
+       }
+
+       pgdir = mlx4_ib_alloc_db_pgdir(dev);
+       if (pgdir) {
+               list_add(&pgdir->list, &dev->pgdir_list);
+
+               /* This should never fail -- we just allocated an empty page: */
+               WARN_ON(mlx4_ib_alloc_db_from_pgdir(pgdir, db, order));
+       } else {
+               ret = -ENOMEM;
+       }
+
+unlock:
+       mutex_unlock(&dev->pgdir_mutex);
+
+       return ret;
+}
+
+/*
+ * Return a kernel doorbell record to its page directory.  An order-0
+ * record whose neighbour is also free is merged back into an order-1
+ * pair.  When every pair in the page is free again, the page and its
+ * directory are released.
+ */
+void mlx4_ib_db_free(struct mlx4_ib_dev *dev, struct mlx4_ib_db *db)
+{
+       struct mlx4_ib_db_pgdir *pgdir;
+       int lvl;
+       int idx;
+
+       mutex_lock(&dev->pgdir_mutex);
+
+       pgdir = db->u.pgdir;
+       lvl   = db->order;
+       idx   = db->index;
+
+       if (lvl == 0 && test_bit(idx ^ 1, pgdir->order0)) {
+               /* Neighbour is free too: take it back and free the pair. */
+               clear_bit(idx ^ 1, pgdir->order0);
+               lvl = 1;
+       }
+
+       idx >>= lvl;
+       set_bit(idx, pgdir->bits[lvl]);
+
+       if (bitmap_full(pgdir->order1, MLX4_IB_DB_PER_PAGE / 2)) {
+               dma_free_coherent(dev->ib_dev.dma_device, PAGE_SIZE,
+                                 pgdir->db_page, pgdir->db_dma);
+               list_del(&pgdir->list);
+               kfree(pgdir);
+       }
+
+       mutex_unlock(&dev->pgdir_mutex);
+}
+
+/*
+ * One userspace page holding doorbell records, shared by every
+ * doorbell in that page that has been mapped for a given ucontext.
+ */
+struct mlx4_ib_user_db_page {
+       /* Link in the ucontext's db_page_list. */
+       struct list_head        list;
+       /* Result of ib_umem_get() on the page. */
+       struct ib_umem         *umem;
+       /* Page-aligned user virtual address, used as the lookup key. */
+       unsigned long           user_virt;
+       /* Number of doorbells currently mapped from this page. */
+       int                     refcnt;
+};
+
+/*
+ * Map the userspace doorbell record at address virt and store its DMA
+ * address in db->dma.  The page containing virt is looked up in the
+ * context's list and set up with ib_umem_get() only on first use;
+ * each successful call takes one reference on the page.
+ *
+ * Returns 0 on success, -ENOMEM if the tracking structure cannot be
+ * allocated, or the error from ib_umem_get().
+ */
+int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt,
+                       struct mlx4_ib_db *db)
+{
+       unsigned long page_virt = virt & PAGE_MASK;
+       struct mlx4_ib_user_db_page *page;
+       struct ib_umem_chunk *chunk;
+       int err = 0;
+
+       mutex_lock(&context->db_page_mutex);
+
+       list_for_each_entry(page, &context->db_page_list, list) {
+               if (page->user_virt == page_virt)
+                       goto found;
+       }
+
+       page = kmalloc(sizeof *page, GFP_KERNEL);
+       if (!page) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       page->refcnt    = 0;
+       page->user_virt = page_virt;
+       page->umem      = ib_umem_get(&context->ibucontext, page_virt,
+                                     PAGE_SIZE, 0);
+       if (IS_ERR(page->umem)) {
+               err = PTR_ERR(page->umem);
+               kfree(page);
+               goto out;
+       }
+
+       list_add(&page->list, &context->db_page_list);
+
+found:
+       /* DMA address of the page's first chunk plus the offset within the page. */
+       chunk = list_entry(page->umem->chunk_list.next, struct ib_umem_chunk, list);
+       db->dma         = sg_dma_address(chunk->page_list) + (virt & ~PAGE_MASK);
+       db->u.user_page = page;
+       ++page->refcnt;
+
+out:
+       mutex_unlock(&context->db_page_mutex);
+
+       return err;
+}
+
+/*
+ * Drop one reference on the user doorbell page behind db.  When the
+ * last reference goes away the page is unlinked, its umem released,
+ * and the tracking structure freed.
+ */
+void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_ib_db *db)
+{
+       struct mlx4_ib_user_db_page *page;
+
+       mutex_lock(&context->db_page_mutex);
+
+       page = db->u.user_page;
+       if (--page->refcnt == 0) {
+               list_del(&page->list);
+               ib_umem_release(page->umem);
+               kfree(page);
+       }
+
+       mutex_unlock(&context->db_page_mutex);
+}
diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
new file mode 100644 (file)
index 0000000..3330917
--- /dev/null
@@ -0,0 +1,339 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4_ib.h"
+
+/*
+ * Management class values that mlx4_ib_process_mad() passes to the
+ * firmware in addition to the subnet management and performance
+ * management classes.
+ */
+enum {
+       MLX4_IB_VENDOR_CLASS1 = 0x9,
+       MLX4_IB_VENDOR_CLASS2 = 0xa
+};
+
+/*
+ * Hand a MAD to the firmware with the MAD_IFC command and fetch the
+ * response.
+ *
+ * @dev:          device to issue the command on
+ * @ignore_mkey:  non-zero sets op_modifier bit 0x1 (skip M_Key check)
+ * @ignore_bkey:  non-zero sets op_modifier bit 0x2 (skip B_Key check)
+ * @port:         port number, placed in the low bits of the input modifier
+ * @in_wc:        work completion of the received MAD, or NULL; when NULL
+ *                both key checks are skipped because there is nowhere to
+ *                send a key violation trap
+ * @in_grh:       GRH of the received MAD, or NULL; only used with @in_wc
+ * @in_mad:       256-byte MAD to process
+ * @response_mad: buffer that receives the 256-byte response; written only
+ *                when the command succeeds
+ *
+ * Returns 0 on success or the error from mailbox allocation or from
+ * mlx4_cmd_box().
+ */
+int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey,
+                int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
+                void *in_mad, void *response_mad)
+{
+       struct mlx4_cmd_mailbox *inmailbox, *outmailbox;
+       void *inbox;
+       int err;
+       u32 in_modifier = port;
+       u8 op_modifier = 0;
+
+       inmailbox = mlx4_alloc_cmd_mailbox(dev->dev);
+       if (IS_ERR(inmailbox))
+               return PTR_ERR(inmailbox);
+       inbox = inmailbox->buf;
+
+       outmailbox = mlx4_alloc_cmd_mailbox(dev->dev);
+       if (IS_ERR(outmailbox)) {
+               mlx4_free_cmd_mailbox(dev->dev, inmailbox);
+               return PTR_ERR(outmailbox);
+       }
+
+       memcpy(inbox, in_mad, 256);
+
+       /*
+        * Key check traps can't be generated unless we have in_wc to
+        * tell us where to send the trap.
+        */
+       if (ignore_mkey || !in_wc)
+               op_modifier |= 0x1;
+       if (ignore_bkey || !in_wc)
+               op_modifier |= 0x2;
+
+       if (in_wc) {
+               /*
+                * Extended receive information placed in the second
+                * 256 bytes of the input mailbox.
+                */
+               struct {
+                       __be32          my_qpn;
+                       u32             reserved1;
+                       __be32          rqpn;
+                       u8              sl;
+                       u8              g_path;
+                       u16             reserved2[2];
+                       __be16          pkey;
+                       u32             reserved3[11];
+                       u8              grh[40];
+               } *ext_info;
+
+               memset(inbox + 256, 0, 256);
+               ext_info = inbox + 256;
+
+               ext_info->my_qpn = cpu_to_be32(in_wc->qp->qp_num);
+               ext_info->rqpn   = cpu_to_be32(in_wc->src_qp);
+               ext_info->sl     = in_wc->sl << 4;
+               ext_info->g_path = in_wc->dlid_path_bits |
+                       (in_wc->wc_flags & IB_WC_GRH ? 0x80 : 0);
+               ext_info->pkey   = cpu_to_be16(in_wc->pkey_index);
+
+               if (in_grh)
+                       memcpy(ext_info->grh, in_grh, 40);
+
+               /* Tell the firmware that the extended info is present. */
+               op_modifier |= 0x4;
+
+               in_modifier |= in_wc->slid << 16;
+       }
+
+       err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma,
+                          in_modifier, op_modifier,
+                          MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C);
+
+       /*
+        * Only hand back the response when the command succeeded; on
+        * failure the caller's buffer is left untouched.
+        */
+       if (!err)
+               memcpy(response_mad, outmailbox->buf, 256);
+
+       mlx4_free_cmd_mailbox(dev->dev, inmailbox);
+       mlx4_free_cmd_mailbox(dev->dev, outmailbox);
+
+       return err;
+}
+
+/*
+ * Replace the cached address handle for the subnet manager on port_num
+ * with one that targets the given LID and service level.  Does nothing
+ * when the port has no SMI send agent or when the new address handle
+ * cannot be created.
+ */
+static void update_sm_ah(struct mlx4_ib_dev *dev, u8 port_num, u16 lid, u8 sl)
+{
+       struct ib_mad_agent *smi_agent = dev->send_agent[port_num - 1][0];
+       struct ib_ah_attr attr;
+       struct ib_ah *ah;
+
+       if (!smi_agent)
+               return;
+
+       memset(&attr, 0, sizeof attr);
+       attr.port_num = port_num;
+       attr.sl       = sl;
+       attr.dlid     = lid;
+
+       ah = ib_create_ah(smi_agent->qp->pd, &attr);
+       if (IS_ERR(ah))
+               return;
+
+       /* forward_trap() reads sm_ah under the same lock. */
+       spin_lock(&dev->sm_lock);
+       if (dev->sm_ah[port_num - 1])
+               ib_destroy_ah(dev->sm_ah[port_num - 1]);
+       dev->sm_ah[port_num - 1] = ah;
+       spin_unlock(&dev->sm_lock);
+}
+
+/*
+ * Inspect a subnet management Set MAD and synthesize the matching
+ * asynchronous events.  A PortInfo Set refreshes the cached SM address
+ * handle and raises either a client reregister event (when the
+ * reregister bit is set) or a LID change event; a P_Key table Set
+ * raises a P_Key change event.  Any other MAD is ignored.
+ */
+static void smp_snoop(struct ib_device *ibdev, u8 port_num, struct ib_mad *mad)
+{
+       struct ib_port_info *pinfo;
+       struct ib_event event;
+
+       if (mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED &&
+           mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+               return;
+
+       if (mad->mad_hdr.method != IB_MGMT_METHOD_SET)
+               return;
+
+       if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO) {
+               pinfo = (struct ib_port_info *) ((struct ib_smp *) mad)->data;
+
+               update_sm_ah(to_mdev(ibdev), port_num,
+                            be16_to_cpu(pinfo->sm_lid),
+                            pinfo->neighbormtu_mastersmsl & 0xf);
+
+               event.device           = ibdev;
+               event.element.port_num = port_num;
+               event.event            = (pinfo->clientrereg_resv_subnetto & 0x80) ?
+                       IB_EVENT_CLIENT_REREGISTER : IB_EVENT_LID_CHANGE;
+
+               ib_dispatch_event(&event);
+       }
+
+       if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PKEY_TABLE) {
+               event.device           = ibdev;
+               event.event            = IB_EVENT_PKEY_CHANGE;
+               event.element.port_num = port_num;
+               ib_dispatch_event(&event);
+       }
+}
+
+/*
+ * If mad is a subnet management GetResp for the NodeDescription
+ * attribute, overwrite its payload with the 64-byte node description
+ * stored in the ib_device.  The copy is done under sm_lock, the same
+ * lock mlx4_ib_modify_device() holds while changing the description.
+ */
+static void node_desc_override(struct ib_device *dev,
+                              struct ib_mad *mad)
+{
+       struct ib_smp *smp = (struct ib_smp *) mad;
+
+       if (mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED &&
+           mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+               return;
+
+       if (mad->mad_hdr.method != IB_MGMT_METHOD_GET_RESP)
+               return;
+
+       if (mad->mad_hdr.attr_id != IB_SMP_ATTR_NODE_DESC)
+               return;
+
+       spin_lock(&to_mdev(dev)->sm_lock);
+       memcpy(smp->data, dev->node_desc, 64);
+       spin_unlock(&to_mdev(dev)->sm_lock);
+}
+
+/*
+ * Forward a trap MAD to the subnet manager through the port's send
+ * agent (SMI agent for LID-routed subnet management class MADs, GSI
+ * agent for everything else), using the cached SM address handle.
+ *
+ * The trap is silently dropped when the port has no send agent, when a
+ * send buffer cannot be allocated, when no SM address handle is cached,
+ * or when posting the send fails.
+ */
+static void forward_trap(struct mlx4_ib_dev *dev, u8 port_num, struct ib_mad *mad)
+{
+       int qpn = mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED;
+       struct ib_mad_send_buf *send_buf;
+       struct ib_mad_agent *agent = dev->send_agent[port_num - 1][qpn];
+       int ret;
+
+       if (agent) {
+               send_buf = ib_create_send_mad(agent, qpn, 0, 0, IB_MGMT_MAD_HDR,
+                                             IB_MGMT_MAD_DATA, GFP_ATOMIC);
+               /*
+                * ib_create_send_mad() reports failure with an ERR_PTR;
+                * bail out instead of dereferencing it below.
+                */
+               if (IS_ERR(send_buf))
+                       return;
+
+               /*
+                * We rely here on the fact that MLX QPs don't use the
+                * address handle after the send is posted (this is
+                * wrong following the IB spec strictly, but we know
+                * it's OK for our devices).
+                */
+               spin_lock(&dev->sm_lock);
+               memcpy(send_buf->mad, mad, sizeof *mad);
+               if ((send_buf->ah = dev->sm_ah[port_num - 1]))
+                       ret = ib_post_send_mad(send_buf, NULL);
+               else
+                       ret = -EINVAL;
+               spin_unlock(&dev->sm_lock);
+
+               /* On success the buffer is freed by send_handler(). */
+               if (ret)
+                       ib_free_send_mad(send_buf);
+       }
+}
+
+/*
+ * Process an incoming MAD for port_num.
+ *
+ * Traps with a source LID of zero are forwarded to the subnet manager.
+ * Subnet management Get/Set/TrapRepress MADs (except SMInfo and
+ * vendor-specific attributes) and performance management or vendor
+ * class Get/Set MADs are handed to the firmware through mlx4_MAD_IFC();
+ * everything else is left alone.
+ *
+ * Returns a combination of IB_MAD_RESULT_* flags: SUCCESS alone when
+ * the MAD is not handled here, SUCCESS | CONSUMED when it was handled
+ * and no reply is to be sent, SUCCESS | REPLY when out_mad holds a
+ * response, or FAILURE when the firmware command failed.
+ */
+int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags,        u8 port_num,
+                       struct ib_wc *in_wc, struct ib_grh *in_grh,
+                       struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+       u16 slid;
+       int err;
+
+       /* Without a work completion, treat the source as the permissive LID. */
+       slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE);
+
+       /* Trap with source LID 0: pass it on to the SM and stop here. */
+       if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) {
+               forward_trap(to_mdev(ibdev), port_num, in_mad);
+               return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+       }
+
+       if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+           in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+               /* Subnet management: only Get, Set and TrapRepress are handled. */
+               if (in_mad->mad_hdr.method   != IB_MGMT_METHOD_GET &&
+                   in_mad->mad_hdr.method   != IB_MGMT_METHOD_SET &&
+                   in_mad->mad_hdr.method   != IB_MGMT_METHOD_TRAP_REPRESS)
+                       return IB_MAD_RESULT_SUCCESS;
+
+               /*
+                * Don't process SMInfo queries or vendor-specific
+                * MADs -- the SMA can't handle them.
+                */
+               if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO ||
+                   ((in_mad->mad_hdr.attr_id & IB_SMP_ATTR_VENDOR_MASK) ==
+                    IB_SMP_ATTR_VENDOR_MASK))
+                       return IB_MAD_RESULT_SUCCESS;
+       } else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT ||
+                  in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS1   ||
+                  in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS2) {
+               /* Performance management and vendor classes: only Get and Set. */
+               if (in_mad->mad_hdr.method  != IB_MGMT_METHOD_GET &&
+                   in_mad->mad_hdr.method  != IB_MGMT_METHOD_SET)
+                       return IB_MAD_RESULT_SUCCESS;
+       } else
+               /* Any other management class is not handled by this driver. */
+               return IB_MAD_RESULT_SUCCESS;
+
+       /* Let the firmware process the MAD and build the response. */
+       err = mlx4_MAD_IFC(to_mdev(ibdev),
+                          mad_flags & IB_MAD_IGNORE_MKEY,
+                          mad_flags & IB_MAD_IGNORE_BKEY,
+                          port_num, in_wc, in_grh, in_mad, out_mad);
+       if (err)
+               return IB_MAD_RESULT_FAILURE;
+
+       /*
+        * A zero status in the response: generate events for Sets and
+        * substitute the node description where applicable.
+        */
+       if (!out_mad->mad_hdr.status) {
+               smp_snoop(ibdev, port_num, in_mad);
+               node_desc_override(ibdev, out_mad);
+       }
+
+       /* set return bit in status of directed route responses */
+       if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+               out_mad->mad_hdr.status |= cpu_to_be16(1 << 15);
+
+       if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS)
+               /* no response for trap repress */
+               return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+
+       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+/*
+ * Send completion callback registered for the driver's MAD agents:
+ * releases the send buffer of the completed MAD.
+ */
+static void send_handler(struct ib_mad_agent *agent,
+                        struct ib_mad_send_wc *mad_send_wc)
+{
+       struct ib_mad_send_buf *done = mad_send_wc->send_buf;
+
+       ib_free_send_mad(done);
+}
+
+/*
+ * Register one SMI and one GSI MAD send agent for every port and store
+ * them in dev->send_agent[port][0] and dev->send_agent[port][1].
+ *
+ * Returns 0 on success.  If any registration fails, every agent found
+ * in the table is unregistered and the registration error is returned.
+ * NOTE(review): the unwind assumes unused send_agent slots are NULL,
+ * i.e. that the device structure was zeroed when allocated -- confirm.
+ */
+int mlx4_ib_mad_init(struct mlx4_ib_dev *dev)
+{
+       struct ib_mad_agent *agent;
+       int port, qp_idx;
+       int ret;
+
+       for (port = 0; port < dev->dev->caps.num_ports; ++port) {
+               for (qp_idx = 0; qp_idx < 2; ++qp_idx) {
+                       agent = ib_register_mad_agent(&dev->ib_dev, port + 1,
+                                                     qp_idx ? IB_QPT_GSI : IB_QPT_SMI,
+                                                     NULL, 0, send_handler,
+                                                     NULL, NULL);
+                       if (IS_ERR(agent)) {
+                               ret = PTR_ERR(agent);
+                               goto err_unregister;
+                       }
+                       dev->send_agent[port][qp_idx] = agent;
+               }
+       }
+
+       return 0;
+
+err_unregister:
+       for (port = 0; port < dev->dev->caps.num_ports; ++port) {
+               for (qp_idx = 0; qp_idx < 2; ++qp_idx) {
+                       if (!dev->send_agent[port][qp_idx])
+                               continue;
+                       ib_unregister_mad_agent(dev->send_agent[port][qp_idx]);
+               }
+       }
+
+       return ret;
+}
+
+/*
+ * Undo mlx4_ib_mad_init(): clear and unregister both send agents of
+ * every port, then destroy the port's cached SM address handle if one
+ * exists.
+ */
+void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev)
+{
+       struct ib_mad_agent **slot;
+       struct ib_mad_agent *agent;
+       int port, qp_idx;
+
+       for (port = 0; port < dev->dev->caps.num_ports; ++port) {
+               for (qp_idx = 0; qp_idx < 2; ++qp_idx) {
+                       slot = &dev->send_agent[port][qp_idx];
+
+                       /* Clear the slot before the agent goes away. */
+                       agent = *slot;
+                       *slot = NULL;
+                       ib_unregister_mad_agent(agent);
+               }
+
+               if (dev->sm_ah[port])
+                       ib_destroy_ah(dev->sm_ah[port]);
+       }
+}
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
new file mode 100644 (file)
index 0000000..688ecb4
--- /dev/null
@@ -0,0 +1,651 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+
+#include <rdma/ib_smi.h>
+#include <rdma/ib_user_verbs.h>
+
+#include <linux/mlx4/driver.h>
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4_ib.h"
+#include "user.h"
+
+/* Driver identification strings; also used to build the version banner. */
+#define DRV_NAME       "mlx4_ib"
+#define DRV_VERSION    "0.01"
+#define DRV_RELDATE    "May 1, 2006"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+/* Version banner string assembled from the DRV_* macros. */
+static const char mlx4_ib_version[] __devinitdata =
+       DRV_NAME ": Mellanox ConnectX InfiniBand driver v"
+       DRV_VERSION " (" DRV_RELDATE ")\n";
+
+/*
+ * Fill in the header fields common to the driver's own queries: a
+ * LID-routed subnet management Get with base and class version 1.
+ * The caller sets the attribute ID and modifier.
+ */
+static void init_query_mad(struct ib_smp *mad)
+{
+       /* Versions first, then what kind of request this is. */
+       mad->base_version  = 1;
+       mad->class_version = 1;
+
+       mad->mgmt_class    = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+       mad->method        = IB_MGMT_METHOD_GET;
+}
+
+/*
+ * Fill in the device attributes reported to the InfiniBand midlayer.
+ *
+ * Vendor ID, part ID, hardware revision and system image GUID are read
+ * from a NodeInfo query issued through mlx4_MAD_IFC() on port 1; the
+ * resource limits come from the capabilities stored in dev->dev->caps.
+ *
+ * Returns 0 on success, -ENOMEM if a MAD buffer cannot be allocated, or
+ * the error from mlx4_MAD_IFC().  props is only written on success.
+ */
+static int mlx4_ib_query_device(struct ib_device *ibdev,
+                               struct ib_device_attr *props)
+{
+       struct mlx4_ib_dev *dev = to_mdev(ibdev);
+       struct ib_smp *in_mad  = NULL;
+       struct ib_smp *out_mad = NULL;
+       int err = -ENOMEM;
+
+       /* kfree(NULL) is a no-op, so a partial allocation is cleaned up at out. */
+       in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
+       out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+       if (!in_mad || !out_mad)
+               goto out;
+
+       init_query_mad(in_mad);
+       in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
+
+       err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, 1, NULL, NULL, in_mad, out_mad);
+       if (err)
+               goto out;
+
+       memset(props, 0, sizeof *props);
+
+       props->fw_ver = dev->dev->caps.fw_ver;
+       props->device_cap_flags    = IB_DEVICE_CHANGE_PHY_PORT |
+               IB_DEVICE_PORT_ACTIVE_EVENT             |
+               IB_DEVICE_SYS_IMAGE_GUID                |
+               IB_DEVICE_RC_RNR_NAK_GEN;
+       /* Optional capabilities depend on the flags reported by the device. */
+       if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR)
+               props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
+       if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR)
+               props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
+       if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_APM)
+               props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
+       if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UD_AV_PORT)
+               props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE;
+
+       /*
+        * Fields decoded from the NodeInfo response payload at fixed byte
+        * offsets; the vendor ID is the low 24 bits of the word at offset 36.
+        * NOTE(review): offsets presumably follow the IB NodeInfo layout -- confirm.
+        */
+       props->vendor_id           = be32_to_cpup((__be32 *) (out_mad->data + 36)) &
+               0xffffff;
+       props->vendor_part_id      = be16_to_cpup((__be16 *) (out_mad->data + 30));
+       props->hw_ver              = be32_to_cpup((__be32 *) (out_mad->data + 32));
+       memcpy(&props->sys_image_guid, out_mad->data +  4, 8);
+
+       /* Resource limits: totals minus what is reserved, where applicable. */
+       props->max_mr_size         = ~0ull;
+       props->page_size_cap       = dev->dev->caps.page_size_cap;
+       props->max_qp              = dev->dev->caps.num_qps - dev->dev->caps.reserved_qps;
+       props->max_qp_wr           = dev->dev->caps.max_wqes;
+       props->max_sge             = min(dev->dev->caps.max_sq_sg,
+                                        dev->dev->caps.max_rq_sg);
+       props->max_cq              = dev->dev->caps.num_cqs - dev->dev->caps.reserved_cqs;
+       props->max_cqe             = dev->dev->caps.max_cqes;
+       props->max_mr              = dev->dev->caps.num_mpts - dev->dev->caps.reserved_mrws;
+       props->max_pd              = dev->dev->caps.num_pds - dev->dev->caps.reserved_pds;
+       props->max_qp_rd_atom      = dev->dev->caps.max_qp_dest_rdma;
+       props->max_qp_init_rd_atom = dev->dev->caps.max_qp_init_rdma;
+       props->max_res_rd_atom     = props->max_qp_rd_atom * props->max_qp;
+       props->max_srq             = dev->dev->caps.num_srqs - dev->dev->caps.reserved_srqs;
+       props->max_srq_wr          = dev->dev->caps.max_srq_wqes;
+       props->max_srq_sge         = dev->dev->caps.max_srq_sge;
+       props->local_ca_ack_delay  = dev->dev->caps.local_ca_ack_delay;
+       props->atomic_cap          = dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_ATOMIC ?
+               IB_ATOMIC_HCA : IB_ATOMIC_NONE;
+       props->max_pkeys           = dev->dev->caps.pkey_table_len;
+       props->max_mcast_grp       = dev->dev->caps.num_mgms + dev->dev->caps.num_amgms;
+       props->max_mcast_qp_attach = dev->dev->caps.num_qp_per_mgm;
+       props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
+                                          props->max_mcast_grp;
+       props->max_map_per_fmr = (1 << (32 - ilog2(dev->dev->caps.num_mpts))) - 1;
+
+out:
+       kfree(in_mad);
+       kfree(out_mad);
+
+       return err;
+}
+
+/*
+ * Fill in the attributes of one port.
+ *
+ * Issues a PortInfo query for the port through mlx4_MAD_IFC() and
+ * decodes the response payload at fixed byte offsets; the GID and
+ * P_Key table lengths come from the device capabilities.
+ *
+ * Returns 0 on success, -ENOMEM if a MAD buffer cannot be allocated, or
+ * the error from mlx4_MAD_IFC().  props is zeroed before the query is
+ * issued, so it is left cleared if the query fails.
+ */
+static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port,
+                             struct ib_port_attr *props)
+{
+       struct ib_smp *in_mad  = NULL;
+       struct ib_smp *out_mad = NULL;
+       int err = -ENOMEM;
+
+       in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
+       out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+       if (!in_mad || !out_mad)
+               goto out;
+
+       memset(props, 0, sizeof *props);
+
+       init_query_mad(in_mad);
+       in_mad->attr_id  = IB_SMP_ATTR_PORT_INFO;
+       in_mad->attr_mod = cpu_to_be32(port);
+
+       err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+       if (err)
+               goto out;
+
+       /*
+        * Several fields share a byte and are separated by masking the low
+        * nibble or shifting out the high one.
+        * NOTE(review): offsets presumably follow the IB PortInfo layout -- confirm.
+        */
+       props->lid              = be16_to_cpup((__be16 *) (out_mad->data + 16));
+       props->lmc              = out_mad->data[34] & 0x7;
+       props->sm_lid           = be16_to_cpup((__be16 *) (out_mad->data + 18));
+       props->sm_sl            = out_mad->data[36] & 0xf;
+       props->state            = out_mad->data[32] & 0xf;
+       props->phys_state       = out_mad->data[33] >> 4;
+       props->port_cap_flags   = be32_to_cpup((__be32 *) (out_mad->data + 20));
+       props->gid_tbl_len      = to_mdev(ibdev)->dev->caps.gid_table_len;
+       props->max_msg_sz       = 0x80000000;
+       props->pkey_tbl_len     = to_mdev(ibdev)->dev->caps.pkey_table_len;
+       props->bad_pkey_cntr    = be16_to_cpup((__be16 *) (out_mad->data + 46));
+       props->qkey_viol_cntr   = be16_to_cpup((__be16 *) (out_mad->data + 48));
+       props->active_width     = out_mad->data[31] & 0xf;
+       props->active_speed     = out_mad->data[35] >> 4;
+       props->max_mtu          = out_mad->data[41] & 0xf;
+       props->active_mtu       = out_mad->data[36] >> 4;
+       props->subnet_timeout   = out_mad->data[51] & 0x1f;
+       props->max_vl_num       = out_mad->data[37] >> 4;
+       props->init_type_reply  = out_mad->data[41] >> 4;
+
+out:
+       kfree(in_mad);
+       kfree(out_mad);
+
+       return err;
+}
+
+/*
+ * Read one GID table entry of a port.
+ *
+ * The GID is assembled from two queries: the upper 8 bytes are copied
+ * from offset 8 of the PortInfo response, the lower 8 bytes from the
+ * GUIDInfo block that contains the requested index (eight entries per
+ * block).
+ *
+ * Returns 0 on success, -ENOMEM if a MAD buffer cannot be allocated, or
+ * the error from mlx4_MAD_IFC().
+ */
+static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+                            union ib_gid *gid)
+{
+       struct mlx4_ib_dev *mdev = to_mdev(ibdev);
+       struct ib_smp *req;
+       struct ib_smp *rsp;
+       int err;
+
+       req = kzalloc(sizeof *req, GFP_KERNEL);
+       rsp = kmalloc(sizeof *rsp, GFP_KERNEL);
+       if (!req || !rsp) {
+               err = -ENOMEM;
+               goto free_mads;
+       }
+
+       /* First half of the GID: taken from PortInfo. */
+       init_query_mad(req);
+       req->attr_id  = IB_SMP_ATTR_PORT_INFO;
+       req->attr_mod = cpu_to_be32(port);
+
+       err = mlx4_MAD_IFC(mdev, 1, 1, port, NULL, NULL, req, rsp);
+       if (err)
+               goto free_mads;
+
+       memcpy(gid->raw, rsp->data + 8, 8);
+
+       /* Second half: the matching entry of the GUIDInfo block. */
+       init_query_mad(req);
+       req->attr_id  = IB_SMP_ATTR_GUID_INFO;
+       req->attr_mod = cpu_to_be32(index / 8);
+
+       err = mlx4_MAD_IFC(mdev, 1, 1, port, NULL, NULL, req, rsp);
+       if (err)
+               goto free_mads;
+
+       memcpy(gid->raw + 8, rsp->data + (index % 8) * 8, 8);
+
+free_mads:
+       kfree(req);
+       kfree(rsp);
+       return err;
+}
+
+/*
+ * Read one P_Key table entry of a port.
+ *
+ * Queries the P_Key table block that contains the requested index (32
+ * entries per block) and stores the entry, converted to CPU byte
+ * order, in *pkey.
+ *
+ * Returns 0 on success, -ENOMEM if a MAD buffer cannot be allocated, or
+ * the error from mlx4_MAD_IFC().
+ */
+static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+                             u16 *pkey)
+{
+       struct ib_smp *req;
+       struct ib_smp *rsp;
+       __be16 *table;
+       int err;
+
+       req = kzalloc(sizeof *req, GFP_KERNEL);
+       rsp = kmalloc(sizeof *rsp, GFP_KERNEL);
+       if (!req || !rsp) {
+               err = -ENOMEM;
+               goto free_mads;
+       }
+
+       init_query_mad(req);
+       req->attr_id  = IB_SMP_ATTR_PKEY_TABLE;
+       req->attr_mod = cpu_to_be32(index / 32);
+
+       err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, req, rsp);
+       if (err)
+               goto free_mads;
+
+       table = (__be16 *) rsp->data;
+       *pkey = be16_to_cpu(table[index % 32]);
+
+free_mads:
+       kfree(req);
+       kfree(rsp);
+       return err;
+}
+
+/*
+ * Modify device attributes.  Only the node description can be changed;
+ * any other bit in mask is rejected with -EOPNOTSUPP.  The new 64-byte
+ * description is copied under sm_lock.
+ *
+ * Returns 0 on success (including when mask is empty).
+ */
+static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask,
+                                struct ib_device_modify *props)
+{
+       struct mlx4_ib_dev *mdev;
+
+       if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
+               return -EOPNOTSUPP;
+
+       if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
+               return 0;
+
+       mdev = to_mdev(ibdev);
+
+       spin_lock(&mdev->sm_lock);
+       memcpy(ibdev->node_desc, props->node_desc, 64);
+       spin_unlock(&mdev->sm_lock);
+
+       return 0;
+}
+
+/*
+ * Issue the SET_PORT firmware command for a port.
+ *
+ * The first 256 bytes of the mailbox are cleared; bit 6 of the first
+ * byte carries the "reset Q_Key violation counter" request and the
+ * third 32-bit word carries the new capability mask in big-endian
+ * order.
+ *
+ * Returns 0 on success or the error from mailbox allocation or from
+ * mlx4_cmd().
+ */
+static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols,
+                        u32 cap_mask)
+{
+       struct mlx4_cmd_mailbox *mbox;
+       int ret;
+
+       mbox = mlx4_alloc_cmd_mailbox(dev->dev);
+       if (IS_ERR(mbox))
+               return PTR_ERR(mbox);
+
+       memset(mbox->buf, 0, 256);
+
+       *(u8 *) mbox->buf         = !!reset_qkey_viols << 6;
+       ((__be32 *) mbox->buf)[2] = cpu_to_be32(cap_mask);
+
+       ret = mlx4_cmd(dev->dev, mbox->dma, port, 0, MLX4_CMD_SET_PORT,
+                      MLX4_CMD_TIME_CLASS_B);
+
+       mlx4_free_cmd_mailbox(dev->dev, mbox);
+
+       return ret;
+}
+
+/*
+ * Modify a port's capability mask and optionally reset its Q_Key
+ * violation counter.
+ *
+ * The current capability flags are read with mlx4_ib_query_port(), the
+ * requested bits are set and cleared, and the result is written with
+ * SET_PORT.  cap_mask_mutex keeps the read-modify-write sequence from
+ * interleaving with another one.
+ *
+ * Returns 0 on success or the error from the query or from SET_PORT.
+ */
+static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
+                              struct ib_port_modify *props)
+{
+       struct mlx4_ib_dev *mdev = to_mdev(ibdev);
+       struct ib_port_attr cur;
+       u32 new_caps;
+       int err;
+
+       mutex_lock(&mdev->cap_mask_mutex);
+
+       err = mlx4_ib_query_port(ibdev, port, &cur);
+       if (!err) {
+               new_caps = cur.port_cap_flags | props->set_port_cap_mask;
+               new_caps &= ~props->clr_port_cap_mask;
+
+               err = mlx4_SET_PORT(mdev, port,
+                                   !!(mask & IB_PORT_RESET_QKEY_CNTR),
+                                   new_caps);
+       }
+
+       mutex_unlock(&mdev->cap_mask_mutex);
+
+       return err;
+}
+
+/*
+ * Create a userspace verbs context.
+ *
+ * Allocates the context and a UAR for it, initializes the doorbell
+ * page list and its mutex, and reports the QP table size and BlueFlame
+ * register geometry to userspace.
+ *
+ * Returns the new ib_ucontext, or an ERR_PTR: -ENOMEM if the context
+ * cannot be allocated, the mlx4_uar_alloc() error, or -EFAULT if the
+ * response cannot be copied to userspace.
+ */
+static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev,
+                                                 struct ib_udata *udata)
+{
+       struct mlx4_ib_dev *dev = to_mdev(ibdev);
+       struct mlx4_ib_alloc_ucontext_resp resp;
+       struct mlx4_ib_ucontext *uctx;
+       int err;
+
+       uctx = kmalloc(sizeof *uctx, GFP_KERNEL);
+       if (!uctx)
+               return ERR_PTR(-ENOMEM);
+
+       err = mlx4_uar_alloc(dev->dev, &uctx->uar);
+       if (err)
+               goto err_free;
+
+       INIT_LIST_HEAD(&uctx->db_page_list);
+       mutex_init(&uctx->db_page_mutex);
+
+       resp.qp_tab_size      = dev->dev->caps.num_qps;
+       resp.bf_reg_size      = dev->dev->caps.bf_reg_size;
+       resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page;
+
+       if (ib_copy_to_udata(udata, &resp, sizeof resp)) {
+               err = -EFAULT;
+               goto err_uar;
+       }
+
+       return &uctx->ibucontext;
+
+err_uar:
+       mlx4_uar_free(dev->dev, &uctx->uar);
+err_free:
+       kfree(uctx);
+       return ERR_PTR(err);
+}
+
+/*
+ * Destroy a userspace verbs context: give back its UAR and free the
+ * context structure.  Always returns 0.
+ */
+static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
+{
+       struct mlx4_ib_dev *mdev = to_mdev(ibcontext->device);
+       struct mlx4_ib_ucontext *uctx = to_mucontext(ibcontext);
+
+       mlx4_uar_free(mdev->dev, &uctx->uar);
+       kfree(uctx);
+
+       return 0;
+}
+
+/*
+ * Map one page of device registers into userspace.
+ *
+ * Page offset 0 maps the context's UAR page.  Page offset 1 maps the
+ * page num_uars page frames after the UAR page, and is only available
+ * when the device reports a non-zero BlueFlame register size.  Both
+ * mappings are made non-cached.
+ *
+ * Returns 0 on success, -EINVAL for a mapping that is not exactly one
+ * page or uses an unsupported offset, or -EAGAIN if the remap fails.
+ */
+static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+       struct mlx4_ib_dev *dev = to_mdev(context->device);
+       unsigned long pfn;
+
+       if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+               return -EINVAL;
+
+       switch (vma->vm_pgoff) {
+       case 0:
+               pfn = to_mucontext(context)->uar.pfn;
+               break;
+       case 1:
+               if (dev->dev->caps.bf_reg_size == 0)
+                       return -EINVAL;
+               pfn = to_mucontext(context)->uar.pfn +
+                     dev->dev->caps.num_uars;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       /* FIXME want pgprot_writecombine() for BlueFlame pages */
+       vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+       if (io_remap_pfn_range(vma, vma->vm_start, pfn,
+                              PAGE_SIZE, vma->vm_page_prot))
+               return -EAGAIN;
+
+       return 0;
+}
+
+/*
+ * Allocate a protection domain.  For a userspace caller (context is
+ * non-NULL) the PD number is also copied back through udata.
+ *
+ * Returns the new ib_pd, or an ERR_PTR: -ENOMEM if the structure cannot
+ * be allocated, the mlx4_pd_alloc() error, or -EFAULT if the PD number
+ * cannot be copied to userspace.
+ */
+static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev,
+                                     struct ib_ucontext *context,
+                                     struct ib_udata *udata)
+{
+       struct mlx4_ib_dev *mdev = to_mdev(ibdev);
+       struct mlx4_ib_pd *mpd;
+       int err;
+
+       mpd = kmalloc(sizeof *mpd, GFP_KERNEL);
+       if (!mpd)
+               return ERR_PTR(-ENOMEM);
+
+       err = mlx4_pd_alloc(mdev->dev, &mpd->pdn);
+       if (err)
+               goto err_free;
+
+       if (context && ib_copy_to_udata(udata, &mpd->pdn, sizeof (__u32))) {
+               err = -EFAULT;
+               goto err_pd;
+       }
+
+       return &mpd->ibpd;
+
+err_pd:
+       mlx4_pd_free(mdev->dev, mpd->pdn);
+err_free:
+       kfree(mpd);
+       return ERR_PTR(err);
+}
+
+/*
+ * Free a protection domain: release its PD number and free the
+ * structure.  Always returns 0.
+ *
+ * NOTE(review): kfree() is passed the ib_pd pointer rather than the
+ * enclosing mlx4_ib_pd, which presumably relies on ibpd being the
+ * first member -- confirm against the struct definition.
+ */
+static int mlx4_ib_dealloc_pd(struct ib_pd *pd)
+{
+       struct mlx4_ib_dev *mdev = to_mdev(pd->device);
+
+       mlx4_pd_free(mdev->dev, to_mpd(pd)->pdn);
+       kfree(pd);
+
+       return 0;
+}
+
+/*
+ * Attach a QP to the multicast group identified by gid.  The lid
+ * argument is not used.  Returns the result of mlx4_multicast_attach().
+ */
+static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+       struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+
+       return mlx4_multicast_attach(mdev->dev, &to_mqp(ibqp)->mqp, gid->raw);
+}
+
+/*
+ * Detach a QP from the multicast group identified by gid.  The lid
+ * argument is not used.  Returns the result of mlx4_multicast_detach().
+ */
+static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+       struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+
+       return mlx4_multicast_detach(mdev->dev, &to_mqp(ibqp)->mqp, gid->raw);
+}
+
+/*
+ * Initialize the ib_device's node description and node GUID from the
+ * firmware: the description is the first 64 bytes of the
+ * NodeDescription response, the GUID the 8 bytes at offset 12 of the
+ * NodeInfo response.  Both queries are issued on port 1.
+ *
+ * Returns 0 on success, -ENOMEM if a MAD buffer cannot be allocated, or
+ * the error from mlx4_MAD_IFC().
+ */
+static int init_node_data(struct mlx4_ib_dev *dev)
+{
+       struct ib_smp *req;
+       struct ib_smp *rsp;
+       int err;
+
+       req = kzalloc(sizeof *req, GFP_KERNEL);
+       rsp = kmalloc(sizeof *rsp, GFP_KERNEL);
+       if (!req || !rsp) {
+               err = -ENOMEM;
+               goto free_mads;
+       }
+
+       init_query_mad(req);
+
+       req->attr_id = IB_SMP_ATTR_NODE_DESC;
+       err = mlx4_MAD_IFC(dev, 1, 1, 1, NULL, NULL, req, rsp);
+       if (err)
+               goto free_mads;
+
+       memcpy(dev->ib_dev.node_desc, rsp->data, 64);
+
+       /* Same request header, different attribute. */
+       req->attr_id = IB_SMP_ATTR_NODE_INFO;
+       err = mlx4_MAD_IFC(dev, 1, 1, 1, NULL, NULL, req, rsp);
+       if (err)
+               goto free_mads;
+
+       memcpy(&dev->ib_dev.node_guid, rsp->data + 12, 8);
+
+free_mads:
+       kfree(req);
+       kfree(rsp);
+       return err;
+}
+
+/*
+ * mlx4_ib_add - create the InfiniBand device for one mlx4 core device.
+ * @dev: low-level mlx4 device handed to us by mlx4_core
+ *
+ * Allocates the ib_device wrapper, a private PD and a private UAR, maps
+ * the UAR page, fills in the verbs entry points, registers with the IB
+ * core and finally sets up MAD handling.
+ *
+ * Returns the new struct mlx4_ib_dev (as an opaque pointer that is later
+ * passed back to mlx4_ib_remove() / mlx4_ib_event()), or NULL on failure,
+ * in which case everything acquired so far has been released.
+ */
+static void *mlx4_ib_add(struct mlx4_dev *dev)
+{
+       struct mlx4_ib_dev *ibdev;
+
+       ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev);
+       if (!ibdev) {
+               dev_err(&dev->pdev->dev, "Device struct alloc failed\n");
+               return NULL;
+       }
+
+       if (mlx4_pd_alloc(dev, &ibdev->priv_pdn))
+               goto err_dealloc;
+
+       if (mlx4_uar_alloc(dev, &ibdev->priv_uar))
+               goto err_pd;
+
+       ibdev->uar_map = ioremap(ibdev->priv_uar.pfn << PAGE_SHIFT, PAGE_SIZE);
+       if (!ibdev->uar_map)
+               goto err_uar;
+
+       /*
+        * uar_lock is declared with MLX4_DECLARE_DOORBELL_LOCK() in
+        * struct mlx4_ib_dev and must be initialized with the matching
+        * macro before any doorbell is rung through uar_map.
+        */
+       MLX4_INIT_DOORBELL_LOCK(&ibdev->uar_lock);
+
+       INIT_LIST_HEAD(&ibdev->pgdir_list);
+       mutex_init(&ibdev->pgdir_mutex);
+
+       ibdev->dev = dev;
+
+       /*
+        * NOTE(review): the "%d" is copied literally; presumably
+        * ib_register_device() expands it to a unique index -- confirm.
+        */
+       strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX);
+       ibdev->ib_dev.owner             = THIS_MODULE;
+       ibdev->ib_dev.node_type         = RDMA_NODE_IB_CA;
+       ibdev->ib_dev.phys_port_cnt     = dev->caps.num_ports;
+       ibdev->ib_dev.num_comp_vectors  = 1;
+       ibdev->ib_dev.dma_device        = &dev->pdev->dev;
+
+       /* Userspace verbs commands this driver accepts. */
+       ibdev->ib_dev.uverbs_abi_ver    = MLX4_IB_UVERBS_ABI_VERSION;
+       ibdev->ib_dev.uverbs_cmd_mask   =
+               (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
+               (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
+               (1ull << IB_USER_VERBS_CMD_QUERY_PORT)          |
+               (1ull << IB_USER_VERBS_CMD_ALLOC_PD)            |
+               (1ull << IB_USER_VERBS_CMD_DEALLOC_PD)          |
+               (1ull << IB_USER_VERBS_CMD_REG_MR)              |
+               (1ull << IB_USER_VERBS_CMD_DEREG_MR)            |
+               (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+               (1ull << IB_USER_VERBS_CMD_CREATE_CQ)           |
+               (1ull << IB_USER_VERBS_CMD_DESTROY_CQ)          |
+               (1ull << IB_USER_VERBS_CMD_CREATE_QP)           |
+               (1ull << IB_USER_VERBS_CMD_MODIFY_QP)           |
+               (1ull << IB_USER_VERBS_CMD_DESTROY_QP)          |
+               (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)        |
+               (1ull << IB_USER_VERBS_CMD_DETACH_MCAST)        |
+               (1ull << IB_USER_VERBS_CMD_CREATE_SRQ)          |
+               (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)          |
+               (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ);
+
+       /* Verbs entry points implemented by this driver. */
+       ibdev->ib_dev.query_device      = mlx4_ib_query_device;
+       ibdev->ib_dev.query_port        = mlx4_ib_query_port;
+       ibdev->ib_dev.query_gid         = mlx4_ib_query_gid;
+       ibdev->ib_dev.query_pkey        = mlx4_ib_query_pkey;
+       ibdev->ib_dev.modify_device     = mlx4_ib_modify_device;
+       ibdev->ib_dev.modify_port       = mlx4_ib_modify_port;
+       ibdev->ib_dev.alloc_ucontext    = mlx4_ib_alloc_ucontext;
+       ibdev->ib_dev.dealloc_ucontext  = mlx4_ib_dealloc_ucontext;
+       ibdev->ib_dev.mmap              = mlx4_ib_mmap;
+       ibdev->ib_dev.alloc_pd          = mlx4_ib_alloc_pd;
+       ibdev->ib_dev.dealloc_pd        = mlx4_ib_dealloc_pd;
+       ibdev->ib_dev.create_ah         = mlx4_ib_create_ah;
+       ibdev->ib_dev.query_ah          = mlx4_ib_query_ah;
+       ibdev->ib_dev.destroy_ah        = mlx4_ib_destroy_ah;
+       ibdev->ib_dev.create_srq        = mlx4_ib_create_srq;
+       ibdev->ib_dev.modify_srq        = mlx4_ib_modify_srq;
+       ibdev->ib_dev.destroy_srq       = mlx4_ib_destroy_srq;
+       ibdev->ib_dev.post_srq_recv     = mlx4_ib_post_srq_recv;
+       ibdev->ib_dev.create_qp         = mlx4_ib_create_qp;
+       ibdev->ib_dev.modify_qp         = mlx4_ib_modify_qp;
+       ibdev->ib_dev.destroy_qp        = mlx4_ib_destroy_qp;
+       ibdev->ib_dev.post_send         = mlx4_ib_post_send;
+       ibdev->ib_dev.post_recv         = mlx4_ib_post_recv;
+       ibdev->ib_dev.create_cq         = mlx4_ib_create_cq;
+       ibdev->ib_dev.destroy_cq        = mlx4_ib_destroy_cq;
+       ibdev->ib_dev.poll_cq           = mlx4_ib_poll_cq;
+       ibdev->ib_dev.req_notify_cq     = mlx4_ib_arm_cq;
+       ibdev->ib_dev.get_dma_mr        = mlx4_ib_get_dma_mr;
+       ibdev->ib_dev.reg_user_mr       = mlx4_ib_reg_user_mr;
+       ibdev->ib_dev.dereg_mr          = mlx4_ib_dereg_mr;
+       ibdev->ib_dev.attach_mcast      = mlx4_ib_mcg_attach;
+       ibdev->ib_dev.detach_mcast      = mlx4_ib_mcg_detach;
+       ibdev->ib_dev.process_mad       = mlx4_ib_process_mad;
+
+       if (init_node_data(ibdev))
+               goto err_map;
+
+       spin_lock_init(&ibdev->sm_lock);
+       mutex_init(&ibdev->cap_mask_mutex);
+
+       if (ib_register_device(&ibdev->ib_dev))
+               goto err_map;
+
+       if (mlx4_ib_mad_init(ibdev))
+               goto err_reg;
+
+       return ibdev;
+
+       /* Unwind in the reverse order of acquisition. */
+err_reg:
+       ib_unregister_device(&ibdev->ib_dev);
+
+err_map:
+       iounmap(ibdev->uar_map);
+
+err_uar:
+       mlx4_uar_free(dev, &ibdev->priv_uar);
+
+err_pd:
+       mlx4_pd_free(dev, ibdev->priv_pdn);
+
+err_dealloc:
+       ib_dealloc_device(&ibdev->ib_dev);
+
+       return NULL;
+}
+
+/*
+ * Tear down an IB device created by mlx4_ib_add(): close every port,
+ * then release the MAD agents, the IB registration, the UAR mapping,
+ * the private UAR and PD, and finally the device structure itself.
+ */
+static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
+{
+       struct mlx4_ib_dev *mdev = ibdev_ptr;
+       int port = 1;
+
+       /* Port numbers are 1-based. */
+       while (port <= dev->caps.num_ports) {
+               mlx4_CLOSE_PORT(dev, port);
+               ++port;
+       }
+
+       mlx4_ib_mad_cleanup(mdev);
+       ib_unregister_device(&mdev->ib_dev);
+       iounmap(mdev->uar_map);
+       mlx4_uar_free(dev, &mdev->priv_uar);
+       mlx4_pd_free(dev, mdev->priv_pdn);
+       ib_dealloc_device(&mdev->ib_dev);
+}
+
+/*
+ * Translate a low-level mlx4 device event into an IB event and dispatch
+ * it.  Port changes become IB_EVENT_PORT_ACTIVE or IB_EVENT_PORT_ERR
+ * depending on the subtype, a local catastrophic error becomes
+ * IB_EVENT_DEVICE_FATAL, and every other event type is ignored.
+ */
+static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr,
+                         enum mlx4_dev_event event, int subtype,
+                         int port)
+{
+       struct ib_event ibev;
+
+       if (event == MLX4_EVENT_TYPE_PORT_CHANGE) {
+               if (subtype == MLX4_PORT_CHANGE_SUBTYPE_ACTIVE)
+                       ibev.event = IB_EVENT_PORT_ACTIVE;
+               else
+                       ibev.event = IB_EVENT_PORT_ERR;
+       } else if (event == MLX4_EVENT_TYPE_LOCAL_CATAS_ERROR) {
+               ibev.event = IB_EVENT_DEVICE_FATAL;
+       } else {
+               return;
+       }
+
+       ibev.device           = ibdev_ptr;
+       ibev.element.port_num = port;
+
+       ib_dispatch_event(&ibev);
+}
+
+/*
+ * Callback table registered with mlx4_core by mlx4_ib_init(): device
+ * add/remove and asynchronous event notification.
+ */
+static struct mlx4_interface mlx4_ib_interface = {
+       .add    = mlx4_ib_add,
+       .remove = mlx4_ib_remove,
+       .event  = mlx4_ib_event
+};
+
+/* Module load: register mlx4_ib_interface and return the result of that call. */
+static int __init mlx4_ib_init(void)
+{
+       return mlx4_register_interface(&mlx4_ib_interface);
+}
+
+/* Module unload: unregister mlx4_ib_interface. */
+static void __exit mlx4_ib_cleanup(void)
+{
+       mlx4_unregister_interface(&mlx4_ib_interface);
+}
+
+/* Hook the two functions above into module load and unload. */
+module_init(mlx4_ib_init);
+module_exit(mlx4_ib_cleanup);
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
new file mode 100644 (file)
index 0000000..93dac71
--- /dev/null
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_IB_H
+#define MLX4_IB_H
+
+#include <linux/compiler.h>
+#include <linux/list.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/doorbell.h>
+
+enum {
+       /*
+        * PAGE_SIZE divided by 4.
+        * NOTE(review): presumably the number of 4-byte doorbell
+        * records that fit in one page -- confirm in doorbell.c.
+        */
+       MLX4_IB_DB_PER_PAGE     = PAGE_SIZE / 4
+};
+
+struct mlx4_ib_db_pgdir;
+struct mlx4_ib_user_db_page;
+
+/*
+ * Handle for a doorbell record.  It is filled in by mlx4_ib_db_alloc()
+ * or mlx4_ib_db_map_user() and released by mlx4_ib_db_free() or
+ * mlx4_ib_db_unmap_user() respectively.  db points at the big-endian
+ * 32-bit record itself.
+ *
+ * NOTE(review): u.pgdir is presumably valid for records obtained from
+ * mlx4_ib_db_alloc() and u.user_page for records mapped from userspace
+ * -- confirm in doorbell.c.
+ */
+struct mlx4_ib_db {
+       __be32                 *db;
+       union {
+               struct mlx4_ib_db_pgdir        *pgdir;
+               struct mlx4_ib_user_db_page    *user_page;
+       }                       u;
+       dma_addr_t              dma;
+       int                     index;
+       int                     order;
+};
+
+/*
+ * Driver-private user context.  Embeds the core struct ib_ucontext so
+ * that to_mucontext() can recover this structure with container_of().
+ */
+struct mlx4_ib_ucontext {
+       struct ib_ucontext      ibucontext;
+       struct mlx4_uar         uar;
+       /* NOTE(review): presumably the user doorbell pages mapped for this context -- confirm */
+       struct list_head        db_page_list;
+       struct mutex            db_page_mutex;
+};
+
+/*
+ * Driver-private protection domain: the core struct ib_pd plus the
+ * PD number that is passed to mlx4_mr_alloc() when registering memory.
+ */
+struct mlx4_ib_pd {
+       struct ib_pd            ibpd;
+       u32                     pdn;
+};
+
+/* A completion queue buffer together with its MTT (translation table) handle. */
+struct mlx4_ib_cq_buf {
+       struct mlx4_buf         buf;
+       struct mlx4_mtt         mtt;
+};
+
+/*
+ * Driver-private completion queue.  Embeds both the core struct ib_cq
+ * (recovered by to_mcq()) and the low-level struct mlx4_cq (recovered
+ * by to_mibcq()).
+ */
+struct mlx4_ib_cq {
+       struct ib_cq            ibcq;
+       struct mlx4_cq          mcq;
+       struct mlx4_ib_cq_buf   buf;
+       struct mlx4_ib_db       db;
+       spinlock_t              lock;
+       /* NOTE(review): presumably only set for CQs created from userspace -- confirm in cq.c */
+       struct ib_umem         *umem;
+};
+
+/*
+ * Driver-private memory region: the core struct ib_mr plus the
+ * low-level struct mlx4_mr.  umem is the pinned user memory for regions
+ * created by mlx4_ib_reg_user_mr() and NULL for DMA regions created by
+ * mlx4_ib_get_dma_mr().
+ */
+struct mlx4_ib_mr {
+       struct ib_mr            ibmr;
+       struct mlx4_mr          mmr;
+       struct ib_umem         *umem;
+};
+
+/*
+ * State of one work queue (send or receive) of a QP.
+ *
+ * wrid:      array of max 64-bit entries allocated in create_qp_common()
+ * max:       number of WQEs (a power of two, or 0), set by set_qp_size()
+ * max_gs:    scatter/gather entries that fit in one WQE
+ * offset:    byte offset of this queue inside the QP buffer
+ * wqe_shift: log2 of the WQE size in bytes
+ * head/tail: both start at 0 in create_qp_common()
+ */
+struct mlx4_ib_wq {
+       u64                    *wrid;
+       spinlock_t              lock;
+       int                     max;
+       int                     max_gs;
+       int                     offset;
+       int                     wqe_shift;
+       unsigned                head;
+       unsigned                tail;
+};
+
+/*
+ * Driver-private queue pair.  Embeds the core struct ib_qp (recovered
+ * by to_mqp()) and the low-level struct mlx4_qp (recovered by
+ * to_mibqp()).  buf holds both work queues; rq.offset and sq.offset
+ * locate each queue inside it and buf_size is their combined size.
+ */
+struct mlx4_ib_qp {
+       struct ib_qp            ibqp;
+       struct mlx4_qp          mqp;
+       struct mlx4_buf         buf;
+
+       struct mlx4_ib_db       db;
+       struct mlx4_ib_wq       rq;
+
+       u32                     doorbell_qpn;
+       __be32                  sq_signal_bits;
+       struct mlx4_ib_wq       sq;
+
+       /* Pinned user buffer when the QP belongs to a userspace PD. */
+       struct ib_umem         *umem;
+       struct mlx4_mtt         mtt;
+       int                     buf_size;
+       struct mutex            mutex;
+       /* port is replaced by alt_port when a PATH_MIG event arrives. */
+       u8                      port;
+       u8                      alt_port;
+       u8                      atomic_rd_en;
+       u8                      resp_depth;
+       /* Starts out as IB_QPS_RESET. */
+       u8                      state;
+};
+
+/*
+ * Driver-private shared receive queue.  Embeds the core struct ib_srq
+ * (recovered by to_msrq()) and the low-level struct mlx4_srq (recovered
+ * by to_mibsrq()).
+ */
+struct mlx4_ib_srq {
+       struct ib_srq           ibsrq;
+       struct mlx4_srq         msrq;
+       struct mlx4_buf         buf;
+       struct mlx4_ib_db       db;
+       u64                    *wrid;
+       spinlock_t              lock;
+       int                     head;
+       int                     tail;
+       u16                     wqe_ctr;
+       struct ib_umem         *umem;
+       struct mlx4_mtt         mtt;
+       struct mutex            mutex;
+};
+
+/* Driver-private address handle: the core struct ib_ah plus the hardware address vector. */
+struct mlx4_ib_ah {
+       struct ib_ah            ibah;
+       struct mlx4_av          av;
+};
+
+/*
+ * Per-device state of the IB driver, allocated by mlx4_ib_add() through
+ * ib_alloc_device().  ib_dev is the embedded core device (recovered by
+ * to_mdev()) and dev points back at the low-level mlx4 device.
+ */
+struct mlx4_ib_dev {
+       struct ib_device        ib_dev;
+       struct mlx4_dev        *dev;
+       /* ioremap()ed page of priv_uar. */
+       void __iomem           *uar_map;
+
+       /* NOTE(review): presumably the kernel doorbell page directories -- confirm in doorbell.c */
+       struct list_head        pgdir_list;
+       struct mutex            pgdir_mutex;
+
+       /* UAR and PD number reserved for the driver's own use. */
+       struct mlx4_uar         priv_uar;
+       u32                     priv_pdn;
+       MLX4_DECLARE_DOORBELL_LOCK(uar_lock);
+
+       struct ib_mad_agent    *send_agent[MLX4_MAX_PORTS][2];
+       struct ib_ah           *sm_ah[MLX4_MAX_PORTS];
+       spinlock_t              sm_lock;
+
+       struct mutex            cap_mask_mutex;
+};
+
+/*
+ * Conversion helpers.  Each one takes a pointer to a structure embedded
+ * in a driver-private object and returns the enclosing object via
+ * container_of().  The pointer passed in must really be embedded in an
+ * object of the returned type.
+ */
+
+/* Core ib_device -> enclosing mlx4_ib_dev. */
+static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev)
+{
+       return container_of(ibdev, struct mlx4_ib_dev, ib_dev);
+}
+
+/* Core ib_ucontext -> enclosing mlx4_ib_ucontext. */
+static inline struct mlx4_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext)
+{
+       return container_of(ibucontext, struct mlx4_ib_ucontext, ibucontext);
+}
+
+/* Core ib_pd -> enclosing mlx4_ib_pd. */
+static inline struct mlx4_ib_pd *to_mpd(struct ib_pd *ibpd)
+{
+       return container_of(ibpd, struct mlx4_ib_pd, ibpd);
+}
+
+/* Core ib_cq -> enclosing mlx4_ib_cq. */
+static inline struct mlx4_ib_cq *to_mcq(struct ib_cq *ibcq)
+{
+       return container_of(ibcq, struct mlx4_ib_cq, ibcq);
+}
+
+/* Low-level mlx4_cq -> enclosing mlx4_ib_cq. */
+static inline struct mlx4_ib_cq *to_mibcq(struct mlx4_cq *mcq)
+{
+       return container_of(mcq, struct mlx4_ib_cq, mcq);
+}
+
+/* Core ib_mr -> enclosing mlx4_ib_mr. */
+static inline struct mlx4_ib_mr *to_mmr(struct ib_mr *ibmr)
+{
+       return container_of(ibmr, struct mlx4_ib_mr, ibmr);
+}
+
+/* Core ib_qp -> enclosing mlx4_ib_qp. */
+static inline struct mlx4_ib_qp *to_mqp(struct ib_qp *ibqp)
+{
+       return container_of(ibqp, struct mlx4_ib_qp, ibqp);
+}
+
+/* Low-level mlx4_qp -> enclosing mlx4_ib_qp. */
+static inline struct mlx4_ib_qp *to_mibqp(struct mlx4_qp *mqp)
+{
+       return container_of(mqp, struct mlx4_ib_qp, mqp);
+}
+
+/* Core ib_srq -> enclosing mlx4_ib_srq. */
+static inline struct mlx4_ib_srq *to_msrq(struct ib_srq *ibsrq)
+{
+       return container_of(ibsrq, struct mlx4_ib_srq, ibsrq);
+}
+
+/* Low-level mlx4_srq -> enclosing mlx4_ib_srq. */
+static inline struct mlx4_ib_srq *to_mibsrq(struct mlx4_srq *msrq)
+{
+       return container_of(msrq, struct mlx4_ib_srq, msrq);
+}
+
+/* Core ib_ah -> enclosing mlx4_ib_ah. */
+static inline struct mlx4_ib_ah *to_mah(struct ib_ah *ibah)
+{
+       return container_of(ibah, struct mlx4_ib_ah, ibah);
+}
+
+/* Doorbell records: kernel allocation and mapping of user-supplied records. */
+int mlx4_ib_db_alloc(struct mlx4_ib_dev *dev, struct mlx4_ib_db *db, int order);
+void mlx4_ib_db_free(struct mlx4_ib_dev *dev, struct mlx4_ib_db *db);
+int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt,
+                       struct mlx4_ib_db *db);
+void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_ib_db *db);
+
+/* Memory regions. */
+struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc);
+int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
+                          struct ib_umem *umem);
+struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+                                 u64 virt_addr, int access_flags,
+                                 struct ib_udata *udata);
+int mlx4_ib_dereg_mr(struct ib_mr *mr);
+
+/* Completion queues. */
+struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector,
+                               struct ib_ucontext *context,
+                               struct ib_udata *udata);
+int mlx4_ib_destroy_cq(struct ib_cq *cq);
+int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
+int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
+void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq);
+void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq);
+
+/* Address handles. */
+struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);
+int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr);
+int mlx4_ib_destroy_ah(struct ib_ah *ah);
+
+/* Shared receive queues. */
+struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
+                                 struct ib_srq_init_attr *init_attr,
+                                 struct ib_udata *udata);
+int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+                      enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
+int mlx4_ib_destroy_srq(struct ib_srq *srq);
+void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index);
+int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+                         struct ib_recv_wr **bad_wr);
+
+/* Queue pairs. */
+struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
+                               struct ib_qp_init_attr *init_attr,
+                               struct ib_udata *udata);
+int mlx4_ib_destroy_qp(struct ib_qp *qp);
+int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+                     int attr_mask, struct ib_udata *udata);
+int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+                     struct ib_send_wr **bad_wr);
+int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+                     struct ib_recv_wr **bad_wr);
+
+/* Management datagrams. */
+int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey,
+                int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
+                void *in_mad, void *response_mad);
+int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags,        u8 port_num,
+                       struct ib_wc *in_wc, struct ib_grh *in_grh,
+                       struct ib_mad *in_mad, struct ib_mad *out_mad);
+int mlx4_ib_mad_init(struct mlx4_ib_dev *dev);
+void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev);
+
+/* Returns 1 when bit 7 (0x80) of the address vector's g_slid field is set, else 0. */
+static inline int mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah)
+{
+       return (ah->av.g_slid & 0x80) != 0;
+}
+
+#endif /* MLX4_IB_H */
diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
new file mode 100644 (file)
index 0000000..85ae906
--- /dev/null
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "mlx4_ib.h"
+
+/*
+ * Translate IB_ACCESS_* flags into MLX4_PERM_* permission bits.
+ * Local read permission is always granted.
+ */
+static u32 convert_access(int acc)
+{
+       u32 perm = MLX4_PERM_LOCAL_READ;
+
+       if (acc & IB_ACCESS_REMOTE_ATOMIC)
+               perm |= MLX4_PERM_ATOMIC;
+       if (acc & IB_ACCESS_REMOTE_WRITE)
+               perm |= MLX4_PERM_REMOTE_WRITE;
+       if (acc & IB_ACCESS_REMOTE_READ)
+               perm |= MLX4_PERM_REMOTE_READ;
+       if (acc & IB_ACCESS_LOCAL_WRITE)
+               perm |= MLX4_PERM_LOCAL_WRITE;
+
+       return perm;
+}
+
+/*
+ * Create a DMA memory region in protection domain @pd with the access
+ * rights @acc.  The region is allocated with start 0, length ~0ull and
+ * no MTT pages, and carries no umem.
+ *
+ * Returns the new ib_mr, or an ERR_PTR() on failure.
+ */
+struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc)
+{
+       struct mlx4_dev *mdev = to_mdev(pd->device)->dev;
+       struct mlx4_ib_mr *mr;
+       int ret;
+
+       mr = kmalloc(sizeof (*mr), GFP_KERNEL);
+       if (mr == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       ret = mlx4_mr_alloc(mdev, to_mpd(pd)->pdn, 0, ~0ull,
+                           convert_access(acc), 0, 0, &mr->mmr);
+       if (ret)
+               goto out_free;
+
+       ret = mlx4_mr_enable(mdev, &mr->mmr);
+       if (ret) {
+               mlx4_mr_free(mdev, &mr->mmr);
+               goto out_free;
+       }
+
+       mr->umem      = NULL;
+       mr->ibmr.lkey = mr->mmr.key;
+       mr->ibmr.rkey = mr->mmr.key;
+
+       return &mr->ibmr;
+
+out_free:
+       kfree(mr);
+       return ERR_PTR(ret);
+}
+
+/*
+ * Write the DMA addresses of every page of @umem into @mtt.
+ *
+ * Addresses are collected in a temporary page and handed to
+ * mlx4_write_mtt() in batches.  Returns 0 on success, -ENOMEM if the
+ * temporary page cannot be allocated, or the error returned by
+ * mlx4_write_mtt().
+ */
+int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
+                          struct ib_umem *umem)
+{
+       /*
+        * Be friendly to WRITE_MTT firmware command, and pass it
+        * chunks of appropriate size.
+        */
+       const int batch = PAGE_SIZE / sizeof (u64) - 2;
+       struct ib_umem_chunk *chunk;
+       u64 *addrs;
+       int filled = 0;         /* entries currently buffered in addrs[] */
+       int written = 0;        /* MTT entries already written */
+       int ent, pg, npg;
+       int ret = 0;
+
+       addrs = (u64 *) __get_free_page(GFP_KERNEL);
+       if (addrs == NULL)
+               return -ENOMEM;
+
+       list_for_each_entry(chunk, &umem->chunk_list, list) {
+               for (ent = 0; ent < chunk->nmap; ++ent) {
+                       npg = sg_dma_len(&chunk->page_list[ent]) >> mtt->page_shift;
+                       for (pg = 0; pg < npg; ++pg) {
+                               addrs[filled++] = sg_dma_address(&chunk->page_list[ent]) +
+                                       umem->page_size * pg;
+                               if (filled != batch)
+                                       continue;
+
+                               /* Buffer is full: flush it and start over. */
+                               ret = mlx4_write_mtt(dev->dev, mtt, written,
+                                                    filled, addrs);
+                               if (ret)
+                                       goto out;
+                               written += filled;
+                               filled = 0;
+                       }
+               }
+       }
+
+       /* Flush whatever is left over from the last partial batch. */
+       if (filled)
+               ret = mlx4_write_mtt(dev->dev, mtt, written, filled, addrs);
+
+out:
+       free_page((unsigned long) addrs);
+       return ret;
+}
+
+/*
+ * Register the user memory range [@start, @start + @length) as a memory
+ * region in @pd: pin it with ib_umem_get(), allocate an MR covering
+ * @virt_addr/@length, write the page addresses into its MTT and enable
+ * it.  @udata is not used here.
+ *
+ * Returns the new ib_mr, or an ERR_PTR() on failure.
+ */
+struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+                                 u64 virt_addr, int access_flags,
+                                 struct ib_udata *udata)
+{
+       struct mlx4_ib_dev *mdev = to_mdev(pd->device);
+       struct mlx4_ib_mr *mr;
+       int npages;
+       int page_shift;
+       int ret;
+
+       mr = kmalloc(sizeof (*mr), GFP_KERNEL);
+       if (mr == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       mr->umem = ib_umem_get(pd->uobject->context, start, length, access_flags);
+       if (IS_ERR(mr->umem)) {
+               ret = PTR_ERR(mr->umem);
+               goto out_free;
+       }
+
+       npages     = ib_umem_page_count(mr->umem);
+       page_shift = ilog2(mr->umem->page_size);
+
+       ret = mlx4_mr_alloc(mdev->dev, to_mpd(pd)->pdn, virt_addr, length,
+                           convert_access(access_flags), npages, page_shift,
+                           &mr->mmr);
+       if (ret)
+               goto out_umem;
+
+       ret = mlx4_ib_umem_write_mtt(mdev, &mr->mmr.mtt, mr->umem);
+       if (!ret)
+               ret = mlx4_mr_enable(mdev->dev, &mr->mmr);
+       if (ret) {
+               mlx4_mr_free(mdev->dev, &mr->mmr);
+               goto out_umem;
+       }
+
+       mr->ibmr.lkey = mr->mmr.key;
+       mr->ibmr.rkey = mr->mmr.key;
+
+       return &mr->ibmr;
+
+out_umem:
+       ib_umem_release(mr->umem);
+
+out_free:
+       kfree(mr);
+
+       return ERR_PTR(ret);
+}
+
+/*
+ * Destroy a memory region: free the hardware MR, release the pinned
+ * user memory if there is any, and free the structure.  Always
+ * returns 0.
+ */
+int mlx4_ib_dereg_mr(struct ib_mr *ibmr)
+{
+       struct mlx4_ib_mr *region = to_mmr(ibmr);
+       struct mlx4_dev *mdev = to_mdev(ibmr->device)->dev;
+
+       mlx4_mr_free(mdev, &region->mmr);
+
+       /* DMA regions carry no umem. */
+       if (region->umem != NULL)
+               ib_umem_release(region->umem);
+
+       kfree(region);
+       return 0;
+}
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
new file mode 100644 (file)
index 0000000..5cd7069
--- /dev/null
@@ -0,0 +1,1294 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_pack.h>
+
+#include <linux/mlx4/qp.h>
+
+#include "mlx4_ib.h"
+#include "user.h"
+
+/* NOTE(review): meaning not visible in this part of the file -- presumably how often an ACK is requested; confirm at its users. */
+enum {
+       MLX4_IB_ACK_REQ_FREQ    = 8,
+};
+
+/* NOTE(review): presumably default scheduling-queue values for regular QPs and for QP0 -- confirm at their users. */
+enum {
+       MLX4_IB_DEFAULT_SCHED_QUEUE     = 0x83,
+       MLX4_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f
+};
+
+enum {
+       /*
+        * Largest possible UD header: send with GRH and immediate data.
+        * Sizes header_buf in struct mlx4_ib_sqp and is counted in
+        * send_wqe_overhead() for SMI/GSI QPs.
+        */
+       MLX4_IB_UD_HEADER_SIZE          = 72
+};
+
+/*
+ * Special QP: a regular mlx4_ib_qp (which must stay the first use of
+ * container_of() in to_msqp()) plus extra state.  header_buf provides
+ * MLX4_IB_UD_HEADER_SIZE bytes of storage.
+ * NOTE(review): presumably used to build the UD header in software for
+ * each send -- confirm in the send path.
+ */
+struct mlx4_ib_sqp {
+       struct mlx4_ib_qp       qp;
+       int                     pkey_index;
+       u32                     qkey;
+       u32                     send_psn;
+       struct ib_ud_header     ud_header;
+       u8                      header_buf[MLX4_IB_UD_HEADER_SIZE];
+};
+
+/*
+ * Maps IB work request opcodes (IB_WR_*) to the corresponding MLX4
+ * opcodes, already converted to big-endian at compile time.
+ */
+static const __be32 mlx4_ib_opcode[] = {
+       [IB_WR_SEND]                    = __constant_cpu_to_be32(MLX4_OPCODE_SEND),
+       [IB_WR_SEND_WITH_IMM]           = __constant_cpu_to_be32(MLX4_OPCODE_SEND_IMM),
+       [IB_WR_RDMA_WRITE]              = __constant_cpu_to_be32(MLX4_OPCODE_RDMA_WRITE),
+       [IB_WR_RDMA_WRITE_WITH_IMM]     = __constant_cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM),
+       [IB_WR_RDMA_READ]               = __constant_cpu_to_be32(MLX4_OPCODE_RDMA_READ),
+       [IB_WR_ATOMIC_CMP_AND_SWP]      = __constant_cpu_to_be32(MLX4_OPCODE_ATOMIC_CS),
+       [IB_WR_ATOMIC_FETCH_AND_ADD]    = __constant_cpu_to_be32(MLX4_OPCODE_ATOMIC_FA),
+};
+
+/*
+ * Return the mlx4_ib_sqp that embeds @mqp.  Only valid when @mqp really
+ * is the qp member of a struct mlx4_ib_sqp.
+ */
+static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
+{
+       return container_of(mqp, struct mlx4_ib_sqp, qp);
+}
+
+/*
+ * Returns nonzero when @qp's number lies in the special QP range
+ * [sqp_start, sqp_start + 3] of @dev.
+ */
+static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+       if (qp->mqp.qpn < dev->dev->caps.sqp_start)
+               return 0;
+
+       return qp->mqp.qpn <= dev->dev->caps.sqp_start + 3;
+}
+
+/*
+ * Returns nonzero when @qp's number is one of the first two special
+ * QP numbers, i.e. lies in [sqp_start, sqp_start + 1].
+ */
+static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+       if (qp->mqp.qpn < dev->dev->caps.sqp_start)
+               return 0;
+
+       return qp->mqp.qpn <= dev->dev->caps.sqp_start + 1;
+}
+
+/*
+ * Return the kernel address of byte @offset inside @qp's work queue
+ * buffer.
+ */
+static void *get_wqe(struct mlx4_ib_qp *qp, int offset)
+{
+       /* Buffer split into pages: pick the page, then the offset within it. */
+       if (qp->buf.nbufs != 1)
+               return qp->buf.u.page_list[offset >> PAGE_SHIFT].buf +
+                       (offset & (PAGE_SIZE - 1));
+
+       /* Single contiguous buffer. */
+       return qp->buf.u.direct.buf + offset;
+}
+
+/* Return the address of receive WQE number @n of @qp. */
+static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n)
+{
+       int off = qp->rq.offset + (n << qp->rq.wqe_shift);
+
+       return get_wqe(qp, off);
+}
+
+/* Return the address of send WQE number @n of @qp. */
+static void *get_send_wqe(struct mlx4_ib_qp *qp, int n)
+{
+       int off = qp->sq.offset + (n << qp->sq.wqe_shift);
+
+       return get_wqe(qp, off);
+}
+
+/*
+ * Asynchronous event callback for a QP.  On a path migration the QP's
+ * current port is switched to its alternate port.  If the consumer
+ * registered an event handler, the mlx4 event type is translated to the
+ * matching IB event and delivered; unknown types are logged and dropped.
+ */
+static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
+{
+       struct mlx4_ib_qp *mqp = to_mibqp(qp);
+       struct ib_qp *ibqp = &mqp->ibqp;
+       struct ib_event event;
+
+       /* Track the port even when nobody listens for events. */
+       if (type == MLX4_EVENT_TYPE_PATH_MIG)
+               mqp->port = mqp->alt_port;
+
+       if (!ibqp->event_handler)
+               return;
+
+       switch (type) {
+       case MLX4_EVENT_TYPE_PATH_MIG:
+               event.event = IB_EVENT_PATH_MIG;
+               break;
+       case MLX4_EVENT_TYPE_COMM_EST:
+               event.event = IB_EVENT_COMM_EST;
+               break;
+       case MLX4_EVENT_TYPE_SQ_DRAINED:
+               event.event = IB_EVENT_SQ_DRAINED;
+               break;
+       case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE:
+               event.event = IB_EVENT_QP_LAST_WQE_REACHED;
+               break;
+       case MLX4_EVENT_TYPE_WQ_CATAS_ERROR:
+               event.event = IB_EVENT_QP_FATAL;
+               break;
+       case MLX4_EVENT_TYPE_PATH_MIG_FAILED:
+               event.event = IB_EVENT_PATH_MIG_ERR;
+               break;
+       case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+               event.event = IB_EVENT_QP_REQ_ERR;
+               break;
+       case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR:
+               event.event = IB_EVENT_QP_ACCESS_ERR;
+               break;
+       default:
+               printk(KERN_WARNING "mlx4_ib: Unexpected event type %d "
+                      "on QP %06x\n", type, qp->qpn);
+               return;
+       }
+
+       event.device     = ibqp->device;
+       event.element.qp = ibqp;
+
+       ibqp->event_handler(&event, ibqp->qp_context);
+}
+
+/*
+ * Return the number of bytes of a send WQE that are taken up by fixed
+ * segments (i.e. not available for data) for QP type @type.
+ *
+ * Every WQE starts with a control segment.  On top of that:
+ *  - UD WQEs must have a datagram segment.
+ *  - UC WQEs might have a remote address segment.
+ *  - RC WQEs might have a remote address and an atomic segment.
+ *  - MLX WQEs (SMI/GSI) need two extra inline data segments (for the
+ *    UD header and space for the ICRC).
+ */
+static int send_wqe_overhead(enum ib_qp_type type)
+{
+       int overhead = sizeof (struct mlx4_wqe_ctrl_seg);
+
+       switch (type) {
+       case IB_QPT_UD:
+               overhead += sizeof (struct mlx4_wqe_datagram_seg);
+               break;
+       case IB_QPT_UC:
+               overhead += sizeof (struct mlx4_wqe_raddr_seg);
+               break;
+       case IB_QPT_RC:
+               overhead += sizeof (struct mlx4_wqe_atomic_seg) +
+                       sizeof (struct mlx4_wqe_raddr_seg);
+               break;
+       case IB_QPT_SMI:
+       case IB_QPT_GSI:
+               overhead += ALIGN(MLX4_IB_UD_HEADER_SIZE +
+                                 sizeof (struct mlx4_wqe_inline_seg),
+                                 sizeof (struct mlx4_wqe_data_seg)) +
+                       ALIGN(4 +
+                             sizeof (struct mlx4_wqe_inline_seg),
+                             sizeof (struct mlx4_wqe_data_seg));
+               break;
+       default:
+               break;
+       }
+
+       return overhead;
+}
+
+/*
+ * set_qp_size - validate the requested QP capabilities and derive the
+ * work queue geometry from them.
+ * @dev:  device whose limits (caps) are checked against
+ * @cap:  requested capabilities; updated on success with the values
+ *        actually provided
+ * @type: QP type, used to compute the per-WQE overhead
+ * @qp:   QP whose rq/sq max, max_gs, wqe_shift, offset and buf_size
+ *        are filled in
+ *
+ * Returns 0 on success or -EINVAL if the request exceeds a device limit.
+ */
+static int set_qp_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
+                      enum ib_qp_type type, struct mlx4_ib_qp *qp)
+{
+       int overhead = send_wqe_overhead(type);
+
+       /* Sanity check QP size before proceeding */
+       if (cap->max_send_wr     > dev->dev->caps.max_wqes  ||
+           cap->max_recv_wr     > dev->dev->caps.max_wqes  ||
+           cap->max_send_sge    > dev->dev->caps.max_sq_sg ||
+           cap->max_recv_sge    > dev->dev->caps.max_rq_sg ||
+           cap->max_inline_data + overhead +
+           sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz)
+               return -EINVAL;
+
+       /*
+        * For MLX transport we need 2 extra S/G entries:
+        * one for the header and one for the checksum at the end
+        */
+       if ((type == IB_QPT_SMI || type == IB_QPT_GSI) &&
+           cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
+               return -EINVAL;
+
+       qp->rq.max = cap->max_recv_wr ? roundup_pow_of_two(cap->max_recv_wr) : 0;
+       qp->sq.max = cap->max_send_wr ? roundup_pow_of_two(cap->max_send_wr) : 0;
+
+       /*
+        * roundup_pow_of_two() is undefined for 0, so size receive WQEs
+        * for at least one S/G entry even if the caller asked for none.
+        */
+       qp->rq.wqe_shift = ilog2(roundup_pow_of_two(max_t(u32, cap->max_recv_sge, 1) *
+                                                   sizeof (struct mlx4_wqe_data_seg)));
+       qp->rq.max_gs    = (1 << qp->rq.wqe_shift) / sizeof (struct mlx4_wqe_data_seg);
+
+       /*
+        * A send WQE must hold the fixed overhead plus whichever is
+        * larger: the S/G list or the inline data with its header.
+        */
+       qp->sq.wqe_shift = ilog2(roundup_pow_of_two(max(cap->max_send_sge *
+                                                       sizeof (struct mlx4_wqe_data_seg),
+                                                       cap->max_inline_data +
+                                                       sizeof (struct mlx4_wqe_inline_seg)) +
+                                                   overhead));
+       qp->sq.max_gs    = ((1 << qp->sq.wqe_shift) - overhead) /
+               sizeof (struct mlx4_wqe_data_seg);
+
+       /* Both queues share one buffer; the queue with the larger WQEs goes first. */
+       qp->buf_size = (qp->rq.max << qp->rq.wqe_shift) +
+               (qp->sq.max << qp->sq.wqe_shift);
+       if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
+               qp->rq.offset = 0;
+               qp->sq.offset = qp->rq.max << qp->rq.wqe_shift;
+       } else {
+               qp->rq.offset = qp->sq.max << qp->sq.wqe_shift;
+               qp->sq.offset = 0;
+       }
+
+       /* Report back what the caller actually gets after rounding up. */
+       cap->max_send_wr  = qp->sq.max;
+       cap->max_recv_wr  = qp->rq.max;
+       cap->max_send_sge = qp->sq.max_gs;
+       cap->max_recv_sge = qp->rq.max_gs;
+       cap->max_inline_data = (1 << qp->sq.wqe_shift) - overhead -
+               sizeof (struct mlx4_wqe_inline_seg);
+
+       return 0;
+}
+
+/*
+ * Common QP setup used for both regular and special QPs.
+ *
+ * Resets the software state of @qp, sizes its work queues through
+ * set_qp_size(), sets up the WQE buffer, its MTT and the doorbell
+ * record (from user memory when @pd belongs to a userspace context,
+ * from kernel allocations otherwise) and finally allocates the
+ * low-level QP with mlx4_qp_alloc(), passing @sqpn through unchanged.
+ *
+ * Returns 0 on success or a negative errno; on failure everything that
+ * was set up here has been released again.  @qp itself is owned by the
+ * caller and is never freed here.
+ */
+static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
+                           struct ib_qp_init_attr *init_attr,
+                           struct ib_udata *udata, int sqpn, struct mlx4_ib_qp *qp)
+{
+       struct mlx4_wqe_ctrl_seg *ctrl;
+       int err;
+       int i;
+
+       mutex_init(&qp->mutex);
+       spin_lock_init(&qp->sq.lock);
+       spin_lock_init(&qp->rq.lock);
+
+       /* A new QP starts in RESET with no access rights and empty queues. */
+       qp->state        = IB_QPS_RESET;
+       qp->atomic_rd_en = 0;
+       qp->resp_depth   = 0;
+
+       qp->rq.head         = 0;
+       qp->rq.tail         = 0;
+       qp->sq.head         = 0;
+       qp->sq.tail         = 0;
+
+       err = set_qp_size(dev, &init_attr->cap, init_attr->qp_type, qp);
+       if (err)
+               goto err;
+
+       if (pd->uobject) {
+               /* Userspace QP: buffer and doorbell live in user memory. */
+               struct mlx4_ib_create_qp ucmd;
+
+               if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+                       err = -EFAULT;
+                       goto err;
+               }
+
+               qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
+                                      qp->buf_size, 0);
+               if (IS_ERR(qp->umem)) {
+                       err = PTR_ERR(qp->umem);
+                       goto err;
+               }
+
+               err = mlx4_mtt_init(dev->dev, ib_umem_page_count(qp->umem),
+                                   ilog2(qp->umem->page_size), &qp->mtt);
+               if (err)
+                       goto err_buf;
+
+               err = mlx4_ib_umem_write_mtt(dev, &qp->mtt, qp->umem);
+               if (err)
+                       goto err_mtt;
+
+               err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
+                                         ucmd.db_addr, &qp->db);
+               if (err)
+                       goto err_mtt;
+       } else {
+               /* Kernel QP: allocate doorbell record, buffer and wrid arrays. */
+               err = mlx4_ib_db_alloc(dev, &qp->db, 0);
+               if (err)
+                       goto err;
+
+               *qp->db.db = 0;
+
+               if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf)) {
+                       err = -ENOMEM;
+                       goto err_db;
+               }
+
+               err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift,
+                                   &qp->mtt);
+               if (err)
+                       goto err_buf;
+
+               err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf);
+               if (err)
+                       goto err_mtt;
+
+               /* Set bit 31 of owner_opcode in every send WQE up front. */
+               for (i = 0; i < qp->sq.max; ++i) {
+                       ctrl = get_send_wqe(qp, i);
+                       ctrl->owner_opcode = cpu_to_be32(1 << 31);
+               }
+
+               /* One 64-bit work request ID slot per queue entry. */
+               qp->sq.wrid  = kmalloc(qp->sq.max * sizeof (u64), GFP_KERNEL);
+               qp->rq.wrid  = kmalloc(qp->rq.max * sizeof (u64), GFP_KERNEL);
+
+               if (!qp->sq.wrid || !qp->rq.wrid) {
+                       err = -ENOMEM;
+                       goto err_wrid;
+               }
+
+               /* We don't support inline sends for kernel QPs (yet) */
+               init_attr->cap.max_inline_data = 0;
+       }
+
+       err = mlx4_qp_alloc(dev->dev, sqpn, &qp->mqp);
+       if (err)
+               goto err_wrid;
+
+       /*
+        * Hardware wants QPN written in big-endian order (after
+        * shifting) for send doorbell.  Precompute this value to save
+        * a little bit when posting sends.
+        */
+       qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
+
+       if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
+               qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+       else
+               qp->sq_signal_bits = 0;
+
+       qp->mqp.event = mlx4_ib_qp_event;
+
+       return 0;
+
+       /*
+        * Error unwinding: each label undoes one setup step and falls
+        * through to the labels for the steps that preceded it.
+        */
+err_wrid:
+       if (pd->uobject)
+               mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &qp->db);
+       else {
+               /* kfree(NULL) is a no-op, so a half-failed pair is fine. */
+               kfree(qp->sq.wrid);
+               kfree(qp->rq.wrid);
+       }
+
+err_mtt:
+       mlx4_mtt_cleanup(dev->dev, &qp->mtt);
+
+err_buf:
+       if (pd->uobject)
+               ib_umem_release(qp->umem);
+       else
+               mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
+
+err_db:
+       /* Only the kernel path allocated a doorbell record before the buffer. */
+       if (!pd->uobject)
+               mlx4_ib_db_free(dev, &qp->db);
+
+err:
+       return err;
+}
+
+/*
+ * Translate an IB verbs QP state into the corresponding mlx4 QP state.
+ * A value that is not one of the seven IB states handled below yields
+ * -1.
+ */
+static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state)
+{
+       if (state == IB_QPS_RESET)
+               return MLX4_QP_STATE_RST;
+       if (state == IB_QPS_INIT)
+               return MLX4_QP_STATE_INIT;
+       if (state == IB_QPS_RTR)
+               return MLX4_QP_STATE_RTR;
+       if (state == IB_QPS_RTS)
+               return MLX4_QP_STATE_RTS;
+       if (state == IB_QPS_SQD)
+               return MLX4_QP_STATE_SQD;
+       if (state == IB_QPS_SQE)
+               return MLX4_QP_STATE_SQER;
+       if (state == IB_QPS_ERR)
+               return MLX4_QP_STATE_ERR;
+
+       return -1;
+}
+
+/*
+ * Lock the send and receive CQs of a QP with interrupts disabled.
+ * If both are the same CQ it is locked once; otherwise the CQ with the
+ * lower CQ number is taken first and the other one is taken nested.
+ */
+static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
+{
+       struct mlx4_ib_cq *outer;
+       struct mlx4_ib_cq *inner;
+
+       if (send_cq == recv_cq) {
+               spin_lock_irq(&send_cq->lock);
+               return;
+       }
+
+       if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
+               outer = send_cq;
+               inner = recv_cq;
+       } else {
+               outer = recv_cq;
+               inner = send_cq;
+       }
+
+       spin_lock_irq(&outer->lock);
+       spin_lock_nested(&inner->lock, SINGLE_DEPTH_NESTING);
+}
+
+/*
+ * Release the CQ locks taken by mlx4_ib_lock_cqs().  The nested
+ * (higher CQ number) lock is dropped first, then the outer lock is
+ * dropped and interrupts are re-enabled.
+ */
+static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
+{
+       struct mlx4_ib_cq *outer;
+       struct mlx4_ib_cq *inner;
+
+       if (send_cq == recv_cq) {
+               spin_unlock_irq(&send_cq->lock);
+               return;
+       }
+
+       if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
+               outer = send_cq;
+               inner = recv_cq;
+       } else {
+               outer = recv_cq;
+               inner = send_cq;
+       }
+
+       spin_unlock(&inner->lock);
+       spin_unlock_irq(&outer->lock);
+}
+
+/*
+ * Tear down everything create_qp_common() set up for @qp.
+ *
+ * @is_user selects between the userspace resources (mapped doorbell,
+ * umem) and the kernel resources (wrid arrays, buffer, doorbell
+ * record).  The memory holding @qp itself is left to the caller.
+ */
+static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
+                             int is_user)
+{
+       struct mlx4_ib_cq *send_cq, *recv_cq;
+
+       /* Move the QP to RESET first; a failure is only logged. */
+       if (qp->state != IB_QPS_RESET)
+               if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state),
+                                  MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp))
+                       printk(KERN_WARNING "mlx4_ib: modify QP %06x to RESET failed.\n",
+                              qp->mqp.qpn);
+
+       send_cq = to_mcq(qp->ibqp.send_cq);
+       recv_cq = to_mcq(qp->ibqp.recv_cq);
+
+       /*
+        * CQ cleaning and removal of the QP are done with both CQ locks
+        * held.
+        */
+       mlx4_ib_lock_cqs(send_cq, recv_cq);
+
+       if (!is_user) {
+               /* Kernel QP: clean this QPN's entries out of its CQs. */
+               __mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
+                                qp->ibqp.srq ? to_msrq(qp->ibqp.srq): NULL);
+               if (send_cq != recv_cq)
+                       __mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
+       }
+
+       mlx4_qp_remove(dev->dev, &qp->mqp);
+
+       mlx4_ib_unlock_cqs(send_cq, recv_cq);
+
+       mlx4_qp_free(dev->dev, &qp->mqp);
+       mlx4_mtt_cleanup(dev->dev, &qp->mtt);
+
+       if (is_user) {
+               mlx4_ib_db_unmap_user(to_mucontext(qp->ibqp.uobject->context),
+                                     &qp->db);
+               ib_umem_release(qp->umem);
+       } else {
+               kfree(qp->sq.wrid);
+               kfree(qp->rq.wrid);
+               mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
+               mlx4_ib_db_free(dev, &qp->db);
+       }
+}
+
+/*
+ * Create a QP on protection domain @pd.
+ *
+ * RC, UC and UD QPs are created with sqpn 0 and report the QP number
+ * stored in the low-level QP.  SMI and GSI QPs are kernel-only; they
+ * are embedded in a struct mlx4_ib_sqp, use a QPN derived from
+ * caps.sqp_start, the QP type and the port, and report QP number 0
+ * (SMI) or 1 (GSI).  Every other QP type is rejected.
+ *
+ * Returns the new ib_qp, or an ERR_PTR() carrying -ENOMEM, -EINVAL or
+ * the error returned by create_qp_common().
+ */
+struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
+                               struct ib_qp_init_attr *init_attr,
+                               struct ib_udata *udata)
+{
+       struct mlx4_ib_dev *dev = to_mdev(pd->device);
+       struct mlx4_ib_sqp *sqp;
+       struct mlx4_ib_qp *qp;
+       int is_smi;
+       int sqpn;
+       int err;
+
+       switch (init_attr->qp_type) {
+       case IB_QPT_RC:
+       case IB_QPT_UC:
+       case IB_QPT_UD:
+               qp = kmalloc(sizeof *qp, GFP_KERNEL);
+               if (!qp)
+                       return ERR_PTR(-ENOMEM);
+
+               err = create_qp_common(dev, pd, init_attr, udata, 0, qp);
+               if (err) {
+                       kfree(qp);
+                       return ERR_PTR(err);
+               }
+
+               qp->ibqp.qp_num = qp->mqp.qpn;
+
+               return &qp->ibqp;
+
+       case IB_QPT_SMI:
+       case IB_QPT_GSI:
+               /* Userspace is not allowed to create special QPs: */
+               if (pd->uobject)
+                       return ERR_PTR(-EINVAL);
+
+               sqp = kmalloc(sizeof *sqp, GFP_KERNEL);
+               if (!sqp)
+                       return ERR_PTR(-ENOMEM);
+
+               qp = &sqp->qp;
+
+               /* SMI QPs come first, GSI QPs start two slots later. */
+               is_smi = init_attr->qp_type == IB_QPT_SMI;
+               sqpn   = dev->dev->caps.sqp_start + (is_smi ? 0 : 2) +
+                       init_attr->port_num - 1;
+
+               err = create_qp_common(dev, pd, init_attr, udata, sqpn, qp);
+               if (err) {
+                       kfree(sqp);
+                       return ERR_PTR(err);
+               }
+
+               qp->port        = init_attr->port_num;
+               qp->ibqp.qp_num = is_smi ? 0 : 1;
+
+               return &qp->ibqp;
+
+       default:
+               /* Don't support raw QPs */
+               return ERR_PTR(-EINVAL);
+       }
+}
+
+/*
+ * Destroy a QP created by mlx4_ib_create_qp().  For QP0 the port is
+ * closed first.  The memory is released through the containing
+ * mlx4_ib_sqp when the QP is a special QP.  Always returns 0.
+ */
+int mlx4_ib_destroy_qp(struct ib_qp *qp)
+{
+       struct mlx4_ib_dev *dev = to_mdev(qp->device);
+       struct mlx4_ib_qp *mqp = to_mqp(qp);
+       int is_user = qp->pd->uobject != NULL;
+
+       if (is_qp0(dev, mqp))
+               mlx4_CLOSE_PORT(dev->dev, mqp->port);
+
+       destroy_qp_common(dev, mqp, is_user);
+
+       if (!is_sqp(dev, mqp)) {
+               kfree(mqp);
+               return 0;
+       }
+
+       kfree(to_msqp(mqp));
+       return 0;
+}
+
+/*
+ * Issue INIT_PORT for @port using the capabilities recorded for the
+ * device.  A failure is only reported with a warning.
+ */
+static void init_port(struct mlx4_ib_dev *dev, int port)
+{
+       struct mlx4_init_port_param init;
+       int rc;
+
+       memset(&init, 0, sizeof init);
+
+       init.max_pkey       = dev->dev->caps.pkey_table_len;
+       init.max_gid        = dev->dev->caps.gid_table_len;
+       init.mtu            = ib_mtu_enum_to_int(dev->dev->caps.mtu_cap);
+       init.vl_cap         = dev->dev->caps.vl_cap;
+       init.port_width_cap = dev->dev->caps.port_width_cap;
+
+       rc = mlx4_INIT_PORT(dev->dev, &init, port);
+       if (!rc)
+               return;
+
+       printk(KERN_WARNING "INIT_PORT failed, return code %d.\n", rc);
+}
+
+/*
+ * Map an IB QP type to the mlx4 service type.  SMI and GSI both map to
+ * the MLX service type; unsupported types yield -1.
+ */
+static int to_mlx4_st(enum ib_qp_type type)
+{
+       if (type == IB_QPT_RC)
+               return MLX4_QP_ST_RC;
+       if (type == IB_QPT_UC)
+               return MLX4_QP_ST_UC;
+       if (type == IB_QPT_UD)
+               return MLX4_QP_ST_UD;
+       if (type == IB_QPT_SMI || type == IB_QPT_GSI)
+               return MLX4_QP_ST_MLX;
+
+       return -1;
+}
+
+/*
+ * Build the big-endian RRE/RAE/RWE bits for the QP context.
+ *
+ * The responder depth and the access flags come from @attr when the
+ * matching bit is set in @attr_mask, and from the values cached in @qp
+ * otherwise.  With a responder depth of zero only remote write is
+ * kept.
+ */
+static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, struct ib_qp_attr *attr,
+                                  int attr_mask)
+{
+       u32 hw_bits = 0;
+       u32 ib_flags;
+       u8 depth;
+
+       depth = attr_mask & IB_QP_MAX_DEST_RD_ATOMIC ?
+               attr->max_dest_rd_atomic : qp->resp_depth;
+
+       ib_flags = attr_mask & IB_QP_ACCESS_FLAGS ?
+               attr->qp_access_flags : qp->atomic_rd_en;
+
+       if (depth == 0)
+               ib_flags &= IB_ACCESS_REMOTE_WRITE;
+
+       hw_bits |= ib_flags & IB_ACCESS_REMOTE_READ   ? MLX4_QP_BIT_RRE : 0;
+       hw_bits |= ib_flags & IB_ACCESS_REMOTE_ATOMIC ? MLX4_QP_BIT_RAE : 0;
+       hw_bits |= ib_flags & IB_ACCESS_REMOTE_WRITE  ? MLX4_QP_BIT_RWE : 0;
+
+       return cpu_to_be32(hw_bits);
+}
+
+/*
+ * Cache in @sqp the P_Key index, Q_Key and send PSN from @attr, each
+ * only when its bit is set in @attr_mask.
+ */
+static void store_sqp_attrs(struct mlx4_ib_sqp *sqp, struct ib_qp_attr *attr,
+                           int attr_mask)
+{
+       if (attr_mask & IB_QP_SQ_PSN)
+               sqp->send_psn = attr->sq_psn;
+
+       if (attr_mask & IB_QP_QKEY)
+               sqp->qkey = attr->qkey;
+
+       if (attr_mask & IB_QP_PKEY_INDEX)
+               sqp->pkey_index = attr->pkey_index;
+}
+
+/*
+ * Replace bit 6 of path->sched_queue with the zero-based port number,
+ * leaving the other bits untouched.
+ */
+static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port)
+{
+       u8 sched = path->sched_queue & 0xbf;
+
+       sched |= (port - 1) << 6;
+       path->sched_queue = sched;
+}
+
+/*
+ * Fill the hardware address path @path from the address handle
+ * attributes @ah for the given @port.
+ *
+ * A non-zero static rate is offset by MLX4_STAT_RATE_OFFSET and then
+ * lowered until it is either set in caps.stat_rate_support or has
+ * reached the 2.5 Gbps encoding.  GRH fields are filled only when
+ * IB_AH_GRH is set.
+ *
+ * Returns 0 on success, -1 if the source GID index is out of range.
+ */
+static int mlx4_set_path(struct mlx4_ib_dev *dev, struct ib_ah_attr *ah,
+                        struct mlx4_qp_path *path, u8 port)
+{
+       u32 tclass_fl;
+
+       path->grh_mylmc = ah->src_path_bits & 0x7f;
+       path->rlid      = cpu_to_be16(ah->dlid);
+
+       if (!ah->static_rate) {
+               path->static_rate = 0;
+       } else {
+               path->static_rate = ah->static_rate + MLX4_STAT_RATE_OFFSET;
+               for (;;) {
+                       if (path->static_rate <= IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET)
+                               break;
+                       if (1 << path->static_rate & dev->dev->caps.stat_rate_support)
+                               break;
+                       --path->static_rate;
+               }
+       }
+
+       path->counter_index = 0xff;
+
+       if (ah->ah_flags & IB_AH_GRH) {
+               if (ah->grh.sgid_index >= dev->dev->caps.gid_table_len) {
+                       printk(KERN_ERR "sgid_index (%u) too large. max is %d\n",
+                              ah->grh.sgid_index, dev->dev->caps.gid_table_len - 1);
+                       return -1;
+               }
+
+               tclass_fl = (ah->grh.traffic_class << 20) | ah->grh.flow_label;
+
+               path->grh_mylmc       |= 1 << 7;
+               path->mgid_index       = ah->grh.sgid_index;
+               path->hop_limit        = ah->grh.hop_limit;
+               path->tclass_flowlabel = cpu_to_be32(tclass_fl);
+               memcpy(path->rgid, ah->grh.dgid.raw, 16);
+       }
+
+       path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
+               ((port - 1) << 6) | ((ah->sl & 0xf) << 2);
+
+       return 0;
+}
+
+/*
+ * Modify the attributes and/or state of a QP.
+ *
+ * Validates the requested transition and attribute values, builds a
+ * QP context from @attr according to @attr_mask, and hands it to
+ * mlx4_qp_modify().  On success the attributes cached in the QP (state,
+ * access flags, responder depth, port, alternate port, special QP
+ * attributes) are updated, the port is brought up or down when QP0
+ * changes state, and a kernel QP that moved to RESET has its CQ entries
+ * cleaned and its queue indices reinitialized.
+ *
+ * Returns 0 on success, -ENOMEM if the context cannot be allocated,
+ * -EINVAL for an invalid transition or attribute, or the error
+ * returned by mlx4_qp_modify().
+ *
+ * Every exit after the mutex has been taken must go through the "out"
+ * label so that the mutex is released and the context is freed.
+ */
+int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+                     int attr_mask, struct ib_udata *udata)
+{
+       struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
+       struct mlx4_ib_qp *qp = to_mqp(ibqp);
+       struct mlx4_qp_context *context;
+       enum mlx4_qp_optpar optpar = 0;
+       enum ib_qp_state cur_state, new_state;
+       int sqd_event;
+       int err = -EINVAL;
+
+       context = kzalloc(sizeof *context, GFP_KERNEL);
+       if (!context)
+               return -ENOMEM;
+
+       mutex_lock(&qp->mutex);
+
+       cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
+       new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+       /* err is still -EINVAL for all of the validation failures below. */
+       if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask))
+               goto out;
+
+       if ((attr_mask & IB_QP_PKEY_INDEX) &&
+            attr->pkey_index >= dev->dev->caps.pkey_table_len) {
+               goto out;
+       }
+
+       if ((attr_mask & IB_QP_PORT) &&
+           (attr->port_num == 0 || attr->port_num > dev->dev->caps.num_ports)) {
+               goto out;
+       }
+
+       if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
+           attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) {
+               goto out;
+       }
+
+       if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
+           attr->max_dest_rd_atomic > 1 << dev->dev->caps.max_qp_dest_rdma) {
+               goto out;
+       }
+
+       context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) |
+                                    (to_mlx4_st(ibqp->qp_type) << 16));
+       context->flags     |= cpu_to_be32(1 << 8); /* DE? */
+
+       if (!(attr_mask & IB_QP_PATH_MIG_STATE))
+               context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
+       else {
+               optpar |= MLX4_QP_OPTPAR_PM_STATE;
+               switch (attr->path_mig_state) {
+               case IB_MIG_MIGRATED:
+                       context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
+                       break;
+               case IB_MIG_REARM:
+                       context->flags |= cpu_to_be32(MLX4_QP_PM_REARM << 11);
+                       break;
+               case IB_MIG_ARMED:
+                       context->flags |= cpu_to_be32(MLX4_QP_PM_ARMED << 11);
+                       break;
+               }
+       }
+
+       if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI ||
+           ibqp->qp_type == IB_QPT_UD)
+               context->mtu_msgmax = (IB_MTU_4096 << 5) | 11;
+       else if (attr_mask & IB_QP_PATH_MTU) {
+               if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) {
+                       printk(KERN_ERR "path MTU (%u) is invalid\n",
+                              attr->path_mtu);
+                       /* Must not return directly: mutex held, context allocated. */
+                       goto out;
+               }
+               context->mtu_msgmax = (attr->path_mtu << 5) | 31;
+       }
+
+       if (qp->rq.max)
+               context->rq_size_stride = ilog2(qp->rq.max) << 3;
+       context->rq_size_stride |= qp->rq.wqe_shift - 4;
+
+       if (qp->sq.max)
+               context->sq_size_stride = ilog2(qp->sq.max) << 3;
+       context->sq_size_stride |= qp->sq.wqe_shift - 4;
+
+       if (qp->ibqp.uobject)
+               context->usr_page = cpu_to_be32(to_mucontext(ibqp->uobject->context)->uar.index);
+       else
+               context->usr_page = cpu_to_be32(dev->priv_uar.index);
+
+       if (attr_mask & IB_QP_DEST_QPN)
+               context->remote_qpn = cpu_to_be32(attr->dest_qp_num);
+
+       if (attr_mask & IB_QP_PORT) {
+               if (cur_state == IB_QPS_SQD && new_state == IB_QPS_SQD &&
+                   !(attr_mask & IB_QP_AV)) {
+                       mlx4_set_sched(&context->pri_path, attr->port_num);
+                       optpar |= MLX4_QP_OPTPAR_SCHED_QUEUE;
+               }
+       }
+
+       if (attr_mask & IB_QP_PKEY_INDEX) {
+               context->pri_path.pkey_index = attr->pkey_index;
+               optpar |= MLX4_QP_OPTPAR_PKEY_INDEX;
+       }
+
+       if (attr_mask & IB_QP_AV) {
+               if (mlx4_set_path(dev, &attr->ah_attr, &context->pri_path,
+                                 attr_mask & IB_QP_PORT ? attr->port_num : qp->port)) {
+                       err = -EINVAL;
+                       goto out;
+               }
+
+               optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH |
+                          MLX4_QP_OPTPAR_SCHED_QUEUE);
+       }
+
+       if (attr_mask & IB_QP_TIMEOUT) {
+               context->pri_path.ackto = attr->timeout << 3;
+               optpar |= MLX4_QP_OPTPAR_ACK_TIMEOUT;
+       }
+
+       if (attr_mask & IB_QP_ALT_PATH) {
+               /* All failures here unwind through "out" with err == -EINVAL. */
+               if (attr->alt_pkey_index >= dev->dev->caps.pkey_table_len)
+                       goto out;
+
+               if (attr->alt_port_num == 0 ||
+                   attr->alt_port_num > dev->dev->caps.num_ports)
+                       goto out;
+
+               if (mlx4_set_path(dev, &attr->alt_ah_attr, &context->alt_path,
+                                 attr->alt_port_num))
+                       goto out;
+
+               context->alt_path.pkey_index = attr->alt_pkey_index;
+               context->alt_path.ackto = attr->alt_timeout << 3;
+               optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH;
+       }
+
+       context->pd         = cpu_to_be32(to_mpd(ibqp->pd)->pdn);
+
+       /*
+        * params1 gets its base value by plain assignment, so every
+        * field that is OR-ed into it must be set after this point.
+        */
+       context->params1    = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28);
+
+       if (attr_mask & IB_QP_RNR_RETRY) {
+               context->params1 |= cpu_to_be32(attr->rnr_retry << 13);
+               optpar |= MLX4_QP_OPTPAR_RNR_RETRY;
+       }
+
+       if (attr_mask & IB_QP_RETRY_CNT) {
+               context->params1 |= cpu_to_be32(attr->retry_cnt << 16);
+               optpar |= MLX4_QP_OPTPAR_RETRY_COUNT;
+       }
+
+       if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
+               if (attr->max_rd_atomic)
+                       context->params1 |=
+                               cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21);
+               optpar |= MLX4_QP_OPTPAR_SRA_MAX;
+       }
+
+       if (attr_mask & IB_QP_SQ_PSN)
+               context->next_send_psn = cpu_to_be32(attr->sq_psn);
+
+       context->cqn_send = cpu_to_be32(to_mcq(ibqp->send_cq)->mcq.cqn);
+
+       if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+               if (attr->max_dest_rd_atomic)
+                       context->params2 |=
+                               cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21);
+               optpar |= MLX4_QP_OPTPAR_RRA_MAX;
+       }
+
+       if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) {
+               context->params2 |= to_mlx4_access_flags(qp, attr, attr_mask);
+               optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE;
+       }
+
+       if (ibqp->srq)
+               context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC);
+
+       if (attr_mask & IB_QP_MIN_RNR_TIMER) {
+               context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);
+               optpar |= MLX4_QP_OPTPAR_RNR_TIMEOUT;
+       }
+       if (attr_mask & IB_QP_RQ_PSN)
+               context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);
+
+       context->cqn_recv = cpu_to_be32(to_mcq(ibqp->recv_cq)->mcq.cqn);
+
+       if (attr_mask & IB_QP_QKEY) {
+               context->qkey = cpu_to_be32(attr->qkey);
+               optpar |= MLX4_QP_OPTPAR_Q_KEY;
+       }
+
+       if (ibqp->srq)
+               context->srqn = cpu_to_be32(1 << 24 | to_msrq(ibqp->srq)->msrq.srqn);
+
+       if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+               context->db_rec_addr = cpu_to_be64(qp->db.dma);
+
+       if (cur_state == IB_QPS_INIT &&
+           new_state == IB_QPS_RTR  &&
+           (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI ||
+            ibqp->qp_type == IB_QPT_UD)) {
+               context->pri_path.sched_queue = (qp->port - 1) << 6;
+               if (is_qp0(dev, qp))
+                       context->pri_path.sched_queue |= MLX4_IB_DEFAULT_QP0_SCHED_QUEUE;
+               else
+                       context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE;
+       }
+
+       if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD  &&
+           attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify)
+               sqd_event = 1;
+       else
+               sqd_event = 0;
+
+       err = mlx4_qp_modify(dev->dev, &qp->mtt, to_mlx4_state(cur_state),
+                            to_mlx4_state(new_state), context, optpar,
+                            sqd_event, &qp->mqp);
+       if (err)
+               goto out;
+
+       /* The hardware accepted the change; update the cached attributes. */
+       qp->state = new_state;
+
+       if (attr_mask & IB_QP_ACCESS_FLAGS)
+               qp->atomic_rd_en = attr->qp_access_flags;
+       if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+               qp->resp_depth = attr->max_dest_rd_atomic;
+       if (attr_mask & IB_QP_PORT)
+               qp->port = attr->port_num;
+       if (attr_mask & IB_QP_ALT_PATH)
+               qp->alt_port = attr->alt_port_num;
+
+       if (is_sqp(dev, qp))
+               store_sqp_attrs(to_msqp(qp), attr, attr_mask);
+
+       /*
+        * If we moved QP0 to RTR, bring the IB link up; if we moved
+        * QP0 to RESET or ERROR, bring the link back down.
+        */
+       if (is_qp0(dev, qp)) {
+               if (cur_state != IB_QPS_RTR && new_state == IB_QPS_RTR)
+                       init_port(dev, qp->port);
+
+               if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR &&
+                   (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR))
+                       mlx4_CLOSE_PORT(dev->dev, qp->port);
+       }
+
+       /*
+        * If we moved a kernel QP to RESET, clean up all old CQ
+        * entries and reinitialize the QP.
+        */
+       if (new_state == IB_QPS_RESET && !ibqp->uobject) {
+               mlx4_ib_cq_clean(to_mcq(ibqp->recv_cq), qp->mqp.qpn,
+                                ibqp->srq ? to_msrq(ibqp->srq): NULL);
+               if (ibqp->send_cq != ibqp->recv_cq)
+                       mlx4_ib_cq_clean(to_mcq(ibqp->send_cq), qp->mqp.qpn, NULL);
+
+               qp->rq.head = 0;
+               qp->rq.tail = 0;
+               qp->sq.head = 0;
+               qp->sq.tail = 0;
+               *qp->db.db  = 0;
+       }
+
+out:
+       mutex_unlock(&qp->mutex);
+       kfree(context);
+       return err;
+}
+
+/*
+ * Build the MLX segment and the inline UD header for a send on a
+ * special QP.
+ *
+ * @wqe points at the start of the WQE: the MLX segment is written
+ * there and the inline segment carrying the packed UD header follows
+ * it directly.  The header fields are taken from the address handle in
+ * @wr, from the attributes cached in @sqp and from the cached GID and
+ * P_Key tables.
+ *
+ * Returns the number of bytes taken by the inline segment plus the
+ * packed header, rounded up to a multiple of 16, or -EINVAL if the
+ * opcode is neither IB_WR_SEND nor IB_WR_SEND_WITH_IMM.
+ */
+static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
+                           void *wqe)
+{
+       struct ib_device *ib_dev = &to_mdev(sqp->qp.ibqp.device)->ib_dev;
+       struct mlx4_wqe_mlx_seg *mlx = wqe;
+       struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
+       struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
+       u16 pkey;
+       int send_size;
+       int header_size;
+       int i;
+
+       /* Payload length is the sum of all gather entries. */
+       send_size = 0;
+       for (i = 0; i < wr->num_sge; ++i)
+               send_size += wr->sg_list[i].length;
+
+       ib_ud_header_init(send_size, mlx4_ib_ah_grh_present(ah), &sqp->ud_header);
+
+       sqp->ud_header.lrh.service_level   =
+               be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 28;
+       sqp->ud_header.lrh.destination_lid = ah->av.dlid;
+       sqp->ud_header.lrh.source_lid      = cpu_to_be16(ah->av.g_slid & 0x7f);
+       if (mlx4_ib_ah_grh_present(ah)) {
+               sqp->ud_header.grh.traffic_class =
+                       (be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 20) & 0xff;
+               sqp->ud_header.grh.flow_label    =
+                       ah->av.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
+               ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.port_pd) >> 24,
+                                 ah->av.gid_index, &sqp->ud_header.grh.source_gid);
+               memcpy(sqp->ud_header.grh.destination_gid.raw,
+                      ah->av.dgid, 16);
+       }
+
+       /*
+        * Keep only the CQ update bit of the flags already in the
+        * segment, then add the MLX-specific bits: VL15 for QP0, SLR
+        * for a permissive destination LID, and the SL.
+        */
+       mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+       mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) |
+                                 (sqp->ud_header.lrh.destination_lid ==
+                                  IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) |
+                                 (sqp->ud_header.lrh.service_level << 8));
+       mlx->rlid   = sqp->ud_header.lrh.destination_lid;
+
+       switch (wr->opcode) {
+       case IB_WR_SEND:
+               sqp->ud_header.bth.opcode        = IB_OPCODE_UD_SEND_ONLY;
+               sqp->ud_header.immediate_present = 0;
+               break;
+       case IB_WR_SEND_WITH_IMM:
+               sqp->ud_header.bth.opcode        = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+               sqp->ud_header.immediate_present = 1;
+               sqp->ud_header.immediate_data    = wr->imm_data;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       /* QP number 0 (the SMI QP) sends on VL15, everything else on VL0. */
+       sqp->ud_header.lrh.virtual_lane    = !sqp->qp.ibqp.qp_num ? 15 : 0;
+       if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE)
+               sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE;
+       sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
+       /* QP0 uses the P_Key index cached in the sqp, QP1 the one in the WR. */
+       if (!sqp->qp.ibqp.qp_num)
+               ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey);
+       else
+               ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->wr.ud.pkey_index, &pkey);
+       sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
+       sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+       /* The PSN is 24 bits wide; the cached counter advances per send. */
+       sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
+       /* A Q_Key with the top bit set selects the QP's own cached Q_Key. */
+       sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ?
+                                              sqp->qkey : wr->wr.ud.remote_qkey);
+       sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);
+
+       header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf);
+
+       /* Disabled debug dump of the packed header. */
+       if (0) {
+               printk(KERN_ERR "built UD header of size %d:\n", header_size);
+               for (i = 0; i < header_size / 4; ++i) {
+                       if (i % 8 == 0)
+                               printk("  [%02x] ", i * 4);
+                       printk(" %08x",
+                              be32_to_cpu(((__be32 *) sqp->header_buf)[i]));
+                       if ((i + 1) % 8 == 0)
+                               printk("\n");
+               }
+               printk("\n");
+       }
+
+       /* Bit 31 is set alongside the header length in the byte count. */
+       inl->byte_count = cpu_to_be32(1 << 31 | header_size);
+       memcpy(inl + 1, sqp->header_buf, header_size);
+
+       return ALIGN(sizeof (struct mlx4_wqe_inline_seg) + header_size, 16);
+}
+
+/*
+ * Check whether posting one more request after @nreq already queued
+ * ones would overflow work queue @wq.
+ *
+ * The common case is decided from an unlocked read of head and tail.
+ * Only when that read suggests the queue is full are the indices read
+ * again under the lock of @ib_cq (presumably because completion
+ * processing advances the tail under that lock; verify against the
+ * CQ polling code).
+ *
+ * Returns non-zero if the queue would overflow, 0 otherwise.
+ */
+static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq)
+{
+       struct mlx4_ib_cq *cq;
+       unsigned in_use;
+
+       in_use = wq->head - wq->tail;
+       if (likely(in_use + nreq < wq->max))
+               return 0;
+
+       cq = to_mcq(ib_cq);
+
+       spin_lock(&cq->lock);
+       in_use = wq->head - wq->tail;
+       spin_unlock(&cq->lock);
+
+       if (in_use + nreq >= wq->max)
+               return 1;
+
+       return 0;
+}
+
+int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+                     struct ib_send_wr **bad_wr)
+{
+       struct mlx4_ib_qp *qp = to_mqp(ibqp);
+       void *wqe;
+       struct mlx4_wqe_ctrl_seg *ctrl;
+       unsigned long flags;
+       int nreq;
+       int err = 0;
+       int ind;
+       int size;
+       int i;
+
+       spin_lock_irqsave(&qp->rq.lock, flags);
+
+       ind = qp->sq.head;
+
+       for (nreq = 0; wr; ++nreq, wr = wr->next) {
+               if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
+                       err = -ENOMEM;
+                       *bad_wr = wr;
+                       goto out;
+               }
+
+               if (unlikely(wr->num_sge > qp->sq.max_gs)) {
+                       err = -EINVAL;
+                       *bad_wr = wr;
+                       goto out;
+               }
+
+               ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.max - 1));
+               qp->sq.wrid[ind & (qp->sq.max - 1)] = wr->wr_id;
+
+               ctrl->srcrb_flags =
+                       (wr->send_flags & IB_SEND_SIGNALED ?
+                        cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
+                       (wr->send_flags & IB_SEND_SOLICITED ?
+                        cpu_to_be32(MLX4_WQE_CTRL_SOLICITED) : 0) |
+                       qp->sq_signal_bits;
+
+               if (wr->opcode == IB_WR_SEND_WITH_IMM ||
+                   wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+                       ctrl->imm = wr->imm_data;
+               else
+                       ctrl->imm = 0;
+
+               wqe += sizeof *ctrl;
+               size = sizeof *ctrl / 16;
+
+               switch (ibqp->qp_type) {
+               case IB_QPT_RC:
+               case IB_QPT_UC:
+                       switch (wr->opcode) {
+                       case IB_WR_ATOMIC_CMP_AND_SWP:
+                       case IB_WR_ATOMIC_FETCH_AND_ADD:
+                               ((struct mlx4_wqe_raddr_seg *) wqe)->raddr =
+                                       cpu_to_be64(wr->wr.atomic.remote_addr);
+                               ((struct mlx4_wqe_raddr_seg *) wqe)->rkey =
+                                       cpu_to_be32(wr->wr.atomic.rkey);
+                               ((struct mlx4_wqe_raddr_seg *) wqe)->reserved = 0;
+
+                               wqe  += sizeof (struct mlx4_wqe_raddr_seg);
+
+                               if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+                                       ((struct mlx4_wqe_atomic_seg *) wqe)->swap_add =
+                                               cpu_to_be64(wr->wr.atomic.swap);
+                                       ((struct mlx4_wqe_atomic_seg *) wqe)->compare =
+                                               cpu_to_be64(wr->wr.atomic.compare_add);
+                               } else {
+                                       ((struct mlx4_wqe_atomic_seg *) wqe)->swap_add =
+                                               cpu_to_be64(wr->wr.atomic.compare_add);
+                                       ((struct mlx4_wqe_atomic_seg *) wqe)->compare = 0;
+                               }
+
+                               wqe  += sizeof (struct mlx4_wqe_atomic_seg);
+                               size += (sizeof (struct mlx4_wqe_raddr_seg) +
+                                        sizeof (struct mlx4_wqe_atomic_seg)) / 16;
+
+                               break;
+
+                       case IB_WR_RDMA_READ:
+                       case IB_WR_RDMA_WRITE:
+                       case IB_WR_RDMA_WRITE_WITH_IMM:
+                               ((struct mlx4_wqe_raddr_seg *) wqe)->raddr =
+                                       cpu_to_be64(wr->wr.rdma.remote_addr);
+                               ((struct mlx4_wqe_raddr_seg *) wqe)->rkey =
+                                       cpu_to_be32(wr->wr.rdma.rkey);
+                               ((struct mlx4_wqe_raddr_seg *) wqe)->reserved = 0;
+
+                               wqe  += sizeof (struct mlx4_wqe_raddr_seg);
+                               size += sizeof (struct mlx4_wqe_raddr_seg) / 16;
+
+                               break;
+
+                       default:
+                               /* No extra segments required for sends */
+                               break;
+                       }
+                       break;
+
+               case IB_QPT_UD:
+                       memcpy(((struct mlx4_wqe_datagram_seg *) wqe)->av,
+                              &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
+                       ((struct mlx4_wqe_datagram_seg *) wqe)->dqpn =
+                               cpu_to_be32(wr->wr.ud.remote_qpn);
+                       ((struct mlx4_wqe_datagram_seg *) wqe)->qkey =
+                               cpu_to_be32(wr->wr.ud.remote_qkey);
+
+                       wqe  += sizeof (struct mlx4_wqe_datagram_seg);
+                       size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+                       break;
+
+               case IB_QPT_SMI:
+               case IB_QPT_GSI:
+                       err = build_mlx_header(to_msqp(qp), wr, ctrl);
+                       if (err < 0) {
+                               *bad_wr = wr;
+                               goto out;
+                       }
+                       wqe  += err;
+                       size += err / 16;
+
+                       err = 0;
+                       break;
+
+               default:
+                       break;
+               }
+
+               for (i = 0; i < wr->num_sge; ++i) {
+                       ((struct mlx4_wqe_data_seg *) wqe)->byte_count =
+                               cpu_to_be32(wr->sg_list[i].length);
+                       ((struct mlx4_wqe_data_seg *) wqe)->lkey =
+                               cpu_to_be32(wr->sg_list[i].lkey);
+                       ((struct mlx4_wqe_data_seg *) wqe)->addr =
+                               cpu_to_be64(wr->sg_list[i].addr);
+
+                       wqe  += sizeof (struct mlx4_wqe_data_seg);
+                       size += sizeof (struct mlx4_wqe_data_seg) / 16;
+               }
+
+               /* Add one more inline data segment for ICRC for MLX sends */
+               if (qp->ibqp.qp_type == IB_QPT_SMI || qp->ibqp.qp_type == IB_QPT_GSI) {
+                       ((struct mlx4_wqe_inline_seg *) wqe)->byte_count =
+                               cpu_to_be32((1 << 31) | 4);
+                       ((u32 *) wqe)[1] = 0;
+                       wqe  += sizeof (struct mlx4_wqe_data_seg);
+                       size += sizeof (struct mlx4_wqe_data_seg) / 16;
+               }
+
+               ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ?
+                                   MLX4_WQE_CTRL_FENCE : 0) | size;
+
+               /*
+                * Make sure descriptor is fully written before
+                * setting ownership bit (because HW can start
+                * executing as soon as we do).
+                */
+               wmb();
+
+               if (wr->opcode < 0 || wr->opcode > ARRAY_SIZE(mlx4_ib_opcode)) {
+                       err = -EINVAL;
+                       goto out;
+               }
+
+               ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
+                       (ind & qp->sq.max ? cpu_to_be32(1 << 31) : 0);
+
+               ++ind;
+       }
+
+out:
+       if (likely(nreq)) {
+               qp->sq.head += nreq;
+
+               /*
+                * Make sure that descriptors are written before
+                * doorbell record.
+                */
+               wmb();
+
+               writel(qp->doorbell_qpn,
+                      to_mdev(ibqp->device)->uar_map + MLX4_SEND_DOORBELL);
+
+               /*
+                * Make sure doorbells don't leak out of SQ spinlock
+                * and reach the HCA out of order.
+                */
+               mmiowb();
+       }
+
+       spin_unlock_irqrestore(&qp->rq.lock, flags);
+
+       return err;
+}
+
+/*
+ * Post a chain of receive work requests to the receive queue of a QP.
+ *
+ * @ibqp:   QP to post to
+ * @wr:     head of the singly linked list of receive work requests
+ * @bad_wr: set to the first request that could not be posted
+ *
+ * Returns 0 on success, -ENOMEM if mlx4_wq_overflow() reports that the
+ * receive queue has no room, or -EINVAL if a request carries more
+ * scatter entries than rq.max_gs.  Requests that precede the failing
+ * one stay posted and the doorbell record is updated to cover them.
+ */
+int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+                     struct ib_recv_wr **bad_wr)
+{
+       struct mlx4_ib_qp *qp = to_mqp(ibqp);
+       struct mlx4_wqe_data_seg *scat;
+       unsigned long flags;
+       int err = 0;
+       int nreq;
+       int ind;
+       int i;
+
+       spin_lock_irqsave(&qp->rq.lock, flags);
+
+       ind = qp->rq.head & (qp->rq.max - 1);
+
+       for (nreq = 0; wr; ++nreq, wr = wr->next) {
+               /*
+                * Receive WQEs are retired by completions on the
+                * receive CQ, so that is the CQ the overflow check
+                * has to be made against (not the send CQ).
+                */
+               if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
+                       err = -ENOMEM;
+                       *bad_wr = wr;
+                       goto out;
+               }
+
+               if (unlikely(wr->num_sge > qp->rq.max_gs)) {
+                       err = -EINVAL;
+                       *bad_wr = wr;
+                       goto out;
+               }
+
+               scat = get_recv_wqe(qp, ind);
+
+               for (i = 0; i < wr->num_sge; ++i) {
+                       scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length);
+                       scat[i].lkey       = cpu_to_be32(wr->sg_list[i].lkey);
+                       scat[i].addr       = cpu_to_be64(wr->sg_list[i].addr);
+               }
+
+               /* Terminate a short scatter list with an invalid-lkey entry */
+               if (i < qp->rq.max_gs) {
+                       scat[i].byte_count = 0;
+                       scat[i].lkey       = cpu_to_be32(MLX4_INVALID_LKEY);
+                       scat[i].addr       = 0;
+               }
+
+               qp->rq.wrid[ind] = wr->wr_id;
+
+               ind = (ind + 1) & (qp->rq.max - 1);
+       }
+
+out:
+       if (likely(nreq)) {
+               qp->rq.head += nreq;
+
+               /*
+                * Make sure that descriptors are written before
+                * doorbell record.
+                */
+               wmb();
+
+               *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff);
+       }
+
+       spin_unlock_irqrestore(&qp->rq.lock, flags);
+
+       return err;
+}
diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c
new file mode 100644 (file)
index 0000000..42ab4a8
--- /dev/null
@@ -0,0 +1,334 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx4/qp.h>
+#include <linux/mlx4/srq.h>
+
+#include "mlx4_ib.h"
+#include "user.h"
+
+/*
+ * Return the kernel virtual address of SRQ work queue entry n.  The
+ * entry lives either in the single direct buffer or, when the buffer is
+ * split into a page list, in the page that contains its byte offset.
+ */
+static void *get_wqe(struct mlx4_ib_srq *srq, int n)
+{
+       int byte_off = n << srq->msrq.wqe_shift;
+
+       if (srq->buf.nbufs != 1)
+               return srq->buf.u.page_list[byte_off >> PAGE_SHIFT].buf +
+                       (byte_off & (PAGE_SIZE - 1));
+
+       return srq->buf.u.direct.buf + byte_off;
+}
+
+/*
+ * Translate a low-level mlx4 SRQ event into the matching IB event and
+ * hand it to the consumer's event handler, if one is registered.
+ * Unknown event types are logged and dropped.
+ */
+static void mlx4_ib_srq_event(struct mlx4_srq *srq, enum mlx4_event type)
+{
+       struct ib_srq *ibsrq = &to_mibsrq(srq)->ibsrq;
+       struct ib_event event;
+
+       /* Nobody is listening: nothing to deliver */
+       if (!ibsrq->event_handler)
+               return;
+
+       switch (type) {
+       case MLX4_EVENT_TYPE_SRQ_CATAS_ERROR:
+               event.event = IB_EVENT_SRQ_ERR;
+               break;
+       case MLX4_EVENT_TYPE_SRQ_LIMIT:
+               event.event = IB_EVENT_SRQ_LIMIT_REACHED;
+               break;
+       default:
+               printk(KERN_WARNING "mlx4_ib: Unexpected event type %d "
+                      "on SRQ %06x\n", type, srq->srqn);
+               return;
+       }
+
+       event.element.srq = ibsrq;
+       event.device      = ibsrq->device;
+
+       ibsrq->event_handler(&event, ibsrq->srq_context);
+}
+
+/*
+ * Create a shared receive queue.
+ *
+ * @pd:        protection domain the SRQ belongs to; pd->uobject selects
+ *             the userspace path (buffer and doorbell supplied through
+ *             udata) versus the kernel path (buffer, doorbell and wrid
+ *             array allocated here)
+ * @init_attr: requested max_wr / max_sge; on success attr.max_wr is
+ *             updated to the usable depth (ring size minus one)
+ * @udata:     userspace command/response channel, used only when
+ *             pd->uobject is set
+ *
+ * Returns the embedded struct ib_srq on success or an ERR_PTR() value:
+ * -EINVAL if the requested size exceeds the device limits, -ENOMEM on
+ * allocation failure, -EFAULT if udata cannot be copied, or the error
+ * returned by one of the mlx4 helpers.
+ */
+struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
+                                 struct ib_srq_init_attr *init_attr,
+                                 struct ib_udata *udata)
+{
+       struct mlx4_ib_dev *dev = to_mdev(pd->device);
+       struct mlx4_ib_srq *srq;
+       struct mlx4_wqe_srq_next_seg *next;
+       int desc_size;
+       int buf_size;
+       int err;
+       int i;
+
+       /* Sanity check SRQ size before proceeding */
+       if (init_attr->attr.max_wr  >= dev->dev->caps.max_srq_wqes ||
+           init_attr->attr.max_sge >  dev->dev->caps.max_srq_sge)
+               return ERR_PTR(-EINVAL);
+
+       srq = kmalloc(sizeof *srq, GFP_KERNEL);
+       if (!srq)
+               return ERR_PTR(-ENOMEM);
+
+       mutex_init(&srq->mutex);
+       spin_lock_init(&srq->lock);
+       /* One extra entry: the tail WQE always stays unposted */
+       srq->msrq.max    = roundup_pow_of_two(init_attr->attr.max_wr + 1);
+       srq->msrq.max_gs = init_attr->attr.max_sge;
+
+       /* Descriptor stride: power of two, at least 32 bytes */
+       desc_size = max(32UL,
+                       roundup_pow_of_two(sizeof (struct mlx4_wqe_srq_next_seg) +
+                                          srq->msrq.max_gs *
+                                          sizeof (struct mlx4_wqe_data_seg)));
+       srq->msrq.wqe_shift = ilog2(desc_size);
+
+       buf_size = srq->msrq.max * desc_size;
+
+       if (pd->uobject) {
+               struct mlx4_ib_create_srq ucmd;
+
+               if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+                       err = -EFAULT;
+                       goto err_srq;
+               }
+
+               srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
+                                       buf_size, 0);
+               if (IS_ERR(srq->umem)) {
+                       err = PTR_ERR(srq->umem);
+                       goto err_srq;
+               }
+
+               err = mlx4_mtt_init(dev->dev, ib_umem_page_count(srq->umem),
+                                   ilog2(srq->umem->page_size), &srq->mtt);
+               if (err)
+                       goto err_buf;
+
+               err = mlx4_ib_umem_write_mtt(dev, &srq->mtt, srq->umem);
+               if (err)
+                       goto err_mtt;
+
+               err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
+                                         ucmd.db_addr, &srq->db);
+               if (err)
+                       goto err_mtt;
+       } else {
+               err = mlx4_ib_db_alloc(dev, &srq->db, 0);
+               if (err)
+                       goto err_srq;
+
+               *srq->db.db = 0;
+
+               if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &srq->buf)) {
+                       err = -ENOMEM;
+                       goto err_db;
+               }
+
+               srq->head    = 0;
+               srq->tail    = srq->msrq.max - 1;
+               srq->wqe_ctr = 0;
+
+               /* Chain all WQEs into a circular free list */
+               for (i = 0; i < srq->msrq.max; ++i) {
+                       next = get_wqe(srq, i);
+                       next->next_wqe_index =
+                               cpu_to_be16((i + 1) & (srq->msrq.max - 1));
+               }
+
+               err = mlx4_mtt_init(dev->dev, srq->buf.npages, srq->buf.page_shift,
+                                   &srq->mtt);
+               if (err)
+                       goto err_buf;
+
+               err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf);
+               if (err)
+                       goto err_mtt;
+
+               srq->wrid = kmalloc(srq->msrq.max * sizeof (u64), GFP_KERNEL);
+               if (!srq->wrid) {
+                       err = -ENOMEM;
+                       goto err_mtt;
+               }
+       }
+
+       err = mlx4_srq_alloc(dev->dev, to_mpd(pd)->pdn, &srq->mtt,
+                            srq->db.dma, &srq->msrq);
+       if (err)
+               goto err_wrid;
+
+       srq->msrq.event = mlx4_ib_srq_event;
+
+       if (pd->uobject) {
+               if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof (__u32))) {
+                       err = -EFAULT;
+                       /*
+                        * The HW SRQ already exists at this point, so it
+                        * must be torn down before its memory is freed.
+                        */
+                       goto err_hw_srq;
+               }
+       }
+
+       init_attr->attr.max_wr = srq->msrq.max - 1;
+
+       return &srq->ibsrq;
+
+err_hw_srq:
+       mlx4_srq_free(dev->dev, &srq->msrq);
+
+err_wrid:
+       if (pd->uobject)
+               mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db);
+       else
+               kfree(srq->wrid);
+
+err_mtt:
+       mlx4_mtt_cleanup(dev->dev, &srq->mtt);
+
+err_buf:
+       if (pd->uobject)
+               ib_umem_release(srq->umem);
+       else
+               mlx4_buf_free(dev->dev, buf_size, &srq->buf);
+
+err_db:
+       if (!pd->uobject)
+               mlx4_ib_db_free(dev, &srq->db);
+
+err_srq:
+       kfree(srq);
+
+       return ERR_PTR(err);
+}
+
+/*
+ * Modify SRQ attributes.  Resizing (IB_SRQ_MAX_WR) is rejected with
+ * -EINVAL; IB_SRQ_LIMIT arms the SRQ limit through mlx4_srq_arm() after
+ * checking the limit against the ring size.  Returns 0 on success or a
+ * negative errno.
+ */
+int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+                      enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
+{
+       struct mlx4_ib_dev *dev = to_mdev(ibsrq->device);
+       struct mlx4_ib_srq *srq = to_msrq(ibsrq);
+       int rc;
+
+       /* We don't support resizing SRQs (yet?) */
+       if (attr_mask & IB_SRQ_MAX_WR)
+               return -EINVAL;
+
+       /* Nothing else to do unless the limit is being changed */
+       if (!(attr_mask & IB_SRQ_LIMIT))
+               return 0;
+
+       if (attr->srq_limit >= srq->msrq.max)
+               return -EINVAL;
+
+       mutex_lock(&srq->mutex);
+       rc = mlx4_srq_arm(dev->dev, &srq->msrq, attr->srq_limit);
+       mutex_unlock(&srq->mutex);
+
+       return rc;
+}
+
+/*
+ * Destroy an SRQ: free the HW SRQ and its MTT, then release the
+ * resources specific to how it was created (user mapping and umem for
+ * userspace SRQs; wrid array, queue buffer and doorbell for kernel
+ * SRQs), and finally the SRQ structure itself.  Always returns 0.
+ */
+int mlx4_ib_destroy_srq(struct ib_srq *srq)
+{
+       struct mlx4_ib_dev *dev = to_mdev(srq->device);
+       struct mlx4_ib_srq *msrq = to_msrq(srq);
+
+       mlx4_srq_free(dev->dev, &msrq->msrq);
+       mlx4_mtt_cleanup(dev->dev, &msrq->mtt);
+
+       if (!srq->uobject) {
+               /* Kernel-owned SRQ */
+               kfree(msrq->wrid);
+               mlx4_buf_free(dev->dev, msrq->msrq.max << msrq->msrq.wqe_shift,
+                             &msrq->buf);
+               mlx4_ib_db_free(dev, &msrq->db);
+       } else {
+               /* Userspace-owned SRQ */
+               mlx4_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db);
+               ib_umem_release(msrq->umem);
+       }
+
+       kfree(msrq);
+
+       return 0;
+}
+
+/*
+ * Return WQE wqe_index to the SRQ free list by linking it after the
+ * current tail entry and making it the new tail.
+ */
+void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index)
+{
+       struct mlx4_wqe_srq_next_seg *tail_seg;
+
+       /* always called with interrupts disabled. */
+       spin_lock(&srq->lock);
+
+       tail_seg = get_wqe(srq, srq->tail);
+       tail_seg->next_wqe_index = cpu_to_be16(wqe_index);
+
+       srq->tail = wqe_index;
+
+       spin_unlock(&srq->lock);
+}
+
+/*
+ * Post a chain of receive work requests to a shared receive queue.
+ *
+ * @ibsrq:  SRQ to post to
+ * @wr:     head of the singly linked list of receive work requests
+ * @bad_wr: set to the first request that could not be posted
+ *
+ * Returns 0 on success, -ENOMEM if the SRQ has no free WQE left, or
+ * -EINVAL if a request carries more scatter entries than max_gs.
+ * Requests that precede the failing one stay posted and the doorbell
+ * record is updated to cover them.
+ */
+int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+                         struct ib_recv_wr **bad_wr)
+{
+       struct mlx4_ib_srq *srq = to_msrq(ibsrq);
+       struct mlx4_wqe_srq_next_seg *next;
+       struct mlx4_wqe_data_seg *scat;
+       unsigned long flags;
+       int err = 0;
+       int nreq;
+       int i;
+
+       spin_lock_irqsave(&srq->lock, flags);
+
+       for (nreq = 0; wr; ++nreq, wr = wr->next) {
+               if (unlikely(wr->num_sge > srq->msrq.max_gs)) {
+                       err = -EINVAL;
+                       *bad_wr = wr;
+                       break;
+               }
+
+               /*
+                * The tail WQE is the end of the free list and is never
+                * posted; when head has caught up with it the SRQ is
+                * full, and following its next_wqe_index would walk
+                * into entries that are still owned by the hardware.
+                */
+               if (unlikely(srq->head == srq->tail)) {
+                       err = -ENOMEM;
+                       *bad_wr = wr;
+                       break;
+               }
+
+               srq->wrid[srq->head] = wr->wr_id;
+
+               next      = get_wqe(srq, srq->head);
+               srq->head = be16_to_cpu(next->next_wqe_index);
+               scat      = (struct mlx4_wqe_data_seg *) (next + 1);
+
+               for (i = 0; i < wr->num_sge; ++i) {
+                       scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length);
+                       scat[i].lkey       = cpu_to_be32(wr->sg_list[i].lkey);
+                       scat[i].addr       = cpu_to_be64(wr->sg_list[i].addr);
+               }
+
+               /* Terminate a short scatter list with an invalid-lkey entry */
+               if (i < srq->msrq.max_gs) {
+                       scat[i].byte_count = 0;
+                       scat[i].lkey       = cpu_to_be32(MLX4_INVALID_LKEY);
+                       scat[i].addr       = 0;
+               }
+       }
+
+       if (likely(nreq)) {
+               srq->wqe_ctr += nreq;
+
+               /*
+                * Make sure that descriptors are written before
+                * doorbell record.
+                */
+               wmb();
+
+               *srq->db.db = cpu_to_be32(srq->wqe_ctr);
+       }
+
+       spin_unlock_irqrestore(&srq->lock, flags);
+
+       return err;
+}
diff --git a/drivers/infiniband/hw/mlx4/user.h b/drivers/infiniband/hw/mlx4/user.h
new file mode 100644 (file)
index 0000000..5b8eddc
--- /dev/null
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_IB_USER_H
+#define MLX4_IB_USER_H
+
+#include <linux/types.h>
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define MLX4_IB_UVERBS_ABI_VERSION     1
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
+
+/*
+ * Response layout for user context allocation (8 bytes: one __u32
+ * followed by two __u16, so no implicit padding).
+ * NOTE(review): field meanings are inferred from their names only
+ * (QP table size, BlueFlame register size / registers per page) --
+ * confirm against the code that fills this in.
+ */
+struct mlx4_ib_alloc_ucontext_resp {
+       __u32   qp_tab_size;
+       __u16   bf_reg_size;
+       __u16   bf_regs_per_page;
+};
+
+/*
+ * Response layout for PD allocation.  The reserved word keeps the
+ * struct at 8 bytes so it packs identically on 32-bit and 64-bit.
+ * NOTE(review): pdn is presumably the protection domain number --
+ * confirm against the code that fills this in.
+ */
+struct mlx4_ib_alloc_pd_resp {
+       __u32   pdn;
+       __u32   reserved;
+};
+
+/*
+ * Command layout for CQ creation; addresses are carried as __u64 rather
+ * than pointers so the layout is the same for 32-bit userspace.
+ * NOTE(review): presumably buf_addr is the user CQ buffer and db_addr
+ * the user doorbell record, by analogy with mlx4_ib_create_srq --
+ * confirm against the CQ creation code.
+ */
+struct mlx4_ib_create_cq {
+       __u64   buf_addr;
+       __u64   db_addr;
+};
+
+/*
+ * Response layout for CQ creation.  The reserved word keeps the struct
+ * at 8 bytes so it packs identically on 32-bit and 64-bit.
+ * NOTE(review): cqn is presumably the CQ number -- confirm against the
+ * CQ creation code.
+ */
+struct mlx4_ib_create_cq_resp {
+       __u32   cqn;
+       __u32   reserved;
+};
+
+/*
+ * Command layout for CQ resize.
+ * NOTE(review): buf_addr is presumably the user address of the new CQ
+ * buffer -- confirm against the CQ resize code.
+ */
+struct mlx4_ib_resize_cq {
+       __u64   buf_addr;
+};
+
+/*
+ * Command layout for SRQ creation from userspace.  The kernel passes
+ * buf_addr to ib_umem_get() as the start of the SRQ WQE buffer and
+ * db_addr to mlx4_ib_db_map_user() to map the SRQ doorbell record.
+ */
+struct mlx4_ib_create_srq {
+       __u64   buf_addr;
+       __u64   db_addr;
+};
+
+/*
+ * Response layout for SRQ creation.  srqn is the SRQ number; the
+ * reserved word keeps the struct at 8 bytes so it packs identically on
+ * 32-bit and 64-bit.  The SRQ creation path copies back only the first
+ * __u32 (srqn), so reserved is not written by the kernel there.
+ */
+struct mlx4_ib_create_srq_resp {
+       __u32   srqn;
+       __u32   reserved;
+};
+
+/*
+ * Command layout for QP creation; addresses are carried as __u64 rather
+ * than pointers so the layout is the same for 32-bit userspace.
+ * NOTE(review): presumably buf_addr is the user WQE buffer and db_addr
+ * the user doorbell record, by analogy with mlx4_ib_create_srq --
+ * confirm against the QP creation code.
+ */
+struct mlx4_ib_create_qp {
+       __u64   buf_addr;
+       __u64   db_addr;
+};
+
+#endif /* MLX4_IB_USER_H */
index b86ccd2..6db12d0 100644 (file)
@@ -2493,6 +2493,20 @@ config PASEMI_MAC
          This driver supports the on-chip 1/10Gbit Ethernet controller on
          PA Semi's PWRficient line of chips.
 
+config MLX4_CORE
+       tristate
+       depends on PCI
+       default n
+
+config MLX4_DEBUG
+       bool "Verbose debugging output" if (MLX4_CORE && EMBEDDED)
+       default y
+       ---help---
+         This option causes debugging code to be compiled into the
+         mlx4_core driver.  The output can be turned on via the
+         debug_level module parameter (which can also be set after
+         the driver is loaded through sysfs).
+
 endmenu
 
 source "drivers/net/tokenring/Kconfig"
index 59c0459..7faeeea 100644 (file)
@@ -197,6 +197,7 @@ obj-$(CONFIG_SMC911X) += smc911x.o
 obj-$(CONFIG_DM9000) += dm9000.o
 obj-$(CONFIG_FEC_8XX) += fec_8xx/
 obj-$(CONFIG_PASEMI_MAC) += pasemi_mac.o
+obj-$(CONFIG_MLX4_CORE) += mlx4/
 
 obj-$(CONFIG_MACB) += macb.o
 
diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile
new file mode 100644 (file)
index 0000000..0952a65
--- /dev/null
@@ -0,0 +1,4 @@
+obj-$(CONFIG_MLX4_CORE)                += mlx4_core.o
+
+mlx4_core-y := alloc.o catas.o cmd.o cq.o eq.o fw.o icm.o intf.o main.o mcg.o \
+               mr.o pd.o profile.o qp.o reset.o srq.o
diff --git a/drivers/net/mlx4/alloc.c b/drivers/net/mlx4/alloc.c
new file mode 100644 (file)
index 0000000..9ffdb9d
--- /dev/null
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/bitmap.h>
+
+#include "mlx4.h"
+
+/*
+ * Allocate one object from the bitmap.
+ *
+ * The search starts at the bit after the previous allocation and wraps
+ * to the start of the table (advancing the "top" bits) when nothing is
+ * free above it.  The returned handle is the bit index OR-ed with the
+ * current top bits.
+ *
+ * Returns the handle, or (u32) -1 if every bit is in use.
+ */
+u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap)
+{
+       u32 obj;
+
+       spin_lock(&bitmap->lock);
+
+       obj = find_next_zero_bit(bitmap->table, bitmap->max, bitmap->last);
+       if (obj >= bitmap->max) {
+               bitmap->top = (bitmap->top + bitmap->max) & bitmap->mask;
+               obj = find_first_zero_bit(bitmap->table, bitmap->max);
+       }
+
+       if (obj < bitmap->max) {
+               set_bit(obj, bitmap->table);
+               /*
+                * Record the search hint as a plain bit index before the
+                * top bits are mixed in; otherwise the next search would
+                * start from an offset beyond the table whenever top is
+                * non-zero.
+                */
+               bitmap->last = (obj + 1) & (bitmap->max - 1);
+               obj |= bitmap->top;
+       } else
+               obj = -1;
+
+       spin_unlock(&bitmap->lock);
+
+       return obj;
+}
+
+/*
+ * Release a handle obtained from mlx4_bitmap_alloc().  The top bits are
+ * stripped to recover the bit index, the search hint is pulled back to
+ * the freed bit if that is lower, and the top bits are advanced.
+ */
+void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj)
+{
+       obj = obj & (bitmap->max - 1);
+
+       spin_lock(&bitmap->lock);
+
+       clear_bit(obj, bitmap->table);
+       if (obj < bitmap->last)
+               bitmap->last = obj;
+       bitmap->top = (bitmap->top + bitmap->max) & bitmap->mask;
+
+       spin_unlock(&bitmap->lock);
+}
+
+/*
+ * Initialize a bitmap allocator with num bits (must be a power of two),
+ * the given mask for the rotating top bits, and the first "reserved"
+ * bits pre-marked as in use.
+ *
+ * Returns 0 on success, -EINVAL if num is not a power of two, or
+ * -ENOMEM if the bit table cannot be allocated.
+ */
+int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask, u32 reserved)
+{
+       int bit;
+
+       /* num must be a power of 2 */
+       if (roundup_pow_of_two(num) != num)
+               return -EINVAL;
+
+       spin_lock_init(&bitmap->lock);
+       bitmap->mask = mask;
+       bitmap->max  = num;
+       bitmap->top  = 0;
+       bitmap->last = 0;
+
+       bitmap->table = kzalloc(BITS_TO_LONGS(num) * sizeof (long), GFP_KERNEL);
+       if (!bitmap->table)
+               return -ENOMEM;
+
+       /* Mark the reserved low-numbered objects as allocated */
+       bit = 0;
+       while (bit < reserved) {
+               set_bit(bit, bitmap->table);
+               ++bit;
+       }
+
+       return 0;
+}
+
+void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap)
+{
+       kfree(bitmap->table);
+}
+
+/*
+ * Handling for queue buffers -- we allocate a bunch of memory and
+ * register it in a memory region at HCA virtual address 0.  If the
+ * requested size is > max_direct, we split the allocation into
+ * multiple pages, so we don't require too much contiguous memory.
+ */
+
+/*
+ * Allocate a zeroed DMA-coherent queue buffer of "size" bytes.  Sizes up
+ * to max_direct get one contiguous allocation; larger sizes are built
+ * from individually allocated PAGE_SIZE chunks tracked in a page list.
+ *
+ * Returns 0 on success or -ENOMEM on allocation failure.
+ */
+int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
+                  struct mlx4_buf *buf)
+{
+       dma_addr_t dma;
+       int pg;
+
+       if (size <= max_direct) {
+               buf->nbufs        = 1;
+               buf->npages       = 1;
+               buf->page_shift   = get_order(size) + PAGE_SHIFT;
+               buf->u.direct.buf = dma_alloc_coherent(&dev->pdev->dev,
+                                                      size, &dma, GFP_KERNEL);
+               if (!buf->u.direct.buf)
+                       return -ENOMEM;
+
+               buf->u.direct.map = dma;
+
+               /*
+                * Shrink the reported page size (doubling the page count)
+                * until the DMA address is aligned to it.
+                */
+               for (; dma & ((1 << buf->page_shift) - 1); buf->npages *= 2)
+                       --buf->page_shift;
+
+               memset(buf->u.direct.buf, 0, size);
+
+               return 0;
+       }
+
+       buf->nbufs       = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+       buf->npages      = buf->nbufs;
+       buf->page_shift  = PAGE_SHIFT;
+       buf->u.page_list = kzalloc(buf->nbufs * sizeof *buf->u.page_list,
+                                  GFP_KERNEL);
+       if (!buf->u.page_list)
+               return -ENOMEM;
+
+       for (pg = 0; pg < buf->nbufs; ++pg) {
+               buf->u.page_list[pg].buf =
+                       dma_alloc_coherent(&dev->pdev->dev, PAGE_SIZE,
+                                          &dma, GFP_KERNEL);
+               if (!buf->u.page_list[pg].buf) {
+                       /* Undo the partial allocation */
+                       mlx4_buf_free(dev, size, buf);
+                       return -ENOMEM;
+               }
+
+               buf->u.page_list[pg].map = dma;
+
+               memset(buf->u.page_list[pg].buf, 0, PAGE_SIZE);
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_buf_alloc);
+
+/*
+ * Free a buffer set up by mlx4_buf_alloc().  "size" must be the size
+ * that was passed at allocation time; it is used for the single direct
+ * allocation.  This is also called to unwind a partially built page
+ * list, so chunks that were never allocated (still NULL from kzalloc)
+ * are skipped.
+ */
+void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf)
+{
+       int i;
+
+       if (buf->nbufs == 1)
+               dma_free_coherent(&dev->pdev->dev, size, buf->u.direct.buf,
+                                 buf->u.direct.map);
+       else {
+               for (i = 0; i < buf->nbufs; ++i)
+                       if (buf->u.page_list[i].buf)
+                               dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
+                                                 buf->u.page_list[i].buf,
+                                                 buf->u.page_list[i].map);
+               kfree(buf->u.page_list);
+       }
+}
+EXPORT_SYMBOL_GPL(mlx4_buf_free);
diff --git a/drivers/net/mlx4/catas.c b/drivers/net/mlx4/catas.c
new file mode 100644 (file)
index 0000000..1bb088a
--- /dev/null
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "mlx4.h"
+
+/*
+ * Report a catastrophic device error: dump the error buffer to the log
+ * (when it is mapped) and dispatch MLX4_EVENT_TYPE_LOCAL_CATAS_ERROR.
+ */
+void mlx4_handle_catas_err(struct mlx4_dev *dev)
+{
+       struct mlx4_priv *priv = mlx4_priv(dev);
+
+       int i;
+
+       mlx4_err(dev, "Catastrophic error detected:\n");
+
+       /*
+        * mlx4_map_catas_buf() only warns when ioremap() fails, so the
+        * mapping may be NULL here; skip the dump rather than read
+        * through it, but still deliver the event.
+        */
+       if (priv->catas_err.map)
+               for (i = 0; i < priv->fw.catas_size; ++i)
+                       mlx4_err(dev, "  buf[%02x]: %08x\n",
+                                i, swab32(readl(priv->catas_err.map + i)));
+
+       mlx4_dispatch_event(dev, MLX4_EVENT_TYPE_LOCAL_CATAS_ERROR, 0, 0);
+}
+
+/*
+ * Map the catastrophic error buffer, located at fw.catas_offset within
+ * PCI BAR fw.catas_bar and fw.catas_size * 4 bytes long.  A failed
+ * mapping is only logged; catas_err.map is left NULL in that case.
+ */
+void mlx4_map_catas_buf(struct mlx4_dev *dev)
+{
+       struct mlx4_priv *priv = mlx4_priv(dev);
+       unsigned long phys;
+
+       phys = pci_resource_start(dev->pdev, priv->fw.catas_bar) +
+               priv->fw.catas_offset;
+
+       priv->catas_err.map = ioremap(phys, priv->fw.catas_size * 4);
+       if (priv->catas_err.map)
+               return;
+
+       mlx4_warn(dev, "Failed to map catastrophic error buffer at 0x%lx\n",
+                 phys);
+}
+
+/*
+ * Undo mlx4_map_catas_buf(); a mapping that was never established is
+ * silently ignored.
+ */
+void mlx4_unmap_catas_buf(struct mlx4_dev *dev)
+{
+       struct mlx4_priv *priv = mlx4_priv(dev);
+
+       if (!priv->catas_err.map)
+               return;
+
+       iounmap(priv->catas_err.map);
+}
diff --git a/drivers/net/mlx4/cmd.c b/drivers/net/mlx4/cmd.c
new file mode 100644 (file)
index 0000000..c1f81a9
--- /dev/null
@@ -0,0 +1,429 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/errno.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include <asm/io.h>
+
+#include "mlx4.h"
+
+#define CMD_POLL_TOKEN 0xffff
+
+/*
+ * Command status codes.  mlx4_status_to_errno() translates these into
+ * negative errno values; in polling mode the code is taken from the top
+ * byte of the HCR status word, in event mode from the command EQE.
+ */
+enum {
+       /* command completed successfully: */
+       CMD_STAT_OK             = 0x00,
+       /* Internal error (such as a bus error) occurred while processing command: */
+       CMD_STAT_INTERNAL_ERR   = 0x01,
+       /* Operation/command not supported or opcode modifier not supported: */
+       CMD_STAT_BAD_OP         = 0x02,
+       /* Parameter not supported or parameter out of range: */
+       CMD_STAT_BAD_PARAM      = 0x03,
+       /* System not enabled or bad system state: */
+       CMD_STAT_BAD_SYS_STATE  = 0x04,
+       /* Attempt to access reserved or unallocated resource: */
+       CMD_STAT_BAD_RESOURCE   = 0x05,
+       /* Requested resource is currently executing a command, or is otherwise busy: */
+       CMD_STAT_RESOURCE_BUSY  = 0x06,
+       /* Required capability exceeds device limits: */
+       CMD_STAT_EXCEED_LIM     = 0x08,
+       /* Resource is not in the appropriate state or ownership: */
+       CMD_STAT_BAD_RES_STATE  = 0x09,
+       /* Index out of range: */
+       CMD_STAT_BAD_INDEX      = 0x0a,
+       /* FW image corrupted: */
+       CMD_STAT_BAD_NVMEM      = 0x0b,
+       /* Attempt to modify a QP/EE which is not in the presumed state: */
+       CMD_STAT_BAD_QP_STATE   = 0x10,
+       /* Bad segment parameters (Address/Size): */
+       CMD_STAT_BAD_SEG_PARAM  = 0x20,
+       /* Memory Region has Memory Windows bound to: */
+       CMD_STAT_REG_BOUND      = 0x21,
+       /* HCA local attached memory not present: */
+       CMD_STAT_LAM_NOT_PRE    = 0x22,
+       /* Bad management packet (silently discarded): */
+       CMD_STAT_BAD_PKT        = 0x30,
+       /* More outstanding CQEs in CQ than new CQ size: */
+       CMD_STAT_BAD_SIZE       = 0x40
+};
+
+/*
+ * Layout of the host command register (HCR).
+ *
+ * The *_OFFSET values are byte offsets from the start of the mapped
+ * HCR (mlx4_cmd_poll() adds them to a void __iomem pointer).  The
+ * shift and bit values are positions inside the 32-bit word at
+ * HCR_STATUS_OFFSET, which mlx4_cmd_post() writes last to start a
+ * command and cmd_pending() reads to detect completion.
+ */
+enum {
+       HCR_IN_PARAM_OFFSET     = 0x00,
+       HCR_IN_MODIFIER_OFFSET  = 0x08,
+       HCR_OUT_PARAM_OFFSET    = 0x0c,
+       HCR_TOKEN_OFFSET        = 0x14,
+       HCR_STATUS_OFFSET       = 0x18,
+
+       /* Shift applied to the opcode modifier in the command word: */
+       HCR_OPMOD_SHIFT         = 12,
+       /* Toggle bit, written from cmd->toggle on every post: */
+       HCR_T_BIT               = 21,
+       /* Set when completion should be reported through an event: */
+       HCR_E_BIT               = 22,
+       /* Set by the driver to start a command; tested by cmd_pending(): */
+       HCR_GO_BIT              = 23
+};
+
+/*
+ * NOTE(review): GO_BIT_TIMEOUT is not referenced anywhere in the visible
+ * part of this file (mlx4_cmd_post() waits HZ * 10 jiffies instead).
+ * Presumably a timeout for the GO bit in milliseconds -- confirm the
+ * unit and whether the constant is still needed.
+ */
+enum {
+       GO_BIT_TIMEOUT          = 10000
+};
+
+/*
+ * Per-command bookkeeping used in event mode.  Contexts live in the
+ * priv->cmd.context[] array and are chained into a free list through
+ * their array indices.
+ */
+struct mlx4_cmd_context {
+       /* Completed by mlx4_cmd_event(), waited on by mlx4_cmd_wait(): */
+       struct completion       done;
+       /* errno produced by mlx4_status_to_errno() for the finished command: */
+       int                     result;
+       /* Index of the next free context; -1 terminates the free list: */
+       int                     next;
+       /* Output parameter delivered with the completion event: */
+       u64                     out_param;
+       /*
+        * Token posted with the command and compared against the token in
+        * the completion event; advanced by token_mask + 1 in
+        * mlx4_cmd_event() after each matched completion.
+        */
+       u16                     token;
+};
+
+/*
+ * Translate a command status code into a negative errno.
+ *
+ * CMD_STAT_OK yields 0.  Any status that lies outside the table, or
+ * that has no explicit entry in it, is reported as -EIO.
+ */
+static int mlx4_status_to_errno(u8 status)
+{
+       static const int trans_table[] = {
+               [CMD_STAT_INTERNAL_ERR]   = -EIO,
+               [CMD_STAT_BAD_OP]         = -EPERM,
+               [CMD_STAT_BAD_PARAM]      = -EINVAL,
+               [CMD_STAT_BAD_SYS_STATE]  = -ENXIO,
+               [CMD_STAT_BAD_RESOURCE]   = -EBADF,
+               [CMD_STAT_RESOURCE_BUSY]  = -EBUSY,
+               [CMD_STAT_EXCEED_LIM]     = -ENOMEM,
+               [CMD_STAT_BAD_RES_STATE]  = -EBADF,
+               [CMD_STAT_BAD_INDEX]      = -EBADF,
+               [CMD_STAT_BAD_NVMEM]      = -EFAULT,
+               [CMD_STAT_BAD_QP_STATE]   = -EINVAL,
+               [CMD_STAT_BAD_SEG_PARAM]  = -EFAULT,
+               [CMD_STAT_REG_BOUND]      = -EBUSY,
+               [CMD_STAT_LAM_NOT_PRE]    = -EAGAIN,
+               [CMD_STAT_BAD_PKT]        = -EINVAL,
+               [CMD_STAT_BAD_SIZE]       = -ENOMEM,
+       };
+       int err;
+
+       if (status == CMD_STAT_OK)
+               return 0;
+
+       if (status >= ARRAY_SIZE(trans_table))
+               return -EIO;
+
+       /* Gaps in the table are zero-filled; treat them as unknown errors. */
+       err = trans_table[status];
+       return err ? err : -EIO;
+}
+
+/*
+ * Return nonzero while the HCR is still busy: either the GO bit is set
+ * in the status word, or the toggle bit read back equals the value the
+ * driver will use for its next post.
+ */
+static int cmd_pending(struct mlx4_dev *dev)
+{
+       struct mlx4_priv *priv = mlx4_priv(dev);
+       u32 status = readl(priv->cmd.hcr + HCR_STATUS_OFFSET);
+
+       if (status & swab32(1 << HCR_GO_BIT))
+               return 1;
+
+       return priv->cmd.toggle == !!(status & swab32(1 << HCR_T_BIT));
+}
+
+/*
+ * Write one command into the HCR and set the GO bit.
+ *
+ * Serialized by cmd->hcr_mutex.  If a previous command is still pending
+ * the function waits for it: up to HZ * 10 jiffies when @event is set,
+ * otherwise it gives up as soon as the first check finds the HCR busy.
+ *
+ * Returns 0 once the command has been written, or -EAGAIN if the HCR
+ * did not become free in time.  Completion of the command itself is not
+ * awaited here.
+ */
+static int mlx4_cmd_post(struct mlx4_dev *dev, u64 in_param, u64 out_param,
+                        u32 in_modifier, u8 op_modifier, u16 op, u16 token,
+                        int event)
+{
+       struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
+       u32 __iomem *hcr = cmd->hcr;
+       int ret = -EAGAIN;
+       unsigned long end;
+
+       mutex_lock(&cmd->hcr_mutex);
+
+       end = jiffies;
+       if (event)
+               end += HZ * 10;
+
+       while (cmd_pending(dev)) {
+               if (time_after_eq(jiffies, end))
+                       goto out;
+               cond_resched();
+       }
+
+       /*
+        * We use writel (instead of something like memcpy_toio)
+        * because writes of less than 32 bits to the HCR don't work
+        * (and some architectures such as ia64 implement memcpy_toio
+        * in terms of writeb).
+        */
+       __raw_writel((__force u32) cpu_to_be32(in_param >> 32),           hcr + 0);
+       __raw_writel((__force u32) cpu_to_be32(in_param & 0xfffffffful),  hcr + 1);
+       __raw_writel((__force u32) cpu_to_be32(in_modifier),              hcr + 2);
+       __raw_writel((__force u32) cpu_to_be32(out_param >> 32),          hcr + 3);
+       __raw_writel((__force u32) cpu_to_be32(out_param & 0xfffffffful), hcr + 4);
+       /*
+        * NOTE(review): token is a u16 promoted to int, so "token << 16"
+        * shifts into the sign bit for tokens >= 0x8000 (CMD_POLL_TOKEN
+        * included); consider an explicit (u32) cast.
+        */
+       __raw_writel((__force u32) cpu_to_be32(token << 16),              hcr + 5);
+
+       /* __raw_writel may not order writes. */
+       wmb();
+
+       /* The word carrying the GO bit is written last, after the barrier. */
+       __raw_writel((__force u32) cpu_to_be32((1 << HCR_GO_BIT)                |
+                                              (cmd->toggle << HCR_T_BIT)       |
+                                              (event ? (1 << HCR_E_BIT) : 0)   |
+                                              (op_modifier << HCR_OPMOD_SHIFT) |
+                                              op),                       hcr + 6);
+       /* Flip the toggle so the next post uses the opposite value. */
+       cmd->toggle = cmd->toggle ^ 1;
+
+       ret = 0;
+
+out:
+       mutex_unlock(&cmd->hcr_mutex);
+       return ret;
+}
+
+/*
+ * Execute a command in polling mode: post it with CMD_POLL_TOKEN and
+ * event reporting disabled, then poll cmd_pending() until it clears or
+ * @timeout milliseconds have elapsed.
+ *
+ * Polled commands are serialized by priv->cmd.poll_sem.  When
+ * @out_is_imm is set the 64-bit immediate result is read back from the
+ * HCR into *@out_param.
+ *
+ * Returns 0 on success, the error from mlx4_cmd_post(), -ETIMEDOUT if
+ * the command is still pending after the timeout, or the errno derived
+ * from the status byte of the HCR.
+ */
+static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
+                        int out_is_imm, u32 in_modifier, u8 op_modifier,
+                        u16 op, unsigned long timeout)
+{
+       struct mlx4_priv *priv = mlx4_priv(dev);
+       void __iomem *hcr = priv->cmd.hcr;
+       int err = 0;
+       unsigned long end;
+
+       down(&priv->cmd.poll_sem);
+
+       err = mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0,
+                           in_modifier, op_modifier, op, CMD_POLL_TOKEN, 0);
+       if (err)
+               goto out;
+
+       end = msecs_to_jiffies(timeout) + jiffies;
+       while (cmd_pending(dev) && time_before(jiffies, end))
+               cond_resched();
+
+       /* Re-check: the loop may have ended because of the deadline. */
+       if (cmd_pending(dev)) {
+               err = -ETIMEDOUT;
+               goto out;
+       }
+
+       /*
+        * The immediate result is stored before the status is examined,
+        * so *out_param is overwritten even when the command failed.
+        */
+       if (out_is_imm)
+               *out_param =
+                       (u64) be32_to_cpu((__force __be32)
+                                         __raw_readl(hcr + HCR_OUT_PARAM_OFFSET)) << 32 |
+                       (u64) be32_to_cpu((__force __be32)
+                                         __raw_readl(hcr + HCR_OUT_PARAM_OFFSET + 4));
+
+       /* The status code is the most significant byte of the status word. */
+       err = mlx4_status_to_errno(be32_to_cpu((__force __be32)
+                                              __raw_readl(hcr + HCR_STATUS_OFFSET)) >> 24);
+
+out:
+       up(&priv->cmd.poll_sem);
+       return err;
+}
+
+/*
+ * Deliver a command completion event to the waiter that owns @token.
+ *
+ * The low bits of the token (masked by cmd.token_mask) select the
+ * context.  A token that does not match the context's current token is
+ * ignored.  Otherwise the result and output parameter are recorded, the
+ * context's token is advanced by token_mask + 1, and the waiter is
+ * woken.
+ */
+void mlx4_cmd_event(struct mlx4_dev *dev, u16 token, u8 status, u64 out_param)
+{
+       struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
+       struct mlx4_cmd_context *ctx = &cmd->context[token & cmd->token_mask];
+
+       /* previously timed out command completing at long last */
+       if (ctx->token != token)
+               return;
+
+       ctx->result    = mlx4_status_to_errno(status);
+       ctx->out_param = out_param;
+       ctx->token    += cmd->token_mask + 1;
+
+       complete(&ctx->done);
+}
+
+/*
+ * Execute a command in event mode: take a free context, post the
+ * command with that context's token and sleep until mlx4_cmd_event()
+ * completes it or @timeout milliseconds pass.
+ *
+ * cmd->event_sem limits the number of commands in flight to the number
+ * of contexts, so the free list is never empty once the semaphore has
+ * been taken.  When @out_is_imm is set and the command succeeded, the
+ * immediate result is stored in *@out_param.
+ *
+ * Returns 0 on success, the error from mlx4_cmd_post() if the command
+ * could not be posted, -EBUSY if no completion arrived in time, or the
+ * errno derived from the firmware status.
+ */
+static int mlx4_cmd_wait(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
+                        int out_is_imm, u32 in_modifier, u8 op_modifier,
+                        u16 op, unsigned long timeout)
+{
+       struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
+       struct mlx4_cmd_context *context;
+       int err = 0;
+
+       down(&cmd->event_sem);
+
+       spin_lock(&cmd->context_lock);
+       BUG_ON(cmd->free_head < 0);
+       context = &cmd->context[cmd->free_head];
+       cmd->free_head = context->next;
+       spin_unlock(&cmd->context_lock);
+
+       init_completion(&context->done);
+
+       /*
+        * If the command never reached the HCR no completion event will
+        * ever arrive, so fail right away instead of sleeping for the
+        * whole timeout.
+        */
+       err = mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0,
+                           in_modifier, op_modifier, op, context->token, 1);
+       if (err)
+               goto out;
+
+       if (!wait_for_completion_timeout(&context->done, msecs_to_jiffies(timeout))) {
+               /*
+                * Retire the token of the timed out command.  Without
+                * this a late completion would still match the recycled
+                * context in mlx4_cmd_event() and wake up whichever
+                * command is using it by then.
+                */
+               context->token += cmd->token_mask + 1;
+               err = -EBUSY;
+               goto out;
+       }
+
+       err = context->result;
+       if (err)
+               goto out;
+
+       if (out_is_imm)
+               *out_param = context->out_param;
+
+out:
+       /* Put the context back at the head of the free list. */
+       spin_lock(&cmd->context_lock);
+       context->next = cmd->free_head;
+       cmd->free_head = context - cmd->context;
+       spin_unlock(&cmd->context_lock);
+
+       up(&cmd->event_sem);
+       return err;
+}
+
+/*
+ * Common entry point for firmware commands.  Dispatches to the polling
+ * implementation unless event mode has been switched on with
+ * mlx4_cmd_use_events().
+ */
+int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
+              int out_is_imm, u32 in_modifier, u8 op_modifier,
+              u16 op, unsigned long timeout)
+{
+       if (!mlx4_priv(dev)->cmd.use_events)
+               return mlx4_cmd_poll(dev, in_param, out_param, out_is_imm,
+                                    in_modifier, op_modifier, op, timeout);
+
+       return mlx4_cmd_wait(dev, in_param, out_param, out_is_imm,
+                            in_modifier, op_modifier, op, timeout);
+}
+EXPORT_SYMBOL_GPL(__mlx4_cmd);
+
+/*
+ * Set up the command interface in polling mode: initialize the HCR
+ * mutex and the polling semaphore, map the HCR from BAR 0 and create
+ * the PCI pool that mailboxes are allocated from.
+ *
+ * Returns 0 on success or -ENOMEM if the mapping or the pool could not
+ * be created; nothing stays allocated on failure.
+ */
+int mlx4_cmd_init(struct mlx4_dev *dev)
+{
+       struct mlx4_priv *priv = mlx4_priv(dev);
+
+       mutex_init(&priv->cmd.hcr_mutex);
+       sema_init(&priv->cmd.poll_sem, 1);
+       priv->cmd.use_events = 0;
+       priv->cmd.toggle     = 1;
+
+       priv->cmd.hcr = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_HCR_BASE,
+                               MLX4_HCR_SIZE);
+       if (!priv->cmd.hcr) {
+               /* Terminate the message so it is not merged with the next log line. */
+               mlx4_err(dev, "Couldn't map command register.\n");
+               return -ENOMEM;
+       }
+
+       priv->cmd.pool = pci_pool_create("mlx4_cmd", dev->pdev,
+                                        MLX4_MAILBOX_SIZE,
+                                        MLX4_MAILBOX_SIZE, 0);
+       if (!priv->cmd.pool) {
+               iounmap(priv->cmd.hcr);
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+/* Tear down what mlx4_cmd_init() created: the mailbox pool and the HCR mapping. */
+void mlx4_cmd_cleanup(struct mlx4_dev *dev)
+{
+       struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
+
+       pci_pool_destroy(cmd->pool);
+       iounmap(cmd->hcr);
+}
+
+/*
+ * Switch from polling to event-driven command completion.  This may
+ * only be called once the event queue for command events has been
+ * initialized.
+ *
+ * Allocates max_cmds contexts, links them into a free list, derives
+ * token_mask (the smallest power of two >= max_cmds, minus one) and
+ * finally takes poll_sem.
+ *
+ * Returns 0 on success or -ENOMEM if the contexts cannot be allocated.
+ */
+int mlx4_cmd_use_events(struct mlx4_dev *dev)
+{
+       struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
+       int i;
+
+       cmd->context = kmalloc(cmd->max_cmds *
+                              sizeof (struct mlx4_cmd_context),
+                              GFP_KERNEL);
+       if (!cmd->context)
+               return -ENOMEM;
+
+       /* Chain every context to its successor; the last one ends the list. */
+       for (i = 0; i < cmd->max_cmds; ++i) {
+               cmd->context[i].token = i;
+               cmd->context[i].next  = i + 1;
+       }
+       cmd->context[cmd->max_cmds - 1].next = -1;
+       cmd->free_head = 0;
+
+       sema_init(&cmd->event_sem, cmd->max_cmds);
+       spin_lock_init(&cmd->context_lock);
+
+       /* Round up to a power of two, then turn it into a mask. */
+       cmd->token_mask = 1;
+       while (cmd->token_mask < cmd->max_cmds)
+               cmd->token_mask <<= 1;
+       --cmd->token_mask;
+
+       cmd->use_events = 1;
+
+       down(&cmd->poll_sem);
+
+       return 0;
+}
+
+/*
+ * Return to polling mode (used when shutting the device down).
+ *
+ * After clearing use_events, event_sem is taken max_cmds times, which
+ * blocks until every event-mode command has released its slot.  Only
+ * then are the contexts freed and poll_sem released.
+ */
+void mlx4_cmd_use_polling(struct mlx4_dev *dev)
+{
+       struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
+       int slot;
+
+       cmd->use_events = 0;
+
+       for (slot = 0; slot < cmd->max_cmds; ++slot)
+               down(&cmd->event_sem);
+
+       kfree(cmd->context);
+
+       up(&cmd->poll_sem);
+}
+
+/*
+ * Allocate a command mailbox: a descriptor from the kernel heap plus a
+ * buffer from the device's mailbox PCI pool.
+ *
+ * Returns the mailbox, or ERR_PTR(-ENOMEM) if either allocation fails.
+ * Release it with mlx4_free_cmd_mailbox().
+ */
+struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev)
+{
+       struct mlx4_cmd_mailbox *mbox = kmalloc(sizeof *mbox, GFP_KERNEL);
+
+       if (!mbox)
+               goto err;
+
+       mbox->buf = pci_pool_alloc(mlx4_priv(dev)->cmd.pool, GFP_KERNEL,
+                                  &mbox->dma);
+       if (!mbox->buf)
+               goto err_free;
+
+       return mbox;
+
+err_free:
+       kfree(mbox);
+err:
+       return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL_GPL(mlx4_alloc_cmd_mailbox);
+
+/*
+ * Return a mailbox's buffer to the PCI pool and free its descriptor.
+ * A NULL @mailbox is accepted and ignored.
+ */
+void mlx4_free_cmd_mailbox(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox)
+{
+       if (mailbox) {
+               pci_pool_free(mlx4_priv(dev)->cmd.pool, mailbox->buf,
+                             mailbox->dma);
+               kfree(mailbox);
+       }
+}
+EXPORT_SYMBOL_GPL(mlx4_free_cmd_mailbox);
diff --git a/drivers/net/mlx4/cq.c b/drivers/net/mlx4/cq.c
new file mode 100644 (file)
index 0000000..437d78a
--- /dev/null
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/hardirq.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+#include "icm.h"
+
+/*
+ * CQ context as filled into the SW2HW_CQ mailbox by mlx4_cq_alloc().
+ * Multi-byte fields are stored big-endian.
+ */
+struct mlx4_cq_context {
+       __be32                  flags;
+       u16                     reserved1[3];
+       __be16                  page_offset;
+       /* (ilog2(number of entries) << 24) | UAR index: */
+       __be32                  logsize_usrpage;
+       u8                      reserved2;
+       u8                      cq_period;
+       u8                      reserved3;
+       u8                      cq_max_count;
+       u8                      reserved4[3];
+       /* Number of the EQ that receives this CQ's completion events: */
+       u8                      comp_eqn;
+       /* MTT page shift minus MLX4_ICM_PAGE_SHIFT: */
+       u8                      log_page_size;
+       u8                      reserved5[2];
+       /* Bits 39:32 and 31:0 of the MTT address: */
+       u8                      mtt_base_addr_h;
+       __be32                  mtt_base_addr_l;
+       __be32                  last_notified_index;
+       __be32                  solicit_producer_index;
+       __be32                  consumer_index;
+       __be32                  producer_index;
+       /*
+        * Two full reserved 32-bit words, so that db_rec_addr lands at a
+        * naturally aligned offset (56) on every ABI.  With only a two
+        * byte pad here the offset depended on compiler-inserted padding,
+        * which differs between ABIs that align 64-bit members to eight
+        * bytes and those that align them to four.
+        */
+       u32                     reserved6[2];
+       /* Address of the doorbell record: */
+       __be64                  db_rec_addr;
+};
+
+#define MLX4_CQ_STATUS_OK              ( 0 << 28)
+#define MLX4_CQ_STATUS_OVERFLOW                ( 9 << 28)
+#define MLX4_CQ_STATUS_WRITE_FAIL      (10 << 28)
+#define MLX4_CQ_FLAG_CC                        ( 1 << 18)
+#define MLX4_CQ_FLAG_OI                        ( 1 << 17)
+#define MLX4_CQ_STATE_ARMED            ( 9 <<  8)
+#define MLX4_CQ_STATE_ARMED_SOL                ( 6 <<  8)
+#define MLX4_EQ_STATE_FIRED            (10 <<  8)
+
+/*
+ * Handle a completion event for CQ @cqn: look the CQ up, bump its arm
+ * sequence number and invoke its completion callback.  An event for a
+ * CQ that is not in the table is logged and dropped.
+ */
+void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn)
+{
+       struct mlx4_cq_table *cq_table = &mlx4_priv(dev)->cq_table;
+       struct mlx4_cq *cq = radix_tree_lookup(&cq_table->tree,
+                                              cqn & (dev->caps.num_cqs - 1));
+
+       if (cq) {
+               ++cq->arm_sn;
+               cq->comp(cq);
+               return;
+       }
+
+       mlx4_warn(dev, "Completion event for bogus CQ %08x\n", cqn);
+}
+
+/*
+ * Deliver an asynchronous event to CQ @cqn.
+ *
+ * The CQ is looked up and pinned with an extra reference while the
+ * table lock is held, so mlx4_cq_free() cannot finish while the event
+ * callback runs.  Dropping the last reference completes cq->free.
+ */
+void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int event_type)
+{
+       struct mlx4_cq_table *cq_table = &mlx4_priv(dev)->cq_table;
+       struct mlx4_cq *cq;
+
+       spin_lock(&cq_table->lock);
+       cq = radix_tree_lookup(&cq_table->tree, cqn & (dev->caps.num_cqs - 1));
+       if (!cq) {
+               spin_unlock(&cq_table->lock);
+               mlx4_warn(dev, "Async event for bogus CQ %08x\n", cqn);
+               return;
+       }
+       atomic_inc(&cq->refcount);
+       spin_unlock(&cq_table->lock);
+
+       cq->event(cq, event_type);
+
+       if (atomic_dec_and_test(&cq->refcount))
+               complete(&cq->free);
+}
+
+/*
+ * Issue SW2HW_CQ for CQ @cq_num, passing the DMA address of @mailbox
+ * (which holds the CQ context) as the input parameter.
+ */
+static int mlx4_SW2HW_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+                        int cq_num)
+{
+       int err;
+
+       err = mlx4_cmd(dev, mailbox->dma, cq_num, 0, MLX4_CMD_SW2HW_CQ,
+                      MLX4_CMD_TIME_CLASS_A);
+
+       return err;
+}
+
+/*
+ * Issue HW2SW_CQ for CQ @cq_num.  With a mailbox, its DMA address is
+ * passed as the output parameter and the opcode modifier is 0; without
+ * one, the output parameter is 0 and the opcode modifier is 1.
+ */
+static int mlx4_HW2SW_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+                        int cq_num)
+{
+       if (!mailbox)
+               return mlx4_cmd_box(dev, 0, 0, cq_num, 1, MLX4_CMD_HW2SW_CQ,
+                                   MLX4_CMD_TIME_CLASS_A);
+
+       return mlx4_cmd_box(dev, 0, mailbox->dma, cq_num, 0, MLX4_CMD_HW2SW_CQ,
+                           MLX4_CMD_TIME_CLASS_A);
+}
+
+/*
+ * Allocate a CQ number and hand a CQ with @nent entries to the hardware.
+ *
+ * @mtt describes the CQ buffer, @uar supplies the UAR index written
+ * into the context and @db_rec is the doorbell record address.  On
+ * success @cq carries the new CQ number and one initial reference.
+ *
+ * Returns 0 on success or a negative errno; on failure every resource
+ * acquired up to that point is released again.
+ */
+int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt,
+                 struct mlx4_uar *uar, u64 db_rec, struct mlx4_cq *cq)
+{
+       struct mlx4_priv *priv = mlx4_priv(dev);
+       struct mlx4_cq_table *cq_table = &priv->cq_table;
+       struct mlx4_cmd_mailbox *mailbox;
+       struct mlx4_cq_context *cq_context;
+       u64 mtt_addr;
+       int err;
+
+       cq->cqn = mlx4_bitmap_alloc(&cq_table->bitmap);
+       if (cq->cqn == -1)
+               return -ENOMEM;
+
+       /* Take a reference on both tables for this CQ number. */
+       err = mlx4_table_get(dev, &cq_table->table, cq->cqn);
+       if (err)
+               goto err_out;
+
+       err = mlx4_table_get(dev, &cq_table->cmpt_table, cq->cqn);
+       if (err)
+               goto err_put;
+
+       /* From here on the event handlers can find the CQ by number. */
+       spin_lock_irq(&cq_table->lock);
+       err = radix_tree_insert(&cq_table->tree, cq->cqn, cq);
+       spin_unlock_irq(&cq_table->lock);
+       if (err)
+               goto err_cmpt_put;
+
+       mailbox = mlx4_alloc_cmd_mailbox(dev);
+       if (IS_ERR(mailbox)) {
+               err = PTR_ERR(mailbox);
+               goto err_radix;
+       }
+
+       /* Build the CQ context in the mailbox buffer. */
+       cq_context = mailbox->buf;
+       memset(cq_context, 0, sizeof *cq_context);
+
+       cq_context->logsize_usrpage = cpu_to_be32((ilog2(nent) << 24) | uar->index);
+       cq_context->comp_eqn        = priv->eq_table.eq[MLX4_EQ_COMP].eqn;
+       cq_context->log_page_size   = mtt->page_shift - MLX4_ICM_PAGE_SHIFT;
+
+       mtt_addr = mlx4_mtt_addr(dev, mtt);
+       cq_context->mtt_base_addr_h = mtt_addr >> 32;
+       cq_context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff);
+       cq_context->db_rec_addr     = cpu_to_be64(db_rec);
+
+       /* The mailbox is no longer needed once the command has returned. */
+       err = mlx4_SW2HW_CQ(dev, mailbox, cq->cqn);
+       mlx4_free_cmd_mailbox(dev, mailbox);
+       if (err)
+               goto err_radix;
+
+       /*
+        * NOTE(review): the CQ is already in the radix tree at this point,
+        * but refcount and the completion are only initialized here;
+        * mlx4_cq_event() touches both -- confirm no event can arrive
+        * before these assignments.
+        */
+       cq->cons_index = 0;
+       cq->arm_sn     = 1;
+       cq->uar        = uar;
+       atomic_set(&cq->refcount, 1);
+       init_completion(&cq->free);
+
+       return 0;
+
+       /* Unwind in reverse order of acquisition. */
+err_radix:
+       spin_lock_irq(&cq_table->lock);
+       radix_tree_delete(&cq_table->tree, cq->cqn);
+       spin_unlock_irq(&cq_table->lock);
+
+err_cmpt_put:
+       mlx4_table_put(dev, &cq_table->cmpt_table, cq->cqn);
+
+err_put:
+       mlx4_table_put(dev, &cq_table->table, cq->cqn);
+
+err_out:
+       mlx4_bitmap_free(&cq_table->bitmap, cq->cqn);
+
+       return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_cq_alloc);
+
+/*
+ * Take a CQ back from the hardware and release everything that
+ * mlx4_cq_alloc() acquired for it.
+ *
+ * A failing HW2SW_CQ command is logged and teardown continues.  After
+ * the CQ has been removed from the lookup tree, the function drops the
+ * initial reference and waits until all event handlers that still hold
+ * one are done.
+ */
+void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq)
+{
+       struct mlx4_priv *priv = mlx4_priv(dev);
+       struct mlx4_cq_table *cq_table = &priv->cq_table;
+       int err;
+
+       err = mlx4_HW2SW_CQ(dev, NULL, cq->cqn);
+       if (err)
+               mlx4_warn(dev, "HW2SW_CQ failed (%d) for CQN %06x\n", err, cq->cqn);
+
+       /* Wait for a completion interrupt handler that may still be running. */
+       synchronize_irq(priv->eq_table.eq[MLX4_EQ_COMP].irq);
+
+       spin_lock_irq(&cq_table->lock);
+       radix_tree_delete(&cq_table->tree, cq->cqn);
+       spin_unlock_irq(&cq_table->lock);
+
+       if (atomic_dec_and_test(&cq->refcount))
+               complete(&cq->free);
+       wait_for_completion(&cq->free);
+
+       /*
+        * mlx4_cq_alloc() takes a reference on both cmpt_table and table;
+        * drop both here, in the same order as its error path, so the
+        * cmpt_table reference is not leaked.
+        */
+       mlx4_table_put(dev, &cq_table->cmpt_table, cq->cqn);
+       mlx4_table_put(dev, &cq_table->table, cq->cqn);
+       mlx4_bitmap_free(&cq_table->bitmap, cq->cqn);
+}
+EXPORT_SYMBOL_GPL(mlx4_cq_free);
+
+/*
+ * Initialize the CQ table: its lock, the CQ lookup tree and the CQ
+ * number bitmap (dev->caps.num_cqs entries, the first
+ * dev->caps.reserved_cqs of them reserved).
+ *
+ * Returns 0 on success or the error from mlx4_bitmap_init().
+ */
+int __devinit mlx4_init_cq_table(struct mlx4_dev *dev)
+{
+       struct mlx4_cq_table *cq_table = &mlx4_priv(dev)->cq_table;
+
+       spin_lock_init(&cq_table->lock);
+       INIT_RADIX_TREE(&cq_table->tree, GFP_ATOMIC);
+
+       return mlx4_bitmap_init(&cq_table->bitmap, dev->caps.num_cqs,
+                               dev->caps.num_cqs - 1, dev->caps.reserved_cqs);
+}
+
+/* Release the CQ number bitmap set up by mlx4_init_cq_table(). */
+void mlx4_cleanup_cq_table(struct mlx4_dev *dev)
+{
+       struct mlx4_cq_table *cq_table = &mlx4_priv(dev)->cq_table;
+
+       /* Nothing to do to clean up radix_tree */
+       mlx4_bitmap_cleanup(&cq_table->bitmap);
+}
diff --git a/drivers/net/mlx4/eq.c b/drivers/net/mlx4/eq.c
new file mode 100644 (file)
index 0000000..acf1c80
--- /dev/null
@@ -0,0 +1,696 @@
+/*
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/interrupt.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+#include "fw.h"
+
+/* Event queue sizing constants. */
+enum {
+       /* Presumably the entry count of the async EQ -- not used in the visible code: */
+       MLX4_NUM_ASYNC_EQE      = 0x100,
+       /* Extra entries each EQ is created with (see the comment in mlx4_eq_int()): */
+       MLX4_NUM_SPARE_EQE      = 0x80,
+       /* Size of one EQ entry in bytes; get_eqe() uses it as the stride: */
+       MLX4_EQ_ENTRY_SIZE      = 0x20
+};
+
+/*
+ * EQ context layout.
+ *
+ * NOTE(review): this comment used to say the structure "must be packed
+ * because start is 64 bits but only aligned to 32 bits".  There is no
+ * "start" member here and the structure is not declared packed; as
+ * declared, every member falls on its natural alignment, so no padding
+ * is inserted.  The old wording looks like a leftover from another
+ * driver -- confirm and drop it.
+ */
+struct mlx4_eq_context {
+       __be32                  flags;
+       u16                     reserved1[3];
+       __be16                  page_offset;
+       u8                      log_eq_size;
+       u8                      reserved2[4];
+       u8                      eq_period;
+       u8                      reserved3;
+       u8                      eq_max_count;
+       u8                      reserved4[3];
+       u8                      intr;
+       u8                      log_page_size;
+       u8                      reserved5[2];
+       u8                      mtt_base_addr_h;
+       __be32                  mtt_base_addr_l;
+       u32                     reserved6[2];
+       __be32                  consumer_index;
+       __be32                  producer_index;
+       u32                     reserved7[4];
+};
+
+#define MLX4_EQ_STATUS_OK         ( 0 << 28)
+#define MLX4_EQ_STATUS_WRITE_FAIL  (10 << 28)
+#define MLX4_EQ_OWNER_SW          ( 0 << 24)
+#define MLX4_EQ_OWNER_HW          ( 1 << 24)
+#define MLX4_EQ_FLAG_EC                   ( 1 << 18)
+#define MLX4_EQ_FLAG_OI                   ( 1 << 17)
+#define MLX4_EQ_STATE_ARMED       ( 9 <<  8)
+#define MLX4_EQ_STATE_FIRED       (10 <<  8)
+#define MLX4_EQ_STATE_ALWAYS_ARMED (11 <<  8)
+
+#define MLX4_ASYNC_EVENT_MASK ((1ull << MLX4_EVENT_TYPE_PATH_MIG)          | \
+                              (1ull << MLX4_EVENT_TYPE_COMM_EST)           | \
+                              (1ull << MLX4_EVENT_TYPE_SQ_DRAINED)         | \
+                              (1ull << MLX4_EVENT_TYPE_CQ_ERROR)           | \
+                              (1ull << MLX4_EVENT_TYPE_WQ_CATAS_ERROR)     | \
+                              (1ull << MLX4_EVENT_TYPE_EEC_CATAS_ERROR)    | \
+                              (1ull << MLX4_EVENT_TYPE_PATH_MIG_FAILED)    | \
+                              (1ull << MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | \
+                              (1ull << MLX4_EVENT_TYPE_WQ_ACCESS_ERROR)    | \
+                              (1ull << MLX4_EVENT_TYPE_LOCAL_CATAS_ERROR)  | \
+                              (1ull << MLX4_EVENT_TYPE_PORT_CHANGE)        | \
+                              (1ull << MLX4_EVENT_TYPE_ECC_DETECT)         | \
+                              (1ull << MLX4_EVENT_TYPE_SRQ_CATAS_ERROR)    | \
+                              (1ull << MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE)    | \
+                              (1ull << MLX4_EVENT_TYPE_SRQ_LIMIT)          | \
+                              (1ull << MLX4_EVENT_TYPE_CMD))
+#define MLX4_CATAS_EVENT_MASK  (1ull << MLX4_EVENT_TYPE_LOCAL_CATAS_ERROR)
+
+/*
+ * One event queue entry.  The union member that applies is selected by
+ * @type in mlx4_eq_int(); multi-byte fields are big-endian.
+ */
+struct mlx4_eqe {
+       u8                      reserved1;
+       /* Event type (MLX4_EVENT_TYPE_*) that mlx4_eq_int() switches on: */
+       u8                      type;
+       u8                      reserved2;
+       u8                      subtype;
+       union {
+               u32             raw[6];
+               /* MLX4_EVENT_TYPE_COMP: only the low 24 bits of cqn are used. */
+               struct {
+                       __be32  cqn;
+               } __attribute__((packed)) comp;
+               /* MLX4_EVENT_TYPE_CMD: fields are forwarded to mlx4_cmd_event(). */
+               struct {
+                       u16     reserved1;
+                       __be16  token;
+                       u32     reserved2;
+                       u8      reserved3[3];
+                       u8      status;
+                       __be64  out_param;
+               } __attribute__((packed)) cmd;
+               /* QP events: only the low 24 bits of qpn are used. */
+               struct {
+                       __be32  qpn;
+               } __attribute__((packed)) qp;
+               /* SRQ events: only the low 24 bits of srqn are used. */
+               struct {
+                       __be32  srqn;
+               } __attribute__((packed)) srq;
+               /* MLX4_EVENT_TYPE_CQ_ERROR: syndrome 1 is logged as "overrun". */
+               struct {
+                       __be32  cqn;
+                       u32     reserved1;
+                       u8      reserved2[3];
+                       u8      syndrome;
+               } __attribute__((packed)) cq_err;
+               /* MLX4_EVENT_TYPE_PORT_CHANGE: the port number is port >> 28. */
+               struct {
+                       u32     reserved1[2];
+                       __be32  port;
+               } __attribute__((packed)) port_change;
+       }                       event;
+       u8                      reserved3[3];
+       /* Bit 0x80 is the ownership bit tested by next_eqe_sw(): */
+       u8                      owner;
+} __attribute__((packed));
+
+/*
+ * Write the EQ doorbell with the low 24 bits of the consumer index.
+ * @req_not is placed in bit 31 of the doorbell word.
+ */
+static void eq_set_ci(struct mlx4_eq *eq, int req_not)
+{
+       u32 db_val = (eq->cons_index & 0xffffff) | req_not << 31;
+
+       __raw_writel((__force u32) cpu_to_be32(db_val), eq->doorbell);
+
+       /* We still want ordering, just not swabbing, so add a barrier */
+       mb();
+}
+
+/*
+ * Return the address of EQ entry @entry.  The index wraps at eq->nent,
+ * and the resulting byte offset is split into a page index and an
+ * offset within that page.
+ */
+static struct mlx4_eqe *get_eqe(struct mlx4_eq *eq, u32 entry)
+{
+       unsigned long byte_off = (entry & (eq->nent - 1)) * MLX4_EQ_ENTRY_SIZE;
+       unsigned long page     = byte_off / PAGE_SIZE;
+
+       return eq->page_list[page].buf + byte_off % PAGE_SIZE;
+}
+
+/*
+ * Return the entry at the consumer index, or NULL when its ownership
+ * bit (0x80) differs from the current pass parity, i.e. from the
+ * eq->nent bit of the consumer index.
+ */
+static struct mlx4_eqe *next_eqe_sw(struct mlx4_eq *eq)
+{
+       struct mlx4_eqe *eqe = get_eqe(eq, eq->cons_index);
+       int owner_bit = !!(eqe->owner & 0x80);
+       int pass_bit  = !!(eq->cons_index & eq->nent);
+
+       if (owner_bit ^ pass_bit)
+               return NULL;
+
+       return eqe;
+}
+
+/*
+ * Drain one event queue: consume every available EQ entry, dispatch it
+ * according to its type and advance the consumer index.
+ *
+ * Returns 1 if at least one entry was processed, 0 otherwise.
+ */
+static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq)
+{
+       struct mlx4_eqe *eqe;
+       int cqn;
+       int eqes_found = 0;
+       int set_ci = 0;
+
+       while ((eqe = next_eqe_sw(eq))) {
+               /*
+                * Make sure we read EQ entry contents after we've
+                * checked the ownership bit.
+                */
+               rmb();
+
+               switch (eqe->type) {
+               case MLX4_EVENT_TYPE_COMP:
+                       cqn = be32_to_cpu(eqe->event.comp.cqn) & 0xffffff;
+                       mlx4_cq_completion(dev, cqn);
+                       break;
+
+               case MLX4_EVENT_TYPE_PATH_MIG:
+               case MLX4_EVENT_TYPE_COMM_EST:
+               case MLX4_EVENT_TYPE_SQ_DRAINED:
+               case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE:
+               case MLX4_EVENT_TYPE_WQ_CATAS_ERROR:
+               case MLX4_EVENT_TYPE_PATH_MIG_FAILED:
+               case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+               case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR:
+                       mlx4_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+                                     eqe->type);
+                       break;
+
+               case MLX4_EVENT_TYPE_SRQ_LIMIT:
+               case MLX4_EVENT_TYPE_SRQ_CATAS_ERROR:
+                       mlx4_srq_event(dev, be32_to_cpu(eqe->event.srq.srqn) & 0xffffff,
+                                     eqe->type);
+                       break;
+
+               case MLX4_EVENT_TYPE_CMD:
+                       mlx4_cmd_event(dev,
+                                      be16_to_cpu(eqe->event.cmd.token),
+                                      eqe->event.cmd.status,
+                                      be64_to_cpu(eqe->event.cmd.out_param));
+                       break;
+
+               case MLX4_EVENT_TYPE_PORT_CHANGE:
+                       mlx4_dispatch_event(dev, eqe->type, eqe->subtype,
+                                           be32_to_cpu(eqe->event.port_change.port) >> 28);
+                       break;
+
+               case MLX4_EVENT_TYPE_CQ_ERROR:
+                       /*
+                        * Only the low 24 bits carry the CQ number; mask
+                        * once and use the same value for the warning
+                        * and for the event dispatch, as the other
+                        * cases do.
+                        */
+                       cqn = be32_to_cpu(eqe->event.cq_err.cqn) & 0xffffff;
+                       mlx4_warn(dev, "CQ %s on CQN %06x\n",
+                                 eqe->event.cq_err.syndrome == 1 ?
+                                 "overrun" : "access violation",
+                                 cqn);
+                       mlx4_cq_event(dev, cqn, eqe->type);
+                       break;
+
+               case MLX4_EVENT_TYPE_EQ_OVERFLOW:
+                       mlx4_warn(dev, "EQ overrun on EQN %d\n", eq->eqn);
+                       break;
+
+               case MLX4_EVENT_TYPE_EEC_CATAS_ERROR:
+               case MLX4_EVENT_TYPE_ECC_DETECT:
+               default:
+                       mlx4_warn(dev, "Unhandled event %02x(%02x) on EQ %d at index %u\n",
+                                 eqe->type, eqe->subtype, eq->eqn, eq->cons_index);
+                       break;
+               }
+
+               ++eq->cons_index;
+               eqes_found = 1;
+               ++set_ci;
+
+               /*
+                * The HCA will think the queue has overflowed if we
+                * don't tell it we've been processing events.  We
+                * create our EQs with MLX4_NUM_SPARE_EQE extra
+                * entries, so we must update our consumer index at
+                * least that often.
+                */
+               if (unlikely(set_ci >= MLX4_NUM_SPARE_EQE)) {
+                       /*
+                        * Intermediate consumer index update while we
+                        * keep draining; the final update after the
+                        * loop passes 1 instead of 0.
+                        */
+                       eq_set_ci(eq, 0);
+                       set_ci = 0;
+               }
+       }
+
+       eq_set_ci(eq, 1);
+
+       return eqes_found;
+}
+
+/*
+ * Interrupt handler for the single (non-MSI-X) interrupt line: write
+ * the clear mask to the interrupt clear register, then poll every EQ
+ * below MLX4_EQ_CATAS.  Reports the interrupt as handled only if some
+ * EQ had work.
+ */
+static irqreturn_t mlx4_interrupt(int irq, void *dev_ptr)
+{
+       struct mlx4_dev *dev = dev_ptr;
+       struct mlx4_eq_table *eq_table = &mlx4_priv(dev)->eq_table;
+       int handled = 0;
+       int n = 0;
+
+       writel(eq_table->clr_mask, eq_table->clr_int);
+
+       while (n < MLX4_EQ_CATAS) {
+               handled |= mlx4_eq_int(dev, &eq_table->eq[n]);
+               ++n;
+       }
+
+       return IRQ_RETVAL(handled);
+}
+
+/*
+ * Per-EQ MSI-X handler: the cookie is the EQ itself, so just drain it.
+ * The result of the drain is ignored because an MSI-X vector is never
+ * shared with another device.
+ */
+static irqreturn_t mlx4_msi_x_interrupt(int irq, void *eq_ptr)
+{
+       struct mlx4_eq *eq = eq_ptr;
+
+       mlx4_eq_int(eq->dev, eq);
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * MSI-X handler for the catastrophic error vector.  The cookie is the
+ * device; the actual work is done by mlx4_handle_catas_err().
+ */
+static irqreturn_t mlx4_catas_interrupt(int irq, void *dev_ptr)
+{
+       struct mlx4_dev *dev = dev_ptr;
+
+       mlx4_handle_catas_err(dev);
+
+       /* An MSI-X vector is ours alone, so always claim it. */
+       return IRQ_HANDLED;
+}
+
+/*
+ * Issue the MAP_EQ firmware command for the events in event_mask.
+ * The input modifier carries the EQ number in the low bits and the
+ * "unmap" flag in bit 31.
+ */
+static int mlx4_MAP_EQ(struct mlx4_dev *dev, u64 event_mask, int unmap,
+                       int eq_num)
+{
+       int in_mod = (unmap << 31) | eq_num;
+
+       return mlx4_cmd(dev, event_mask, in_mod, 0,
+                       MLX4_CMD_MAP_EQ, MLX4_CMD_TIME_CLASS_B);
+}
+
+/*
+ * Issue the SW2HW_EQ firmware command for EQ eq_num, using the DMA
+ * address of the mailbox as the input parameter.
+ */
+static int mlx4_SW2HW_EQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+                        int eq_num)
+{
+       int err;
+
+       err = mlx4_cmd(dev, mailbox->dma, eq_num, 0,
+                      MLX4_CMD_SW2HW_EQ, MLX4_CMD_TIME_CLASS_A);
+
+       return err;
+}
+
+/*
+ * Issue the HW2SW_EQ firmware command for EQ eq_num, using the DMA
+ * address of the mailbox as the output parameter.
+ */
+static int mlx4_HW2SW_EQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+                        int eq_num)
+{
+       int err;
+
+       err = mlx4_cmd_box(dev, 0, mailbox->dma, eq_num, 0,
+                          MLX4_CMD_HW2SW_EQ, MLX4_CMD_TIME_CLASS_A);
+
+       return err;
+}
+
+/*
+ * Return the doorbell address for an EQ, or NULL if the page holding
+ * it cannot be mapped.  Four EQs share one page of BAR 2; the page is
+ * mapped on first use and cached in eq_table.uar_map[], indexed
+ * relative to the first non-reserved group of four EQs.
+ */
+static void __devinit __iomem *mlx4_get_eq_uar(struct mlx4_dev *dev,
+                                              struct mlx4_eq *eq)
+{
+       struct mlx4_eq_table *eq_table = &mlx4_priv(dev)->eq_table;
+       int index = eq->eqn / 4 - dev->caps.reserved_eqs / 4;
+
+       /* Already mapped by an earlier EQ in the same group. */
+       if (eq_table->uar_map[index])
+               return eq_table->uar_map[index] + 0x800 + 8 * (eq->eqn % 4);
+
+       eq_table->uar_map[index] =
+               ioremap(pci_resource_start(dev->pdev, 2) +
+                       ((eq->eqn / 4) << PAGE_SHIFT),
+                       PAGE_SIZE);
+       if (!eq_table->uar_map[index]) {
+               mlx4_err(dev, "Couldn't map EQ doorbell for EQN 0x%06x\n",
+                        eq->eqn);
+               return NULL;
+       }
+
+       return eq_table->uar_map[index] + 0x800 + 8 * (eq->eqn % 4);
+}
+
+/*
+ * Allocate and start one event queue.
+ *
+ * @dev:  device the EQ belongs to
+ * @nent: requested number of entries; rounded up to a power of two,
+ *        with a minimum of 2
+ * @intr: value stored in the "intr" field of the EQ context
+ * @eq:   EQ structure to fill in
+ *
+ * Allocates the entry pages, an EQ number, the doorbell mapping and an
+ * MTT, then hands the EQ to the device with SW2HW_EQ.
+ *
+ * Returns 0 on success.  On failure everything acquired so far is
+ * released and a negative errno is returned: -ENOMEM for the
+ * allocation failures, otherwise the error from mlx4_mtt_init(),
+ * mlx4_write_mtt() or SW2HW_EQ.
+ */
+static int __devinit mlx4_create_eq(struct mlx4_dev *dev, int nent,
+                                   u8 intr, struct mlx4_eq *eq)
+{
+       struct mlx4_priv *priv = mlx4_priv(dev);
+       struct mlx4_cmd_mailbox *mailbox;
+       struct mlx4_eq_context *eq_context;
+       int npages;
+       u64 *dma_list = NULL;
+       dma_addr_t t;
+       u64 mtt_addr;
+       int err = -ENOMEM;
+       int i;
+
+       eq->dev   = dev;
+       eq->nent  = roundup_pow_of_two(max(nent, 2));
+       npages = PAGE_ALIGN(eq->nent * MLX4_EQ_ENTRY_SIZE) / PAGE_SIZE;
+
+       eq->page_list = kmalloc(npages * sizeof *eq->page_list,
+                               GFP_KERNEL);
+       if (!eq->page_list)
+               goto err_out;
+
+       /* NULL marks "not allocated" for the unwind loop below. */
+       for (i = 0; i < npages; ++i)
+               eq->page_list[i].buf = NULL;
+
+       /* Temporary array of page DMA addresses, only needed to write the MTT. */
+       dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+       if (!dma_list)
+               goto err_out_free;
+
+       /* err is still -ENOMEM here; the mailbox error code is not propagated. */
+       mailbox = mlx4_alloc_cmd_mailbox(dev);
+       if (IS_ERR(mailbox))
+               goto err_out_free;
+       eq_context = mailbox->buf;
+
+       /* One zeroed, DMA-coherent page per page of EQ entries. */
+       for (i = 0; i < npages; ++i) {
+               eq->page_list[i].buf = dma_alloc_coherent(&dev->pdev->dev,
+                                                         PAGE_SIZE, &t, GFP_KERNEL);
+               if (!eq->page_list[i].buf)
+                       goto err_out_free_pages;
+
+               dma_list[i] = t;
+               eq->page_list[i].map = t;
+
+               memset(eq->page_list[i].buf, 0, PAGE_SIZE);
+       }
+
+       eq->eqn = mlx4_bitmap_alloc(&priv->eq_table.bitmap);
+       if (eq->eqn == -1)
+               goto err_out_free_pages;
+
+       eq->doorbell = mlx4_get_eq_uar(dev, eq);
+       if (!eq->doorbell) {
+               err = -ENOMEM;
+               goto err_out_free_eq;
+       }
+
+       err = mlx4_mtt_init(dev, npages, PAGE_SHIFT, &eq->mtt);
+       if (err)
+               goto err_out_free_eq;
+
+       err = mlx4_write_mtt(dev, &eq->mtt, 0, npages, dma_list);
+       if (err)
+               goto err_out_free_mtt;
+
+       /* Build the EQ context in the mailbox buffer. */
+       memset(eq_context, 0, sizeof *eq_context);
+       eq_context->flags         = cpu_to_be32(MLX4_EQ_STATUS_OK   |
+                                               MLX4_EQ_STATE_ARMED);
+       eq_context->log_eq_size   = ilog2(eq->nent);
+       eq_context->intr          = intr;
+       eq_context->log_page_size = PAGE_SHIFT - MLX4_ICM_PAGE_SHIFT;
+
+       mtt_addr = mlx4_mtt_addr(dev, &eq->mtt);
+       eq_context->mtt_base_addr_h = mtt_addr >> 32;
+       eq_context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff);
+
+       err = mlx4_SW2HW_EQ(dev, mailbox, eq->eqn);
+       if (err) {
+               mlx4_warn(dev, "SW2HW_EQ failed (%d)\n", err);
+               goto err_out_free_mtt;
+       }
+
+       /* Success: the temporaries go away, eq->page_list stays with the EQ. */
+       kfree(dma_list);
+       mlx4_free_cmd_mailbox(dev, mailbox);
+
+       eq->cons_index = 0;
+
+       return err;
+
+       /* Unwind ladder: each label releases one more resource, newest first. */
+err_out_free_mtt:
+       mlx4_mtt_cleanup(dev, &eq->mtt);
+
+err_out_free_eq:
+       mlx4_bitmap_free(&priv->eq_table.bitmap, eq->eqn);
+
+err_out_free_pages:
+       /* Only pages that were actually allocated have a non-NULL buf. */
+       for (i = 0; i < npages; ++i)
+               if (eq->page_list[i].buf)
+                       dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
+                                         eq->page_list[i].buf,
+                                         eq->page_list[i].map);
+
+       mlx4_free_cmd_mailbox(dev, mailbox);
+
+err_out_free:
+       kfree(eq->page_list);
+       kfree(dma_list);
+
+err_out:
+       return err;
+}
+
+/*
+ * Tear down one event queue: take it back from the device with
+ * HW2SW_EQ, then release its MTT, entry pages, page list and EQ
+ * number.
+ *
+ * NOTE(review): if the command mailbox cannot be allocated the
+ * function returns immediately and none of the EQ's resources are
+ * released.  Presumably this is preferred over freeing memory the
+ * device may still own -- confirm.
+ */
+static void mlx4_free_eq(struct mlx4_dev *dev,
+                        struct mlx4_eq *eq)
+{
+       struct mlx4_priv *priv = mlx4_priv(dev);
+       struct mlx4_cmd_mailbox *mailbox;
+       int err;
+       int npages = PAGE_ALIGN(MLX4_EQ_ENTRY_SIZE * eq->nent) / PAGE_SIZE;
+       int i;
+
+       mailbox = mlx4_alloc_cmd_mailbox(dev);
+       if (IS_ERR(mailbox))
+               return;
+
+       /* A failure is only logged; the resources are freed regardless. */
+       err = mlx4_HW2SW_EQ(dev, mailbox, eq->eqn);
+       if (err)
+               mlx4_warn(dev, "HW2SW_EQ failed (%d)\n", err);
+
+       /* Compiled-out debugging aid: dump the mailbox as 32-bit words. */
+       if (0) {
+               mlx4_dbg(dev, "Dumping EQ context %02x:\n", eq->eqn);
+               for (i = 0; i < sizeof (struct mlx4_eq_context) / 4; ++i) {
+                       if (i % 4 == 0)
+                               printk("[%02x] ", i * 4);
+                       printk(" %08x", be32_to_cpup(mailbox->buf + i * 4));
+                       if ((i + 1) % 4 == 0)
+                               printk("\n");
+               }
+       }
+
+       mlx4_mtt_cleanup(dev, &eq->mtt);
+       for (i = 0; i < npages; ++i)
+               pci_free_consistent(dev->pdev, PAGE_SIZE,
+                                   eq->page_list[i].buf,
+                                   eq->page_list[i].map);
+
+       kfree(eq->page_list);
+       mlx4_bitmap_free(&priv->eq_table.bitmap, eq->eqn);
+       mlx4_free_cmd_mailbox(dev, mailbox);
+}
+
+/*
+ * Release every interrupt this driver has requested: the legacy
+ * interrupt line (if eq_table->have_irq is set) and each per-EQ vector
+ * whose have_irq flag is set.  The flags are cleared afterwards so a
+ * second call is harmless.
+ */
+static void mlx4_free_irqs(struct mlx4_dev *dev)
+{
+       struct mlx4_eq_table *eq_table = &mlx4_priv(dev)->eq_table;
+       void *dev_id;
+       int i;
+
+       if (eq_table->have_irq) {
+               free_irq(dev->pdev->irq, dev);
+               eq_table->have_irq = 0;
+       }
+
+       for (i = 0; i < MLX4_NUM_EQ; ++i) {
+               if (!eq_table->eq[i].have_irq)
+                       continue;
+
+               /*
+                * free_irq() must be given the same cookie that was
+                * passed to request_irq().  The catastrophic error
+                * vector is requested with the device as its cookie;
+                * all other vectors use the EQ itself.
+                */
+               if (i == MLX4_EQ_CATAS)
+                       dev_id = dev;
+               else
+                       dev_id = eq_table->eq + i;
+
+               free_irq(eq_table->eq[i].irq, dev_id);
+               eq_table->eq[i].have_irq = 0;
+       }
+}
+
+/*
+ * Map the interrupt clear register, whose BAR and offset are taken
+ * from priv->fw.  Returns 0 on success or -ENOMEM if the mapping
+ * fails.
+ */
+static int __devinit mlx4_map_clr_int(struct mlx4_dev *dev)
+{
+       struct mlx4_priv *priv = mlx4_priv(dev);
+
+       priv->clr_base = ioremap(pci_resource_start(dev->pdev, priv->fw.clr_int_bar) +
+                                priv->fw.clr_int_base, MLX4_CLR_INT_SIZE);
+       if (priv->clr_base)
+               return 0;
+
+       mlx4_err(dev, "Couldn't map interrupt clear register, aborting.\n");
+       return -ENOMEM;
+}
+
+/* Undo mlx4_map_clr_int(): unmap the interrupt clear register. */
+static void mlx4_unmap_clr_int(struct mlx4_dev *dev)
+{
+       iounmap(mlx4_priv(dev)->clr_base);
+}
+
+/*
+ * Allocate one page, DMA-map it and hand it to the device as the ICM
+ * backing for the EQ context table at ICM virtual address icm_virt.
+ *
+ * Returns 0 on success, -ENOMEM if the page cannot be allocated or
+ * mapped, or the error from mlx4_MAP_ICM_page().  On failure nothing
+ * stays allocated.
+ */
+int __devinit mlx4_map_eq_icm(struct mlx4_dev *dev, u64 icm_virt)
+{
+       struct mlx4_eq_table *eq_table = &mlx4_priv(dev)->eq_table;
+       int ret;
+
+       /*
+        * A single page is assumed to hold the whole EQ context table.
+        * That holds for all current HCAs: we use only 32 EQs at 64
+        * bytes of context each, i.e. 1 KB in total.
+        */
+       eq_table->icm_virt = icm_virt;
+
+       eq_table->icm_page = alloc_page(GFP_HIGHUSER);
+       if (!eq_table->icm_page)
+               return -ENOMEM;
+
+       eq_table->icm_dma = pci_map_page(dev->pdev, eq_table->icm_page, 0,
+                                        PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
+       if (pci_dma_mapping_error(eq_table->icm_dma)) {
+               __free_page(eq_table->icm_page);
+               return -ENOMEM;
+       }
+
+       ret = mlx4_MAP_ICM_page(dev, eq_table->icm_dma, icm_virt);
+       if (!ret)
+               return 0;
+
+       /* The device rejected the page: undo the mapping and free it. */
+       pci_unmap_page(dev->pdev, eq_table->icm_dma, PAGE_SIZE,
+                      PCI_DMA_BIDIRECTIONAL);
+       __free_page(eq_table->icm_page);
+
+       return ret;
+}
+
+/*
+ * Undo mlx4_map_eq_icm(): unmap the EQ context page from the device's
+ * ICM, then release the DMA mapping and the page itself.
+ */
+void mlx4_unmap_eq_icm(struct mlx4_dev *dev)
+{
+       struct mlx4_eq_table *eq_table = &mlx4_priv(dev)->eq_table;
+
+       mlx4_UNMAP_ICM(dev, eq_table->icm_virt, 1);
+       pci_unmap_page(dev->pdev, eq_table->icm_dma, PAGE_SIZE,
+                      PCI_DMA_BIDIRECTIONAL);
+       __free_page(eq_table->icm_page);
+}
+
+/*
+ * Set up the event queue table: the EQ number bitmap, the interrupt
+ * clear register mapping, the completion and async EQs (plus a
+ * catastrophic error EQ when MSI-X is in use), the interrupt handlers,
+ * and finally the event-to-EQ mappings in firmware.
+ *
+ * Returns 0 on success or a negative errno.  A MAP_EQ failure is only
+ * logged and does not fail initialization.  On failure everything set
+ * up so far is torn down again, including any doorbell pages that
+ * mlx4_create_eq() mapped.
+ */
+int __devinit mlx4_init_eq_table(struct mlx4_dev *dev)
+{
+       struct mlx4_priv *priv = mlx4_priv(dev);
+       int err;
+       int i;
+
+       err = mlx4_bitmap_init(&priv->eq_table.bitmap, dev->caps.num_eqs,
+                              dev->caps.num_eqs - 1, dev->caps.reserved_eqs);
+       if (err)
+               return err;
+
+       for (i = 0; i < ARRAY_SIZE(priv->eq_table.uar_map); ++i)
+               priv->eq_table.uar_map[i] = NULL;
+
+       err = mlx4_map_clr_int(dev);
+       if (err)
+               goto err_out_free;
+
+       priv->eq_table.clr_mask =
+               swab32(1 << (priv->eq_table.inta_pin & 31));
+       priv->eq_table.clr_int  = priv->clr_base +
+               (priv->eq_table.inta_pin < 32 ? 4 : 0);
+
+       err = mlx4_create_eq(dev, dev->caps.num_cqs + MLX4_NUM_SPARE_EQE,
+                            (dev->flags & MLX4_FLAG_MSI_X) ? MLX4_EQ_COMP : 0,
+                            &priv->eq_table.eq[MLX4_EQ_COMP]);
+       if (err)
+               goto err_out_unmap;
+
+       err = mlx4_create_eq(dev, MLX4_NUM_ASYNC_EQE + MLX4_NUM_SPARE_EQE,
+                            (dev->flags & MLX4_FLAG_MSI_X) ? MLX4_EQ_ASYNC : 0,
+                            &priv->eq_table.eq[MLX4_EQ_ASYNC]);
+       if (err)
+               goto err_out_comp;
+
+       if (dev->flags & MLX4_FLAG_MSI_X) {
+               static const char *eq_name[] = {
+                       [MLX4_EQ_COMP]  = DRV_NAME " (comp)",
+                       [MLX4_EQ_ASYNC] = DRV_NAME " (async)",
+                       [MLX4_EQ_CATAS] = DRV_NAME " (catas)"
+               };
+
+               err = mlx4_create_eq(dev, 1, MLX4_EQ_CATAS,
+                                    &priv->eq_table.eq[MLX4_EQ_CATAS]);
+               if (err)
+                       goto err_out_async;
+
+               for (i = 0; i < MLX4_EQ_CATAS; ++i) {
+                       err = request_irq(priv->eq_table.eq[i].irq,
+                                         mlx4_msi_x_interrupt,
+                                         0, eq_name[i], priv->eq_table.eq + i);
+                       if (err)
+                               goto err_out_catas;
+
+                       priv->eq_table.eq[i].have_irq = 1;
+               }
+
+               err = request_irq(priv->eq_table.eq[MLX4_EQ_CATAS].irq,
+                                 mlx4_catas_interrupt, 0,
+                                 eq_name[MLX4_EQ_CATAS], dev);
+               if (err)
+                       goto err_out_catas;
+
+               priv->eq_table.eq[MLX4_EQ_CATAS].have_irq = 1;
+       } else {
+               err = request_irq(dev->pdev->irq, mlx4_interrupt,
+                                 SA_SHIRQ, DRV_NAME, dev);
+               if (err)
+                       goto err_out_async;
+
+               priv->eq_table.have_irq = 1;
+       }
+
+       err = mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 0,
+                         priv->eq_table.eq[MLX4_EQ_ASYNC].eqn);
+       if (err)
+               mlx4_warn(dev, "MAP_EQ for async EQ %d failed (%d)\n",
+                          priv->eq_table.eq[MLX4_EQ_ASYNC].eqn, err);
+
+       for (i = 0; i < MLX4_EQ_CATAS; ++i)
+               eq_set_ci(&priv->eq_table.eq[i], 1);
+
+       if (dev->flags & MLX4_FLAG_MSI_X) {
+               err = mlx4_MAP_EQ(dev, MLX4_CATAS_EVENT_MASK, 0,
+                                 priv->eq_table.eq[MLX4_EQ_CATAS].eqn);
+               if (err)
+                       mlx4_warn(dev, "MAP_EQ for catas EQ %d failed (%d)\n",
+                                 priv->eq_table.eq[MLX4_EQ_CATAS].eqn, err);
+       }
+
+       return 0;
+
+err_out_catas:
+       /*
+        * This is the only label reached with interrupt handlers
+        * installed.  Release them before the EQs they dereference are
+        * freed, matching the order used by mlx4_cleanup_eq_table().
+        */
+       mlx4_free_irqs(dev);
+       mlx4_free_eq(dev, &priv->eq_table.eq[MLX4_EQ_CATAS]);
+
+err_out_async:
+       mlx4_free_eq(dev, &priv->eq_table.eq[MLX4_EQ_ASYNC]);
+
+err_out_comp:
+       mlx4_free_eq(dev, &priv->eq_table.eq[MLX4_EQ_COMP]);
+
+err_out_unmap:
+       mlx4_unmap_clr_int(dev);
+
+       /*
+        * mlx4_create_eq() maps doorbell pages on demand and does not
+        * unmap them when it fails or when the EQ is freed, so drop
+        * whatever got mapped.
+        */
+       for (i = 0; i < ARRAY_SIZE(priv->eq_table.uar_map); ++i)
+               if (priv->eq_table.uar_map[i]) {
+                       iounmap(priv->eq_table.uar_map[i]);
+                       priv->eq_table.uar_map[i] = NULL;
+               }
+
+err_out_free:
+       mlx4_bitmap_cleanup(&priv->eq_table.bitmap);
+       return err;
+}
+
+/*
+ * Tear down the event queue table: unmap the catas (MSI-X only) and
+ * async event classes in firmware, release the interrupts, free the
+ * EQs, and drop the interrupt clear mapping, the doorbell page
+ * mappings and the EQ number bitmap.
+ */
+void mlx4_cleanup_eq_table(struct mlx4_dev *dev)
+{
+       struct mlx4_eq_table *eq_table = &mlx4_priv(dev)->eq_table;
+       struct mlx4_eq *eqs = eq_table->eq;
+       int n;
+
+       /* Stop event delivery first. */
+       if (dev->flags & MLX4_FLAG_MSI_X)
+               mlx4_MAP_EQ(dev, MLX4_CATAS_EVENT_MASK, 1,
+                           eqs[MLX4_EQ_CATAS].eqn);
+
+       mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 1,
+                   eqs[MLX4_EQ_ASYNC].eqn);
+
+       mlx4_free_irqs(dev);
+
+       for (n = 0; n < MLX4_EQ_CATAS; ++n)
+               mlx4_free_eq(dev, eqs + n);
+
+       /* The catas EQ only exists when MSI-X is in use. */
+       if (dev->flags & MLX4_FLAG_MSI_X)
+               mlx4_free_eq(dev, eqs + MLX4_EQ_CATAS);
+
+       mlx4_unmap_clr_int(dev);
+
+       for (n = 0; n < ARRAY_SIZE(eq_table->uar_map); ++n) {
+               if (!eq_table->uar_map[n])
+                       continue;
+               iounmap(eq_table->uar_map[n]);
+       }
+
+       mlx4_bitmap_cleanup(&eq_table->bitmap);
+}
diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c
new file mode 100644 (file)
index 0000000..c427173
--- /dev/null
@@ -0,0 +1,775 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx4/cmd.h>
+
+#include "fw.h"
+#include "icm.h"
+
+/*
+ * Called from the "default" branch of the size switch in MLX4_GET and
+ * MLX4_PUT, i.e. only for an operand whose size is not 1, 2, 4 or 8
+ * bytes.  NOTE(review): presumably these are never defined anywhere,
+ * so such a use fails at link time -- confirm.
+ */
+extern void __buggy_use_of_MLX4_GET(void);
+extern void __buggy_use_of_MLX4_PUT(void);
+
+/*
+ * MLX4_GET - read a field from a command mailbox buffer.
+ *
+ * Reads sizeof(dest) bytes located "offset" bytes into "source" and
+ * stores the value in "dest".  A 1-byte field is copied as is; 2-, 4-
+ * and 8-byte fields are converted from big-endian to CPU byte order.
+ * Any other size expands to a call to __buggy_use_of_MLX4_GET().
+ */
+#define MLX4_GET(dest, source, offset)                               \
+       do {                                                          \
+               void *__p = (char *) (source) + (offset);             \
+               switch (sizeof (dest)) {                              \
+               case 1: (dest) = *(u8 *) __p;       break;            \
+               case 2: (dest) = be16_to_cpup(__p); break;            \
+               case 4: (dest) = be32_to_cpup(__p); break;            \
+               case 8: (dest) = be64_to_cpup(__p); break;            \
+               default: __buggy_use_of_MLX4_GET();                   \
+               }                                                     \
+       } while (0)
+
+/*
+ * MLX4_PUT - write a field into a command mailbox buffer.
+ *
+ * Stores "source" at "offset" bytes into "dest"; the field width is
+ * sizeof(source).  A 1-byte value is stored as is; 2-, 4- and 8-byte
+ * values are converted from CPU to big-endian byte order first.  Any
+ * other size expands to a call to __buggy_use_of_MLX4_PUT().
+ */
+#define MLX4_PUT(dest, source, offset)                               \
+       do {                                                          \
+               void *__d = ((char *) (dest) + (offset));             \
+               switch (sizeof(source)) {                             \
+               case 1: *(u8 *) __d = (source);                break; \
+               case 2: *(__be16 *) __d = cpu_to_be16(source); break; \
+               case 4: *(__be32 *) __d = cpu_to_be32(source); break; \
+               case 8: *(__be64 *) __d = cpu_to_be64(source); break; \
+               default: __buggy_use_of_MLX4_PUT();                   \
+               }                                                     \
+       } while (0)
+
+/*
+ * Log, at debug level, the name of every capability bit that is set in
+ * "flags" and has an entry in the name table.  Bits without a name are
+ * skipped silently.
+ */
+static void dump_dev_cap_flags(struct mlx4_dev *dev, u32 flags)
+{
+       static const char *fname[] = {
+               [ 0] = "RC transport",
+               [ 1] = "UC transport",
+               [ 2] = "UD transport",
+               [ 3] = "SRC transport",
+               [ 4] = "reliable multicast",
+               [ 5] = "FCoIB support",
+               [ 6] = "SRQ support",
+               [ 7] = "IPoIB checksum offload",
+               [ 8] = "P_Key violation counter",
+               [ 9] = "Q_Key violation counter",
+               [10] = "VMM",
+               [16] = "MW support",
+               [17] = "APM support",
+               [18] = "Atomic ops support",
+               [19] = "Raw multicast support",
+               [20] = "Address vector port checking support",
+               [21] = "UD multicast support",
+               [24] = "Demand paging support",
+               [25] = "Router support"
+       };
+       int i;
+
+       mlx4_dbg(dev, "DEV_CAP flags:\n");
+       /*
+        * Bound the loop by the table size, not by the 32 bits of
+        * "flags": the table ends at its highest designated index, so
+        * indexing it with larger bit numbers would read past its end.
+        */
+       for (i = 0; i < ARRAY_SIZE(fname); ++i)
+               if (fname[i] && (flags & (1 << i)))
+                       mlx4_dbg(dev, "    %s\n", fname[i]);
+}
+
+int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
+{
+       struct mlx4_cmd_mailbox *mailbox;
+       u32 *outbox;
+       u8 field;
+       u16 size;
+       u16 stat_rate;
+       int err;
+
+#define QUERY_DEV_CAP_OUT_SIZE                0x100
+#define QUERY_DEV_CAP_MAX_SRQ_SZ_OFFSET                0x10
+#define QUERY_DEV_CAP_MAX_QP_SZ_OFFSET         0x11
+#define QUERY_DEV_CAP_RSVD_QP_OFFSET           0x12
+#define QUERY_DEV_CAP_MAX_QP_OFFSET            0x13
+#define QUERY_DEV_CAP_RSVD_SRQ_OFFSET          0x14
+#define QUERY_DEV_CAP_MAX_SRQ_OFFSET           0x15
+#define QUERY_DEV_CAP_RSVD_EEC_OFFSET          0x16
+#define QUERY_DEV_CAP_MAX_EEC_OFFSET           0x17
+#define QUERY_DEV_CAP_MAX_CQ_SZ_OFFSET         0x19
+#define QUERY_DEV_CAP_RSVD_CQ_OFFSET           0x1a
+#define QUERY_DEV_CAP_MAX_CQ_OFFSET            0x1b
+#define QUERY_DEV_CAP_MAX_MPT_OFFSET           0x1d
+#define QUERY_DEV_CAP_RSVD_EQ_OFFSET           0x1e
+#define QUERY_DEV_CAP_MAX_EQ_OFFSET            0x1f
+#define QUERY_DEV_CAP_RSVD_MTT_OFFSET          0x20
+#define QUERY_DEV_CAP_MAX_MRW_SZ_OFFSET                0x21
+#define QUERY_DEV_CAP_RSVD_MRW_OFFSET          0x22
+#define QUERY_DEV_CAP_MAX_MTT_SEG_OFFSET       0x23
+#define QUERY_DEV_CAP_MAX_AV_OFFSET            0x27
+#define QUERY_DEV_CAP_MAX_REQ_QP_OFFSET                0x29
+#define QUERY_DEV_CAP_MAX_RES_QP_OFFSET                0x2b
+#define QUERY_DEV_CAP_MAX_RDMA_OFFSET          0x2f
+#define QUERY_DEV_CAP_RSZ_SRQ_OFFSET           0x33
+#define QUERY_DEV_CAP_ACK_DELAY_OFFSET         0x35
+#define QUERY_DEV_CAP_MTU_WIDTH_OFFSET         0x36
+#define QUERY_DEV_CAP_VL_PORT_OFFSET           0x37
+#define QUERY_DEV_CAP_MAX_GID_OFFSET           0x3b
+#define QUERY_DEV_CAP_RATE_SUPPORT_OFFSET      0x3c
+#define QUERY_DEV_CAP_MAX_PKEY_OFFSET          0x3f
+#define QUERY_DEV_CAP_FLAGS_OFFSET             0x44
+#define QUERY_DEV_CAP_RSVD_UAR_OFFSET          0x48
+#define QUERY_DEV_CAP_UAR_SZ_OFFSET            0x49
+#define QUERY_DEV_CAP_PAGE_SZ_OFFSET           0x4b
+#define QUERY_DEV_CAP_BF_OFFSET                        0x4c
+#define QUERY_DEV_CAP_LOG_BF_REG_SZ_OFFSET     0x4d
+#define QUERY_DEV_CAP_LOG_MAX_BF_REGS_PER_PAGE_OFFSET  0x4e
+#define QUERY_DEV_CAP_LOG_MAX_BF_PAGES_OFFSET  0x4f
+#define QUERY_DEV_CAP_MAX_SG_SQ_OFFSET         0x51
+#define QUERY_DEV_CAP_MAX_DESC_SZ_SQ_OFFSET    0x52
+#define QUERY_DEV_CAP_MAX_SG_RQ_OFFSET         0x55
+#define QUERY_DEV_CAP_MAX_DESC_SZ_RQ_OFFSET    0x56
+#define QUERY_DEV_CAP_MAX_QP_MCG_OFFSET                0x61
+#define QUERY_DEV_CAP_RSVD_MCG_OFFSET          0x62
+#define QUERY_DEV_CAP_MAX_MCG_OFFSET           0x63
+#define QUERY_DEV_CAP_RSVD_PD_OFFSET           0x64
+#define QUERY_DEV_CAP_MAX_PD_OFFSET            0x65
+#define QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET   0x80
+#define QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET      0x82
+#define QUERY_DEV_CAP_AUX_ENTRY_SZ_OFFSET      0x84
+#define QUERY_DEV_CAP_ALTC_ENTRY_SZ_OFFSET     0x86
+#define QUERY_DEV_CAP_EQC_ENTRY_SZ_OFFSET      0x88
+#define QUERY_DEV_CAP_CQC_ENTRY_SZ_OFFSET      0x8a
+#define QUERY_DEV_CAP_SRQ_ENTRY_SZ_OFFSET      0x8c
+#define QUERY_DEV_CAP_C_MPT_ENTRY_SZ_OFFSET    0x8e
+#define QUERY_DEV_CAP_MTT_ENTRY_SZ_OFFSET      0x90
+#define QUERY_DEV_CAP_D_MPT_ENTRY_SZ_OFFSET    0x92
+#define QUERY_DEV_CAP_BMME_FLAGS_OFFSET                0x97
+#define QUERY_DEV_CAP_RSVD_LKEY_OFFSET         0x98
+#define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET                0xa0
+
+       mailbox = mlx4_alloc_cmd_mailbox(dev);
+       if (IS_ERR(mailbox))
+               return PTR_ERR(mailbox);
+       outbox = mailbox->buf;
+
+       err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_DEV_CAP,
+                          MLX4_CMD_TIME_CLASS_A);
+
+       if (err)
+               goto out;
+
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_QP_OFFSET);
+       dev_cap->reserved_qps = 1 << (field & 0xf);
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_OFFSET);
+       dev_cap->max_qps = 1 << (field & 0x1f);
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_SRQ_OFFSET);
+       dev_cap->reserved_srqs = 1 << (field >> 4);
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SRQ_OFFSET);
+       dev_cap->max_srqs = 1 << (field & 0x1f);
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_CQ_SZ_OFFSET);
+       dev_cap->max_cq_sz = 1 << field;
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_CQ_OFFSET);
+       dev_cap->reserved_cqs = 1 << (field & 0xf);
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_CQ_OFFSET);
+       dev_cap->max_cqs = 1 << (field & 0x1f);
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MPT_OFFSET);
+       dev_cap->max_mpts = 1 << (field & 0x3f);
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_EQ_OFFSET);
+       dev_cap->reserved_eqs = 1 << (field & 0xf);
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_EQ_OFFSET);
+       dev_cap->max_eqs = 1 << (field & 0x7);
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MTT_OFFSET);
+       dev_cap->reserved_mtts = 1 << (field >> 4);
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MRW_SZ_OFFSET);
+       dev_cap->max_mrw_sz = 1 << field;
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MRW_OFFSET);
+       dev_cap->reserved_mrws = 1 << (field & 0xf);
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MTT_SEG_OFFSET);
+       dev_cap->max_mtt_seg = 1 << (field & 0x3f);
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_REQ_QP_OFFSET);
+       dev_cap->max_requester_per_qp = 1 << (field & 0x3f);
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_RES_QP_OFFSET);
+       dev_cap->max_responder_per_qp = 1 << (field & 0x3f);
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_RDMA_OFFSET);
+       dev_cap->max_rdma_global = 1 << (field & 0x3f);
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_ACK_DELAY_OFFSET);
+       dev_cap->local_ca_ack_delay = field & 0x1f;
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_MTU_WIDTH_OFFSET);
+       dev_cap->max_mtu        = field >> 4;
+       dev_cap->max_port_width = field & 0xf;
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_VL_PORT_OFFSET);
+       dev_cap->max_vl    = field >> 4;
+       dev_cap->num_ports = field & 0xf;
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_GID_OFFSET);
+       dev_cap->max_gids = 1 << (field & 0xf);
+       MLX4_GET(stat_rate, outbox, QUERY_DEV_CAP_RATE_SUPPORT_OFFSET);
+       dev_cap->stat_rate_support = stat_rate;
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_PKEY_OFFSET);
+       dev_cap->max_pkeys = 1 << (field & 0xf);
+       MLX4_GET(dev_cap->flags, outbox, QUERY_DEV_CAP_FLAGS_OFFSET);
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_UAR_OFFSET);
+       dev_cap->reserved_uars = field >> 4;
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_UAR_SZ_OFFSET);
+       dev_cap->uar_size = 1 << ((field & 0x3f) + 20);
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_PAGE_SZ_OFFSET);
+       dev_cap->min_page_sz = 1 << field;
+
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_BF_OFFSET);
+       if (field & 0x80) {
+               MLX4_GET(field, outbox, QUERY_DEV_CAP_LOG_BF_REG_SZ_OFFSET);
+               dev_cap->bf_reg_size = 1 << (field & 0x1f);
+               MLX4_GET(field, outbox, QUERY_DEV_CAP_LOG_MAX_BF_REGS_PER_PAGE_OFFSET);
+               dev_cap->bf_regs_per_page = 1 << (field & 0x3f);
+               mlx4_dbg(dev, "BlueFlame available (reg size %d, regs/page %d)\n",
+                        dev_cap->bf_reg_size, dev_cap->bf_regs_per_page);
+       } else {
+               dev_cap->bf_reg_size = 0;
+               mlx4_dbg(dev, "BlueFlame not available\n");
+       }
+
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SG_SQ_OFFSET);
+       dev_cap->max_sq_sg = field;
+       MLX4_GET(size, outbox, QUERY_DEV_CAP_MAX_DESC_SZ_SQ_OFFSET);
+       dev_cap->max_sq_desc_sz = size;
+
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_MCG_OFFSET);
+       dev_cap->max_qp_per_mcg = 1 << field;
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MCG_OFFSET);
+       dev_cap->reserved_mgms = field & 0xf;
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MCG_OFFSET);
+       dev_cap->max_mcgs = 1 << field;
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_PD_OFFSET);
+       dev_cap->reserved_pds = field >> 4;
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_PD_OFFSET);
+       dev_cap->max_pds = 1 << (field & 0x3f);
+
+       MLX4_GET(size, outbox, QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET);
+       dev_cap->rdmarc_entry_sz = size;
+       MLX4_GET(size, outbox, QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET);
+       dev_cap->qpc_entry_sz = size;
+       MLX4_GET(size, outbox, QUERY_DEV_CAP_AUX_ENTRY_SZ_OFFSET);
+       dev_cap->aux_entry_sz = size;
+       MLX4_GET(size, outbox, QUERY_DEV_CAP_ALTC_ENTRY_SZ_OFFSET);
+       dev_cap->altc_entry_sz = size;
+       MLX4_GET(size, outbox, QUERY_DEV_CAP_EQC_ENTRY_SZ_OFFSET);
+       dev_cap->eqc_entry_sz = size;
+       MLX4_GET(size, outbox, QUERY_DEV_CAP_CQC_ENTRY_SZ_OFFSET);
+       dev_cap->cqc_entry_sz = size;
+       MLX4_GET(size, outbox, QUERY_DEV_CAP_SRQ_ENTRY_SZ_OFFSET);
+       dev_cap->srq_entry_sz = size;
+       MLX4_GET(size, outbox, QUERY_DEV_CAP_C_MPT_ENTRY_SZ_OFFSET);
+       dev_cap->cmpt_entry_sz = size;
+       MLX4_GET(size, outbox, QUERY_DEV_CAP_MTT_ENTRY_SZ_OFFSET);
+       dev_cap->mtt_entry_sz = size;
+       MLX4_GET(size, outbox, QUERY_DEV_CAP_D_MPT_ENTRY_SZ_OFFSET);
+       dev_cap->dmpt_entry_sz = size;
+
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SRQ_SZ_OFFSET);
+       dev_cap->max_srq_sz = 1 << field;
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_SZ_OFFSET);
+       dev_cap->max_qp_sz = 1 << field;
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_RSZ_SRQ_OFFSET);
+       dev_cap->resize_srq = field & 1;
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SG_RQ_OFFSET);
+       dev_cap->max_rq_sg = field;
+       MLX4_GET(size, outbox, QUERY_DEV_CAP_MAX_DESC_SZ_RQ_OFFSET);
+       dev_cap->max_rq_desc_sz = size;
+
+       MLX4_GET(dev_cap->bmme_flags, outbox,
+                QUERY_DEV_CAP_BMME_FLAGS_OFFSET);
+       MLX4_GET(dev_cap->reserved_lkey, outbox,
+                QUERY_DEV_CAP_RSVD_LKEY_OFFSET);
+       MLX4_GET(dev_cap->max_icm_sz, outbox,
+                QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET);
+
+       if (dev_cap->bmme_flags & 1)
+               mlx4_dbg(dev, "Base MM extensions: yes "
+                        "(flags %d, rsvd L_Key %08x)\n",
+                        dev_cap->bmme_flags, dev_cap->reserved_lkey);
+       else
+               mlx4_dbg(dev, "Base MM extensions: no\n");
+
+       /*
+        * Each UAR has 4 EQ doorbells; so if a UAR is reserved, then
+        * we can't use any EQs whose doorbell falls on that page,
+        * even if the EQ itself isn't reserved.
+        */
+       dev_cap->reserved_eqs = max(dev_cap->reserved_uars * 4,
+                                   dev_cap->reserved_eqs);
+
+       mlx4_dbg(dev, "Max ICM size %lld MB\n",
+                (unsigned long long) dev_cap->max_icm_sz >> 20);
+       mlx4_dbg(dev, "Max QPs: %d, reserved QPs: %d, entry size: %d\n",
+                dev_cap->max_qps, dev_cap->reserved_qps, dev_cap->qpc_entry_sz);
+       mlx4_dbg(dev, "Max SRQs: %d, reserved SRQs: %d, entry size: %d\n",
+                dev_cap->max_srqs, dev_cap->reserved_srqs, dev_cap->srq_entry_sz);
+       mlx4_dbg(dev, "Max CQs: %d, reserved CQs: %d, entry size: %d\n",
+                dev_cap->max_cqs, dev_cap->reserved_cqs, dev_cap->cqc_entry_sz);
+       mlx4_dbg(dev, "Max EQs: %d, reserved EQs: %d, entry size: %d\n",
+                dev_cap->max_eqs, dev_cap->reserved_eqs, dev_cap->eqc_entry_sz);
+       mlx4_dbg(dev, "reserved MPTs: %d, reserved MTTs: %d\n",
+                dev_cap->reserved_mrws, dev_cap->reserved_mtts);
+       mlx4_dbg(dev, "Max PDs: %d, reserved PDs: %d, reserved UARs: %d\n",
+                dev_cap->max_pds, dev_cap->reserved_pds, dev_cap->reserved_uars);
+       mlx4_dbg(dev, "Max QP/MCG: %d, reserved MGMs: %d\n",
+                dev_cap->max_pds, dev_cap->reserved_mgms);
+       mlx4_dbg(dev, "Max CQEs: %d, max WQEs: %d, max SRQ WQEs: %d\n",
+                dev_cap->max_cq_sz, dev_cap->max_qp_sz, dev_cap->max_srq_sz);
+       mlx4_dbg(dev, "Local CA ACK delay: %d, max MTU: %d, port width cap: %d\n",
+                dev_cap->local_ca_ack_delay, 128 << dev_cap->max_mtu,
+                dev_cap->max_port_width);
+       mlx4_dbg(dev, "Max SQ desc size: %d, max SQ S/G: %d\n",
+                dev_cap->max_sq_desc_sz, dev_cap->max_sq_sg);
+       mlx4_dbg(dev, "Max RQ desc size: %d, max RQ S/G: %d\n",
+                dev_cap->max_rq_desc_sz, dev_cap->max_rq_sg);
+
+       dump_dev_cap_flags(dev, dev_cap->flags);
+
+out:
+       mlx4_free_cmd_mailbox(dev, mailbox);
+       return err;
+}
+
+/*
+ * Hand the pages of the ICM area @icm to the firmware using the
+ * mapping command @op (the callers in this driver pass
+ * MLX4_CMD_MAP_FA, MLX4_CMD_MAP_ICM_AUX or MLX4_CMD_MAP_ICM).
+ *
+ * The mailbox is filled with 16-byte entries, each a pair of
+ * big-endian 64-bit words: the first holds @virt (it is left zero when
+ * @virt is -1), the second holds the address from mlx4_icm_addr() with
+ * the log2 page size, relative to MLX4_ICM_PAGE_SHIFT, OR'ed in.
+ * Each time the mailbox fills up the command is issued for the
+ * entries collected so far and filling restarts at entry 0.
+ *
+ * Returns 0 on success, -EINVAL if an area is not aligned to
+ * MLX4_ICM_PAGE_SIZE, or the error from the mailbox allocation or
+ * from mlx4_cmd().
+ */
+int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt)
+{
+       struct mlx4_cmd_mailbox *mailbox;
+       struct mlx4_icm_iter iter;
+       __be64 *pages;
+       int lg;
+       int nent = 0;
+       int i;
+       int err = 0;
+       /* ts: total KB mapped, tc: total entries; only used for debug output */
+       int ts = 0, tc = 0;
+
+       mailbox = mlx4_alloc_cmd_mailbox(dev);
+       if (IS_ERR(mailbox))
+               return PTR_ERR(mailbox);
+       memset(mailbox->buf, 0, MLX4_MAILBOX_SIZE);
+       pages = mailbox->buf;
+
+       for (mlx4_icm_first(icm, &iter);
+            !mlx4_icm_last(&iter);
+            mlx4_icm_next(&iter)) {
+               /*
+                * We have to pass pages that are aligned to their
+                * size, so find the least significant 1 in the
+                * address or size and use that as our log2 size.
+                */
+               lg = ffs(mlx4_icm_addr(&iter) | mlx4_icm_size(&iter)) - 1;
+               if (lg < MLX4_ICM_PAGE_SHIFT) {
+                       mlx4_warn(dev, "Got FW area not aligned to %d (%llx/%lx).\n",
+                                  MLX4_ICM_PAGE_SIZE,
+                                  (unsigned long long) mlx4_icm_addr(&iter),
+                                  mlx4_icm_size(&iter));
+                       err = -EINVAL;
+                       goto out;
+               }
+
+               /* Split the area into pieces of size 1 << lg, one entry each. */
+               for (i = 0; i < mlx4_icm_size(&iter) >> lg; ++i) {
+                       if (virt != -1) {
+                               pages[nent * 2] = cpu_to_be64(virt);
+                               virt += 1 << lg;
+                       }
+
+                       pages[nent * 2 + 1] =
+                               cpu_to_be64((mlx4_icm_addr(&iter) + (i << lg)) |
+                                           (lg - MLX4_ICM_PAGE_SHIFT));
+                       ts += 1 << (lg - 10);
+                       ++tc;
+
+                       /* Mailbox full (16 bytes per entry): issue the command and start over. */
+                       if (++nent == MLX4_MAILBOX_SIZE / 16) {
+                               err = mlx4_cmd(dev, mailbox->dma, nent, 0, op,
+                                               MLX4_CMD_TIME_CLASS_B);
+                               if (err)
+                                       goto out;
+                               nent = 0;
+                       }
+               }
+       }
+
+       /* Issue the command for the final, partially filled mailbox. */
+       if (nent)
+               err = mlx4_cmd(dev, mailbox->dma, nent, 0, op, MLX4_CMD_TIME_CLASS_B);
+       if (err)
+               goto out;
+
+       switch (op) {
+       case MLX4_CMD_MAP_FA:
+               mlx4_dbg(dev, "Mapped %d chunks/%d KB for FW.\n", tc, ts);
+               break;
+       case MLX4_CMD_MAP_ICM_AUX:
+               mlx4_dbg(dev, "Mapped %d chunks/%d KB for ICM aux.\n", tc, ts);
+               break;
+       case MLX4_CMD_MAP_ICM:
+               /* virt was advanced past the mapped range; step back to its start. */
+               mlx4_dbg(dev, "Mapped %d chunks/%d KB at %llx for ICM.\n",
+                         tc, ts, (unsigned long long) virt - (ts << 10));
+               break;
+       }
+
+out:
+       mlx4_free_cmd_mailbox(dev, mailbox);
+       return err;
+}
+
+/*
+ * Map the ICM area @icm with the MAP_FA command.  No virtual address
+ * is supplied (virt is passed as -1).  Returns mlx4_map_cmd()'s result.
+ */
+int mlx4_MAP_FA(struct mlx4_dev *dev, struct mlx4_icm *icm)
+{
+       return mlx4_map_cmd(dev, MLX4_CMD_MAP_FA, icm, -1);
+}
+
+/* Issue the UNMAP_FA firmware command; returns mlx4_cmd()'s result. */
+int mlx4_UNMAP_FA(struct mlx4_dev *dev)
+{
+       return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_UNMAP_FA, MLX4_CMD_TIME_CLASS_B);
+}
+
+
+/* Issue the RUN_FW firmware command; returns mlx4_cmd()'s result. */
+int mlx4_RUN_FW(struct mlx4_dev *dev)
+{
+       return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_RUN_FW, MLX4_CMD_TIME_CLASS_A);
+}
+
+/*
+ * Issue QUERY_FW and record what the firmware reports: the firmware
+ * version (dev->caps.fw_ver), the max number of commands
+ * (cmd->max_cmds), the location of the catastrophic error buffer, the
+ * location of the clear-interrupt register, and the size of the
+ * firmware area (fw->fw_pages, converted to system pages on return).
+ *
+ * Returns 0 on success, or the error from the mailbox allocation or
+ * from mlx4_cmd_box().
+ */
+int mlx4_QUERY_FW(struct mlx4_dev *dev)
+{
+       struct mlx4_fw  *fw  = &mlx4_priv(dev)->fw;
+       struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
+       struct mlx4_cmd_mailbox *mailbox;
+       u32 *outbox;
+       int err = 0;
+       u64 fw_ver;
+       u8 lg;
+
+#define QUERY_FW_OUT_SIZE             0x100
+#define QUERY_FW_VER_OFFSET            0x00
+#define QUERY_FW_MAX_CMD_OFFSET        0x0f
+#define QUERY_FW_ERR_START_OFFSET      0x30
+#define QUERY_FW_ERR_SIZE_OFFSET       0x38
+#define QUERY_FW_ERR_BAR_OFFSET        0x3c
+
+#define QUERY_FW_SIZE_OFFSET           0x00
+#define QUERY_FW_CLR_INT_BASE_OFFSET   0x20
+#define QUERY_FW_CLR_INT_BAR_OFFSET    0x28
+
+       mailbox = mlx4_alloc_cmd_mailbox(dev);
+       if (IS_ERR(mailbox))
+               return PTR_ERR(mailbox);
+       outbox = mailbox->buf;
+
+       err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_FW,
+                           MLX4_CMD_TIME_CLASS_A);
+       if (err)
+               goto out;
+
+       MLX4_GET(fw_ver, outbox, QUERY_FW_VER_OFFSET);
+       /*
+        * FW subminor version is at more significant bits than minor
+        * version, so swap here.
+        */
+       dev->caps.fw_ver = (fw_ver & 0xffff00000000ull) |
+               ((fw_ver & 0xffff0000ull) >> 16) |
+               ((fw_ver & 0x0000ffffull) << 16);
+
+       /* The firmware reports the command limit as a log2 value. */
+       MLX4_GET(lg, outbox, QUERY_FW_MAX_CMD_OFFSET);
+       cmd->max_cmds = 1 << lg;
+
+       mlx4_dbg(dev, "FW version %d.%d.%03d, max commands %d\n",
+                (int) (dev->caps.fw_ver >> 32),
+                (int) (dev->caps.fw_ver >> 16) & 0xffff,
+                (int) dev->caps.fw_ver & 0xffff,
+                cmd->max_cmds);
+
+       MLX4_GET(fw->catas_offset, outbox, QUERY_FW_ERR_START_OFFSET);
+       MLX4_GET(fw->catas_size,   outbox, QUERY_FW_ERR_SIZE_OFFSET);
+       MLX4_GET(fw->catas_bar,    outbox, QUERY_FW_ERR_BAR_OFFSET);
+       fw->catas_bar = (fw->catas_bar >> 6) * 2;
+
+       mlx4_dbg(dev, "Catastrophic error buffer at 0x%llx, size 0x%x, BAR %d\n",
+                (unsigned long long) fw->catas_offset, fw->catas_size, fw->catas_bar);
+
+       MLX4_GET(fw->fw_pages,     outbox, QUERY_FW_SIZE_OFFSET);
+       MLX4_GET(fw->clr_int_base, outbox, QUERY_FW_CLR_INT_BASE_OFFSET);
+       MLX4_GET(fw->clr_int_bar,  outbox, QUERY_FW_CLR_INT_BAR_OFFSET);
+       fw->clr_int_bar = (fw->clr_int_bar >> 6) * 2;
+
+       /*
+        * At this point fw_pages still counts pages of
+        * MLX4_ICM_PAGE_SIZE bytes, so scale up (not down) to get KB.
+        */
+       mlx4_dbg(dev, "FW size %d KB\n",
+                fw->fw_pages << (MLX4_ICM_PAGE_SHIFT - 10));
+
+       /*
+        * Round up number of system pages needed in case
+        * MLX4_ICM_PAGE_SIZE < PAGE_SIZE.
+        */
+       fw->fw_pages =
+               ALIGN(fw->fw_pages, PAGE_SIZE / MLX4_ICM_PAGE_SIZE) >>
+               (PAGE_SHIFT - MLX4_ICM_PAGE_SHIFT);
+
+       mlx4_dbg(dev, "Clear int @ %llx, BAR %d\n",
+                (unsigned long long) fw->clr_int_base, fw->clr_int_bar);
+
+out:
+       mlx4_free_cmd_mailbox(dev, mailbox);
+       return err;
+}
+
+/*
+ * Extract the board ID from the VSD area of the QUERY_ADAPTER output
+ * into board_id, which must have room for MLX4_BOARD_ID_LEN bytes.
+ */
+static void get_board_id(void *vsd, char *board_id)
+{
+       u32 *dst = (u32 *) board_id;
+       u32 *src;
+       int word;
+
+#define VSD_OFFSET_SIG1                0x00
+#define VSD_OFFSET_SIG2                0xde
+#define VSD_OFFSET_MLX_BOARD_ID        0xd0
+#define VSD_OFFSET_TS_BOARD_ID 0x20
+
+#define VSD_SIGNATURE_TOPSPIN  0x5ad
+
+       memset(board_id, 0, MLX4_BOARD_ID_LEN);
+
+       /* A VSD carrying both Topspin signatures holds the ID as a plain string. */
+       if (be16_to_cpup(vsd + VSD_OFFSET_SIG1) == VSD_SIGNATURE_TOPSPIN &&
+           be16_to_cpup(vsd + VSD_OFFSET_SIG2) == VSD_SIGNATURE_TOPSPIN) {
+               strlcpy(board_id, vsd + VSD_OFFSET_TS_BOARD_ID, MLX4_BOARD_ID_LEN);
+               return;
+       }
+
+       /*
+        * Otherwise the ID is a string whose 4-byte words were each
+        * byte swapped by the firmware; undo that for the four words
+        * so that the result is printable.
+        */
+       src = vsd + VSD_OFFSET_MLX_BOARD_ID;
+       for (word = 0; word < 4; ++word)
+               dst[word] = swab32(src[word]);
+}
+
+/*
+ * Issue QUERY_ADAPTER and fill @adapter with the vendor, device and
+ * revision IDs, the INTA pin and the board ID taken from the reply.
+ *
+ * Returns 0 on success, or the error from the mailbox allocation or
+ * from mlx4_cmd_box(); @adapter is left untouched on error.
+ */
+int mlx4_QUERY_ADAPTER(struct mlx4_dev *dev, struct mlx4_adapter *adapter)
+{
+       struct mlx4_cmd_mailbox *mailbox;
+       u32 *resp;
+       int err;
+
+#define QUERY_ADAPTER_OUT_SIZE             0x100
+#define QUERY_ADAPTER_VENDOR_ID_OFFSET     0x00
+#define QUERY_ADAPTER_DEVICE_ID_OFFSET     0x04
+#define QUERY_ADAPTER_REVISION_ID_OFFSET   0x08
+#define QUERY_ADAPTER_INTA_PIN_OFFSET      0x10
+#define QUERY_ADAPTER_VSD_OFFSET           0x20
+
+       mailbox = mlx4_alloc_cmd_mailbox(dev);
+       if (IS_ERR(mailbox))
+               return PTR_ERR(mailbox);
+
+       resp = mailbox->buf;
+       err  = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_ADAPTER,
+                           MLX4_CMD_TIME_CLASS_A);
+       if (!err) {
+               MLX4_GET(adapter->vendor_id,   resp, QUERY_ADAPTER_VENDOR_ID_OFFSET);
+               MLX4_GET(adapter->device_id,   resp, QUERY_ADAPTER_DEVICE_ID_OFFSET);
+               MLX4_GET(adapter->revision_id, resp, QUERY_ADAPTER_REVISION_ID_OFFSET);
+               MLX4_GET(adapter->inta_pin,    resp, QUERY_ADAPTER_INTA_PIN_OFFSET);
+
+               /* resp is a u32 pointer, so index it in 4-byte words. */
+               get_board_id(&resp[QUERY_ADAPTER_VSD_OFFSET / 4],
+                            adapter->board_id);
+       }
+
+       mlx4_free_cmd_mailbox(dev, mailbox);
+       return err;
+}
+
+/*
+ * Build the INIT_HCA input mailbox from @param and issue the command.
+ *
+ * The first INIT_HCA_IN_SIZE bytes of the mailbox are cleared, then the
+ * version byte, the flags word and the context-table base addresses
+ * and log2 sizes from @param are written at the offsets defined below.
+ *
+ * Returns 0 on success, or the error from the mailbox allocation or
+ * from mlx4_cmd() (the latter is also logged with mlx4_err()).
+ */
+int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
+{
+       struct mlx4_cmd_mailbox *mailbox;
+       __be32 *inbox;
+       int err;
+
+/* Byte offsets into the INIT_HCA mailbox; indented names are relative to a section base. */
+#define INIT_HCA_IN_SIZE                0x200
+#define INIT_HCA_VERSION_OFFSET                 0x000
+#define         INIT_HCA_VERSION                2
+#define INIT_HCA_FLAGS_OFFSET           0x014
+#define INIT_HCA_QPC_OFFSET             0x020
+#define         INIT_HCA_QPC_BASE_OFFSET        (INIT_HCA_QPC_OFFSET + 0x10)
+#define         INIT_HCA_LOG_QP_OFFSET          (INIT_HCA_QPC_OFFSET + 0x17)
+#define         INIT_HCA_SRQC_BASE_OFFSET       (INIT_HCA_QPC_OFFSET + 0x28)
+#define         INIT_HCA_LOG_SRQ_OFFSET         (INIT_HCA_QPC_OFFSET + 0x2f)
+#define         INIT_HCA_CQC_BASE_OFFSET        (INIT_HCA_QPC_OFFSET + 0x30)
+#define         INIT_HCA_LOG_CQ_OFFSET          (INIT_HCA_QPC_OFFSET + 0x37)
+#define         INIT_HCA_ALTC_BASE_OFFSET       (INIT_HCA_QPC_OFFSET + 0x40)
+#define         INIT_HCA_AUXC_BASE_OFFSET       (INIT_HCA_QPC_OFFSET + 0x50)
+#define         INIT_HCA_EQC_BASE_OFFSET        (INIT_HCA_QPC_OFFSET + 0x60)
+#define         INIT_HCA_LOG_EQ_OFFSET          (INIT_HCA_QPC_OFFSET + 0x67)
+#define         INIT_HCA_RDMARC_BASE_OFFSET     (INIT_HCA_QPC_OFFSET + 0x70)
+#define         INIT_HCA_LOG_RD_OFFSET          (INIT_HCA_QPC_OFFSET + 0x77)
+#define INIT_HCA_MCAST_OFFSET           0x0c0
+#define         INIT_HCA_MC_BASE_OFFSET         (INIT_HCA_MCAST_OFFSET + 0x00)
+#define         INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x12)
+#define         INIT_HCA_LOG_MC_HASH_SZ_OFFSET  (INIT_HCA_MCAST_OFFSET + 0x16)
+#define         INIT_HCA_LOG_MC_TABLE_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x1b)
+#define INIT_HCA_TPT_OFFSET             0x0f0
+#define         INIT_HCA_DMPT_BASE_OFFSET       (INIT_HCA_TPT_OFFSET + 0x00)
+#define         INIT_HCA_LOG_MPT_SZ_OFFSET      (INIT_HCA_TPT_OFFSET + 0x0b)
+#define         INIT_HCA_MTT_BASE_OFFSET        (INIT_HCA_TPT_OFFSET + 0x10)
+#define         INIT_HCA_CMPT_BASE_OFFSET       (INIT_HCA_TPT_OFFSET + 0x18)
+#define INIT_HCA_UAR_OFFSET             0x120
+#define         INIT_HCA_LOG_UAR_SZ_OFFSET      (INIT_HCA_UAR_OFFSET + 0x0a)
+#define  INIT_HCA_UAR_PAGE_SZ_OFFSET     (INIT_HCA_UAR_OFFSET + 0x0b)
+
+       mailbox = mlx4_alloc_cmd_mailbox(dev);
+       if (IS_ERR(mailbox))
+               return PTR_ERR(mailbox);
+       inbox = mailbox->buf;
+
+       memset(inbox, 0, INIT_HCA_IN_SIZE);
+
+       *((u8 *) mailbox->buf + INIT_HCA_VERSION_OFFSET) = INIT_HCA_VERSION;
+
+       /* Flags bit 1 tells the device the host byte order: set on big-endian, clear on little-endian. */
+#if defined(__LITTLE_ENDIAN)
+       *(inbox + INIT_HCA_FLAGS_OFFSET / 4) &= ~cpu_to_be32(1 << 1);
+#elif defined(__BIG_ENDIAN)
+       *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 1);
+#else
+#error Host endianness not defined
+#endif
+       /* Check port for UD address vector: */
+       *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1);
+
+       /* QPC/EEC/CQC/EQC/RDMARC attributes */
+
+       MLX4_PUT(inbox, param->qpc_base,      INIT_HCA_QPC_BASE_OFFSET);
+       MLX4_PUT(inbox, param->log_num_qps,   INIT_HCA_LOG_QP_OFFSET);
+       MLX4_PUT(inbox, param->srqc_base,     INIT_HCA_SRQC_BASE_OFFSET);
+       MLX4_PUT(inbox, param->log_num_srqs,  INIT_HCA_LOG_SRQ_OFFSET);
+       MLX4_PUT(inbox, param->cqc_base,      INIT_HCA_CQC_BASE_OFFSET);
+       MLX4_PUT(inbox, param->log_num_cqs,   INIT_HCA_LOG_CQ_OFFSET);
+       MLX4_PUT(inbox, param->altc_base,     INIT_HCA_ALTC_BASE_OFFSET);
+       MLX4_PUT(inbox, param->auxc_base,     INIT_HCA_AUXC_BASE_OFFSET);
+       MLX4_PUT(inbox, param->eqc_base,      INIT_HCA_EQC_BASE_OFFSET);
+       MLX4_PUT(inbox, param->log_num_eqs,   INIT_HCA_LOG_EQ_OFFSET);
+       MLX4_PUT(inbox, param->rdmarc_base,   INIT_HCA_RDMARC_BASE_OFFSET);
+       MLX4_PUT(inbox, param->log_rd_per_qp, INIT_HCA_LOG_RD_OFFSET);
+
+       /* multicast attributes */
+
+       MLX4_PUT(inbox, param->mc_base,         INIT_HCA_MC_BASE_OFFSET);
+       MLX4_PUT(inbox, param->log_mc_entry_sz, INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET);
+       MLX4_PUT(inbox, param->log_mc_hash_sz,  INIT_HCA_LOG_MC_HASH_SZ_OFFSET);
+       MLX4_PUT(inbox, param->log_mc_table_sz, INIT_HCA_LOG_MC_TABLE_SZ_OFFSET);
+
+       /* TPT attributes */
+
+       MLX4_PUT(inbox, param->dmpt_base,  INIT_HCA_DMPT_BASE_OFFSET);
+       MLX4_PUT(inbox, param->log_mpt_sz, INIT_HCA_LOG_MPT_SZ_OFFSET);
+       MLX4_PUT(inbox, param->mtt_base,   INIT_HCA_MTT_BASE_OFFSET);
+       MLX4_PUT(inbox, param->cmpt_base,  INIT_HCA_CMPT_BASE_OFFSET);
+
+       /* UAR attributes */
+
+       /* UAR page size is passed as log2 relative to 4 KB (PAGE_SHIFT - 12). */
+       MLX4_PUT(inbox, (u8) (PAGE_SHIFT - 12), INIT_HCA_UAR_PAGE_SZ_OFFSET);
+       MLX4_PUT(inbox, param->log_uar_sz,      INIT_HCA_LOG_UAR_SZ_OFFSET);
+
+       /*
+        * NOTE(review): the literal 1000 sits where other callers pass an
+        * MLX4_CMD_TIME_CLASS_* value, so it is presumably a timeout;
+        * confirm the units against mlx4_cmd().
+        */
+       err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_INIT_HCA, 1000);
+
+       if (err)
+               mlx4_err(dev, "INIT_HCA returns %d\n", err);
+
+       mlx4_free_cmd_mailbox(dev, mailbox);
+       return err;
+}
+
+/*
+ * Build the INIT_PORT input mailbox from @param and issue the command
+ * for port @port.
+ *
+ * Returns 0 on success, or the error from the mailbox allocation or
+ * from mlx4_cmd().
+ */
+int mlx4_INIT_PORT(struct mlx4_dev *dev, struct mlx4_init_port_param *param, int port)
+{
+       struct mlx4_cmd_mailbox *mailbox;
+       u32 *inbox;
+       u32 flags;
+       int err;
+
+#define INIT_PORT_IN_SIZE          256
+#define INIT_PORT_FLAGS_OFFSET     0x00
+#define INIT_PORT_FLAG_SIG         (1 << 18)
+#define INIT_PORT_FLAG_NG          (1 << 17)
+#define INIT_PORT_FLAG_G0          (1 << 16)
+#define INIT_PORT_VL_SHIFT         4
+#define INIT_PORT_PORT_WIDTH_SHIFT 8
+#define INIT_PORT_MTU_OFFSET       0x04
+#define INIT_PORT_MAX_GID_OFFSET   0x06
+#define INIT_PORT_MAX_PKEY_OFFSET  0x0a
+#define INIT_PORT_GUID0_OFFSET     0x10
+#define INIT_PORT_NODE_GUID_OFFSET 0x18
+#define INIT_PORT_SI_GUID_OFFSET   0x20
+
+       mailbox = mlx4_alloc_cmd_mailbox(dev);
+       if (IS_ERR(mailbox))
+               return PTR_ERR(mailbox);
+
+       inbox = mailbox->buf;
+       memset(inbox, 0, INIT_PORT_IN_SIZE);
+
+       /* 4-bit capability fields first, then one flag bit per GUID to be set. */
+       flags  = (param->vl_cap & 0xf) << INIT_PORT_VL_SHIFT;
+       flags |= (param->port_width_cap & 0xf) << INIT_PORT_PORT_WIDTH_SHIFT;
+       if (param->set_guid0)
+               flags |= INIT_PORT_FLAG_G0;
+       if (param->set_node_guid)
+               flags |= INIT_PORT_FLAG_NG;
+       if (param->set_si_guid)
+               flags |= INIT_PORT_FLAG_SIG;
+
+       MLX4_PUT(inbox, flags,            INIT_PORT_FLAGS_OFFSET);
+       MLX4_PUT(inbox, param->mtu,       INIT_PORT_MTU_OFFSET);
+       MLX4_PUT(inbox, param->max_gid,   INIT_PORT_MAX_GID_OFFSET);
+       MLX4_PUT(inbox, param->max_pkey,  INIT_PORT_MAX_PKEY_OFFSET);
+       MLX4_PUT(inbox, param->guid0,     INIT_PORT_GUID0_OFFSET);
+       MLX4_PUT(inbox, param->node_guid, INIT_PORT_NODE_GUID_OFFSET);
+       MLX4_PUT(inbox, param->si_guid,   INIT_PORT_SI_GUID_OFFSET);
+
+       err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_INIT_PORT,
+                      MLX4_CMD_TIME_CLASS_A);
+
+       mlx4_free_cmd_mailbox(dev, mailbox);
+       return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_INIT_PORT);
+
+/*
+ * Issue the CLOSE_PORT firmware command for port @port; returns
+ * mlx4_cmd()'s result.
+ */
+int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port)
+{
+       return mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, 1000);
+}
+EXPORT_SYMBOL_GPL(mlx4_CLOSE_PORT);
+
+/*
+ * Issue the CLOSE_HCA firmware command, passing @panic through to the
+ * command; returns mlx4_cmd()'s result.
+ */
+int mlx4_CLOSE_HCA(struct mlx4_dev *dev, int panic)
+{
+       return mlx4_cmd(dev, 0, 0, panic, MLX4_CMD_CLOSE_HCA, 1000);
+}
+
+/*
+ * Issue SET_ICM_SIZE with @icm_size.  The command's immediate output
+ * lands in *@aux_pages as a count of MLX4_ICM_PAGE_SIZE pages, which
+ * is then converted to a count of system pages.
+ *
+ * Returns 0 on success or the error from mlx4_cmd_imm().
+ */
+int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages)
+{
+       u64 icm_pages;
+       int err;
+
+       err = mlx4_cmd_imm(dev, icm_size, aux_pages, 0, 0,
+                          MLX4_CMD_SET_ICM_SIZE, MLX4_CMD_TIME_CLASS_A);
+       if (err)
+               return err;
+
+       /*
+        * When MLX4_ICM_PAGE_SIZE < PAGE_SIZE several ICM pages share
+        * one system page, so round up before converting.
+        */
+       icm_pages  = ALIGN(*aux_pages, PAGE_SIZE / MLX4_ICM_PAGE_SIZE);
+       *aux_pages = icm_pages >> (PAGE_SHIFT - MLX4_ICM_PAGE_SHIFT);
+
+       return 0;
+}
+
+/* Issue the NOP firmware command; returns mlx4_cmd()'s result. */
+int mlx4_NOP(struct mlx4_dev *dev)
+{
+       /* Input modifier of 0x1f means "finish as soon as possible." */
+       return mlx4_cmd(dev, 0, 0x1f, 0, MLX4_CMD_NOP, 100);
+}
diff --git a/drivers/net/mlx4/fw.h b/drivers/net/mlx4/fw.h
new file mode 100644 (file)
index 0000000..2616fa5
--- /dev/null
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_FW_H
+#define MLX4_FW_H
+
+#include "mlx4.h"
+#include "icm.h"
+
+/*
+ * Device capabilities and limits, filled in by mlx4_QUERY_DEV_CAP()
+ * from the QUERY_DEV_CAP output mailbox.
+ */
+struct mlx4_dev_cap {
+       int max_srq_sz;         /* stored as a count: 1 << value read from firmware */
+       int max_qp_sz;          /* stored as a count: 1 << value read from firmware */
+       int reserved_qps;
+       int max_qps;
+       int reserved_srqs;
+       int max_srqs;
+       int max_cq_sz;
+       int reserved_cqs;
+       int max_cqs;
+       int max_mpts;
+       int reserved_eqs;       /* raised to at least 4 * reserved_uars */
+       int max_eqs;
+       int reserved_mtts;
+       int max_mrw_sz;
+       int reserved_mrws;
+       int max_mtt_seg;
+       int max_requester_per_qp;
+       int max_responder_per_qp;
+       int max_rdma_global;
+       int local_ca_ack_delay;
+       int max_mtu;            /* encoded: the debug output prints 128 << max_mtu */
+       int max_port_width;
+       int max_vl;
+       int num_ports;
+       int max_gids;
+       u16 stat_rate_support;
+       int max_pkeys;
+       u32 flags;
+       int reserved_uars;
+       int uar_size;
+       int min_page_sz;
+       int bf_reg_size;
+       int bf_regs_per_page;
+       int max_sq_sg;
+       int max_sq_desc_sz;
+       int max_rq_sg;
+       int max_rq_desc_sz;
+       int max_qp_per_mcg;
+       int reserved_mgms;
+       int max_mcgs;
+       int reserved_pds;
+       int max_pds;            /* stored as a count: 1 << (field & 0x3f) */
+       /* Per-object context entry sizes as read from the firmware. */
+       int qpc_entry_sz;
+       int rdmarc_entry_sz;
+       int altc_entry_sz;
+       int aux_entry_sz;
+       int srq_entry_sz;
+       int cqc_entry_sz;
+       int eqc_entry_sz;
+       int dmpt_entry_sz;
+       int cmpt_entry_sz;
+       int mtt_entry_sz;
+       int resize_srq;         /* 0 or 1 (low bit of the firmware field) */
+       u8  bmme_flags;         /* bit 0 set: base MM extensions reported */
+       u32 reserved_lkey;
+       u64 max_icm_sz;         /* the debug output prints this >> 20 as MB */
+};
+
+/* Adapter identification, filled in by mlx4_QUERY_ADAPTER(). */
+struct mlx4_adapter {
+       u32  vendor_id;
+       u32  device_id;
+       u32  revision_id;
+       char board_id[MLX4_BOARD_ID_LEN];       /* extracted from the VSD by get_board_id() */
+       u8   inta_pin;
+};
+
+/*
+ * Parameters for mlx4_INIT_HCA(), which writes them into the INIT_HCA
+ * input mailbox: the *_base fields are table base addresses and the
+ * log_* fields are log2 sizes/counts.
+ */
+struct mlx4_init_hca_param {
+       u64 qpc_base;
+       u64 rdmarc_base;
+       u64 auxc_base;
+       u64 altc_base;
+       u64 srqc_base;
+       u64 cqc_base;
+       u64 eqc_base;
+       u64 mc_base;
+       u64 dmpt_base;
+       u64 cmpt_base;
+       u64 mtt_base;
+       u16 log_mc_entry_sz;
+       u16 log_mc_hash_sz;
+       u8  log_num_qps;
+       u8  log_num_srqs;
+       u8  log_num_cqs;
+       u8  log_num_eqs;
+       u8  log_rd_per_qp;
+       u8  log_mc_table_sz;
+       u8  log_mpt_sz;
+       u8  log_uar_sz;
+};
+
+/*
+ * IB port initialization parameters.
+ * NOTE(review): no user of this struct is visible in fw.c or icm.c
+ * (mlx4_INIT_PORT() takes a struct mlx4_init_port_param instead);
+ * confirm whether it is still needed.
+ */
+struct mlx4_init_ib_param {
+       int port_width;
+       int vl_cap;
+       int mtu_cap;
+       u16 gid_cap;
+       u16 pkey_cap;
+       int set_guid0;
+       u64 guid0;
+       int set_node_guid;
+       u64 node_guid;
+       int set_si_guid;
+       u64 si_guid;
+};
+
+/*
+ * IB port attribute update parameters.
+ * NOTE(review): no user of this struct is visible in fw.c or icm.c;
+ * confirm whether it is still needed.
+ */
+struct mlx4_set_ib_param {
+       int set_si_guid;
+       int reset_qkey_viol;
+       u64 si_guid;
+       u32 cap_mask;
+};
+
+/*
+ * Firmware command wrappers.  Each returns 0 on success and a non-zero
+ * error code otherwise.  mlx4_MAP_ICM_AUX() and mlx4_UNMAP_ICM_AUX()
+ * are implemented in icm.c; the others are implemented in fw.c.
+ */
+int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap);
+int mlx4_MAP_FA(struct mlx4_dev *dev, struct mlx4_icm *icm);
+int mlx4_UNMAP_FA(struct mlx4_dev *dev);
+int mlx4_RUN_FW(struct mlx4_dev *dev);
+int mlx4_QUERY_FW(struct mlx4_dev *dev);
+int mlx4_QUERY_ADAPTER(struct mlx4_dev *dev, struct mlx4_adapter *adapter);
+int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param);
+int mlx4_CLOSE_HCA(struct mlx4_dev *dev, int panic);
+/* Shared helper behind the MAP_FA / MAP_ICM / MAP_ICM_AUX wrappers; virt == -1 means "no virtual address". */
+int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt);
+int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages);
+int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm);
+int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev);
+int mlx4_NOP(struct mlx4_dev *dev);
+
+#endif /* MLX4_FW_H */
diff --git a/drivers/net/mlx4/icm.c b/drivers/net/mlx4/icm.c
new file mode 100644 (file)
index 0000000..e96feae
--- /dev/null
@@ -0,0 +1,379 @@
+/*
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/errno.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+#include "icm.h"
+#include "fw.h"
+
+/*
+ * We allocate in as big chunks as we can, up to a maximum of 256 KB
+ * per chunk.
+ *
+ * MLX4_ICM_ALLOC_SIZE sets the largest page allocation that
+ * mlx4_alloc_icm() attempts; MLX4_TABLE_CHUNK_SIZE is the granularity
+ * at which mlx4_table_get()/mlx4_table_put() allocate, map and release
+ * pieces of an ICM table.
+ */
+enum {
+       MLX4_ICM_ALLOC_SIZE     = 1 << 18,
+       MLX4_TABLE_CHUNK_SIZE   = 1 << 18
+};
+
+/*
+ * Release everything owned by @icm: undo the DMA mapping of each chunk
+ * that was mapped, free the chunk's pages and the chunk itself, and
+ * finally free @icm.
+ */
+void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm)
+{
+       struct mlx4_icm_chunk *cur, *next;
+       int pg;
+
+       list_for_each_entry_safe(cur, next, &icm->chunk_list, list) {
+               /* nsg stays at 0 until pci_map_sg() has succeeded for the chunk. */
+               if (cur->nsg > 0)
+                       pci_unmap_sg(dev->pdev, cur->mem, cur->npages,
+                                    PCI_DMA_BIDIRECTIONAL);
+
+               pg = 0;
+               while (pg < cur->npages) {
+                       __free_pages(cur->mem[pg].page,
+                                    get_order(cur->mem[pg].length));
+                       ++pg;
+               }
+
+               kfree(cur);
+       }
+
+       kfree(icm);
+}
+
+/*
+ * Allocate @npages system pages of ICM memory and map them for DMA.
+ *
+ * Pages are allocated in power-of-two runs, starting at the order of
+ * MLX4_ICM_ALLOC_SIZE and dropping to smaller orders when fewer pages
+ * remain or when an allocation fails.  Runs are collected into chunks
+ * of up to MLX4_ICM_CHUNK_LEN scatterlist entries; each chunk is
+ * mapped with pci_map_sg() once it is full (or at the end).
+ *
+ * Returns the new ICM descriptor, or NULL on failure, in which case
+ * everything allocated so far has been released by mlx4_free_icm().
+ */
+struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages,
+                               gfp_t gfp_mask)
+{
+       struct mlx4_icm *icm;
+       struct mlx4_icm_chunk *chunk = NULL;
+       int cur_order;
+
+       /* The bookkeeping structures are kmalloc'ed without __GFP_HIGHMEM/__GFP_NOWARN. */
+       icm = kmalloc(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
+       if (!icm)
+               return icm;
+
+       icm->refcount = 0;
+       INIT_LIST_HEAD(&icm->chunk_list);
+
+       cur_order = get_order(MLX4_ICM_ALLOC_SIZE);
+
+       while (npages > 0) {
+               /* Start a new chunk if there is none or the last one was filled. */
+               if (!chunk) {
+                       chunk = kmalloc(sizeof *chunk,
+                                       gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
+                       if (!chunk)
+                               goto fail;
+
+                       chunk->npages = 0;
+                       chunk->nsg    = 0;
+                       list_add_tail(&chunk->list, &icm->chunk_list);
+               }
+
+               /* Never allocate more pages than are still needed. */
+               while (1 << cur_order > npages)
+                       --cur_order;
+
+               chunk->mem[chunk->npages].page = alloc_pages(gfp_mask, cur_order);
+               if (chunk->mem[chunk->npages].page) {
+                       chunk->mem[chunk->npages].length = PAGE_SIZE << cur_order;
+                       chunk->mem[chunk->npages].offset = 0;
+
+                       /* Chunk is full: map it now and force a new chunk next time. */
+                       if (++chunk->npages == MLX4_ICM_CHUNK_LEN) {
+                               chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
+                                                       chunk->npages,
+                                                       PCI_DMA_BIDIRECTIONAL);
+
+                               if (chunk->nsg <= 0)
+                                       goto fail;
+
+                               chunk = NULL;
+                       }
+
+                       npages -= 1 << cur_order;
+               } else {
+                       /* Allocation failed at this order: retry with a smaller one. */
+                       --cur_order;
+                       if (cur_order < 0)
+                               goto fail;
+               }
+       }
+
+       /* Map the last, partially filled chunk. */
+       if (chunk) {
+               chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
+                                       chunk->npages,
+                                       PCI_DMA_BIDIRECTIONAL);
+
+               if (chunk->nsg <= 0)
+                       goto fail;
+       }
+
+       return icm;
+
+fail:
+       mlx4_free_icm(dev, icm);
+       return NULL;
+}
+
+/*
+ * Map the ICM area @icm with the MAP_ICM command, starting at virtual
+ * address @virt.  Returns mlx4_map_cmd()'s result.
+ */
+static int mlx4_MAP_ICM(struct mlx4_dev *dev, struct mlx4_icm *icm, u64 virt)
+{
+       return mlx4_map_cmd(dev, MLX4_CMD_MAP_ICM, icm, virt);
+}
+
+/*
+ * Issue the UNMAP_ICM firmware command for @page_count pages starting
+ * at virtual address @virt.  Returns mlx4_cmd()'s result.
+ */
+int mlx4_UNMAP_ICM(struct mlx4_dev *dev, u64 virt, u32 page_count)
+{
+       return mlx4_cmd(dev, virt, page_count, 0, MLX4_CMD_UNMAP_ICM,
+                       MLX4_CMD_TIME_CLASS_B);
+}
+
+/*
+ * Map the single page at @dma_addr to ICM virtual address @virt with
+ * a one-entry MAP_ICM command.
+ *
+ * Returns 0 on success, or the error from the mailbox allocation or
+ * from mlx4_cmd().
+ */
+int mlx4_MAP_ICM_page(struct mlx4_dev *dev, u64 dma_addr, u64 virt)
+{
+       struct mlx4_cmd_mailbox *mailbox = mlx4_alloc_cmd_mailbox(dev);
+       __be64 *entry;
+       int err;
+
+       if (IS_ERR(mailbox))
+               return PTR_ERR(mailbox);
+
+       /* One mapping entry: virtual address first, then the DMA address. */
+       entry    = mailbox->buf;
+       entry[0] = cpu_to_be64(virt);
+       entry[1] = cpu_to_be64(dma_addr);
+
+       err = mlx4_cmd(dev, mailbox->dma, 1, 0, MLX4_CMD_MAP_ICM,
+                      MLX4_CMD_TIME_CLASS_B);
+
+       mlx4_free_cmd_mailbox(dev, mailbox);
+       if (err)
+               return err;
+
+       mlx4_dbg(dev, "Mapped page at %llx to %llx for ICM.\n",
+                 (unsigned long long) dma_addr, (unsigned long long) virt);
+       return 0;
+}
+
+/*
+ * Map the ICM area @icm with the MAP_ICM_AUX command.  No virtual
+ * address is supplied (virt is passed as -1).  Returns mlx4_map_cmd()'s
+ * result.
+ */
+int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm)
+{
+       return mlx4_map_cmd(dev, MLX4_CMD_MAP_ICM_AUX, icm, -1);
+}
+
+/* Issue the UNMAP_ICM_AUX firmware command; returns mlx4_cmd()'s result. */
+int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev)
+{
+       return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_UNMAP_ICM_AUX, MLX4_CMD_TIME_CLASS_B);
+}
+
+/*
+ * Take a reference on the table chunk that holds object @obj,
+ * allocating the chunk and mapping it into the device's ICM space
+ * first if it is not present yet.
+ *
+ * Returns 0 on success, or -ENOMEM if the chunk could not be
+ * allocated or mapped.
+ */
+int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj)
+{
+       int slot = (obj & (table->num_obj - 1)) / (MLX4_TABLE_CHUNK_SIZE / table->obj_size);
+       gfp_t gfp;
+       int ret = 0;
+
+       mutex_lock(&table->mutex);
+
+       if (!table->icm[slot]) {
+               gfp = (table->lowmem ? GFP_KERNEL : GFP_HIGHUSER) | __GFP_NOWARN;
+
+               table->icm[slot] = mlx4_alloc_icm(dev,
+                                                 MLX4_TABLE_CHUNK_SIZE >> PAGE_SHIFT,
+                                                 gfp);
+               if (!table->icm[slot]) {
+                       ret = -ENOMEM;
+                       goto unlock;
+               }
+
+               if (mlx4_MAP_ICM(dev, table->icm[slot],
+                                table->virt + (u64) slot * MLX4_TABLE_CHUNK_SIZE)) {
+                       /* Mapping failed: drop the fresh chunk again. */
+                       mlx4_free_icm(dev, table->icm[slot]);
+                       table->icm[slot] = NULL;
+                       ret = -ENOMEM;
+                       goto unlock;
+               }
+       }
+
+       /* Chunk is present (pre-existing or just created): count this user. */
+       ++table->icm[slot]->refcount;
+
+unlock:
+       mutex_unlock(&table->mutex);
+       return ret;
+}
+
+/*
+ * Drop the reference taken by mlx4_table_get() on the table chunk that
+ * holds object @obj.  When the last reference goes away the chunk is
+ * unmapped from the device's ICM space and its memory is freed.
+ */
+void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj)
+{
+       int i;
+
+       i = (obj & (table->num_obj - 1)) / (MLX4_TABLE_CHUNK_SIZE / table->obj_size);
+
+       mutex_lock(&table->mutex);
+
+       if (--table->icm[i]->refcount == 0) {
+               /*
+                * Compute the chunk offset in 64 bits, exactly as
+                * mlx4_table_get() does when mapping; an int product
+                * would overflow for large tables and unmap the wrong
+                * address.
+                */
+               mlx4_UNMAP_ICM(dev, table->virt + (u64) i * MLX4_TABLE_CHUNK_SIZE,
+                              MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE);
+               mlx4_free_icm(dev, table->icm[i]);
+               table->icm[i] = NULL;
+       }
+
+       mutex_unlock(&table->mutex);
+}
+
+/*
+ * Return the kernel address of object @obj inside @table, or NULL if
+ * the table is not a lowmem table, the chunk holding the object is not
+ * present, or the object lies beyond the memory recorded in the chunk.
+ */
+void *mlx4_table_find(struct mlx4_icm_table *table, int obj)
+{
+       int idx, offset, i;
+       struct mlx4_icm_chunk *chunk;
+       struct mlx4_icm *icm;
+       struct page *page = NULL;
+
+       if (!table->lowmem)
+               return NULL;
+
+       mutex_lock(&table->mutex);
+
+       idx = obj & (table->num_obj - 1);
+       icm = table->icm[idx / (MLX4_TABLE_CHUNK_SIZE / table->obj_size)];
+       /*
+        * Byte offset of the object within its chunk.  The index within
+        * the chunk has to be scaled by the object size because the
+        * offset is compared against scatterlist lengths and added to a
+        * page address below, both of which are in bytes.
+        */
+       offset = (idx % (MLX4_TABLE_CHUNK_SIZE / table->obj_size)) * table->obj_size;
+
+       if (!icm)
+               goto out;
+
+       /* Walk the chunk's memory until the entry containing offset. */
+       list_for_each_entry(chunk, &icm->chunk_list, list) {
+               for (i = 0; i < chunk->npages; ++i) {
+                       if (chunk->mem[i].length > offset) {
+                               page = chunk->mem[i].page;
+                               goto out;
+                       }
+                       offset -= chunk->mem[i].length;
+               }
+       }
+
+out:
+       mutex_unlock(&table->mutex);
+       return page ? lowmem_page_address(page) + offset : NULL;
+}
+
+/*
+ * Take a reference on every chunk touched by objects @start..@end
+ * (inclusive), stepping one chunk's worth of objects at a time.
+ *
+ * Returns 0 on success.  If mlx4_table_get() fails, the references
+ * already taken by this call are dropped and its error is returned.
+ */
+int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+                        int start, int end)
+{
+       int step = MLX4_TABLE_CHUNK_SIZE / table->obj_size;
+       int obj;
+       int err;
+
+       for (obj = start; obj <= end; obj += step) {
+               err = mlx4_table_get(dev, table, obj);
+               if (!err)
+                       continue;
+
+               /* Undo the references taken before the failure. */
+               while (obj > start) {
+                       obj -= step;
+                       mlx4_table_put(dev, table, obj);
+               }
+
+               return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Drop one reference on every chunk touched by objects @start..@end
+ * (inclusive), stepping one chunk's worth of objects at a time.
+ */
+void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+                         int start, int end)
+{
+       int step = MLX4_TABLE_CHUNK_SIZE / table->obj_size;
+       int obj = start;
+
+       while (obj <= end) {
+               mlx4_table_put(dev, table, obj);
+               obj += step;
+       }
+}
+
+/*
+ * Initialize an ICM table descriptor and pin the chunks that hold the
+ * first @reserved objects.
+ *
+ * @dev:        device the table belongs to
+ * @table:      descriptor to fill in
+ * @virt:       ICM virtual address of the start of the table
+ * @obj_size:   size of one object in bytes
+ * @nobj:       number of objects in the table
+ * @reserved:   number of leading objects whose chunks are allocated,
+ *              mapped and given a permanent reference here
+ * @use_lowmem: nonzero to allocate chunks with GFP_KERNEL rather than
+ *              GFP_HIGHUSER
+ *
+ * Returns 0 on success or -ENOMEM on failure.  On failure every chunk
+ * set up so far is unmapped and freed and the chunk pointer array is
+ * released, so nothing is left for the caller to clean up.
+ */
+int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+                       u64 virt, int obj_size, int nobj, int reserved,
+                       int use_lowmem)
+{
+       int obj_per_chunk;
+       int num_icm;
+       unsigned chunk_size;
+       u64 size;
+       u64 reserved_size;
+       u64 offset;
+       int i;
+
+       obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
+       num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
+
+       table->icm      = kcalloc(num_icm, sizeof *table->icm, GFP_KERNEL);
+       if (!table->icm)
+               return -ENOMEM;
+       table->virt     = virt;
+       table->num_icm  = num_icm;
+       table->num_obj  = nobj;
+       table->obj_size = obj_size;
+       table->lowmem   = use_lowmem;
+       mutex_init(&table->mutex);
+
+       /*
+        * Byte sizes and offsets are kept in 64 bits: products such as
+        * nobj * obj_size or i * MLX4_TABLE_CHUNK_SIZE can exceed the
+        * range of int for large tables.
+        */
+       size          = (u64) nobj * obj_size;
+       reserved_size = (u64) reserved * obj_size;
+
+       for (i = 0; (u64) i * MLX4_TABLE_CHUNK_SIZE < reserved_size; ++i) {
+               offset = (u64) i * MLX4_TABLE_CHUNK_SIZE;
+
+               /* The final chunk of the table may be a partial one. */
+               chunk_size = MLX4_TABLE_CHUNK_SIZE;
+               if (offset + MLX4_TABLE_CHUNK_SIZE > size)
+                       chunk_size = PAGE_ALIGN(size - offset);
+
+               table->icm[i] = mlx4_alloc_icm(dev, chunk_size >> PAGE_SHIFT,
+                                              (use_lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
+                                              __GFP_NOWARN);
+               if (!table->icm[i])
+                       goto err;
+               if (mlx4_MAP_ICM(dev, table->icm[i], virt + offset)) {
+                       mlx4_free_icm(dev, table->icm[i]);
+                       table->icm[i] = NULL;
+                       goto err;
+               }
+
+               /*
+                * Add a reference to this ICM chunk so that it never
+                * gets freed (since it contains reserved firmware objects).
+                */
+               ++table->icm[i]->refcount;
+       }
+
+       return 0;
+
+err:
+       for (i = 0; i < num_icm; ++i)
+               if (table->icm[i]) {
+                       mlx4_UNMAP_ICM(dev, virt + (u64) i * MLX4_TABLE_CHUNK_SIZE,
+                                      MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE);
+                       mlx4_free_icm(dev, table->icm[i]);
+               }
+
+       /* The pointer array itself was allocated above; don't leak it. */
+       kfree(table->icm);
+
+       return -ENOMEM;
+}
+
+/*
+ * Tear down an ICM table: unmap and free every chunk that is still
+ * present, regardless of its reference count, then free the chunk
+ * pointer array.
+ */
+void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table)
+{
+       int i;
+
+       for (i = 0; i < table->num_icm; ++i)
+               if (table->icm[i]) {
+                       /* 64-bit offset, as used when the chunk was mapped. */
+                       mlx4_UNMAP_ICM(dev, table->virt + (u64) i * MLX4_TABLE_CHUNK_SIZE,
+                                      MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE);
+                       mlx4_free_icm(dev, table->icm[i]);
+               }
+
+       kfree(table->icm);
+}
diff --git a/drivers/net/mlx4/icm.h b/drivers/net/mlx4/icm.h
new file mode 100644 (file)
index 0000000..bea223d
--- /dev/null
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_ICM_H
+#define MLX4_ICM_H
+
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/mutex.h>
+
+/*
+ * Number of scatterlist entries in one struct mlx4_icm_chunk: whatever
+ * fits in 256 bytes after a list_head and two ints (padding ignored).
+ */
+#define MLX4_ICM_CHUNK_LEN                                             \
+       ((256 - sizeof (struct list_head) - 2 * sizeof (int)) /         \
+        (sizeof (struct scatterlist)))
+
+/* ICM page granularity: 1 << 12 = 4096 bytes. */
+enum {
+       MLX4_ICM_PAGE_SHIFT     = 12,
+       MLX4_ICM_PAGE_SIZE      = 1 << MLX4_ICM_PAGE_SHIFT,
+};
+
+/*
+ * One node of an ICM area's chunk list, holding up to
+ * MLX4_ICM_CHUNK_LEN scatterlist entries.
+ */
+struct mlx4_icm_chunk {
+       struct list_head        list;   /* entry in mlx4_icm.chunk_list */
+       int                     npages; /* mem[] entries scanned by mlx4_table_find() */
+       int                     nsg;    /* mem[] entries visited by the mlx4_icm_iter helpers */
+       struct scatterlist      mem[MLX4_ICM_CHUNK_LEN];
+};
+
+/* An ICM area: a list of mlx4_icm_chunk nodes plus a reference count. */
+struct mlx4_icm {
+       struct list_head        chunk_list;     /* head of the mlx4_icm_chunk list */
+       int                     refcount;       /* adjusted by mlx4_table_get()/mlx4_table_put() */
+};
+
+/*
+ * Cursor over the scatterlist entries of an ICM area; used with
+ * mlx4_icm_first(), mlx4_icm_last() and mlx4_icm_next().
+ */
+struct mlx4_icm_iter {
+       struct mlx4_icm        *icm;            /* area being walked */
+       struct mlx4_icm_chunk  *chunk;          /* current chunk, NULL once past the end */
+       int                     page_idx;       /* index into chunk->mem[] */
+};
+
+/*
+ * Forward declarations so the prototypes below do not depend on the
+ * order in which headers are included.
+ */
+struct mlx4_dev;
+struct mlx4_icm_table;
+
+/* Allocation and release of a raw ICM area. */
+struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, gfp_t gfp_mask);
+void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm);
+
+/* ICM table setup/teardown and per-object chunk reference counting. */
+int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+                       u64 virt, int obj_size, int nobj, int reserved,
+                       int use_lowmem);
+void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table);
+int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj);
+void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj);
+void *mlx4_table_find(struct mlx4_icm_table *table, int obj);
+int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+                        int start, int end);
+void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+                         int start, int end);
+
+/*
+ * Point @iter at the first scatterlist entry of @icm.  If the area has
+ * no chunks the iterator starts out exhausted (chunk == NULL).
+ */
+static inline void mlx4_icm_first(struct mlx4_icm *icm,
+                                 struct mlx4_icm_iter *iter)
+{
+       iter->icm      = icm;
+       iter->page_idx = 0;
+
+       if (list_empty(&icm->chunk_list))
+               iter->chunk = NULL;
+       else
+               iter->chunk = list_entry(icm->chunk_list.next,
+                                        struct mlx4_icm_chunk, list);
+}
+
+/* Nonzero once @iter has moved past the final entry (no current chunk). */
+static inline int mlx4_icm_last(struct mlx4_icm_iter *iter)
+{
+       return iter->chunk == NULL;
+}
+
+/*
+ * Advance @iter by one scatterlist entry, moving on to the next chunk
+ * when the current one is used up.  After the last entry of the last
+ * chunk, iter->chunk becomes NULL.
+ */
+static inline void mlx4_icm_next(struct mlx4_icm_iter *iter)
+{
+       struct list_head *next;
+
+       if (++iter->page_idx < iter->chunk->nsg)
+               return;
+
+       next = iter->chunk->list.next;
+       if (next == &iter->icm->chunk_list) {
+               /* Wrapped around to the list head: nothing left. */
+               iter->chunk = NULL;
+               return;
+       }
+
+       iter->chunk    = list_entry(next, struct mlx4_icm_chunk, list);
+       iter->page_idx = 0;
+}
+
+/* DMA address of the scatterlist entry @iter currently points at. */
+static inline dma_addr_t mlx4_icm_addr(struct mlx4_icm_iter *iter)
+{
+       struct scatterlist *sg = &iter->chunk->mem[iter->page_idx];
+
+       return sg_dma_address(sg);
+}
+
+/* DMA length of the scatterlist entry @iter currently points at. */
+static inline unsigned long mlx4_icm_size(struct mlx4_icm_iter *iter)
+{
+       struct scatterlist *sg = &iter->chunk->mem[iter->page_idx];
+
+       return sg_dma_len(sg);
+}
+
+int mlx4_UNMAP_ICM(struct mlx4_dev *dev, u64 virt, u32 page_count);
+int mlx4_MAP_ICM_page(struct mlx4_dev *dev, u64 dma_addr, u64 virt);
+int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm);
+int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev);
+
+#endif /* MLX4_ICM_H */
diff --git a/drivers/net/mlx4/intf.c b/drivers/net/mlx4/intf.c
new file mode 100644 (file)
index 0000000..65854f9
--- /dev/null
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx4/driver.h>
+
+#include "mlx4.h"
+
+/* Binds one registered interface to one device. */
+struct mlx4_device_context {
+       struct list_head        list;           /* entry in mlx4_priv.ctx_list */
+       struct mlx4_interface  *intf;           /* interface this context belongs to */
+       void                   *context;        /* value returned by intf->add() */
+};
+
+/* Registered interfaces, linked through mlx4_interface.list. */
+static LIST_HEAD(intf_list);
+/* Registered devices, linked through mlx4_priv.dev_list. */
+static LIST_HEAD(dev_list);
+/* Held by the register/unregister functions while they use the two lists. */
+static DEFINE_MUTEX(intf_mutex);
+
+/*
+ * Call intf->add() for the device behind @priv and, if it returns a
+ * non-NULL context, record it on priv->ctx_list.  Allocation failure
+ * and a NULL context are both silently ignored.
+ */
+static void mlx4_add_device(struct mlx4_interface *intf, struct mlx4_priv *priv)
+{
+       struct mlx4_device_context *ctx = kmalloc(sizeof *ctx, GFP_KERNEL);
+
+       if (!ctx)
+               return;
+
+       ctx->intf    = intf;
+       ctx->context = intf->add(&priv->dev);
+
+       if (!ctx->context) {
+               kfree(ctx);
+               return;
+       }
+
+       spin_lock_irq(&priv->ctx_lock);
+       list_add_tail(&ctx->list, &priv->ctx_list);
+       spin_unlock_irq(&priv->ctx_lock);
+}
+
+/*
+ * Find the context that @intf registered on @priv, unlink it from
+ * priv->ctx_list, call intf->remove() on it and free it.  Does nothing
+ * if the interface has no context on this device.
+ */
+static void mlx4_remove_device(struct mlx4_interface *intf, struct mlx4_priv *priv)
+{
+       struct mlx4_device_context *ctx;
+
+       list_for_each_entry(ctx, &priv->ctx_list, list) {
+               if (ctx->intf != intf)
+                       continue;
+
+               spin_lock_irq(&priv->ctx_lock);
+               list_del(&ctx->list);
+               spin_unlock_irq(&priv->ctx_lock);
+
+               intf->remove(&priv->dev, ctx->context);
+               kfree(ctx);
+               /* The entry is gone; stop iterating. */
+               return;
+       }
+}
+
+/*
+ * Register @intf and attach it to every device already on dev_list.
+ * Returns -EINVAL if the interface lacks an add or remove callback,
+ * otherwise 0.
+ */
+int mlx4_register_interface(struct mlx4_interface *intf)
+{
+       struct mlx4_priv *priv;
+
+       if (!(intf->add && intf->remove))
+               return -EINVAL;
+
+       mutex_lock(&intf_mutex);
+
+       list_add_tail(&intf->list, &intf_list);
+       list_for_each_entry(priv, &dev_list, dev_list) {
+               mlx4_add_device(intf, priv);
+       }
+
+       mutex_unlock(&intf_mutex);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_register_interface);
+
+/*
+ * Detach @intf from every device on dev_list, then remove it from
+ * intf_list.
+ */
+void mlx4_unregister_interface(struct mlx4_interface *intf)
+{
+       struct mlx4_priv *priv;
+
+       mutex_lock(&intf_mutex);
+
+       list_for_each_entry(priv, &dev_list, dev_list) {
+               mlx4_remove_device(intf, priv);
+       }
+
+       list_del(&intf->list);
+
+       mutex_unlock(&intf_mutex);
+}
+EXPORT_SYMBOL_GPL(mlx4_unregister_interface);
+
+/*
+ * Deliver an event to every interface attached to @dev that provides
+ * an event callback.  The callbacks run with priv->ctx_lock held and
+ * interrupts disabled.
+ */
+void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_event type,
+                        int subtype, int port)
+{
+       struct mlx4_priv *priv = mlx4_priv(dev);
+       struct mlx4_device_context *dev_ctx;
+       unsigned long flags;
+
+       spin_lock_irqsave(&priv->ctx_lock, flags);
+
+       list_for_each_entry(dev_ctx, &priv->ctx_list, list) {
+               if (!dev_ctx->intf->event)
+                       continue;
+
+               dev_ctx->intf->event(dev, dev_ctx->context, type,
+                                    subtype, port);
+       }
+
+       spin_unlock_irqrestore(&priv->ctx_lock, flags);
+}
+
+/*
+ * Initialize the per-device context list and lock, add the device to
+ * dev_list and attach every interface already on intf_list.  Always
+ * returns 0.
+ */
+int mlx4_register_device(struct mlx4_dev *dev)
+{
+       struct mlx4_priv *priv = mlx4_priv(dev);
+       struct mlx4_interface *intf;
+
+       INIT_LIST_HEAD(&priv->ctx_list);
+       spin_lock_init(&priv->ctx_lock);
+
+       mutex_lock(&intf_mutex);
+
+       list_add_tail(&priv->dev_list, &dev_list);
+       list_for_each_entry(intf, &intf_list, list) {
+               mlx4_add_device(intf, priv);
+       }
+
+       mutex_unlock(&intf_mutex);
+
+       return 0;
+}
+
+/*
+ * Detach every interface on intf_list from @dev, then remove the
+ * device from dev_list.
+ */
+void mlx4_unregister_device(struct mlx4_dev *dev)
+{
+       struct mlx4_priv *priv = mlx4_priv(dev);
+       struct mlx4_interface *intf;
+
+       mutex_lock(&intf_mutex);
+
+       list_for_each_entry(intf, &intf_list, list) {
+               mlx4_remove_device(intf, priv);
+       }
+
+       list_del(&priv->dev_list);
+
+       mutex_unlock(&intf_mutex);
+}
diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
new file mode 100644 (file)
index 0000000..4debb02
--- /dev/null
@@ -0,0 +1,936 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/doorbell.h>
+
+#include "mlx4.h"
+#include "fw.h"
+#include "icm.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("Mellanox ConnectX HCA low-level driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+#ifdef CONFIG_MLX4_DEBUG
+
+/*
+ * Only built with CONFIG_MLX4_DEBUG.  Exposed as the "debug_level"
+ * module parameter with mode 0644; defaults to 0.
+ */
+int mlx4_debug_level = 0;
+module_param_named(debug_level, mlx4_debug_level, int, 0644);
+MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
+
+#endif /* CONFIG_MLX4_DEBUG */
+
+#ifdef CONFIG_PCI_MSI
+
+/* Module parameter (mode 0444); zero unless set when the module is loaded. */
+static int msi_x;
+module_param(msi_x, int, 0444);
+MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero");
+
+#else /* CONFIG_PCI_MSI */
+
+/* Without CONFIG_PCI_MSI, msi_x is the compile-time constant 0. */
+#define msi_x (0)
+
+#endif /* CONFIG_PCI_MSI */
+
+/* Version banner built at compile time from DRV_NAME, DRV_VERSION and DRV_RELDATE. */
+static const char mlx4_version[] __devinitdata =
+       DRV_NAME ": Mellanox ConnectX core driver v"
+       DRV_VERSION " (" DRV_RELDATE ")\n";
+
+/*
+ * Default resource profile.  All counts except rdmarc_per_qp are powers
+ * of two.
+ */
+static struct mlx4_profile default_profile = {
+       .num_qp         = 1 << 16,
+       .num_srq        = 1 << 16,
+       .rdmarc_per_qp  = 4,
+       .num_cq         = 1 << 16,
+       .num_mcg        = 1 << 13,
+       .num_mpt        = 1 << 17,
+       .num_mtt        = 1 << 20,
+};
+
+/*
+ * Query the device capabilities into @dev_cap, reject devices this
+ * driver cannot handle, and copy the capabilities into dev->caps.
+ *
+ * Returns 0 on success, the error from mlx4_QUERY_DEV_CAP() if the
+ * query fails, or -ENODEV if the device's minimum page size, port count
+ * or UAR size is out of range.
+ */
+static int __devinit mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
+{
+       int err;
+
+       err = mlx4_QUERY_DEV_CAP(dev, dev_cap);
+       if (err) {
+               mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n");
+               return err;
+       }
+
+       if (dev_cap->min_page_sz > PAGE_SIZE) {
+               mlx4_err(dev, "HCA minimum page size of %d bigger than "
+                        "kernel PAGE_SIZE of %ld, aborting.\n",
+                        dev_cap->min_page_sz, PAGE_SIZE);
+               return -ENODEV;
+       }
+       if (dev_cap->num_ports > MLX4_MAX_PORTS) {
+               mlx4_err(dev, "HCA has %d ports, but we only support %d, "
+                        "aborting.\n",
+                        dev_cap->num_ports, MLX4_MAX_PORTS);
+               return -ENODEV;
+       }
+
+       /* The UAR region must fit inside PCI resource 2. */
+       if (dev_cap->uar_size > pci_resource_len(dev->pdev, 2)) {
+               mlx4_err(dev, "HCA reported UAR size of 0x%x bigger than "
+                        "PCI resource 2 size of 0x%llx, aborting.\n",
+                        dev_cap->uar_size,
+                        (unsigned long long) pci_resource_len(dev->pdev, 2));
+               return -ENODEV;
+       }
+
+       dev->caps.num_ports          = dev_cap->num_ports;
+       dev->caps.num_uars           = dev_cap->uar_size / PAGE_SIZE;
+       dev->caps.vl_cap             = dev_cap->max_vl;
+       dev->caps.mtu_cap            = dev_cap->max_mtu;
+       dev->caps.gid_table_len      = dev_cap->max_gids;
+       dev->caps.pkey_table_len     = dev_cap->max_pkeys;
+       dev->caps.local_ca_ack_delay = dev_cap->local_ca_ack_delay;
+       dev->caps.bf_reg_size        = dev_cap->bf_reg_size;
+       dev->caps.bf_regs_per_page   = dev_cap->bf_regs_per_page;
+       dev->caps.max_sq_sg          = dev_cap->max_sq_sg;
+       dev->caps.max_rq_sg          = dev_cap->max_rq_sg;
+       dev->caps.max_wqes           = dev_cap->max_qp_sz;
+       dev->caps.max_qp_init_rdma   = dev_cap->max_requester_per_qp;
+       dev->caps.reserved_qps       = dev_cap->reserved_qps;
+       dev->caps.max_srq_wqes       = dev_cap->max_srq_sz;
+       dev->caps.max_srq_sge        = dev_cap->max_rq_sg - 1;
+       dev->caps.reserved_srqs      = dev_cap->reserved_srqs;
+       dev->caps.max_sq_desc_sz     = dev_cap->max_sq_desc_sz;
+       dev->caps.max_rq_desc_sz     = dev_cap->max_rq_desc_sz;
+       dev->caps.num_qp_per_mgm     = MLX4_QP_PER_MGM;
+       /*
+        * Subtract 1 from the limit because we need to allocate a
+        * spare CQE so the HCA HW can tell the difference between an
+        * empty CQ and a full CQ.
+        */
+       dev->caps.max_cqes           = dev_cap->max_cq_sz - 1;
+       dev->caps.reserved_cqs       = dev_cap->reserved_cqs;
+       dev->caps.reserved_eqs       = dev_cap->reserved_eqs;
+       dev->caps.reserved_mtts      = dev_cap->reserved_mtts;
+       dev->caps.reserved_mrws      = dev_cap->reserved_mrws;
+       dev->caps.reserved_uars      = dev_cap->reserved_uars;
+       dev->caps.reserved_pds       = dev_cap->reserved_pds;
+       dev->caps.port_width_cap     = dev_cap->max_port_width;
+       /* Stored per MTT segment: MLX4_MTT_ENTRY_PER_SEG entries of the reported size. */
+       dev->caps.mtt_entry_sz       = MLX4_MTT_ENTRY_PER_SEG * dev_cap->mtt_entry_sz;
+       /* Mask with the bits below min_page_sz cleared (assuming it is a power of two). */
+       dev->caps.page_size_cap      = ~(u32) (dev_cap->min_page_sz - 1);
+       dev->caps.flags              = dev_cap->flags;
+       dev->caps.stat_rate_support  = dev_cap->stat_rate_support;
+
+       return 0;
+}
+
+/*
+ * Allocate the firmware area (priv->fw.fw_pages pages), map it with
+ * MAP_FA and issue RUN_FW.
+ *
+ * Returns 0 on success, -ENOMEM if the area cannot be allocated, or the
+ * error from the failing firmware command.  On failure anything set up
+ * here is undone before returning.
+ */
+static int __devinit mlx4_load_fw(struct mlx4_dev *dev)
+{
+       struct mlx4_priv *priv = mlx4_priv(dev);
+       int err;
+
+       priv->fw.fw_icm = mlx4_alloc_icm(dev, priv->fw.fw_pages,
+                                        GFP_HIGHUSER | __GFP_NOWARN);
+       if (!priv->fw.fw_icm) {
+               mlx4_err(dev, "Couldn't allocate FW area, aborting.\n");
+               return -ENOMEM;
+       }
+
+       err = mlx4_MAP_FA(dev, priv->fw.fw_icm);
+       if (err) {
+               mlx4_err(dev, "MAP_FA command failed, aborting.\n");
+               mlx4_free_icm(dev, priv->fw.fw_icm);
+               return err;
+       }
+
+       err = mlx4_RUN_FW(dev);
+       if (!err)
+               return 0;
+
+       /* RUN_FW failed: unmap and release the firmware area again. */
+       mlx4_err(dev, "RUN_FW command failed, aborting.\n");
+       mlx4_UNMAP_FA(dev);
+       mlx4_free_icm(dev, priv->fw.fw_icm);
+
+       return err;
+}
+
+/*
+ * Set up the four cMPT ICM tables (QP, SRQ, CQ, EQ).  Each table starts
+ * at @cmpt_base plus (type * @cmpt_entry_sz) << MLX4_CMPT_SHIFT and uses
+ * @cmpt_entry_sz as its object size.
+ *
+ * Returns 0 on success or the error from mlx4_init_icm_table(); tables
+ * initialized before the failure are cleaned up again.
+ */
+static int __devinit mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base,
+                                         int cmpt_entry_sz)
+{
+       struct mlx4_priv *priv = mlx4_priv(dev);
+       int err;
+
+       err = mlx4_init_icm_table(dev, &priv->qp_table.cmpt_table,
+                                 cmpt_base +
+                                 ((u64) (MLX4_CMPT_TYPE_QP *
+                                         cmpt_entry_sz) << MLX4_CMPT_SHIFT),
+                                 cmpt_entry_sz, dev->caps.num_qps,
+                                 dev->caps.reserved_qps, 0);
+       if (err)
+               goto err;
+
+       err = mlx4_init_icm_table(dev, &priv->srq_table.cmpt_table,
+                                 cmpt_base +
+                                 ((u64) (MLX4_CMPT_TYPE_SRQ *
+                                         cmpt_entry_sz) << MLX4_CMPT_SHIFT),
+                                 cmpt_entry_sz, dev->caps.num_srqs,
+                                 dev->caps.reserved_srqs, 0);
+       if (err)
+               goto err_qp;
+
+       err = mlx4_init_icm_table(dev, &priv->cq_table.cmpt_table,
+                                 cmpt_base +
+                                 ((u64) (MLX4_CMPT_TYPE_CQ *
+                                         cmpt_entry_sz) << MLX4_CMPT_SHIFT),
+                                 cmpt_entry_sz, dev->caps.num_cqs,
+                                 dev->caps.reserved_cqs, 0);
+       if (err)
+               goto err_srq;
+
+       /* The EQ table is sized to a power of two; every EQ entry is reserved. */
+       err = mlx4_init_icm_table(dev, &priv->eq_table.cmpt_table,
+                                 cmpt_base +
+                                 ((u64) (MLX4_CMPT_TYPE_EQ *
+                                         cmpt_entry_sz) << MLX4_CMPT_SHIFT),
+                                 cmpt_entry_sz,
+                                 roundup_pow_of_two(MLX4_NUM_EQ +
+                                                    dev->caps.reserved_eqs),
+                                 MLX4_NUM_EQ + dev->caps.reserved_eqs, 0);
+       if (err)
+               goto err_cq;
+
+       return 0;
+
+       /* Unwind in reverse order of initialization. */
+err_cq:
+       mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table);
+
+err_srq:
+       mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table);
+
+err_qp:
+       mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table);
+
+err:
+       return err;
+}
+
+/*
+ * Set up all ICM (HCA context memory) for the device:
+ *
+ *  - issue SET_ICM_SIZE with @icm_size, which reports how many
+ *    auxiliary pages are needed;
+ *  - allocate and map that auxiliary area;
+ *  - initialize the cMPT tables, the EQ context mapping and the MTT,
+ *    dMPT, QP, AUXC, ALTC, RDMARC, CQ, SRQ and MCG tables at the base
+ *    addresses given in @init_hca.
+ *
+ * Returns 0 on success.  On failure the steps already completed are
+ * undone in reverse order and the error is returned (-ENOMEM if the
+ * auxiliary area cannot be allocated).
+ */
+static int __devinit mlx4_init_icm(struct mlx4_dev *dev,
+                                  struct mlx4_dev_cap *dev_cap,
+                                  struct mlx4_init_hca_param *init_hca,
+                                  u64 icm_size)
+{
+       struct mlx4_priv *priv = mlx4_priv(dev);
+       u64 aux_pages;
+       int err;
+
+       err = mlx4_SET_ICM_SIZE(dev, icm_size, &aux_pages);
+       if (err) {
+               mlx4_err(dev, "SET_ICM_SIZE command failed, aborting.\n");
+               return err;
+       }
+
+       /* NOTE(review): "<< 2" presumably converts 4 KB pages to KB -- confirm. */
+       mlx4_dbg(dev, "%lld KB of HCA context requires %lld KB aux memory.\n",
+                (unsigned long long) icm_size >> 10,
+                (unsigned long long) aux_pages << 2);
+
+       priv->fw.aux_icm = mlx4_alloc_icm(dev, aux_pages,
+                                         GFP_HIGHUSER | __GFP_NOWARN);
+       if (!priv->fw.aux_icm) {
+               mlx4_err(dev, "Couldn't allocate aux memory, aborting.\n");
+               return -ENOMEM;
+       }
+
+       err = mlx4_MAP_ICM_AUX(dev, priv->fw.aux_icm);
+       if (err) {
+               mlx4_err(dev, "MAP_ICM_AUX command failed, aborting.\n");
+               goto err_free_aux;
+       }
+
+       err = mlx4_init_cmpt_table(dev, init_hca->cmpt_base, dev_cap->cmpt_entry_sz);
+       if (err) {
+               mlx4_err(dev, "Failed to map cMPT context memory, aborting.\n");
+               goto err_unmap_aux;
+       }
+
+       err = mlx4_map_eq_icm(dev, init_hca->eqc_base);
+       if (err) {
+               mlx4_err(dev, "Failed to map EQ context memory, aborting.\n");
+               goto err_unmap_cmpt;
+       }
+
+       /* MTT and dMPT are the only tables created with use_lowmem set. */
+       err = mlx4_init_icm_table(dev, &priv->mr_table.mtt_table,
+                                 init_hca->mtt_base,
+                                 dev->caps.mtt_entry_sz,
+                                 dev->caps.num_mtt_segs,
+                                 dev->caps.reserved_mtts, 1);
+       if (err) {
+               mlx4_err(dev, "Failed to map MTT context memory, aborting.\n");
+               goto err_unmap_eq;
+       }
+
+       err = mlx4_init_icm_table(dev, &priv->mr_table.dmpt_table,
+                                 init_hca->dmpt_base,
+                                 dev_cap->dmpt_entry_sz,
+                                 dev->caps.num_mpts,
+                                 dev->caps.reserved_mrws, 1);
+       if (err) {
+               mlx4_err(dev, "Failed to map dMPT context memory, aborting.\n");
+               goto err_unmap_mtt;
+       }
+
+       err = mlx4_init_icm_table(dev, &priv->qp_table.qp_table,
+                                 init_hca->qpc_base,
+                                 dev_cap->qpc_entry_sz,
+                                 dev->caps.num_qps,
+                                 dev->caps.reserved_qps, 0);
+       if (err) {
+               mlx4_err(dev, "Failed to map QP context memory, aborting.\n");
+               goto err_unmap_dmpt;
+       }
+
+       err = mlx4_init_icm_table(dev, &priv->qp_table.auxc_table,
+                                 init_hca->auxc_base,
+                                 dev_cap->aux_entry_sz,
+                                 dev->caps.num_qps,
+                                 dev->caps.reserved_qps, 0);
+       if (err) {
+               mlx4_err(dev, "Failed to map AUXC context memory, aborting.\n");
+               goto err_unmap_qp;
+       }
+
+       err = mlx4_init_icm_table(dev, &priv->qp_table.altc_table,
+                                 init_hca->altc_base,
+                                 dev_cap->altc_entry_sz,
+                                 dev->caps.num_qps,
+                                 dev->caps.reserved_qps, 0);
+       if (err) {
+               mlx4_err(dev, "Failed to map ALTC context memory, aborting.\n");
+               goto err_unmap_auxc;
+       }
+
+       /* The per-QP RDMARC object size is the entry size scaled by rdmarc_shift. */
+       err = mlx4_init_icm_table(dev, &priv->qp_table.rdmarc_table,
+                                 init_hca->rdmarc_base,
+                                 dev_cap->rdmarc_entry_sz << priv->qp_table.rdmarc_shift,
+                                 dev->caps.num_qps,
+                                 dev->caps.reserved_qps, 0);
+       if (err) {
+               mlx4_err(dev, "Failed to map RDMARC context memory, aborting\n");
+               goto err_unmap_altc;
+       }
+
+       err = mlx4_init_icm_table(dev, &priv->cq_table.table,
+                                 init_hca->cqc_base,
+                                 dev_cap->cqc_entry_sz,
+                                 dev->caps.num_cqs,
+                                 dev->caps.reserved_cqs, 0);
+       if (err) {
+               mlx4_err(dev, "Failed to map CQ context memory, aborting.\n");
+               goto err_unmap_rdmarc;
+       }
+
+       err = mlx4_init_icm_table(dev, &priv->srq_table.table,
+                                 init_hca->srqc_base,
+                                 dev_cap->srq_entry_sz,
+                                 dev->caps.num_srqs,
+                                 dev->caps.reserved_srqs, 0);
+       if (err) {
+               mlx4_err(dev, "Failed to map SRQ context memory, aborting.\n");
+               goto err_unmap_cq;
+       }
+
+       /*
+        * It's not strictly required, but for simplicity just map the
+        * whole multicast group table now.  The table isn't very big
+        * and it's a lot easier than trying to track ref counts.
+        */
+       err = mlx4_init_icm_table(dev, &priv->mcg_table.table,
+                                 init_hca->mc_base, MLX4_MGM_ENTRY_SIZE,
+                                 dev->caps.num_mgms + dev->caps.num_amgms,
+                                 dev->caps.num_mgms + dev->caps.num_amgms,
+                                 0);
+       if (err) {
+               mlx4_err(dev, "Failed to map MCG context memory, aborting.\n");
+               goto err_unmap_srq;
+       }
+
+       return 0;
+
+       /* Error unwind: each label undoes one step, in reverse order of setup. */
+err_unmap_srq:
+       mlx4_cleanup_icm_table(dev, &priv->srq_table.table);
+
+err_unmap_cq:
+       mlx4_cleanup_icm_table(dev, &priv->cq_table.table);
+
+err_unmap_rdmarc:
+       mlx4_cleanup_icm_table(dev, &priv->qp_table.rdmarc_table);
+
+err_unmap_altc:
+       mlx4_cleanup_icm_table(dev, &priv->qp_table.altc_table);
+
+err_unmap_auxc:
+       mlx4_cleanup_icm_table(dev, &priv->qp_table.auxc_table);
+
+err_unmap_qp:
+       mlx4_cleanup_icm_table(dev, &priv->qp_table.qp_table);
+
+err_unmap_dmpt:
+       mlx4_cleanup_icm_table(dev, &priv->mr_table.dmpt_table);
+
+err_unmap_mtt:
+       mlx4_cleanup_icm_table(dev, &priv->mr_table.mtt_table);
+
+err_unmap_eq:
+       mlx4_unmap_eq_icm(dev);
+
+err_unmap_cmpt:
+       mlx4_cleanup_icm_table(dev, &priv->eq_table.cmpt_table);
+       mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table);
+       mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table);
+       mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table);
+
+err_unmap_aux:
+       mlx4_UNMAP_ICM_AUX(dev);
+
+err_free_aux:
+       mlx4_free_icm(dev, priv->fw.aux_icm);
+
+       return err;
+}
+
+/*
+ * Release all ICM set up for the device: every context table cleaned
+ * up below (the same tables that mlx4_init_icm()'s error path unwinds,
+ * plus the MCG table), the EQ ICM mapping, and finally the ICM aux
+ * area (UNMAP_ICM_AUX, then freeing priv->fw.aux_icm).
+ */
+static void mlx4_free_icms(struct mlx4_dev *dev)
+{
+       struct mlx4_priv *priv = mlx4_priv(dev);
+
+       mlx4_cleanup_icm_table(dev, &priv->mcg_table.table);
+       mlx4_cleanup_icm_table(dev, &priv->srq_table.table);
+       mlx4_cleanup_icm_table(dev, &priv->cq_table.table);
+       mlx4_cleanup_icm_table(dev, &priv->qp_table.rdmarc_table);
+       mlx4_cleanup_icm_table(dev, &priv->qp_table.altc_table);
+       mlx4_cleanup_icm_table(dev, &priv->qp_table.auxc_table);
+       mlx4_cleanup_icm_table(dev, &priv->qp_table.qp_table);
+       mlx4_cleanup_icm_table(dev, &priv->mr_table.dmpt_table);
+       mlx4_cleanup_icm_table(dev, &priv->mr_table.mtt_table);
+       mlx4_cleanup_icm_table(dev, &priv->eq_table.cmpt_table);
+       mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table);
+       mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table);
+       mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table);
+       /*
+        * NOTE(review): mlx4_init_icm()'s error path unmaps the EQ ICM
+        * before cleaning up the cmpt tables; here it happens after.
+        * Presumably the two steps are independent -- confirm.
+        */
+       mlx4_unmap_eq_icm(dev);
+
+       mlx4_UNMAP_ICM_AUX(dev);
+       mlx4_free_icm(dev, priv->fw.aux_icm);
+}
+
+/*
+ * Shut the HCA down: issue CLOSE_HCA, release all ICM through
+ * mlx4_free_icms(), then undo the firmware area mapping (UNMAP_FA)
+ * and free priv->fw.fw_icm.
+ */
+static void mlx4_close_hca(struct mlx4_dev *dev)
+{
+       struct mlx4_priv *priv = mlx4_priv(dev);
+
+       mlx4_CLOSE_HCA(dev, 0);
+       mlx4_free_icms(dev);
+
+       mlx4_UNMAP_FA(dev);
+       mlx4_free_icm(dev, priv->fw.fw_icm);
+}
+
+/*
+ * Take the HCA from reset to an initialized state: query and start the
+ * firmware, read the device capabilities, build a resource profile from
+ * default_profile, set up ICM for it, issue INIT_HCA, and finally record
+ * the adapter's INTA pin, revision ID and board ID in the private
+ * structure.
+ *
+ * Returns 0 on success or a negative error code.  On failure after the
+ * firmware was started, the steps that completed are unwound in reverse
+ * order before returning.
+ */
+static int __devinit mlx4_init_hca(struct mlx4_dev *dev)
+{
+       struct mlx4_priv          *priv = mlx4_priv(dev);
+       struct mlx4_adapter        adapter;
+       struct mlx4_dev_cap        dev_cap;
+       struct mlx4_profile        profile;
+       struct mlx4_init_hca_param init_hca;
+       u64 icm_size;
+       int err;
+
+       err = mlx4_QUERY_FW(dev);
+       if (err) {
+               mlx4_err(dev, "QUERY_FW command failed, aborting.\n");
+               return err;
+       }
+
+       err = mlx4_load_fw(dev);
+       if (err) {
+               mlx4_err(dev, "Failed to start FW, aborting.\n");
+               return err;
+       }
+
+       err = mlx4_dev_cap(dev, &dev_cap);
+       if (err) {
+               mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n");
+               goto err_stop_fw;
+       }
+
+       /* Work on a private copy so default_profile itself is never modified. */
+       profile = default_profile;
+
+       /*
+        * mlx4_make_profile() returns a u64; a failure is reported as a
+        * value that is negative when viewed as a signed quantity.
+        */
+       icm_size = mlx4_make_profile(dev, &profile, &dev_cap, &init_hca);
+       if ((long long) icm_size < 0) {
+               err = icm_size;
+               goto err_stop_fw;
+       }
+
+       init_hca.log_uar_sz = ilog2(dev->caps.num_uars);
+
+       err = mlx4_init_icm(dev, &dev_cap, &init_hca, icm_size);
+       if (err)
+               goto err_stop_fw;
+
+       err = mlx4_INIT_HCA(dev, &init_hca);
+       if (err) {
+               mlx4_err(dev, "INIT_HCA command failed, aborting.\n");
+               goto err_free_icm;
+       }
+
+       err = mlx4_QUERY_ADAPTER(dev, &adapter);
+       if (err) {
+               mlx4_err(dev, "QUERY_ADAPTER command failed, aborting.\n");
+               goto err_close;
+       }
+
+       priv->eq_table.inta_pin = adapter.inta_pin;
+       priv->rev_id            = adapter.revision_id;
+       memcpy(priv->board_id, adapter.board_id, sizeof priv->board_id);
+
+       return 0;
+
+err_close:
+       /*
+        * Only undo INIT_HCA here.  mlx4_close_hca() would also free the
+        * ICM tables and the firmware area, which the labels below are
+        * about to do; calling it and then falling through would release
+        * all of that twice.
+        */
+       mlx4_CLOSE_HCA(dev, 0);
+
+err_free_icm:
+       mlx4_free_icms(dev);
+
+err_stop_fw:
+       mlx4_UNMAP_FA(dev);
+       mlx4_free_icm(dev, priv->fw.fw_icm);
+
+       return err;
+}
+
+/*
+ * Set up the driver-side resources of an initialized HCA, in order:
+ * UAR table, the driver's own UAR and its kernel mapping, PD table,
+ * MR table, catas buffer mapping, EQ table, the switch to event-driven
+ * firmware commands (checked with a NOP command), then the CQ, SRQ, QP
+ * and multicast group tables.
+ *
+ * Returns 0 on success.  On failure, whatever had been set up so far is
+ * torn down again in reverse order and the error code is returned.
+ */
+static int __devinit mlx4_setup_hca(struct mlx4_dev *dev)
+{
+       struct mlx4_priv *priv = mlx4_priv(dev);
+       int err;
+
+       MLX4_INIT_DOORBELL_LOCK(&priv->doorbell_lock);
+
+       /* User access regions, including the one the driver keeps. */
+       err = mlx4_init_uar_table(dev);
+       if (err) {
+               mlx4_err(dev, "Failed to initialize "
+                        "user access region table, aborting.\n");
+               return err;
+       }
+
+       err = mlx4_uar_alloc(dev, &priv->driver_uar);
+       if (err) {
+               mlx4_err(dev, "Failed to allocate driver access region, "
+                        "aborting.\n");
+               goto unwind_uar_table;
+       }
+
+       priv->kar = ioremap(priv->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE);
+       if (!priv->kar) {
+               mlx4_err(dev, "Couldn't map kernel access region, "
+                        "aborting.\n");
+               err = -ENOMEM;
+               goto unwind_driver_uar;
+       }
+
+       /* Protection domains and memory regions. */
+       err = mlx4_init_pd_table(dev);
+       if (err) {
+               mlx4_err(dev, "Failed to initialize "
+                        "protection domain table, aborting.\n");
+               goto unwind_kar;
+       }
+
+       err = mlx4_init_mr_table(dev);
+       if (err) {
+               mlx4_err(dev, "Failed to initialize "
+                        "memory region table, aborting.\n");
+               goto unwind_pd_table;
+       }
+
+       mlx4_map_catas_buf(dev);
+
+       /* Event queues, then move firmware commands from polling to events. */
+       err = mlx4_init_eq_table(dev);
+       if (err) {
+               mlx4_err(dev, "Failed to initialize "
+                        "event queue table, aborting.\n");
+               goto unwind_mr_table;
+       }
+
+       err = mlx4_cmd_use_events(dev);
+       if (err) {
+               mlx4_err(dev, "Failed to switch to event-driven "
+                        "firmware commands, aborting.\n");
+               goto unwind_eq_table;
+       }
+
+       /* A NOP command doubles as a test that interrupts are delivered. */
+       err = mlx4_NOP(dev);
+       if (err) {
+               mlx4_err(dev, "NOP command failed to generate interrupt "
+                        "(IRQ %d), aborting.\n",
+                        priv->eq_table.eq[MLX4_EQ_ASYNC].irq);
+               if (dev->flags & MLX4_FLAG_MSI_X)
+                       mlx4_err(dev, "Try again with MSI-X disabled.\n");
+               else
+                       mlx4_err(dev, "BIOS or ACPI interrupt routing problem?\n");
+
+               goto unwind_cmd_events;
+       }
+
+       mlx4_dbg(dev, "NOP command IRQ test passed\n");
+
+       /* Remaining object tables: CQ, SRQ, QP, multicast groups. */
+       err = mlx4_init_cq_table(dev);
+       if (err) {
+               mlx4_err(dev, "Failed to initialize "
+                        "completion queue table, aborting.\n");
+               goto unwind_cmd_events;
+       }
+
+       err = mlx4_init_srq_table(dev);
+       if (err) {
+               mlx4_err(dev, "Failed to initialize "
+                        "shared receive queue table, aborting.\n");
+               goto unwind_cq_table;
+       }
+
+       err = mlx4_init_qp_table(dev);
+       if (err) {
+               mlx4_err(dev, "Failed to initialize "
+                        "queue pair table, aborting.\n");
+               goto unwind_srq_table;
+       }
+
+       err = mlx4_init_mcg_table(dev);
+       if (err) {
+               mlx4_err(dev, "Failed to initialize "
+                        "multicast group table, aborting.\n");
+               goto unwind_qp_table;
+       }
+
+       return 0;
+
+       /* Each label undoes one setup step and falls through to the next. */
+unwind_qp_table:
+       mlx4_cleanup_qp_table(dev);
+
+unwind_srq_table:
+       mlx4_cleanup_srq_table(dev);
+
+unwind_cq_table:
+       mlx4_cleanup_cq_table(dev);
+
+unwind_cmd_events:
+       mlx4_cmd_use_polling(dev);
+
+unwind_eq_table:
+       mlx4_cleanup_eq_table(dev);
+
+unwind_mr_table:
+       /* The catas buffer was mapped after the MR table; undo both here. */
+       mlx4_unmap_catas_buf(dev);
+       mlx4_cleanup_mr_table(dev);
+
+unwind_pd_table:
+       mlx4_cleanup_pd_table(dev);
+
+unwind_kar:
+       iounmap(priv->kar);
+
+unwind_driver_uar:
+       mlx4_uar_free(dev, &priv->driver_uar);
+
+unwind_uar_table:
+       mlx4_cleanup_uar_table(dev);
+       return err;
+}
+
+/*
+ * Pick the interrupt each EQ will use.  When the msi_x option is set,
+ * ask for one MSI-X vector per EQ; if that succeeds, record the vectors
+ * and set MLX4_FLAG_MSI_X.  Otherwise every EQ is given the PCI
+ * device's IRQ.
+ */
+static void __devinit mlx4_enable_msi_x(struct mlx4_dev *dev)
+{
+       struct mlx4_priv *priv = mlx4_priv(dev);
+       struct msix_entry vectors[MLX4_NUM_EQ];
+       int ret;
+       int eqn;
+
+       if (!msi_x)
+               goto use_pci_irq;
+
+       for (eqn = 0; eqn < MLX4_NUM_EQ; ++eqn)
+               vectors[eqn].entry = eqn;
+
+       ret = pci_enable_msix(dev->pdev, vectors, ARRAY_SIZE(vectors));
+       /* A positive result is reported as the number of vectors available. */
+       if (ret > 0)
+               mlx4_info(dev, "Only %d MSI-X vectors available, "
+                         "not using MSI-X\n", ret);
+       if (ret)
+               goto use_pci_irq;
+
+       for (eqn = 0; eqn < MLX4_NUM_EQ; ++eqn)
+               priv->eq_table.eq[eqn].irq = vectors[eqn].vector;
+
+       dev->flags |= MLX4_FLAG_MSI_X;
+       return;
+
+use_pci_irq:
+       for (eqn = 0; eqn < MLX4_NUM_EQ; ++eqn)
+               priv->eq_table.eq[eqn].irq = dev->pdev->irq;
+}
+
+static int __devinit mlx4_init_one(struct pci_dev *pdev,
+                                  const struct pci_device_id *id)
+{
+       static int mlx4_version_printed;
+       struct mlx4_priv *priv;
+       struct mlx4_dev *dev;
+       int err;
+
+       if (!mlx4_version_printed) {
+               printk(KERN_INFO "%s", mlx4_version);
+               ++mlx4_version_printed;
+       }
+
+       printk(KERN_INFO PFX "Initializing %s\n",
+              pci_name(pdev));
+
+       err = pci_enable_device(pdev);
+       if (err) {
+               dev_err(&pdev->dev, "Cannot enable PCI device, "
+                       "aborting.\n");
+               return err;
+       }
+
+       /*
+        * Check for BARs.  We expect 0: 1MB, 2: 8MB, 4: DDR (may not
+        * be present)
+        */
+       if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM) ||
+           pci_resource_len(pdev, 0) != 1 << 20) {
+               dev_err(&pdev->dev, "Missing DCS, aborting.\n");
+               err = -ENODEV;
+               goto err_disable_pdev;
+       }
+       if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) {
+               dev_err(&pdev->dev, "Missing UAR, aborting.\n");
+               err = -ENODEV;
+               goto err_disable_pdev;
+       }
+
+       err = pci_request_region(pdev, 0, DRV_NAME);
+       if (err) {
+               dev_err(&pdev->dev, "Cannot request control region, aborting.\n");
+               goto err_disable_pdev;
+       }
+
+       err = pci_request_region(pdev, 2, DRV_NAME);
+       if (err) {
+               dev_err(&pdev->dev, "Cannot request UAR region, aborting.\n");
+               goto err_release_bar0;
+       }
+
+       pci_set_master(pdev);
+
+       err = pci_set_dma_mask(pdev, DMA_64BIT_MASK);
+       if (err) {
+               dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask.\n");
+              &n