Merge branch 'misc' into for-next
Roland Dreier [Tue, 2 Mar 2010 07:52:31 +0000 (23:52 -0800)]
Conflicts:
drivers/infiniband/core/uverbs_main.c

38 files changed:
drivers/infiniband/core/ucm.c
drivers/infiniband/core/user_mad.c
drivers/infiniband/core/uverbs.h
drivers/infiniband/core/uverbs_main.c
drivers/infiniband/hw/cxgb3/cxio_hal.c
drivers/infiniband/hw/cxgb3/cxio_hal.h
drivers/infiniband/hw/cxgb3/cxio_wr.h
drivers/infiniband/hw/cxgb3/iwch.c
drivers/infiniband/hw/cxgb3/iwch.h
drivers/infiniband/hw/cxgb3/iwch_provider.c
drivers/infiniband/hw/cxgb3/iwch_qp.c
drivers/infiniband/hw/ehca/ehca_irq.c
drivers/infiniband/hw/ehca/ehca_qp.c
drivers/infiniband/hw/ehca/ehca_sqp.c
drivers/infiniband/hw/mlx4/qp.c
drivers/infiniband/hw/nes/nes.c
drivers/infiniband/hw/nes/nes.h
drivers/infiniband/hw/nes/nes_cm.c
drivers/infiniband/hw/nes/nes_hw.c
drivers/infiniband/hw/nes/nes_hw.h
drivers/infiniband/hw/nes/nes_nic.c
drivers/infiniband/hw/nes/nes_verbs.c
drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
drivers/infiniband/ulp/iser/iscsi_iser.c
drivers/infiniband/ulp/iser/iscsi_iser.h
drivers/infiniband/ulp/iser/iser_initiator.c
drivers/infiniband/ulp/iser/iser_memory.c
drivers/infiniband/ulp/iser/iser_verbs.c
drivers/infiniband/ulp/srp/ib_srp.c
drivers/infiniband/ulp/srp/ib_srp.h
drivers/net/cxgb3/adapter.h
drivers/net/cxgb3/cxgb3_main.c
drivers/net/cxgb3/cxgb3_offload.h
drivers/net/cxgb3/regs.h
drivers/net/cxgb3/sge.c
drivers/net/cxgb3/t3_hw.c
include/rdma/ib_verbs.h
include/rdma/rdma_cm.h

index f504c9b..1b09b73 100644 (file)
@@ -1215,15 +1215,18 @@ static void ib_ucm_release_dev(struct device *dev)
 
        ucm_dev = container_of(dev, struct ib_ucm_device, dev);
        cdev_del(&ucm_dev->cdev);
-       clear_bit(ucm_dev->devnum, dev_map);
+       if (ucm_dev->devnum < IB_UCM_MAX_DEVICES)
+               clear_bit(ucm_dev->devnum, dev_map);
+       else
+               clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, dev_map);
        kfree(ucm_dev);
 }
 
 static const struct file_operations ucm_fops = {
-       .owner   = THIS_MODULE,
-       .open    = ib_ucm_open,
+       .owner   = THIS_MODULE,
+       .open    = ib_ucm_open,
        .release = ib_ucm_close,
-       .write   = ib_ucm_write,
+       .write   = ib_ucm_write,
        .poll    = ib_ucm_poll,
 };
 
@@ -1237,8 +1240,32 @@ static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
 }
 static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
 
+static dev_t overflow_maj;
+static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES);
+static int find_overflow_devnum(void)
+{
+       int ret;
+
+       if (!overflow_maj) {
+               ret = alloc_chrdev_region(&overflow_maj, 0, IB_UCM_MAX_DEVICES,
+                                         "infiniband_cm");
+               if (ret) {
+                       printk(KERN_ERR "ucm: couldn't register dynamic device number\n");
+                       return ret;
+               }
+       }
+
+       ret = find_first_zero_bit(overflow_map, IB_UCM_MAX_DEVICES);
+       if (ret >= IB_UCM_MAX_DEVICES)
+               return -1;
+
+       return ret;
+}
+
 static void ib_ucm_add_one(struct ib_device *device)
 {
+       int devnum;
+       dev_t base;
        struct ib_ucm_device *ucm_dev;
 
        if (!device->alloc_ucontext ||
@@ -1251,16 +1278,25 @@ static void ib_ucm_add_one(struct ib_device *device)
 
        ucm_dev->ib_dev = device;
 
-       ucm_dev->devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES);
-       if (ucm_dev->devnum >= IB_UCM_MAX_DEVICES)
-               goto err;
-
-       set_bit(ucm_dev->devnum, dev_map);
+       devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES);
+       if (devnum >= IB_UCM_MAX_DEVICES) {
+               devnum = find_overflow_devnum();
+               if (devnum < 0)
+                       goto err;
+
+               ucm_dev->devnum = devnum + IB_UCM_MAX_DEVICES;
+               base = devnum + overflow_maj;
+               set_bit(devnum, overflow_map);
+       } else {
+               ucm_dev->devnum = devnum;
+               base = devnum + IB_UCM_BASE_DEV;
+               set_bit(devnum, dev_map);
+       }
 
        cdev_init(&ucm_dev->cdev, &ucm_fops);
        ucm_dev->cdev.owner = THIS_MODULE;
        kobject_set_name(&ucm_dev->cdev.kobj, "ucm%d", ucm_dev->devnum);
-       if (cdev_add(&ucm_dev->cdev, IB_UCM_BASE_DEV + ucm_dev->devnum, 1))
+       if (cdev_add(&ucm_dev->cdev, base, 1))
                goto err;
 
        ucm_dev->dev.class = &cm_class;
@@ -1281,7 +1317,10 @@ err_dev:
        device_unregister(&ucm_dev->dev);
 err_cdev:
        cdev_del(&ucm_dev->cdev);
-       clear_bit(ucm_dev->devnum, dev_map);
+       if (ucm_dev->devnum < IB_UCM_MAX_DEVICES)
+               clear_bit(devnum, dev_map);
+       else
+               clear_bit(devnum, overflow_map);
 err:
        kfree(ucm_dev);
        return;
@@ -1340,6 +1379,8 @@ static void __exit ib_ucm_cleanup(void)
        ib_unregister_client(&ucm_client);
        class_remove_file(&cm_class, &class_attr_abi_version);
        unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES);
+       if (overflow_maj)
+               unregister_chrdev_region(overflow_maj, IB_UCM_MAX_DEVICES);
        idr_destroy(&ctx_id_table);
 }
 
index 7de0296..02d360c 100644 (file)
@@ -65,12 +65,9 @@ enum {
 };
 
 /*
- * Our lifetime rules for these structs are the following: each time a
- * device special file is opened, we look up the corresponding struct
- * ib_umad_port by minor in the umad_port[] table while holding the
- * port_lock.  If this lookup succeeds, we take a reference on the
- * ib_umad_port's struct ib_umad_device while still holding the
- * port_lock; if the lookup fails, we fail the open().  We drop these
+ * Our lifetime rules for these structs are the following: each time a
+ * device special file is opened, we take a reference on the
+ * ib_umad_port's struct ib_umad_device. We drop these
  * references in the corresponding close().
  *
  * In addition to references coming from open character devices, there
@@ -78,19 +75,14 @@ enum {
  * module's reference taken when allocating the ib_umad_device in
  * ib_umad_add_one().
  *
- * When destroying an ib_umad_device, we clear all of its
- * ib_umad_ports from umad_port[] while holding port_lock before
- * dropping the module's reference to the ib_umad_device.  This is
- * always safe because any open() calls will either succeed and obtain
- * a reference before we clear the umad_port[] entries, or fail after
- * we clear the umad_port[] entries.
+ * When destroying an ib_umad_device, we drop the module's reference.
  */
 
 struct ib_umad_port {
-       struct cdev           *cdev;
+       struct cdev           cdev;
        struct device         *dev;
 
-       struct cdev           *sm_cdev;
+       struct cdev           sm_cdev;
        struct device         *sm_dev;
        struct semaphore       sm_sem;
 
@@ -136,7 +128,6 @@ static struct class *umad_class;
 static const dev_t base_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE);
 
 static DEFINE_SPINLOCK(port_lock);
-static struct ib_umad_port *umad_port[IB_UMAD_MAX_PORTS];
 static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS);
 
 static void ib_umad_add_one(struct ib_device *device);
@@ -496,8 +487,8 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
                ah_attr.ah_flags = IB_AH_GRH;
                memcpy(ah_attr.grh.dgid.raw, packet->mad.hdr.gid, 16);
                ah_attr.grh.sgid_index     = packet->mad.hdr.gid_index;
-               ah_attr.grh.flow_label     = be32_to_cpu(packet->mad.hdr.flow_label);
-               ah_attr.grh.hop_limit      = packet->mad.hdr.hop_limit;
+               ah_attr.grh.flow_label     = be32_to_cpu(packet->mad.hdr.flow_label);
+               ah_attr.grh.hop_limit      = packet->mad.hdr.hop_limit;
                ah_attr.grh.traffic_class  = packet->mad.hdr.traffic_class;
        }
 
@@ -528,9 +519,9 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
                goto err_ah;
        }
 
-       packet->msg->ah         = ah;
+       packet->msg->ah         = ah;
        packet->msg->timeout_ms = packet->mad.hdr.timeout_ms;
-       packet->msg->retries    = packet->mad.hdr.retries;
+       packet->msg->retries    = packet->mad.hdr.retries;
        packet->msg->context[0] = packet;
 
        /* Copy MAD header.  Any RMPP header is already in place. */
@@ -779,15 +770,11 @@ static long ib_umad_compat_ioctl(struct file *filp, unsigned int cmd,
 /*
  * ib_umad_open() does not need the BKL:
  *
- *  - umad_port[] accesses are protected by port_lock, the
- *    ib_umad_port structures are properly reference counted, and
+ *  - the ib_umad_port structures are properly reference counted, and
  *    everything else is purely local to the file being created, so
  *    races against other open calls are not a problem;
  *  - the ioctl method does not affect any global state outside of the
  *    file structure being operated on;
- *  - the port is added to umad_port[] as the last part of module
- *    initialization so the open method will either immediately run
- *    -ENXIO, or all required initialization will be done.
  */
 static int ib_umad_open(struct inode *inode, struct file *filp)
 {
@@ -795,13 +782,10 @@ static int ib_umad_open(struct inode *inode, struct file *filp)
        struct ib_umad_file *file;
        int ret = 0;
 
-       spin_lock(&port_lock);
-       port = umad_port[iminor(inode) - IB_UMAD_MINOR_BASE];
+       port = container_of(inode->i_cdev, struct ib_umad_port, cdev);
        if (port)
                kref_get(&port->umad_dev->ref);
-       spin_unlock(&port_lock);
-
-       if (!port)
+       else
                return -ENXIO;
 
        mutex_lock(&port->file_mutex);
@@ -872,16 +856,16 @@ static int ib_umad_close(struct inode *inode, struct file *filp)
 }
 
 static const struct file_operations umad_fops = {
-       .owner          = THIS_MODULE,
-       .read           = ib_umad_read,
-       .write          = ib_umad_write,
-       .poll           = ib_umad_poll,
+       .owner          = THIS_MODULE,
+       .read           = ib_umad_read,
+       .write          = ib_umad_write,
+       .poll           = ib_umad_poll,
        .unlocked_ioctl = ib_umad_ioctl,
 #ifdef CONFIG_COMPAT
-       .compat_ioctl   = ib_umad_compat_ioctl,
+       .compat_ioctl   = ib_umad_compat_ioctl,
 #endif
-       .open           = ib_umad_open,
-       .release        = ib_umad_close
+       .open           = ib_umad_open,
+       .release        = ib_umad_close
 };
 
 static int ib_umad_sm_open(struct inode *inode, struct file *filp)
@@ -892,13 +876,10 @@ static int ib_umad_sm_open(struct inode *inode, struct file *filp)
        };
        int ret;
 
-       spin_lock(&port_lock);
-       port = umad_port[iminor(inode) - IB_UMAD_MINOR_BASE - IB_UMAD_MAX_PORTS];
+       port = container_of(inode->i_cdev, struct ib_umad_port, sm_cdev);
        if (port)
                kref_get(&port->umad_dev->ref);
-       spin_unlock(&port_lock);
-
-       if (!port)
+       else
                return -ENXIO;
 
        if (filp->f_flags & O_NONBLOCK) {
@@ -949,8 +930,8 @@ static int ib_umad_sm_close(struct inode *inode, struct file *filp)
 }
 
 static const struct file_operations umad_sm_fops = {
-       .owner   = THIS_MODULE,
-       .open    = ib_umad_sm_open,
+       .owner   = THIS_MODULE,
+       .open    = ib_umad_sm_open,
        .release = ib_umad_sm_close
 };
 
@@ -990,16 +971,51 @@ static ssize_t show_abi_version(struct class *class, char *buf)
 }
 static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
 
+static dev_t overflow_maj;
+static DECLARE_BITMAP(overflow_map, IB_UMAD_MAX_PORTS);
+static int find_overflow_devnum(void)
+{
+       int ret;
+
+       if (!overflow_maj) {
+               ret = alloc_chrdev_region(&overflow_maj, 0, IB_UMAD_MAX_PORTS * 2,
+                                         "infiniband_mad");
+               if (ret) {
+                       printk(KERN_ERR "user_mad: couldn't register dynamic device number\n");
+                       return ret;
+               }
+       }
+
+       ret = find_first_zero_bit(overflow_map, IB_UMAD_MAX_PORTS);
+       if (ret >= IB_UMAD_MAX_PORTS)
+               return -1;
+
+       return ret;
+}
+
 static int ib_umad_init_port(struct ib_device *device, int port_num,
                             struct ib_umad_port *port)
 {
+       int devnum;
+       dev_t base;
+
        spin_lock(&port_lock);
-       port->dev_num = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS);
-       if (port->dev_num >= IB_UMAD_MAX_PORTS) {
+       devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS);
+       if (devnum >= IB_UMAD_MAX_PORTS) {
                spin_unlock(&port_lock);
-               return -1;
+               devnum = find_overflow_devnum();
+               if (devnum < 0)
+                       return -1;
+
+               spin_lock(&port_lock);
+               port->dev_num = devnum + IB_UMAD_MAX_PORTS;
+               base = devnum + overflow_maj;
+               set_bit(devnum, overflow_map);
+       } else {
+               port->dev_num = devnum;
+               base = devnum + base_dev;
+               set_bit(devnum, dev_map);
        }
-       set_bit(port->dev_num, dev_map);
        spin_unlock(&port_lock);
 
        port->ib_dev   = device;
@@ -1008,17 +1024,14 @@ static int ib_umad_init_port(struct ib_device *device, int port_num,
        mutex_init(&port->file_mutex);
        INIT_LIST_HEAD(&port->file_list);
 
-       port->cdev = cdev_alloc();
-       if (!port->cdev)
-               return -1;
-       port->cdev->owner = THIS_MODULE;
-       port->cdev->ops   = &umad_fops;
-       kobject_set_name(&port->cdev->kobj, "umad%d", port->dev_num);
-       if (cdev_add(port->cdev, base_dev + port->dev_num, 1))
+       cdev_init(&port->cdev, &umad_fops);
+       port->cdev.owner = THIS_MODULE;
+       kobject_set_name(&port->cdev.kobj, "umad%d", port->dev_num);
+       if (cdev_add(&port->cdev, base, 1))
                goto err_cdev;
 
        port->dev = device_create(umad_class, device->dma_device,
-                                 port->cdev->dev, port,
+                                 port->cdev.dev, port,
                                  "umad%d", port->dev_num);
        if (IS_ERR(port->dev))
                goto err_cdev;
@@ -1028,17 +1041,15 @@ static int ib_umad_init_port(struct ib_device *device, int port_num,
        if (device_create_file(port->dev, &dev_attr_port))
                goto err_dev;
 
-       port->sm_cdev = cdev_alloc();
-       if (!port->sm_cdev)
-               goto err_dev;
-       port->sm_cdev->owner = THIS_MODULE;
-       port->sm_cdev->ops   = &umad_sm_fops;
-       kobject_set_name(&port->sm_cdev->kobj, "issm%d", port->dev_num);
-       if (cdev_add(port->sm_cdev, base_dev + port->dev_num + IB_UMAD_MAX_PORTS, 1))
+       base += IB_UMAD_MAX_PORTS;
+       cdev_init(&port->sm_cdev, &umad_sm_fops);
+       port->sm_cdev.owner = THIS_MODULE;
+       kobject_set_name(&port->sm_cdev.kobj, "issm%d", port->dev_num);
+       if (cdev_add(&port->sm_cdev, base, 1))
                goto err_sm_cdev;
 
        port->sm_dev = device_create(umad_class, device->dma_device,
-                                    port->sm_cdev->dev, port,
+                                    port->sm_cdev.dev, port,
                                     "issm%d", port->dev_num);
        if (IS_ERR(port->sm_dev))
                goto err_sm_cdev;
@@ -1048,24 +1059,23 @@ static int ib_umad_init_port(struct ib_device *device, int port_num,
        if (device_create_file(port->sm_dev, &dev_attr_port))
                goto err_sm_dev;
 
-       spin_lock(&port_lock);
-       umad_port[port->dev_num] = port;
-       spin_unlock(&port_lock);
-
        return 0;
 
 err_sm_dev:
-       device_destroy(umad_class, port->sm_cdev->dev);
+       device_destroy(umad_class, port->sm_cdev.dev);
 
 err_sm_cdev:
-       cdev_del(port->sm_cdev);
+       cdev_del(&port->sm_cdev);
 
 err_dev:
-       device_destroy(umad_class, port->cdev->dev);
+       device_destroy(umad_class, port->cdev.dev);
 
 err_cdev:
-       cdev_del(port->cdev);
-       clear_bit(port->dev_num, dev_map);
+       cdev_del(&port->cdev);
+       if (port->dev_num < IB_UMAD_MAX_PORTS)
+               clear_bit(devnum, dev_map);
+       else
+               clear_bit(devnum, overflow_map);
 
        return -1;
 }
@@ -1079,15 +1089,11 @@ static void ib_umad_kill_port(struct ib_umad_port *port)
        dev_set_drvdata(port->dev,    NULL);
        dev_set_drvdata(port->sm_dev, NULL);
 
-       device_destroy(umad_class, port->cdev->dev);
-       device_destroy(umad_class, port->sm_cdev->dev);
+       device_destroy(umad_class, port->cdev.dev);
+       device_destroy(umad_class, port->sm_cdev.dev);
 
-       cdev_del(port->cdev);
-       cdev_del(port->sm_cdev);
-
-       spin_lock(&port_lock);
-       umad_port[port->dev_num] = NULL;
-       spin_unlock(&port_lock);
+       cdev_del(&port->cdev);
+       cdev_del(&port->sm_cdev);
 
        mutex_lock(&port->file_mutex);
 
@@ -1106,7 +1112,10 @@ static void ib_umad_kill_port(struct ib_umad_port *port)
 
        mutex_unlock(&port->file_mutex);
 
-       clear_bit(port->dev_num, dev_map);
+       if (port->dev_num < IB_UMAD_MAX_PORTS)
+               clear_bit(port->dev_num, dev_map);
+       else
+               clear_bit(port->dev_num - IB_UMAD_MAX_PORTS, overflow_map);
 }
 
 static void ib_umad_add_one(struct ib_device *device)
@@ -1214,6 +1223,8 @@ static void __exit ib_umad_cleanup(void)
        ib_unregister_client(&umad_client);
        class_destroy(umad_class);
        unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2);
+       if (overflow_maj)
+               unregister_chrdev_region(overflow_maj, IB_UMAD_MAX_PORTS * 2);
 }
 
 module_init(ib_umad_init);
index b3ea958..e54d9ac 100644 (file)
@@ -41,6 +41,7 @@
 #include <linux/idr.h>
 #include <linux/mutex.h>
 #include <linux/completion.h>
+#include <linux/cdev.h>
 
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_umem.h>
 
 struct ib_uverbs_device {
        struct kref                             ref;
+       int                                     num_comp_vectors;
        struct completion                       comp;
-       int                                     devnum;
-       struct cdev                            *cdev;
        struct device                          *dev;
        struct ib_device                       *ib_dev;
-       int                                     num_comp_vectors;
+       int                                     devnum;
+       struct cdev                             cdev;
 };
 
 struct ib_uverbs_event_file {
        struct kref                             ref;
+       int                                     is_async;
        struct ib_uverbs_file                  *uverbs_file;
        spinlock_t                              lock;
+       int                                     is_closed;
        wait_queue_head_t                       poll_wait;
        struct fasync_struct                   *async_queue;
        struct list_head                        event_list;
-       int                                     is_async;
-       int                                     is_closed;
 };
 
 struct ib_uverbs_file {
index 82b60c6..ff59a79 100644 (file)
@@ -73,40 +73,39 @@ DEFINE_IDR(ib_uverbs_qp_idr);
 DEFINE_IDR(ib_uverbs_srq_idr);
 
 static DEFINE_SPINLOCK(map_lock);
-static struct ib_uverbs_device *dev_table[IB_UVERBS_MAX_DEVICES];
 static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);
 
 static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
                                     const char __user *buf, int in_len,
                                     int out_len) = {
-       [IB_USER_VERBS_CMD_GET_CONTEXT]         = ib_uverbs_get_context,
-       [IB_USER_VERBS_CMD_QUERY_DEVICE]        = ib_uverbs_query_device,
-       [IB_USER_VERBS_CMD_QUERY_PORT]          = ib_uverbs_query_port,
-       [IB_USER_VERBS_CMD_ALLOC_PD]            = ib_uverbs_alloc_pd,
-       [IB_USER_VERBS_CMD_DEALLOC_PD]          = ib_uverbs_dealloc_pd,
-       [IB_USER_VERBS_CMD_REG_MR]              = ib_uverbs_reg_mr,
-       [IB_USER_VERBS_CMD_DEREG_MR]            = ib_uverbs_dereg_mr,
+       [IB_USER_VERBS_CMD_GET_CONTEXT]         = ib_uverbs_get_context,
+       [IB_USER_VERBS_CMD_QUERY_DEVICE]        = ib_uverbs_query_device,
+       [IB_USER_VERBS_CMD_QUERY_PORT]          = ib_uverbs_query_port,
+       [IB_USER_VERBS_CMD_ALLOC_PD]            = ib_uverbs_alloc_pd,
+       [IB_USER_VERBS_CMD_DEALLOC_PD]          = ib_uverbs_dealloc_pd,
+       [IB_USER_VERBS_CMD_REG_MR]              = ib_uverbs_reg_mr,
+       [IB_USER_VERBS_CMD_DEREG_MR]            = ib_uverbs_dereg_mr,
        [IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL] = ib_uverbs_create_comp_channel,
-       [IB_USER_VERBS_CMD_CREATE_CQ]           = ib_uverbs_create_cq,
-       [IB_USER_VERBS_CMD_RESIZE_CQ]           = ib_uverbs_resize_cq,
-       [IB_USER_VERBS_CMD_POLL_CQ]             = ib_uverbs_poll_cq,
-       [IB_USER_VERBS_CMD_REQ_NOTIFY_CQ]       = ib_uverbs_req_notify_cq,
-       [IB_USER_VERBS_CMD_DESTROY_CQ]          = ib_uverbs_destroy_cq,
-       [IB_USER_VERBS_CMD_CREATE_QP]           = ib_uverbs_create_qp,
-       [IB_USER_VERBS_CMD_QUERY_QP]            = ib_uverbs_query_qp,
-       [IB_USER_VERBS_CMD_MODIFY_QP]           = ib_uverbs_modify_qp,
-       [IB_USER_VERBS_CMD_DESTROY_QP]          = ib_uverbs_destroy_qp,
-       [IB_USER_VERBS_CMD_POST_SEND]           = ib_uverbs_post_send,
-       [IB_USER_VERBS_CMD_POST_RECV]           = ib_uverbs_post_recv,
-       [IB_USER_VERBS_CMD_POST_SRQ_RECV]       = ib_uverbs_post_srq_recv,
-       [IB_USER_VERBS_CMD_CREATE_AH]           = ib_uverbs_create_ah,
-       [IB_USER_VERBS_CMD_DESTROY_AH]          = ib_uverbs_destroy_ah,
-       [IB_USER_VERBS_CMD_ATTACH_MCAST]        = ib_uverbs_attach_mcast,
-       [IB_USER_VERBS_CMD_DETACH_MCAST]        = ib_uverbs_detach_mcast,
-       [IB_USER_VERBS_CMD_CREATE_SRQ]          = ib_uverbs_create_srq,
-       [IB_USER_VERBS_CMD_MODIFY_SRQ]          = ib_uverbs_modify_srq,
-       [IB_USER_VERBS_CMD_QUERY_SRQ]           = ib_uverbs_query_srq,
-       [IB_USER_VERBS_CMD_DESTROY_SRQ]         = ib_uverbs_destroy_srq,
+       [IB_USER_VERBS_CMD_CREATE_CQ]           = ib_uverbs_create_cq,
+       [IB_USER_VERBS_CMD_RESIZE_CQ]           = ib_uverbs_resize_cq,
+       [IB_USER_VERBS_CMD_POLL_CQ]             = ib_uverbs_poll_cq,
+       [IB_USER_VERBS_CMD_REQ_NOTIFY_CQ]       = ib_uverbs_req_notify_cq,
+       [IB_USER_VERBS_CMD_DESTROY_CQ]          = ib_uverbs_destroy_cq,
+       [IB_USER_VERBS_CMD_CREATE_QP]           = ib_uverbs_create_qp,
+       [IB_USER_VERBS_CMD_QUERY_QP]            = ib_uverbs_query_qp,
+       [IB_USER_VERBS_CMD_MODIFY_QP]           = ib_uverbs_modify_qp,
+       [IB_USER_VERBS_CMD_DESTROY_QP]          = ib_uverbs_destroy_qp,
+       [IB_USER_VERBS_CMD_POST_SEND]           = ib_uverbs_post_send,
+       [IB_USER_VERBS_CMD_POST_RECV]           = ib_uverbs_post_recv,
+       [IB_USER_VERBS_CMD_POST_SRQ_RECV]       = ib_uverbs_post_srq_recv,
+       [IB_USER_VERBS_CMD_CREATE_AH]           = ib_uverbs_create_ah,
+       [IB_USER_VERBS_CMD_DESTROY_AH]          = ib_uverbs_destroy_ah,
+       [IB_USER_VERBS_CMD_ATTACH_MCAST]        = ib_uverbs_attach_mcast,
+       [IB_USER_VERBS_CMD_DETACH_MCAST]        = ib_uverbs_detach_mcast,
+       [IB_USER_VERBS_CMD_CREATE_SRQ]          = ib_uverbs_create_srq,
+       [IB_USER_VERBS_CMD_MODIFY_SRQ]          = ib_uverbs_modify_srq,
+       [IB_USER_VERBS_CMD_QUERY_SRQ]           = ib_uverbs_query_srq,
+       [IB_USER_VERBS_CMD_DESTROY_SRQ]         = ib_uverbs_destroy_srq,
 };
 
 static void ib_uverbs_add_one(struct ib_device *device);
@@ -366,7 +365,7 @@ static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
 
 static const struct file_operations uverbs_event_fops = {
        .owner   = THIS_MODULE,
-       .read    = ib_uverbs_event_read,
+       .read    = ib_uverbs_event_read,
        .poll    = ib_uverbs_event_poll,
        .release = ib_uverbs_event_close,
        .fasync  = ib_uverbs_event_fasync
@@ -601,14 +600,12 @@ static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
 /*
  * ib_uverbs_open() does not need the BKL:
  *
- *  - dev_table[] accesses are protected by map_lock, the
- *    ib_uverbs_device structures are properly reference counted, and
+ *  - the ib_uverbs_device structures are properly reference counted and
  *    everything else is purely local to the file being created, so
  *    races against other open calls are not a problem;
  *  - there is no ioctl method to race against;
- *  - the device is added to dev_table[] as the last part of module
- *    initialization, the open method will either immediately run
- *    -ENXIO, or all required initialization will be done.
+ *  - the open method will either immediately return -ENXIO, or all
+ *    required initialization will be done.
  */
 static int ib_uverbs_open(struct inode *inode, struct file *filp)
 {
@@ -616,13 +613,10 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
        struct ib_uverbs_file *file;
        int ret;
 
-       spin_lock(&map_lock);
-       dev = dev_table[iminor(inode) - IB_UVERBS_BASE_MINOR];
+       dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev);
        if (dev)
                kref_get(&dev->ref);
-       spin_unlock(&map_lock);
-
-       if (!dev)
+       else
                return -ENXIO;
 
        if (!try_module_get(dev->ib_dev->owner)) {
@@ -669,17 +663,17 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp)
 }
 
 static const struct file_operations uverbs_fops = {
-       .owner   = THIS_MODULE,
-       .write   = ib_uverbs_write,
-       .open    = ib_uverbs_open,
+       .owner   = THIS_MODULE,
+       .write   = ib_uverbs_write,
+       .open    = ib_uverbs_open,
        .release = ib_uverbs_close
 };
 
 static const struct file_operations uverbs_mmap_fops = {
-       .owner   = THIS_MODULE,
-       .write   = ib_uverbs_write,
+       .owner   = THIS_MODULE,
+       .write   = ib_uverbs_write,
        .mmap    = ib_uverbs_mmap,
-       .open    = ib_uverbs_open,
+       .open    = ib_uverbs_open,
        .release = ib_uverbs_close
 };
 
@@ -719,8 +713,38 @@ static ssize_t show_abi_version(struct class *class, char *buf)
 }
 static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
 
+static dev_t overflow_maj;
+static DECLARE_BITMAP(overflow_map, IB_UVERBS_MAX_DEVICES);
+
+/*
+ * If we have more than IB_UVERBS_MAX_DEVICES, dynamically overflow by
+ * requesting a new major number and doubling the number of max devices we
+ * support. It's stupid, but simple.
+ */
+static int find_overflow_devnum(void)
+{
+       int ret;
+
+       if (!overflow_maj) {
+               ret = alloc_chrdev_region(&overflow_maj, 0, IB_UVERBS_MAX_DEVICES,
+                                         "infiniband_verbs");
+               if (ret) {
+                       printk(KERN_ERR "user_verbs: couldn't register dynamic device number\n");
+                       return ret;
+               }
+       }
+
+       ret = find_first_zero_bit(overflow_map, IB_UVERBS_MAX_DEVICES);
+       if (ret >= IB_UVERBS_MAX_DEVICES)
+               return -1;
+
+       return ret;
+}
+
 static void ib_uverbs_add_one(struct ib_device *device)
 {
+       int devnum;
+       dev_t base;
        struct ib_uverbs_device *uverbs_dev;
 
        if (!device->alloc_ucontext)
@@ -734,28 +758,36 @@ static void ib_uverbs_add_one(struct ib_device *device)
        init_completion(&uverbs_dev->comp);
 
        spin_lock(&map_lock);
-       uverbs_dev->devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
-       if (uverbs_dev->devnum >= IB_UVERBS_MAX_DEVICES) {
+       devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
+       if (devnum >= IB_UVERBS_MAX_DEVICES) {
                spin_unlock(&map_lock);
-               goto err;
+               devnum = find_overflow_devnum();
+               if (devnum < 0)
+                       goto err;
+
+               spin_lock(&map_lock);
+               uverbs_dev->devnum = devnum + IB_UVERBS_MAX_DEVICES;
+               base = devnum + overflow_maj;
+               set_bit(devnum, overflow_map);
+       } else {
+               uverbs_dev->devnum = devnum;
+               base = devnum + IB_UVERBS_BASE_DEV;
+               set_bit(devnum, dev_map);
        }
-       set_bit(uverbs_dev->devnum, dev_map);
        spin_unlock(&map_lock);
 
        uverbs_dev->ib_dev           = device;
        uverbs_dev->num_comp_vectors = device->num_comp_vectors;
 
-       uverbs_dev->cdev = cdev_alloc();
-       if (!uverbs_dev->cdev)
-               goto err;
-       uverbs_dev->cdev->owner = THIS_MODULE;
-       uverbs_dev->cdev->ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops;
-       kobject_set_name(&uverbs_dev->cdev->kobj, "uverbs%d", uverbs_dev->devnum);
-       if (cdev_add(uverbs_dev->cdev, IB_UVERBS_BASE_DEV + uverbs_dev->devnum, 1))
+       cdev_init(&uverbs_dev->cdev, NULL);
+       uverbs_dev->cdev.owner = THIS_MODULE;
+       uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops;
+       kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum);
+       if (cdev_add(&uverbs_dev->cdev, base, 1))
                goto err_cdev;
 
        uverbs_dev->dev = device_create(uverbs_class, device->dma_device,
-                                       uverbs_dev->cdev->dev, uverbs_dev,
+                                       uverbs_dev->cdev.dev, uverbs_dev,
                                        "uverbs%d", uverbs_dev->devnum);
        if (IS_ERR(uverbs_dev->dev))
                goto err_cdev;
@@ -765,20 +797,19 @@ static void ib_uverbs_add_one(struct ib_device *device)
        if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version))
                goto err_class;
 
-       spin_lock(&map_lock);
-       dev_table[uverbs_dev->devnum] = uverbs_dev;
-       spin_unlock(&map_lock);
-
        ib_set_client_data(device, &uverbs_client, uverbs_dev);
 
        return;
 
 err_class:
-       device_destroy(uverbs_class, uverbs_dev->cdev->dev);
+       device_destroy(uverbs_class, uverbs_dev->cdev.dev);
 
 err_cdev:
-       cdev_del(uverbs_dev->cdev);
-       clear_bit(uverbs_dev->devnum, dev_map);
+       cdev_del(&uverbs_dev->cdev);
+       if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES)
+               clear_bit(devnum, dev_map);
+       else
+               clear_bit(devnum, overflow_map);
 
 err:
        kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
@@ -795,14 +826,13 @@ static void ib_uverbs_remove_one(struct ib_device *device)
                return;
 
        dev_set_drvdata(uverbs_dev->dev, NULL);
-       device_destroy(uverbs_class, uverbs_dev->cdev->dev);
-       cdev_del(uverbs_dev->cdev);
+       device_destroy(uverbs_class, uverbs_dev->cdev.dev);
+       cdev_del(&uverbs_dev->cdev);
 
-       spin_lock(&map_lock);
-       dev_table[uverbs_dev->devnum] = NULL;
-       spin_unlock(&map_lock);
-
-       clear_bit(uverbs_dev->devnum, dev_map);
+       if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES)
+               clear_bit(uverbs_dev->devnum, dev_map);
+       else
+               clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map);
 
        kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
        wait_for_completion(&uverbs_dev->comp);
@@ -856,6 +886,8 @@ static void __exit ib_uverbs_cleanup(void)
        ib_unregister_client(&uverbs_client);
        class_destroy(uverbs_class);
        unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES);
+       if (overflow_maj)
+               unregister_chrdev_region(overflow_maj, IB_UVERBS_MAX_DEVICES);
        idr_destroy(&ib_uverbs_pd_idr);
        idr_destroy(&ib_uverbs_mr_idr);
        idr_destroy(&ib_uverbs_mw_idr);
index 0677fc7..a28e862 100644 (file)
@@ -109,7 +109,6 @@ int cxio_hal_cq_op(struct cxio_rdev *rdev_p, struct t3_cq *cq,
                while (!CQ_VLD_ENTRY(rptr, cq->size_log2, cqe)) {
                        udelay(1);
                        if (i++ > 1000000) {
-                               BUG_ON(1);
                                printk(KERN_ERR "%s: stalled rnic\n",
                                       rdev_p->dev_name);
                                return -EIO;
@@ -155,7 +154,7 @@ static int cxio_hal_clear_qp_ctx(struct cxio_rdev *rdev_p, u32 qpid)
        return iwch_cxgb3_ofld_send(rdev_p->t3cdev_p, skb);
 }
 
-int cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq)
+int cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq, int kernel)
 {
        struct rdma_cq_setup setup;
        int size = (1UL << (cq->size_log2)) * sizeof(struct t3_cqe);
@@ -163,12 +162,12 @@ int cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq)
        cq->cqid = cxio_hal_get_cqid(rdev_p->rscp);
        if (!cq->cqid)
                return -ENOMEM;
-       cq->sw_queue = kzalloc(size, GFP_KERNEL);
-       if (!cq->sw_queue)
-               return -ENOMEM;
-       cq->queue = dma_alloc_coherent(&(rdev_p->rnic_info.pdev->dev),
-                                            (1UL << (cq->size_log2)) *
-                                            sizeof(struct t3_cqe),
+       if (kernel) {
+               cq->sw_queue = kzalloc(size, GFP_KERNEL);
+               if (!cq->sw_queue)
+                       return -ENOMEM;
+       }
+       cq->queue = dma_alloc_coherent(&(rdev_p->rnic_info.pdev->dev), size,
                                             &(cq->dma_addr), GFP_KERNEL);
        if (!cq->queue) {
                kfree(cq->sw_queue);
index f3d440c..073373c 100644 (file)
@@ -53,7 +53,7 @@
 #define T3_MAX_PBL_SIZE 256
 #define T3_MAX_RQ_SIZE 1024
 #define T3_MAX_QP_DEPTH (T3_MAX_RQ_SIZE-1)
-#define T3_MAX_CQ_DEPTH 8192
+#define T3_MAX_CQ_DEPTH 262144
 #define T3_MAX_NUM_STAG (1<<15)
 #define T3_MAX_MR_SIZE 0x100000000ULL
 #define T3_PAGESIZE_MASK 0xffff000  /* 4KB-128MB */
@@ -157,7 +157,7 @@ int cxio_rdev_open(struct cxio_rdev *rdev);
 void cxio_rdev_close(struct cxio_rdev *rdev);
 int cxio_hal_cq_op(struct cxio_rdev *rdev, struct t3_cq *cq,
                   enum t3_cq_opcode op, u32 credit);
-int cxio_create_cq(struct cxio_rdev *rdev, struct t3_cq *cq);
+int cxio_create_cq(struct cxio_rdev *rdev, struct t3_cq *cq, int kernel);
 int cxio_destroy_cq(struct cxio_rdev *rdev, struct t3_cq *cq);
 int cxio_resize_cq(struct cxio_rdev *rdev, struct t3_cq *cq);
 void cxio_release_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx);
index a197a5b..15073b2 100644 (file)
@@ -730,7 +730,22 @@ struct t3_cq {
 
 static inline void cxio_set_wq_in_error(struct t3_wq *wq)
 {
-       wq->queue->wq_in_err.err = 1;
+       wq->queue->wq_in_err.err |= 1;
+}
+
+static inline void cxio_disable_wq_db(struct t3_wq *wq)
+{
+       wq->queue->wq_in_err.err |= 2;
+}
+
+static inline void cxio_enable_wq_db(struct t3_wq *wq)
+{
+       wq->queue->wq_in_err.err &= ~2;
+}
+
+static inline int cxio_wq_db_enabled(struct t3_wq *wq)
+{
+       return !(wq->queue->wq_in_err.err & 2);
 }
 
 static inline struct t3_cqe *cxio_next_hw_cqe(struct t3_cq *cq)
index b0ea010..ee1d8b4 100644 (file)
@@ -65,6 +65,46 @@ struct cxgb3_client t3c_client = {
 static LIST_HEAD(dev_list);
 static DEFINE_MUTEX(dev_mutex);
 
+static int disable_qp_db(int id, void *p, void *data)
+{
+       struct iwch_qp *qhp = p;
+
+       cxio_disable_wq_db(&qhp->wq);
+       return 0;
+}
+
+static int enable_qp_db(int id, void *p, void *data)
+{
+       struct iwch_qp *qhp = p;
+
+       if (data)
+               ring_doorbell(qhp->rhp->rdev.ctrl_qp.doorbell, qhp->wq.qpid);
+       cxio_enable_wq_db(&qhp->wq);
+       return 0;
+}
+
+static void disable_dbs(struct iwch_dev *rnicp)
+{
+       spin_lock_irq(&rnicp->lock);
+       idr_for_each(&rnicp->qpidr, disable_qp_db, NULL);
+       spin_unlock_irq(&rnicp->lock);
+}
+
+static void enable_dbs(struct iwch_dev *rnicp, int ring_db)
+{
+       spin_lock_irq(&rnicp->lock);
+       idr_for_each(&rnicp->qpidr, enable_qp_db,
+                    (void *)(unsigned long)ring_db);
+       spin_unlock_irq(&rnicp->lock);
+}
+
+static void iwch_db_drop_task(struct work_struct *work)
+{
+       struct iwch_dev *rnicp = container_of(work, struct iwch_dev,
+                                             db_drop_task.work);
+       enable_dbs(rnicp, 1);
+}
+
 static void rnic_init(struct iwch_dev *rnicp)
 {
        PDBG("%s iwch_dev %p\n", __func__,  rnicp);
@@ -72,6 +112,7 @@ static void rnic_init(struct iwch_dev *rnicp)
        idr_init(&rnicp->qpidr);
        idr_init(&rnicp->mmidr);
        spin_lock_init(&rnicp->lock);
+       INIT_DELAYED_WORK(&rnicp->db_drop_task, iwch_db_drop_task);
 
        rnicp->attr.max_qps = T3_MAX_NUM_QP - 32;
        rnicp->attr.max_wrs = T3_MAX_QP_DEPTH;
@@ -147,6 +188,8 @@ static void close_rnic_dev(struct t3cdev *tdev)
        mutex_lock(&dev_mutex);
        list_for_each_entry_safe(dev, tmp, &dev_list, entry) {
                if (dev->rdev.t3cdev_p == tdev) {
+                       dev->rdev.flags = CXIO_ERROR_FATAL;
+                       cancel_delayed_work_sync(&dev->db_drop_task);
                        list_del(&dev->entry);
                        iwch_unregister_device(dev);
                        cxio_rdev_close(&dev->rdev);
@@ -165,7 +208,8 @@ static void iwch_event_handler(struct t3cdev *tdev, u32 evt, u32 port_id)
        struct cxio_rdev *rdev = tdev->ulp;
        struct iwch_dev *rnicp;
        struct ib_event event;
-       u32    portnum = port_id + 1;
+       u32 portnum = port_id + 1;
+       int dispatch = 0;
 
        if (!rdev)
                return;
@@ -174,21 +218,49 @@ static void iwch_event_handler(struct t3cdev *tdev, u32 evt, u32 port_id)
        case OFFLOAD_STATUS_DOWN: {
                rdev->flags = CXIO_ERROR_FATAL;
                event.event  = IB_EVENT_DEVICE_FATAL;
+               dispatch = 1;
                break;
                }
        case OFFLOAD_PORT_DOWN: {
                event.event  = IB_EVENT_PORT_ERR;
+               dispatch = 1;
                break;
                }
        case OFFLOAD_PORT_UP: {
                event.event  = IB_EVENT_PORT_ACTIVE;
+               dispatch = 1;
+               break;
+               }
+       case OFFLOAD_DB_FULL: {
+               disable_dbs(rnicp);
+               break;
+               }
+       case OFFLOAD_DB_EMPTY: {
+               enable_dbs(rnicp, 1);
+               break;
+               }
+       case OFFLOAD_DB_DROP: {
+               unsigned long delay = 1000;
+               unsigned short r;
+
+               disable_dbs(rnicp);
+               get_random_bytes(&r, 2);
+               delay += r & 1023;
+
+               /*
+                * delay is between 1000-2023 usecs.
+                */
+               schedule_delayed_work(&rnicp->db_drop_task,
+                       usecs_to_jiffies(delay));
                break;
                }
        }
 
-       event.device = &rnicp->ibdev;
-       event.element.port_num = portnum;
-       ib_dispatch_event(&event);
+       if (dispatch) {
+               event.device = &rnicp->ibdev;
+               event.element.port_num = portnum;
+               ib_dispatch_event(&event);
+       }
 
        return;
 }
index 8473550..a1c4457 100644 (file)
@@ -36,6 +36,7 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/idr.h>
+#include <linux/workqueue.h>
 
 #include <rdma/ib_verbs.h>
 
@@ -110,6 +111,7 @@ struct iwch_dev {
        struct idr mmidr;
        spinlock_t lock;
        struct list_head entry;
+       struct delayed_work db_drop_task;
 };
 
 static inline struct iwch_dev *to_iwch_dev(struct ib_device *ibdev)
index ed71755..47b35c6 100644 (file)
@@ -187,7 +187,7 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, int entries, int ve
        entries = roundup_pow_of_two(entries);
        chp->cq.size_log2 = ilog2(entries);
 
-       if (cxio_create_cq(&rhp->rdev, &chp->cq)) {
+       if (cxio_create_cq(&rhp->rdev, &chp->cq, !ucontext)) {
                kfree(chp);
                return ERR_PTR(-ENOMEM);
        }
index 3eb8cec..b4d893d 100644 (file)
@@ -452,7 +452,8 @@ int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
                ++(qhp->wq.sq_wptr);
        }
        spin_unlock_irqrestore(&qhp->lock, flag);
-       ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid);
+       if (cxio_wq_db_enabled(&qhp->wq))
+               ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid);
 
 out:
        if (err)
@@ -514,7 +515,8 @@ int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
                num_wrs--;
        }
        spin_unlock_irqrestore(&qhp->lock, flag);
-       ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid);
+       if (cxio_wq_db_enabled(&qhp->wq))
+               ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid);
 
 out:
        if (err)
@@ -597,7 +599,8 @@ int iwch_bind_mw(struct ib_qp *qp,
        ++(qhp->wq.sq_wptr);
        spin_unlock_irqrestore(&qhp->lock, flag);
 
-       ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid);
+       if (cxio_wq_db_enabled(&qhp->wq))
+               ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid);
 
        return err;
 }
index 42be0b1..b2b6fea 100644 (file)
@@ -548,11 +548,10 @@ void ehca_process_eq(struct ehca_shca *shca, int is_irq)
        struct ehca_eq *eq = &shca->eq;
        struct ehca_eqe_cache_entry *eqe_cache = eq->eqe_cache;
        u64 eqe_value, ret;
-       unsigned long flags;
        int eqe_cnt, i;
        int eq_empty = 0;
 
-       spin_lock_irqsave(&eq->irq_spinlock, flags);
+       spin_lock(&eq->irq_spinlock);
        if (is_irq) {
                const int max_query_cnt = 100;
                int query_cnt = 0;
@@ -643,7 +642,7 @@ void ehca_process_eq(struct ehca_shca *shca, int is_irq)
        } while (1);
 
 unlock_irq_spinlock:
-       spin_unlock_irqrestore(&eq->irq_spinlock, flags);
+       spin_unlock(&eq->irq_spinlock);
 }
 
 void ehca_tasklet_eq(unsigned long data)
index 0338f1f..b105f66 100644 (file)
@@ -55,9 +55,7 @@ static struct kmem_cache *qp_cache;
 /*
  * attributes not supported by query qp
  */
-#define QP_ATTR_QUERY_NOT_SUPPORTED (IB_QP_MAX_DEST_RD_ATOMIC | \
-                                    IB_QP_MAX_QP_RD_ATOMIC   | \
-                                    IB_QP_ACCESS_FLAGS       | \
+#define QP_ATTR_QUERY_NOT_SUPPORTED (IB_QP_ACCESS_FLAGS       | \
                                     IB_QP_EN_SQD_ASYNC_NOTIFY)
 
 /*
index 8c1213f..dba8f9f 100644 (file)
@@ -222,7 +222,7 @@ int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
 {
        int ret;
 
-       if (!port_num || port_num > ibdev->phys_port_cnt)
+       if (!port_num || port_num > ibdev->phys_port_cnt || !in_wc)
                return IB_MAD_RESULT_FAILURE;
 
        /* accept only pma request */
index a182352..ae75389 100644 (file)
@@ -1214,7 +1214,7 @@ out:
 static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
                            void *wqe, unsigned *mlx_seg_len)
 {
-       struct ib_device *ib_dev = &to_mdev(sqp->qp.ibqp.device)->ib_dev;
+       struct ib_device *ib_dev = sqp->qp.ibqp.device;
        struct mlx4_wqe_mlx_seg *mlx = wqe;
        struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
        struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
index b9d09ba..4272c52 100644 (file)
@@ -110,6 +110,7 @@ static unsigned int sysfs_idx_addr;
 
 static struct pci_device_id nes_pci_table[] = {
        {PCI_VENDOR_ID_NETEFFECT, PCI_DEVICE_ID_NETEFFECT_NE020, PCI_ANY_ID, PCI_ANY_ID},
+       {PCI_VENDOR_ID_NETEFFECT, PCI_DEVICE_ID_NETEFFECT_NE020_KR, PCI_ANY_ID, PCI_ANY_ID},
        {0}
 };
 
index 9884056..cc78fee 100644 (file)
@@ -64,8 +64,9 @@
  * NetEffect PCI vendor id and NE010 PCI device id.
  */
 #ifndef PCI_VENDOR_ID_NETEFFECT        /* not in pci.ids yet */
-#define PCI_VENDOR_ID_NETEFFECT       0x1678
-#define PCI_DEVICE_ID_NETEFFECT_NE020 0x0100
+#define PCI_VENDOR_ID_NETEFFECT          0x1678
+#define PCI_DEVICE_ID_NETEFFECT_NE020    0x0100
+#define PCI_DEVICE_ID_NETEFFECT_NE020_KR 0x0110
 #endif
 
 #define NE020_REV   4
@@ -193,8 +194,8 @@ extern u32 cm_packets_created;
 extern u32 cm_packets_received;
 extern u32 cm_packets_dropped;
 extern u32 cm_packets_retrans;
-extern u32 cm_listens_created;
-extern u32 cm_listens_destroyed;
+extern atomic_t cm_listens_created;
+extern atomic_t cm_listens_destroyed;
 extern u32 cm_backlog_drops;
 extern atomic_t cm_loopbacks;
 extern atomic_t cm_nodes_created;
index 39468c2..2a49ee4 100644 (file)
@@ -67,8 +67,8 @@ u32 cm_packets_dropped;
 u32 cm_packets_retrans;
 u32 cm_packets_created;
 u32 cm_packets_received;
-u32 cm_listens_created;
-u32 cm_listens_destroyed;
+atomic_t cm_listens_created;
+atomic_t cm_listens_destroyed;
 u32 cm_backlog_drops;
 atomic_t cm_loopbacks;
 atomic_t cm_nodes_created;
@@ -1011,9 +1011,10 @@ static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core,
                                        event.cm_info.loc_port =
                                                         loopback->loc_port;
                                        event.cm_info.cm_id = loopback->cm_id;
+                                       add_ref_cm_node(loopback);
+                                       loopback->state = NES_CM_STATE_CLOSED;
                                        cm_event_connect_error(&event);
                                        cm_node->state = NES_CM_STATE_LISTENER_DESTROYED;
-                                       loopback->state = NES_CM_STATE_CLOSED;
 
                                        rem_ref_cm_node(cm_node->cm_core,
                                                         cm_node);
@@ -1042,7 +1043,7 @@ static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core,
                kfree(listener);
                listener = NULL;
                ret = 0;
-               cm_listens_destroyed++;
+               atomic_inc(&cm_listens_destroyed);
        } else {
                spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
        }
@@ -3172,7 +3173,7 @@ int nes_create_listen(struct iw_cm_id *cm_id, int backlog)
                        g_cm_core->api->stop_listener(g_cm_core, (void *)cm_node);
                        return err;
                }
-               cm_listens_created++;
+               atomic_inc(&cm_listens_created);
        }
 
        cm_id->add_ref(cm_id);
index b1c2cbb..ce7f538 100644 (file)
@@ -748,16 +748,28 @@ static int nes_init_serdes(struct nes_device *nesdev, u8 hw_rev, u8 port_count,
 
        if (hw_rev != NE020_REV) {
                /* init serdes 0 */
-               if (wide_ppm_offset && (nesadapter->phy_type[0] == NES_PHY_TYPE_CX4))
-                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000FFFAA);
-               else
+               switch (nesadapter->phy_type[0]) {
+               case NES_PHY_TYPE_CX4:
+                       if (wide_ppm_offset)
+                               nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000FFFAA);
+                       else
+                               nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF);
+                       break;
+               case NES_PHY_TYPE_KR:
+                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF);
+                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x00000000);
+                       break;
+               case NES_PHY_TYPE_PUMA_1G:
                        nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF);
-
-               if (nesadapter->phy_type[0] == NES_PHY_TYPE_PUMA_1G) {
                        sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0);
                        sds |= 0x00000100;
                        nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, sds);
+                       break;
+               default:
+                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF);
+                       break;
                }
+
                if (!OneG_Mode)
                        nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE0, 0x11110000);
 
@@ -778,6 +790,9 @@ static int nes_init_serdes(struct nes_device *nesdev, u8 hw_rev, u8 port_count,
                        if (wide_ppm_offset)
                                nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000FFFAA);
                        break;
+               case NES_PHY_TYPE_KR:
+                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x00000000);
+                       break;
                case NES_PHY_TYPE_PUMA_1G:
                        sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1);
                        sds |= 0x000000100;
@@ -1279,115 +1294,115 @@ int nes_destroy_cqp(struct nes_device *nesdev)
 
 
 /**
- * nes_init_phy
+ * nes_init_1g_phy
  */
-int nes_init_phy(struct nes_device *nesdev)
+int nes_init_1g_phy(struct nes_device *nesdev, u8 phy_type, u8 phy_index)
 {
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
        u32 counter = 0;
-       u32 sds;
-       u32 mac_index = nesdev->mac_index;
-       u32 tx_config = 0;
        u16 phy_data;
-       u32 temp_phy_data = 0;
-       u32 temp_phy_data2 = 0;
-       u8  phy_type = nesadapter->phy_type[mac_index];
-       u8  phy_index = nesadapter->phy_index[mac_index];
-
-       if ((nesadapter->OneG_Mode) &&
-           (phy_type != NES_PHY_TYPE_PUMA_1G)) {
-               nes_debug(NES_DBG_PHY, "1G PHY, mac_index = %d.\n", mac_index);
-               if (phy_type == NES_PHY_TYPE_1G) {
-                       tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG);
-                       tx_config &= 0xFFFFFFE3;
-                       tx_config |= 0x04;
-                       nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config);
-               }
+       int ret = 0;
 
-               nes_read_1G_phy_reg(nesdev, 1, phy_index, &phy_data);
-               nes_write_1G_phy_reg(nesdev, 23, phy_index, 0xb000);
+       nes_read_1G_phy_reg(nesdev, 1, phy_index, &phy_data);
+       nes_write_1G_phy_reg(nesdev, 23, phy_index, 0xb000);
 
-               /* Reset the PHY */
-               nes_write_1G_phy_reg(nesdev, 0, phy_index, 0x8000);
-               udelay(100);
-               counter = 0;
-               do {
-                       nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
-                       if (counter++ > 100)
-                               break;
-               } while (phy_data & 0x8000);
-
-               /* Setting no phy loopback */
-               phy_data &= 0xbfff;
-               phy_data |= 0x1140;
-               nes_write_1G_phy_reg(nesdev, 0, phy_index,  phy_data);
+       /* Reset the PHY */
+       nes_write_1G_phy_reg(nesdev, 0, phy_index, 0x8000);
+       udelay(100);
+       counter = 0;
+       do {
                nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
-               nes_read_1G_phy_reg(nesdev, 0x17, phy_index, &phy_data);
-               nes_read_1G_phy_reg(nesdev, 0x1e, phy_index, &phy_data);
-
-               /* Setting the interrupt mask */
-               nes_read_1G_phy_reg(nesdev, 0x19, phy_index, &phy_data);
-               nes_write_1G_phy_reg(nesdev, 0x19, phy_index, 0xffee);
-               nes_read_1G_phy_reg(nesdev, 0x19, phy_index, &phy_data);
+               if (counter++ > 100) {
+                       ret = -1;
+                       break;
+               }
+       } while (phy_data & 0x8000);
+
+       /* Setting no phy loopback */
+       phy_data &= 0xbfff;
+       phy_data |= 0x1140;
+       nes_write_1G_phy_reg(nesdev, 0, phy_index,  phy_data);
+       nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
+       nes_read_1G_phy_reg(nesdev, 0x17, phy_index, &phy_data);
+       nes_read_1G_phy_reg(nesdev, 0x1e, phy_index, &phy_data);
+
+       /* Setting the interrupt mask */
+       nes_read_1G_phy_reg(nesdev, 0x19, phy_index, &phy_data);
+       nes_write_1G_phy_reg(nesdev, 0x19, phy_index, 0xffee);
+       nes_read_1G_phy_reg(nesdev, 0x19, phy_index, &phy_data);
+
+       /* turning on flow control */
+       nes_read_1G_phy_reg(nesdev, 4, phy_index, &phy_data);
+       nes_write_1G_phy_reg(nesdev, 4, phy_index, (phy_data & ~(0x03E0)) | 0xc00);
+       nes_read_1G_phy_reg(nesdev, 4, phy_index, &phy_data);
+
+       /* Clear Half duplex */
+       nes_read_1G_phy_reg(nesdev, 9, phy_index, &phy_data);
+       nes_write_1G_phy_reg(nesdev, 9, phy_index, phy_data & ~(0x0100));
+       nes_read_1G_phy_reg(nesdev, 9, phy_index, &phy_data);
+
+       nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
+       nes_write_1G_phy_reg(nesdev, 0, phy_index, phy_data | 0x0300);
+
+       return ret;
+}
 
-               /* turning on flow control */
-               nes_read_1G_phy_reg(nesdev, 4, phy_index, &phy_data);
-               nes_write_1G_phy_reg(nesdev, 4, phy_index, (phy_data & ~(0x03E0)) | 0xc00);
-               nes_read_1G_phy_reg(nesdev, 4, phy_index, &phy_data);
 
-               /* Clear Half duplex */
-               nes_read_1G_phy_reg(nesdev, 9, phy_index, &phy_data);
-               nes_write_1G_phy_reg(nesdev, 9, phy_index, phy_data & ~(0x0100));
-               nes_read_1G_phy_reg(nesdev, 9, phy_index, &phy_data);
+/**
+ * nes_init_2025_phy
+ */
+int nes_init_2025_phy(struct nes_device *nesdev, u8 phy_type, u8 phy_index)
+{
+       u32 temp_phy_data = 0;
+       u32 temp_phy_data2 = 0;
+       u32 counter = 0;
+       u32 sds;
+       u32 mac_index = nesdev->mac_index;
+       int ret = 0;
+       unsigned int first_attempt = 1;
 
-               nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
-               nes_write_1G_phy_reg(nesdev, 0, phy_index, phy_data | 0x0300);
+       /* Check firmware heartbeat */
+       nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
+       temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+       udelay(1500);
+       nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
+       temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
 
-               return 0;
+       if (temp_phy_data != temp_phy_data2) {
+               nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7fd);
+               temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+               if ((temp_phy_data & 0xff) > 0x20)
+                       return 0;
+               printk(PFX "Reinitialize external PHY\n");
        }
 
-       if ((phy_type == NES_PHY_TYPE_IRIS) ||
-           (phy_type == NES_PHY_TYPE_ARGUS) ||
-           (phy_type == NES_PHY_TYPE_SFP_D)) {
-               /* setup 10G MDIO operation */
-               tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG);
-               tx_config &= 0xFFFFFFE3;
-               tx_config |= 0x15;
-               nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config);
-       }
-       if ((phy_type == NES_PHY_TYPE_ARGUS) ||
-           (phy_type == NES_PHY_TYPE_SFP_D)) {
-               u32 first_time = 1;
+       /* no heartbeat, configure the PHY */
+       nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0x0000, 0x8000);
+       nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc300, 0x0000);
+       nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A);
+       nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052);
 
-               /* Check firmware heartbeat */
-               nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
-               temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-               udelay(1500);
-               nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
-               temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+       switch (phy_type) {
+       case NES_PHY_TYPE_ARGUS:
+               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A);
+               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052);
+               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x000C);
+               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0008);
+               nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0001);
+               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0098);
+               nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00);
 
-               if (temp_phy_data != temp_phy_data2) {
-                       nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7fd);
-                       temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-                       if ((temp_phy_data & 0xff) > 0x20)
-                               return 0;
-                       printk(PFX "Reinitializing PHY\n");
-               }
+               /* setup LEDs */
+               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x0007);
+               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x000A);
+               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0009);
+               break;
 
-               /* no heartbeat, configure the PHY */
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0x0000, 0x8000);
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc300, 0x0000);
+       case NES_PHY_TYPE_SFP_D:
                nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A);
                nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052);
-               if (phy_type == NES_PHY_TYPE_ARGUS) {
-                       nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x000C);
-                       nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0008);
-                       nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0001);
-               } else {
-                       nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x0004);
-                       nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0038);
-                       nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0013);
-               }
+               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x0004);
+               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0038);
+               nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0013);
                nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0098);
                nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00);
 
@@ -1395,71 +1410,136 @@ int nes_init_phy(struct nes_device *nesdev)
                nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x0007);
                nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x000A);
                nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0009);
+               break;
+
+       case NES_PHY_TYPE_KR:
+               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A);
+               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052);
+               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x000C);
+               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0010);
+               nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0013);
+               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0080);
+               nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00);
+
+               /* setup LEDs */
+               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x000B);
+               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x0003);
+               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0004);
 
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0028, 0xA528);
+               nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0022, 0x406D);
+               nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0023, 0x0020);
+               break;
+       }
+
+       nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0028, 0xA528);
 
-               /* Bring PHY out of reset */
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc300, 0x0002);
+       /* Bring PHY out of reset */
+       nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc300, 0x0002);
 
-               /* Check for heartbeat */
-               counter = 0;
-               mdelay(690);
+       /* Check for heartbeat */
+       counter = 0;
+       mdelay(690);
+       nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
+       temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+       do {
+               if (counter++ > 150) {
+                       printk(PFX "No PHY heartbeat\n");
+                       break;
+               }
+               mdelay(1);
                nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
+               temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+       } while ((temp_phy_data2 == temp_phy_data));
+
+       /* wait for tracking */
+       counter = 0;
+       do {
+               nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7fd);
                temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-               do {
-                       if (counter++ > 150) {
-                               printk(PFX "No PHY heartbeat\n");
+               if (counter++ > 300) {
+                       if (((temp_phy_data & 0xff) == 0x0) && first_attempt) {
+                               first_attempt = 0;
+                               counter = 0;
+                               /* reset AMCC PHY and try again */
+                               nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0xe854, 0x00c0);
+                               nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0xe854, 0x0040);
+                               continue;
+                       } else {
+                               ret = 1;
                                break;
                        }
-                       mdelay(1);
-                       nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
-                       temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-               } while ((temp_phy_data2 == temp_phy_data));
-
-               /* wait for tracking */
-               counter = 0;
-               do {
-                       nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7fd);
-                       temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-                       if (counter++ > 300) {
-                               if (((temp_phy_data & 0xff) == 0x0) && first_time) {
-                                       first_time = 0;
-                                       counter = 0;
-                                       /* reset AMCC PHY and try again */
-                                       nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0xe854, 0x00c0);
-                                       nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0xe854, 0x0040);
-                                       continue;
-                               } else {
-                                       printk(PFX "PHY did not track\n");
-                                       break;
-                               }
-                       }
-                       mdelay(10);
-               } while ((temp_phy_data & 0xff) < 0x30);
-
-               /* setup signal integrity */
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd003, 0x0000);
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00D, 0x00FE);
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00E, 0x0032);
+               }
+               mdelay(10);
+       } while ((temp_phy_data & 0xff) < 0x30);
+
+       /* setup signal integrity */
+       nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd003, 0x0000);
+       nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00D, 0x00FE);
+       nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00E, 0x0032);
+       if (phy_type == NES_PHY_TYPE_KR) {
+               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00F, 0x000C);
+       } else {
                nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00F, 0x0002);
                nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc314, 0x0063);
+       }
+
+       /* reset serdes */
+       sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200);
+       sds |= 0x1;
+       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200, sds);
+       sds &= 0xfffffffe;
+       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200, sds);
+
+       counter = 0;
+       while (((nes_read32(nesdev->regs + NES_SOFTWARE_RESET) & 0x00000040) != 0x00000040)
+                       && (counter++ < 5000))
+               ;
+
+       return ret;
+}
+
 
-               /* reset serdes */
-               sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 +
-                                      mac_index * 0x200);
-               sds |= 0x1;
-               nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 +
-                                 mac_index * 0x200, sds);
-               sds &= 0xfffffffe;
-               nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 +
-                                 mac_index * 0x200, sds);
-
-               counter = 0;
-               while (((nes_read32(nesdev->regs + NES_SOFTWARE_RESET) & 0x00000040) != 0x00000040)
-                               && (counter++ < 5000))
-                       ;
+/**
+ * nes_init_phy
+ */
+int nes_init_phy(struct nes_device *nesdev)
+{
+       struct nes_adapter *nesadapter = nesdev->nesadapter;
+       u32 mac_index = nesdev->mac_index;
+       u32 tx_config = 0;
+       unsigned long flags;
+       u8  phy_type = nesadapter->phy_type[mac_index];
+       u8  phy_index = nesadapter->phy_index[mac_index];
+       int ret = 0;
+
+       tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG);
+       if (phy_type == NES_PHY_TYPE_1G) {
+               /* setup 1G MDIO operation */
+               tx_config &= 0xFFFFFFE3;
+               tx_config |= 0x04;
+       } else {
+               /* setup 10G MDIO operation */
+               tx_config &= 0xFFFFFFE3;
+               tx_config |= 0x15;
        }
-       return 0;
+       nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config);
+
+       spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags);
+
+       switch (phy_type) {
+       case NES_PHY_TYPE_1G:
+               ret = nes_init_1g_phy(nesdev, phy_type, phy_index);
+               break;
+       case NES_PHY_TYPE_ARGUS:
+       case NES_PHY_TYPE_SFP_D:
+       case NES_PHY_TYPE_KR:
+               ret = nes_init_2025_phy(nesdev, phy_type, phy_index);
+               break;
+       }
+
+       spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags);
+
+       return ret;
 }
 
 
@@ -2460,23 +2540,9 @@ static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number)
                        }
                } else {
                        switch (nesadapter->phy_type[mac_index]) {
-                       case NES_PHY_TYPE_IRIS:
-                               nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 1);
-                               temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-                               u32temp = 20;
-                               do {
-                                       nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 1);
-                                       phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-                                       if ((phy_data == temp_phy_data) || (!(--u32temp)))
-                                               break;
-                                       temp_phy_data = phy_data;
-                               } while (1);
-                               nes_debug(NES_DBG_PHY, "%s: Phy data = 0x%04X, link was %s.\n",
-                                       __func__, phy_data, nesadapter->mac_link_down[mac_index] ? "DOWN" : "UP");
-                               break;
-
                        case NES_PHY_TYPE_ARGUS:
                        case NES_PHY_TYPE_SFP_D:
+                       case NES_PHY_TYPE_KR:
                                /* clear the alarms */
                                nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0x0008);
                                nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc001);
@@ -3352,8 +3418,6 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev,
        u16 async_event_id;
        u8 tcp_state;
        u8 iwarp_state;
-       int must_disconn = 1;
-       int must_terminate = 0;
        struct ib_event ibevent;
 
        nes_debug(NES_DBG_AEQ, "\n");
@@ -3367,6 +3431,8 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev,
                BUG_ON(!context);
        }
 
+       /* context is nesqp unless async_event_id == CQ ERROR */
+       nesqp = (struct nes_qp *)(unsigned long)context;
        async_event_id = (u16)aeq_info;
        tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT;
        iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT;
@@ -3378,8 +3444,6 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev,
 
        switch (async_event_id) {
                case NES_AEQE_AEID_LLP_FIN_RECEIVED:
-                       nesqp = (struct nes_qp *)(unsigned long)context;
-
                        if (nesqp->term_flags)
                                return; /* Ignore it, wait for close complete */
 
@@ -3394,79 +3458,48 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev,
                                                async_event_id, nesqp->last_aeq, tcp_state);
                        }
 
-                       if ((tcp_state != NES_AEQE_TCP_STATE_CLOSE_WAIT) ||
-                                       (nesqp->ibqp_state != IB_QPS_RTS)) {
-                               /* FIN Received but tcp state or IB state moved on,
-                                               should expect a close complete */
-                               return;
-                       }
-
+                       break;
                case NES_AEQE_AEID_LLP_CLOSE_COMPLETE:
-                       nesqp = (struct nes_qp *)(unsigned long)context;
                        if (nesqp->term_flags) {
                                nes_terminate_done(nesqp, 0);
                                return;
                        }
+                       spin_lock_irqsave(&nesqp->lock, flags);
+                       nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING;
+                       spin_unlock_irqrestore(&nesqp->lock, flags);
+                       nes_hw_modify_qp(nesdev, nesqp, NES_CQP_QP_IWARP_STATE_CLOSING, 0, 0);
+                       nes_cm_disconn(nesqp);
+                       break;
 
-               case NES_AEQE_AEID_LLP_CONNECTION_RESET:
                case NES_AEQE_AEID_RESET_SENT:
-                       nesqp = (struct nes_qp *)(unsigned long)context;
-                       if (async_event_id == NES_AEQE_AEID_RESET_SENT) {
-                               tcp_state = NES_AEQE_TCP_STATE_CLOSED;
-                       }
+                       tcp_state = NES_AEQE_TCP_STATE_CLOSED;
                        spin_lock_irqsave(&nesqp->lock, flags);
                        nesqp->hw_iwarp_state = iwarp_state;
                        nesqp->hw_tcp_state = tcp_state;
                        nesqp->last_aeq = async_event_id;
-
-                       if ((tcp_state == NES_AEQE_TCP_STATE_CLOSED) ||
-                                       (tcp_state == NES_AEQE_TCP_STATE_TIME_WAIT)) {
-                               nesqp->hte_added = 0;
-                               next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR | NES_CQP_QP_DEL_HTE;
-                       }
-
-                       if ((nesqp->ibqp_state == IB_QPS_RTS) &&
-                                       ((tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) ||
-                                       (async_event_id == NES_AEQE_AEID_LLP_CONNECTION_RESET))) {
-                               switch (nesqp->hw_iwarp_state) {
-                                       case NES_AEQE_IWARP_STATE_RTS:
-                                               next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING;
-                                               nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING;
-                                               break;
-                                       case NES_AEQE_IWARP_STATE_TERMINATE:
-                                               must_disconn = 0; /* terminate path takes care of disconn */
-                                               if (nesqp->term_flags == 0)
-                                                       must_terminate = 1;
-                                               break;
-                               }
-                       } else {
-                               if (async_event_id ==  NES_AEQE_AEID_LLP_FIN_RECEIVED) {
-                                       /* FIN Received but ib state not RTS,
-                                                       close complete will be on its way */
-                                       must_disconn = 0;
-                               }
-                       }
+                       nesqp->hte_added = 0;
                        spin_unlock_irqrestore(&nesqp->lock, flags);
+                       next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR | NES_CQP_QP_DEL_HTE;
+                       nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0);
+                       nes_cm_disconn(nesqp);
+                       break;
 
-                       if (must_terminate)
-                               nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL);
-                       else if (must_disconn) {
-                               if (next_iwarp_state) {
-                                       nes_debug(NES_DBG_AEQ, "issuing hw modifyqp for QP%u. next state = 0x%08X\n",
-                                                 nesqp->hwqp.qp_id, next_iwarp_state);
-                                       nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0);
-                               }
-                               nes_cm_disconn(nesqp);
-                       }
+               case NES_AEQE_AEID_LLP_CONNECTION_RESET:
+                       if (atomic_read(&nesqp->close_timer_started))
+                               return;
+                       spin_lock_irqsave(&nesqp->lock, flags);
+                       nesqp->hw_iwarp_state = iwarp_state;
+                       nesqp->hw_tcp_state = tcp_state;
+                       nesqp->last_aeq = async_event_id;
+                       spin_unlock_irqrestore(&nesqp->lock, flags);
+                       nes_cm_disconn(nesqp);
                        break;
 
                case NES_AEQE_AEID_TERMINATE_SENT:
-                       nesqp = (struct nes_qp *)(unsigned long)context;
                        nes_terminate_send_fin(nesdev, nesqp, aeqe);
                        break;
 
                case NES_AEQE_AEID_LLP_TERMINATE_RECEIVED:
-                       nesqp = (struct nes_qp *)(unsigned long)context;
                        nes_terminate_received(nesdev, nesqp, aeqe);
                        break;
 
@@ -3480,7 +3513,8 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev,
                case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER:
                case NES_AEQE_AEID_AMP_BOUNDS_VIOLATION:
                case NES_AEQE_AEID_AMP_TO_WRAP:
-                       nesqp = (struct nes_qp *)(unsigned long)context;
+                       printk(KERN_ERR PFX "QP[%u] async_event_id=0x%04X IB_EVENT_QP_ACCESS_ERR\n",
+                                       nesqp->hwqp.qp_id, async_event_id);
                        nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_ACCESS_ERR);
                        break;
 
@@ -3488,7 +3522,6 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev,
                case NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL:
                case NES_AEQE_AEID_DDP_UBE_INVALID_MO:
                case NES_AEQE_AEID_DDP_UBE_INVALID_QN:
-                       nesqp = (struct nes_qp *)(unsigned long)context;
                        if (iwarp_opcode(nesqp, aeq_info) > IWARP_OPCODE_TERM) {
                                aeq_info &= 0xffff0000;
                                aeq_info |= NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE;
@@ -3530,7 +3563,8 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev,
                case NES_AEQE_AEID_STAG_ZERO_INVALID:
                case NES_AEQE_AEID_ROE_INVALID_RDMA_READ_REQUEST:
                case NES_AEQE_AEID_ROE_INVALID_RDMA_WRITE_OR_READ_RESP:
-                       nesqp = (struct nes_qp *)(unsigned long)context;
+                       printk(KERN_ERR PFX "QP[%u] async_event_id=0x%04X IB_EVENT_QP_FATAL\n",
+                                       nesqp->hwqp.qp_id, async_event_id);
                        nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL);
                        break;
 
index 084be0e..9b1e7f8 100644 (file)
 
 #define NES_PHY_TYPE_CX4       1
 #define NES_PHY_TYPE_1G        2
-#define NES_PHY_TYPE_IRIS      3
 #define NES_PHY_TYPE_ARGUS     4
 #define NES_PHY_TYPE_PUMA_1G   5
 #define NES_PHY_TYPE_PUMA_10G  6
 #define NES_PHY_TYPE_GLADIUS   7
 #define NES_PHY_TYPE_SFP_D     8
+#define NES_PHY_TYPE_KR               9
 
 #define NES_MULTICAST_PF_MAX 8
 
index ab11027..7dd6ce6 100644 (file)
@@ -1230,8 +1230,8 @@ static void nes_netdev_get_ethtool_stats(struct net_device *netdev,
        target_stat_values[++index] = cm_packets_received;
        target_stat_values[++index] = cm_packets_dropped;
        target_stat_values[++index] = cm_packets_retrans;
-       target_stat_values[++index] = cm_listens_created;
-       target_stat_values[++index] = cm_listens_destroyed;
+       target_stat_values[++index] = atomic_read(&cm_listens_created);
+       target_stat_values[++index] = atomic_read(&cm_listens_destroyed);
        target_stat_values[++index] = cm_backlog_drops;
        target_stat_values[++index] = atomic_read(&cm_loopbacks);
        target_stat_values[++index] = atomic_read(&cm_nodes_created);
@@ -1461,9 +1461,9 @@ static int nes_netdev_get_settings(struct net_device *netdev, struct ethtool_cmd
                }
                return 0;
        }
-       if ((phy_type == NES_PHY_TYPE_IRIS) ||
-           (phy_type == NES_PHY_TYPE_ARGUS) ||
-           (phy_type == NES_PHY_TYPE_SFP_D)) {
+       if ((phy_type == NES_PHY_TYPE_ARGUS) ||
+           (phy_type == NES_PHY_TYPE_SFP_D) ||
+           (phy_type == NES_PHY_TYPE_KR)) {
                et_cmd->transceiver = XCVR_EXTERNAL;
                et_cmd->port        = PORT_FIBRE;
                et_cmd->supported   = SUPPORTED_FIBRE;
@@ -1583,8 +1583,7 @@ struct net_device *nes_netdev_init(struct nes_device *nesdev,
        struct net_device *netdev;
        struct nic_qp_map *curr_qp_map;
        u32 u32temp;
-       u16 phy_data;
-       u16 temp_phy_data;
+       u8 phy_type = nesdev->nesadapter->phy_type[nesdev->mac_index];
 
        netdev = alloc_etherdev(sizeof(struct nes_vnic));
        if (!netdev) {
@@ -1692,65 +1691,23 @@ struct net_device *nes_netdev_init(struct nes_device *nesdev,
 
        if ((nesdev->netdev_count == 0) &&
            ((PCI_FUNC(nesdev->pcidev->devfn) == nesdev->mac_index) ||
-            ((nesdev->nesadapter->phy_type[nesdev->mac_index] == NES_PHY_TYPE_PUMA_1G) &&
+            ((phy_type == NES_PHY_TYPE_PUMA_1G) &&
              (((PCI_FUNC(nesdev->pcidev->devfn) == 1) && (nesdev->mac_index == 2)) ||
               ((PCI_FUNC(nesdev->pcidev->devfn) == 2) && (nesdev->mac_index == 1)))))) {
-               /*
-                * nes_debug(NES_DBG_INIT, "Setting up PHY interrupt mask. Using register index 0x%04X\n",
-                *              NES_IDX_PHY_PCS_CONTROL_STATUS0 + (0x200 * (nesvnic->logical_port & 1)));
-                */
                u32temp = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 +
                                (0x200 * (nesdev->mac_index & 1)));
-               if (nesdev->nesadapter->phy_type[nesdev->mac_index] != NES_PHY_TYPE_PUMA_1G) {
+               if (phy_type != NES_PHY_TYPE_PUMA_1G) {
                        u32temp |= 0x00200000;
                        nes_write_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 +
                                (0x200 * (nesdev->mac_index & 1)), u32temp);
                }
 
-               u32temp = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 +
-                               (0x200 * (nesdev->mac_index & 1)));
-
-               if ((u32temp&0x0f1f0000) == 0x0f0f0000) {
-                       if (nesdev->nesadapter->phy_type[nesdev->mac_index] == NES_PHY_TYPE_IRIS) {
-                               nes_init_phy(nesdev);
-                               nes_read_10G_phy_reg(nesdev, nesdev->nesadapter->phy_index[nesdev->mac_index], 1, 1);
-                               temp_phy_data = (u16)nes_read_indexed(nesdev,
-                                                                       NES_IDX_MAC_MDIO_CONTROL);
-                               u32temp = 20;
-                               do {
-                                       nes_read_10G_phy_reg(nesdev, nesdev->nesadapter->phy_index[nesdev->mac_index], 1, 1);
-                                       phy_data = (u16)nes_read_indexed(nesdev,
-                                                                       NES_IDX_MAC_MDIO_CONTROL);
-                                       if ((phy_data == temp_phy_data) || (!(--u32temp)))
-                                               break;
-                                       temp_phy_data = phy_data;
-                               } while (1);
-                               if (phy_data & 4) {
-                                       nes_debug(NES_DBG_INIT, "The Link is UP!!.\n");
-                                       nesvnic->linkup = 1;
-                               } else {
-                                       nes_debug(NES_DBG_INIT, "The Link is DOWN!!.\n");
-                               }
-                       } else {
-                               nes_debug(NES_DBG_INIT, "The Link is UP!!.\n");
-                               nesvnic->linkup = 1;
-                       }
-               } else if (nesdev->nesadapter->phy_type[nesdev->mac_index] == NES_PHY_TYPE_PUMA_1G) {
-                       nes_debug(NES_DBG_INIT, "mac_index=%d, logical_port=%d, u32temp=0x%04X, PCI_FUNC=%d\n",
-                               nesdev->mac_index, nesvnic->logical_port, u32temp, PCI_FUNC(nesdev->pcidev->devfn));
-                       if (((nesdev->mac_index < 2) && ((u32temp&0x01010000) == 0x01010000)) ||
-                           ((nesdev->mac_index > 1) && ((u32temp&0x02020000) == 0x02020000)))  {
-                               nes_debug(NES_DBG_INIT, "The Link is UP!!.\n");
-                               nesvnic->linkup = 1;
-                       }
-               }
                /* clear the MAC interrupt status, assumes direct logical to physical mapping */
                u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (0x200 * nesdev->mac_index));
                nes_debug(NES_DBG_INIT, "Phy interrupt status = 0x%X.\n", u32temp);
                nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (0x200 * nesdev->mac_index), u32temp);
 
-               if (nesdev->nesadapter->phy_type[nesdev->mac_index] != NES_PHY_TYPE_IRIS)
-                       nes_init_phy(nesdev);
+               nes_init_phy(nesdev);
 
        }
 
index 64d3136..815725f 100644 (file)
@@ -228,7 +228,7 @@ static int nes_bind_mw(struct ib_qp *ibqp, struct ib_mw *ibmw,
        /* Check for SQ overflow */
        if (((head + (2 * qsize) - nesqp->hwqp.sq_tail) % qsize) == (qsize - 1)) {
                spin_unlock_irqrestore(&nesqp->lock, flags);
-               return -EINVAL;
+               return -ENOMEM;
        }
 
        wqe = &nesqp->hwqp.sq_vbase[head];
@@ -3294,7 +3294,7 @@ static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr,
 
                /* Check for SQ overflow */
                if (((head + (2 * qsize) - nesqp->hwqp.sq_tail) % qsize) == (qsize - 1)) {
-                       err = -EINVAL;
+                       err = -ENOMEM;
                        break;
                }
 
@@ -3577,7 +3577,7 @@ static int nes_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr,
                }
                /* Check for RQ overflow */
                if (((head + (2 * qsize) - nesqp->hwqp.rq_tail) % qsize) == (qsize - 1)) {
-                       err = -EINVAL;
+                       err = -ENOMEM;
                        break;
                }
 
index e9795f6..d10b4ec 100644 (file)
@@ -55,9 +55,7 @@ static int ipoib_get_coalesce(struct net_device *dev,
        struct ipoib_dev_priv *priv = netdev_priv(dev);
 
        coal->rx_coalesce_usecs = priv->ethtool.coalesce_usecs;
-       coal->tx_coalesce_usecs = priv->ethtool.coalesce_usecs;
        coal->rx_max_coalesced_frames = priv->ethtool.max_coalesced_frames;
-       coal->tx_max_coalesced_frames = priv->ethtool.max_coalesced_frames;
 
        return 0;
 }
@@ -69,10 +67,8 @@ static int ipoib_set_coalesce(struct net_device *dev,
        int ret;
 
        /*
-        * Since IPoIB uses a single CQ for both rx and tx, we assume
-        * that rx params dictate the configuration.  These values are
-        * saved in the private data and returned when ipoib_get_coalesce()
-        * is called.
+        * These values are saved in the private data and returned
+        * when ipoib_get_coalesce() is called
         */
        if (coal->rx_coalesce_usecs       > 0xffff ||
            coal->rx_max_coalesced_frames > 0xffff)
@@ -85,8 +81,6 @@ static int ipoib_set_coalesce(struct net_device *dev,
                return ret;
        }
 
-       coal->tx_coalesce_usecs       = coal->rx_coalesce_usecs;
-       coal->tx_max_coalesced_frames = coal->rx_max_coalesced_frames;
        priv->ethtool.coalesce_usecs       = coal->rx_coalesce_usecs;
        priv->ethtool.max_coalesced_frames = coal->rx_max_coalesced_frames;
 
index 5f7a6fc..71237f8 100644 (file)
@@ -128,6 +128,28 @@ static int iscsi_iser_pdu_alloc(struct iscsi_task *task, uint8_t opcode)
        return 0;
 }
 
+int iser_initialize_task_headers(struct iscsi_task *task,
+                                               struct iser_tx_desc *tx_desc)
+{
+       struct iscsi_iser_conn *iser_conn = task->conn->dd_data;
+       struct iser_device     *device    = iser_conn->ib_conn->device;
+       struct iscsi_iser_task *iser_task = task->dd_data;
+       u64 dma_addr;
+
+       dma_addr = ib_dma_map_single(device->ib_device, (void *)tx_desc,
+                               ISER_HEADERS_LEN, DMA_TO_DEVICE);
+       if (ib_dma_mapping_error(device->ib_device, dma_addr))
+               return -ENOMEM;
+
+       tx_desc->dma_addr = dma_addr;
+       tx_desc->tx_sg[0].addr   = tx_desc->dma_addr;
+       tx_desc->tx_sg[0].length = ISER_HEADERS_LEN;
+       tx_desc->tx_sg[0].lkey   = device->mr->lkey;
+
+       iser_task->headers_initialized  = 1;
+       iser_task->iser_conn            = iser_conn;
+       return 0;
+}
 /**
  * iscsi_iser_task_init - Initialize task
  * @task: iscsi task
@@ -137,17 +159,17 @@ static int iscsi_iser_pdu_alloc(struct iscsi_task *task, uint8_t opcode)
 static int
 iscsi_iser_task_init(struct iscsi_task *task)
 {
-       struct iscsi_iser_conn *iser_conn  = task->conn->dd_data;
        struct iscsi_iser_task *iser_task = task->dd_data;
 
+       if (!iser_task->headers_initialized)
+               if (iser_initialize_task_headers(task, &iser_task->desc))
+                       return -ENOMEM;
+
        /* mgmt task */
-       if (!task->sc) {
-               iser_task->desc.data = task->data;
+       if (!task->sc)
                return 0;
-       }
 
        iser_task->command_sent = 0;
-       iser_task->iser_conn    = iser_conn;
        iser_task_rdma_init(iser_task);
        return 0;
 }
@@ -168,7 +190,7 @@ iscsi_iser_mtask_xmit(struct iscsi_conn *conn, struct iscsi_task *task)
 {
        int error = 0;
 
-       iser_dbg("task deq [cid %d itt 0x%x]\n", conn->id, task->itt);
+       iser_dbg("mtask xmit [cid %d itt 0x%x]\n", conn->id, task->itt);
 
        error = iser_send_control(conn, task);
 
@@ -178,9 +200,6 @@ iscsi_iser_mtask_xmit(struct iscsi_conn *conn, struct iscsi_task *task)
         * - if yes, the task is recycled at iscsi_complete_pdu
         * - if no,  the task is recycled at iser_snd_completion
         */
-       if (error && error != -ENOBUFS)
-               iscsi_conn_failure(conn, ISCSI_ERR_CONN_FAILED);
-
        return error;
 }
 
@@ -232,7 +251,7 @@ iscsi_iser_task_xmit(struct iscsi_task *task)
                           task->imm_count, task->unsol_r2t.data_length);
        }
 
-       iser_dbg("task deq [cid %d itt 0x%x]\n",
+       iser_dbg("ctask xmit [cid %d itt 0x%x]\n",
                   conn->id, task->itt);
 
        /* Send the cmd PDU */
@@ -248,8 +267,6 @@ iscsi_iser_task_xmit(struct iscsi_task *task)
                error = iscsi_iser_task_xmit_unsol_data(conn, task);
 
  iscsi_iser_task_xmit_exit:
-       if (error && error != -ENOBUFS)
-               iscsi_conn_failure(conn, ISCSI_ERR_CONN_FAILED);
        return error;
 }
 
@@ -283,7 +300,7 @@ iscsi_iser_conn_create(struct iscsi_cls_session *cls_session, uint32_t conn_idx)
         * due to issues with the login code re iser sematics
         * this not set in iscsi_conn_setup - FIXME
         */
-       conn->max_recv_dlength = 128;
+       conn->max_recv_dlength = ISER_RECV_DATA_SEG_LEN;
 
        iser_conn = conn->dd_data;
        conn->dd_data = iser_conn;
@@ -401,7 +418,7 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep,
        struct Scsi_Host *shost;
        struct iser_conn *ib_conn;
 
-       shost = iscsi_host_alloc(&iscsi_iser_sht, 0, 1);
+       shost = iscsi_host_alloc(&iscsi_iser_sht, 0, 0);
        if (!shost)
                return NULL;
        shost->transportt = iscsi_iser_scsi_transport;
@@ -675,7 +692,7 @@ static int __init iser_init(void)
        memset(&ig, 0, sizeof(struct iser_global));
 
        ig.desc_cache = kmem_cache_create("iser_descriptors",
-                                         sizeof (struct iser_desc),
+                                         sizeof(struct iser_tx_desc),
                                          0, SLAB_HWCACHE_ALIGN,
                                          NULL);
        if (ig.desc_cache == NULL)
index 9d529ca..036934c 100644 (file)
 #define ISER_MAX_TX_MISC_PDUS          6 /* NOOP_OUT(2), TEXT(1),         *
                                           * SCSI_TMFUNC(2), LOGOUT(1) */
 
-#define ISER_QP_MAX_RECV_DTOS          (ISCSI_DEF_XMIT_CMDS_MAX + \
-                                       ISER_MAX_RX_MISC_PDUS    +  \
-                                       ISER_MAX_TX_MISC_PDUS)
+#define ISER_QP_MAX_RECV_DTOS          (ISCSI_DEF_XMIT_CMDS_MAX)
+
+#define ISER_MIN_POSTED_RX             (ISCSI_DEF_XMIT_CMDS_MAX >> 2)
 
 /* the max TX (send) WR supported by the iSER QP is defined by                 *
  * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect   *
@@ -132,6 +132,12 @@ struct iser_hdr {
        __be64  read_va;
 } __attribute__((packed));
 
+/* Constant PDU lengths calculations */
+#define ISER_HEADERS_LEN  (sizeof(struct iser_hdr) + sizeof(struct iscsi_hdr))
+
+#define ISER_RECV_DATA_SEG_LEN 128
+#define ISER_RX_PAYLOAD_SIZE   (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN)
+#define ISER_RX_LOGIN_SIZE     (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN)
 
 /* Length of an object name string */
 #define ISER_OBJECT_NAME_SIZE              64
@@ -187,51 +193,43 @@ struct iser_regd_buf {
        struct iser_mem_reg     reg;        /* memory registration info        */
        void                    *virt_addr;
        struct iser_device      *device;    /* device->device for dma_unmap    */
-       u64                     dma_addr;   /* if non zero, addr for dma_unmap */
        enum dma_data_direction direction;  /* direction for dma_unmap         */
        unsigned int            data_size;
-       atomic_t                ref_count;  /* refcount, freed when dec to 0   */
-};
-
-#define MAX_REGD_BUF_VECTOR_LEN        2
-
-struct iser_dto {
-       struct iscsi_iser_task *task;
-       struct iser_conn *ib_conn;
-       int                        notify_enable;
-
-       /* vector of registered buffers */
-       unsigned int               regd_vector_len;
-       struct iser_regd_buf       *regd[MAX_REGD_BUF_VECTOR_LEN];
-
-       /* offset into the registered buffer may be specified */
-       unsigned int               offset[MAX_REGD_BUF_VECTOR_LEN];
-
-       /* a smaller size may be specified, if 0, then full size is used */
-       unsigned int               used_sz[MAX_REGD_BUF_VECTOR_LEN];
 };
 
 enum iser_desc_type {
-       ISCSI_RX,
        ISCSI_TX_CONTROL ,
        ISCSI_TX_SCSI_COMMAND,
        ISCSI_TX_DATAOUT
 };
 
-struct iser_desc {
+struct iser_tx_desc {
        struct iser_hdr              iser_header;
        struct iscsi_hdr             iscsi_header;
-       struct iser_regd_buf         hdr_regd_buf;
-       void                         *data;         /* used by RX & TX_CONTROL */
-       struct iser_regd_buf         data_regd_buf; /* used by RX & TX_CONTROL */
        enum   iser_desc_type        type;
-       struct iser_dto              dto;
+       u64                          dma_addr;
+       /* sg[0] points to iser/iscsi headers, sg[1] optionally points to either
+       of immediate data, unsolicited data-out or control (login,text) */
+       struct ib_sge                tx_sg[2];
+       int                          num_sge;
 };
 
+#define ISER_RX_PAD_SIZE       (256 - (ISER_RX_PAYLOAD_SIZE + \
+                                       sizeof(u64) + sizeof(struct ib_sge)))
+struct iser_rx_desc {
+       struct iser_hdr              iser_header;
+       struct iscsi_hdr             iscsi_header;
+       char                         data[ISER_RECV_DATA_SEG_LEN];
+       u64                          dma_addr;
+       struct ib_sge                rx_sg;
+       char                         pad[ISER_RX_PAD_SIZE];
+} __attribute__((packed));
+
 struct iser_device {
        struct ib_device             *ib_device;
        struct ib_pd                 *pd;
-       struct ib_cq                 *cq;
+       struct ib_cq                 *rx_cq;
+       struct ib_cq                 *tx_cq;
        struct ib_mr                 *mr;
        struct tasklet_struct        cq_tasklet;
        struct list_head             ig_list; /* entry in ig devices list */
@@ -250,15 +248,18 @@ struct iser_conn {
        struct ib_fmr_pool           *fmr_pool;     /* pool of IB FMRs         */
        int                          disc_evt_flag; /* disconn event delivered */
        wait_queue_head_t            wait;          /* waitq for conn/disconn  */
-       atomic_t                     post_recv_buf_count; /* posted rx count   */
+       int                          post_recv_buf_count; /* posted rx count  */
        atomic_t                     post_send_buf_count; /* posted tx count   */
-       atomic_t                     unexpected_pdu_count;/* count of received *
-                                                          * unexpected pdus   *
-                                                          * not yet retired   */
        char                         name[ISER_OBJECT_NAME_SIZE];
        struct iser_page_vec         *page_vec;     /* represents SG to fmr maps*
                                                     * maps serialized as tx is*/
        struct list_head             conn_list;       /* entry in ig conn list */
+
+       char                         *login_buf;
+       u64                          login_dma;
+       unsigned int                 rx_desc_head;
+       struct iser_rx_desc          *rx_descs;
+       struct ib_recv_wr            rx_wr[ISER_MIN_POSTED_RX];
 };
 
 struct iscsi_iser_conn {
@@ -267,7 +268,7 @@ struct iscsi_iser_conn {
 };
 
 struct iscsi_iser_task {
-       struct iser_desc             desc;
+       struct iser_tx_desc          desc;
        struct iscsi_iser_conn       *iser_conn;
        enum iser_task_status        status;
        int                          command_sent;  /* set if command  sent  */
@@ -275,6 +276,7 @@ struct iscsi_iser_task {
        struct iser_regd_buf         rdma_regd[ISER_DIRS_NUM];/* regd rdma buf */
        struct iser_data_buf         data[ISER_DIRS_NUM];     /* orig. data des*/
        struct iser_data_buf         data_copy[ISER_DIRS_NUM];/* contig. copy  */
+       int                          headers_initialized;
 };
 
 struct iser_page_vec {
@@ -322,22 +324,17 @@ void iser_conn_put(struct iser_conn *ib_conn);
 
 void iser_conn_terminate(struct iser_conn *ib_conn);
 
-void iser_rcv_completion(struct iser_desc *desc,
-                        unsigned long    dto_xfer_len);
+void iser_rcv_completion(struct iser_rx_desc *desc,
+                        unsigned long    dto_xfer_len,
+                       struct iser_conn *ib_conn);
 
-void iser_snd_completion(struct iser_desc *desc);
+void iser_snd_completion(struct iser_tx_desc *desc, struct iser_conn *ib_conn);
 
 void iser_task_rdma_init(struct iscsi_iser_task *task);
 
 void iser_task_rdma_finalize(struct iscsi_iser_task *task);
 
-void iser_dto_buffs_release(struct iser_dto *dto);
-
-int  iser_regd_buff_release(struct iser_regd_buf *regd_buf);
-
-void iser_reg_single(struct iser_device      *device,
-                    struct iser_regd_buf    *regd_buf,
-                    enum dma_data_direction direction);
+void iser_free_rx_descriptors(struct iser_conn *ib_conn);
 
 void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *task,
                                     enum iser_data_dir         cmd_dir);
@@ -356,11 +353,9 @@ int  iser_reg_page_vec(struct iser_conn     *ib_conn,
 
 void iser_unreg_mem(struct iser_mem_reg *mem_reg);
 
-int  iser_post_recv(struct iser_desc *rx_desc);
-int  iser_post_send(struct iser_desc *tx_desc);
-
-int iser_conn_state_comp(struct iser_conn *ib_conn,
-                        enum iser_ib_conn_state comp);
+int  iser_post_recvl(struct iser_conn *ib_conn);
+int  iser_post_recvm(struct iser_conn *ib_conn, int count);
+int  iser_post_send(struct iser_conn *ib_conn, struct iser_tx_desc *tx_desc);
 
 int iser_dma_map_task_data(struct iscsi_iser_task *iser_task,
                            struct iser_data_buf       *data,
@@ -368,4 +363,6 @@ int iser_dma_map_task_data(struct iscsi_iser_task *iser_task,
                            enum   dma_data_direction  dma_dir);
 
 void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task);
+int  iser_initialize_task_headers(struct iscsi_task *task,
+                       struct iser_tx_desc *tx_desc);
 #endif
index 9de6402..0b9ef07 100644 (file)
 
 #include "iscsi_iser.h"
 
-/* Constant PDU lengths calculations */
-#define ISER_TOTAL_HEADERS_LEN  (sizeof (struct iser_hdr) + \
-                                sizeof (struct iscsi_hdr))
-
-/* iser_dto_add_regd_buff - increments the reference count for *
- * the registered buffer & adds it to the DTO object           */
-static void iser_dto_add_regd_buff(struct iser_dto *dto,
-                                  struct iser_regd_buf *regd_buf,
-                                  unsigned long use_offset,
-                                  unsigned long use_size)
-{
-       int add_idx;
-
-       atomic_inc(&regd_buf->ref_count);
-
-       add_idx = dto->regd_vector_len;
-       dto->regd[add_idx] = regd_buf;
-       dto->used_sz[add_idx] = use_size;
-       dto->offset[add_idx] = use_offset;
-
-       dto->regd_vector_len++;
-}
-
 /* Register user buffer memory and initialize passive rdma
  *  dto descriptor. Total data size is stored in
  *  iser_task->data[ISER_DIR_IN].data_len
@@ -122,9 +99,9 @@ iser_prepare_write_cmd(struct iscsi_task *task,
        struct iscsi_iser_task *iser_task = task->dd_data;
        struct iser_regd_buf *regd_buf;
        int err;
-       struct iser_dto *send_dto = &iser_task->desc.dto;
        struct iser_hdr *hdr = &iser_task->desc.iser_header;
        struct iser_data_buf *buf_out = &iser_task->data[ISER_DIR_OUT];
+       struct ib_sge *tx_dsg = &iser_task->desc.tx_sg[1];
 
        err = iser_dma_map_task_data(iser_task,
                                     buf_out,
@@ -163,135 +140,100 @@ iser_prepare_write_cmd(struct iscsi_task *task,
        if (imm_sz > 0) {
                iser_dbg("Cmd itt:%d, WRITE, adding imm.data sz: %d\n",
                         task->itt, imm_sz);
-               iser_dto_add_regd_buff(send_dto,
-                                      regd_buf,
-                                      0,
-                                      imm_sz);
+               tx_dsg->addr   = regd_buf->reg.va;
+               tx_dsg->length = imm_sz;
+               tx_dsg->lkey   = regd_buf->reg.lkey;
+               iser_task->desc.num_sge = 2;
        }
 
        return 0;
 }
 
-/**
- * iser_post_receive_control - allocates, initializes and posts receive DTO.
- */
-static int iser_post_receive_control(struct iscsi_conn *conn)
+/* prepares a tx descriptor for a new send: resets iser header and sge count */
+static void iser_create_send_desc(struct iser_conn     *ib_conn,
+                                 struct iser_tx_desc   *tx_desc)
 {
-       struct iscsi_iser_conn *iser_conn = conn->dd_data;
-       struct iser_desc     *rx_desc;
-       struct iser_regd_buf *regd_hdr;
-       struct iser_regd_buf *regd_data;
-       struct iser_dto      *recv_dto = NULL;
-       struct iser_device  *device = iser_conn->ib_conn->device;
-       int rx_data_size, err;
-       int posts, outstanding_unexp_pdus;
-
-       /* for the login sequence we must support rx of upto 8K; login is done
-        * after conn create/bind (connect) and conn stop/bind (reconnect),
-        * what's common for both schemes is that the connection is not started
-        */
-       if (conn->c_stage != ISCSI_CONN_STARTED)
-               rx_data_size = ISCSI_DEF_MAX_RECV_SEG_LEN;
-       else /* FIXME till user space sets conn->max_recv_dlength correctly */
-               rx_data_size = 128;
-
-       outstanding_unexp_pdus =
-               atomic_xchg(&iser_conn->ib_conn->unexpected_pdu_count, 0);
-
-       /*
-        * in addition to the response buffer, replace those consumed by
-        * unexpected pdus.
-        */
-       for (posts = 0; posts < 1 + outstanding_unexp_pdus; posts++) {
-               rx_desc = kmem_cache_alloc(ig.desc_cache, GFP_NOIO);
-               if (rx_desc == NULL) {
-                       iser_err("Failed to alloc desc for post recv %d\n",
-                                posts);
-                       err = -ENOMEM;
-                       goto post_rx_cache_alloc_failure;
-               }
-               rx_desc->type = ISCSI_RX;
-               rx_desc->data = kmalloc(rx_data_size, GFP_NOIO);
-               if (rx_desc->data == NULL) {
-                       iser_err("Failed to alloc data buf for post recv %d\n",
-                                posts);
-                       err = -ENOMEM;
-                       goto post_rx_kmalloc_failure;
-               }
-
-               recv_dto = &rx_desc->dto;
-               recv_dto->ib_conn = iser_conn->ib_conn;
-               recv_dto->regd_vector_len = 0;
+       struct iser_device *device = ib_conn->device;
 
-               regd_hdr = &rx_desc->hdr_regd_buf;
-               memset(regd_hdr, 0, sizeof(struct iser_regd_buf));
-               regd_hdr->device  = device;
-               regd_hdr->virt_addr  = rx_desc; /* == &rx_desc->iser_header */
-               regd_hdr->data_size  = ISER_TOTAL_HEADERS_LEN;
+       ib_dma_sync_single_for_cpu(device->ib_device,
+               tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE);
 
-               iser_reg_single(device, regd_hdr, DMA_FROM_DEVICE);
-
-               iser_dto_add_regd_buff(recv_dto, regd_hdr, 0, 0);
+       memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr));
+       tx_desc->iser_header.flags = ISER_VER;
 
-               regd_data = &rx_desc->data_regd_buf;
-               memset(regd_data, 0, sizeof(struct iser_regd_buf));
-               regd_data->device  = device;
-               regd_data->virt_addr  = rx_desc->data;
-               regd_data->data_size  = rx_data_size;
+       tx_desc->num_sge = 1;
 
-               iser_reg_single(device, regd_data, DMA_FROM_DEVICE);
+       if (tx_desc->tx_sg[0].lkey != device->mr->lkey) {
+               tx_desc->tx_sg[0].lkey = device->mr->lkey;
+               iser_dbg("sdesc %p lkey mismatch, fixing\n", tx_desc);
+       }
+}
 
-               iser_dto_add_regd_buff(recv_dto, regd_data, 0, 0);
 
-               err = iser_post_recv(rx_desc);
-               if (err) {
-                       iser_err("Failed iser_post_recv for post %d\n", posts);
-                       goto post_rx_post_recv_failure;
-               }
+int iser_alloc_rx_descriptors(struct iser_conn *ib_conn)
+{
+       int i, j;
+       u64 dma_addr;
+       struct iser_rx_desc *rx_desc;
+       struct ib_sge       *rx_sg;
+       struct iser_device  *device = ib_conn->device;
+
+       ib_conn->rx_descs = kmalloc(ISER_QP_MAX_RECV_DTOS *
+                               sizeof(struct iser_rx_desc), GFP_KERNEL);
+       if (!ib_conn->rx_descs)
+               goto rx_desc_alloc_fail;
+
+       rx_desc = ib_conn->rx_descs;
+
+       for (i = 0; i < ISER_QP_MAX_RECV_DTOS; i++, rx_desc++)  {
+               dma_addr = ib_dma_map_single(device->ib_device, (void *)rx_desc,
+                                       ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+               if (ib_dma_mapping_error(device->ib_device, dma_addr))
+                       goto rx_desc_dma_map_failed;
+
+               rx_desc->dma_addr = dma_addr;
+
+               rx_sg = &rx_desc->rx_sg;
+               rx_sg->addr   = rx_desc->dma_addr;
+               rx_sg->length = ISER_RX_PAYLOAD_SIZE;
+               rx_sg->lkey   = device->mr->lkey;
        }
-       /* all posts successful */
-       return 0;
 
-post_rx_post_recv_failure:
-       iser_dto_buffs_release(recv_dto);
-       kfree(rx_desc->data);
-post_rx_kmalloc_failure:
-       kmem_cache_free(ig.desc_cache, rx_desc);
-post_rx_cache_alloc_failure:
-       if (posts > 0) {
-               /*
-                * response buffer posted, but did not replace all unexpected
-                * pdu recv bufs. Ignore error, retry occurs next send
-                */
-               outstanding_unexp_pdus -= (posts - 1);
-               err = 0;
-       }
-       atomic_add(outstanding_unexp_pdus,
-                  &iser_conn->ib_conn->unexpected_pdu_count);
+       ib_conn->rx_desc_head = 0;
+       return 0;
 
-       return err;
+rx_desc_dma_map_failed:
+       rx_desc = ib_conn->rx_descs;
+       for (j = 0; j < i; j++, rx_desc++)
+               ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr,
+                       ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+       kfree(ib_conn->rx_descs);
+       ib_conn->rx_descs = NULL;
+rx_desc_alloc_fail:
+       iser_err("failed allocating rx descriptors / data buffers\n");
+       return -ENOMEM;
 }
 
-/* creates a new tx descriptor and adds header regd buffer */
-static void iser_create_send_desc(struct iscsi_iser_conn *iser_conn,
-                                 struct iser_desc       *tx_desc)
+void iser_free_rx_descriptors(struct iser_conn *ib_conn)
 {
-       struct iser_regd_buf *regd_hdr = &tx_desc->hdr_regd_buf;
-       struct iser_dto      *send_dto = &tx_desc->dto;
+       int i;
+       struct iser_rx_desc *rx_desc;
+       struct iser_device *device = ib_conn->device;
 
-       memset(regd_hdr, 0, sizeof(struct iser_regd_buf));
-       regd_hdr->device  = iser_conn->ib_conn->device;
-       regd_hdr->virt_addr  = tx_desc; /* == &tx_desc->iser_header */
-       regd_hdr->data_size  = ISER_TOTAL_HEADERS_LEN;
+       if (ib_conn->login_buf) {
+               ib_dma_unmap_single(device->ib_device, ib_conn->login_dma,
+                       ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE);
+               kfree(ib_conn->login_buf);
+       }
 
-       send_dto->ib_conn         = iser_conn->ib_conn;
-       send_dto->notify_enable   = 1;
-       send_dto->regd_vector_len = 0;
+       if (!ib_conn->rx_descs)
+               return;
 
-       memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr));
-       tx_desc->iser_header.flags = ISER_VER;
-
-       iser_dto_add_regd_buff(send_dto, regd_hdr, 0, 0);
+       rx_desc = ib_conn->rx_descs;
+       for (i = 0; i < ISER_QP_MAX_RECV_DTOS; i++, rx_desc++)
+               ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr,
+                       ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+       kfree(ib_conn->rx_descs);
 }
 
 /**
@@ -301,46 +243,23 @@ int iser_conn_set_full_featured_mode(struct iscsi_conn *conn)
 {
        struct iscsi_iser_conn *iser_conn = conn->dd_data;
 
-       int i;
-       /*
-        * FIXME this value should be declared to the target during login with
-        * the MaxOutstandingUnexpectedPDUs key when supported
-        */
-       int initial_post_recv_bufs_num = ISER_MAX_RX_MISC_PDUS;
-
-       iser_dbg("Initially post: %d\n", initial_post_recv_bufs_num);
+       iser_dbg("Initially post: %d\n", ISER_MIN_POSTED_RX);
 
        /* Check that there is no posted recv or send buffers left - */
        /* they must be consumed during the login phase */
-       BUG_ON(atomic_read(&iser_conn->ib_conn->post_recv_buf_count) != 0);
+       BUG_ON(iser_conn->ib_conn->post_recv_buf_count != 0);
        BUG_ON(atomic_read(&iser_conn->ib_conn->post_send_buf_count) != 0);
 
-       /* Initial post receive buffers */
-       for (i = 0; i < initial_post_recv_bufs_num; i++) {
-               if (iser_post_receive_control(conn) != 0) {
-                       iser_err("Failed to post recv bufs at:%d conn:0x%p\n",
-                                i, conn);
-                       return -ENOMEM;
-               }
-       }
-       iser_dbg("Posted %d post recv bufs, conn:0x%p\n", i, conn);
-       return 0;
-}
+       if (iser_alloc_rx_descriptors(iser_conn->ib_conn))
+               return -ENOMEM;
 
-static int
-iser_check_xmit(struct iscsi_conn *conn, void *task)
-{
-       struct iscsi_iser_conn *iser_conn = conn->dd_data;
+       /* Initial post receive buffers */
+       if (iser_post_recvm(iser_conn->ib_conn, ISER_MIN_POSTED_RX))
+               return -ENOMEM;
 
-       if (atomic_read(&iser_conn->ib_conn->post_send_buf_count) ==
-           ISER_QP_MAX_REQ_DTOS) {
-               iser_dbg("%ld can't xmit task %p\n",jiffies,task);
-               return -ENOBUFS;
-       }
        return 0;
 }
 
-
 /**
  * iser_send_command - send command PDU
  */
@@ -349,27 +268,18 @@ int iser_send_command(struct iscsi_conn *conn,
 {
        struct iscsi_iser_conn *iser_conn = conn->dd_data;
        struct iscsi_iser_task *iser_task = task->dd_data;
-       struct iser_dto *send_dto = NULL;
        unsigned long edtl;
-       int err = 0;
+       int err;
        struct iser_data_buf *data_buf;
        struct iscsi_cmd *hdr =  (struct iscsi_cmd *)task->hdr;
        struct scsi_cmnd *sc  =  task->sc;
-
-       if (!iser_conn_state_comp(iser_conn->ib_conn, ISER_CONN_UP)) {
-               iser_err("Failed to send, conn: 0x%p is not up\n", iser_conn->ib_conn);
-               return -EPERM;
-       }
-       if (iser_check_xmit(conn, task))
-               return -ENOBUFS;
+       struct iser_tx_desc *tx_desc = &iser_task->desc;
 
        edtl = ntohl(hdr->data_length);
 
        /* build the tx desc regd header and add it to the tx desc dto */
-       iser_task->desc.type = ISCSI_TX_SCSI_COMMAND;
-       send_dto = &iser_task->desc.dto;
-       send_dto->task = iser_task;
-       iser_create_send_desc(iser_conn, &iser_task->desc);
+       tx_desc->type = ISCSI_TX_SCSI_COMMAND;
+       iser_create_send_desc(iser_conn->ib_conn, tx_desc);
 
        if (hdr->flags & ISCSI_FLAG_CMD_READ)
                data_buf = &iser_task->data[ISER_DIR_IN];
@@ -398,23 +308,13 @@ int iser_send_command(struct iscsi_conn *conn,
                        goto send_command_error;
        }
 
-       iser_reg_single(iser_conn->ib_conn->device,
-                       send_dto->regd[0], DMA_TO_DEVICE);
-
-       if (iser_post_receive_control(conn) != 0) {
-               iser_err("post_recv failed!\n");
-               err = -ENOMEM;
-               goto send_command_error;
-       }
-
        iser_task->status = ISER_TASK_STATUS_STARTED;
 
-       err = iser_post_send(&iser_task->desc);
+       err = iser_post_send(iser_conn->ib_conn, tx_desc);
        if (!err)
                return 0;
 
 send_command_error:
-       iser_dto_buffs_release(send_dto);
        iser_err("conn %p failed task->itt %d err %d\n",conn, task->itt, err);
        return err;
 }
@@ -428,20 +328,13 @@ int iser_send_data_out(struct iscsi_conn *conn,
 {
        struct iscsi_iser_conn *iser_conn = conn->dd_data;
        struct iscsi_iser_task *iser_task = task->dd_data;
-       struct iser_desc *tx_desc = NULL;
-       struct iser_dto *send_dto = NULL;
+       struct iser_tx_desc *tx_desc = NULL;
+       struct iser_regd_buf *regd_buf;
        unsigned long buf_offset;
        unsigned long data_seg_len;
        uint32_t itt;
        int err = 0;
-
-       if (!iser_conn_state_comp(iser_conn->ib_conn, ISER_CONN_UP)) {
-               iser_err("Failed to send, conn: 0x%p is not up\n", iser_conn->ib_conn);
-               return -EPERM;
-       }
-
-       if (iser_check_xmit(conn, task))
-               return -ENOBUFS;
+       struct ib_sge *tx_dsg;
 
        itt = (__force uint32_t)hdr->itt;
        data_seg_len = ntoh24(hdr->dlength);
@@ -450,28 +343,25 @@ int iser_send_data_out(struct iscsi_conn *conn,
        iser_dbg("%s itt %d dseg_len %d offset %d\n",
                 __func__,(int)itt,(int)data_seg_len,(int)buf_offset);
 
-       tx_desc = kmem_cache_alloc(ig.desc_cache, GFP_NOIO);
+       tx_desc = kmem_cache_zalloc(ig.desc_cache, GFP_ATOMIC);
        if (tx_desc == NULL) {
                iser_err("Failed to alloc desc for post dataout\n");
                return -ENOMEM;
        }
 
        tx_desc->type = ISCSI_TX_DATAOUT;
+       tx_desc->iser_header.flags = ISER_VER;
        memcpy(&tx_desc->iscsi_header, hdr, sizeof(struct iscsi_hdr));
 
-       /* build the tx desc regd header and add it to the tx desc dto */
-       send_dto = &tx_desc->dto;
-       send_dto->task = iser_task;
-       iser_create_send_desc(iser_conn, tx_desc);
-
-       iser_reg_single(iser_conn->ib_conn->device,
-                       send_dto->regd[0], DMA_TO_DEVICE);
+       /* build the tx desc */
+       iser_initialize_task_headers(task, tx_desc);
 
-       /* all data was registered for RDMA, we can use the lkey */
-       iser_dto_add_regd_buff(send_dto,
-                              &iser_task->rdma_regd[ISER_DIR_OUT],
-                              buf_offset,
-                              data_seg_len);
+       regd_buf = &iser_task->rdma_regd[ISER_DIR_OUT];
+       tx_dsg = &tx_desc->tx_sg[1];
+       tx_dsg->addr    = regd_buf->reg.va + buf_offset;
+       tx_dsg->length  = data_seg_len;
+       tx_dsg->lkey    = regd_buf->reg.lkey;
+       tx_desc->num_sge = 2;
 
        if (buf_offset + data_seg_len > iser_task->data[ISER_DIR_OUT].data_len) {
                iser_err("Offset:%ld & DSL:%ld in Data-Out "
@@ -485,12 +375,11 @@ int iser_send_data_out(struct iscsi_conn *conn,
                 itt, buf_offset, data_seg_len);
 
 
-       err = iser_post_send(tx_desc);
+       err = iser_post_send(iser_conn->ib_conn, tx_desc);
        if (!err)
                return 0;
 
 send_data_out_error:
-       iser_dto_buffs_release(send_dto);
        kmem_cache_free(ig.desc_cache, tx_desc);
        iser_err("conn %p failed err %d\n",conn, err);
        return err;
@@ -501,64 +390,44 @@ int iser_send_control(struct iscsi_conn *conn,
 {
        struct iscsi_iser_conn *iser_conn = conn->dd_data;
        struct iscsi_iser_task *iser_task = task->dd_data;
-       struct iser_desc *mdesc = &iser_task->desc;
-       struct iser_dto *send_dto = NULL;
+       struct iser_tx_desc *mdesc = &iser_task->desc;
        unsigned long data_seg_len;
        int err = 0;
-       struct iser_regd_buf *regd_buf;
        struct iser_device *device;
-       unsigned char opcode;
-
-       if (!iser_conn_state_comp(iser_conn->ib_conn, ISER_CONN_UP)) {
-               iser_err("Failed to send, conn: 0x%p is not up\n", iser_conn->ib_conn);
-               return -EPERM;
-       }
-
-       if (iser_check_xmit(conn, task))
-               return -ENOBUFS;
 
        /* build the tx desc regd header and add it to the tx desc dto */
        mdesc->type = ISCSI_TX_CONTROL;
-       send_dto = &mdesc->dto;
-       send_dto->task = NULL;
-       iser_create_send_desc(iser_conn, mdesc);
+       iser_create_send_desc(iser_conn->ib_conn, mdesc);
 
        device = iser_conn->ib_conn->device;
 
-       iser_reg_single(device, send_dto->regd[0], DMA_TO_DEVICE);
-
        data_seg_len = ntoh24(task->hdr->dlength);
 
        if (data_seg_len > 0) {
-               regd_buf = &mdesc->data_regd_buf;
-               memset(regd_buf, 0, sizeof(struct iser_regd_buf));
-               regd_buf->device = device;
-               regd_buf->virt_addr = task->data;
-               regd_buf->data_size = task->data_count;
-               iser_reg_single(device, regd_buf,
-                               DMA_TO_DEVICE);
-               iser_dto_add_regd_buff(send_dto, regd_buf,
-                                      0,
-                                      data_seg_len);
+               struct ib_sge *tx_dsg = &mdesc->tx_sg[1];
+               if (task != conn->login_task) {
+                       iser_err("data present on non login task!!!\n");
+                       goto send_control_error;
+               }
+               memcpy(iser_conn->ib_conn->login_buf, task->data,
+                                                       task->data_count);
+               tx_dsg->addr    = iser_conn->ib_conn->login_dma;
+               tx_dsg->length  = data_seg_len;
+               tx_dsg->lkey    = device->mr->lkey;
+               mdesc->num_sge = 2;
        }
 
-       opcode = task->hdr->opcode & ISCSI_OPCODE_MASK;
-
-       /* post recv buffer for response if one is expected */
-       if (!(opcode == ISCSI_OP_NOOP_OUT && task->hdr->itt == RESERVED_ITT)) {
-               if (iser_post_receive_control(conn) != 0) {
-                       iser_err("post_rcv_buff failed!\n");
-                       err = -ENOMEM;
+       if (task == conn->login_task) {
+               err = iser_post_recvl(iser_conn->ib_conn);
+               if (err)
                        goto send_control_error;
-               }
        }
 
-       err = iser_post_send(mdesc);
+       err = iser_post_send(iser_conn->ib_conn, mdesc);
        if (!err)
                return 0;
 
 send_control_error:
-       iser_dto_buffs_release(send_dto);
        iser_err("conn %p failed err %d\n",conn, err);
        return err;
 }
@@ -566,104 +435,71 @@ send_control_error:
 /**
  * iser_rcv_dto_completion - recv DTO completion
  */
-void iser_rcv_completion(struct iser_desc *rx_desc,
-                        unsigned long dto_xfer_len)
+void iser_rcv_completion(struct iser_rx_desc *rx_desc,
+                        unsigned long rx_xfer_len,
+                        struct iser_conn *ib_conn)
 {
-       struct iser_dto *dto = &rx_desc->dto;
-       struct iscsi_iser_conn *conn = dto->ib_conn->iser_conn;
-       struct iscsi_task *task;
-       struct iscsi_iser_task *iser_task;
+       struct iscsi_iser_conn *conn = ib_conn->iser_conn;
        struct iscsi_hdr *hdr;
-       char   *rx_data = NULL;
-       int     rx_data_len = 0;
-       unsigned char opcode;
-
-       hdr = &rx_desc->iscsi_header;
+       u64 rx_dma;
+       int rx_buflen, outstanding, count, err;
+
+       /* differentiate between login and all other PDUs */
+       if ((char *)rx_desc == ib_conn->login_buf) {
+               rx_dma = ib_conn->login_dma;
+               rx_buflen = ISER_RX_LOGIN_SIZE;
+       } else {
+               rx_dma = rx_desc->dma_addr;
+               rx_buflen = ISER_RX_PAYLOAD_SIZE;
+       }
 
-       iser_dbg("op 0x%x itt 0x%x\n", hdr->opcode,hdr->itt);
+       ib_dma_sync_single_for_cpu(ib_conn->device->ib_device, rx_dma,
+                       rx_buflen, DMA_FROM_DEVICE);
 
-       if (dto_xfer_len > ISER_TOTAL_HEADERS_LEN) { /* we have data */
-               rx_data_len = dto_xfer_len - ISER_TOTAL_HEADERS_LEN;
-               rx_data     = dto->regd[1]->virt_addr;
-               rx_data    += dto->offset[1];
-       }
+       hdr = &rx_desc->iscsi_header;
 
-       opcode = hdr->opcode & ISCSI_OPCODE_MASK;
-
-       if (opcode == ISCSI_OP_SCSI_CMD_RSP) {
-               spin_lock(&conn->iscsi_conn->session->lock);
-               task = iscsi_itt_to_ctask(conn->iscsi_conn, hdr->itt);
-               if (task)
-                       __iscsi_get_task(task);
-               spin_unlock(&conn->iscsi_conn->session->lock);
-
-               if (!task)
-                       iser_err("itt can't be matched to task!!! "
-                                "conn %p opcode %d itt %d\n",
-                                conn->iscsi_conn, opcode, hdr->itt);
-               else {
-                       iser_task = task->dd_data;
-                       iser_dbg("itt %d task %p\n",hdr->itt, task);
-                       iser_task->status = ISER_TASK_STATUS_COMPLETED;
-                       iser_task_rdma_finalize(iser_task);
-                       iscsi_put_task(task);
-               }
-       }
-       iser_dto_buffs_release(dto);
+       iser_dbg("op 0x%x itt 0x%x dlen %d\n", hdr->opcode,
+                       hdr->itt, (int)(rx_xfer_len - ISER_HEADERS_LEN));
 
-       iscsi_iser_recv(conn->iscsi_conn, hdr, rx_data, rx_data_len);
+       iscsi_iser_recv(conn->iscsi_conn, hdr,
+               rx_desc->data, rx_xfer_len - ISER_HEADERS_LEN);
 
-       kfree(rx_desc->data);
-       kmem_cache_free(ig.desc_cache, rx_desc);
+       ib_dma_sync_single_for_device(ib_conn->device->ib_device, rx_dma,
+                       rx_buflen, DMA_FROM_DEVICE);
 
        /* decrementing conn->post_recv_buf_count only --after-- freeing the   *
         * task eliminates the need to worry on tasks which are completed in   *
         * parallel to the execution of iser_conn_term. So the code that waits *
         * for the posted rx bufs refcount to become zero handles everything   */
-       atomic_dec(&conn->ib_conn->post_recv_buf_count);
+       conn->ib_conn->post_recv_buf_count--;
 
-       /*
-        * if an unexpected PDU was received then the recv wr consumed must
-        * be replaced, this is done in the next send of a control-type PDU
-        */
-       if (opcode == ISCSI_OP_NOOP_IN && hdr->itt == RESERVED_ITT) {
-               /* nop-in with itt = 0xffffffff */
-               atomic_inc(&conn->ib_conn->unexpected_pdu_count);
-       }
-       else if (opcode == ISCSI_OP_ASYNC_EVENT) {
-               /* asyncronous message */
-               atomic_inc(&conn->ib_conn->unexpected_pdu_count);
+       if (rx_dma == ib_conn->login_dma)
+               return;
+
+       outstanding = ib_conn->post_recv_buf_count;
+       if (outstanding + ISER_MIN_POSTED_RX <= ISER_QP_MAX_RECV_DTOS) {
+               count = min(ISER_QP_MAX_RECV_DTOS - outstanding,
+                                               ISER_MIN_POSTED_RX);
+               err = iser_post_recvm(ib_conn, count);
+               if (err)
+                       iser_err("posting %d rx bufs err %d\n", count, err);
        }
-       /* a reject PDU consumes the recv buf posted for the response */
 }
 
-void iser_snd_completion(struct iser_desc *tx_desc)
+void iser_snd_completion(struct iser_tx_desc *tx_desc,
+                       struct iser_conn *ib_conn)
 {
-       struct iser_dto        *dto = &tx_desc->dto;
-       struct iser_conn       *ib_conn = dto->ib_conn;
-       struct iscsi_iser_conn *iser_conn = ib_conn->iser_conn;
-       struct iscsi_conn      *conn = iser_conn->iscsi_conn;
        struct iscsi_task *task;
-       int resume_tx = 0;
-
-       iser_dbg("Initiator, Data sent dto=0x%p\n", dto);
-
-       iser_dto_buffs_release(dto);
+       struct iser_device *device = ib_conn->device;
 
-       if (tx_desc->type == ISCSI_TX_DATAOUT)
+       if (tx_desc->type == ISCSI_TX_DATAOUT) {
+               ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr,
+                                       ISER_HEADERS_LEN, DMA_TO_DEVICE);
                kmem_cache_free(ig.desc_cache, tx_desc);
-
-       if (atomic_read(&iser_conn->ib_conn->post_send_buf_count) ==
-           ISER_QP_MAX_REQ_DTOS)
-               resume_tx = 1;
+       }
 
        atomic_dec(&ib_conn->post_send_buf_count);
 
-       if (resume_tx) {
-               iser_dbg("%ld resuming tx\n",jiffies);
-               iscsi_conn_queue_work(conn);
-       }
-
        if (tx_desc->type == ISCSI_TX_CONTROL) {
                /* this arithmetic is legal by libiscsi dd_data allocation */
                task = (void *) ((long)(void *)tx_desc -
@@ -692,7 +528,6 @@ void iser_task_rdma_init(struct iscsi_iser_task *iser_task)
 
 void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task)
 {
-       int deferred;
        int is_rdma_aligned = 1;
        struct iser_regd_buf *regd;
 
@@ -710,32 +545,17 @@ void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task)
 
        if (iser_task->dir[ISER_DIR_IN]) {
                regd = &iser_task->rdma_regd[ISER_DIR_IN];
-               deferred = iser_regd_buff_release(regd);
-               if (deferred) {
-                       iser_err("%d references remain for BUF-IN rdma reg\n",
-                                atomic_read(&regd->ref_count));
-               }
+               if (regd->reg.is_fmr)
+                       iser_unreg_mem(&regd->reg);
        }
 
        if (iser_task->dir[ISER_DIR_OUT]) {
                regd = &iser_task->rdma_regd[ISER_DIR_OUT];
-               deferred = iser_regd_buff_release(regd);
-               if (deferred) {
-                       iser_err("%d references remain for BUF-OUT rdma reg\n",
-                                atomic_read(&regd->ref_count));
-               }
+               if (regd->reg.is_fmr)
+                       iser_unreg_mem(&regd->reg);
        }
 
        /* if the data was unaligned, it was already unmapped and then copied */
        if (is_rdma_aligned)
                iser_dma_unmap_task_data(iser_task);
 }
-
-void iser_dto_buffs_release(struct iser_dto *dto)
-{
-       int i;
-
-       for (i = 0; i < dto->regd_vector_len; i++)
-               iser_regd_buff_release(dto->regd[i]);
-}
-
index 274c883..fb88d68 100644 (file)
 #define ISER_KMALLOC_THRESHOLD 0x20000 /* 128K - kmalloc limit */
 
 /**
- * Decrements the reference count for the
- * registered buffer & releases it
- *
- * returns 0 if released, 1 if deferred
- */
-int iser_regd_buff_release(struct iser_regd_buf *regd_buf)
-{
-       struct ib_device *dev;
-
-       if ((atomic_read(&regd_buf->ref_count) == 0) ||
-           atomic_dec_and_test(&regd_buf->ref_count)) {
-               /* if we used the dma mr, unreg is just NOP */
-               if (regd_buf->reg.is_fmr)
-                       iser_unreg_mem(&regd_buf->reg);
-
-               if (regd_buf->dma_addr) {
-                       dev = regd_buf->device->ib_device;
-                       ib_dma_unmap_single(dev,
-                                        regd_buf->dma_addr,
-                                        regd_buf->data_size,
-                                        regd_buf->direction);
-               }
-               /* else this regd buf is associated with task which we */
-               /* dma_unmap_single/sg later */
-               return 0;
-       } else {
-               iser_dbg("Release deferred, regd.buff: 0x%p\n", regd_buf);
-               return 1;
-       }
-}
-
-/**
- * iser_reg_single - fills registered buffer descriptor with
- *                  registration information
- */
-void iser_reg_single(struct iser_device *device,
-                    struct iser_regd_buf *regd_buf,
-                    enum dma_data_direction direction)
-{
-       u64 dma_addr;
-
-       dma_addr = ib_dma_map_single(device->ib_device,
-                                    regd_buf->virt_addr,
-                                    regd_buf->data_size, direction);
-       BUG_ON(ib_dma_mapping_error(device->ib_device, dma_addr));
-
-       regd_buf->reg.lkey = device->mr->lkey;
-       regd_buf->reg.len  = regd_buf->data_size;
-       regd_buf->reg.va   = dma_addr;
-       regd_buf->reg.is_fmr = 0;
-
-       regd_buf->dma_addr  = dma_addr;
-       regd_buf->direction = direction;
-}
-
-/**
  * iser_start_rdma_unaligned_sg
  */
 static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
@@ -109,10 +53,10 @@ static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
        unsigned long  cmd_data_len = data->data_len;
 
        if (cmd_data_len > ISER_KMALLOC_THRESHOLD)
-               mem = (void *)__get_free_pages(GFP_NOIO,
+               mem = (void *)__get_free_pages(GFP_ATOMIC,
                      ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT);
        else
-               mem = kmalloc(cmd_data_len, GFP_NOIO);
+               mem = kmalloc(cmd_data_len, GFP_ATOMIC);
 
        if (mem == NULL) {
                iser_err("Failed to allocate mem size %d %d for copying sglist\n",
@@ -474,9 +418,5 @@ int iser_reg_rdma_mem(struct iscsi_iser_task *iser_task,
                        return err;
                }
        }
-
-       /* take a reference on this regd buf such that it will not be released *
-        * (eg in send dto completion) before we get the scsi response         */
-       atomic_inc(&regd_buf->ref_count);
        return 0;
 }
index 8579f32..308d17b 100644 (file)
@@ -37,9 +37,8 @@
 #include "iscsi_iser.h"
 
 #define ISCSI_ISER_MAX_CONN    8
-#define ISER_MAX_CQ_LEN                ((ISER_QP_MAX_RECV_DTOS + \
-                               ISER_QP_MAX_REQ_DTOS) *   \
-                                ISCSI_ISER_MAX_CONN)
+#define ISER_MAX_RX_CQ_LEN     (ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN)
+#define ISER_MAX_TX_CQ_LEN     (ISER_QP_MAX_REQ_DTOS  * ISCSI_ISER_MAX_CONN)
 
 static void iser_cq_tasklet_fn(unsigned long data);
 static void iser_cq_callback(struct ib_cq *cq, void *cq_context);
@@ -67,15 +66,23 @@ static int iser_create_device_ib_res(struct iser_device *device)
        if (IS_ERR(device->pd))
                goto pd_err;
 
-       device->cq = ib_create_cq(device->ib_device,
+       device->rx_cq = ib_create_cq(device->ib_device,
                                  iser_cq_callback,
                                  iser_cq_event_callback,
                                  (void *)device,
-                                 ISER_MAX_CQ_LEN, 0);
-       if (IS_ERR(device->cq))
-               goto cq_err;
+                                 ISER_MAX_RX_CQ_LEN, 0);
+       if (IS_ERR(device->rx_cq))
+               goto rx_cq_err;
 
-       if (ib_req_notify_cq(device->cq, IB_CQ_NEXT_COMP))
+       device->tx_cq = ib_create_cq(device->ib_device,
+                                 NULL, iser_cq_event_callback,
+                                 (void *)device,
+                                 ISER_MAX_TX_CQ_LEN, 0);
+
+       if (IS_ERR(device->tx_cq))
+               goto tx_cq_err;
+
+       if (ib_req_notify_cq(device->rx_cq, IB_CQ_NEXT_COMP))
                goto cq_arm_err;
 
        tasklet_init(&device->cq_tasklet,
@@ -93,8 +100,10 @@ static int iser_create_device_ib_res(struct iser_device *device)
 dma_mr_err:
        tasklet_kill(&device->cq_tasklet);
 cq_arm_err:
-       ib_destroy_cq(device->cq);
-cq_err:
+       ib_destroy_cq(device->tx_cq);
+tx_cq_err:
+       ib_destroy_cq(device->rx_cq);
+rx_cq_err:
        ib_dealloc_pd(device->pd);
 pd_err:
        iser_err("failed to allocate an IB resource\n");
@@ -112,11 +121,13 @@ static void iser_free_device_ib_res(struct iser_device *device)
        tasklet_kill(&device->cq_tasklet);
 
        (void)ib_dereg_mr(device->mr);
-       (void)ib_destroy_cq(device->cq);
+       (void)ib_destroy_cq(device->tx_cq);
+       (void)ib_destroy_cq(device->rx_cq);
        (void)ib_dealloc_pd(device->pd);
 
        device->mr = NULL;
-       device->cq = NULL;
+       device->tx_cq = NULL;
+       device->rx_cq = NULL;
        device->pd = NULL;
 }
 
@@ -129,13 +140,23 @@ static int iser_create_ib_conn_res(struct iser_conn *ib_conn)
 {
        struct iser_device      *device;
        struct ib_qp_init_attr  init_attr;
-       int                     ret;
+       int                     ret = -ENOMEM;
        struct ib_fmr_pool_param params;
 
        BUG_ON(ib_conn->device == NULL);
 
        device = ib_conn->device;
 
+       ib_conn->login_buf = kmalloc(ISER_RX_LOGIN_SIZE, GFP_KERNEL);
+       if (!ib_conn->login_buf) {
+               ret = -ENOMEM;  /* must precede the goto to take effect */
+               goto alloc_err;
+       }
+
+       ib_conn->login_dma = ib_dma_map_single(ib_conn->device->ib_device,
+                               (void *)ib_conn->login_buf, ISER_RX_LOGIN_SIZE,
+                               DMA_FROM_DEVICE); /* NOTE(review): mapping result is not checked here */
+
        ib_conn->page_vec = kmalloc(sizeof(struct iser_page_vec) +
                                    (sizeof(u64) * (ISCSI_ISER_SG_TABLESIZE +1)),
                                    GFP_KERNEL);
@@ -169,12 +190,12 @@ static int iser_create_ib_conn_res(struct iser_conn *ib_conn)
 
        init_attr.event_handler = iser_qp_event_callback;
        init_attr.qp_context    = (void *)ib_conn;
-       init_attr.send_cq       = device->cq;
-       init_attr.recv_cq       = device->cq;
+       init_attr.send_cq       = device->tx_cq;
+       init_attr.recv_cq       = device->rx_cq;
        init_attr.cap.max_send_wr  = ISER_QP_MAX_REQ_DTOS;
        init_attr.cap.max_recv_wr  = ISER_QP_MAX_RECV_DTOS;
-       init_attr.cap.max_send_sge = MAX_REGD_BUF_VECTOR_LEN;
-       init_attr.cap.max_recv_sge = 2;
+       init_attr.cap.max_send_sge = 2;
+       init_attr.cap.max_recv_sge = 1;
        init_attr.sq_sig_type   = IB_SIGNAL_REQ_WR;
        init_attr.qp_type       = IB_QPT_RC;
 
@@ -192,6 +213,7 @@ qp_err:
        (void)ib_destroy_fmr_pool(ib_conn->fmr_pool);
 fmr_pool_err:
        kfree(ib_conn->page_vec);
+       kfree(ib_conn->login_buf);
 alloc_err:
        iser_err("unable to alloc mem or create resource, err %d\n", ret);
        return ret;
@@ -278,17 +300,6 @@ static void iser_device_try_release(struct iser_device *device)
        mutex_unlock(&ig.device_list_mutex);
 }
 
-int iser_conn_state_comp(struct iser_conn *ib_conn,
-                       enum iser_ib_conn_state comp)
-{
-       int ret;
-
-       spin_lock_bh(&ib_conn->lock);
-       ret = (ib_conn->state == comp);
-       spin_unlock_bh(&ib_conn->lock);
-       return ret;
-}
-
 static int iser_conn_state_comp_exch(struct iser_conn *ib_conn,
                                     enum iser_ib_conn_state comp,
                                     enum iser_ib_conn_state exch)
@@ -314,7 +325,7 @@ static void iser_conn_release(struct iser_conn *ib_conn)
        mutex_lock(&ig.connlist_mutex);
        list_del(&ib_conn->conn_list);
        mutex_unlock(&ig.connlist_mutex);
-
+       iser_free_rx_descriptors(ib_conn);
        iser_free_ib_conn_res(ib_conn);
        ib_conn->device = NULL;
        /* on EVENT_ADDR_ERROR there's no device yet for this conn */
@@ -442,7 +453,7 @@ static void iser_disconnected_handler(struct rdma_cm_id *cma_id)
                                   ISCSI_ERR_CONN_FAILED);
 
        /* Complete the termination process if no posts are pending */
-       if ((atomic_read(&ib_conn->post_recv_buf_count) == 0) &&
+       if (ib_conn->post_recv_buf_count == 0 &&
            (atomic_read(&ib_conn->post_send_buf_count) == 0)) {
                ib_conn->state = ISER_CONN_DOWN;
                wake_up_interruptible(&ib_conn->wait);
@@ -489,9 +500,8 @@ void iser_conn_init(struct iser_conn *ib_conn)
 {
        ib_conn->state = ISER_CONN_INIT;
        init_waitqueue_head(&ib_conn->wait);
-       atomic_set(&ib_conn->post_recv_buf_count, 0);
+       ib_conn->post_recv_buf_count = 0;
        atomic_set(&ib_conn->post_send_buf_count, 0);
-       atomic_set(&ib_conn->unexpected_pdu_count, 0);
        atomic_set(&ib_conn->refcount, 1);
        INIT_LIST_HEAD(&ib_conn->conn_list);
        spin_lock_init(&ib_conn->lock);
@@ -626,136 +636,97 @@ void iser_unreg_mem(struct iser_mem_reg *reg)
        reg->mem_h = NULL;
 }
 
-/**
- * iser_dto_to_iov - builds IOV from a dto descriptor
- */
-static void iser_dto_to_iov(struct iser_dto *dto, struct ib_sge *iov, int iov_len)
+int iser_post_recvl(struct iser_conn *ib_conn) /* post the login rx buffer */
 {
-       int                  i;
-       struct ib_sge        *sge;
-       struct iser_regd_buf *regd_buf;
-
-       if (dto->regd_vector_len > iov_len) {
-               iser_err("iov size %d too small for posting dto of len %d\n",
-                        iov_len, dto->regd_vector_len);
-               BUG();
-       }
+       struct ib_recv_wr rx_wr, *rx_wr_failed;
+       struct ib_sge     sge;
+       int ib_ret;
 
-       for (i = 0; i < dto->regd_vector_len; i++) {
-               sge         = &iov[i];
-               regd_buf  = dto->regd[i];
-
-               sge->addr   = regd_buf->reg.va;
-               sge->length = regd_buf->reg.len;
-               sge->lkey   = regd_buf->reg.lkey;
-
-               if (dto->used_sz[i] > 0)  /* Adjust size */
-                       sge->length = dto->used_sz[i];
-
-               /* offset and length should not exceed the regd buf length */
-               if (sge->length + dto->offset[i] > regd_buf->reg.len) {
-                       iser_err("Used len:%ld + offset:%d, exceed reg.buf.len:"
-                                "%ld in dto:0x%p [%d], va:0x%08lX\n",
-                                (unsigned long)sge->length, dto->offset[i],
-                                (unsigned long)regd_buf->reg.len, dto, i,
-                                (unsigned long)sge->addr);
-                       BUG();
-               }
+       sge.addr   = ib_conn->login_dma;        /* one SGE spanning the login buffer */
+       sge.length = ISER_RX_LOGIN_SIZE;
+       sge.lkey   = ib_conn->device->mr->lkey;
 
-               sge->addr += dto->offset[i]; /* Adjust offset */
+       rx_wr.wr_id   = (unsigned long)ib_conn->login_buf; /* cookie returned in the completion's wr_id */
+       rx_wr.sg_list = &sge;
+       rx_wr.num_sge = 1;
+       rx_wr.next    = NULL;   /* single work request, no chain */
+
+       ib_conn->post_recv_buf_count++; /* counted before posting; undone on failure */
+       ib_ret  = ib_post_recv(ib_conn->qp, &rx_wr, &rx_wr_failed);
+       if (ib_ret) {
+               iser_err("ib_post_recv failed ret=%d\n", ib_ret);
+               ib_conn->post_recv_buf_count--;
        }
+       return ib_ret;  /* ib_post_recv() result; nonzero is treated as failure above */
 }
 
-/**
- * iser_post_recv - Posts a receive buffer.
- *
- * returns 0 on success, -1 on failure
- */
-int iser_post_recv(struct iser_desc *rx_desc)
+int iser_post_recvm(struct iser_conn *ib_conn, int count) /* post 'count' rx descriptors as one chain */
 {
-       int               ib_ret, ret_val = 0;
-       struct ib_recv_wr recv_wr, *recv_wr_failed;
-       struct ib_sge     iov[2];
-       struct iser_conn  *ib_conn;
-       struct iser_dto   *recv_dto = &rx_desc->dto;
-
-       /* Retrieve conn */
-       ib_conn = recv_dto->ib_conn;
-
-       iser_dto_to_iov(recv_dto, iov, 2);
+       struct ib_recv_wr *rx_wr, *rx_wr_failed;
+       int i, ib_ret;
+       unsigned int my_rx_head = ib_conn->rx_desc_head;
+       struct iser_rx_desc *rx_desc;
+
+       for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) {
+               rx_desc         = &ib_conn->rx_descs[my_rx_head];
+               rx_wr->wr_id    = (unsigned long)rx_desc;
+               rx_wr->sg_list  = &rx_desc->rx_sg;
+               rx_wr->num_sge  = 1;
+               rx_wr->next     = rx_wr + 1;    /* link to the following array slot */
+               my_rx_head = (my_rx_head + 1) & (ISER_QP_MAX_RECV_DTOS - 1);
+       }
 
-       recv_wr.next    = NULL;
-       recv_wr.sg_list = iov;
-       recv_wr.num_sge = recv_dto->regd_vector_len;
-       recv_wr.wr_id   = (unsigned long)rx_desc;
+       rx_wr--;        /* back to the last WR filled in; NOTE(review): assumes count >= 1 */
+       rx_wr->next = NULL; /* mark end of work requests list */
 
-       atomic_inc(&ib_conn->post_recv_buf_count);
-       ib_ret  = ib_post_recv(ib_conn->qp, &recv_wr, &recv_wr_failed);
+       ib_conn->post_recv_buf_count += count;  /* rolled back below on failure */
+       ib_ret  = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &rx_wr_failed);
        if (ib_ret) {
                iser_err("ib_post_recv failed ret=%d\n", ib_ret);
-               atomic_dec(&ib_conn->post_recv_buf_count);
-               ret_val = -1;
-       }
-
-       return ret_val;
+               ib_conn->post_recv_buf_count -= count;
+       } else
+               ib_conn->rx_desc_head = my_rx_head;     /* commit the advanced head only on success */
+       return ib_ret;  /* ib_post_recv() result; nonzero is treated as failure above */
 }
 
+
 /**
  * iser_start_send - Initiate a Send DTO operation
  *
  * returns 0 on success, -1 on failure
  */
-int iser_post_send(struct iser_desc *tx_desc)
+int iser_post_send(struct iser_conn *ib_conn, struct iser_tx_desc *tx_desc)
 {
-       int               ib_ret, ret_val = 0;
+       int               ib_ret;
        struct ib_send_wr send_wr, *send_wr_failed;
-       struct ib_sge     iov[MAX_REGD_BUF_VECTOR_LEN];
-       struct iser_conn  *ib_conn;
-       struct iser_dto   *dto = &tx_desc->dto;
 
-       ib_conn = dto->ib_conn;
-
-       iser_dto_to_iov(dto, iov, MAX_REGD_BUF_VECTOR_LEN);
+       ib_dma_sync_single_for_device(ib_conn->device->ib_device,
+               tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); /* syncs ISER_HEADERS_LEN bytes at dma_addr */
 
        send_wr.next       = NULL;
        send_wr.wr_id      = (unsigned long)tx_desc;
-       send_wr.sg_list    = iov;
-       send_wr.num_sge    = dto->regd_vector_len;
+       send_wr.sg_list    = tx_desc->tx_sg;    /* SG list is taken from the descriptor */
+       send_wr.num_sge    = tx_desc->num_sge;
        send_wr.opcode     = IB_WR_SEND;
-       send_wr.send_flags = dto->notify_enable ? IB_SEND_SIGNALED : 0;
+       send_wr.send_flags = IB_SEND_SIGNALED;  /* every send is now signaled */
 
        atomic_inc(&ib_conn->post_send_buf_count);
 
        ib_ret = ib_post_send(ib_conn->qp, &send_wr, &send_wr_failed);
        if (ib_ret) {
-               iser_err("Failed to start SEND DTO, dto: 0x%p, IOV len: %d\n",
-                        dto, dto->regd_vector_len);
                iser_err("ib_post_send failed, ret:%d\n", ib_ret);
                atomic_dec(&ib_conn->post_send_buf_count);
-               ret_val = -1;
        }
-
-       return ret_val;
+       return ib_ret;  /* ib_post_send() result rather than the former 0 / -1 */
 }
 
-static void iser_handle_comp_error(struct iser_desc *desc)
+static void iser_handle_comp_error(struct iser_tx_desc *desc,
+                               struct iser_conn *ib_conn)
 {
-       struct iser_dto  *dto     = &desc->dto;
-       struct iser_conn *ib_conn = dto->ib_conn;
-
-       iser_dto_buffs_release(dto);
-
-       if (desc->type == ISCSI_RX) {
-               kfree(desc->data);
+       if (desc && desc->type == ISCSI_TX_DATAOUT)
                kmem_cache_free(ig.desc_cache, desc);
-               atomic_dec(&ib_conn->post_recv_buf_count);
-       } else { /* type is TX control/command/dataout */
-               if (desc->type == ISCSI_TX_DATAOUT)
-                       kmem_cache_free(ig.desc_cache, desc);
-               atomic_dec(&ib_conn->post_send_buf_count);
-       }
 
-       if (atomic_read(&ib_conn->post_recv_buf_count) == 0 &&
+       if (ib_conn->post_recv_buf_count == 0 &&
            atomic_read(&ib_conn->post_send_buf_count) == 0) {
                /* getting here when the state is UP means that the conn is *
                 * being terminated asynchronously from the iSCSI layer's   *
@@ -774,32 +745,74 @@ static void iser_handle_comp_error(struct iser_desc *desc)
        }
 }
 
+static int iser_drain_tx_cq(struct iser_device  *device) /* poll the tx CQ until empty */
+{
+       struct ib_cq  *cq = device->tx_cq;
+       struct ib_wc  wc;
+       struct iser_tx_desc *tx_desc;
+       struct iser_conn *ib_conn;
+       int completed_tx = 0;
+
+       while (ib_poll_cq(cq, 1, &wc) == 1) {
+               tx_desc = (struct iser_tx_desc *) (unsigned long) wc.wr_id;
+               ib_conn = wc.qp->qp_context;    /* QP context; read as the owning iser_conn */
+               if (wc.status == IB_WC_SUCCESS) {
+                       if (wc.opcode == IB_WC_SEND)
+                               iser_snd_completion(tx_desc, ib_conn);
+                       else    /* unexpected opcode is only logged */
+                               iser_err("expected opcode %d got %d\n",
+                                       IB_WC_SEND, wc.opcode);
+               } else {
+                       iser_err("tx id %llx status %d vend_err %x\n",
+                               wc.wr_id, wc.status, wc.vendor_err);
+                       atomic_dec(&ib_conn->post_send_buf_count);
+                       iser_handle_comp_error(tx_desc, ib_conn);
+               }
+               completed_tx++; /* counts every polled CQE, success or error */
+       }
+       return completed_tx;
+}
+
+
 static void iser_cq_tasklet_fn(unsigned long data)
 {
         struct iser_device  *device = (struct iser_device *)data;
-        struct ib_cq        *cq = device->cq;
+        struct ib_cq        *cq = device->rx_cq; /* rx CQ only */
         struct ib_wc        wc;
-        struct iser_desc    *desc;
+        struct iser_rx_desc *desc;
         unsigned long       xfer_len;
+       struct iser_conn *ib_conn;
+       int completed_tx, completed_rx;
+       completed_tx = completed_rx = 0;
 
        while (ib_poll_cq(cq, 1, &wc) == 1) {
-               desc     = (struct iser_desc *) (unsigned long) wc.wr_id;
+               desc     = (struct iser_rx_desc *) (unsigned long) wc.wr_id;
                BUG_ON(desc == NULL);
-
+               ib_conn = wc.qp->qp_context;
                if (wc.status == IB_WC_SUCCESS) {
-                       if (desc->type == ISCSI_RX) {
+                       if (wc.opcode == IB_WC_RECV) {
                                xfer_len = (unsigned long)wc.byte_len;
-                               iser_rcv_completion(desc, xfer_len);
-                       } else /* type == ISCSI_TX_CONTROL/SCSI_CMD/DOUT */
-                               iser_snd_completion(desc);
+                               iser_rcv_completion(desc, xfer_len, ib_conn);
+                       } else  /* unexpected opcode is only logged */
+                               iser_err("expected opcode %d got %d\n",
+                                       IB_WC_RECV, wc.opcode);
                } else {
-                       iser_err("comp w. error op %d status %d\n",desc->type,wc.status);
-                       iser_handle_comp_error(desc);
+                       if (wc.status != IB_WC_WR_FLUSH_ERR)    /* flush errors are not logged */
+                               iser_err("rx id %llx status %d vend_err %x\n",
+                                       wc.wr_id, wc.status, wc.vendor_err);
+                       ib_conn->post_recv_buf_count--;
+                       iser_handle_comp_error(NULL, ib_conn);  /* NULL: no tx desc on the rx path */
                }
+               completed_rx++;
+               if (!(completed_rx & 63))       /* every 64th rx completion */
+                       completed_tx += iser_drain_tx_cq(device);
        }
        /* #warning "it is assumed here that arming CQ only once its empty" *
         * " would not cause interrupts to be missed"                       */
        ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+
+       completed_tx += iser_drain_tx_cq(device);       /* drain tx once more after re-arming rx */
+       iser_dbg("got %d rx %d tx completions\n", completed_rx, completed_tx);
 }
 
 static void iser_cq_callback(struct ib_cq *cq, void *cq_context)
index 54c8fe2..ed3f9eb 100644 (file)
@@ -80,7 +80,8 @@ MODULE_PARM_DESC(mellanox_workarounds,
 
 static void srp_add_one(struct ib_device *device);
 static void srp_remove_one(struct ib_device *device);
-static void srp_completion(struct ib_cq *cq, void *target_ptr);
+static void srp_recv_completion(struct ib_cq *cq, void *target_ptr);
+static void srp_send_completion(struct ib_cq *cq, void *target_ptr);
 static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event);
 
 static struct scsi_transport_template *ib_srp_transport_template;
@@ -227,14 +228,21 @@ static int srp_create_target_ib(struct srp_target_port *target)
        if (!init_attr)
                return -ENOMEM;
 
-       target->cq = ib_create_cq(target->srp_host->srp_dev->dev,
-                                 srp_completion, NULL, target, SRP_CQ_SIZE, 0);
-       if (IS_ERR(target->cq)) {
-               ret = PTR_ERR(target->cq);
-               goto out;
+       target->recv_cq = ib_create_cq(target->srp_host->srp_dev->dev,
+                                      srp_recv_completion, NULL, target, SRP_RQ_SIZE, 0);
+       if (IS_ERR(target->recv_cq)) {
+               ret = PTR_ERR(target->recv_cq);
+               goto err;
        }
 
-       ib_req_notify_cq(target->cq, IB_CQ_NEXT_COMP);
+       target->send_cq = ib_create_cq(target->srp_host->srp_dev->dev,
+                                      srp_send_completion, NULL, target, SRP_SQ_SIZE, 0);
+       if (IS_ERR(target->send_cq)) {
+               ret = PTR_ERR(target->send_cq);
+               goto err_recv_cq;
+       }
+
+       ib_req_notify_cq(target->recv_cq, IB_CQ_NEXT_COMP);
 
        init_attr->event_handler       = srp_qp_event;
        init_attr->cap.max_send_wr     = SRP_SQ_SIZE;
@@ -243,24 +251,32 @@ static int srp_create_target_ib(struct srp_target_port *target)
        init_attr->cap.max_send_sge    = 1;
        init_attr->sq_sig_type         = IB_SIGNAL_ALL_WR;
        init_attr->qp_type             = IB_QPT_RC;
-       init_attr->send_cq             = target->cq;
-       init_attr->recv_cq             = target->cq;
+       init_attr->send_cq             = target->send_cq;
+       init_attr->recv_cq             = target->recv_cq;
 
        target->qp = ib_create_qp(target->srp_host->srp_dev->pd, init_attr);
        if (IS_ERR(target->qp)) {
                ret = PTR_ERR(target->qp);
-               ib_destroy_cq(target->cq);
-               goto out;
+               goto err_send_cq;
        }
 
        ret = srp_init_qp(target, target->qp);
-       if (ret) {
-               ib_destroy_qp(target->qp);
-               ib_destroy_cq(target->cq);
-               goto out;
-       }
+       if (ret)
+               goto err_qp;
 
-out:
+       kfree(init_attr);
+       return 0;
+
+err_qp:
+       ib_destroy_qp(target->qp);
+
+err_send_cq:
+       ib_destroy_cq(target->send_cq);
+
+err_recv_cq:
+       ib_destroy_cq(target->recv_cq);
+
+err:
        kfree(init_attr);
        return ret;
 }
@@ -270,7 +286,8 @@ static void srp_free_target_ib(struct srp_target_port *target)
        int i;
 
        ib_destroy_qp(target->qp);
-       ib_destroy_cq(target->cq);
+       ib_destroy_cq(target->send_cq);
+       ib_destroy_cq(target->recv_cq);
 
        for (i = 0; i < SRP_RQ_SIZE; ++i)
                srp_free_iu(target->srp_host, target->rx_ring[i]);
@@ -568,7 +585,9 @@ static int srp_reconnect_target(struct srp_target_port *target)
        if (ret)
                goto err;
 
-       while (ib_poll_cq(target->cq, 1, &wc) > 0)
+       while (ib_poll_cq(target->recv_cq, 1, &wc) > 0)
+               ; /* nothing */
+       while (ib_poll_cq(target->send_cq, 1, &wc) > 0)
                ; /* nothing */
 
        spin_lock_irq(target->scsi_host->host_lock);
@@ -851,7 +870,7 @@ static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc)
        struct srp_iu *iu;
        u8 opcode;
 
-       iu = target->rx_ring[wc->wr_id & ~SRP_OP_RECV];
+       iu = target->rx_ring[wc->wr_id];
 
        dev = target->srp_host->srp_dev->dev;
        ib_dma_sync_single_for_cpu(dev, iu->dma, target->max_ti_iu_len,
@@ -898,7 +917,7 @@ static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc)
                                      DMA_FROM_DEVICE);
 }
 
-static void srp_completion(struct ib_cq *cq, void *target_ptr)
+static void srp_recv_completion(struct ib_cq *cq, void *target_ptr)
 {
        struct srp_target_port *target = target_ptr;
        struct ib_wc wc;
@@ -907,17 +926,31 @@ static void srp_completion(struct ib_cq *cq, void *target_ptr)
        while (ib_poll_cq(cq, 1, &wc) > 0) {
                if (wc.status) {
                        shost_printk(KERN_ERR, target->scsi_host,
-                                    PFX "failed %s status %d\n",
-                                    wc.wr_id & SRP_OP_RECV ? "receive" : "send",
+                                    PFX "failed receive status %d\n",
                                     wc.status);
                        target->qp_in_error = 1;
                        break;
                }
 
-               if (wc.wr_id & SRP_OP_RECV)
-                       srp_handle_recv(target, &wc);
-               else
-                       ++target->tx_tail;
+               srp_handle_recv(target, &wc);
+       }
+}
+
+static void srp_send_completion(struct ib_cq *cq, void *target_ptr)
+{
+       struct srp_target_port *target = target_ptr;
+       struct ib_wc wc;
+
+       while (ib_poll_cq(cq, 1, &wc) > 0) {    /* reap send completions one at a time */
+               if (wc.status) {
+                       shost_printk(KERN_ERR, target->scsi_host,
+                                    PFX "failed send status %d\n",
+                                    wc.status);
+                       target->qp_in_error = 1;
+                       break;  /* stop polling once the QP is flagged in error */
+               }
+
+               ++target->tx_tail;      /* one more send completed; compared against tx_head in __srp_get_tx_iu() */
+       }
+}
 
@@ -930,7 +963,7 @@ static int __srp_post_recv(struct srp_target_port *target)
        int ret;
 
        next     = target->rx_head & (SRP_RQ_SIZE - 1);
-       wr.wr_id = next | SRP_OP_RECV;
+       wr.wr_id = next;
        iu       = target->rx_ring[next];
 
        list.addr   = iu->dma;
@@ -970,6 +1003,8 @@ static struct srp_iu *__srp_get_tx_iu(struct srp_target_port *target,
 {
        s32 min = (req_type == SRP_REQ_TASK_MGMT) ? 1 : 2;
 
+       srp_send_completion(target->send_cq, target);
+
        if (target->tx_head - target->tx_tail >= SRP_SQ_SIZE)
                return NULL;
 
index e185b90..5a80eac 100644 (file)
@@ -60,7 +60,6 @@ enum {
        SRP_RQ_SHIFT            = 6,
        SRP_RQ_SIZE             = 1 << SRP_RQ_SHIFT,
        SRP_SQ_SIZE             = SRP_RQ_SIZE - 1,
-       SRP_CQ_SIZE             = SRP_SQ_SIZE + SRP_RQ_SIZE,
 
        SRP_TAG_TSK_MGMT        = 1 << (SRP_RQ_SHIFT + 1),
 
@@ -69,8 +68,6 @@ enum {
        SRP_FMR_DIRTY_SIZE      = SRP_FMR_POOL_SIZE / 4
 };
 
-#define SRP_OP_RECV            (1 << 31)
-
 enum srp_target_state {
        SRP_TARGET_LIVE,
        SRP_TARGET_CONNECTING,
@@ -133,7 +130,8 @@ struct srp_target_port {
        int                     path_query_id;
 
        struct ib_cm_id        *cm_id;
-       struct ib_cq           *cq;
+       struct ib_cq           *recv_cq;
+       struct ib_cq           *send_cq;
        struct ib_qp           *qp;
 
        int                     max_ti_iu_len;
index 3e8618b..4cd7f42 100644 (file)
@@ -264,6 +264,10 @@ struct adapter {
        struct work_struct fatal_error_handler_task;
        struct work_struct link_fault_handler_task;
 
+       struct work_struct db_full_task;
+       struct work_struct db_empty_task;
+       struct work_struct db_drop_task;
+
        struct dentry *debugfs_root;
 
        struct mutex mdio_lock;
@@ -335,6 +339,7 @@ int t3_sge_alloc_qset(struct adapter *adapter, unsigned int id, int nports,
 int t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
                unsigned char *data);
 irqreturn_t t3_sge_intr_msix(int irq, void *cookie);
+extern struct workqueue_struct *cxgb3_wq;
 
 int t3_get_edc_fw(struct cphy *phy, int edc_idx, int size);
 
index 89bec9c..37945fc 100644 (file)
@@ -45,6 +45,7 @@
 #include <linux/firmware.h>
 #include <linux/log2.h>
 #include <linux/stringify.h>
+#include <linux/sched.h>
 #include <asm/uaccess.h>
 
 #include "common.h"
@@ -140,7 +141,7 @@ MODULE_PARM_DESC(ofld_disable, "whether to enable offload at init time or not");
  * will block keventd as it needs the rtnl lock, and we'll deadlock waiting
  * for our work to complete.  Get our own work queue to solve this.
  */
-static struct workqueue_struct *cxgb3_wq;
+struct workqueue_struct *cxgb3_wq;
 
 /**
  *     link_report - show link status and link speed/duplex
@@ -590,6 +591,19 @@ static void setup_rss(struct adapter *adap)
                      V_RRCPLCPUSIZE(6) | F_HASHTOEPLITZ, cpus, rspq_map);
 }
 
+static void ring_dbs(struct adapter *adap) /* write A_SG_KDOORBELL for each tx queue */
+{
+       int i, j;
+
+       for (i = 0; i < SGE_QSETS; i++) {
+               struct sge_qset *qs = &adap->sge.qs[i];
+
+               if (qs->adap)   /* NOTE(review): presumably non-NULL only for set-up qsets - confirm */
+                       for (j = 0; j < SGE_TXQ_PER_SET; j++)
+                               t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | V_EGRCNTX(qs->txq[j].cntxt_id));
+       }
+}
+
 static void init_napi(struct adapter *adap)
 {
        int i;
@@ -2754,6 +2768,42 @@ static void t3_adap_check_task(struct work_struct *work)
        spin_unlock_irq(&adapter->work_lock);
 }
 
+static void db_full_task(struct work_struct *work) /* queued on F_{HI,LO}PRIORITYDBFULL */
+{
+       struct adapter *adapter = container_of(work, struct adapter,
+                                              db_full_task);
+
+       cxgb3_event_notify(&adapter->tdev, OFFLOAD_DB_FULL, 0);
+}
+
+static void db_empty_task(struct work_struct *work) /* queued on F_{HI,LO}PRIORITYDBEMPTY */
+{
+       struct adapter *adapter = container_of(work, struct adapter,
+                                              db_empty_task);
+
+       cxgb3_event_notify(&adapter->tdev, OFFLOAD_DB_EMPTY, 0);
+}
+
+static void db_drop_task(struct work_struct *work) /* queued on F_{HI,LO}PIODRBDROPERR */
+{
+       struct adapter *adapter = container_of(work, struct adapter,
+                                              db_drop_task);
+       unsigned long delay = 1000;
+       unsigned short r;
+
+       cxgb3_event_notify(&adapter->tdev, OFFLOAD_DB_DROP, 0);
+
+       /*
+        * Sleep a while before ringing the driver qset dbs.
+        * The delay is 1000 + (r & 1023) usecs, i.e. 1000-2023 usecs.
+        */
+       get_random_bytes(&r, 2);
+       delay += r & 1023;
+       set_current_state(TASK_UNINTERRUPTIBLE); /* so schedule_timeout() actually sleeps */
+       schedule_timeout(usecs_to_jiffies(delay)); /* timeout is converted to jiffies */
+       ring_dbs(adapter);
+}
+
 /*
  * Processes external (PHY) interrupts in process context.
  */
@@ -3222,6 +3272,11 @@ static int __devinit init_one(struct pci_dev *pdev,
        INIT_LIST_HEAD(&adapter->adapter_list);
        INIT_WORK(&adapter->ext_intr_handler_task, ext_intr_task);
        INIT_WORK(&adapter->fatal_error_handler_task, fatal_error_task);
+
+       INIT_WORK(&adapter->db_full_task, db_full_task);
+       INIT_WORK(&adapter->db_empty_task, db_empty_task);
+       INIT_WORK(&adapter->db_drop_task, db_drop_task);
+
        INIT_DELAYED_WORK(&adapter->adap_check_task, t3_adap_check_task);
 
        for (i = 0; i < ai->nports0 + ai->nports1; ++i) {
index 670aa62..929c298 100644 (file)
@@ -73,7 +73,10 @@ enum {
        OFFLOAD_STATUS_UP,
        OFFLOAD_STATUS_DOWN,
        OFFLOAD_PORT_DOWN,
-       OFFLOAD_PORT_UP
+       OFFLOAD_PORT_UP,
+       OFFLOAD_DB_FULL,
+       OFFLOAD_DB_EMPTY,
+       OFFLOAD_DB_DROP
 };
 
 struct cxgb3_client {
index 1b5327b..cb42353 100644 (file)
 #define V_LOPIODRBDROPERR(x) ((x) << S_LOPIODRBDROPERR)
 #define F_LOPIODRBDROPERR    V_LOPIODRBDROPERR(1U)
 
+#define S_HIPRIORITYDBFULL    7
+#define V_HIPRIORITYDBFULL(x) ((x) << S_HIPRIORITYDBFULL)
+#define F_HIPRIORITYDBFULL    V_HIPRIORITYDBFULL(1U)
+
+#define S_HIPRIORITYDBEMPTY   6
+#define V_HIPRIORITYDBEMPTY(x) ((x) << S_HIPRIORITYDBEMPTY)
+#define F_HIPRIORITYDBEMPTY    V_HIPRIORITYDBEMPTY(1U)
+
+#define S_LOPRIORITYDBFULL    5
+#define V_LOPRIORITYDBFULL(x) ((x) << S_LOPRIORITYDBFULL)
+#define F_LOPRIORITYDBFULL    V_LOPRIORITYDBFULL(1U)
+
+#define S_LOPRIORITYDBEMPTY   4
+#define V_LOPRIORITYDBEMPTY(x) ((x) << S_LOPRIORITYDBEMPTY)
+#define F_LOPRIORITYDBEMPTY    V_LOPRIORITYDBEMPTY(1U)
+
 #define S_RSPQDISABLED    3
 #define V_RSPQDISABLED(x) ((x) << S_RSPQDISABLED)
 #define F_RSPQDISABLED    V_RSPQDISABLED(1U)
index 318a018..9b43446 100644 (file)
@@ -42,6 +42,7 @@
 #include "sge_defs.h"
 #include "t3_cpl.h"
 #include "firmware_exports.h"
+#include "cxgb3_offload.h"
 
 #define USE_GTS 0
 
@@ -2833,8 +2834,13 @@ void t3_sge_err_intr_handler(struct adapter *adapter)
        }
 
        if (status & (F_HIPIODRBDROPERR | F_LOPIODRBDROPERR))
-               CH_ALERT(adapter, "SGE dropped %s priority doorbell\n",
-                        status & F_HIPIODRBDROPERR ? "high" : "lo");
+               queue_work(cxgb3_wq, &adapter->db_drop_task);
+
+       if (status & (F_HIPRIORITYDBFULL | F_LOPRIORITYDBFULL))
+               queue_work(cxgb3_wq, &adapter->db_full_task);
+
+       if (status & (F_HIPRIORITYDBEMPTY | F_LOPRIORITYDBEMPTY))
+               queue_work(cxgb3_wq, &adapter->db_empty_task);
 
        t3_write_reg(adapter, A_SG_INT_CAUSE, status);
        if (status &  SGE_FATALERR)
index 032cfe0..c38fc71 100644 (file)
@@ -1432,7 +1432,10 @@ static int t3_handle_intr_status(struct adapter *adapter, unsigned int reg,
                       F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \
                       V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \
                       F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \
-                      F_HIRCQPARITYERROR)
+                      F_HIRCQPARITYERROR | F_LOPRIORITYDBFULL | \
+                      F_HIPRIORITYDBFULL | F_LOPRIORITYDBEMPTY | \
+                      F_HIPRIORITYDBEMPTY | F_HIPIODRBDROPERR | \
+                      F_LOPIODRBDROPERR)
 #define MC5_INTR_MASK (F_PARITYERR | F_ACTRGNFULL | F_UNKNOWNCMD | \
                       F_REQQPARERR | F_DISPQPARERR | F_DELACTEMPTY | \
                       F_NFASRCHFAIL)
index 09509ed..a585e0f 100644 (file)
@@ -984,9 +984,9 @@ struct ib_device {
        struct list_head              event_handler_list;
        spinlock_t                    event_handler_lock;
 
+       spinlock_t                    client_data_lock;
        struct list_head              core_list;
        struct list_head              client_data_list;
-       spinlock_t                    client_data_lock;
 
        struct ib_cache               cache;
        int                          *pkey_tbl_len;
@@ -1144,8 +1144,8 @@ struct ib_device {
                IB_DEV_UNREGISTERED
        }                            reg_state;
 
-       u64                          uverbs_cmd_mask;
        int                          uverbs_abi_ver;
+       u64                          uverbs_cmd_mask;
 
        char                         node_desc[64];
        __be64                       node_guid;
index c6b2962..4fae903 100644 (file)
@@ -67,7 +67,6 @@ enum rdma_port_space {
        RDMA_PS_IPOIB = 0x0002,
        RDMA_PS_TCP   = 0x0106,
        RDMA_PS_UDP   = 0x0111,
-       RDMA_PS_SCTP  = 0x0183
 };
 
 struct rdma_addr {