IB/ipath: Fix many locking issues when switching to error state
Ralph Campbell [Tue, 13 May 2008 18:41:29 +0000 (11:41 -0700)]
The send DMA hardware queue invalidated a number of prior assumptions
about when a send is complete, which led to completions being generated
out of order.  There were also a number of locking issues when switching
the QP to the error or reset states.  This patch fixes these problems and
also implements the IB_QPS_SQD state.

Signed-off-by: Ralph Campbell <ralph.campbell@qlogic.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>

drivers/infiniband/hw/ipath/ipath_qp.c
drivers/infiniband/hw/ipath/ipath_rc.c
drivers/infiniband/hw/ipath/ipath_ruc.c
drivers/infiniband/hw/ipath/ipath_uc.c
drivers/infiniband/hw/ipath/ipath_ud.c
drivers/infiniband/hw/ipath/ipath_user_sdma.h
drivers/infiniband/hw/ipath/ipath_verbs.c
drivers/infiniband/hw/ipath/ipath_verbs.h

index 6f98632..4715911 100644 (file)
@@ -242,7 +242,6 @@ static void ipath_free_qp(struct ipath_qp_table *qpt, struct ipath_qp *qp)
 {
        struct ipath_qp *q, **qpp;
        unsigned long flags;
-       int fnd = 0;
 
        spin_lock_irqsave(&qpt->lock, flags);
 
@@ -253,51 +252,40 @@ static void ipath_free_qp(struct ipath_qp_table *qpt, struct ipath_qp *qp)
                        *qpp = qp->next;
                        qp->next = NULL;
                        atomic_dec(&qp->refcount);
-                       fnd = 1;
                        break;
                }
        }
 
        spin_unlock_irqrestore(&qpt->lock, flags);
-
-       if (!fnd)
-               return;
-
-       free_qpn(qpt, qp->ibqp.qp_num);
-
-       wait_event(qp->wait, !atomic_read(&qp->refcount));
 }
 
 /**
- * ipath_free_all_qps - remove all QPs from the table
+ * ipath_free_all_qps - check for QPs still in use
  * @qpt: the QP table to empty
+ *
+ * There should not be any QPs still in use.
+ * Free memory for table.
  */
-void ipath_free_all_qps(struct ipath_qp_table *qpt)
+unsigned ipath_free_all_qps(struct ipath_qp_table *qpt)
 {
        unsigned long flags;
-       struct ipath_qp *qp, *nqp;
-       u32 n;
+       struct ipath_qp *qp;
+       u32 n, qp_inuse = 0;
 
+       spin_lock_irqsave(&qpt->lock, flags);
        for (n = 0; n < qpt->max; n++) {
-               spin_lock_irqsave(&qpt->lock, flags);
                qp = qpt->table[n];
                qpt->table[n] = NULL;
-               spin_unlock_irqrestore(&qpt->lock, flags);
-
-               while (qp) {
-                       nqp = qp->next;
-                       free_qpn(qpt, qp->ibqp.qp_num);
-                       if (!atomic_dec_and_test(&qp->refcount) ||
-                           !ipath_destroy_qp(&qp->ibqp))
-                               ipath_dbg("QP memory leak!\n");
-                       qp = nqp;
-               }
+
+               for (; qp; qp = qp->next)
+                       qp_inuse++;
        }
+       spin_unlock_irqrestore(&qpt->lock, flags);
 
-       for (n = 0; n < ARRAY_SIZE(qpt->map); n++) {
+       for (n = 0; n < ARRAY_SIZE(qpt->map); n++)
                if (qpt->map[n].page)
-                       free_page((unsigned long)qpt->map[n].page);
-       }
+                       free_page((unsigned long) qpt->map[n].page);
+       return qp_inuse;
 }
 
 /**
@@ -336,11 +324,12 @@ static void ipath_reset_qp(struct ipath_qp *qp, enum ib_qp_type type)
        qp->remote_qpn = 0;
        qp->qkey = 0;
        qp->qp_access_flags = 0;
-       qp->s_busy = 0;
+       atomic_set(&qp->s_dma_busy, 0);
        qp->s_flags &= IPATH_S_SIGNAL_REQ_WR;
        qp->s_hdrwords = 0;
        qp->s_wqe = NULL;
        qp->s_pkt_delay = 0;
+       qp->s_draining = 0;
        qp->s_psn = 0;
        qp->r_psn = 0;
        qp->r_msn = 0;
@@ -353,7 +342,8 @@ static void ipath_reset_qp(struct ipath_qp *qp, enum ib_qp_type type)
        }
        qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
        qp->r_nak_state = 0;
-       qp->r_wrid_valid = 0;
+       qp->r_aflags = 0;
+       qp->r_flags = 0;
        qp->s_rnr_timeout = 0;
        qp->s_head = 0;
        qp->s_tail = 0;
@@ -361,7 +351,6 @@ static void ipath_reset_qp(struct ipath_qp *qp, enum ib_qp_type type)
        qp->s_last = 0;
        qp->s_ssn = 1;
        qp->s_lsn = 0;
-       qp->s_wait_credit = 0;
        memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue));
        qp->r_head_ack_queue = 0;
        qp->s_tail_ack_queue = 0;
@@ -370,7 +359,6 @@ static void ipath_reset_qp(struct ipath_qp *qp, enum ib_qp_type type)
                qp->r_rq.wq->head = 0;
                qp->r_rq.wq->tail = 0;
        }
-       qp->r_reuse_sge = 0;
 }
 
 /**
@@ -402,39 +390,21 @@ int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err)
                list_del_init(&qp->piowait);
        spin_unlock(&dev->pending_lock);
 
-       wc.vendor_err = 0;
-       wc.byte_len = 0;
-       wc.imm_data = 0;
+       /* Schedule the sending tasklet to drain the send work queue. */
+       if (qp->s_last != qp->s_head)
+               ipath_schedule_send(qp);
+
+       memset(&wc, 0, sizeof(wc));
        wc.qp = &qp->ibqp;
-       wc.src_qp = 0;
-       wc.wc_flags = 0;
-       wc.pkey_index = 0;
-       wc.slid = 0;
-       wc.sl = 0;
-       wc.dlid_path_bits = 0;
-       wc.port_num = 0;
-       if (qp->r_wrid_valid) {
-               qp->r_wrid_valid = 0;
+       wc.opcode = IB_WC_RECV;
+
+       if (test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags)) {
                wc.wr_id = qp->r_wr_id;
-               wc.opcode = IB_WC_RECV;
                wc.status = err;
                ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
        }
        wc.status = IB_WC_WR_FLUSH_ERR;
 
-       while (qp->s_last != qp->s_head) {
-               struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
-
-               wc.wr_id = wqe->wr.wr_id;
-               wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
-               if (++qp->s_last >= qp->s_size)
-                       qp->s_last = 0;
-               ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 1);
-       }
-       qp->s_cur = qp->s_tail = qp->s_head;
-       qp->s_hdrwords = 0;
-       qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
-
        if (qp->r_rq.wq) {
                struct ipath_rwq *wq;
                u32 head;
@@ -450,7 +420,6 @@ int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err)
                tail = wq->tail;
                if (tail >= qp->r_rq.size)
                        tail = 0;
-               wc.opcode = IB_WC_RECV;
                while (tail != head) {
                        wc.wr_id = get_rwqe_ptr(&qp->r_rq, tail)->wr_id;
                        if (++tail >= qp->r_rq.size)
@@ -482,11 +451,10 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
        struct ipath_ibdev *dev = to_idev(ibqp->device);
        struct ipath_qp *qp = to_iqp(ibqp);
        enum ib_qp_state cur_state, new_state;
-       unsigned long flags;
        int lastwqe = 0;
        int ret;
 
-       spin_lock_irqsave(&qp->s_lock, flags);
+       spin_lock_irq(&qp->s_lock);
 
        cur_state = attr_mask & IB_QP_CUR_STATE ?
                attr->cur_qp_state : qp->state;
@@ -539,16 +507,42 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 
        switch (new_state) {
        case IB_QPS_RESET:
+               if (qp->state != IB_QPS_RESET) {
+                       qp->state = IB_QPS_RESET;
+                       spin_lock(&dev->pending_lock);
+                       if (!list_empty(&qp->timerwait))
+                               list_del_init(&qp->timerwait);
+                       if (!list_empty(&qp->piowait))
+                               list_del_init(&qp->piowait);
+                       spin_unlock(&dev->pending_lock);
+                       qp->s_flags &= ~IPATH_S_ANY_WAIT;
+                       spin_unlock_irq(&qp->s_lock);
+                       /* Stop the sending tasklet */
+                       tasklet_kill(&qp->s_task);
+                       wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy));
+                       spin_lock_irq(&qp->s_lock);
+               }
                ipath_reset_qp(qp, ibqp->qp_type);
                break;
 
+       case IB_QPS_SQD:
+               qp->s_draining = qp->s_last != qp->s_cur;
+               qp->state = new_state;
+               break;
+
+       case IB_QPS_SQE:
+               if (qp->ibqp.qp_type == IB_QPT_RC)
+                       goto inval;
+               qp->state = new_state;
+               break;
+
        case IB_QPS_ERR:
                lastwqe = ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
                break;
 
        default:
+               qp->state = new_state;
                break;
-
        }
 
        if (attr_mask & IB_QP_PKEY_INDEX)
@@ -601,8 +595,7 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
        if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC)
                qp->s_max_rd_atomic = attr->max_rd_atomic;
 
-       qp->state = new_state;
-       spin_unlock_irqrestore(&qp->s_lock, flags);
+       spin_unlock_irq(&qp->s_lock);
 
        if (lastwqe) {
                struct ib_event ev;
@@ -616,7 +609,7 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
        goto bail;
 
 inval:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
+       spin_unlock_irq(&qp->s_lock);
        ret = -EINVAL;
 
 bail:
@@ -647,7 +640,7 @@ int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
        attr->pkey_index = qp->s_pkey_index;
        attr->alt_pkey_index = 0;
        attr->en_sqd_async_notify = 0;
-       attr->sq_draining = 0;
+       attr->sq_draining = qp->s_draining;
        attr->max_rd_atomic = qp->s_max_rd_atomic;
        attr->max_dest_rd_atomic = qp->r_max_rd_atomic;
        attr->min_rnr_timer = qp->r_min_rnr_timer;
@@ -837,6 +830,7 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
                spin_lock_init(&qp->r_rq.lock);
                atomic_set(&qp->refcount, 0);
                init_waitqueue_head(&qp->wait);
+               init_waitqueue_head(&qp->wait_dma);
                tasklet_init(&qp->s_task, ipath_do_send, (unsigned long)qp);
                INIT_LIST_HEAD(&qp->piowait);
                INIT_LIST_HEAD(&qp->timerwait);
@@ -930,6 +924,7 @@ bail_ip:
        else
                vfree(qp->r_rq.wq);
        ipath_free_qp(&dev->qp_table, qp);
+       free_qpn(&dev->qp_table, qp->ibqp.qp_num);
 bail_qp:
        kfree(qp);
 bail_swq:
@@ -951,41 +946,44 @@ int ipath_destroy_qp(struct ib_qp *ibqp)
 {
        struct ipath_qp *qp = to_iqp(ibqp);
        struct ipath_ibdev *dev = to_idev(ibqp->device);
-       unsigned long flags;
 
-       spin_lock_irqsave(&qp->s_lock, flags);
-       qp->state = IB_QPS_ERR;
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-       spin_lock(&dev->n_qps_lock);
-       dev->n_qps_allocated--;
-       spin_unlock(&dev->n_qps_lock);
+       /* Make sure HW and driver activity is stopped. */
+       spin_lock_irq(&qp->s_lock);
+       if (qp->state != IB_QPS_RESET) {
+               qp->state = IB_QPS_RESET;
+               spin_lock(&dev->pending_lock);
+               if (!list_empty(&qp->timerwait))
+                       list_del_init(&qp->timerwait);
+               if (!list_empty(&qp->piowait))
+                       list_del_init(&qp->piowait);
+               spin_unlock(&dev->pending_lock);
+               qp->s_flags &= ~IPATH_S_ANY_WAIT;
+               spin_unlock_irq(&qp->s_lock);
+               /* Stop the sending tasklet */
+               tasklet_kill(&qp->s_task);
+               wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy));
+       } else
+               spin_unlock_irq(&qp->s_lock);
 
-       /* Stop the sending tasklet. */
-       tasklet_kill(&qp->s_task);
+       ipath_free_qp(&dev->qp_table, qp);
 
        if (qp->s_tx) {
                atomic_dec(&qp->refcount);
                if (qp->s_tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF)
                        kfree(qp->s_tx->txreq.map_addr);
+               spin_lock_irq(&dev->pending_lock);
+               list_add(&qp->s_tx->txreq.list, &dev->txreq_free);
+               spin_unlock_irq(&dev->pending_lock);
+               qp->s_tx = NULL;
        }
 
-       /* Make sure the QP isn't on the timeout list. */
-       spin_lock_irqsave(&dev->pending_lock, flags);
-       if (!list_empty(&qp->timerwait))
-               list_del_init(&qp->timerwait);
-       if (!list_empty(&qp->piowait))
-               list_del_init(&qp->piowait);
-       if (qp->s_tx)
-               list_add(&qp->s_tx->txreq.list, &dev->txreq_free);
-       spin_unlock_irqrestore(&dev->pending_lock, flags);
+       wait_event(qp->wait, !atomic_read(&qp->refcount));
 
-       /*
-        * Make sure that the QP is not in the QPN table so receive
-        * interrupts will discard packets for this QP.  XXX Also remove QP
-        * from multicast table.
-        */
-       if (atomic_read(&qp->refcount) != 0)
-               ipath_free_qp(&dev->qp_table, qp);
+       /* all users cleaned up, mark it available */
+       free_qpn(&dev->qp_table, qp->ibqp.qp_num);
+       spin_lock(&dev->n_qps_lock);
+       dev->n_qps_allocated--;
+       spin_unlock(&dev->n_qps_lock);
 
        if (qp->ip)
                kref_put(&qp->ip->ref, ipath_release_mmap_info);
@@ -1055,9 +1053,10 @@ void ipath_get_credit(struct ipath_qp *qp, u32 aeth)
        }
 
        /* Restart sending if it was blocked due to lack of credits. */
-       if (qp->s_cur != qp->s_head &&
+       if ((qp->s_flags & IPATH_S_WAIT_SSN_CREDIT) &&
+           qp->s_cur != qp->s_head &&
            (qp->s_lsn == (u32) -1 ||
             ipath_cmp24(get_swqe_ptr(qp, qp->s_cur)->ssn,
                         qp->s_lsn + 1) <= 0))
-               tasklet_hi_schedule(&qp->s_task);
+               ipath_schedule_send(qp);
 }
index b4b26c3..5b5276a 100644 (file)
@@ -92,6 +92,10 @@ static int ipath_make_rc_ack(struct ipath_ibdev *dev, struct ipath_qp *qp,
        u32 bth0;
        u32 bth2;
 
+       /* Don't send an ACK if we aren't supposed to. */
+       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+               goto bail;
+
        /* header size in 32-bit words LRH+BTH = (8+12)/4. */
        hwords = 5;
 
@@ -238,14 +242,25 @@ int ipath_make_rc_req(struct ipath_qp *qp)
            ipath_make_rc_ack(dev, qp, ohdr, pmtu))
                goto done;
 
-       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) ||
-           qp->s_rnr_timeout || qp->s_wait_credit)
-               goto bail;
+       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) {
+               if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))
+                       goto bail;
+               /* We are in the error state, flush the work request. */
+               if (qp->s_last == qp->s_head)
+                       goto bail;
+               /* If DMAs are in progress, we can't flush immediately. */
+               if (atomic_read(&qp->s_dma_busy)) {
+                       qp->s_flags |= IPATH_S_WAIT_DMA;
+                       goto bail;
+               }
+               wqe = get_swqe_ptr(qp, qp->s_last);
+               ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+               goto done;
+       }
 
-       /* Limit the number of packets sent without an ACK. */
-       if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT) > 0) {
-               qp->s_wait_credit = 1;
-               dev->n_rc_stalls++;
+       /* Leave BUSY set until RNR timeout. */
+       if (qp->s_rnr_timeout) {
+               qp->s_flags |= IPATH_S_WAITING;
                goto bail;
        }
 
@@ -257,6 +272,9 @@ int ipath_make_rc_req(struct ipath_qp *qp)
        wqe = get_swqe_ptr(qp, qp->s_cur);
        switch (qp->s_state) {
        default:
+               if (!(ib_ipath_state_ops[qp->state] &
+                   IPATH_PROCESS_NEXT_SEND_OK))
+                       goto bail;
                /*
                 * Resend an old request or start a new one.
                 *
@@ -294,8 +312,10 @@ int ipath_make_rc_req(struct ipath_qp *qp)
                case IB_WR_SEND_WITH_IMM:
                        /* If no credit, return. */
                        if (qp->s_lsn != (u32) -1 &&
-                           ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0)
+                           ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
+                               qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT;
                                goto bail;
+                       }
                        wqe->lpsn = wqe->psn;
                        if (len > pmtu) {
                                wqe->lpsn += (len - 1) / pmtu;
@@ -325,8 +345,10 @@ int ipath_make_rc_req(struct ipath_qp *qp)
                case IB_WR_RDMA_WRITE_WITH_IMM:
                        /* If no credit, return. */
                        if (qp->s_lsn != (u32) -1 &&
-                           ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0)
+                           ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
+                               qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT;
                                goto bail;
+                       }
                        ohdr->u.rc.reth.vaddr =
                                cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
                        ohdr->u.rc.reth.rkey =
@@ -570,7 +592,11 @@ int ipath_make_rc_req(struct ipath_qp *qp)
        ipath_make_ruc_header(dev, qp, ohdr, bth0 | (qp->s_state << 24), bth2);
 done:
        ret = 1;
+       goto unlock;
+
 bail:
+       qp->s_flags &= ~IPATH_S_BUSY;
+unlock:
        spin_unlock_irqrestore(&qp->s_lock, flags);
        return ret;
 }
@@ -606,7 +632,11 @@ static void send_rc_ack(struct ipath_qp *qp)
 
        spin_unlock_irqrestore(&qp->s_lock, flags);
 
+       /* Don't try to send ACKs if the link isn't ACTIVE */
        dd = dev->dd;
+       if (!(dd->ipath_flags & IPATH_LINKACTIVE))
+               goto done;
+
        piobuf = ipath_getpiobuf(dd, 0, NULL);
        if (!piobuf) {
                /*
@@ -668,15 +698,16 @@ static void send_rc_ack(struct ipath_qp *qp)
        goto done;
 
 queue_ack:
-       dev->n_rc_qacks++;
-       qp->s_flags |= IPATH_S_ACK_PENDING;
-       qp->s_nak_state = qp->r_nak_state;
-       qp->s_ack_psn = qp->r_ack_psn;
+       if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK) {
+               dev->n_rc_qacks++;
+               qp->s_flags |= IPATH_S_ACK_PENDING;
+               qp->s_nak_state = qp->r_nak_state;
+               qp->s_ack_psn = qp->r_ack_psn;
+
+               /* Schedule the send tasklet. */
+               ipath_schedule_send(qp);
+       }
        spin_unlock_irqrestore(&qp->s_lock, flags);
-
-       /* Call ipath_do_rc_send() in another thread. */
-       tasklet_hi_schedule(&qp->s_task);
-
 done:
        return;
 }
@@ -735,7 +766,7 @@ static void reset_psn(struct ipath_qp *qp, u32 psn)
        /*
         * Set the state to restart in the middle of a request.
         * Don't change the s_sge, s_cur_sge, or s_cur_size.
-        * See ipath_do_rc_send().
+        * See ipath_make_rc_req().
         */
        switch (opcode) {
        case IB_WR_SEND:
@@ -801,7 +832,7 @@ void ipath_restart_rc(struct ipath_qp *qp, u32 psn)
                dev->n_rc_resends += (qp->s_psn - psn) & IPATH_PSN_MASK;
 
        reset_psn(qp, psn);
-       tasklet_hi_schedule(&qp->s_task);
+       ipath_schedule_send(qp);
 
 bail:
        return;
@@ -809,13 +840,7 @@ bail:
 
 static inline void update_last_psn(struct ipath_qp *qp, u32 psn)
 {
-       if (qp->s_last_psn != psn) {
-               qp->s_last_psn = psn;
-               if (qp->s_wait_credit) {
-                       qp->s_wait_credit = 0;
-                       tasklet_hi_schedule(&qp->s_task);
-               }
-       }
+       qp->s_last_psn = psn;
 }
 
 /**
@@ -915,14 +940,10 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,
                     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
                        qp->s_num_rd_atomic--;
                        /* Restart sending task if fence is complete */
-                       if ((qp->s_flags & IPATH_S_FENCE_PENDING) &&
-                           !qp->s_num_rd_atomic) {
-                               qp->s_flags &= ~IPATH_S_FENCE_PENDING;
-                               tasklet_hi_schedule(&qp->s_task);
-                       } else if (qp->s_flags & IPATH_S_RDMAR_PENDING) {
-                               qp->s_flags &= ~IPATH_S_RDMAR_PENDING;
-                               tasklet_hi_schedule(&qp->s_task);
-                       }
+                       if (((qp->s_flags & IPATH_S_FENCE_PENDING) &&
+                            !qp->s_num_rd_atomic) ||
+                           qp->s_flags & IPATH_S_RDMAR_PENDING)
+                               ipath_schedule_send(qp);
                }
                /* Post a send completion queue entry if requested. */
                if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
@@ -956,6 +977,8 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,
                } else {
                        if (++qp->s_last >= qp->s_size)
                                qp->s_last = 0;
+                       if (qp->state == IB_QPS_SQD && qp->s_last == qp->s_cur)
+                               qp->s_draining = 0;
                        if (qp->s_last == qp->s_tail)
                                break;
                        wqe = get_swqe_ptr(qp, qp->s_last);
@@ -979,7 +1002,7 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,
                         */
                        if (ipath_cmp24(qp->s_psn, psn) <= 0) {
                                reset_psn(qp, psn + 1);
-                               tasklet_hi_schedule(&qp->s_task);
+                               ipath_schedule_send(qp);
                        }
                } else if (ipath_cmp24(qp->s_psn, psn) <= 0) {
                        qp->s_state = OP(SEND_LAST);
@@ -1018,6 +1041,7 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,
                        ib_ipath_rnr_table[(aeth >> IPATH_AETH_CREDIT_SHIFT) &
                                           IPATH_AETH_CREDIT_MASK];
                ipath_insert_rnr_queue(qp);
+               ipath_schedule_send(qp);
                goto bail;
 
        case 3:         /* NAK */
@@ -1108,6 +1132,10 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
 
        spin_lock_irqsave(&qp->s_lock, flags);
 
+       /* Double check we can process this now that we hold the s_lock. */
+       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+               goto ack_done;
+
        /* Ignore invalid responses. */
        if (ipath_cmp24(psn, qp->s_next_psn) >= 0)
                goto ack_done;
@@ -1343,7 +1371,12 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
        psn &= IPATH_PSN_MASK;
        e = NULL;
        old_req = 1;
+
        spin_lock_irqsave(&qp->s_lock, flags);
+       /* Double check we can process this now that we hold the s_lock. */
+       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+               goto unlock_done;
+
        for (i = qp->r_head_ack_queue; ; i = prev) {
                if (i == qp->s_tail_ack_queue)
                        old_req = 0;
@@ -1471,7 +1504,7 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
                break;
        }
        qp->r_nak_state = 0;
-       tasklet_hi_schedule(&qp->s_task);
+       ipath_schedule_send(qp);
 
 unlock_done:
        spin_unlock_irqrestore(&qp->s_lock, flags);
@@ -1503,18 +1536,15 @@ void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err)
 
 static inline void ipath_update_ack_queue(struct ipath_qp *qp, unsigned n)
 {
-       unsigned long flags;
        unsigned next;
 
        next = n + 1;
        if (next > IPATH_MAX_RDMA_ATOMIC)
                next = 0;
-       spin_lock_irqsave(&qp->s_lock, flags);
        if (n == qp->s_tail_ack_queue) {
                qp->s_tail_ack_queue = next;
                qp->s_ack_state = OP(ACKNOWLEDGE);
        }
-       spin_unlock_irqrestore(&qp->s_lock, flags);
 }
 
 /**
@@ -1543,6 +1573,7 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
        int diff;
        struct ib_reth *reth;
        int header_in_data;
+       unsigned long flags;
 
        /* Validate the SLID. See Ch. 9.6.1.5 */
        if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid))
@@ -1690,9 +1721,8 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
                        goto nack_inv;
                ipath_copy_sge(&qp->r_sge, data, tlen);
                qp->r_msn++;
-               if (!qp->r_wrid_valid)
+               if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
                        break;
-               qp->r_wrid_valid = 0;
                wc.wr_id = qp->r_wr_id;
                wc.status = IB_WC_SUCCESS;
                if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
@@ -1764,9 +1794,13 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
                next = qp->r_head_ack_queue + 1;
                if (next > IPATH_MAX_RDMA_ATOMIC)
                        next = 0;
+               spin_lock_irqsave(&qp->s_lock, flags);
+               /* Double check we can process this while holding the s_lock. */
+               if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+                       goto unlock;
                if (unlikely(next == qp->s_tail_ack_queue)) {
                        if (!qp->s_ack_queue[next].sent)
-                               goto nack_inv;
+                               goto nack_inv_unlck;
                        ipath_update_ack_queue(qp, next);
                }
                e = &qp->s_ack_queue[qp->r_head_ack_queue];
@@ -1787,7 +1821,7 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
                        ok = ipath_rkey_ok(qp, &e->rdma_sge, len, vaddr,
                                           rkey, IB_ACCESS_REMOTE_READ);
                        if (unlikely(!ok))
-                               goto nack_acc;
+                               goto nack_acc_unlck;
                        /*
                         * Update the next expected PSN.  We add 1 later
                         * below, so only add the remainder here.
@@ -1814,13 +1848,12 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
                qp->r_psn++;
                qp->r_state = opcode;
                qp->r_nak_state = 0;
-               barrier();
                qp->r_head_ack_queue = next;
 
-               /* Call ipath_do_rc_send() in another thread. */
-               tasklet_hi_schedule(&qp->s_task);
+               /* Schedule the send tasklet. */
+               ipath_schedule_send(qp);
 
-               goto done;
+               goto unlock;
        }
 
        case OP(COMPARE_SWAP):
@@ -1839,9 +1872,13 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
                next = qp->r_head_ack_queue + 1;
                if (next > IPATH_MAX_RDMA_ATOMIC)
                        next = 0;
+               spin_lock_irqsave(&qp->s_lock, flags);
+               /* Double check we can process this while holding the s_lock. */
+               if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+                       goto unlock;
                if (unlikely(next == qp->s_tail_ack_queue)) {
                        if (!qp->s_ack_queue[next].sent)
-                               goto nack_inv;
+                               goto nack_inv_unlck;
                        ipath_update_ack_queue(qp, next);
                }
                if (!header_in_data)
@@ -1851,13 +1888,13 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
                vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) |
                        be32_to_cpu(ateth->vaddr[1]);
                if (unlikely(vaddr & (sizeof(u64) - 1)))
-                       goto nack_inv;
+                       goto nack_inv_unlck;
                rkey = be32_to_cpu(ateth->rkey);
                /* Check rkey & NAK */
                if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge,
                                            sizeof(u64), vaddr, rkey,
                                            IB_ACCESS_REMOTE_ATOMIC)))
-                       goto nack_acc;
+                       goto nack_acc_unlck;
                /* Perform atomic OP and save result. */
                maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
                sdata = be64_to_cpu(ateth->swap_data);
@@ -1874,13 +1911,12 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
                qp->r_psn++;
                qp->r_state = opcode;
                qp->r_nak_state = 0;
-               barrier();
                qp->r_head_ack_queue = next;
 
-               /* Call ipath_do_rc_send() in another thread. */
-               tasklet_hi_schedule(&qp->s_task);
+               /* Schedule the send tasklet. */
+               ipath_schedule_send(qp);
 
-               goto done;
+               goto unlock;
        }
 
        default:
@@ -1901,19 +1937,26 @@ rnr_nak:
        qp->r_ack_psn = qp->r_psn;
        goto send_ack;
 
+nack_inv_unlck:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
 nack_inv:
        ipath_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
        qp->r_nak_state = IB_NAK_INVALID_REQUEST;
        qp->r_ack_psn = qp->r_psn;
        goto send_ack;
 
+nack_acc_unlck:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
 nack_acc:
        ipath_rc_error(qp, IB_WC_LOC_PROT_ERR);
        qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
        qp->r_ack_psn = qp->r_psn;
 send_ack:
        send_rc_ack(qp);
+       goto done;
 
+unlock:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
 done:
        return;
 }
index c716a03..a4b5521 100644 (file)
@@ -78,6 +78,7 @@ const u32 ib_ipath_rnr_table[32] = {
  * ipath_insert_rnr_queue - put QP on the RNR timeout list for the device
  * @qp: the QP
  *
+ * Called with the QP s_lock held and interrupts disabled.
  * XXX Use a simple list for now.  We might need a priority
  * queue if we have lots of QPs waiting for RNR timeouts
  * but that should be rare.
@@ -85,9 +86,9 @@ const u32 ib_ipath_rnr_table[32] = {
 void ipath_insert_rnr_queue(struct ipath_qp *qp)
 {
        struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-       unsigned long flags;
 
-       spin_lock_irqsave(&dev->pending_lock, flags);
+       /* We already did a spin_lock_irqsave(), so just use spin_lock */
+       spin_lock(&dev->pending_lock);
        if (list_empty(&dev->rnrwait))
                list_add(&qp->timerwait, &dev->rnrwait);
        else {
@@ -109,7 +110,7 @@ void ipath_insert_rnr_queue(struct ipath_qp *qp)
                        nqp->s_rnr_timeout -= qp->s_rnr_timeout;
                list_add(&qp->timerwait, l);
        }
-       spin_unlock_irqrestore(&dev->pending_lock, flags);
+       spin_unlock(&dev->pending_lock);
 }
 
 /**
@@ -185,6 +186,11 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
        }
 
        spin_lock_irqsave(&rq->lock, flags);
+       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
+               ret = 0;
+               goto unlock;
+       }
+
        wq = rq->wq;
        tail = wq->tail;
        /* Validate tail before using it since it is user writable. */
@@ -192,9 +198,8 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
                tail = 0;
        do {
                if (unlikely(tail == wq->head)) {
-                       spin_unlock_irqrestore(&rq->lock, flags);
                        ret = 0;
-                       goto bail;
+                       goto unlock;
                }
                /* Make sure entry is read after head index is read. */
                smp_rmb();
@@ -207,7 +212,7 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
        wq->tail = tail;
 
        ret = 1;
-       qp->r_wrid_valid = 1;
+       set_bit(IPATH_R_WRID_VALID, &qp->r_aflags);
        if (handler) {
                u32 n;
 
@@ -234,8 +239,8 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
                        goto bail;
                }
        }
+unlock:
        spin_unlock_irqrestore(&rq->lock, flags);
-
 bail:
        return ret;
 }
@@ -263,35 +268,59 @@ static void ipath_ruc_loopback(struct ipath_qp *sqp)
        atomic64_t *maddr;
        enum ib_wc_status send_status;
 
+       /*
+        * Note that we check the responder QP state after
+        * checking the requester's state.
+        */
        qp = ipath_lookup_qpn(&dev->qp_table, sqp->remote_qpn);
-       if (!qp) {
-               dev->n_pkt_drops++;
-               return;
-       }
 
-again:
        spin_lock_irqsave(&sqp->s_lock, flags);
 
-       if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_SEND_OK) ||
-           sqp->s_rnr_timeout) {
-               spin_unlock_irqrestore(&sqp->s_lock, flags);
-               goto done;
-       }
+       /* Return if we are already busy processing a work request. */
+       if ((sqp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) ||
+           !(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_OR_FLUSH_SEND))
+               goto unlock;
 
-       /* Get the next send request. */
-       if (sqp->s_last == sqp->s_head) {
-               /* Send work queue is empty. */
-               spin_unlock_irqrestore(&sqp->s_lock, flags);
-               goto done;
+       sqp->s_flags |= IPATH_S_BUSY;
+
+again:
+       if (sqp->s_last == sqp->s_head)
+               goto clr_busy;
+       wqe = get_swqe_ptr(sqp, sqp->s_last);
+
+       /* Return if it is not OK to start a new work request. */
+       if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_NEXT_SEND_OK)) {
+               if (!(ib_ipath_state_ops[sqp->state] & IPATH_FLUSH_SEND))
+                       goto clr_busy;
+               /* We are in the error state, flush the work request. */
+               send_status = IB_WC_WR_FLUSH_ERR;
+               goto flush_send;
        }
 
        /*
         * We can rely on the entry not changing without the s_lock
         * being held until we update s_last.
+        * We increment s_cur to indicate s_last is in progress.
         */
-       wqe = get_swqe_ptr(sqp, sqp->s_last);
+       if (sqp->s_last == sqp->s_cur) {
+               if (++sqp->s_cur >= sqp->s_size)
+                       sqp->s_cur = 0;
+       }
        spin_unlock_irqrestore(&sqp->s_lock, flags);
 
+       if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
+               dev->n_pkt_drops++;
+               /*
+                * For RC, the requester would timeout and retry so
+                * shortcut the timeouts and just signal too many retries.
+                */
+               if (sqp->ibqp.qp_type == IB_QPT_RC)
+                       send_status = IB_WC_RETRY_EXC_ERR;
+               else
+                       send_status = IB_WC_SUCCESS;
+               goto serr;
+       }
+
        memset(&wc, 0, sizeof wc);
        send_status = IB_WC_SUCCESS;
 
@@ -396,8 +425,7 @@ again:
                sqp->s_len -= len;
        }
 
-       if (wqe->wr.opcode == IB_WR_RDMA_WRITE ||
-           wqe->wr.opcode == IB_WR_RDMA_READ)
+       if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
                goto send_comp;
 
        if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
@@ -417,6 +445,8 @@ again:
                       wqe->wr.send_flags & IB_SEND_SOLICITED);
 
 send_comp:
+       spin_lock_irqsave(&sqp->s_lock, flags);
+flush_send:
        sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
        ipath_send_complete(sqp, wqe, send_status);
        goto again;
@@ -437,11 +467,12 @@ rnr_nak:
                sqp->s_rnr_retry--;
        spin_lock_irqsave(&sqp->s_lock, flags);
        if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_RECV_OK))
-               goto unlock;
+               goto clr_busy;
+       sqp->s_flags |= IPATH_S_WAITING;
        dev->n_rnr_naks++;
        sqp->s_rnr_timeout = ib_ipath_rnr_table[qp->r_min_rnr_timer];
        ipath_insert_rnr_queue(sqp);
-       goto unlock;
+       goto clr_busy;
 
 inv_err:
        send_status = IB_WC_REM_INV_REQ_ERR;
@@ -473,17 +504,19 @@ serr:
                }
                goto done;
        }
+clr_busy:
+       sqp->s_flags &= ~IPATH_S_BUSY;
 unlock:
        spin_unlock_irqrestore(&sqp->s_lock, flags);
 done:
-       if (atomic_dec_and_test(&qp->refcount))
+       if (qp && atomic_dec_and_test(&qp->refcount))
                wake_up(&qp->wait);
 }
 
 static void want_buffer(struct ipath_devdata *dd, struct ipath_qp *qp)
 {
        if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA) ||
-               qp->ibqp.qp_type == IB_QPT_SMI) {
+           qp->ibqp.qp_type == IB_QPT_SMI) {
                unsigned long flags;
 
                spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
@@ -501,26 +534,36 @@ static void want_buffer(struct ipath_devdata *dd, struct ipath_qp *qp)
  * @dev: the device we ran out of buffers on
  *
  * Called when we run out of PIO buffers.
+ * If we are now in the error state, return zero to flush the
+ * send work request.
  */
-static void ipath_no_bufs_available(struct ipath_qp *qp,
+static int ipath_no_bufs_available(struct ipath_qp *qp,
                                    struct ipath_ibdev *dev)
 {
        unsigned long flags;
+       int ret = 1;
 
        /*
         * Note that as soon as want_buffer() is called and
         * possibly before it returns, ipath_ib_piobufavail()
-        * could be called.  If we are still in the tasklet function,
-        * tasklet_hi_schedule() will not call us until the next time
-        * tasklet_hi_schedule() is called.
-        * We leave the busy flag set so that another post send doesn't
-        * try to put the same QP on the piowait list again.
+        * could be called. Therefore, put QP on the piowait list before
+        * enabling the PIO avail interrupt.
         */
-       spin_lock_irqsave(&dev->pending_lock, flags);
-       list_add_tail(&qp->piowait, &dev->piowait);
-       spin_unlock_irqrestore(&dev->pending_lock, flags);
-       want_buffer(dev->dd, qp);
-       dev->n_piowait++;
+       spin_lock_irqsave(&qp->s_lock, flags);
+       if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) {
+               dev->n_piowait++;
+               qp->s_flags |= IPATH_S_WAITING;
+               qp->s_flags &= ~IPATH_S_BUSY;
+               spin_lock(&dev->pending_lock);
+               if (list_empty(&qp->piowait))
+                       list_add_tail(&qp->piowait, &dev->piowait);
+               spin_unlock(&dev->pending_lock);
+       } else
+               ret = 0;
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+       if (ret)
+               want_buffer(dev->dd, qp);
+       return ret;
 }
 
 /**
@@ -596,15 +639,13 @@ void ipath_do_send(unsigned long data)
        struct ipath_qp *qp = (struct ipath_qp *)data;
        struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
        int (*make_req)(struct ipath_qp *qp);
-
-       if (test_and_set_bit(IPATH_S_BUSY, &qp->s_busy))
-               goto bail;
+       unsigned long flags;
 
        if ((qp->ibqp.qp_type == IB_QPT_RC ||
             qp->ibqp.qp_type == IB_QPT_UC) &&
            qp->remote_ah_attr.dlid == dev->dd->ipath_lid) {
                ipath_ruc_loopback(qp);
-               goto clear;
+               goto bail;
        }
 
        if (qp->ibqp.qp_type == IB_QPT_RC)
@@ -614,6 +655,19 @@ void ipath_do_send(unsigned long data)
        else
               make_req = ipath_make_ud_req;
 
+       spin_lock_irqsave(&qp->s_lock, flags);
+
+       /* Return if we are already busy processing a work request. */
+       if ((qp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) ||
+           !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND)) {
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+               goto bail;
+       }
+
+       qp->s_flags |= IPATH_S_BUSY;
+
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+
 again:
        /* Check for a constructed packet to be sent. */
        if (qp->s_hdrwords != 0) {
@@ -623,8 +677,8 @@ again:
                 */
                if (ipath_verbs_send(qp, &qp->s_hdr, qp->s_hdrwords,
                                     qp->s_cur_sge, qp->s_cur_size)) {
-                       ipath_no_bufs_available(qp, dev);
-                       goto bail;
+                       if (ipath_no_bufs_available(qp, dev))
+                               goto bail;
                }
                dev->n_unicast_xmit++;
                /* Record that we sent the packet and s_hdr is empty. */
@@ -633,16 +687,20 @@ again:
 
        if (make_req(qp))
                goto again;
-clear:
-       clear_bit(IPATH_S_BUSY, &qp->s_busy);
+
 bail:;
 }
 
+/*
+ * This should be called with s_lock held.
+ */
 void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe,
                         enum ib_wc_status status)
 {
-       unsigned long flags;
-       u32 last;
+       u32 old_last, last;
+
+       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND))
+               return;
 
        /* See ch. 11.2.4.1 and 10.7.3.1 */
        if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
@@ -661,10 +719,14 @@ void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe,
                               status != IB_WC_SUCCESS);
        }
 
-       spin_lock_irqsave(&qp->s_lock, flags);
-       last = qp->s_last;
+       old_last = last = qp->s_last;
        if (++last >= qp->s_size)
                last = 0;
        qp->s_last = last;
-       spin_unlock_irqrestore(&qp->s_lock, flags);
+       if (qp->s_cur == old_last)
+               qp->s_cur = last;
+       if (qp->s_tail == old_last)
+               qp->s_tail = last;
+       if (qp->state == IB_QPS_SQD && last == qp->s_cur)
+               qp->s_draining = 0;
 }
index bfe8926..7fd18e8 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
  * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -47,14 +47,30 @@ int ipath_make_uc_req(struct ipath_qp *qp)
 {
        struct ipath_other_headers *ohdr;
        struct ipath_swqe *wqe;
+       unsigned long flags;
        u32 hwords;
        u32 bth0;
        u32 len;
        u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
        int ret = 0;
 
-       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK))
+       spin_lock_irqsave(&qp->s_lock, flags);
+
+       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) {
+               if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))
+                       goto bail;
+               /* We are in the error state, flush the work request. */
+               if (qp->s_last == qp->s_head)
+                       goto bail;
+               /* If DMAs are in progress, we can't flush immediately. */
+               if (atomic_read(&qp->s_dma_busy)) {
+                       qp->s_flags |= IPATH_S_WAIT_DMA;
+                       goto bail;
+               }
+               wqe = get_swqe_ptr(qp, qp->s_last);
+               ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
                goto done;
+       }
 
        ohdr = &qp->s_hdr.u.oth;
        if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
@@ -69,9 +85,12 @@ int ipath_make_uc_req(struct ipath_qp *qp)
        qp->s_wqe = NULL;
        switch (qp->s_state) {
        default:
+               if (!(ib_ipath_state_ops[qp->state] &
+                   IPATH_PROCESS_NEXT_SEND_OK))
+                       goto bail;
                /* Check if send work queue is empty. */
                if (qp->s_cur == qp->s_head)
-                       goto done;
+                       goto bail;
                /*
                 * Start a new request.
                 */
@@ -134,7 +153,7 @@ int ipath_make_uc_req(struct ipath_qp *qp)
                        break;
 
                default:
-                       goto done;
+                       goto bail;
                }
                break;
 
@@ -194,9 +213,14 @@ int ipath_make_uc_req(struct ipath_qp *qp)
        ipath_make_ruc_header(to_idev(qp->ibqp.device),
                              qp, ohdr, bth0 | (qp->s_state << 24),
                              qp->s_next_psn++ & IPATH_PSN_MASK);
+done:
        ret = 1;
+       goto unlock;
 
-done:
+bail:
+       qp->s_flags &= ~IPATH_S_BUSY;
+unlock:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
        return ret;
 }
 
@@ -258,8 +282,7 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
         */
        opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
 
-       wc.imm_data = 0;
-       wc.wc_flags = 0;
+       memset(&wc, 0, sizeof wc);
 
        /* Compare the PSN verses the expected PSN. */
        if (unlikely(ipath_cmp24(psn, qp->r_psn) != 0)) {
@@ -322,8 +345,8 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
        case OP(SEND_ONLY):
        case OP(SEND_ONLY_WITH_IMMEDIATE):
        send_first:
-               if (qp->r_reuse_sge) {
-                       qp->r_reuse_sge = 0;
+               if (qp->r_flags & IPATH_R_REUSE_SGE) {
+                       qp->r_flags &= ~IPATH_R_REUSE_SGE;
                        qp->r_sge = qp->s_rdma_read_sge;
                } else if (!ipath_get_rwqe(qp, 0)) {
                        dev->n_pkt_drops++;
@@ -340,13 +363,13 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
        case OP(SEND_MIDDLE):
                /* Check for invalid length PMTU or posted rwqe len. */
                if (unlikely(tlen != (hdrsize + pmtu + 4))) {
-                       qp->r_reuse_sge = 1;
+                       qp->r_flags |= IPATH_R_REUSE_SGE;
                        dev->n_pkt_drops++;
                        goto done;
                }
                qp->r_rcv_len += pmtu;
                if (unlikely(qp->r_rcv_len > qp->r_len)) {
-                       qp->r_reuse_sge = 1;
+                       qp->r_flags |= IPATH_R_REUSE_SGE;
                        dev->n_pkt_drops++;
                        goto done;
                }
@@ -372,7 +395,7 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
                /* Check for invalid length. */
                /* XXX LAST len should be >= 1 */
                if (unlikely(tlen < (hdrsize + pad + 4))) {
-                       qp->r_reuse_sge = 1;
+                       qp->r_flags |= IPATH_R_REUSE_SGE;
                        dev->n_pkt_drops++;
                        goto done;
                }
@@ -380,7 +403,7 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
                tlen -= (hdrsize + pad + 4);
                wc.byte_len = tlen + qp->r_rcv_len;
                if (unlikely(wc.byte_len > qp->r_len)) {
-                       qp->r_reuse_sge = 1;
+                       qp->r_flags |= IPATH_R_REUSE_SGE;
                        dev->n_pkt_drops++;
                        goto done;
                }
@@ -390,14 +413,10 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
                wc.wr_id = qp->r_wr_id;
                wc.status = IB_WC_SUCCESS;
                wc.opcode = IB_WC_RECV;
-               wc.vendor_err = 0;
                wc.qp = &qp->ibqp;
                wc.src_qp = qp->remote_qpn;
-               wc.pkey_index = 0;
                wc.slid = qp->remote_ah_attr.dlid;
                wc.sl = qp->remote_ah_attr.sl;
-               wc.dlid_path_bits = 0;
-               wc.port_num = 0;
                /* Signal completion event if the solicited bit is set. */
                ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
                               (ohdr->bth[0] &
@@ -488,8 +507,8 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
                        dev->n_pkt_drops++;
                        goto done;
                }
-               if (qp->r_reuse_sge)
-                       qp->r_reuse_sge = 0;
+               if (qp->r_flags & IPATH_R_REUSE_SGE)
+                       qp->r_flags &= ~IPATH_R_REUSE_SGE;
                else if (!ipath_get_rwqe(qp, 1)) {
                        dev->n_pkt_drops++;
                        goto done;
index 8b6a261..77ca8ca 100644 (file)
@@ -65,9 +65,9 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe)
        u32 length;
 
        qp = ipath_lookup_qpn(&dev->qp_table, swqe->wr.wr.ud.remote_qpn);
-       if (!qp) {
+       if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
                dev->n_pkt_drops++;
-               goto send_comp;
+               goto done;
        }
 
        rsge.sg_list = NULL;
@@ -91,14 +91,12 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe)
         * present on the wire.
         */
        length = swqe->length;
+       memset(&wc, 0, sizeof wc);
        wc.byte_len = length + sizeof(struct ib_grh);
 
        if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
                wc.wc_flags = IB_WC_WITH_IMM;
                wc.imm_data = swqe->wr.ex.imm_data;
-       } else {
-               wc.wc_flags = 0;
-               wc.imm_data = 0;
        }
 
        /*
@@ -229,7 +227,6 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe)
        }
        wc.status = IB_WC_SUCCESS;
        wc.opcode = IB_WC_RECV;
-       wc.vendor_err = 0;
        wc.qp = &qp->ibqp;
        wc.src_qp = sqp->ibqp.qp_num;
        /* XXX do we know which pkey matched? Only needed for GSI. */
@@ -248,8 +245,7 @@ drop:
        kfree(rsge.sg_list);
        if (atomic_dec_and_test(&qp->refcount))
                wake_up(&qp->wait);
-send_comp:
-       ipath_send_complete(sqp, swqe, IB_WC_SUCCESS);
+done:;
 }
 
 /**
@@ -264,6 +260,7 @@ int ipath_make_ud_req(struct ipath_qp *qp)
        struct ipath_other_headers *ohdr;
        struct ib_ah_attr *ah_attr;
        struct ipath_swqe *wqe;
+       unsigned long flags;
        u32 nwords;
        u32 extra_bytes;
        u32 bth0;
@@ -271,13 +268,30 @@ int ipath_make_ud_req(struct ipath_qp *qp)
        u16 lid;
        int ret = 0;
 
-       if (unlikely(!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)))
-               goto bail;
+       spin_lock_irqsave(&qp->s_lock, flags);
+
+       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_NEXT_SEND_OK)) {
+               if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))
+                       goto bail;
+               /* We are in the error state, flush the work request. */
+               if (qp->s_last == qp->s_head)
+                       goto bail;
+               /* If DMAs are in progress, we can't flush immediately. */
+               if (atomic_read(&qp->s_dma_busy)) {
+                       qp->s_flags |= IPATH_S_WAIT_DMA;
+                       goto bail;
+               }
+               wqe = get_swqe_ptr(qp, qp->s_last);
+               ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+               goto done;
+       }
 
        if (qp->s_cur == qp->s_head)
                goto bail;
 
        wqe = get_swqe_ptr(qp, qp->s_cur);
+       if (++qp->s_cur >= qp->s_size)
+               qp->s_cur = 0;
 
        /* Construct the header. */
        ah_attr = &to_iah(wqe->wr.wr.ud.ah)->attr;
@@ -288,10 +302,23 @@ int ipath_make_ud_req(struct ipath_qp *qp)
                        dev->n_unicast_xmit++;
        } else {
                dev->n_unicast_xmit++;
-               lid = ah_attr->dlid &
-                       ~((1 << dev->dd->ipath_lmc) - 1);
+               lid = ah_attr->dlid & ~((1 << dev->dd->ipath_lmc) - 1);
                if (unlikely(lid == dev->dd->ipath_lid)) {
+                       /*
+                        * If DMAs are in progress, we can't generate
+                        * a completion for the loopback packet since
+                        * it would be out of order.
+                        * XXX Instead of waiting, we could queue a
+                        * zero length descriptor so we get a callback.
+                        */
+                       if (atomic_read(&qp->s_dma_busy)) {
+                               qp->s_flags |= IPATH_S_WAIT_DMA;
+                               goto bail;
+                       }
+                       spin_unlock_irqrestore(&qp->s_lock, flags);
                        ipath_ud_loopback(qp, wqe);
+                       spin_lock_irqsave(&qp->s_lock, flags);
+                       ipath_send_complete(qp, wqe, IB_WC_SUCCESS);
                        goto done;
                }
        }
@@ -368,11 +395,13 @@ int ipath_make_ud_req(struct ipath_qp *qp)
        ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
 
 done:
-       if (++qp->s_cur >= qp->s_size)
-               qp->s_cur = 0;
        ret = 1;
+       goto unlock;
 
 bail:
+       qp->s_flags &= ~IPATH_S_BUSY;
+unlock:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
        return ret;
 }
 
@@ -506,8 +535,8 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
        /*
         * Get the next work request entry to find where to put the data.
         */
-       if (qp->r_reuse_sge)
-               qp->r_reuse_sge = 0;
+       if (qp->r_flags & IPATH_R_REUSE_SGE)
+               qp->r_flags &= ~IPATH_R_REUSE_SGE;
        else if (!ipath_get_rwqe(qp, 0)) {
                /*
                 * Count VL15 packets dropped due to no receive buffer.
@@ -523,7 +552,7 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
        }
        /* Silently drop packets which are too big. */
        if (wc.byte_len > qp->r_len) {
-               qp->r_reuse_sge = 1;
+               qp->r_flags |= IPATH_R_REUSE_SGE;
                dev->n_pkt_drops++;
                goto bail;
        }
@@ -535,7 +564,8 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
                ipath_skip_sge(&qp->r_sge, sizeof(struct ib_grh));
        ipath_copy_sge(&qp->r_sge, data,
                       wc.byte_len - sizeof(struct ib_grh));
-       qp->r_wrid_valid = 0;
+       if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
+               goto bail;
        wc.wr_id = qp->r_wr_id;
        wc.status = IB_WC_SUCCESS;
        wc.opcode = IB_WC_RECV;
index e70946c..fc76316 100644 (file)
@@ -45,8 +45,6 @@ int ipath_user_sdma_writev(struct ipath_devdata *dd,
 int ipath_user_sdma_make_progress(struct ipath_devdata *dd,
                                  struct ipath_user_sdma_queue *pq);
 
-int ipath_user_sdma_pkt_sent(const struct ipath_user_sdma_queue *pq,
-                            u32 counter);
 void ipath_user_sdma_queue_drain(struct ipath_devdata *dd,
                                 struct ipath_user_sdma_queue *pq);
 
index 22bb42d..e0ec540 100644 (file)
@@ -111,16 +111,24 @@ static unsigned int ib_ipath_disable_sma;
 module_param_named(disable_sma, ib_ipath_disable_sma, uint, S_IWUSR | S_IRUGO);
 MODULE_PARM_DESC(disable_sma, "Disable the SMA");
 
+/*
+ * Note that it is OK to post send work requests in the SQE and ERR
+ * states; ipath_do_send() will process them and generate error
+ * completions as per IB 1.2 C10-96.
+ */
 const int ib_ipath_state_ops[IB_QPS_ERR + 1] = {
        [IB_QPS_RESET] = 0,
        [IB_QPS_INIT] = IPATH_POST_RECV_OK,
        [IB_QPS_RTR] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK,
        [IB_QPS_RTS] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
-           IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK,
+           IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK |
+           IPATH_PROCESS_NEXT_SEND_OK,
        [IB_QPS_SQD] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
-           IPATH_POST_SEND_OK,
-       [IB_QPS_SQE] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK,
-       [IB_QPS_ERR] = 0,
+           IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK,
+       [IB_QPS_SQE] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
+           IPATH_POST_SEND_OK | IPATH_FLUSH_SEND,
+       [IB_QPS_ERR] = IPATH_POST_RECV_OK | IPATH_FLUSH_RECV |
+           IPATH_POST_SEND_OK | IPATH_FLUSH_SEND,
 };
 
 struct ipath_ucontext {
@@ -230,18 +238,6 @@ void ipath_skip_sge(struct ipath_sge_state *ss, u32 length)
        }
 }
 
-static void ipath_flush_wqe(struct ipath_qp *qp, struct ib_send_wr *wr)
-{
-       struct ib_wc wc;
-
-       memset(&wc, 0, sizeof(wc));
-       wc.wr_id = wr->wr_id;
-       wc.status = IB_WC_WR_FLUSH_ERR;
-       wc.opcode = ib_ipath_wc_opcode[wr->opcode];
-       wc.qp = &qp->ibqp;
-       ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 1);
-}
-
 /*
  * Count the number of DMA descriptors needed to send length bytes of data.
  * Don't modify the ipath_sge_state to get the count.
@@ -347,14 +343,8 @@ static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr)
        spin_lock_irqsave(&qp->s_lock, flags);
 
        /* Check that state is OK to post send. */
-       if (unlikely(!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK))) {
-               if (qp->state != IB_QPS_SQE && qp->state != IB_QPS_ERR)
-                       goto bail_inval;
-               /* C10-96 says generate a flushed completion entry. */
-               ipath_flush_wqe(qp, wr);
-               ret = 0;
-               goto bail;
-       }
+       if (unlikely(!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK)))
+               goto bail_inval;
 
        /* IB spec says that num_sge == 0 is OK. */
        if (wr->num_sge > qp->s_max_sge)
@@ -677,6 +667,7 @@ bail:;
 static void ipath_ib_timer(struct ipath_ibdev *dev)
 {
        struct ipath_qp *resend = NULL;
+       struct ipath_qp *rnr = NULL;
        struct list_head *last;
        struct ipath_qp *qp;
        unsigned long flags;
@@ -703,7 +694,9 @@ static void ipath_ib_timer(struct ipath_ibdev *dev)
                if (--qp->s_rnr_timeout == 0) {
                        do {
                                list_del_init(&qp->timerwait);
-                               tasklet_hi_schedule(&qp->s_task);
+                               qp->timer_next = rnr;
+                               rnr = qp;
+                               atomic_inc(&qp->refcount);
                                if (list_empty(last))
                                        break;
                                qp = list_entry(last->next, struct ipath_qp,
@@ -743,9 +736,13 @@ static void ipath_ib_timer(struct ipath_ibdev *dev)
        spin_unlock_irqrestore(&dev->pending_lock, flags);
 
        /* XXX What if timer fires again while this is running? */
-       for (qp = resend; qp != NULL; qp = qp->timer_next) {
+       while (resend != NULL) {
+               qp = resend;
+               resend = qp->timer_next;
+
                spin_lock_irqsave(&qp->s_lock, flags);
-               if (qp->s_last != qp->s_tail && qp->state == IB_QPS_RTS) {
+               if (qp->s_last != qp->s_tail &&
+                   ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) {
                        dev->n_timeouts++;
                        ipath_restart_rc(qp, qp->s_last_psn + 1);
                }
@@ -755,6 +752,19 @@ static void ipath_ib_timer(struct ipath_ibdev *dev)
                if (atomic_dec_and_test(&qp->refcount))
                        wake_up(&qp->wait);
        }
+       while (rnr != NULL) {
+               qp = rnr;
+               rnr = qp->timer_next;
+
+               spin_lock_irqsave(&qp->s_lock, flags);
+               if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)
+                       ipath_schedule_send(qp);
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+
+               /* Notify ipath_destroy_qp() if it is waiting. */
+               if (atomic_dec_and_test(&qp->refcount))
+                       wake_up(&qp->wait);
+       }
 }
 
 static void update_sge(struct ipath_sge_state *ss, u32 length)
@@ -1010,13 +1020,24 @@ static void sdma_complete(void *cookie, int status)
        struct ipath_verbs_txreq *tx = cookie;
        struct ipath_qp *qp = tx->qp;
        struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+       unsigned int flags;
+       enum ib_wc_status ibs = status == IPATH_SDMA_TXREQ_S_OK ?
+               IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR;
 
-       /* Generate a completion queue entry if needed */
-       if (qp->ibqp.qp_type != IB_QPT_RC && tx->wqe) {
-               enum ib_wc_status ibs = status == IPATH_SDMA_TXREQ_S_OK ?
-                       IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR;
-
+       if (atomic_dec_and_test(&qp->s_dma_busy)) {
+               spin_lock_irqsave(&qp->s_lock, flags);
+               if (tx->wqe)
+                       ipath_send_complete(qp, tx->wqe, ibs);
+               if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND &&
+                    qp->s_last != qp->s_head) ||
+                   (qp->s_flags & IPATH_S_WAIT_DMA))
+                       ipath_schedule_send(qp);
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+               wake_up(&qp->wait_dma);
+       } else if (tx->wqe) {
+               spin_lock_irqsave(&qp->s_lock, flags);
                ipath_send_complete(qp, tx->wqe, ibs);
+               spin_unlock_irqrestore(&qp->s_lock, flags);
        }
 
        if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF)
@@ -1027,6 +1048,21 @@ static void sdma_complete(void *cookie, int status)
                wake_up(&qp->wait);
 }
 
+static void decrement_dma_busy(struct ipath_qp *qp)
+{
+       unsigned long flags;    /* spin_lock_irqsave() needs unsigned long */
+
+       if (atomic_dec_and_test(&qp->s_dma_busy)) {
+               spin_lock_irqsave(&qp->s_lock, flags);
+               if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND &&
+                    qp->s_last != qp->s_head) ||
+                   (qp->s_flags & IPATH_S_WAIT_DMA))
+                       ipath_schedule_send(qp);
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+               wake_up(&qp->wait_dma);
+       }
+}
+
 /*
  * Compute the number of clock cycles of delay before sending the next packet.
  * The multipliers reflect the number of clocks for the fastest rate so
@@ -1065,9 +1101,12 @@ static int ipath_verbs_send_dma(struct ipath_qp *qp,
        if (tx) {
                qp->s_tx = NULL;
                /* resend previously constructed packet */
+               atomic_inc(&qp->s_dma_busy);
                ret = ipath_sdma_verbs_send(dd, tx->ss, tx->len, tx);
-               if (ret)
+               if (ret) {
                        qp->s_tx = tx;
+                       decrement_dma_busy(qp);
+               }
                goto bail;
        }
 
@@ -1118,12 +1157,14 @@ static int ipath_verbs_send_dma(struct ipath_qp *qp,
                tx->txreq.sg_count = ndesc;
                tx->map_len = (hdrwords + 2) << 2;
                tx->txreq.map_addr = &tx->hdr;
+               atomic_inc(&qp->s_dma_busy);
                ret = ipath_sdma_verbs_send(dd, ss, dwords, tx);
                if (ret) {
                        /* save ss and length in dwords */
                        tx->ss = ss;
                        tx->len = dwords;
                        qp->s_tx = tx;
+                       decrement_dma_busy(qp);
                }
                goto bail;
        }
@@ -1144,6 +1185,7 @@ static int ipath_verbs_send_dma(struct ipath_qp *qp,
        memcpy(piobuf, hdr, hdrwords << 2);
        ipath_copy_from_sge(piobuf + hdrwords, ss, len);
 
+       atomic_inc(&qp->s_dma_busy);
        ret = ipath_sdma_verbs_send(dd, NULL, 0, tx);
        /*
         * If we couldn't queue the DMA request, save the info
@@ -1154,6 +1196,7 @@ static int ipath_verbs_send_dma(struct ipath_qp *qp,
                tx->ss = NULL;
                tx->len = 0;
                qp->s_tx = tx;
+               decrement_dma_busy(qp);
        }
        dev->n_unaligned++;
        goto bail;
@@ -1177,6 +1220,7 @@ static int ipath_verbs_send_pio(struct ipath_qp *qp,
        unsigned flush_wc;
        u32 control;
        int ret;
+       unsigned int flags;
 
        piobuf = ipath_getpiobuf(dd, plen, NULL);
        if (unlikely(piobuf == NULL)) {
@@ -1247,8 +1291,11 @@ static int ipath_verbs_send_pio(struct ipath_qp *qp,
        }
        copy_io(piobuf, ss, len, flush_wc);
 done:
-       if (qp->s_wqe)
+       if (qp->s_wqe) {
+               spin_lock_irqsave(&qp->s_lock, flags);
                ipath_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS);
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+       }
        ret = 0;
 bail:
        return ret;
@@ -1281,19 +1328,12 @@ int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr,
         * can defer SDMA restart until link goes ACTIVE without
         * worrying about just how we got there.
         */
-       if (qp->ibqp.qp_type == IB_QPT_SMI)
+       if (qp->ibqp.qp_type == IB_QPT_SMI ||
+           !(dd->ipath_flags & IPATH_HAS_SEND_DMA))
                ret = ipath_verbs_send_pio(qp, hdr, hdrwords, ss, len,
                                           plen, dwords);
-       /* All non-VL15 packets are dropped if link is not ACTIVE */
-       else if (!(dd->ipath_flags & IPATH_LINKACTIVE)) {
-               if (qp->s_wqe)
-                       ipath_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS);
-               ret = 0;
-       } else if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
-               ret = ipath_verbs_send_dma(qp, hdr, hdrwords, ss, len,
-                                          plen, dwords);
        else
-               ret = ipath_verbs_send_pio(qp, hdr, hdrwords, ss, len,
+               ret = ipath_verbs_send_dma(qp, hdr, hdrwords, ss, len,
                                           plen, dwords);
 
        return ret;
@@ -1401,27 +1441,46 @@ bail:
  * This is called from ipath_intr() at interrupt level when a PIO buffer is
  * available after ipath_verbs_send() returned an error that no buffers were
  * available.  Return 1 if we consumed all the PIO buffers and we still have
- * QPs waiting for buffers (for now, just do a tasklet_hi_schedule and
+ * QPs waiting for buffers (for now, just restart the send tasklet and
  * return zero).
  */
 int ipath_ib_piobufavail(struct ipath_ibdev *dev)
 {
+       struct list_head *list;
+       struct ipath_qp *qplist;
        struct ipath_qp *qp;
        unsigned long flags;
 
        if (dev == NULL)
                goto bail;
 
+       list = &dev->piowait;
+       qplist = NULL;
+
        spin_lock_irqsave(&dev->pending_lock, flags);
-       while (!list_empty(&dev->piowait)) {
-               qp = list_entry(dev->piowait.next, struct ipath_qp,
-                               piowait);
+       while (!list_empty(list)) {
+               qp = list_entry(list->next, struct ipath_qp, piowait);
                list_del_init(&qp->piowait);
-               clear_bit(IPATH_S_BUSY, &qp->s_busy);
-               tasklet_hi_schedule(&qp->s_task);
+               qp->pio_next = qplist;  /* push onto private list */
+               qplist = qp;
+               atomic_inc(&qp->refcount);      /* dropped in loop below */
        }
        spin_unlock_irqrestore(&dev->pending_lock, flags);
 
+       while (qplist != NULL) {        /* pending_lock released above */
+               qp = qplist;
+               qplist = qp->pio_next;
+
+               spin_lock_irqsave(&qp->s_lock, flags);
+               if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)
+                       ipath_schedule_send(qp);
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+
+               /* Notify ipath_destroy_qp() if it is waiting. */
+               if (atomic_dec_and_test(&qp->refcount))
+                       wake_up(&qp->wait);
+       }
+
 bail:
        return 0;
 }
@@ -2143,11 +2202,12 @@ bail:
 void ipath_unregister_ib_device(struct ipath_ibdev *dev)
 {
        struct ib_device *ibdev = &dev->ibdev;
-
-       disable_timer(dev->dd);
+       u32 qps_inuse;
 
        ib_unregister_device(ibdev);
 
+       disable_timer(dev->dd);
+
        if (!list_empty(&dev->pending[0]) ||
            !list_empty(&dev->pending[1]) ||
            !list_empty(&dev->pending[2]))
@@ -2162,7 +2222,10 @@ void ipath_unregister_ib_device(struct ipath_ibdev *dev)
         * Note that ipath_unregister_ib_device() can be called before all
         * the QPs are destroyed!
         */
-       ipath_free_all_qps(&dev->qp_table);
+       qps_inuse = ipath_free_all_qps(&dev->qp_table);
+       if (qps_inuse)
+               ipath_dev_err(dev->dd, "QP memory leak! %u still in use\n",
+                       qps_inuse);
        kfree(dev->qp_table.table);
        kfree(dev->lk_table.table);
        kfree(dev->txreq_bufs);
@@ -2213,17 +2276,14 @@ static ssize_t show_stats(struct device *device, struct device_attribute *attr,
                      "RC OTH NAKs %d\n"
                      "RC timeouts %d\n"
                      "RC RDMA dup %d\n"
-                     "RC stalls   %d\n"
                      "piobuf wait %d\n"
-                     "no piobuf   %d\n"
                      "unaligned   %d\n"
                      "PKT drops   %d\n"
                      "WQE errs    %d\n",
                      dev->n_rc_resends, dev->n_rc_qacks, dev->n_rc_acks,
                      dev->n_seq_naks, dev->n_rdma_seq, dev->n_rnr_naks,
                      dev->n_other_naks, dev->n_timeouts,
-                     dev->n_rdma_dup_busy, dev->n_rc_stalls, dev->n_piowait,
-                     dev->n_no_piobuf, dev->n_unaligned,
+                     dev->n_rdma_dup_busy, dev->n_piowait, dev->n_unaligned,
                      dev->n_pkt_drops, dev->n_wqe_errs);
        for (i = 0; i < ARRAY_SIZE(dev->opstats); i++) {
                const struct ipath_opcode_stats *si = &dev->opstats[i];
index 4c7c2aa..deee02c 100644 (file)
 #define IPATH_POST_RECV_OK             0x02
 #define IPATH_PROCESS_RECV_OK          0x04
 #define IPATH_PROCESS_SEND_OK          0x08
+#define IPATH_PROCESS_NEXT_SEND_OK     0x10
+#define IPATH_FLUSH_SEND               0x20
+#define IPATH_FLUSH_RECV               0x40
+#define IPATH_PROCESS_OR_FLUSH_SEND \
+       (IPATH_PROCESS_SEND_OK | IPATH_FLUSH_SEND)
 
 /* IB Performance Manager status values */
 #define IB_PMA_SAMPLE_STATUS_DONE      0x00
@@ -353,12 +358,14 @@ struct ipath_qp {
        struct ib_qp ibqp;
        struct ipath_qp *next;          /* link list for QPN hash table */
        struct ipath_qp *timer_next;    /* link list for ipath_ib_timer() */
+       struct ipath_qp *pio_next;      /* link for ipath_ib_piobufavail() */
        struct list_head piowait;       /* link for wait PIO buf */
        struct list_head timerwait;     /* link for waiting for timeouts */
        struct ib_ah_attr remote_ah_attr;
        struct ipath_ib_header s_hdr;   /* next packet header to send */
        atomic_t refcount;
        wait_queue_head_t wait;
+       wait_queue_head_t wait_dma;
        struct tasklet_struct s_task;
        struct ipath_mmap_info *ip;
        struct ipath_sge_state *s_cur_sge;
@@ -369,7 +376,7 @@ struct ipath_qp {
        struct ipath_sge_state s_rdma_read_sge;
        struct ipath_sge_state r_sge;   /* current receive data */
        spinlock_t s_lock;
-       unsigned long s_busy;
+       atomic_t s_dma_busy;
        u16 s_pkt_delay;
        u16 s_hdrwords;         /* size of s_hdr in 32 bit words */
        u32 s_cur_size;         /* size of send packet in bytes */
@@ -383,6 +390,7 @@ struct ipath_qp {
        u32 s_rnr_timeout;      /* number of milliseconds for RNR timeout */
        u32 r_ack_psn;          /* PSN for next ACK or atomic ACK */
        u64 r_wr_id;            /* ID for current receive WQE */
+       unsigned long r_aflags;
        u32 r_len;              /* total length of r_sge */
        u32 r_rcv_len;          /* receive data len processed */
        u32 r_psn;              /* expected rcv packet sequence number */
@@ -394,8 +402,7 @@ struct ipath_qp {
        u8 r_state;             /* opcode of last packet received */
        u8 r_nak_state;         /* non-zero if NAK is pending */
        u8 r_min_rnr_timer;     /* retry timeout value for RNR NAKs */
-       u8 r_reuse_sge;         /* for UC receive errors */
-       u8 r_wrid_valid;        /* r_wrid set but CQ entry not yet made */
+       u8 r_flags;
        u8 r_max_rd_atomic;     /* max number of RDMA read/atomic to receive */
        u8 r_head_ack_queue;    /* index into s_ack_queue[] */
        u8 qp_access_flags;
@@ -404,13 +411,13 @@ struct ipath_qp {
        u8 s_rnr_retry_cnt;
        u8 s_retry;             /* requester retry counter */
        u8 s_rnr_retry;         /* requester RNR retry counter */
-       u8 s_wait_credit;       /* limit number of unacked packets sent */
        u8 s_pkey_index;        /* PKEY index to use */
        u8 s_max_rd_atomic;     /* max number of RDMA read/atomic to send */
        u8 s_num_rd_atomic;     /* number of RDMA read/atomic pending */
        u8 s_tail_ack_queue;    /* index into s_ack_queue[] */
        u8 s_flags;
        u8 s_dmult;
+       u8 s_draining;
        u8 timeout;             /* Timeout for this QP */
        enum ib_mtu path_mtu;
        u32 remote_qpn;
@@ -428,16 +435,39 @@ struct ipath_qp {
        struct ipath_sge r_sg_list[0];  /* verified SGEs */
 };
 
-/* Bit definition for s_busy. */
-#define IPATH_S_BUSY           0
+/*
+ * Atomic bit definitions for r_aflags.
+ */
+#define IPATH_R_WRID_VALID     0
+
+/*
+ * Bit definitions for r_flags.
+ */
+#define IPATH_R_REUSE_SGE      0x01
 
 /*
  * Bit definitions for s_flags.
+ *
+ * IPATH_S_FENCE_PENDING - waiting for all prior RDMA read or atomic SWQEs
+ *                        before processing the next SWQE
+ * IPATH_S_RDMAR_PENDING - waiting for any RDMA read or atomic SWQEs
+ *                        before processing the next SWQE
+ * IPATH_S_WAITING - waiting for RNR timeout or send buffer available.
+ * IPATH_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE
+ * IPATH_S_WAIT_DMA - waiting for send DMA queue to drain before generating
+ *                   next send completion entry not via send DMA.
  */
 #define IPATH_S_SIGNAL_REQ_WR  0x01
 #define IPATH_S_FENCE_PENDING  0x02
 #define IPATH_S_RDMAR_PENDING  0x04
 #define IPATH_S_ACK_PENDING    0x08
+#define IPATH_S_BUSY           0x10
+#define IPATH_S_WAITING                0x20
+#define IPATH_S_WAIT_SSN_CREDIT        0x40
+#define IPATH_S_WAIT_DMA       0x80
+
+#define IPATH_S_ANY_WAIT (IPATH_S_FENCE_PENDING | IPATH_S_RDMAR_PENDING | \
+       IPATH_S_WAITING | IPATH_S_WAIT_SSN_CREDIT | IPATH_S_WAIT_DMA)
 
 #define IPATH_PSN_CREDIT       512
 
@@ -573,13 +603,11 @@ struct ipath_ibdev {
        u32 n_rnr_naks;
        u32 n_other_naks;
        u32 n_timeouts;
-       u32 n_rc_stalls;
        u32 n_pkt_drops;
        u32 n_vl15_dropped;
        u32 n_wqe_errs;
        u32 n_rdma_dup_busy;
        u32 n_piowait;
-       u32 n_no_piobuf;
        u32 n_unaligned;
        u32 port_cap_flags;
        u32 pma_sample_start;
@@ -657,6 +685,17 @@ static inline struct ipath_ibdev *to_idev(struct ib_device *ibdev)
        return container_of(ibdev, struct ipath_ibdev, ibdev);
 }
 
+/*
+ * Clear wait flags and schedule s_task unless busy; s_lock must be held.
+ */
+static inline void ipath_schedule_send(struct ipath_qp *qp)
+{
+       if (qp->s_flags & IPATH_S_ANY_WAIT)
+               qp->s_flags &= ~IPATH_S_ANY_WAIT;
+       if (!(qp->s_flags & IPATH_S_BUSY))
+               tasklet_hi_schedule(&qp->s_task);
+}
+
 int ipath_process_mad(struct ib_device *ibdev,
                      int mad_flags,
                      u8 port_num,
@@ -706,7 +745,7 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
                   int attr_mask, struct ib_qp_init_attr *init_attr);
 
-void ipath_free_all_qps(struct ipath_qp_table *qpt);
+unsigned ipath_free_all_qps(struct ipath_qp_table *qpt);
 
 int ipath_init_qp_table(struct ipath_ibdev *idev, int size);