Path: blob/master/drivers/infiniband/hw/qib/qib_rc.c
/*
 * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
 * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/io.h>

#include "qib.h"

/* cut down ridiculously long IB macro names */
#define OP(x) IB_OPCODE_RC_##x

static void rc_timeout(unsigned long arg);

static u32 restart_sge(struct qib_sge_state *ss, struct qib_swqe *wqe,
		       u32 psn, u32 pmtu)
{
	u32 len;

	len = ((psn - wqe->psn) & QIB_PSN_MASK) * pmtu;
	ss->sge = wqe->sg_list[0];
	ss->sg_list = wqe->sg_list + 1;
	ss->num_sge = wqe->wr.num_sge;
	ss->total_len = wqe->length;
	qib_skip_sge(ss, len, 0);
	return wqe->length - len;
}

static void start_timer(struct qib_qp *qp)
{
	qp->s_flags |= QIB_S_TIMER;
	qp->s_timer.function = rc_timeout;
	/* 4.096 usec. * (1 << qp->timeout) */
	qp->s_timer.expires = jiffies +
		usecs_to_jiffies((4096UL * (1UL << qp->timeout)) / 1000UL);
	add_timer(&qp->s_timer);
}

/**
 * qib_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
 * @dev: the device for this QP
 * @qp: a pointer to the QP
 * @ohdr: a pointer to the IB header being constructed
 * @pmtu: the path MTU
 *
 * Return 1 if constructed; otherwise, return 0.
 * Note that we are in the responder's side of the QP context.
 * Note the QP s_lock must be held.
 */
static int qib_make_rc_ack(struct qib_ibdev *dev, struct qib_qp *qp,
			   struct qib_other_headers *ohdr, u32 pmtu)
{
	struct qib_ack_entry *e;
	u32 hwords;
	u32 len;
	u32 bth0;
	u32 bth2;

	/* Don't send an ACK if we aren't supposed to. */
	if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK))
		goto bail;

	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
	hwords = 5;

	switch (qp->s_ack_state) {
	case OP(RDMA_READ_RESPONSE_LAST):
	case OP(RDMA_READ_RESPONSE_ONLY):
		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
		if (e->rdma_sge.mr) {
			atomic_dec(&e->rdma_sge.mr->refcount);
			e->rdma_sge.mr = NULL;
		}
		/* FALLTHROUGH */
	case OP(ATOMIC_ACKNOWLEDGE):
		/*
		 * We can increment the tail pointer now that the last
		 * response has been sent instead of only being
		 * constructed.
		 */
		if (++qp->s_tail_ack_queue > QIB_MAX_RDMA_ATOMIC)
			qp->s_tail_ack_queue = 0;
		/* FALLTHROUGH */
	case OP(SEND_ONLY):
	case OP(ACKNOWLEDGE):
		/* Check for no next entry in the queue. */
		if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
			if (qp->s_flags & QIB_S_ACK_PENDING)
				goto normal;
			goto bail;
		}

		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
		if (e->opcode == OP(RDMA_READ_REQUEST)) {
			/*
			 * If a RDMA read response is being resent and
			 * we haven't seen the duplicate request yet,
			 * then stop sending the remaining responses the
			 * responder has seen until the requester resends it.
			 */
			len = e->rdma_sge.sge_length;
			if (len && !e->rdma_sge.mr) {
				qp->s_tail_ack_queue = qp->r_head_ack_queue;
				goto bail;
			}
			/* Copy SGE state in case we need to resend */
			qp->s_rdma_mr = e->rdma_sge.mr;
			if (qp->s_rdma_mr)
				atomic_inc(&qp->s_rdma_mr->refcount);
			qp->s_ack_rdma_sge.sge = e->rdma_sge;
			qp->s_ack_rdma_sge.num_sge = 1;
			qp->s_cur_sge = &qp->s_ack_rdma_sge;
			if (len > pmtu) {
				len = pmtu;
				qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
			} else {
				qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
				e->sent = 1;
			}
			ohdr->u.aeth = qib_compute_aeth(qp);
			hwords++;
			qp->s_ack_rdma_psn = e->psn;
			bth2 = qp->s_ack_rdma_psn++ & QIB_PSN_MASK;
		} else {
			/* COMPARE_SWAP or FETCH_ADD */
			qp->s_cur_sge = NULL;
			len = 0;
			qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
			ohdr->u.at.aeth = qib_compute_aeth(qp);
			ohdr->u.at.atomic_ack_eth[0] =
				cpu_to_be32(e->atomic_data >> 32);
			ohdr->u.at.atomic_ack_eth[1] =
				cpu_to_be32(e->atomic_data);
			hwords += sizeof(ohdr->u.at) / sizeof(u32);
			bth2 = e->psn & QIB_PSN_MASK;
			e->sent = 1;
		}
		bth0 = qp->s_ack_state << 24;
		break;

	case OP(RDMA_READ_RESPONSE_FIRST):
		qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
		/* FALLTHROUGH */
	case OP(RDMA_READ_RESPONSE_MIDDLE):
		qp->s_cur_sge = &qp->s_ack_rdma_sge;
		qp->s_rdma_mr = qp->s_ack_rdma_sge.sge.mr;
		if (qp->s_rdma_mr)
			atomic_inc(&qp->s_rdma_mr->refcount);
		len = qp->s_ack_rdma_sge.sge.sge_length;
		if (len > pmtu)
			len = pmtu;
		else {
			ohdr->u.aeth = qib_compute_aeth(qp);
			hwords++;
			qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
			e = &qp->s_ack_queue[qp->s_tail_ack_queue];
			e->sent = 1;
		}
		bth0 = qp->s_ack_state << 24;
		bth2 = qp->s_ack_rdma_psn++ & QIB_PSN_MASK;
		break;

	default:
normal:
		/*
		 * Send a regular ACK.
		 * Set the s_ack_state so we wait until after sending
		 * the ACK before setting s_ack_state to ACKNOWLEDGE
		 * (see above).
		 */
		qp->s_ack_state = OP(SEND_ONLY);
		qp->s_flags &= ~QIB_S_ACK_PENDING;
		qp->s_cur_sge = NULL;
		if (qp->s_nak_state)
			ohdr->u.aeth =
				cpu_to_be32((qp->r_msn & QIB_MSN_MASK) |
					    (qp->s_nak_state <<
					     QIB_AETH_CREDIT_SHIFT));
		else
			ohdr->u.aeth = qib_compute_aeth(qp);
		hwords++;
		len = 0;
		bth0 = OP(ACKNOWLEDGE) << 24;
		bth2 = qp->s_ack_psn & QIB_PSN_MASK;
	}
	qp->s_rdma_ack_cnt++;
	qp->s_hdrwords = hwords;
	qp->s_cur_size = len;
	qib_make_ruc_header(qp, ohdr, bth0, bth2);
	return 1;

bail:
	qp->s_ack_state = OP(ACKNOWLEDGE);
	qp->s_flags &=
~(QIB_S_RESP_PENDING | QIB_S_ACK_PENDING);222return 0;223}224225/**226* qib_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)227* @qp: a pointer to the QP228*229* Return 1 if constructed; otherwise, return 0.230*/231int qib_make_rc_req(struct qib_qp *qp)232{233struct qib_ibdev *dev = to_idev(qp->ibqp.device);234struct qib_other_headers *ohdr;235struct qib_sge_state *ss;236struct qib_swqe *wqe;237u32 hwords;238u32 len;239u32 bth0;240u32 bth2;241u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);242char newreq;243unsigned long flags;244int ret = 0;245int delta;246247ohdr = &qp->s_hdr.u.oth;248if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)249ohdr = &qp->s_hdr.u.l.oth;250251/*252* The lock is needed to synchronize between the sending tasklet,253* the receive interrupt handler, and timeout resends.254*/255spin_lock_irqsave(&qp->s_lock, flags);256257/* Sending responses has higher priority over sending requests. */258if ((qp->s_flags & QIB_S_RESP_PENDING) &&259qib_make_rc_ack(dev, qp, ohdr, pmtu))260goto done;261262if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_SEND_OK)) {263if (!(ib_qib_state_ops[qp->state] & QIB_FLUSH_SEND))264goto bail;265/* We are in the error state, flush the work request. */266if (qp->s_last == qp->s_head)267goto bail;268/* If DMAs are in progress, we can't flush immediately. */269if (atomic_read(&qp->s_dma_busy)) {270qp->s_flags |= QIB_S_WAIT_DMA;271goto bail;272}273wqe = get_swqe_ptr(qp, qp->s_last);274while (qp->s_last != qp->s_acked) {275qib_send_complete(qp, wqe, IB_WC_SUCCESS);276if (++qp->s_last >= qp->s_size)277qp->s_last = 0;278wqe = get_swqe_ptr(qp, qp->s_last);279}280qib_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);281goto done;282}283284if (qp->s_flags & (QIB_S_WAIT_RNR | QIB_S_WAIT_ACK))285goto bail;286287if (qib_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) {288if (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {289qp->s_flags |= QIB_S_WAIT_PSN;290goto bail;291}292qp->s_sending_psn = qp->s_psn;293qp->s_sending_hpsn = qp->s_psn - 1;294}295296/* header size in 32-bit words LRH+BTH = (8+12)/4. */297hwords = 5;298bth0 = 0;299300/* Send a request. */301wqe = get_swqe_ptr(qp, qp->s_cur);302switch (qp->s_state) {303default:304if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_NEXT_SEND_OK))305goto bail;306/*307* Resend an old request or start a new one.308*309* We keep track of the current SWQE so that310* we don't reset the "furthest progress" state311* if we need to back up.312*/313newreq = 0;314if (qp->s_cur == qp->s_tail) {315/* Check if send work queue is empty. */316if (qp->s_tail == qp->s_head)317goto bail;318/*319* If a fence is requested, wait for previous320* RDMA read and atomic operations to finish.321*/322if ((wqe->wr.send_flags & IB_SEND_FENCE) &&323qp->s_num_rd_atomic) {324qp->s_flags |= QIB_S_WAIT_FENCE;325goto bail;326}327wqe->psn = qp->s_next_psn;328newreq = 1;329}330/*331* Note that we have to be careful not to modify the332* original work request since we may need to resend333* it.334*/335len = wqe->length;336ss = &qp->s_sge;337bth2 = qp->s_psn & QIB_PSN_MASK;338switch (wqe->wr.opcode) {339case IB_WR_SEND:340case IB_WR_SEND_WITH_IMM:341/* If no credit, return. 
*/342if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT) &&343qib_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {344qp->s_flags |= QIB_S_WAIT_SSN_CREDIT;345goto bail;346}347wqe->lpsn = wqe->psn;348if (len > pmtu) {349wqe->lpsn += (len - 1) / pmtu;350qp->s_state = OP(SEND_FIRST);351len = pmtu;352break;353}354if (wqe->wr.opcode == IB_WR_SEND)355qp->s_state = OP(SEND_ONLY);356else {357qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);358/* Immediate data comes after the BTH */359ohdr->u.imm_data = wqe->wr.ex.imm_data;360hwords += 1;361}362if (wqe->wr.send_flags & IB_SEND_SOLICITED)363bth0 |= IB_BTH_SOLICITED;364bth2 |= IB_BTH_REQ_ACK;365if (++qp->s_cur == qp->s_size)366qp->s_cur = 0;367break;368369case IB_WR_RDMA_WRITE:370if (newreq && !(qp->s_flags & QIB_S_UNLIMITED_CREDIT))371qp->s_lsn++;372/* FALLTHROUGH */373case IB_WR_RDMA_WRITE_WITH_IMM:374/* If no credit, return. */375if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT) &&376qib_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {377qp->s_flags |= QIB_S_WAIT_SSN_CREDIT;378goto bail;379}380ohdr->u.rc.reth.vaddr =381cpu_to_be64(wqe->wr.wr.rdma.remote_addr);382ohdr->u.rc.reth.rkey =383cpu_to_be32(wqe->wr.wr.rdma.rkey);384ohdr->u.rc.reth.length = cpu_to_be32(len);385hwords += sizeof(struct ib_reth) / sizeof(u32);386wqe->lpsn = wqe->psn;387if (len > pmtu) {388wqe->lpsn += (len - 1) / pmtu;389qp->s_state = OP(RDMA_WRITE_FIRST);390len = pmtu;391break;392}393if (wqe->wr.opcode == IB_WR_RDMA_WRITE)394qp->s_state = OP(RDMA_WRITE_ONLY);395else {396qp->s_state =397OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);398/* Immediate data comes after RETH */399ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;400hwords += 1;401if (wqe->wr.send_flags & IB_SEND_SOLICITED)402bth0 |= IB_BTH_SOLICITED;403}404bth2 |= IB_BTH_REQ_ACK;405if (++qp->s_cur == qp->s_size)406qp->s_cur = 0;407break;408409case IB_WR_RDMA_READ:410/*411* Don't allow more operations to be started412* than the QP limits allow.413*/414if (newreq) {415if (qp->s_num_rd_atomic >=416qp->s_max_rd_atomic) {417qp->s_flags |= QIB_S_WAIT_RDMAR;418goto bail;419}420qp->s_num_rd_atomic++;421if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT))422qp->s_lsn++;423/*424* Adjust s_next_psn to count the425* expected number of responses.426*/427if (len > pmtu)428qp->s_next_psn += (len - 1) / pmtu;429wqe->lpsn = qp->s_next_psn++;430}431ohdr->u.rc.reth.vaddr =432cpu_to_be64(wqe->wr.wr.rdma.remote_addr);433ohdr->u.rc.reth.rkey =434cpu_to_be32(wqe->wr.wr.rdma.rkey);435ohdr->u.rc.reth.length = cpu_to_be32(len);436qp->s_state = OP(RDMA_READ_REQUEST);437hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);438ss = NULL;439len = 0;440bth2 |= IB_BTH_REQ_ACK;441if (++qp->s_cur == qp->s_size)442qp->s_cur = 0;443break;444445case IB_WR_ATOMIC_CMP_AND_SWP:446case IB_WR_ATOMIC_FETCH_AND_ADD:447/*448* Don't allow more operations to be started449* than the QP limits allow.450*/451if (newreq) {452if (qp->s_num_rd_atomic >=453qp->s_max_rd_atomic) {454qp->s_flags |= QIB_S_WAIT_RDMAR;455goto bail;456}457qp->s_num_rd_atomic++;458if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT))459qp->s_lsn++;460wqe->lpsn = wqe->psn;461}462if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {463qp->s_state = OP(COMPARE_SWAP);464ohdr->u.atomic_eth.swap_data = cpu_to_be64(465wqe->wr.wr.atomic.swap);466ohdr->u.atomic_eth.compare_data = cpu_to_be64(467wqe->wr.wr.atomic.compare_add);468} else {469qp->s_state = OP(FETCH_ADD);470ohdr->u.atomic_eth.swap_data = cpu_to_be64(471wqe->wr.wr.atomic.compare_add);472ohdr->u.atomic_eth.compare_data = 0;473}474ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(475wqe->wr.wr.atomic.remote_addr >> 
32);476ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(477wqe->wr.wr.atomic.remote_addr);478ohdr->u.atomic_eth.rkey = cpu_to_be32(479wqe->wr.wr.atomic.rkey);480hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);481ss = NULL;482len = 0;483bth2 |= IB_BTH_REQ_ACK;484if (++qp->s_cur == qp->s_size)485qp->s_cur = 0;486break;487488default:489goto bail;490}491qp->s_sge.sge = wqe->sg_list[0];492qp->s_sge.sg_list = wqe->sg_list + 1;493qp->s_sge.num_sge = wqe->wr.num_sge;494qp->s_sge.total_len = wqe->length;495qp->s_len = wqe->length;496if (newreq) {497qp->s_tail++;498if (qp->s_tail >= qp->s_size)499qp->s_tail = 0;500}501if (wqe->wr.opcode == IB_WR_RDMA_READ)502qp->s_psn = wqe->lpsn + 1;503else {504qp->s_psn++;505if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0)506qp->s_next_psn = qp->s_psn;507}508break;509510case OP(RDMA_READ_RESPONSE_FIRST):511/*512* qp->s_state is normally set to the opcode of the513* last packet constructed for new requests and therefore514* is never set to RDMA read response.515* RDMA_READ_RESPONSE_FIRST is used by the ACK processing516* thread to indicate a SEND needs to be restarted from an517* earlier PSN without interferring with the sending thread.518* See qib_restart_rc().519*/520qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);521/* FALLTHROUGH */522case OP(SEND_FIRST):523qp->s_state = OP(SEND_MIDDLE);524/* FALLTHROUGH */525case OP(SEND_MIDDLE):526bth2 = qp->s_psn++ & QIB_PSN_MASK;527if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0)528qp->s_next_psn = qp->s_psn;529ss = &qp->s_sge;530len = qp->s_len;531if (len > pmtu) {532len = pmtu;533break;534}535if (wqe->wr.opcode == IB_WR_SEND)536qp->s_state = OP(SEND_LAST);537else {538qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);539/* Immediate data comes after the BTH */540ohdr->u.imm_data = wqe->wr.ex.imm_data;541hwords += 1;542}543if (wqe->wr.send_flags & IB_SEND_SOLICITED)544bth0 |= IB_BTH_SOLICITED;545bth2 |= IB_BTH_REQ_ACK;546qp->s_cur++;547if (qp->s_cur >= qp->s_size)548qp->s_cur = 0;549break;550551case OP(RDMA_READ_RESPONSE_LAST):552/*553* qp->s_state is normally set to the opcode of the554* last packet constructed for new requests and therefore555* is never set to RDMA read response.556* RDMA_READ_RESPONSE_LAST is used by the ACK processing557* thread to indicate a RDMA write needs to be restarted from558* an earlier PSN without interferring with the sending thread.559* See qib_restart_rc().560*/561qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);562/* FALLTHROUGH */563case OP(RDMA_WRITE_FIRST):564qp->s_state = OP(RDMA_WRITE_MIDDLE);565/* FALLTHROUGH */566case OP(RDMA_WRITE_MIDDLE):567bth2 = qp->s_psn++ & QIB_PSN_MASK;568if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0)569qp->s_next_psn = qp->s_psn;570ss = &qp->s_sge;571len = qp->s_len;572if (len > pmtu) {573len = pmtu;574break;575}576if (wqe->wr.opcode == IB_WR_RDMA_WRITE)577qp->s_state = OP(RDMA_WRITE_LAST);578else {579qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);580/* Immediate data comes after the BTH */581ohdr->u.imm_data = wqe->wr.ex.imm_data;582hwords += 1;583if (wqe->wr.send_flags & IB_SEND_SOLICITED)584bth0 |= IB_BTH_SOLICITED;585}586bth2 |= IB_BTH_REQ_ACK;587qp->s_cur++;588if (qp->s_cur >= qp->s_size)589qp->s_cur = 0;590break;591592case OP(RDMA_READ_RESPONSE_MIDDLE):593/*594* qp->s_state is normally set to the opcode of the595* last packet constructed for new requests and therefore596* is never set to RDMA read response.597* RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing598* thread to indicate a RDMA read needs to be restarted from599* an earlier PSN 
without interferring with the sending thread.600* See qib_restart_rc().601*/602len = ((qp->s_psn - wqe->psn) & QIB_PSN_MASK) * pmtu;603ohdr->u.rc.reth.vaddr =604cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);605ohdr->u.rc.reth.rkey =606cpu_to_be32(wqe->wr.wr.rdma.rkey);607ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);608qp->s_state = OP(RDMA_READ_REQUEST);609hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);610bth2 = (qp->s_psn & QIB_PSN_MASK) | IB_BTH_REQ_ACK;611qp->s_psn = wqe->lpsn + 1;612ss = NULL;613len = 0;614qp->s_cur++;615if (qp->s_cur == qp->s_size)616qp->s_cur = 0;617break;618}619qp->s_sending_hpsn = bth2;620delta = (((int) bth2 - (int) wqe->psn) << 8) >> 8;621if (delta && delta % QIB_PSN_CREDIT == 0)622bth2 |= IB_BTH_REQ_ACK;623if (qp->s_flags & QIB_S_SEND_ONE) {624qp->s_flags &= ~QIB_S_SEND_ONE;625qp->s_flags |= QIB_S_WAIT_ACK;626bth2 |= IB_BTH_REQ_ACK;627}628qp->s_len -= len;629qp->s_hdrwords = hwords;630qp->s_cur_sge = ss;631qp->s_cur_size = len;632qib_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24), bth2);633done:634ret = 1;635goto unlock;636637bail:638qp->s_flags &= ~QIB_S_BUSY;639unlock:640spin_unlock_irqrestore(&qp->s_lock, flags);641return ret;642}643644/**645* qib_send_rc_ack - Construct an ACK packet and send it646* @qp: a pointer to the QP647*648* This is called from qib_rc_rcv() and qib_kreceive().649* Note that RDMA reads and atomics are handled in the650* send side QP state and tasklet.651*/652void qib_send_rc_ack(struct qib_qp *qp)653{654struct qib_devdata *dd = dd_from_ibdev(qp->ibqp.device);655struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);656struct qib_pportdata *ppd = ppd_from_ibp(ibp);657u64 pbc;658u16 lrh0;659u32 bth0;660u32 hwords;661u32 pbufn;662u32 __iomem *piobuf;663struct qib_ib_header hdr;664struct qib_other_headers *ohdr;665u32 control;666unsigned long flags;667668spin_lock_irqsave(&qp->s_lock, flags);669670if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK))671goto unlock;672673/* Don't send ACK or NAK if a RDMA read or atomic is pending. */674if ((qp->s_flags & QIB_S_RESP_PENDING) || qp->s_rdma_ack_cnt)675goto queue_ack;676677/* Construct the header with s_lock held so APM doesn't change it. */678ohdr = &hdr.u.oth;679lrh0 = QIB_LRH_BTH;680/* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. 
	 */
	hwords = 6;
	if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
		hwords += qib_make_grh(ibp, &hdr.u.l.grh,
				       &qp->remote_ah_attr.grh, hwords, 0);
		ohdr = &hdr.u.l.oth;
		lrh0 = QIB_LRH_GRH;
	}
	/* read pkey_index w/o lock (it's atomic) */
	bth0 = qib_get_pkey(ibp, qp->s_pkey_index) | (OP(ACKNOWLEDGE) << 24);
	if (qp->s_mig_state == IB_MIG_MIGRATED)
		bth0 |= IB_BTH_MIG_REQ;
	if (qp->r_nak_state)
		ohdr->u.aeth = cpu_to_be32((qp->r_msn & QIB_MSN_MASK) |
					   (qp->r_nak_state <<
					    QIB_AETH_CREDIT_SHIFT));
	else
		ohdr->u.aeth = qib_compute_aeth(qp);
	lrh0 |= ibp->sl_to_vl[qp->remote_ah_attr.sl] << 12 |
		qp->remote_ah_attr.sl << 4;
	hdr.lrh[0] = cpu_to_be16(lrh0);
	hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
	hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
	hdr.lrh[3] = cpu_to_be16(ppd->lid | qp->remote_ah_attr.src_path_bits);
	ohdr->bth[0] = cpu_to_be32(bth0);
	ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
	ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & QIB_PSN_MASK);

	spin_unlock_irqrestore(&qp->s_lock, flags);

	/* Don't try to send ACKs if the link isn't ACTIVE */
	if (!(ppd->lflags & QIBL_LINKACTIVE))
		goto done;

	control = dd->f_setpbc_control(ppd, hwords + SIZE_OF_CRC,
				       qp->s_srate, lrh0 >> 12);
	/* length is + 1 for the control dword */
	pbc = ((u64) control << 32) | (hwords + 1);

	piobuf = dd->f_getsendbuf(ppd, pbc, &pbufn);
	if (!piobuf) {
		/*
		 * We are out of PIO buffers at the moment.
		 * Pass responsibility for sending the ACK to the
		 * send tasklet so that when a PIO buffer becomes
		 * available, the ACK is sent ahead of other outgoing
		 * packets.
		 */
		spin_lock_irqsave(&qp->s_lock, flags);
		goto queue_ack;
	}

	/*
	 * Write the pbc.
	 * We have to flush after the PBC for correctness
	 * on some cpus or WC buffer can be written out of order.
	 */
	writeq(pbc, piobuf);

	if (dd->flags & QIB_PIO_FLUSH_WC) {
		u32 *hdrp = (u32 *) &hdr;

		qib_flush_wc();
		qib_pio_copy(piobuf + 2, hdrp, hwords - 1);
		qib_flush_wc();
		__raw_writel(hdrp[hwords - 1], piobuf + hwords + 1);
	} else
		qib_pio_copy(piobuf + 2, (u32 *) &hdr, hwords);

	if (dd->flags & QIB_USE_SPCL_TRIG) {
		u32 spcl_off = (pbufn >= dd->piobcnt2k) ? 2047 : 1023;

		qib_flush_wc();
		__raw_writel(0xaebecede, piobuf + spcl_off);
	}

	qib_flush_wc();
	qib_sendbuf_done(dd, pbufn);

	ibp->n_unicast_xmit++;
	goto done;

queue_ack:
	if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) {
		ibp->n_rc_qacks++;
		qp->s_flags |= QIB_S_ACK_PENDING | QIB_S_RESP_PENDING;
		qp->s_nak_state = qp->r_nak_state;
		qp->s_ack_psn = qp->r_ack_psn;

		/* Schedule the send tasklet. */
		qib_schedule_send(qp);
	}
unlock:
	spin_unlock_irqrestore(&qp->s_lock, flags);
done:
	return;
}

/**
 * reset_psn - reset the QP state to send starting from PSN
 * @qp: the QP
 * @psn: the packet sequence number to restart at
 *
 * This is called from qib_rc_rcv() to process an incoming RC ACK
 * for the given QP.
 * Called at interrupt level with the QP s_lock held.
 */
static void reset_psn(struct qib_qp *qp, u32 psn)
{
	u32 n = qp->s_acked;
	struct qib_swqe *wqe = get_swqe_ptr(qp, n);
	u32 opcode;

	qp->s_cur = n;

	/*
	 * If we are starting the request from the beginning,
	 * let the normal send code handle initialization.
	 */
	if (qib_cmp24(psn, wqe->psn) <= 0) {
		qp->s_state = OP(SEND_LAST);
		goto done;
	}

	/* Find the work request opcode corresponding to the given PSN. */
	opcode = wqe->wr.opcode;
	for (;;) {
		int diff;

		if (++n == qp->s_size)
			n = 0;
		if (n == qp->s_tail)
			break;
		wqe = get_swqe_ptr(qp, n);
		diff = qib_cmp24(psn, wqe->psn);
		if (diff < 0)
			break;
		qp->s_cur = n;
		/*
		 * If we are starting the request from the beginning,
		 * let the normal send code handle initialization.
		 */
		if (diff == 0) {
			qp->s_state = OP(SEND_LAST);
			goto done;
		}
		opcode = wqe->wr.opcode;
	}

	/*
	 * Set the state to restart in the middle of a request.
	 * Don't change the s_sge, s_cur_sge, or s_cur_size.
	 * See qib_make_rc_req().
	 */
	switch (opcode) {
	case IB_WR_SEND:
	case IB_WR_SEND_WITH_IMM:
		qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
		break;

	case IB_WR_RDMA_WRITE:
	case IB_WR_RDMA_WRITE_WITH_IMM:
		qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
		break;

	case IB_WR_RDMA_READ:
		qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
		break;

	default:
		/*
		 * This case shouldn't happen since it's only
		 * one PSN per req.
		 */
		qp->s_state = OP(SEND_LAST);
	}
done:
	qp->s_psn = psn;
	/*
	 * Set QIB_S_WAIT_PSN as qib_rc_complete() may start the timer
	 * asynchronously before the send tasklet can get scheduled.
	 * Doing it in qib_make_rc_req() is too late.
	 */
	if ((qib_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) &&
	    (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
		qp->s_flags |= QIB_S_WAIT_PSN;
}

/*
 * Back up requester to resend the last un-ACKed request.
 * The QP r_lock and s_lock should be held and interrupts disabled.
 */
static void qib_restart_rc(struct qib_qp *qp, u32 psn, int wait)
{
	struct qib_swqe *wqe = get_swqe_ptr(qp, qp->s_acked);
	struct qib_ibport *ibp;

	if (qp->s_retry == 0) {
		if (qp->s_mig_state == IB_MIG_ARMED) {
			qib_migrate_qp(qp);
			qp->s_retry = qp->s_retry_cnt;
		} else if (qp->s_last == qp->s_acked) {
			qib_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
			qib_error_qp(qp, IB_WC_WR_FLUSH_ERR);
			return;
		} else /* XXX need to handle delayed completion */
			return;
	} else
		qp->s_retry--;

	ibp = to_iport(qp->ibqp.device, qp->port_num);
	if (wqe->wr.opcode == IB_WR_RDMA_READ)
		ibp->n_rc_resends++;
	else
		ibp->n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK;

	qp->s_flags &= ~(QIB_S_WAIT_FENCE | QIB_S_WAIT_RDMAR |
			 QIB_S_WAIT_SSN_CREDIT | QIB_S_WAIT_PSN |
			 QIB_S_WAIT_ACK);
	if (wait)
		qp->s_flags |= QIB_S_SEND_ONE;
	reset_psn(qp, psn);
}

/*
 * This is called from s_timer for missing responses.
 */
static void rc_timeout(unsigned long arg)
{
	struct qib_qp *qp = (struct qib_qp *)arg;
	struct qib_ibport *ibp;
	unsigned long flags;

	spin_lock_irqsave(&qp->r_lock, flags);
	spin_lock(&qp->s_lock);
	if (qp->s_flags & QIB_S_TIMER) {
		ibp = to_iport(qp->ibqp.device, qp->port_num);
		ibp->n_rc_timeouts++;
		qp->s_flags &= ~QIB_S_TIMER;
		del_timer(&qp->s_timer);
		qib_restart_rc(qp, qp->s_last_psn + 1, 1);
		qib_schedule_send(qp);
	}
	spin_unlock(&qp->s_lock);
	spin_unlock_irqrestore(&qp->r_lock, flags);
}

/*
 * This is called from s_timer for RNR timeouts.
 */
void qib_rc_rnr_retry(unsigned long arg)
{
	struct qib_qp *qp = (struct qib_qp *)arg;
	unsigned long flags;

	spin_lock_irqsave(&qp->s_lock, flags);
	if (qp->s_flags & QIB_S_WAIT_RNR) {
		qp->s_flags &= ~QIB_S_WAIT_RNR;
		del_timer(&qp->s_timer);
		qib_schedule_send(qp);
	}
	spin_unlock_irqrestore(&qp->s_lock, flags);
}

/*
 * Set qp->s_sending_psn to the next PSN after the given one.
 * This would be psn+1 except when RDMA reads are present.
 */
static void reset_sending_psn(struct qib_qp *qp, u32 psn)
{
	struct qib_swqe *wqe;
	u32 n = qp->s_last;

	/* Find the work request corresponding to the given PSN. */
	for (;;) {
		wqe = get_swqe_ptr(qp, n);
		if (qib_cmp24(psn, wqe->lpsn) <= 0) {
			if (wqe->wr.opcode == IB_WR_RDMA_READ)
				qp->s_sending_psn = wqe->lpsn + 1;
			else
				qp->s_sending_psn = psn + 1;
			break;
		}
		if (++n == qp->s_size)
			n = 0;
		if (n == qp->s_tail)
			break;
	}
}

/*
 * This should be called with the QP s_lock held and interrupts disabled.
 */
void qib_rc_send_complete(struct qib_qp *qp, struct qib_ib_header *hdr)
{
	struct qib_other_headers *ohdr;
	struct qib_swqe *wqe;
	struct ib_wc wc;
	unsigned i;
	u32 opcode;
	u32 psn;

	if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_OR_FLUSH_SEND))
		return;

	/* Find out where the BTH is */
	if ((be16_to_cpu(hdr->lrh[0]) & 3) == QIB_LRH_BTH)
		ohdr = &hdr->u.oth;
	else
		ohdr = &hdr->u.l.oth;

	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
		WARN_ON(!qp->s_rdma_ack_cnt);
		qp->s_rdma_ack_cnt--;
		return;
	}

	psn = be32_to_cpu(ohdr->bth[2]);
	reset_sending_psn(qp, psn);

	/*
	 * Start timer after a packet requesting an ACK has been sent and
	 * there are still requests that haven't been acked.
	 */
	if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
	    !(qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR | QIB_S_WAIT_PSN)) &&
	    (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK))
		start_timer(qp);

	while (qp->s_last != qp->s_acked) {
		wqe = get_swqe_ptr(qp, qp->s_last);
		if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) >= 0 &&
		    qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
			break;
		for (i = 0; i < wqe->wr.num_sge; i++) {
			struct qib_sge *sge = &wqe->sg_list[i];

			atomic_dec(&sge->mr->refcount);
		}
		/* Post a send completion queue entry if requested.
*/1022if (!(qp->s_flags & QIB_S_SIGNAL_REQ_WR) ||1023(wqe->wr.send_flags & IB_SEND_SIGNALED)) {1024memset(&wc, 0, sizeof wc);1025wc.wr_id = wqe->wr.wr_id;1026wc.status = IB_WC_SUCCESS;1027wc.opcode = ib_qib_wc_opcode[wqe->wr.opcode];1028wc.byte_len = wqe->length;1029wc.qp = &qp->ibqp;1030qib_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);1031}1032if (++qp->s_last >= qp->s_size)1033qp->s_last = 0;1034}1035/*1036* If we were waiting for sends to complete before resending,1037* and they are now complete, restart sending.1038*/1039if (qp->s_flags & QIB_S_WAIT_PSN &&1040qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {1041qp->s_flags &= ~QIB_S_WAIT_PSN;1042qp->s_sending_psn = qp->s_psn;1043qp->s_sending_hpsn = qp->s_psn - 1;1044qib_schedule_send(qp);1045}1046}10471048static inline void update_last_psn(struct qib_qp *qp, u32 psn)1049{1050qp->s_last_psn = psn;1051}10521053/*1054* Generate a SWQE completion.1055* This is similar to qib_send_complete but has to check to be sure1056* that the SGEs are not being referenced if the SWQE is being resent.1057*/1058static struct qib_swqe *do_rc_completion(struct qib_qp *qp,1059struct qib_swqe *wqe,1060struct qib_ibport *ibp)1061{1062struct ib_wc wc;1063unsigned i;10641065/*1066* Don't decrement refcount and don't generate a1067* completion if the SWQE is being resent until the send1068* is finished.1069*/1070if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) < 0 ||1071qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {1072for (i = 0; i < wqe->wr.num_sge; i++) {1073struct qib_sge *sge = &wqe->sg_list[i];10741075atomic_dec(&sge->mr->refcount);1076}1077/* Post a send completion queue entry if requested. */1078if (!(qp->s_flags & QIB_S_SIGNAL_REQ_WR) ||1079(wqe->wr.send_flags & IB_SEND_SIGNALED)) {1080memset(&wc, 0, sizeof wc);1081wc.wr_id = wqe->wr.wr_id;1082wc.status = IB_WC_SUCCESS;1083wc.opcode = ib_qib_wc_opcode[wqe->wr.opcode];1084wc.byte_len = wqe->length;1085wc.qp = &qp->ibqp;1086qib_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);1087}1088if (++qp->s_last >= qp->s_size)1089qp->s_last = 0;1090} else1091ibp->n_rc_delayed_comp++;10921093qp->s_retry = qp->s_retry_cnt;1094update_last_psn(qp, wqe->lpsn);10951096/*1097* If we are completing a request which is in the process of1098* being resent, we can stop resending it since we know the1099* responder has already seen it.1100*/1101if (qp->s_acked == qp->s_cur) {1102if (++qp->s_cur >= qp->s_size)1103qp->s_cur = 0;1104qp->s_acked = qp->s_cur;1105wqe = get_swqe_ptr(qp, qp->s_cur);1106if (qp->s_acked != qp->s_tail) {1107qp->s_state = OP(SEND_LAST);1108qp->s_psn = wqe->psn;1109}1110} else {1111if (++qp->s_acked >= qp->s_size)1112qp->s_acked = 0;1113if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)1114qp->s_draining = 0;1115wqe = get_swqe_ptr(qp, qp->s_acked);1116}1117return wqe;1118}11191120/**1121* do_rc_ack - process an incoming RC ACK1122* @qp: the QP the ACK came in on1123* @psn: the packet sequence number of the ACK1124* @opcode: the opcode of the request that resulted in the ACK1125*1126* This is called from qib_rc_rcv_resp() to process an incoming RC ACK1127* for the given QP.1128* Called at interrupt level with the QP s_lock held.1129* Returns 1 if OK, 0 if current operation should be aborted (NAK).1130*/1131static int do_rc_ack(struct qib_qp *qp, u32 aeth, u32 psn, int opcode,1132u64 val, struct qib_ctxtdata *rcd)1133{1134struct qib_ibport *ibp;1135enum ib_wc_status status;1136struct qib_swqe *wqe;1137int ret = 0;1138u32 ack_psn;1139int diff;11401141/* Remove QP from retry timer */1142if (qp->s_flags 
& (QIB_S_TIMER | QIB_S_WAIT_RNR)) {1143qp->s_flags &= ~(QIB_S_TIMER | QIB_S_WAIT_RNR);1144del_timer(&qp->s_timer);1145}11461147/*1148* Note that NAKs implicitly ACK outstanding SEND and RDMA write1149* requests and implicitly NAK RDMA read and atomic requests issued1150* before the NAK'ed request. The MSN won't include the NAK'ed1151* request but will include an ACK'ed request(s).1152*/1153ack_psn = psn;1154if (aeth >> 29)1155ack_psn--;1156wqe = get_swqe_ptr(qp, qp->s_acked);1157ibp = to_iport(qp->ibqp.device, qp->port_num);11581159/*1160* The MSN might be for a later WQE than the PSN indicates so1161* only complete WQEs that the PSN finishes.1162*/1163while ((diff = qib_cmp24(ack_psn, wqe->lpsn)) >= 0) {1164/*1165* RDMA_READ_RESPONSE_ONLY is a special case since1166* we want to generate completion events for everything1167* before the RDMA read, copy the data, then generate1168* the completion for the read.1169*/1170if (wqe->wr.opcode == IB_WR_RDMA_READ &&1171opcode == OP(RDMA_READ_RESPONSE_ONLY) &&1172diff == 0) {1173ret = 1;1174goto bail;1175}1176/*1177* If this request is a RDMA read or atomic, and the ACK is1178* for a later operation, this ACK NAKs the RDMA read or1179* atomic. In other words, only a RDMA_READ_LAST or ONLY1180* can ACK a RDMA read and likewise for atomic ops. Note1181* that the NAK case can only happen if relaxed ordering is1182* used and requests are sent after an RDMA read or atomic1183* is sent but before the response is received.1184*/1185if ((wqe->wr.opcode == IB_WR_RDMA_READ &&1186(opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||1187((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||1188wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&1189(opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {1190/* Retry this request. */1191if (!(qp->r_flags & QIB_R_RDMAR_SEQ)) {1192qp->r_flags |= QIB_R_RDMAR_SEQ;1193qib_restart_rc(qp, qp->s_last_psn + 1, 0);1194if (list_empty(&qp->rspwait)) {1195qp->r_flags |= QIB_R_RSP_SEND;1196atomic_inc(&qp->refcount);1197list_add_tail(&qp->rspwait,1198&rcd->qp_wait_list);1199}1200}1201/*1202* No need to process the ACK/NAK since we are1203* restarting an earlier request.1204*/1205goto bail;1206}1207if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||1208wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {1209u64 *vaddr = wqe->sg_list[0].vaddr;1210*vaddr = val;1211}1212if (qp->s_num_rd_atomic &&1213(wqe->wr.opcode == IB_WR_RDMA_READ ||1214wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||1215wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {1216qp->s_num_rd_atomic--;1217/* Restart sending task if fence is complete */1218if ((qp->s_flags & QIB_S_WAIT_FENCE) &&1219!qp->s_num_rd_atomic) {1220qp->s_flags &= ~(QIB_S_WAIT_FENCE |1221QIB_S_WAIT_ACK);1222qib_schedule_send(qp);1223} else if (qp->s_flags & QIB_S_WAIT_RDMAR) {1224qp->s_flags &= ~(QIB_S_WAIT_RDMAR |1225QIB_S_WAIT_ACK);1226qib_schedule_send(qp);1227}1228}1229wqe = do_rc_completion(qp, wqe, ibp);1230if (qp->s_acked == qp->s_tail)1231break;1232}12331234switch (aeth >> 29) {1235case 0: /* ACK */1236ibp->n_rc_acks++;1237if (qp->s_acked != qp->s_tail) {1238/*1239* We are expecting more ACKs so1240* reset the retransmit timer.1241*/1242start_timer(qp);1243/*1244* We can stop resending the earlier packets and1245* continue with the next packet the receiver wants.1246*/1247if (qib_cmp24(qp->s_psn, psn) <= 0)1248reset_psn(qp, psn + 1);1249} else if (qib_cmp24(qp->s_psn, psn) <= 0) {1250qp->s_state = OP(SEND_LAST);1251qp->s_psn = psn + 1;1252}1253if (qp->s_flags & QIB_S_WAIT_ACK) {1254qp->s_flags &= 
~QIB_S_WAIT_ACK;1255qib_schedule_send(qp);1256}1257qib_get_credit(qp, aeth);1258qp->s_rnr_retry = qp->s_rnr_retry_cnt;1259qp->s_retry = qp->s_retry_cnt;1260update_last_psn(qp, psn);1261ret = 1;1262goto bail;12631264case 1: /* RNR NAK */1265ibp->n_rnr_naks++;1266if (qp->s_acked == qp->s_tail)1267goto bail;1268if (qp->s_flags & QIB_S_WAIT_RNR)1269goto bail;1270if (qp->s_rnr_retry == 0) {1271status = IB_WC_RNR_RETRY_EXC_ERR;1272goto class_b;1273}1274if (qp->s_rnr_retry_cnt < 7)1275qp->s_rnr_retry--;12761277/* The last valid PSN is the previous PSN. */1278update_last_psn(qp, psn - 1);12791280ibp->n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK;12811282reset_psn(qp, psn);12831284qp->s_flags &= ~(QIB_S_WAIT_SSN_CREDIT | QIB_S_WAIT_ACK);1285qp->s_flags |= QIB_S_WAIT_RNR;1286qp->s_timer.function = qib_rc_rnr_retry;1287qp->s_timer.expires = jiffies + usecs_to_jiffies(1288ib_qib_rnr_table[(aeth >> QIB_AETH_CREDIT_SHIFT) &1289QIB_AETH_CREDIT_MASK]);1290add_timer(&qp->s_timer);1291goto bail;12921293case 3: /* NAK */1294if (qp->s_acked == qp->s_tail)1295goto bail;1296/* The last valid PSN is the previous PSN. */1297update_last_psn(qp, psn - 1);1298switch ((aeth >> QIB_AETH_CREDIT_SHIFT) &1299QIB_AETH_CREDIT_MASK) {1300case 0: /* PSN sequence error */1301ibp->n_seq_naks++;1302/*1303* Back up to the responder's expected PSN.1304* Note that we might get a NAK in the middle of an1305* RDMA READ response which terminates the RDMA1306* READ.1307*/1308qib_restart_rc(qp, psn, 0);1309qib_schedule_send(qp);1310break;13111312case 1: /* Invalid Request */1313status = IB_WC_REM_INV_REQ_ERR;1314ibp->n_other_naks++;1315goto class_b;13161317case 2: /* Remote Access Error */1318status = IB_WC_REM_ACCESS_ERR;1319ibp->n_other_naks++;1320goto class_b;13211322case 3: /* Remote Operation Error */1323status = IB_WC_REM_OP_ERR;1324ibp->n_other_naks++;1325class_b:1326if (qp->s_last == qp->s_acked) {1327qib_send_complete(qp, wqe, status);1328qib_error_qp(qp, IB_WC_WR_FLUSH_ERR);1329}1330break;13311332default:1333/* Ignore other reserved NAK error codes */1334goto reserved;1335}1336qp->s_retry = qp->s_retry_cnt;1337qp->s_rnr_retry = qp->s_rnr_retry_cnt;1338goto bail;13391340default: /* 2: reserved */1341reserved:1342/* Ignore reserved NAK codes. 
*/1343goto bail;1344}13451346bail:1347return ret;1348}13491350/*1351* We have seen an out of sequence RDMA read middle or last packet.1352* This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE.1353*/1354static void rdma_seq_err(struct qib_qp *qp, struct qib_ibport *ibp, u32 psn,1355struct qib_ctxtdata *rcd)1356{1357struct qib_swqe *wqe;13581359/* Remove QP from retry timer */1360if (qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR)) {1361qp->s_flags &= ~(QIB_S_TIMER | QIB_S_WAIT_RNR);1362del_timer(&qp->s_timer);1363}13641365wqe = get_swqe_ptr(qp, qp->s_acked);13661367while (qib_cmp24(psn, wqe->lpsn) > 0) {1368if (wqe->wr.opcode == IB_WR_RDMA_READ ||1369wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||1370wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)1371break;1372wqe = do_rc_completion(qp, wqe, ibp);1373}13741375ibp->n_rdma_seq++;1376qp->r_flags |= QIB_R_RDMAR_SEQ;1377qib_restart_rc(qp, qp->s_last_psn + 1, 0);1378if (list_empty(&qp->rspwait)) {1379qp->r_flags |= QIB_R_RSP_SEND;1380atomic_inc(&qp->refcount);1381list_add_tail(&qp->rspwait, &rcd->qp_wait_list);1382}1383}13841385/**1386* qib_rc_rcv_resp - process an incoming RC response packet1387* @ibp: the port this packet came in on1388* @ohdr: the other headers for this packet1389* @data: the packet data1390* @tlen: the packet length1391* @qp: the QP for this packet1392* @opcode: the opcode for this packet1393* @psn: the packet sequence number for this packet1394* @hdrsize: the header length1395* @pmtu: the path MTU1396*1397* This is called from qib_rc_rcv() to process an incoming RC response1398* packet for the given QP.1399* Called at interrupt level.1400*/1401static void qib_rc_rcv_resp(struct qib_ibport *ibp,1402struct qib_other_headers *ohdr,1403void *data, u32 tlen,1404struct qib_qp *qp,1405u32 opcode,1406u32 psn, u32 hdrsize, u32 pmtu,1407struct qib_ctxtdata *rcd)1408{1409struct qib_swqe *wqe;1410struct qib_pportdata *ppd = ppd_from_ibp(ibp);1411enum ib_wc_status status;1412unsigned long flags;1413int diff;1414u32 pad;1415u32 aeth;1416u64 val;14171418if (opcode != OP(RDMA_READ_RESPONSE_MIDDLE)) {1419/*1420* If ACK'd PSN on SDMA busy list try to make progress to1421* reclaim SDMA credits.1422*/1423if ((qib_cmp24(psn, qp->s_sending_psn) >= 0) &&1424(qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)) {14251426/*1427* If send tasklet not running attempt to progress1428* SDMA queue.1429*/1430if (!(qp->s_flags & QIB_S_BUSY)) {1431/* Acquire SDMA Lock */1432spin_lock_irqsave(&ppd->sdma_lock, flags);1433/* Invoke sdma make progress */1434qib_sdma_make_progress(ppd);1435/* Release SDMA Lock */1436spin_unlock_irqrestore(&ppd->sdma_lock, flags);1437}1438}1439}14401441spin_lock_irqsave(&qp->s_lock, flags);1442if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK))1443goto ack_done;14441445/* Ignore invalid responses. */1446if (qib_cmp24(psn, qp->s_next_psn) >= 0)1447goto ack_done;14481449/* Ignore duplicate responses. 
*/1450diff = qib_cmp24(psn, qp->s_last_psn);1451if (unlikely(diff <= 0)) {1452/* Update credits for "ghost" ACKs */1453if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {1454aeth = be32_to_cpu(ohdr->u.aeth);1455if ((aeth >> 29) == 0)1456qib_get_credit(qp, aeth);1457}1458goto ack_done;1459}14601461/*1462* Skip everything other than the PSN we expect, if we are waiting1463* for a reply to a restarted RDMA read or atomic op.1464*/1465if (qp->r_flags & QIB_R_RDMAR_SEQ) {1466if (qib_cmp24(psn, qp->s_last_psn + 1) != 0)1467goto ack_done;1468qp->r_flags &= ~QIB_R_RDMAR_SEQ;1469}14701471if (unlikely(qp->s_acked == qp->s_tail))1472goto ack_done;1473wqe = get_swqe_ptr(qp, qp->s_acked);1474status = IB_WC_SUCCESS;14751476switch (opcode) {1477case OP(ACKNOWLEDGE):1478case OP(ATOMIC_ACKNOWLEDGE):1479case OP(RDMA_READ_RESPONSE_FIRST):1480aeth = be32_to_cpu(ohdr->u.aeth);1481if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {1482__be32 *p = ohdr->u.at.atomic_ack_eth;14831484val = ((u64) be32_to_cpu(p[0]) << 32) |1485be32_to_cpu(p[1]);1486} else1487val = 0;1488if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) ||1489opcode != OP(RDMA_READ_RESPONSE_FIRST))1490goto ack_done;1491hdrsize += 4;1492wqe = get_swqe_ptr(qp, qp->s_acked);1493if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))1494goto ack_op_err;1495/*1496* If this is a response to a resent RDMA read, we1497* have to be careful to copy the data to the right1498* location.1499*/1500qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,1501wqe, psn, pmtu);1502goto read_middle;15031504case OP(RDMA_READ_RESPONSE_MIDDLE):1505/* no AETH, no ACK */1506if (unlikely(qib_cmp24(psn, qp->s_last_psn + 1)))1507goto ack_seq_err;1508if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))1509goto ack_op_err;1510read_middle:1511if (unlikely(tlen != (hdrsize + pmtu + 4)))1512goto ack_len_err;1513if (unlikely(pmtu >= qp->s_rdma_read_len))1514goto ack_len_err;15151516/*1517* We got a response so update the timeout.1518* 4.096 usec. * (1 << qp->timeout)1519*/1520qp->s_flags |= QIB_S_TIMER;1521mod_timer(&qp->s_timer, jiffies +1522usecs_to_jiffies((4096UL * (1UL << qp->timeout)) /15231000UL));1524if (qp->s_flags & QIB_S_WAIT_ACK) {1525qp->s_flags &= ~QIB_S_WAIT_ACK;1526qib_schedule_send(qp);1527}15281529if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))1530qp->s_retry = qp->s_retry_cnt;15311532/*1533* Update the RDMA receive state but do the copy w/o1534* holding the locks and blocking interrupts.1535*/1536qp->s_rdma_read_len -= pmtu;1537update_last_psn(qp, psn);1538spin_unlock_irqrestore(&qp->s_lock, flags);1539qib_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0);1540goto bail;15411542case OP(RDMA_READ_RESPONSE_ONLY):1543aeth = be32_to_cpu(ohdr->u.aeth);1544if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))1545goto ack_done;1546/* Get the number of bytes the message was padded by. */1547pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;1548/*1549* Check that the data size is >= 0 && <= pmtu.1550* Remember to account for the AETH header (4) and1551* ICRC (4).1552*/1553if (unlikely(tlen < (hdrsize + pad + 8)))1554goto ack_len_err;1555/*1556* If this is a response to a resent RDMA read, we1557* have to be careful to copy the data to the right1558* location.1559*/1560wqe = get_swqe_ptr(qp, qp->s_acked);1561qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,1562wqe, psn, pmtu);1563goto read_last;15641565case OP(RDMA_READ_RESPONSE_LAST):1566/* ACKs READ req. 
*/1567if (unlikely(qib_cmp24(psn, qp->s_last_psn + 1)))1568goto ack_seq_err;1569if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))1570goto ack_op_err;1571/* Get the number of bytes the message was padded by. */1572pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;1573/*1574* Check that the data size is >= 1 && <= pmtu.1575* Remember to account for the AETH header (4) and1576* ICRC (4).1577*/1578if (unlikely(tlen <= (hdrsize + pad + 8)))1579goto ack_len_err;1580read_last:1581tlen -= hdrsize + pad + 8;1582if (unlikely(tlen != qp->s_rdma_read_len))1583goto ack_len_err;1584aeth = be32_to_cpu(ohdr->u.aeth);1585qib_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0);1586WARN_ON(qp->s_rdma_read_sge.num_sge);1587(void) do_rc_ack(qp, aeth, psn,1588OP(RDMA_READ_RESPONSE_LAST), 0, rcd);1589goto ack_done;1590}15911592ack_op_err:1593status = IB_WC_LOC_QP_OP_ERR;1594goto ack_err;15951596ack_seq_err:1597rdma_seq_err(qp, ibp, psn, rcd);1598goto ack_done;15991600ack_len_err:1601status = IB_WC_LOC_LEN_ERR;1602ack_err:1603if (qp->s_last == qp->s_acked) {1604qib_send_complete(qp, wqe, status);1605qib_error_qp(qp, IB_WC_WR_FLUSH_ERR);1606}1607ack_done:1608spin_unlock_irqrestore(&qp->s_lock, flags);1609bail:1610return;1611}16121613/**1614* qib_rc_rcv_error - process an incoming duplicate or error RC packet1615* @ohdr: the other headers for this packet1616* @data: the packet data1617* @qp: the QP for this packet1618* @opcode: the opcode for this packet1619* @psn: the packet sequence number for this packet1620* @diff: the difference between the PSN and the expected PSN1621*1622* This is called from qib_rc_rcv() to process an unexpected1623* incoming RC packet for the given QP.1624* Called at interrupt level.1625* Return 1 if no more processing is needed; otherwise return 0 to1626* schedule a response to be sent.1627*/1628static int qib_rc_rcv_error(struct qib_other_headers *ohdr,1629void *data,1630struct qib_qp *qp,1631u32 opcode,1632u32 psn,1633int diff,1634struct qib_ctxtdata *rcd)1635{1636struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);1637struct qib_ack_entry *e;1638unsigned long flags;1639u8 i, prev;1640int old_req;16411642if (diff > 0) {1643/*1644* Packet sequence error.1645* A NAK will ACK earlier sends and RDMA writes.1646* Don't queue the NAK if we already sent one.1647*/1648if (!qp->r_nak_state) {1649ibp->n_rc_seqnak++;1650qp->r_nak_state = IB_NAK_PSN_ERROR;1651/* Use the expected PSN. */1652qp->r_ack_psn = qp->r_psn;1653/*1654* Wait to send the sequence NAK until all packets1655* in the receive queue have been processed.1656* Otherwise, we end up propagating congestion.1657*/1658if (list_empty(&qp->rspwait)) {1659qp->r_flags |= QIB_R_RSP_NAK;1660atomic_inc(&qp->refcount);1661list_add_tail(&qp->rspwait, &rcd->qp_wait_list);1662}1663}1664goto done;1665}16661667/*1668* Handle a duplicate request. Don't re-execute SEND, RDMA1669* write or atomic op. Don't NAK errors, just silently drop1670* the duplicate request. Note that r_sge, r_len, and1671* r_rcv_len may be in use so don't modify them.1672*1673* We are supposed to ACK the earliest duplicate PSN but we1674* can coalesce an outstanding duplicate ACK. 
We have to1675* send the earliest so that RDMA reads can be restarted at1676* the requester's expected PSN.1677*1678* First, find where this duplicate PSN falls within the1679* ACKs previously sent.1680* old_req is true if there is an older response that is scheduled1681* to be sent before sending this one.1682*/1683e = NULL;1684old_req = 1;1685ibp->n_rc_dupreq++;16861687spin_lock_irqsave(&qp->s_lock, flags);16881689for (i = qp->r_head_ack_queue; ; i = prev) {1690if (i == qp->s_tail_ack_queue)1691old_req = 0;1692if (i)1693prev = i - 1;1694else1695prev = QIB_MAX_RDMA_ATOMIC;1696if (prev == qp->r_head_ack_queue) {1697e = NULL;1698break;1699}1700e = &qp->s_ack_queue[prev];1701if (!e->opcode) {1702e = NULL;1703break;1704}1705if (qib_cmp24(psn, e->psn) >= 0) {1706if (prev == qp->s_tail_ack_queue &&1707qib_cmp24(psn, e->lpsn) <= 0)1708old_req = 0;1709break;1710}1711}1712switch (opcode) {1713case OP(RDMA_READ_REQUEST): {1714struct ib_reth *reth;1715u32 offset;1716u32 len;17171718/*1719* If we didn't find the RDMA read request in the ack queue,1720* we can ignore this request.1721*/1722if (!e || e->opcode != OP(RDMA_READ_REQUEST))1723goto unlock_done;1724/* RETH comes after BTH */1725reth = &ohdr->u.rc.reth;1726/*1727* Address range must be a subset of the original1728* request and start on pmtu boundaries.1729* We reuse the old ack_queue slot since the requester1730* should not back up and request an earlier PSN for the1731* same request.1732*/1733offset = ((psn - e->psn) & QIB_PSN_MASK) *1734ib_mtu_enum_to_int(qp->path_mtu);1735len = be32_to_cpu(reth->length);1736if (unlikely(offset + len != e->rdma_sge.sge_length))1737goto unlock_done;1738if (e->rdma_sge.mr) {1739atomic_dec(&e->rdma_sge.mr->refcount);1740e->rdma_sge.mr = NULL;1741}1742if (len != 0) {1743u32 rkey = be32_to_cpu(reth->rkey);1744u64 vaddr = be64_to_cpu(reth->vaddr);1745int ok;17461747ok = qib_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,1748IB_ACCESS_REMOTE_READ);1749if (unlikely(!ok))1750goto unlock_done;1751} else {1752e->rdma_sge.vaddr = NULL;1753e->rdma_sge.length = 0;1754e->rdma_sge.sge_length = 0;1755}1756e->psn = psn;1757if (old_req)1758goto unlock_done;1759qp->s_tail_ack_queue = prev;1760break;1761}17621763case OP(COMPARE_SWAP):1764case OP(FETCH_ADD): {1765/*1766* If we didn't find the atomic request in the ack queue1767* or the send tasklet is already backed up to send an1768* earlier entry, we can ignore this request.1769*/1770if (!e || e->opcode != (u8) opcode || old_req)1771goto unlock_done;1772qp->s_tail_ack_queue = prev;1773break;1774}17751776default:1777/*1778* Ignore this operation if it doesn't request an ACK1779* or an earlier RDMA read or atomic is going to be resent.1780*/1781if (!(psn & IB_BTH_REQ_ACK) || old_req)1782goto unlock_done;1783/*1784* Resend the most recent ACK if this request is1785* after all the previous RDMA reads and atomics.1786*/1787if (i == qp->r_head_ack_queue) {1788spin_unlock_irqrestore(&qp->s_lock, flags);1789qp->r_nak_state = 0;1790qp->r_ack_psn = qp->r_psn - 1;1791goto send_ack;1792}1793/*1794* Try to send a simple ACK to work around a Mellanox bug1795* which doesn't accept a RDMA read response or atomic1796* response as an ACK for earlier SENDs or RDMA writes.1797*/1798if (!(qp->s_flags & QIB_S_RESP_PENDING)) {1799spin_unlock_irqrestore(&qp->s_lock, flags);1800qp->r_nak_state = 0;1801qp->r_ack_psn = qp->s_ack_queue[i].psn - 1;1802goto send_ack;1803}1804/*1805* Resend the RDMA read or atomic op which1806* ACKs this duplicate request.1807*/1808qp->s_tail_ack_queue = 
i;1809break;1810}1811qp->s_ack_state = OP(ACKNOWLEDGE);1812qp->s_flags |= QIB_S_RESP_PENDING;1813qp->r_nak_state = 0;1814qib_schedule_send(qp);18151816unlock_done:1817spin_unlock_irqrestore(&qp->s_lock, flags);1818done:1819return 1;18201821send_ack:1822return 0;1823}18241825void qib_rc_error(struct qib_qp *qp, enum ib_wc_status err)1826{1827unsigned long flags;1828int lastwqe;18291830spin_lock_irqsave(&qp->s_lock, flags);1831lastwqe = qib_error_qp(qp, err);1832spin_unlock_irqrestore(&qp->s_lock, flags);18331834if (lastwqe) {1835struct ib_event ev;18361837ev.device = qp->ibqp.device;1838ev.element.qp = &qp->ibqp;1839ev.event = IB_EVENT_QP_LAST_WQE_REACHED;1840qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);1841}1842}18431844static inline void qib_update_ack_queue(struct qib_qp *qp, unsigned n)1845{1846unsigned next;18471848next = n + 1;1849if (next > QIB_MAX_RDMA_ATOMIC)1850next = 0;1851qp->s_tail_ack_queue = next;1852qp->s_ack_state = OP(ACKNOWLEDGE);1853}18541855/**1856* qib_rc_rcv - process an incoming RC packet1857* @rcd: the context pointer1858* @hdr: the header of this packet1859* @has_grh: true if the header has a GRH1860* @data: the packet data1861* @tlen: the packet length1862* @qp: the QP for this packet1863*1864* This is called from qib_qp_rcv() to process an incoming RC packet1865* for the given QP.1866* Called at interrupt level.1867*/1868void qib_rc_rcv(struct qib_ctxtdata *rcd, struct qib_ib_header *hdr,1869int has_grh, void *data, u32 tlen, struct qib_qp *qp)1870{1871struct qib_ibport *ibp = &rcd->ppd->ibport_data;1872struct qib_other_headers *ohdr;1873u32 opcode;1874u32 hdrsize;1875u32 psn;1876u32 pad;1877struct ib_wc wc;1878u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);1879int diff;1880struct ib_reth *reth;1881unsigned long flags;1882int ret;18831884/* Check for GRH */1885if (!has_grh) {1886ohdr = &hdr->u.oth;1887hdrsize = 8 + 12; /* LRH + BTH */1888} else {1889ohdr = &hdr->u.l.oth;1890hdrsize = 8 + 40 + 12; /* LRH + GRH + BTH */1891}18921893opcode = be32_to_cpu(ohdr->bth[0]);1894spin_lock_irqsave(&qp->s_lock, flags);1895if (qib_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode))1896goto sunlock;1897spin_unlock_irqrestore(&qp->s_lock, flags);18981899psn = be32_to_cpu(ohdr->bth[2]);1900opcode >>= 24;19011902/*1903* Process responses (ACKs) before anything else. Note that the1904* packet sequence number will be for something in the send work1905* queue rather than the expected receive packet sequence number.1906* In other words, this QP is the requester.1907*/1908if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&1909opcode <= OP(ATOMIC_ACKNOWLEDGE)) {1910qib_rc_rcv_resp(ibp, ohdr, data, tlen, qp, opcode, psn,1911hdrsize, pmtu, rcd);1912return;1913}19141915/* Compute 24 bits worth of difference. */1916diff = qib_cmp24(psn, qp->r_psn);1917if (unlikely(diff)) {1918if (qib_rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd))1919return;1920goto send_ack;1921}19221923/* Check for opcode sequence errors. 
*/1924switch (qp->r_state) {1925case OP(SEND_FIRST):1926case OP(SEND_MIDDLE):1927if (opcode == OP(SEND_MIDDLE) ||1928opcode == OP(SEND_LAST) ||1929opcode == OP(SEND_LAST_WITH_IMMEDIATE))1930break;1931goto nack_inv;19321933case OP(RDMA_WRITE_FIRST):1934case OP(RDMA_WRITE_MIDDLE):1935if (opcode == OP(RDMA_WRITE_MIDDLE) ||1936opcode == OP(RDMA_WRITE_LAST) ||1937opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))1938break;1939goto nack_inv;19401941default:1942if (opcode == OP(SEND_MIDDLE) ||1943opcode == OP(SEND_LAST) ||1944opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||1945opcode == OP(RDMA_WRITE_MIDDLE) ||1946opcode == OP(RDMA_WRITE_LAST) ||1947opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))1948goto nack_inv;1949/*1950* Note that it is up to the requester to not send a new1951* RDMA read or atomic operation before receiving an ACK1952* for the previous operation.1953*/1954break;1955}19561957memset(&wc, 0, sizeof wc);19581959if (qp->state == IB_QPS_RTR && !(qp->r_flags & QIB_R_COMM_EST)) {1960qp->r_flags |= QIB_R_COMM_EST;1961if (qp->ibqp.event_handler) {1962struct ib_event ev;19631964ev.device = qp->ibqp.device;1965ev.element.qp = &qp->ibqp;1966ev.event = IB_EVENT_COMM_EST;1967qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);1968}1969}19701971/* OK, process the packet. */1972switch (opcode) {1973case OP(SEND_FIRST):1974ret = qib_get_rwqe(qp, 0);1975if (ret < 0)1976goto nack_op_err;1977if (!ret)1978goto rnr_nak;1979qp->r_rcv_len = 0;1980/* FALLTHROUGH */1981case OP(SEND_MIDDLE):1982case OP(RDMA_WRITE_MIDDLE):1983send_middle:1984/* Check for invalid length PMTU or posted rwqe len. */1985if (unlikely(tlen != (hdrsize + pmtu + 4)))1986goto nack_inv;1987qp->r_rcv_len += pmtu;1988if (unlikely(qp->r_rcv_len > qp->r_len))1989goto nack_inv;1990qib_copy_sge(&qp->r_sge, data, pmtu, 1);1991break;19921993case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):1994/* consume RWQE */1995ret = qib_get_rwqe(qp, 1);1996if (ret < 0)1997goto nack_op_err;1998if (!ret)1999goto rnr_nak;2000goto send_last_imm;20012002case OP(SEND_ONLY):2003case OP(SEND_ONLY_WITH_IMMEDIATE):2004ret = qib_get_rwqe(qp, 0);2005if (ret < 0)2006goto nack_op_err;2007if (!ret)2008goto rnr_nak;2009qp->r_rcv_len = 0;2010if (opcode == OP(SEND_ONLY))2011goto send_last;2012/* FALLTHROUGH */2013case OP(SEND_LAST_WITH_IMMEDIATE):2014send_last_imm:2015wc.ex.imm_data = ohdr->u.imm_data;2016hdrsize += 4;2017wc.wc_flags = IB_WC_WITH_IMM;2018/* FALLTHROUGH */2019case OP(SEND_LAST):2020case OP(RDMA_WRITE_LAST):2021send_last:2022/* Get the number of bytes the message was padded by. */2023pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;2024/* Check for invalid length. */2025/* XXX LAST len should be >= 1 */2026if (unlikely(tlen < (hdrsize + pad + 4)))2027goto nack_inv;2028/* Don't count the CRC. 
*/2029tlen -= (hdrsize + pad + 4);2030wc.byte_len = tlen + qp->r_rcv_len;2031if (unlikely(wc.byte_len > qp->r_len))2032goto nack_inv;2033qib_copy_sge(&qp->r_sge, data, tlen, 1);2034while (qp->r_sge.num_sge) {2035atomic_dec(&qp->r_sge.sge.mr->refcount);2036if (--qp->r_sge.num_sge)2037qp->r_sge.sge = *qp->r_sge.sg_list++;2038}2039qp->r_msn++;2040if (!test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags))2041break;2042wc.wr_id = qp->r_wr_id;2043wc.status = IB_WC_SUCCESS;2044if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||2045opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))2046wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;2047else2048wc.opcode = IB_WC_RECV;2049wc.qp = &qp->ibqp;2050wc.src_qp = qp->remote_qpn;2051wc.slid = qp->remote_ah_attr.dlid;2052wc.sl = qp->remote_ah_attr.sl;2053/* Signal completion event if the solicited bit is set. */2054qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,2055(ohdr->bth[0] &2056cpu_to_be32(IB_BTH_SOLICITED)) != 0);2057break;20582059case OP(RDMA_WRITE_FIRST):2060case OP(RDMA_WRITE_ONLY):2061case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):2062if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))2063goto nack_inv;2064/* consume RWQE */2065reth = &ohdr->u.rc.reth;2066hdrsize += sizeof(*reth);2067qp->r_len = be32_to_cpu(reth->length);2068qp->r_rcv_len = 0;2069qp->r_sge.sg_list = NULL;2070if (qp->r_len != 0) {2071u32 rkey = be32_to_cpu(reth->rkey);2072u64 vaddr = be64_to_cpu(reth->vaddr);2073int ok;20742075/* Check rkey & NAK */2076ok = qib_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr,2077rkey, IB_ACCESS_REMOTE_WRITE);2078if (unlikely(!ok))2079goto nack_acc;2080qp->r_sge.num_sge = 1;2081} else {2082qp->r_sge.num_sge = 0;2083qp->r_sge.sge.mr = NULL;2084qp->r_sge.sge.vaddr = NULL;2085qp->r_sge.sge.length = 0;2086qp->r_sge.sge.sge_length = 0;2087}2088if (opcode == OP(RDMA_WRITE_FIRST))2089goto send_middle;2090else if (opcode == OP(RDMA_WRITE_ONLY))2091goto send_last;2092ret = qib_get_rwqe(qp, 1);2093if (ret < 0)2094goto nack_op_err;2095if (!ret)2096goto rnr_nak;2097wc.ex.imm_data = ohdr->u.rc.imm_data;2098hdrsize += 4;2099wc.wc_flags = IB_WC_WITH_IMM;2100goto send_last;21012102case OP(RDMA_READ_REQUEST): {2103struct qib_ack_entry *e;2104u32 len;2105u8 next;21062107if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))2108goto nack_inv;2109next = qp->r_head_ack_queue + 1;2110/* s_ack_queue is size QIB_MAX_RDMA_ATOMIC+1 so use > not >= */2111if (next > QIB_MAX_RDMA_ATOMIC)2112next = 0;2113spin_lock_irqsave(&qp->s_lock, flags);2114if (unlikely(next == qp->s_tail_ack_queue)) {2115if (!qp->s_ack_queue[next].sent)2116goto nack_inv_unlck;2117qib_update_ack_queue(qp, next);2118}2119e = &qp->s_ack_queue[qp->r_head_ack_queue];2120if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {2121atomic_dec(&e->rdma_sge.mr->refcount);2122e->rdma_sge.mr = NULL;2123}2124reth = &ohdr->u.rc.reth;2125len = be32_to_cpu(reth->length);2126if (len) {2127u32 rkey = be32_to_cpu(reth->rkey);2128u64 vaddr = be64_to_cpu(reth->vaddr);2129int ok;21302131/* Check rkey & NAK */2132ok = qib_rkey_ok(qp, &e->rdma_sge, len, vaddr,2133rkey, IB_ACCESS_REMOTE_READ);2134if (unlikely(!ok))2135goto nack_acc_unlck;2136/*2137* Update the next expected PSN. 
We add 1 later2138* below, so only add the remainder here.2139*/2140if (len > pmtu)2141qp->r_psn += (len - 1) / pmtu;2142} else {2143e->rdma_sge.mr = NULL;2144e->rdma_sge.vaddr = NULL;2145e->rdma_sge.length = 0;2146e->rdma_sge.sge_length = 0;2147}2148e->opcode = opcode;2149e->sent = 0;2150e->psn = psn;2151e->lpsn = qp->r_psn;2152/*2153* We need to increment the MSN here instead of when we2154* finish sending the result since a duplicate request would2155* increment it more than once.2156*/2157qp->r_msn++;2158qp->r_psn++;2159qp->r_state = opcode;2160qp->r_nak_state = 0;2161qp->r_head_ack_queue = next;21622163/* Schedule the send tasklet. */2164qp->s_flags |= QIB_S_RESP_PENDING;2165qib_schedule_send(qp);21662167goto sunlock;2168}21692170case OP(COMPARE_SWAP):2171case OP(FETCH_ADD): {2172struct ib_atomic_eth *ateth;2173struct qib_ack_entry *e;2174u64 vaddr;2175atomic64_t *maddr;2176u64 sdata;2177u32 rkey;2178u8 next;21792180if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))2181goto nack_inv;2182next = qp->r_head_ack_queue + 1;2183if (next > QIB_MAX_RDMA_ATOMIC)2184next = 0;2185spin_lock_irqsave(&qp->s_lock, flags);2186if (unlikely(next == qp->s_tail_ack_queue)) {2187if (!qp->s_ack_queue[next].sent)2188goto nack_inv_unlck;2189qib_update_ack_queue(qp, next);2190}2191e = &qp->s_ack_queue[qp->r_head_ack_queue];2192if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {2193atomic_dec(&e->rdma_sge.mr->refcount);2194e->rdma_sge.mr = NULL;2195}2196ateth = &ohdr->u.atomic_eth;2197vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) |2198be32_to_cpu(ateth->vaddr[1]);2199if (unlikely(vaddr & (sizeof(u64) - 1)))2200goto nack_inv_unlck;2201rkey = be32_to_cpu(ateth->rkey);2202/* Check rkey & NAK */2203if (unlikely(!qib_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),2204vaddr, rkey,2205IB_ACCESS_REMOTE_ATOMIC)))2206goto nack_acc_unlck;2207/* Perform atomic OP and save result. */2208maddr = (atomic64_t *) qp->r_sge.sge.vaddr;2209sdata = be64_to_cpu(ateth->swap_data);2210e->atomic_data = (opcode == OP(FETCH_ADD)) ?2211(u64) atomic64_add_return(sdata, maddr) - sdata :2212(u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,2213be64_to_cpu(ateth->compare_data),2214sdata);2215atomic_dec(&qp->r_sge.sge.mr->refcount);2216qp->r_sge.num_sge = 0;2217e->opcode = opcode;2218e->sent = 0;2219e->psn = psn;2220e->lpsn = psn;2221qp->r_msn++;2222qp->r_psn++;2223qp->r_state = opcode;2224qp->r_nak_state = 0;2225qp->r_head_ack_queue = next;22262227/* Schedule the send tasklet. */2228qp->s_flags |= QIB_S_RESP_PENDING;2229qib_schedule_send(qp);22302231goto sunlock;2232}22332234default:2235/* NAK unknown opcodes. */2236goto nack_inv;2237}2238qp->r_psn++;2239qp->r_state = opcode;2240qp->r_ack_psn = psn;2241qp->r_nak_state = 0;2242/* Send an ACK if requested or required. 
	 */
	if (psn & (1 << 31))
		goto send_ack;
	return;

rnr_nak:
	qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
	qp->r_ack_psn = qp->r_psn;
	/* Queue RNR NAK for later */
	if (list_empty(&qp->rspwait)) {
		qp->r_flags |= QIB_R_RSP_NAK;
		atomic_inc(&qp->refcount);
		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
	}
	return;

nack_op_err:
	qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
	qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
	qp->r_ack_psn = qp->r_psn;
	/* Queue NAK for later */
	if (list_empty(&qp->rspwait)) {
		qp->r_flags |= QIB_R_RSP_NAK;
		atomic_inc(&qp->refcount);
		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
	}
	return;

nack_inv_unlck:
	spin_unlock_irqrestore(&qp->s_lock, flags);
nack_inv:
	qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
	qp->r_nak_state = IB_NAK_INVALID_REQUEST;
	qp->r_ack_psn = qp->r_psn;
	/* Queue NAK for later */
	if (list_empty(&qp->rspwait)) {
		qp->r_flags |= QIB_R_RSP_NAK;
		atomic_inc(&qp->refcount);
		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
	}
	return;

nack_acc_unlck:
	spin_unlock_irqrestore(&qp->s_lock, flags);
nack_acc:
	qib_rc_error(qp, IB_WC_LOC_PROT_ERR);
	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
	qp->r_ack_psn = qp->r_psn;
send_ack:
	qib_send_rc_ack(qp);
	return;

sunlock:
	spin_unlock_irqrestore(&qp->s_lock, flags);
}