Path: blob/master/drivers/infiniband/hw/ipath/ipath_rc.c
15112 views
/*1* Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.2* Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.3*4* This software is available to you under a choice of one of two5* licenses. You may choose to be licensed under the terms of the GNU6* General Public License (GPL) Version 2, available from the file7* COPYING in the main directory of this source tree, or the8* OpenIB.org BSD license below:9*10* Redistribution and use in source and binary forms, with or11* without modification, are permitted provided that the following12* conditions are met:13*14* - Redistributions of source code must retain the above15* copyright notice, this list of conditions and the following16* disclaimer.17*18* - Redistributions in binary form must reproduce the above19* copyright notice, this list of conditions and the following20* disclaimer in the documentation and/or other materials21* provided with the distribution.22*23* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,24* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF25* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND26* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS27* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN28* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN29* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE30* SOFTWARE.31*/3233#include <linux/io.h>3435#include "ipath_verbs.h"36#include "ipath_kernel.h"3738/* cut down ridiculously long IB macro names */39#define OP(x) IB_OPCODE_RC_##x4041static u32 restart_sge(struct ipath_sge_state *ss, struct ipath_swqe *wqe,42u32 psn, u32 pmtu)43{44u32 len;4546len = ((psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;47ss->sge = wqe->sg_list[0];48ss->sg_list = wqe->sg_list + 1;49ss->num_sge = wqe->wr.num_sge;50ipath_skip_sge(ss, len);51return wqe->length - len;52}5354/**55* ipath_init_restart- initialize the qp->s_sge after a restart56* @qp: the QP who's SGE we're restarting57* @wqe: the work queue to initialize the QP's SGE from58*59* The QP s_lock should be held and interrupts disabled.60*/61static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe)62{63struct ipath_ibdev *dev;6465qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn,66ib_mtu_enum_to_int(qp->path_mtu));67dev = to_idev(qp->ibqp.device);68spin_lock(&dev->pending_lock);69if (list_empty(&qp->timerwait))70list_add_tail(&qp->timerwait,71&dev->pending[dev->pending_index]);72spin_unlock(&dev->pending_lock);73}7475/**76* ipath_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)77* @qp: a pointer to the QP78* @ohdr: a pointer to the IB header being constructed79* @pmtu: the path MTU80*81* Return 1 if constructed; otherwise, return 0.82* Note that we are in the responder's side of the QP context.83* Note the QP s_lock must be held.84*/85static int ipath_make_rc_ack(struct ipath_ibdev *dev, struct ipath_qp *qp,86struct ipath_other_headers *ohdr, u32 pmtu)87{88struct ipath_ack_entry *e;89u32 hwords;90u32 len;91u32 bth0;92u32 bth2;9394/* Don't send an ACK if we aren't supposed to. */95if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))96goto bail;9798/* header size in 32-bit words LRH+BTH = (8+12)/4. */99hwords = 5;100101switch (qp->s_ack_state) {102case OP(RDMA_READ_RESPONSE_LAST):103case OP(RDMA_READ_RESPONSE_ONLY):104case OP(ATOMIC_ACKNOWLEDGE):105/*106* We can increment the tail pointer now that the last107* response has been sent instead of only being108* constructed.109*/110if (++qp->s_tail_ack_queue > IPATH_MAX_RDMA_ATOMIC)111qp->s_tail_ack_queue = 0;112/* FALLTHROUGH */113case OP(SEND_ONLY):114case OP(ACKNOWLEDGE):115/* Check for no next entry in the queue. */116if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {117if (qp->s_flags & IPATH_S_ACK_PENDING)118goto normal;119qp->s_ack_state = OP(ACKNOWLEDGE);120goto bail;121}122123e = &qp->s_ack_queue[qp->s_tail_ack_queue];124if (e->opcode == OP(RDMA_READ_REQUEST)) {125/* Copy SGE state in case we need to resend */126qp->s_ack_rdma_sge = e->rdma_sge;127qp->s_cur_sge = &qp->s_ack_rdma_sge;128len = e->rdma_sge.sge.sge_length;129if (len > pmtu) {130len = pmtu;131qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);132} else {133qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);134e->sent = 1;135}136ohdr->u.aeth = ipath_compute_aeth(qp);137hwords++;138qp->s_ack_rdma_psn = e->psn;139bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;140} else {141/* COMPARE_SWAP or FETCH_ADD */142qp->s_cur_sge = NULL;143len = 0;144qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);145ohdr->u.at.aeth = ipath_compute_aeth(qp);146ohdr->u.at.atomic_ack_eth[0] =147cpu_to_be32(e->atomic_data >> 32);148ohdr->u.at.atomic_ack_eth[1] =149cpu_to_be32(e->atomic_data);150hwords += sizeof(ohdr->u.at) / sizeof(u32);151bth2 = e->psn;152e->sent = 1;153}154bth0 = qp->s_ack_state << 24;155break;156157case OP(RDMA_READ_RESPONSE_FIRST):158qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);159/* FALLTHROUGH */160case OP(RDMA_READ_RESPONSE_MIDDLE):161len = qp->s_ack_rdma_sge.sge.sge_length;162if (len > pmtu)163len = pmtu;164else {165ohdr->u.aeth = ipath_compute_aeth(qp);166hwords++;167qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);168qp->s_ack_queue[qp->s_tail_ack_queue].sent = 1;169}170bth0 = qp->s_ack_state << 24;171bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;172break;173174default:175normal:176/*177* Send a regular ACK.178* Set the s_ack_state so we wait until after sending179* the ACK before setting s_ack_state to ACKNOWLEDGE180* (see above).181*/182qp->s_ack_state = OP(SEND_ONLY);183qp->s_flags &= ~IPATH_S_ACK_PENDING;184qp->s_cur_sge = NULL;185if (qp->s_nak_state)186ohdr->u.aeth =187cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |188(qp->s_nak_state <<189IPATH_AETH_CREDIT_SHIFT));190else191ohdr->u.aeth = ipath_compute_aeth(qp);192hwords++;193len = 0;194bth0 = OP(ACKNOWLEDGE) << 24;195bth2 = qp->s_ack_psn & IPATH_PSN_MASK;196}197qp->s_hdrwords = hwords;198qp->s_cur_size = len;199ipath_make_ruc_header(dev, qp, ohdr, bth0, bth2);200return 1;201202bail:203return 0;204}205206/**207* ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)208* @qp: a pointer to the QP209*210* Return 1 if constructed; otherwise, return 0.211*/212int ipath_make_rc_req(struct ipath_qp *qp)213{214struct ipath_ibdev *dev = to_idev(qp->ibqp.device);215struct ipath_other_headers *ohdr;216struct ipath_sge_state *ss;217struct ipath_swqe *wqe;218u32 hwords;219u32 len;220u32 bth0;221u32 bth2;222u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);223char newreq;224unsigned long flags;225int ret = 0;226227ohdr = &qp->s_hdr.u.oth;228if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)229ohdr = &qp->s_hdr.u.l.oth;230231/*232* The lock is needed to synchronize between the sending tasklet,233* the receive interrupt handler, and timeout resends.234*/235spin_lock_irqsave(&qp->s_lock, flags);236237/* Sending responses has higher priority over sending requests. */238if ((qp->r_head_ack_queue != qp->s_tail_ack_queue ||239(qp->s_flags & IPATH_S_ACK_PENDING) ||240qp->s_ack_state != OP(ACKNOWLEDGE)) &&241ipath_make_rc_ack(dev, qp, ohdr, pmtu))242goto done;243244if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) {245if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))246goto bail;247/* We are in the error state, flush the work request. */248if (qp->s_last == qp->s_head)249goto bail;250/* If DMAs are in progress, we can't flush immediately. */251if (atomic_read(&qp->s_dma_busy)) {252qp->s_flags |= IPATH_S_WAIT_DMA;253goto bail;254}255wqe = get_swqe_ptr(qp, qp->s_last);256ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);257goto done;258}259260/* Leave BUSY set until RNR timeout. */261if (qp->s_rnr_timeout) {262qp->s_flags |= IPATH_S_WAITING;263goto bail;264}265266/* header size in 32-bit words LRH+BTH = (8+12)/4. */267hwords = 5;268bth0 = 1 << 22; /* Set M bit */269270/* Send a request. */271wqe = get_swqe_ptr(qp, qp->s_cur);272switch (qp->s_state) {273default:274if (!(ib_ipath_state_ops[qp->state] &275IPATH_PROCESS_NEXT_SEND_OK))276goto bail;277/*278* Resend an old request or start a new one.279*280* We keep track of the current SWQE so that281* we don't reset the "furthest progress" state282* if we need to back up.283*/284newreq = 0;285if (qp->s_cur == qp->s_tail) {286/* Check if send work queue is empty. */287if (qp->s_tail == qp->s_head)288goto bail;289/*290* If a fence is requested, wait for previous291* RDMA read and atomic operations to finish.292*/293if ((wqe->wr.send_flags & IB_SEND_FENCE) &&294qp->s_num_rd_atomic) {295qp->s_flags |= IPATH_S_FENCE_PENDING;296goto bail;297}298wqe->psn = qp->s_next_psn;299newreq = 1;300}301/*302* Note that we have to be careful not to modify the303* original work request since we may need to resend304* it.305*/306len = wqe->length;307ss = &qp->s_sge;308bth2 = 0;309switch (wqe->wr.opcode) {310case IB_WR_SEND:311case IB_WR_SEND_WITH_IMM:312/* If no credit, return. */313if (qp->s_lsn != (u32) -1 &&314ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {315qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT;316goto bail;317}318wqe->lpsn = wqe->psn;319if (len > pmtu) {320wqe->lpsn += (len - 1) / pmtu;321qp->s_state = OP(SEND_FIRST);322len = pmtu;323break;324}325if (wqe->wr.opcode == IB_WR_SEND)326qp->s_state = OP(SEND_ONLY);327else {328qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);329/* Immediate data comes after the BTH */330ohdr->u.imm_data = wqe->wr.ex.imm_data;331hwords += 1;332}333if (wqe->wr.send_flags & IB_SEND_SOLICITED)334bth0 |= 1 << 23;335bth2 = 1 << 31; /* Request ACK. */336if (++qp->s_cur == qp->s_size)337qp->s_cur = 0;338break;339340case IB_WR_RDMA_WRITE:341if (newreq && qp->s_lsn != (u32) -1)342qp->s_lsn++;343/* FALLTHROUGH */344case IB_WR_RDMA_WRITE_WITH_IMM:345/* If no credit, return. */346if (qp->s_lsn != (u32) -1 &&347ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {348qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT;349goto bail;350}351ohdr->u.rc.reth.vaddr =352cpu_to_be64(wqe->wr.wr.rdma.remote_addr);353ohdr->u.rc.reth.rkey =354cpu_to_be32(wqe->wr.wr.rdma.rkey);355ohdr->u.rc.reth.length = cpu_to_be32(len);356hwords += sizeof(struct ib_reth) / sizeof(u32);357wqe->lpsn = wqe->psn;358if (len > pmtu) {359wqe->lpsn += (len - 1) / pmtu;360qp->s_state = OP(RDMA_WRITE_FIRST);361len = pmtu;362break;363}364if (wqe->wr.opcode == IB_WR_RDMA_WRITE)365qp->s_state = OP(RDMA_WRITE_ONLY);366else {367qp->s_state =368OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);369/* Immediate data comes after RETH */370ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;371hwords += 1;372if (wqe->wr.send_flags & IB_SEND_SOLICITED)373bth0 |= 1 << 23;374}375bth2 = 1 << 31; /* Request ACK. */376if (++qp->s_cur == qp->s_size)377qp->s_cur = 0;378break;379380case IB_WR_RDMA_READ:381/*382* Don't allow more operations to be started383* than the QP limits allow.384*/385if (newreq) {386if (qp->s_num_rd_atomic >=387qp->s_max_rd_atomic) {388qp->s_flags |= IPATH_S_RDMAR_PENDING;389goto bail;390}391qp->s_num_rd_atomic++;392if (qp->s_lsn != (u32) -1)393qp->s_lsn++;394/*395* Adjust s_next_psn to count the396* expected number of responses.397*/398if (len > pmtu)399qp->s_next_psn += (len - 1) / pmtu;400wqe->lpsn = qp->s_next_psn++;401}402ohdr->u.rc.reth.vaddr =403cpu_to_be64(wqe->wr.wr.rdma.remote_addr);404ohdr->u.rc.reth.rkey =405cpu_to_be32(wqe->wr.wr.rdma.rkey);406ohdr->u.rc.reth.length = cpu_to_be32(len);407qp->s_state = OP(RDMA_READ_REQUEST);408hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);409ss = NULL;410len = 0;411if (++qp->s_cur == qp->s_size)412qp->s_cur = 0;413break;414415case IB_WR_ATOMIC_CMP_AND_SWP:416case IB_WR_ATOMIC_FETCH_AND_ADD:417/*418* Don't allow more operations to be started419* than the QP limits allow.420*/421if (newreq) {422if (qp->s_num_rd_atomic >=423qp->s_max_rd_atomic) {424qp->s_flags |= IPATH_S_RDMAR_PENDING;425goto bail;426}427qp->s_num_rd_atomic++;428if (qp->s_lsn != (u32) -1)429qp->s_lsn++;430wqe->lpsn = wqe->psn;431}432if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {433qp->s_state = OP(COMPARE_SWAP);434ohdr->u.atomic_eth.swap_data = cpu_to_be64(435wqe->wr.wr.atomic.swap);436ohdr->u.atomic_eth.compare_data = cpu_to_be64(437wqe->wr.wr.atomic.compare_add);438} else {439qp->s_state = OP(FETCH_ADD);440ohdr->u.atomic_eth.swap_data = cpu_to_be64(441wqe->wr.wr.atomic.compare_add);442ohdr->u.atomic_eth.compare_data = 0;443}444ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(445wqe->wr.wr.atomic.remote_addr >> 32);446ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(447wqe->wr.wr.atomic.remote_addr);448ohdr->u.atomic_eth.rkey = cpu_to_be32(449wqe->wr.wr.atomic.rkey);450hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);451ss = NULL;452len = 0;453if (++qp->s_cur == qp->s_size)454qp->s_cur = 0;455break;456457default:458goto bail;459}460qp->s_sge.sge = wqe->sg_list[0];461qp->s_sge.sg_list = wqe->sg_list + 1;462qp->s_sge.num_sge = wqe->wr.num_sge;463qp->s_len = wqe->length;464if (newreq) {465qp->s_tail++;466if (qp->s_tail >= qp->s_size)467qp->s_tail = 0;468}469bth2 |= qp->s_psn & IPATH_PSN_MASK;470if (wqe->wr.opcode == IB_WR_RDMA_READ)471qp->s_psn = wqe->lpsn + 1;472else {473qp->s_psn++;474if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)475qp->s_next_psn = qp->s_psn;476}477/*478* Put the QP on the pending list so lost ACKs will cause479* a retry. More than one request can be pending so the480* QP may already be on the dev->pending list.481*/482spin_lock(&dev->pending_lock);483if (list_empty(&qp->timerwait))484list_add_tail(&qp->timerwait,485&dev->pending[dev->pending_index]);486spin_unlock(&dev->pending_lock);487break;488489case OP(RDMA_READ_RESPONSE_FIRST):490/*491* This case can only happen if a send is restarted.492* See ipath_restart_rc().493*/494ipath_init_restart(qp, wqe);495/* FALLTHROUGH */496case OP(SEND_FIRST):497qp->s_state = OP(SEND_MIDDLE);498/* FALLTHROUGH */499case OP(SEND_MIDDLE):500bth2 = qp->s_psn++ & IPATH_PSN_MASK;501if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)502qp->s_next_psn = qp->s_psn;503ss = &qp->s_sge;504len = qp->s_len;505if (len > pmtu) {506len = pmtu;507break;508}509if (wqe->wr.opcode == IB_WR_SEND)510qp->s_state = OP(SEND_LAST);511else {512qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);513/* Immediate data comes after the BTH */514ohdr->u.imm_data = wqe->wr.ex.imm_data;515hwords += 1;516}517if (wqe->wr.send_flags & IB_SEND_SOLICITED)518bth0 |= 1 << 23;519bth2 |= 1 << 31; /* Request ACK. */520qp->s_cur++;521if (qp->s_cur >= qp->s_size)522qp->s_cur = 0;523break;524525case OP(RDMA_READ_RESPONSE_LAST):526/*527* This case can only happen if a RDMA write is restarted.528* See ipath_restart_rc().529*/530ipath_init_restart(qp, wqe);531/* FALLTHROUGH */532case OP(RDMA_WRITE_FIRST):533qp->s_state = OP(RDMA_WRITE_MIDDLE);534/* FALLTHROUGH */535case OP(RDMA_WRITE_MIDDLE):536bth2 = qp->s_psn++ & IPATH_PSN_MASK;537if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)538qp->s_next_psn = qp->s_psn;539ss = &qp->s_sge;540len = qp->s_len;541if (len > pmtu) {542len = pmtu;543break;544}545if (wqe->wr.opcode == IB_WR_RDMA_WRITE)546qp->s_state = OP(RDMA_WRITE_LAST);547else {548qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);549/* Immediate data comes after the BTH */550ohdr->u.imm_data = wqe->wr.ex.imm_data;551hwords += 1;552if (wqe->wr.send_flags & IB_SEND_SOLICITED)553bth0 |= 1 << 23;554}555bth2 |= 1 << 31; /* Request ACK. */556qp->s_cur++;557if (qp->s_cur >= qp->s_size)558qp->s_cur = 0;559break;560561case OP(RDMA_READ_RESPONSE_MIDDLE):562/*563* This case can only happen if a RDMA read is restarted.564* See ipath_restart_rc().565*/566ipath_init_restart(qp, wqe);567len = ((qp->s_psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;568ohdr->u.rc.reth.vaddr =569cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);570ohdr->u.rc.reth.rkey =571cpu_to_be32(wqe->wr.wr.rdma.rkey);572ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len);573qp->s_state = OP(RDMA_READ_REQUEST);574hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);575bth2 = qp->s_psn & IPATH_PSN_MASK;576qp->s_psn = wqe->lpsn + 1;577ss = NULL;578len = 0;579qp->s_cur++;580if (qp->s_cur == qp->s_size)581qp->s_cur = 0;582break;583}584if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT - 1) >= 0)585bth2 |= 1 << 31; /* Request ACK. */586qp->s_len -= len;587qp->s_hdrwords = hwords;588qp->s_cur_sge = ss;589qp->s_cur_size = len;590ipath_make_ruc_header(dev, qp, ohdr, bth0 | (qp->s_state << 24), bth2);591done:592ret = 1;593goto unlock;594595bail:596qp->s_flags &= ~IPATH_S_BUSY;597unlock:598spin_unlock_irqrestore(&qp->s_lock, flags);599return ret;600}601602/**603* send_rc_ack - Construct an ACK packet and send it604* @qp: a pointer to the QP605*606* This is called from ipath_rc_rcv() and only uses the receive607* side QP state.608* Note that RDMA reads and atomics are handled in the609* send side QP state and tasklet.610*/611static void send_rc_ack(struct ipath_qp *qp)612{613struct ipath_ibdev *dev = to_idev(qp->ibqp.device);614struct ipath_devdata *dd;615u16 lrh0;616u32 bth0;617u32 hwords;618u32 __iomem *piobuf;619struct ipath_ib_header hdr;620struct ipath_other_headers *ohdr;621unsigned long flags;622623spin_lock_irqsave(&qp->s_lock, flags);624625/* Don't send ACK or NAK if a RDMA read or atomic is pending. */626if (qp->r_head_ack_queue != qp->s_tail_ack_queue ||627(qp->s_flags & IPATH_S_ACK_PENDING) ||628qp->s_ack_state != OP(ACKNOWLEDGE))629goto queue_ack;630631spin_unlock_irqrestore(&qp->s_lock, flags);632633/* Don't try to send ACKs if the link isn't ACTIVE */634dd = dev->dd;635if (!(dd->ipath_flags & IPATH_LINKACTIVE))636goto done;637638piobuf = ipath_getpiobuf(dd, 0, NULL);639if (!piobuf) {640/*641* We are out of PIO buffers at the moment.642* Pass responsibility for sending the ACK to the643* send tasklet so that when a PIO buffer becomes644* available, the ACK is sent ahead of other outgoing645* packets.646*/647spin_lock_irqsave(&qp->s_lock, flags);648goto queue_ack;649}650651/* Construct the header. */652ohdr = &hdr.u.oth;653lrh0 = IPATH_LRH_BTH;654/* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */655hwords = 6;656if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {657hwords += ipath_make_grh(dev, &hdr.u.l.grh,658&qp->remote_ah_attr.grh,659hwords, 0);660ohdr = &hdr.u.l.oth;661lrh0 = IPATH_LRH_GRH;662}663/* read pkey_index w/o lock (its atomic) */664bth0 = ipath_get_pkey(dd, qp->s_pkey_index) |665(OP(ACKNOWLEDGE) << 24) | (1 << 22);666if (qp->r_nak_state)667ohdr->u.aeth = cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |668(qp->r_nak_state <<669IPATH_AETH_CREDIT_SHIFT));670else671ohdr->u.aeth = ipath_compute_aeth(qp);672lrh0 |= qp->remote_ah_attr.sl << 4;673hdr.lrh[0] = cpu_to_be16(lrh0);674hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);675hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);676hdr.lrh[3] = cpu_to_be16(dd->ipath_lid |677qp->remote_ah_attr.src_path_bits);678ohdr->bth[0] = cpu_to_be32(bth0);679ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);680ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & IPATH_PSN_MASK);681682writeq(hwords + 1, piobuf);683684if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) {685u32 *hdrp = (u32 *) &hdr;686687ipath_flush_wc();688__iowrite32_copy(piobuf + 2, hdrp, hwords - 1);689ipath_flush_wc();690__raw_writel(hdrp[hwords - 1], piobuf + hwords + 1);691} else692__iowrite32_copy(piobuf + 2, (u32 *) &hdr, hwords);693694ipath_flush_wc();695696dev->n_unicast_xmit++;697goto done;698699queue_ack:700if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK) {701dev->n_rc_qacks++;702qp->s_flags |= IPATH_S_ACK_PENDING;703qp->s_nak_state = qp->r_nak_state;704qp->s_ack_psn = qp->r_ack_psn;705706/* Schedule the send tasklet. */707ipath_schedule_send(qp);708}709spin_unlock_irqrestore(&qp->s_lock, flags);710done:711return;712}713714/**715* reset_psn - reset the QP state to send starting from PSN716* @qp: the QP717* @psn: the packet sequence number to restart at718*719* This is called from ipath_rc_rcv() to process an incoming RC ACK720* for the given QP.721* Called at interrupt level with the QP s_lock held.722*/723static void reset_psn(struct ipath_qp *qp, u32 psn)724{725u32 n = qp->s_last;726struct ipath_swqe *wqe = get_swqe_ptr(qp, n);727u32 opcode;728729qp->s_cur = n;730731/*732* If we are starting the request from the beginning,733* let the normal send code handle initialization.734*/735if (ipath_cmp24(psn, wqe->psn) <= 0) {736qp->s_state = OP(SEND_LAST);737goto done;738}739740/* Find the work request opcode corresponding to the given PSN. */741opcode = wqe->wr.opcode;742for (;;) {743int diff;744745if (++n == qp->s_size)746n = 0;747if (n == qp->s_tail)748break;749wqe = get_swqe_ptr(qp, n);750diff = ipath_cmp24(psn, wqe->psn);751if (diff < 0)752break;753qp->s_cur = n;754/*755* If we are starting the request from the beginning,756* let the normal send code handle initialization.757*/758if (diff == 0) {759qp->s_state = OP(SEND_LAST);760goto done;761}762opcode = wqe->wr.opcode;763}764765/*766* Set the state to restart in the middle of a request.767* Don't change the s_sge, s_cur_sge, or s_cur_size.768* See ipath_make_rc_req().769*/770switch (opcode) {771case IB_WR_SEND:772case IB_WR_SEND_WITH_IMM:773qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);774break;775776case IB_WR_RDMA_WRITE:777case IB_WR_RDMA_WRITE_WITH_IMM:778qp->s_state = OP(RDMA_READ_RESPONSE_LAST);779break;780781case IB_WR_RDMA_READ:782qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);783break;784785default:786/*787* This case shouldn't happen since its only788* one PSN per req.789*/790qp->s_state = OP(SEND_LAST);791}792done:793qp->s_psn = psn;794}795796/**797* ipath_restart_rc - back up requester to resend the last un-ACKed request798* @qp: the QP to restart799* @psn: packet sequence number for the request800* @wc: the work completion request801*802* The QP s_lock should be held and interrupts disabled.803*/804void ipath_restart_rc(struct ipath_qp *qp, u32 psn)805{806struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);807struct ipath_ibdev *dev;808809if (qp->s_retry == 0) {810ipath_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);811ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);812goto bail;813}814qp->s_retry--;815816/*817* Remove the QP from the timeout queue.818* Note: it may already have been removed by ipath_ib_timer().819*/820dev = to_idev(qp->ibqp.device);821spin_lock(&dev->pending_lock);822if (!list_empty(&qp->timerwait))823list_del_init(&qp->timerwait);824if (!list_empty(&qp->piowait))825list_del_init(&qp->piowait);826spin_unlock(&dev->pending_lock);827828if (wqe->wr.opcode == IB_WR_RDMA_READ)829dev->n_rc_resends++;830else831dev->n_rc_resends += (qp->s_psn - psn) & IPATH_PSN_MASK;832833reset_psn(qp, psn);834ipath_schedule_send(qp);835836bail:837return;838}839840static inline void update_last_psn(struct ipath_qp *qp, u32 psn)841{842qp->s_last_psn = psn;843}844845/**846* do_rc_ack - process an incoming RC ACK847* @qp: the QP the ACK came in on848* @psn: the packet sequence number of the ACK849* @opcode: the opcode of the request that resulted in the ACK850*851* This is called from ipath_rc_rcv_resp() to process an incoming RC ACK852* for the given QP.853* Called at interrupt level with the QP s_lock held and interrupts disabled.854* Returns 1 if OK, 0 if current operation should be aborted (NAK).855*/856static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,857u64 val)858{859struct ipath_ibdev *dev = to_idev(qp->ibqp.device);860struct ib_wc wc;861enum ib_wc_status status;862struct ipath_swqe *wqe;863int ret = 0;864u32 ack_psn;865int diff;866867/*868* Remove the QP from the timeout queue (or RNR timeout queue).869* If ipath_ib_timer() has already removed it,870* it's OK since we hold the QP s_lock and ipath_restart_rc()871* just won't find anything to restart if we ACK everything.872*/873spin_lock(&dev->pending_lock);874if (!list_empty(&qp->timerwait))875list_del_init(&qp->timerwait);876spin_unlock(&dev->pending_lock);877878/*879* Note that NAKs implicitly ACK outstanding SEND and RDMA write880* requests and implicitly NAK RDMA read and atomic requests issued881* before the NAK'ed request. The MSN won't include the NAK'ed882* request but will include an ACK'ed request(s).883*/884ack_psn = psn;885if (aeth >> 29)886ack_psn--;887wqe = get_swqe_ptr(qp, qp->s_last);888889/*890* The MSN might be for a later WQE than the PSN indicates so891* only complete WQEs that the PSN finishes.892*/893while ((diff = ipath_cmp24(ack_psn, wqe->lpsn)) >= 0) {894/*895* RDMA_READ_RESPONSE_ONLY is a special case since896* we want to generate completion events for everything897* before the RDMA read, copy the data, then generate898* the completion for the read.899*/900if (wqe->wr.opcode == IB_WR_RDMA_READ &&901opcode == OP(RDMA_READ_RESPONSE_ONLY) &&902diff == 0) {903ret = 1;904goto bail;905}906/*907* If this request is a RDMA read or atomic, and the ACK is908* for a later operation, this ACK NAKs the RDMA read or909* atomic. In other words, only a RDMA_READ_LAST or ONLY910* can ACK a RDMA read and likewise for atomic ops. Note911* that the NAK case can only happen if relaxed ordering is912* used and requests are sent after an RDMA read or atomic913* is sent but before the response is received.914*/915if ((wqe->wr.opcode == IB_WR_RDMA_READ &&916(opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||917((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||918wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&919(opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {920/*921* The last valid PSN seen is the previous922* request's.923*/924update_last_psn(qp, wqe->psn - 1);925/* Retry this request. */926ipath_restart_rc(qp, wqe->psn);927/*928* No need to process the ACK/NAK since we are929* restarting an earlier request.930*/931goto bail;932}933if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||934wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)935*(u64 *) wqe->sg_list[0].vaddr = val;936if (qp->s_num_rd_atomic &&937(wqe->wr.opcode == IB_WR_RDMA_READ ||938wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||939wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {940qp->s_num_rd_atomic--;941/* Restart sending task if fence is complete */942if (((qp->s_flags & IPATH_S_FENCE_PENDING) &&943!qp->s_num_rd_atomic) ||944qp->s_flags & IPATH_S_RDMAR_PENDING)945ipath_schedule_send(qp);946}947/* Post a send completion queue entry if requested. */948if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||949(wqe->wr.send_flags & IB_SEND_SIGNALED)) {950memset(&wc, 0, sizeof wc);951wc.wr_id = wqe->wr.wr_id;952wc.status = IB_WC_SUCCESS;953wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];954wc.byte_len = wqe->length;955wc.qp = &qp->ibqp;956wc.src_qp = qp->remote_qpn;957wc.slid = qp->remote_ah_attr.dlid;958wc.sl = qp->remote_ah_attr.sl;959ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);960}961qp->s_retry = qp->s_retry_cnt;962/*963* If we are completing a request which is in the process of964* being resent, we can stop resending it since we know the965* responder has already seen it.966*/967if (qp->s_last == qp->s_cur) {968if (++qp->s_cur >= qp->s_size)969qp->s_cur = 0;970qp->s_last = qp->s_cur;971if (qp->s_last == qp->s_tail)972break;973wqe = get_swqe_ptr(qp, qp->s_cur);974qp->s_state = OP(SEND_LAST);975qp->s_psn = wqe->psn;976} else {977if (++qp->s_last >= qp->s_size)978qp->s_last = 0;979if (qp->state == IB_QPS_SQD && qp->s_last == qp->s_cur)980qp->s_draining = 0;981if (qp->s_last == qp->s_tail)982break;983wqe = get_swqe_ptr(qp, qp->s_last);984}985}986987switch (aeth >> 29) {988case 0: /* ACK */989dev->n_rc_acks++;990/* If this is a partial ACK, reset the retransmit timer. */991if (qp->s_last != qp->s_tail) {992spin_lock(&dev->pending_lock);993if (list_empty(&qp->timerwait))994list_add_tail(&qp->timerwait,995&dev->pending[dev->pending_index]);996spin_unlock(&dev->pending_lock);997/*998* If we get a partial ACK for a resent operation,999* we can stop resending the earlier packets and1000* continue with the next packet the receiver wants.1001*/1002if (ipath_cmp24(qp->s_psn, psn) <= 0) {1003reset_psn(qp, psn + 1);1004ipath_schedule_send(qp);1005}1006} else if (ipath_cmp24(qp->s_psn, psn) <= 0) {1007qp->s_state = OP(SEND_LAST);1008qp->s_psn = psn + 1;1009}1010ipath_get_credit(qp, aeth);1011qp->s_rnr_retry = qp->s_rnr_retry_cnt;1012qp->s_retry = qp->s_retry_cnt;1013update_last_psn(qp, psn);1014ret = 1;1015goto bail;10161017case 1: /* RNR NAK */1018dev->n_rnr_naks++;1019if (qp->s_last == qp->s_tail)1020goto bail;1021if (qp->s_rnr_retry == 0) {1022status = IB_WC_RNR_RETRY_EXC_ERR;1023goto class_b;1024}1025if (qp->s_rnr_retry_cnt < 7)1026qp->s_rnr_retry--;10271028/* The last valid PSN is the previous PSN. */1029update_last_psn(qp, psn - 1);10301031if (wqe->wr.opcode == IB_WR_RDMA_READ)1032dev->n_rc_resends++;1033else1034dev->n_rc_resends +=1035(qp->s_psn - psn) & IPATH_PSN_MASK;10361037reset_psn(qp, psn);10381039qp->s_rnr_timeout =1040ib_ipath_rnr_table[(aeth >> IPATH_AETH_CREDIT_SHIFT) &1041IPATH_AETH_CREDIT_MASK];1042ipath_insert_rnr_queue(qp);1043ipath_schedule_send(qp);1044goto bail;10451046case 3: /* NAK */1047if (qp->s_last == qp->s_tail)1048goto bail;1049/* The last valid PSN is the previous PSN. */1050update_last_psn(qp, psn - 1);1051switch ((aeth >> IPATH_AETH_CREDIT_SHIFT) &1052IPATH_AETH_CREDIT_MASK) {1053case 0: /* PSN sequence error */1054dev->n_seq_naks++;1055/*1056* Back up to the responder's expected PSN.1057* Note that we might get a NAK in the middle of an1058* RDMA READ response which terminates the RDMA1059* READ.1060*/1061ipath_restart_rc(qp, psn);1062break;10631064case 1: /* Invalid Request */1065status = IB_WC_REM_INV_REQ_ERR;1066dev->n_other_naks++;1067goto class_b;10681069case 2: /* Remote Access Error */1070status = IB_WC_REM_ACCESS_ERR;1071dev->n_other_naks++;1072goto class_b;10731074case 3: /* Remote Operation Error */1075status = IB_WC_REM_OP_ERR;1076dev->n_other_naks++;1077class_b:1078ipath_send_complete(qp, wqe, status);1079ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);1080break;10811082default:1083/* Ignore other reserved NAK error codes */1084goto reserved;1085}1086qp->s_rnr_retry = qp->s_rnr_retry_cnt;1087goto bail;10881089default: /* 2: reserved */1090reserved:1091/* Ignore reserved NAK codes. */1092goto bail;1093}10941095bail:1096return ret;1097}10981099/**1100* ipath_rc_rcv_resp - process an incoming RC response packet1101* @dev: the device this packet came in on1102* @ohdr: the other headers for this packet1103* @data: the packet data1104* @tlen: the packet length1105* @qp: the QP for this packet1106* @opcode: the opcode for this packet1107* @psn: the packet sequence number for this packet1108* @hdrsize: the header length1109* @pmtu: the path MTU1110* @header_in_data: true if part of the header data is in the data buffer1111*1112* This is called from ipath_rc_rcv() to process an incoming RC response1113* packet for the given QP.1114* Called at interrupt level.1115*/1116static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,1117struct ipath_other_headers *ohdr,1118void *data, u32 tlen,1119struct ipath_qp *qp,1120u32 opcode,1121u32 psn, u32 hdrsize, u32 pmtu,1122int header_in_data)1123{1124struct ipath_swqe *wqe;1125enum ib_wc_status status;1126unsigned long flags;1127int diff;1128u32 pad;1129u32 aeth;1130u64 val;11311132spin_lock_irqsave(&qp->s_lock, flags);11331134/* Double check we can process this now that we hold the s_lock. */1135if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))1136goto ack_done;11371138/* Ignore invalid responses. */1139if (ipath_cmp24(psn, qp->s_next_psn) >= 0)1140goto ack_done;11411142/* Ignore duplicate responses. */1143diff = ipath_cmp24(psn, qp->s_last_psn);1144if (unlikely(diff <= 0)) {1145/* Update credits for "ghost" ACKs */1146if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {1147if (!header_in_data)1148aeth = be32_to_cpu(ohdr->u.aeth);1149else {1150aeth = be32_to_cpu(((__be32 *) data)[0]);1151data += sizeof(__be32);1152}1153if ((aeth >> 29) == 0)1154ipath_get_credit(qp, aeth);1155}1156goto ack_done;1157}11581159if (unlikely(qp->s_last == qp->s_tail))1160goto ack_done;1161wqe = get_swqe_ptr(qp, qp->s_last);1162status = IB_WC_SUCCESS;11631164switch (opcode) {1165case OP(ACKNOWLEDGE):1166case OP(ATOMIC_ACKNOWLEDGE):1167case OP(RDMA_READ_RESPONSE_FIRST):1168if (!header_in_data)1169aeth = be32_to_cpu(ohdr->u.aeth);1170else {1171aeth = be32_to_cpu(((__be32 *) data)[0]);1172data += sizeof(__be32);1173}1174if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {1175if (!header_in_data) {1176__be32 *p = ohdr->u.at.atomic_ack_eth;11771178val = ((u64) be32_to_cpu(p[0]) << 32) |1179be32_to_cpu(p[1]);1180} else1181val = be64_to_cpu(((__be64 *) data)[0]);1182} else1183val = 0;1184if (!do_rc_ack(qp, aeth, psn, opcode, val) ||1185opcode != OP(RDMA_READ_RESPONSE_FIRST))1186goto ack_done;1187hdrsize += 4;1188wqe = get_swqe_ptr(qp, qp->s_last);1189if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))1190goto ack_op_err;1191qp->r_flags &= ~IPATH_R_RDMAR_SEQ;1192/*1193* If this is a response to a resent RDMA read, we1194* have to be careful to copy the data to the right1195* location.1196*/1197qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,1198wqe, psn, pmtu);1199goto read_middle;12001201case OP(RDMA_READ_RESPONSE_MIDDLE):1202/* no AETH, no ACK */1203if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {1204dev->n_rdma_seq++;1205if (qp->r_flags & IPATH_R_RDMAR_SEQ)1206goto ack_done;1207qp->r_flags |= IPATH_R_RDMAR_SEQ;1208ipath_restart_rc(qp, qp->s_last_psn + 1);1209goto ack_done;1210}1211if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))1212goto ack_op_err;1213read_middle:1214if (unlikely(tlen != (hdrsize + pmtu + 4)))1215goto ack_len_err;1216if (unlikely(pmtu >= qp->s_rdma_read_len))1217goto ack_len_err;12181219/* We got a response so update the timeout. */1220spin_lock(&dev->pending_lock);1221if (qp->s_rnr_timeout == 0 && !list_empty(&qp->timerwait))1222list_move_tail(&qp->timerwait,1223&dev->pending[dev->pending_index]);1224spin_unlock(&dev->pending_lock);12251226if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))1227qp->s_retry = qp->s_retry_cnt;12281229/*1230* Update the RDMA receive state but do the copy w/o1231* holding the locks and blocking interrupts.1232*/1233qp->s_rdma_read_len -= pmtu;1234update_last_psn(qp, psn);1235spin_unlock_irqrestore(&qp->s_lock, flags);1236ipath_copy_sge(&qp->s_rdma_read_sge, data, pmtu);1237goto bail;12381239case OP(RDMA_READ_RESPONSE_ONLY):1240if (!header_in_data)1241aeth = be32_to_cpu(ohdr->u.aeth);1242else1243aeth = be32_to_cpu(((__be32 *) data)[0]);1244if (!do_rc_ack(qp, aeth, psn, opcode, 0))1245goto ack_done;1246/* Get the number of bytes the message was padded by. */1247pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;1248/*1249* Check that the data size is >= 0 && <= pmtu.1250* Remember to account for the AETH header (4) and1251* ICRC (4).1252*/1253if (unlikely(tlen < (hdrsize + pad + 8)))1254goto ack_len_err;1255/*1256* If this is a response to a resent RDMA read, we1257* have to be careful to copy the data to the right1258* location.1259*/1260wqe = get_swqe_ptr(qp, qp->s_last);1261qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,1262wqe, psn, pmtu);1263goto read_last;12641265case OP(RDMA_READ_RESPONSE_LAST):1266/* ACKs READ req. */1267if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {1268dev->n_rdma_seq++;1269if (qp->r_flags & IPATH_R_RDMAR_SEQ)1270goto ack_done;1271qp->r_flags |= IPATH_R_RDMAR_SEQ;1272ipath_restart_rc(qp, qp->s_last_psn + 1);1273goto ack_done;1274}1275if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))1276goto ack_op_err;1277/* Get the number of bytes the message was padded by. */1278pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;1279/*1280* Check that the data size is >= 1 && <= pmtu.1281* Remember to account for the AETH header (4) and1282* ICRC (4).1283*/1284if (unlikely(tlen <= (hdrsize + pad + 8)))1285goto ack_len_err;1286read_last:1287tlen -= hdrsize + pad + 8;1288if (unlikely(tlen != qp->s_rdma_read_len))1289goto ack_len_err;1290if (!header_in_data)1291aeth = be32_to_cpu(ohdr->u.aeth);1292else {1293aeth = be32_to_cpu(((__be32 *) data)[0]);1294data += sizeof(__be32);1295}1296ipath_copy_sge(&qp->s_rdma_read_sge, data, tlen);1297(void) do_rc_ack(qp, aeth, psn,1298OP(RDMA_READ_RESPONSE_LAST), 0);1299goto ack_done;1300}13011302ack_op_err:1303status = IB_WC_LOC_QP_OP_ERR;1304goto ack_err;13051306ack_len_err:1307status = IB_WC_LOC_LEN_ERR;1308ack_err:1309ipath_send_complete(qp, wqe, status);1310ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);1311ack_done:1312spin_unlock_irqrestore(&qp->s_lock, flags);1313bail:1314return;1315}13161317/**1318* ipath_rc_rcv_error - process an incoming duplicate or error RC packet1319* @dev: the device this packet came in on1320* @ohdr: the other headers for this packet1321* @data: the packet data1322* @qp: the QP for this packet1323* @opcode: the opcode for this packet1324* @psn: the packet sequence number for this packet1325* @diff: the difference between the PSN and the expected PSN1326* @header_in_data: true if part of the header data is in the data buffer1327*1328* This is called from ipath_rc_rcv() to process an unexpected1329* incoming RC packet for the given QP.1330* Called at interrupt level.1331* Return 1 if no more processing is needed; otherwise return 0 to1332* schedule a response to be sent.1333*/1334static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,1335struct ipath_other_headers *ohdr,1336void *data,1337struct ipath_qp *qp,1338u32 opcode,1339u32 psn,1340int diff,1341int header_in_data)1342{1343struct ipath_ack_entry *e;1344u8 i, prev;1345int old_req;1346unsigned long flags;13471348if (diff > 0) {1349/*1350* Packet sequence error.1351* A NAK will ACK earlier sends and RDMA writes.1352* Don't queue the NAK if we already sent one.1353*/1354if (!qp->r_nak_state) {1355qp->r_nak_state = IB_NAK_PSN_ERROR;1356/* Use the expected PSN. */1357qp->r_ack_psn = qp->r_psn;1358goto send_ack;1359}1360goto done;1361}13621363/*1364* Handle a duplicate request. Don't re-execute SEND, RDMA1365* write or atomic op. Don't NAK errors, just silently drop1366* the duplicate request. Note that r_sge, r_len, and1367* r_rcv_len may be in use so don't modify them.1368*1369* We are supposed to ACK the earliest duplicate PSN but we1370* can coalesce an outstanding duplicate ACK. We have to1371* send the earliest so that RDMA reads can be restarted at1372* the requester's expected PSN.1373*1374* First, find where this duplicate PSN falls within the1375* ACKs previously sent.1376*/1377psn &= IPATH_PSN_MASK;1378e = NULL;1379old_req = 1;13801381spin_lock_irqsave(&qp->s_lock, flags);1382/* Double check we can process this now that we hold the s_lock. */1383if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))1384goto unlock_done;13851386for (i = qp->r_head_ack_queue; ; i = prev) {1387if (i == qp->s_tail_ack_queue)1388old_req = 0;1389if (i)1390prev = i - 1;1391else1392prev = IPATH_MAX_RDMA_ATOMIC;1393if (prev == qp->r_head_ack_queue) {1394e = NULL;1395break;1396}1397e = &qp->s_ack_queue[prev];1398if (!e->opcode) {1399e = NULL;1400break;1401}1402if (ipath_cmp24(psn, e->psn) >= 0) {1403if (prev == qp->s_tail_ack_queue)1404old_req = 0;1405break;1406}1407}1408switch (opcode) {1409case OP(RDMA_READ_REQUEST): {1410struct ib_reth *reth;1411u32 offset;1412u32 len;14131414/*1415* If we didn't find the RDMA read request in the ack queue,1416* or the send tasklet is already backed up to send an1417* earlier entry, we can ignore this request.1418*/1419if (!e || e->opcode != OP(RDMA_READ_REQUEST) || old_req)1420goto unlock_done;1421/* RETH comes after BTH */1422if (!header_in_data)1423reth = &ohdr->u.rc.reth;1424else {1425reth = (struct ib_reth *)data;1426data += sizeof(*reth);1427}1428/*1429* Address range must be a subset of the original1430* request and start on pmtu boundaries.1431* We reuse the old ack_queue slot since the requester1432* should not back up and request an earlier PSN for the1433* same request.1434*/1435offset = ((psn - e->psn) & IPATH_PSN_MASK) *1436ib_mtu_enum_to_int(qp->path_mtu);1437len = be32_to_cpu(reth->length);1438if (unlikely(offset + len > e->rdma_sge.sge.sge_length))1439goto unlock_done;1440if (len != 0) {1441u32 rkey = be32_to_cpu(reth->rkey);1442u64 vaddr = be64_to_cpu(reth->vaddr);1443int ok;14441445ok = ipath_rkey_ok(qp, &e->rdma_sge,1446len, vaddr, rkey,1447IB_ACCESS_REMOTE_READ);1448if (unlikely(!ok))1449goto unlock_done;1450} else {1451e->rdma_sge.sg_list = NULL;1452e->rdma_sge.num_sge = 0;1453e->rdma_sge.sge.mr = NULL;1454e->rdma_sge.sge.vaddr = NULL;1455e->rdma_sge.sge.length = 0;1456e->rdma_sge.sge.sge_length = 0;1457}1458e->psn = psn;1459qp->s_ack_state = OP(ACKNOWLEDGE);1460qp->s_tail_ack_queue = prev;1461break;1462}14631464case OP(COMPARE_SWAP):1465case OP(FETCH_ADD): {1466/*1467* If we didn't find the atomic request in the ack queue1468* or the send tasklet is already backed up to send an1469* earlier entry, we can ignore this request.1470*/1471if (!e || e->opcode != (u8) opcode || old_req)1472goto unlock_done;1473qp->s_ack_state = OP(ACKNOWLEDGE);1474qp->s_tail_ack_queue = prev;1475break;1476}14771478default:1479if (old_req)1480goto unlock_done;1481/*1482* Resend the most recent ACK if this request is1483* after all the previous RDMA reads and atomics.1484*/1485if (i == qp->r_head_ack_queue) {1486spin_unlock_irqrestore(&qp->s_lock, flags);1487qp->r_nak_state = 0;1488qp->r_ack_psn = qp->r_psn - 1;1489goto send_ack;1490}1491/*1492* Try to send a simple ACK to work around a Mellanox bug1493* which doesn't accept a RDMA read response or atomic1494* response as an ACK for earlier SENDs or RDMA writes.1495*/1496if (qp->r_head_ack_queue == qp->s_tail_ack_queue &&1497!(qp->s_flags & IPATH_S_ACK_PENDING) &&1498qp->s_ack_state == OP(ACKNOWLEDGE)) {1499spin_unlock_irqrestore(&qp->s_lock, flags);1500qp->r_nak_state = 0;1501qp->r_ack_psn = qp->s_ack_queue[i].psn - 1;1502goto send_ack;1503}1504/*1505* Resend the RDMA read or atomic op which1506* ACKs this duplicate request.1507*/1508qp->s_ack_state = OP(ACKNOWLEDGE);1509qp->s_tail_ack_queue = i;1510break;1511}1512qp->r_nak_state = 0;1513ipath_schedule_send(qp);15141515unlock_done:1516spin_unlock_irqrestore(&qp->s_lock, flags);1517done:1518return 1;15191520send_ack:1521return 0;1522}15231524void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err)1525{1526unsigned long flags;1527int lastwqe;15281529spin_lock_irqsave(&qp->s_lock, flags);1530lastwqe = ipath_error_qp(qp, err);1531spin_unlock_irqrestore(&qp->s_lock, flags);15321533if (lastwqe) {1534struct ib_event ev;15351536ev.device = qp->ibqp.device;1537ev.element.qp = &qp->ibqp;1538ev.event = IB_EVENT_QP_LAST_WQE_REACHED;1539qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);1540}1541}15421543static inline void ipath_update_ack_queue(struct ipath_qp *qp, unsigned n)1544{1545unsigned next;15461547next = n + 1;1548if (next > IPATH_MAX_RDMA_ATOMIC)1549next = 0;1550if (n == qp->s_tail_ack_queue) {1551qp->s_tail_ack_queue = next;1552qp->s_ack_state = OP(ACKNOWLEDGE);1553}1554}15551556/**1557* ipath_rc_rcv - process an incoming RC packet1558* @dev: the device this packet came in on1559* @hdr: the header of this packet1560* @has_grh: true if the header has a GRH1561* @data: the packet data1562* @tlen: the packet length1563* @qp: the QP for this packet1564*1565* This is called from ipath_qp_rcv() to process an incoming RC packet1566* for the given QP.1567* Called at interrupt level.1568*/1569void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,1570int has_grh, void *data, u32 tlen, struct ipath_qp *qp)1571{1572struct ipath_other_headers *ohdr;1573u32 opcode;1574u32 hdrsize;1575u32 psn;1576u32 pad;1577struct ib_wc wc;1578u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);1579int diff;1580struct ib_reth *reth;1581int header_in_data;1582unsigned long flags;15831584/* Validate the SLID. See Ch. 9.6.1.5 */1585if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid))1586goto done;15871588/* Check for GRH */1589if (!has_grh) {1590ohdr = &hdr->u.oth;1591hdrsize = 8 + 12; /* LRH + BTH */1592psn = be32_to_cpu(ohdr->bth[2]);1593header_in_data = 0;1594} else {1595ohdr = &hdr->u.l.oth;1596hdrsize = 8 + 40 + 12; /* LRH + GRH + BTH */1597/*1598* The header with GRH is 60 bytes and the core driver sets1599* the eager header buffer size to 56 bytes so the last 41600* bytes of the BTH header (PSN) is in the data buffer.1601*/1602header_in_data = dev->dd->ipath_rcvhdrentsize == 16;1603if (header_in_data) {1604psn = be32_to_cpu(((__be32 *) data)[0]);1605data += sizeof(__be32);1606} else1607psn = be32_to_cpu(ohdr->bth[2]);1608}16091610/*1611* Process responses (ACKs) before anything else. Note that the1612* packet sequence number will be for something in the send work1613* queue rather than the expected receive packet sequence number.1614* In other words, this QP is the requester.1615*/1616opcode = be32_to_cpu(ohdr->bth[0]) >> 24;1617if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&1618opcode <= OP(ATOMIC_ACKNOWLEDGE)) {1619ipath_rc_rcv_resp(dev, ohdr, data, tlen, qp, opcode, psn,1620hdrsize, pmtu, header_in_data);1621goto done;1622}16231624/* Compute 24 bits worth of difference. */1625diff = ipath_cmp24(psn, qp->r_psn);1626if (unlikely(diff)) {1627if (ipath_rc_rcv_error(dev, ohdr, data, qp, opcode,1628psn, diff, header_in_data))1629goto done;1630goto send_ack;1631}16321633/* Check for opcode sequence errors. */1634switch (qp->r_state) {1635case OP(SEND_FIRST):1636case OP(SEND_MIDDLE):1637if (opcode == OP(SEND_MIDDLE) ||1638opcode == OP(SEND_LAST) ||1639opcode == OP(SEND_LAST_WITH_IMMEDIATE))1640break;1641goto nack_inv;16421643case OP(RDMA_WRITE_FIRST):1644case OP(RDMA_WRITE_MIDDLE):1645if (opcode == OP(RDMA_WRITE_MIDDLE) ||1646opcode == OP(RDMA_WRITE_LAST) ||1647opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))1648break;1649goto nack_inv;16501651default:1652if (opcode == OP(SEND_MIDDLE) ||1653opcode == OP(SEND_LAST) ||1654opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||1655opcode == OP(RDMA_WRITE_MIDDLE) ||1656opcode == OP(RDMA_WRITE_LAST) ||1657opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))1658goto nack_inv;1659/*1660* Note that it is up to the requester to not send a new1661* RDMA read or atomic operation before receiving an ACK1662* for the previous operation.1663*/1664break;1665}16661667memset(&wc, 0, sizeof wc);16681669/* OK, process the packet. */1670switch (opcode) {1671case OP(SEND_FIRST):1672if (!ipath_get_rwqe(qp, 0))1673goto rnr_nak;1674qp->r_rcv_len = 0;1675/* FALLTHROUGH */1676case OP(SEND_MIDDLE):1677case OP(RDMA_WRITE_MIDDLE):1678send_middle:1679/* Check for invalid length PMTU or posted rwqe len. */1680if (unlikely(tlen != (hdrsize + pmtu + 4)))1681goto nack_inv;1682qp->r_rcv_len += pmtu;1683if (unlikely(qp->r_rcv_len > qp->r_len))1684goto nack_inv;1685ipath_copy_sge(&qp->r_sge, data, pmtu);1686break;16871688case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):1689/* consume RWQE */1690if (!ipath_get_rwqe(qp, 1))1691goto rnr_nak;1692goto send_last_imm;16931694case OP(SEND_ONLY):1695case OP(SEND_ONLY_WITH_IMMEDIATE):1696if (!ipath_get_rwqe(qp, 0))1697goto rnr_nak;1698qp->r_rcv_len = 0;1699if (opcode == OP(SEND_ONLY))1700goto send_last;1701/* FALLTHROUGH */1702case OP(SEND_LAST_WITH_IMMEDIATE):1703send_last_imm:1704if (header_in_data) {1705wc.ex.imm_data = *(__be32 *) data;1706data += sizeof(__be32);1707} else {1708/* Immediate data comes after BTH */1709wc.ex.imm_data = ohdr->u.imm_data;1710}1711hdrsize += 4;1712wc.wc_flags = IB_WC_WITH_IMM;1713/* FALLTHROUGH */1714case OP(SEND_LAST):1715case OP(RDMA_WRITE_LAST):1716send_last:1717/* Get the number of bytes the message was padded by. */1718pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;1719/* Check for invalid length. */1720/* XXX LAST len should be >= 1 */1721if (unlikely(tlen < (hdrsize + pad + 4)))1722goto nack_inv;1723/* Don't count the CRC. */1724tlen -= (hdrsize + pad + 4);1725wc.byte_len = tlen + qp->r_rcv_len;1726if (unlikely(wc.byte_len > qp->r_len))1727goto nack_inv;1728ipath_copy_sge(&qp->r_sge, data, tlen);1729qp->r_msn++;1730if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))1731break;1732wc.wr_id = qp->r_wr_id;1733wc.status = IB_WC_SUCCESS;1734if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||1735opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))1736wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;1737else1738wc.opcode = IB_WC_RECV;1739wc.qp = &qp->ibqp;1740wc.src_qp = qp->remote_qpn;1741wc.slid = qp->remote_ah_attr.dlid;1742wc.sl = qp->remote_ah_attr.sl;1743/* Signal completion event if the solicited bit is set. */1744ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,1745(ohdr->bth[0] &1746cpu_to_be32(1 << 23)) != 0);1747break;17481749case OP(RDMA_WRITE_FIRST):1750case OP(RDMA_WRITE_ONLY):1751case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):1752if (unlikely(!(qp->qp_access_flags &1753IB_ACCESS_REMOTE_WRITE)))1754goto nack_inv;1755/* consume RWQE */1756/* RETH comes after BTH */1757if (!header_in_data)1758reth = &ohdr->u.rc.reth;1759else {1760reth = (struct ib_reth *)data;1761data += sizeof(*reth);1762}1763hdrsize += sizeof(*reth);1764qp->r_len = be32_to_cpu(reth->length);1765qp->r_rcv_len = 0;1766if (qp->r_len != 0) {1767u32 rkey = be32_to_cpu(reth->rkey);1768u64 vaddr = be64_to_cpu(reth->vaddr);1769int ok;17701771/* Check rkey & NAK */1772ok = ipath_rkey_ok(qp, &qp->r_sge,1773qp->r_len, vaddr, rkey,1774IB_ACCESS_REMOTE_WRITE);1775if (unlikely(!ok))1776goto nack_acc;1777} else {1778qp->r_sge.sg_list = NULL;1779qp->r_sge.sge.mr = NULL;1780qp->r_sge.sge.vaddr = NULL;1781qp->r_sge.sge.length = 0;1782qp->r_sge.sge.sge_length = 0;1783}1784if (opcode == OP(RDMA_WRITE_FIRST))1785goto send_middle;1786else if (opcode == OP(RDMA_WRITE_ONLY))1787goto send_last;1788if (!ipath_get_rwqe(qp, 1))1789goto rnr_nak;1790goto send_last_imm;17911792case OP(RDMA_READ_REQUEST): {1793struct ipath_ack_entry *e;1794u32 len;1795u8 next;17961797if (unlikely(!(qp->qp_access_flags &1798IB_ACCESS_REMOTE_READ)))1799goto nack_inv;1800next = qp->r_head_ack_queue + 1;1801if (next > IPATH_MAX_RDMA_ATOMIC)1802next = 0;1803spin_lock_irqsave(&qp->s_lock, flags);1804/* Double check we can process this while holding the s_lock. */1805if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))1806goto unlock;1807if (unlikely(next == qp->s_tail_ack_queue)) {1808if (!qp->s_ack_queue[next].sent)1809goto nack_inv_unlck;1810ipath_update_ack_queue(qp, next);1811}1812e = &qp->s_ack_queue[qp->r_head_ack_queue];1813/* RETH comes after BTH */1814if (!header_in_data)1815reth = &ohdr->u.rc.reth;1816else {1817reth = (struct ib_reth *)data;1818data += sizeof(*reth);1819}1820len = be32_to_cpu(reth->length);1821if (len) {1822u32 rkey = be32_to_cpu(reth->rkey);1823u64 vaddr = be64_to_cpu(reth->vaddr);1824int ok;18251826/* Check rkey & NAK */1827ok = ipath_rkey_ok(qp, &e->rdma_sge, len, vaddr,1828rkey, IB_ACCESS_REMOTE_READ);1829if (unlikely(!ok))1830goto nack_acc_unlck;1831/*1832* Update the next expected PSN. We add 1 later1833* below, so only add the remainder here.1834*/1835if (len > pmtu)1836qp->r_psn += (len - 1) / pmtu;1837} else {1838e->rdma_sge.sg_list = NULL;1839e->rdma_sge.num_sge = 0;1840e->rdma_sge.sge.mr = NULL;1841e->rdma_sge.sge.vaddr = NULL;1842e->rdma_sge.sge.length = 0;1843e->rdma_sge.sge.sge_length = 0;1844}1845e->opcode = opcode;1846e->sent = 0;1847e->psn = psn;1848/*1849* We need to increment the MSN here instead of when we1850* finish sending the result since a duplicate request would1851* increment it more than once.1852*/1853qp->r_msn++;1854qp->r_psn++;1855qp->r_state = opcode;1856qp->r_nak_state = 0;1857qp->r_head_ack_queue = next;18581859/* Schedule the send tasklet. */1860ipath_schedule_send(qp);18611862goto unlock;1863}18641865case OP(COMPARE_SWAP):1866case OP(FETCH_ADD): {1867struct ib_atomic_eth *ateth;1868struct ipath_ack_entry *e;1869u64 vaddr;1870atomic64_t *maddr;1871u64 sdata;1872u32 rkey;1873u8 next;18741875if (unlikely(!(qp->qp_access_flags &1876IB_ACCESS_REMOTE_ATOMIC)))1877goto nack_inv;1878next = qp->r_head_ack_queue + 1;1879if (next > IPATH_MAX_RDMA_ATOMIC)1880next = 0;1881spin_lock_irqsave(&qp->s_lock, flags);1882/* Double check we can process this while holding the s_lock. */1883if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))1884goto unlock;1885if (unlikely(next == qp->s_tail_ack_queue)) {1886if (!qp->s_ack_queue[next].sent)1887goto nack_inv_unlck;1888ipath_update_ack_queue(qp, next);1889}1890if (!header_in_data)1891ateth = &ohdr->u.atomic_eth;1892else1893ateth = (struct ib_atomic_eth *)data;1894vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) |1895be32_to_cpu(ateth->vaddr[1]);1896if (unlikely(vaddr & (sizeof(u64) - 1)))1897goto nack_inv_unlck;1898rkey = be32_to_cpu(ateth->rkey);1899/* Check rkey & NAK */1900if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge,1901sizeof(u64), vaddr, rkey,1902IB_ACCESS_REMOTE_ATOMIC)))1903goto nack_acc_unlck;1904/* Perform atomic OP and save result. */1905maddr = (atomic64_t *) qp->r_sge.sge.vaddr;1906sdata = be64_to_cpu(ateth->swap_data);1907e = &qp->s_ack_queue[qp->r_head_ack_queue];1908e->atomic_data = (opcode == OP(FETCH_ADD)) ?1909(u64) atomic64_add_return(sdata, maddr) - sdata :1910(u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,1911be64_to_cpu(ateth->compare_data),1912sdata);1913e->opcode = opcode;1914e->sent = 0;1915e->psn = psn & IPATH_PSN_MASK;1916qp->r_msn++;1917qp->r_psn++;1918qp->r_state = opcode;1919qp->r_nak_state = 0;1920qp->r_head_ack_queue = next;19211922/* Schedule the send tasklet. */1923ipath_schedule_send(qp);19241925goto unlock;1926}19271928default:1929/* NAK unknown opcodes. */1930goto nack_inv;1931}1932qp->r_psn++;1933qp->r_state = opcode;1934qp->r_ack_psn = psn;1935qp->r_nak_state = 0;1936/* Send an ACK if requested or required. */1937if (psn & (1 << 31))1938goto send_ack;1939goto done;19401941rnr_nak:1942qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;1943qp->r_ack_psn = qp->r_psn;1944goto send_ack;19451946nack_inv_unlck:1947spin_unlock_irqrestore(&qp->s_lock, flags);1948nack_inv:1949ipath_rc_error(qp, IB_WC_LOC_QP_OP_ERR);1950qp->r_nak_state = IB_NAK_INVALID_REQUEST;1951qp->r_ack_psn = qp->r_psn;1952goto send_ack;19531954nack_acc_unlck:1955spin_unlock_irqrestore(&qp->s_lock, flags);1956nack_acc:1957ipath_rc_error(qp, IB_WC_LOC_PROT_ERR);1958qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;1959qp->r_ack_psn = qp->r_psn;1960send_ack:1961send_rc_ack(qp);1962goto done;19631964unlock:1965spin_unlock_irqrestore(&qp->s_lock, flags);1966done:1967return;1968}196919701971