/* sys/ofed/drivers/infiniband/ulp/sdp/sdp.h */
#ifndef _SDP_H_1#define _SDP_H_23#define LINUXKPI_PARAM_PREFIX ib_sdp_45#include "opt_ddb.h"6#include "opt_inet.h"7#include "opt_ofed.h"89#include <sys/param.h>10#include <sys/systm.h>11#include <sys/malloc.h>12#include <sys/kernel.h>13#include <sys/sysctl.h>14#include <sys/mbuf.h>15#include <sys/lock.h>16#include <sys/rwlock.h>17#include <sys/socket.h>18#include <sys/socketvar.h>19#include <sys/protosw.h>20#include <sys/proc.h>21#include <sys/jail.h>22#include <sys/domain.h>2324#ifdef DDB25#include <ddb/ddb.h>26#endif2728#include <net/if.h>29#include <net/if_var.h>30#include <net/route.h>31#include <net/vnet.h>3233#include <netinet/in.h>34#include <netinet/in_systm.h>35#include <netinet/in_var.h>36#include <netinet/in_pcb.h>37#include <netinet/tcp.h>38#include <netinet/tcp_fsm.h>39#include <netinet/tcp_timer.h>40#include <netinet/tcp_var.h>4142#include <linux/device.h>43#include <linux/err.h>44#include <linux/sched.h>45#include <linux/workqueue.h>46#include <linux/wait.h>47#include <linux/module.h>48#include <linux/moduleparam.h>49#include <linux/pci.h>5051#include <rdma/ib_verbs.h>52#include <rdma/rdma_cm.h>53#include <rdma/ib_cm.h>54#include <rdma/ib_fmr_pool.h>55#include <rdma/rdma_sdp.h>5657#ifdef SDP_DEBUG58#define CONFIG_INFINIBAND_SDP_DEBUG59#endif6061#include "sdp_dbg.h"6263#undef LIST_HEAD64/* From sys/queue.h */65#define LIST_HEAD(name, type) \66struct name { \67struct type *lh_first; /* first element */ \68}6970/* Interval between successive polls in the Tx routine when polling is used71instead of interrupts (in per-core Tx rings) - should be power of 2 */72#define SDP_TX_POLL_MODER 1673#define SDP_TX_POLL_TIMEOUT (HZ / 20)74#define SDP_NAGLE_TIMEOUT (HZ / 10)7576#define SDP_SRCAVAIL_CANCEL_TIMEOUT (HZ * 5)77#define SDP_SRCAVAIL_ADV_TIMEOUT (1 * HZ)78#define SDP_SRCAVAIL_PAYLOAD_LEN 17980#define SDP_RESOLVE_TIMEOUT 100081#define SDP_ROUTE_TIMEOUT 100082#define SDP_RETRY_COUNT 583#define SDP_KEEPALIVE_TIME (120 * 60 * HZ)84#define SDP_FIN_WAIT_TIMEOUT 
(60 * HZ) /* like TCP_FIN_TIMEOUT */8586#define SDP_TX_SIZE 0x4087#define SDP_RX_SIZE 0x408889#define SDP_FMR_SIZE (MIN(0x1000, PAGE_SIZE) / sizeof(u64))90#define SDP_FMR_POOL_SIZE 102491#define SDP_FMR_DIRTY_SIZE ( SDP_FMR_POOL_SIZE / 4 )9293#define SDP_MAX_RDMA_READ_LEN (PAGE_SIZE * (SDP_FMR_SIZE - 2))9495/* mb inlined data len - rest will be rx'ed into frags */96#define SDP_HEAD_SIZE (sizeof(struct sdp_bsdh))9798/* limit tx payload len, if the sink supports bigger buffers than the source99* can handle.100* or rx fragment size (limited by sge->length size) */101#define SDP_MAX_PACKET (1 << 16)102#define SDP_MAX_PAYLOAD (SDP_MAX_PACKET - SDP_HEAD_SIZE)103104#define SDP_MAX_RECV_SGES (SDP_MAX_PACKET / MCLBYTES)105#define SDP_MAX_SEND_SGES (SDP_MAX_PACKET / MCLBYTES) + 2106107#define SDP_NUM_WC 4108109#define SDP_DEF_ZCOPY_THRESH 64*1024110#define SDP_MIN_ZCOPY_THRESH PAGE_SIZE111#define SDP_MAX_ZCOPY_THRESH 1048576112113#define SDP_OP_RECV 0x800000000LL114#define SDP_OP_SEND 0x400000000LL115#define SDP_OP_RDMA 0x200000000LL116#define SDP_OP_NOP 0x100000000LL117118/* how long (in jiffies) to block sender till tx completion*/119#define SDP_BZCOPY_POLL_TIMEOUT (HZ / 10)120121#define SDP_AUTO_CONF 0xffff122#define AUTO_MOD_DELAY (HZ / 4)123124struct sdp_mb_cb {125__u32 seq; /* Starting sequence number */126struct bzcopy_state *bz;127struct rx_srcavail_state *rx_sa;128struct tx_srcavail_state *tx_sa;129};130131#define M_PUSH M_PROTO1 /* Do a 'push'. */132#define M_URG M_PROTO2 /* Mark as urgent (oob). */133134#define SDP_SKB_CB(__mb) ((struct sdp_mb_cb *)&((__mb)->cb[0]))135#define BZCOPY_STATE(mb) (SDP_SKB_CB(mb)->bz)136#define RX_SRCAVAIL_STATE(mb) (SDP_SKB_CB(mb)->rx_sa)137#define TX_SRCAVAIL_STATE(mb) (SDP_SKB_CB(mb)->tx_sa)138139#ifndef MIN140#define MIN(a, b) (a < b ? 
a : b)141#endif142143#define ring_head(ring) (atomic_read(&(ring).head))144#define ring_tail(ring) (atomic_read(&(ring).tail))145#define ring_posted(ring) (ring_head(ring) - ring_tail(ring))146147#define rx_ring_posted(ssk) ring_posted(ssk->rx_ring)148#ifdef SDP_ZCOPY149#define tx_ring_posted(ssk) (ring_posted(ssk->tx_ring) + \150(ssk->tx_ring.rdma_inflight ? ssk->tx_ring.rdma_inflight->busy : 0))151#else152#define tx_ring_posted(ssk) ring_posted(ssk->tx_ring)153#endif154155extern int sdp_zcopy_thresh;156extern int rcvbuf_initial_size;157extern struct workqueue_struct *rx_comp_wq;158extern struct ib_client sdp_client;159160enum sdp_mid {161SDP_MID_HELLO = 0x0,162SDP_MID_HELLO_ACK = 0x1,163SDP_MID_DISCONN = 0x2,164SDP_MID_ABORT = 0x3,165SDP_MID_SENDSM = 0x4,166SDP_MID_RDMARDCOMPL = 0x6,167SDP_MID_SRCAVAIL_CANCEL = 0x8,168SDP_MID_CHRCVBUF = 0xB,169SDP_MID_CHRCVBUF_ACK = 0xC,170SDP_MID_SINKAVAIL = 0xFD,171SDP_MID_SRCAVAIL = 0xFE,172SDP_MID_DATA = 0xFF,173};174175enum sdp_flags {176SDP_OOB_PRES = 1 << 0,177SDP_OOB_PEND = 1 << 1,178};179180enum {181SDP_MIN_TX_CREDITS = 2182};183184enum {185SDP_ERR_ERROR = -4,186SDP_ERR_FAULT = -3,187SDP_NEW_SEG = -2,188SDP_DO_WAIT_MEM = -1189};190191struct sdp_rrch {192__u32 len;193} __attribute__((__packed__));194195struct sdp_srcah {196__u32 len;197__u32 rkey;198__u64 vaddr;199} __attribute__((__packed__));200201struct sdp_buf {202struct mbuf *mb;203u64 mapping[SDP_MAX_SEND_SGES];204} __attribute__((__packed__));205206struct sdp_chrecvbuf {207u32 size;208} __attribute__((__packed__));209210/* Context used for synchronous zero copy bcopy (BZCOPY) */211struct bzcopy_state {212unsigned char __user *u_base;213int u_len;214int left;215int page_cnt;216int cur_page;217int cur_offset;218int busy;219struct sdp_sock *ssk;220struct page **pages;221};222223enum rx_sa_flag {224RX_SA_ABORTED = 2,225};226227enum tx_sa_flag {228TX_SA_SENDSM = 0x01,229TX_SA_CROSS_SEND = 0x02,230TX_SA_INTRRUPTED = 0x04,231TX_SA_TIMEDOUT = 0x08,232TX_SA_ERROR = 
0x10,233};234235struct rx_srcavail_state {236/* Advertised buffer stuff */237u32 mseq;238u32 used;239u32 reported;240u32 len;241u32 rkey;242u64 vaddr;243244/* Dest buff info */245struct ib_umem *umem;246struct ib_pool_fmr *fmr;247248/* Utility */249u8 busy;250enum rx_sa_flag flags;251};252253struct tx_srcavail_state {254/* Data below 'busy' will be reset */255u8 busy;256257struct ib_umem *umem;258struct ib_pool_fmr *fmr;259260u32 bytes_sent;261u32 bytes_acked;262263enum tx_sa_flag abort_flags;264u8 posted;265266u32 mseq;267};268269struct sdp_tx_ring {270#ifdef SDP_ZCOPY271struct rx_srcavail_state *rdma_inflight;272#endif273struct sdp_buf *buffer;274atomic_t head;275atomic_t tail;276struct ib_cq *cq;277278atomic_t credits;279#define tx_credits(ssk) (atomic_read(&ssk->tx_ring.credits))280281struct callout timer;282u16 poll_cnt;283};284285struct sdp_rx_ring {286struct sdp_buf *buffer;287atomic_t head;288atomic_t tail;289struct ib_cq *cq;290291int destroyed;292struct rwlock destroyed_lock;293};294295struct sdp_device {296struct ib_pd *pd;297struct ib_fmr_pool *fmr_pool;298};299300struct sdp_moderation {301unsigned long last_moder_packets;302unsigned long last_moder_tx_packets;303unsigned long last_moder_bytes;304unsigned long last_moder_jiffies;305int last_moder_time;306u16 rx_usecs;307u16 rx_frames;308u16 tx_usecs;309u32 pkt_rate_low;310u16 rx_usecs_low;311u32 pkt_rate_high;312u16 rx_usecs_high;313u16 sample_interval;314u16 adaptive_rx_coal;315u32 msg_enable;316317int moder_cnt;318int moder_time;319};320321/* These are flags fields. */322#define SDP_TIMEWAIT 0x0001 /* In ssk timewait state. */323#define SDP_DROPPED 0x0002 /* Socket has been dropped. */324#define SDP_SOCKREF 0x0004 /* Holding a sockref for close. */325#define SDP_NODELAY 0x0008 /* Disble nagle. */326#define SDP_NEEDFIN 0x0010 /* Send a fin on the next tx. */327#define SDP_DREQWAIT 0x0020 /* Waiting on DREQ. */328#define SDP_DESTROY 0x0040 /* Being destroyed. 
*/329#define SDP_DISCON 0x0080 /* rdma_disconnect is owed. */330331/* These are oobflags */332#define SDP_HADOOB 0x0001 /* Had OOB data. */333#define SDP_HAVEOOB 0x0002 /* Have OOB data. */334335struct sdp_sock {336LIST_ENTRY(sdp_sock) list;337struct socket *socket;338struct rdma_cm_id *id;339struct ib_device *ib_device;340struct sdp_device *sdp_dev;341struct ib_qp *qp;342struct ucred *cred;343struct callout keep2msl; /* 2msl and keepalive timer. */344struct callout nagle_timer; /* timeout waiting for ack */345struct ib_ucontext context;346in_port_t lport;347in_addr_t laddr;348in_port_t fport;349in_addr_t faddr;350int flags;351int oobflags; /* protected by rx lock. */352int state;353int softerror;354int recv_bytes; /* Bytes per recv. buf including header */355int xmit_size_goal;356char iobc;357358struct sdp_rx_ring rx_ring;359struct sdp_tx_ring tx_ring;360struct rwlock lock;361struct mbufq rxctlq; /* received control packets */362363int qp_active; /* XXX Flag. */364int max_sge;365struct work_struct rx_comp_work;366#define rcv_nxt(ssk) atomic_read(&(ssk->rcv_nxt))367atomic_t rcv_nxt;368369/* SDP specific */370atomic_t mseq_ack;371#define mseq_ack(ssk) (atomic_read(&ssk->mseq_ack))372unsigned max_bufs; /* Initial buffers offered by other side */373unsigned min_bufs; /* Low water mark to wake senders */374375unsigned long nagle_last_unacked; /* mseq of lastest unacked packet */376377atomic_t remote_credits;378#define remote_credits(ssk) (atomic_read(&ssk->remote_credits))379int poll_cq;380381/* SDP slow start */382int recv_request_head; /* mark the rx_head when the resize request383was received */384int recv_request; /* XXX flag if request to resize was received */385386unsigned long tx_packets;387unsigned long rx_packets;388unsigned long tx_bytes;389unsigned long rx_bytes;390struct sdp_moderation auto_mod;391struct task shutdown_task;392#ifdef SDP_ZCOPY393struct tx_srcavail_state *tx_sa;394struct rx_srcavail_state *rx_sa;395spinlock_t tx_sa_lock;396struct 
delayed_work srcavail_cancel_work;397int srcavail_cancel_mseq;398/* ZCOPY data: -1:use global; 0:disable zcopy; >0: zcopy threshold */399int zcopy_thresh;400#endif401};402403#define sdp_sk(so) ((struct sdp_sock *)(so->so_pcb))404405#define SDP_RLOCK(ssk) rw_rlock(&(ssk)->lock)406#define SDP_WLOCK(ssk) rw_wlock(&(ssk)->lock)407#define SDP_RUNLOCK(ssk) rw_runlock(&(ssk)->lock)408#define SDP_WUNLOCK(ssk) rw_wunlock(&(ssk)->lock)409#define SDP_WLOCK_ASSERT(ssk) rw_assert(&(ssk)->lock, RA_WLOCKED)410#define SDP_RLOCK_ASSERT(ssk) rw_assert(&(ssk)->lock, RA_RLOCKED)411#define SDP_LOCK_ASSERT(ssk) rw_assert(&(ssk)->lock, RA_LOCKED)412413MALLOC_DECLARE(M_SDP);414SYSCTL_DECL(_net_inet_sdp);415416static inline void tx_sa_reset(struct tx_srcavail_state *tx_sa)417{418memset((void *)&tx_sa->busy, 0,419sizeof(*tx_sa) - offsetof(typeof(*tx_sa), busy));420}421422static inline void rx_ring_unlock(struct sdp_rx_ring *rx_ring)423{424rw_runlock(&rx_ring->destroyed_lock);425}426427static inline int rx_ring_trylock(struct sdp_rx_ring *rx_ring)428{429rw_rlock(&rx_ring->destroyed_lock);430if (rx_ring->destroyed) {431rx_ring_unlock(rx_ring);432return 0;433}434return 1;435}436437static inline void rx_ring_destroy_lock(struct sdp_rx_ring *rx_ring)438{439rw_wlock(&rx_ring->destroyed_lock);440rx_ring->destroyed = 1;441rw_wunlock(&rx_ring->destroyed_lock);442}443444static inline void sdp_arm_rx_cq(struct sdp_sock *ssk)445{446sdp_prf(ssk->socket, NULL, "Arming RX cq");447sdp_dbg_data(ssk->socket, "Arming RX cq\n");448449ib_req_notify_cq(ssk->rx_ring.cq, IB_CQ_NEXT_COMP);450}451452static inline void sdp_arm_tx_cq(struct sdp_sock *ssk)453{454sdp_prf(ssk->socket, NULL, "Arming TX cq");455sdp_dbg_data(ssk->socket, "Arming TX cq. 
credits: %d, posted: %d\n",456tx_credits(ssk), tx_ring_posted(ssk));457458ib_req_notify_cq(ssk->tx_ring.cq, IB_CQ_NEXT_COMP);459}460461/* return the min of:462* - tx credits463* - free slots in tx_ring (not including SDP_MIN_TX_CREDITS464*/465static inline int tx_slots_free(struct sdp_sock *ssk)466{467int min_free;468469min_free = MIN(tx_credits(ssk),470SDP_TX_SIZE - tx_ring_posted(ssk));471if (min_free < SDP_MIN_TX_CREDITS)472return 0;473474return min_free - SDP_MIN_TX_CREDITS;475};476477/* utilities */478static inline char *mid2str(int mid)479{480#define ENUM2STR(e) [e] = #e481static char *mid2str[] = {482ENUM2STR(SDP_MID_HELLO),483ENUM2STR(SDP_MID_HELLO_ACK),484ENUM2STR(SDP_MID_ABORT),485ENUM2STR(SDP_MID_DISCONN),486ENUM2STR(SDP_MID_SENDSM),487ENUM2STR(SDP_MID_RDMARDCOMPL),488ENUM2STR(SDP_MID_SRCAVAIL_CANCEL),489ENUM2STR(SDP_MID_CHRCVBUF),490ENUM2STR(SDP_MID_CHRCVBUF_ACK),491ENUM2STR(SDP_MID_DATA),492ENUM2STR(SDP_MID_SRCAVAIL),493ENUM2STR(SDP_MID_SINKAVAIL),494};495496if (mid >= ARRAY_SIZE(mid2str))497return NULL;498499return mid2str[mid];500}501502static inline struct mbuf *503sdp_alloc_mb(struct socket *sk, u8 mid, int size, int wait)504{505struct sdp_bsdh *h;506struct mbuf *mb;507508MGETHDR(mb, wait, MT_DATA);509if (mb == NULL)510return (NULL);511mb->m_pkthdr.len = mb->m_len = sizeof(struct sdp_bsdh);512h = mtod(mb, struct sdp_bsdh *);513h->mid = mid;514515return mb;516}517static inline struct mbuf *518sdp_alloc_mb_data(struct socket *sk, int wait)519{520return sdp_alloc_mb(sk, SDP_MID_DATA, 0, wait);521}522523static inline struct mbuf *524sdp_alloc_mb_disconnect(struct socket *sk, int wait)525{526return sdp_alloc_mb(sk, SDP_MID_DISCONN, 0, wait);527}528529static inline void *530mb_put(struct mbuf *mb, int len)531{532uint8_t *data;533534data = mb->m_data;535data += mb->m_len;536mb->m_len += len;537return (void *)data;538}539540static inline struct mbuf *541sdp_alloc_mb_chrcvbuf_ack(struct socket *sk, int size, int wait)542{543struct mbuf *mb;544struct 
sdp_chrecvbuf *resp_size;545546mb = sdp_alloc_mb(sk, SDP_MID_CHRCVBUF_ACK, sizeof(*resp_size), wait);547if (mb == NULL)548return (NULL);549resp_size = (struct sdp_chrecvbuf *)mb_put(mb, sizeof *resp_size);550resp_size->size = htonl(size);551552return mb;553}554555static inline struct mbuf *556sdp_alloc_mb_srcavail(struct socket *sk, u32 len, u32 rkey, u64 vaddr, int wait)557{558struct mbuf *mb;559struct sdp_srcah *srcah;560561mb = sdp_alloc_mb(sk, SDP_MID_SRCAVAIL, sizeof(*srcah), wait);562if (mb == NULL)563return (NULL);564srcah = (struct sdp_srcah *)mb_put(mb, sizeof(*srcah));565srcah->len = htonl(len);566srcah->rkey = htonl(rkey);567srcah->vaddr = cpu_to_be64(vaddr);568569return mb;570}571572static inline struct mbuf *573sdp_alloc_mb_srcavail_cancel(struct socket *sk, int wait)574{575return sdp_alloc_mb(sk, SDP_MID_SRCAVAIL_CANCEL, 0, wait);576}577578static inline struct mbuf *579sdp_alloc_mb_rdmardcompl(struct socket *sk, u32 len, int wait)580{581struct mbuf *mb;582struct sdp_rrch *rrch;583584mb = sdp_alloc_mb(sk, SDP_MID_RDMARDCOMPL, sizeof(*rrch), wait);585if (mb == NULL)586return (NULL);587rrch = (struct sdp_rrch *)mb_put(mb, sizeof(*rrch));588rrch->len = htonl(len);589590return mb;591}592593static inline struct mbuf *594sdp_alloc_mb_sendsm(struct socket *sk, int wait)595{596return sdp_alloc_mb(sk, SDP_MID_SENDSM, 0, wait);597}598static inline int sdp_tx_ring_slots_left(struct sdp_sock *ssk)599{600return SDP_TX_SIZE - tx_ring_posted(ssk);601}602603static inline int credit_update_needed(struct sdp_sock *ssk)604{605int c;606607c = remote_credits(ssk);608if (likely(c > SDP_MIN_TX_CREDITS))609c += c/2;610return unlikely(c < rx_ring_posted(ssk)) &&611likely(tx_credits(ssk) > 0) &&612likely(sdp_tx_ring_slots_left(ssk));613}614615616#define SDPSTATS_COUNTER_INC(stat)617#define SDPSTATS_COUNTER_ADD(stat, val)618#define SDPSTATS_COUNTER_MID_INC(stat, mid)619#define SDPSTATS_HIST_LINEAR(stat, size)620#define SDPSTATS_HIST(stat, size)621622static inline 
void623sdp_cleanup_sdp_buf(struct sdp_sock *ssk, struct sdp_buf *sbuf,624enum dma_data_direction dir)625{626struct ib_device *dev;627struct mbuf *mb;628int i;629630dev = ssk->ib_device;631for (i = 0, mb = sbuf->mb; mb != NULL; mb = mb->m_next, i++)632ib_dma_unmap_single(dev, sbuf->mapping[i], mb->m_len, dir);633}634635/* sdp_main.c */636void sdp_set_default_moderation(struct sdp_sock *ssk);637void sdp_start_keepalive_timer(struct socket *sk);638void sdp_urg(struct sdp_sock *ssk, struct mbuf *mb);639void sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk);640void sdp_abort(struct socket *sk);641struct sdp_sock *sdp_notify(struct sdp_sock *ssk, int error);642643644/* sdp_cma.c */645int sdp_cma_handler(struct rdma_cm_id *, struct rdma_cm_event *);646647/* sdp_tx.c */648int sdp_tx_ring_create(struct sdp_sock *ssk, struct ib_device *device);649void sdp_tx_ring_destroy(struct sdp_sock *ssk);650int sdp_xmit_poll(struct sdp_sock *ssk, int force);651void sdp_post_send(struct sdp_sock *ssk, struct mbuf *mb);652void sdp_post_sends(struct sdp_sock *ssk, int wait);653void sdp_post_keepalive(struct sdp_sock *ssk);654655/* sdp_rx.c */656void sdp_rx_ring_init(struct sdp_sock *ssk);657int sdp_rx_ring_create(struct sdp_sock *ssk, struct ib_device *device);658void sdp_rx_ring_destroy(struct sdp_sock *ssk);659int sdp_resize_buffers(struct sdp_sock *ssk, u32 new_size);660int sdp_init_buffers(struct sdp_sock *ssk, u32 new_size);661void sdp_do_posts(struct sdp_sock *ssk);662void sdp_rx_comp_full(struct sdp_sock *ssk);663664/* sdp_zcopy.c */665struct kiocb;666int sdp_sendmsg_zcopy(struct kiocb *iocb, struct socket *sk, struct iovec *iov);667int sdp_handle_srcavail(struct sdp_sock *ssk, struct sdp_srcah *srcah);668void sdp_handle_sendsm(struct sdp_sock *ssk, u32 mseq_ack);669void sdp_handle_rdma_read_compl(struct sdp_sock *ssk, u32 mseq_ack,670u32 bytes_completed);671int sdp_handle_rdma_read_cqe(struct sdp_sock *ssk);672int sdp_rdma_to_iovec(struct socket *sk, struct iovec *iov, struct mbuf 
*mb,673unsigned long *used);674int sdp_post_rdma_rd_compl(struct sdp_sock *ssk,675struct rx_srcavail_state *rx_sa);676int sdp_post_sendsm(struct socket *sk);677void srcavail_cancel_timeout(struct work_struct *work);678void sdp_abort_srcavail(struct socket *sk);679void sdp_abort_rdma_read(struct socket *sk);680int sdp_process_rx(struct sdp_sock *ssk);681682#endif683684685