Path: blob/main/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c
39566 views
/*-1* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.02*3* Copyright (c) 2004 Topspin Communications. All rights reserved.4* Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.5* Copyright (c) 2004 Voltaire, Inc. All rights reserved.6*7* This software is available to you under a choice of one of two8* licenses. You may choose to be licensed under the terms of the GNU9* General Public License (GPL) Version 2, available from the file10* COPYING in the main directory of this source tree, or the11* OpenIB.org BSD license below:12*13* Redistribution and use in source and binary forms, with or14* without modification, are permitted provided that the following15* conditions are met:16*17* - Redistributions of source code must retain the above18* copyright notice, this list of conditions and the following19* disclaimer.20*21* - Redistributions in binary form must reproduce the above22* copyright notice, this list of conditions and the following23* disclaimer in the documentation and/or other materials24* provided with the distribution.25*26* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,27* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF28* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND29* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS30* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN31* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN32* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE33* SOFTWARE.34*/3536#include <sys/cdefs.h>37#include "ipoib.h"38#include <sys/eventhandler.h>3940#include <linux/module.h>4142#include <linux/slab.h>43#include <linux/kernel.h>44#include <linux/vmalloc.h>4546#include <linux/if_vlan.h>4748#include <net/infiniband.h>4950#include <rdma/ib_addr.h>51#include <rdma/ib_cache.h>5253MODULE_AUTHOR("Roland Dreier");54MODULE_DESCRIPTION("IP-over-InfiniBand net driver");55MODULE_LICENSE("Dual BSD/GPL");5657int ipoib_sendq_size = IPOIB_TX_RING_SIZE;58int ipoib_recvq_size = IPOIB_RX_RING_SIZE;5960module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);61MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");62module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);63MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");6465#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG66int ipoib_debug_level = 1;6768module_param_named(debug_level, ipoib_debug_level, int, 0644);69MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");70#endif7172struct ipoib_path_iter {73struct ipoib_dev_priv *priv;74struct ipoib_path path;75};7677static const u8 ipv4_bcast_addr[] = {780x00, 0xff, 0xff, 0xff,790xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,800x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff81};8283struct workqueue_struct *ipoib_workqueue;8485struct ib_sa_client ipoib_sa_client;8687static void ipoib_add_one(struct ib_device *device);88static void ipoib_remove_one(struct ib_device *device, void *client_data);89static if_t ipoib_get_net_dev_by_params(90struct ib_device *dev, u8 port, u16 pkey,91const union ib_gid *gid, const struct sockaddr *addr,92void *client_data);93static void ipoib_start(if_t dev);94static int ipoib_ioctl(if_t ifp, u_long command, caddr_t data);9596static struct unrhdr *ipoib_unrhdr;9798static void99ipoib_unrhdr_init(void *arg)100{101102ipoib_unrhdr = new_unrhdr(0, 65535, NULL);103}104SYSINIT(ipoib_unrhdr_init, SI_SUB_KLD - 1, SI_ORDER_ANY, ipoib_unrhdr_init, NULL);105106static void107ipoib_unrhdr_uninit(void *arg)108{109110if (ipoib_unrhdr != NULL) {111struct unrhdr *hdr;112113hdr = ipoib_unrhdr;114ipoib_unrhdr = NULL;115116delete_unrhdr(hdr);117}118}119SYSUNINIT(ipoib_unrhdr_uninit, SI_SUB_KLD - 1, SI_ORDER_ANY, ipoib_unrhdr_uninit, NULL);120121static struct ib_client ipoib_client = {122.name = "ipoib",123.add = ipoib_add_one,124.remove = ipoib_remove_one,125.get_net_dev_by_params = ipoib_get_net_dev_by_params,126};127128int129ipoib_open(struct ipoib_dev_priv *priv)130{131if_t dev = priv->dev;132133ipoib_dbg(priv, "bringing up interface\n");134135set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);136137if (ipoib_pkey_dev_delay_open(priv))138return 0;139140if (ipoib_ib_dev_open(priv))141goto err_disable;142143if (ipoib_ib_dev_up(priv))144goto err_stop;145146if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {147struct ipoib_dev_priv *cpriv;148149/* Bring up any child interfaces too */150mutex_lock(&priv->vlan_mutex);151list_for_each_entry(cpriv, &priv->child_intfs, list)152if ((if_getdrvflags(cpriv->dev) & IFF_DRV_RUNNING) == 0)153ipoib_open(cpriv);154mutex_unlock(&priv->vlan_mutex);155}156if_setdrvflagbits(dev, IFF_DRV_RUNNING, IFF_DRV_OACTIVE);157158return 0;159160err_stop:161ipoib_ib_dev_stop(priv, 1);162163err_disable:164clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);165166return -EINVAL;167}168169static void170ipoib_init(void *arg)171{172if_t dev;173struct ipoib_dev_priv *priv;174175priv = arg;176dev = priv->dev;177if ((if_getdrvflags(dev) & IFF_DRV_RUNNING) == 0)178ipoib_open(priv);179queue_work(ipoib_workqueue, &priv->flush_light);180}181182183static int184ipoib_stop(struct ipoib_dev_priv *priv)185{186if_t dev = priv->dev;187188ipoib_dbg(priv, "stopping interface\n");189190clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);191192if_setdrvflagbits(dev, 0, IFF_DRV_RUNNING | IFF_DRV_OACTIVE);193194ipoib_ib_dev_down(priv, 0);195ipoib_ib_dev_stop(priv, 0);196197if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {198struct ipoib_dev_priv *cpriv;199200/* Bring down any child interfaces too */201mutex_lock(&priv->vlan_mutex);202list_for_each_entry(cpriv, &priv->child_intfs, list)203if ((if_getdrvflags(cpriv->dev) & IFF_DRV_RUNNING) != 0)204ipoib_stop(cpriv);205mutex_unlock(&priv->vlan_mutex);206}207208return 0;209}210211static int212ipoib_propagate_ifnet_mtu(struct ipoib_dev_priv *priv, int new_mtu,213bool propagate)214{215if_t ifp;216struct ifreq ifr;217int error;218219ifp = priv->dev;220if (if_getmtu(ifp) == new_mtu)221return (0);222if (propagate) {223strlcpy(ifr.ifr_name, if_name(ifp), IFNAMSIZ);224ifr.ifr_mtu = new_mtu;225CURVNET_SET(if_getvnet(ifp));226error = ifhwioctl(SIOCSIFMTU, ifp, (caddr_t)&ifr, curthread);227CURVNET_RESTORE();228} else {229if_setmtu(ifp, new_mtu);230error = 0;231}232return (error);233}234235int236ipoib_change_mtu(struct ipoib_dev_priv *priv, int new_mtu, bool propagate)237{238int error, prev_admin_mtu;239240/* dev->if_mtu > 2K ==> connected mode */241if (ipoib_cm_admin_enabled(priv)) {242if (new_mtu > IPOIB_CM_MTU(ipoib_cm_max_mtu(priv)))243return -EINVAL;244245if (new_mtu > priv->mcast_mtu)246ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",247priv->mcast_mtu);248249return (ipoib_propagate_ifnet_mtu(priv, new_mtu, propagate));250}251252if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))253return -EINVAL;254255prev_admin_mtu = priv->admin_mtu;256priv->admin_mtu = new_mtu;257error = ipoib_propagate_ifnet_mtu(priv, min(priv->mcast_mtu,258priv->admin_mtu), propagate);259if (error == 0) {260/* check for MTU change to avoid infinite loop */261if (prev_admin_mtu != new_mtu)262queue_work(ipoib_workqueue, &priv->flush_light);263} else264priv->admin_mtu = prev_admin_mtu;265return (error);266}267268static int269ipoib_ioctl(if_t ifp, u_long command, caddr_t data)270{271struct ipoib_dev_priv *priv = if_getsoftc(ifp);272struct ifaddr *ifa = (struct ifaddr *) data;273struct ifreq *ifr = (struct ifreq *) data;274int error = 0;275276/* check if detaching */277if (priv == NULL)278return (ENXIO);279/* wait for device to become ready, if any */280while (priv->gone == 2)281pause("W", 1);282/* check for device gone */283if (priv->gone != 0)284return (ENXIO);285286switch (command) {287case SIOCSIFFLAGS:288if (if_getflags(ifp) & IFF_UP) {289if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0)290error = -ipoib_open(priv);291} else292if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)293ipoib_stop(priv);294break;295case SIOCADDMULTI:296case SIOCDELMULTI:297if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)298queue_work(ipoib_workqueue, &priv->restart_task);299break;300case SIOCSIFADDR:301if_setflagbits(ifp, IFF_UP, 0);302303switch (ifa->ifa_addr->sa_family) {304#ifdef INET305case AF_INET:306if_init(ifp, if_getsoftc(ifp)); /* before arpwhohas */307arp_ifinit(ifp, ifa);308break;309#endif310default:311if_init(ifp, if_getsoftc(ifp));312break;313}314break;315316case SIOCGIFADDR:317bcopy(if_getlladdr(ifp), &ifr->ifr_addr.sa_data[0],318INFINIBAND_ALEN);319break;320321case SIOCSIFMTU:322/*323* Set the interface MTU.324*/325error = -ipoib_change_mtu(priv, ifr->ifr_mtu, false);326break;327default:328error = EINVAL;329break;330}331return (error);332}333334335static struct ipoib_path *336__path_find(struct ipoib_dev_priv *priv, void *gid)337{338struct rb_node *n = priv->path_tree.rb_node;339struct ipoib_path *path;340int ret;341342while (n) {343path = rb_entry(n, struct ipoib_path, rb_node);344345ret = memcmp(gid, path->pathrec.dgid.raw,346sizeof (union ib_gid));347348if (ret < 0)349n = n->rb_left;350else if (ret > 0)351n = n->rb_right;352else353return path;354}355356return NULL;357}358359static int360__path_add(struct ipoib_dev_priv *priv, struct ipoib_path *path)361{362struct rb_node **n = &priv->path_tree.rb_node;363struct rb_node *pn = NULL;364struct ipoib_path *tpath;365int ret;366367while (*n) {368pn = *n;369tpath = rb_entry(pn, struct ipoib_path, rb_node);370371ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw,372sizeof (union ib_gid));373if (ret < 0)374n = &pn->rb_left;375else if (ret > 0)376n = &pn->rb_right;377else378return -EEXIST;379}380381rb_link_node(&path->rb_node, pn, n);382rb_insert_color(&path->rb_node, &priv->path_tree);383384list_add_tail(&path->list, &priv->path_list);385386return 0;387}388389void390ipoib_path_free(struct ipoib_dev_priv *priv, struct ipoib_path *path)391{392393_IF_DRAIN(&path->queue);394395if (path->ah)396ipoib_put_ah(path->ah);397if (ipoib_cm_get(path))398ipoib_cm_destroy_tx(ipoib_cm_get(path));399400kfree(path);401}402403#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG404405struct ipoib_path_iter *406ipoib_path_iter_init(struct ipoib_dev_priv *priv)407{408struct ipoib_path_iter *iter;409410iter = kmalloc(sizeof *iter, GFP_KERNEL);411if (!iter)412return NULL;413414iter->priv = priv;415memset(iter->path.pathrec.dgid.raw, 0, 16);416417if (ipoib_path_iter_next(iter)) {418kfree(iter);419return NULL;420}421422return iter;423}424425int426ipoib_path_iter_next(struct ipoib_path_iter *iter)427{428struct ipoib_dev_priv *priv = iter->priv;429struct rb_node *n;430struct ipoib_path *path;431int ret = 1;432433spin_lock_irq(&priv->lock);434435n = rb_first(&priv->path_tree);436437while (n) {438path = rb_entry(n, struct ipoib_path, rb_node);439440if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw,441sizeof (union ib_gid)) < 0) {442iter->path = *path;443ret = 0;444break;445}446447n = rb_next(n);448}449450spin_unlock_irq(&priv->lock);451452return ret;453}454455void456ipoib_path_iter_read(struct ipoib_path_iter *iter, struct ipoib_path *path)457{458*path = iter->path;459}460461#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */462463void464ipoib_mark_paths_invalid(struct ipoib_dev_priv *priv)465{466struct ipoib_path *path, *tp;467468spin_lock_irq(&priv->lock);469470list_for_each_entry_safe(path, tp, &priv->path_list, list) {471ipoib_dbg(priv, "mark path LID 0x%04x GID %16D invalid\n",472be16_to_cpu(path->pathrec.dlid),473path->pathrec.dgid.raw, ":");474path->valid = 0;475}476477spin_unlock_irq(&priv->lock);478}479480void481ipoib_flush_paths(struct ipoib_dev_priv *priv)482{483struct ipoib_path *path, *tp;484LIST_HEAD(remove_list);485unsigned long flags;486487spin_lock_irqsave(&priv->lock, flags);488489list_splice_init(&priv->path_list, &remove_list);490491list_for_each_entry(path, &remove_list, list)492rb_erase(&path->rb_node, &priv->path_tree);493494list_for_each_entry_safe(path, tp, &remove_list, list) {495if (path->query)496ib_sa_cancel_query(path->query_id, path->query);497spin_unlock_irqrestore(&priv->lock, flags);498wait_for_completion(&path->done);499ipoib_path_free(priv, path);500spin_lock_irqsave(&priv->lock, flags);501}502503spin_unlock_irqrestore(&priv->lock, flags);504}505506static void507path_rec_completion(int status, struct ib_sa_path_rec *pathrec, void *path_ptr)508{509struct ipoib_path *path = path_ptr;510struct ipoib_dev_priv *priv = path->priv;511if_t dev = priv->dev;512struct ipoib_ah *ah = NULL;513struct ipoib_ah *old_ah = NULL;514struct epoch_tracker et;515struct ifqueue mbqueue;516struct mbuf *mb;517unsigned long flags;518519if (!status)520ipoib_dbg(priv, "PathRec LID 0x%04x for GID %16D\n",521be16_to_cpu(pathrec->dlid), pathrec->dgid.raw, ":");522else523ipoib_dbg(priv, "PathRec status %d for GID %16D\n",524status, path->pathrec.dgid.raw, ":");525526bzero(&mbqueue, sizeof(mbqueue));527528if (!status) {529struct ib_ah_attr av;530531if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av))532ah = ipoib_create_ah(priv, priv->pd, &av);533}534535spin_lock_irqsave(&priv->lock, flags);536537if (ah) {538path->pathrec = *pathrec;539540old_ah = path->ah;541path->ah = ah;542543ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",544ah, be16_to_cpu(pathrec->dlid), pathrec->sl);545546for (;;) {547_IF_DEQUEUE(&path->queue, mb);548if (mb == NULL)549break;550_IF_ENQUEUE(&mbqueue, mb);551}552553#ifdef CONFIG_INFINIBAND_IPOIB_CM554if (ipoib_cm_enabled(priv, path->hwaddr) && !ipoib_cm_get(path))555ipoib_cm_set(path, ipoib_cm_create_tx(priv, path));556#endif557558path->valid = 1;559}560561path->query = NULL;562complete(&path->done);563564spin_unlock_irqrestore(&priv->lock, flags);565566if (old_ah)567ipoib_put_ah(old_ah);568569NET_EPOCH_ENTER(et);570for (;;) {571_IF_DEQUEUE(&mbqueue, mb);572if (mb == NULL)573break;574mb->m_pkthdr.rcvif = dev;575if (if_transmit(dev, mb))576ipoib_warn(priv, "dev_queue_xmit failed "577"to requeue packet\n");578}579NET_EPOCH_EXIT(et);580}581582static struct ipoib_path *583path_rec_create(struct ipoib_dev_priv *priv, uint8_t *hwaddr)584{585struct ipoib_path *path;586587if (!priv->broadcast)588return NULL;589590path = kzalloc(sizeof *path, GFP_ATOMIC);591if (!path)592return NULL;593594path->priv = priv;595596bzero(&path->queue, sizeof(path->queue));597598#ifdef CONFIG_INFINIBAND_IPOIB_CM599memcpy(&path->hwaddr, hwaddr, INFINIBAND_ALEN);600#endif601memcpy(path->pathrec.dgid.raw, &hwaddr[4], sizeof (union ib_gid));602path->pathrec.sgid = priv->local_gid;603path->pathrec.pkey = cpu_to_be16(priv->pkey);604path->pathrec.numb_path = 1;605path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;606607return path;608}609610static int611path_rec_start(struct ipoib_dev_priv *priv, struct ipoib_path *path)612{613if_t dev = priv->dev;614615ib_sa_comp_mask comp_mask = IB_SA_PATH_REC_MTU_SELECTOR | IB_SA_PATH_REC_MTU;616struct ib_sa_path_rec p_rec;617618p_rec = path->pathrec;619p_rec.mtu_selector = IB_SA_GT;620621switch (roundup_pow_of_two(if_getmtu(dev) + IPOIB_ENCAP_LEN)) {622case 512:623p_rec.mtu = IB_MTU_256;624break;625case 1024:626p_rec.mtu = IB_MTU_512;627break;628case 2048:629p_rec.mtu = IB_MTU_1024;630break;631case 4096:632p_rec.mtu = IB_MTU_2048;633break;634default:635/* Wildcard everything */636comp_mask = 0;637p_rec.mtu = 0;638p_rec.mtu_selector = 0;639}640641ipoib_dbg(priv, "Start path record lookup for %16D MTU > %d\n",642p_rec.dgid.raw, ":",643comp_mask ? ib_mtu_enum_to_int(p_rec.mtu) : 0);644645init_completion(&path->done);646647path->query_id =648ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port,649&p_rec, comp_mask |650IB_SA_PATH_REC_DGID |651IB_SA_PATH_REC_SGID |652IB_SA_PATH_REC_NUMB_PATH |653IB_SA_PATH_REC_TRAFFIC_CLASS |654IB_SA_PATH_REC_PKEY,6551000, GFP_ATOMIC,656path_rec_completion,657path, &path->query);658if (path->query_id < 0) {659ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id);660path->query = NULL;661complete(&path->done);662return path->query_id;663}664665return 0;666}667668static void669ipoib_unicast_send(struct mbuf *mb, struct ipoib_dev_priv *priv, struct ipoib_header *eh)670{671struct ipoib_path *path;672673path = __path_find(priv, eh->hwaddr + 4);674if (!path || !path->valid) {675int new_path = 0;676677if (!path) {678path = path_rec_create(priv, eh->hwaddr);679new_path = 1;680}681if (path) {682if (_IF_QLEN(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE)683_IF_ENQUEUE(&path->queue, mb);684else {685if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1);686m_freem(mb);687}688689if (!path->query && path_rec_start(priv, path)) {690if (new_path)691ipoib_path_free(priv, path);692return;693} else694__path_add(priv, path);695} else {696if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1);697m_freem(mb);698}699700return;701}702703if (ipoib_cm_get(path) && ipoib_cm_up(path)) {704ipoib_cm_send(priv, mb, ipoib_cm_get(path));705} else if (path->ah) {706ipoib_send(priv, mb, path->ah, IPOIB_QPN(eh->hwaddr));707} else if ((path->query || !path_rec_start(priv, path)) &&708path->queue.ifq_len < IPOIB_MAX_PATH_REC_QUEUE) {709_IF_ENQUEUE(&path->queue, mb);710} else {711if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1);712m_freem(mb);713}714}715716static int717ipoib_send_one(struct ipoib_dev_priv *priv, struct mbuf *mb)718{719struct ipoib_header *eh;720721eh = mtod(mb, struct ipoib_header *);722if (IPOIB_IS_MULTICAST(eh->hwaddr)) {723/* Add in the P_Key for multicast*/724eh->hwaddr[8] = (priv->pkey >> 8) & 0xff;725eh->hwaddr[9] = priv->pkey & 0xff;726727ipoib_mcast_send(priv, eh->hwaddr + 4, mb);728} else729ipoib_unicast_send(mb, priv, eh);730731return 0;732}733734void735ipoib_start_locked(if_t dev, struct ipoib_dev_priv *priv)736{737struct mbuf *mb;738739assert_spin_locked(&priv->lock);740741while (!if_sendq_empty(dev) &&742(if_getdrvflags(dev) & IFF_DRV_OACTIVE) == 0) {743mb = if_dequeue(dev);744if (mb == NULL)745break;746infiniband_bpf_mtap(dev, mb);747ipoib_send_one(priv, mb);748}749}750751static void752_ipoib_start(if_t dev, struct ipoib_dev_priv *priv)753{754755if ((if_getdrvflags(dev) & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=756IFF_DRV_RUNNING)757return;758759spin_lock(&priv->lock);760ipoib_start_locked(dev, priv);761spin_unlock(&priv->lock);762}763764static void765ipoib_start(if_t dev)766{767_ipoib_start(dev, if_getsoftc(dev));768}769770static void771ipoib_vlan_start(if_t dev)772{773struct ipoib_dev_priv *priv;774struct mbuf *mb;775776priv = VLAN_COOKIE(dev);777if (priv != NULL)778return _ipoib_start(dev, priv);779while (!if_sendq_empty(dev)) {780mb = if_dequeue(dev);781if (mb == NULL)782break;783m_freem(mb);784if_inc_counter(dev, IFCOUNTER_OERRORS, 1);785}786}787788int789ipoib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port)790{791792/* Allocate RX/TX "rings" to hold queued mbs */793priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,794GFP_KERNEL);795if (!priv->rx_ring) {796printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",797ca->name, ipoib_recvq_size);798goto out;799}800801priv->tx_ring = kzalloc(ipoib_sendq_size * sizeof *priv->tx_ring, GFP_KERNEL);802if (!priv->tx_ring) {803printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",804ca->name, ipoib_sendq_size);805goto out_rx_ring_cleanup;806}807memset(priv->tx_ring, 0, ipoib_sendq_size * sizeof *priv->tx_ring);808809/* priv->tx_head, tx_tail & tx_outstanding are already 0 */810811if (ipoib_ib_dev_init(priv, ca, port))812goto out_tx_ring_cleanup;813814return 0;815816out_tx_ring_cleanup:817kfree(priv->tx_ring);818819out_rx_ring_cleanup:820kfree(priv->rx_ring);821822out:823return -ENOMEM;824}825826static void827ipoib_ifdetach(struct ipoib_dev_priv *priv)828{829if_t dev;830831dev = priv->dev;832if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {833priv->gone = 1;834infiniband_ifdetach(dev);835}836}837838static void839ipoib_detach(struct ipoib_dev_priv *priv)840{841if_t dev;842843dev = priv->dev;844if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {845if_free(dev);846free_unr(ipoib_unrhdr, priv->unit);847} else848VLAN_SETCOOKIE(priv->dev, NULL);849850free(priv, M_TEMP);851}852853void854ipoib_dev_cleanup(struct ipoib_dev_priv *priv)855{856struct ipoib_dev_priv *cpriv, *tcpriv;857858/* Delete any child interfaces first */859list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {860ipoib_ifdetach(cpriv);861ipoib_dev_cleanup(cpriv);862ipoib_detach(cpriv);863}864865ipoib_ib_dev_cleanup(priv);866867kfree(priv->rx_ring);868kfree(priv->tx_ring);869870priv->rx_ring = NULL;871priv->tx_ring = NULL;872}873874static struct ipoib_dev_priv *875ipoib_priv_alloc(void)876{877struct ipoib_dev_priv *priv;878879priv = malloc(sizeof(struct ipoib_dev_priv), M_TEMP, M_ZERO|M_WAITOK);880spin_lock_init(&priv->lock);881spin_lock_init(&priv->drain_lock);882mutex_init(&priv->vlan_mutex);883INIT_LIST_HEAD(&priv->path_list);884INIT_LIST_HEAD(&priv->child_intfs);885INIT_LIST_HEAD(&priv->dead_ahs);886INIT_LIST_HEAD(&priv->multicast_list);887INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);888INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task);889INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);890INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light);891INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal);892INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy);893INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);894INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);895memcpy(priv->broadcastaddr, ipv4_bcast_addr, INFINIBAND_ALEN);896897return (priv);898}899900struct ipoib_dev_priv *901ipoib_intf_alloc(const char *name, struct ib_device *hca)902{903struct ipoib_dev_priv *priv;904if_t dev;905906priv = ipoib_priv_alloc();907dev = priv->dev = if_alloc(IFT_INFINIBAND);908if_setsoftc(dev, priv);909priv->gone = 2; /* initializing */910priv->unit = alloc_unr(ipoib_unrhdr);911if (priv->unit == -1) {912if_free(dev);913free(priv, M_TEMP);914return NULL;915}916if_initname(dev, name, priv->unit);917if_setflags(dev, IFF_BROADCAST | IFF_MULTICAST);918if ((hca->attrs.device_cap_flags & IB_DEVICE_KNOWSEPOCH) == 0)919if_setflagbits(dev, IFF_NEEDSEPOCH, 0);920921infiniband_ifattach(priv->dev, NULL, priv->broadcastaddr);922923if_setinitfn(dev, ipoib_init);924if_setioctlfn(dev, ipoib_ioctl);925if_setstartfn(dev, ipoib_start);926927if_setsendqlen(dev, ipoib_sendq_size * 2);928929priv->dev = dev;930if_link_state_change(priv->dev, LINK_STATE_DOWN);931932return if_getsoftc(dev);933}934935int936ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)937{938struct ib_device_attr *device_attr = &hca->attrs;939940priv->hca_caps = device_attr->device_cap_flags;941942if_sethwassist(priv->dev, 0);943if_setcapabilities(priv->dev, 0);944945#ifndef CONFIG_INFINIBAND_IPOIB_CM946if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {947set_bit(IPOIB_FLAG_CSUM, &priv->flags);948if_sethwassist(priv->dev, CSUM_IP | CSUM_TCP | CSUM_UDP);949if_setcapabilities(priv->dev, IFCAP_HWCSUM | IFCAP_VLAN_HWCSUM);950}951952#if 0953if (priv->dev->features & NETIF_F_SG && priv->hca_caps & IB_DEVICE_UD_TSO) {954priv->dev->if_capabilities |= IFCAP_TSO4;955priv->dev->if_hwassist |= CSUM_TSO;956}957#endif958#endif959if_setcapabilitiesbit(priv->dev,960IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_LINKSTATE, 0);961if_setcapenable(priv->dev, if_getcapabilities(priv->dev));962963return 0;964}965966967static if_t968ipoib_add_port(const char *format, struct ib_device *hca, u8 port)969{970struct ipoib_dev_priv *priv;971struct ib_port_attr attr;972int result = -ENOMEM;973974priv = ipoib_intf_alloc(format, hca);975if (!priv)976goto alloc_mem_failed;977978if (!ib_query_port(hca, port, &attr))979priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);980else {981printk(KERN_WARNING "%s: ib_query_port %d failed\n",982hca->name, port);983goto device_init_failed;984}985986/* MTU will be reset when mcast join happens */987if_setmtu(priv->dev, IPOIB_UD_MTU(priv->max_ib_mtu));988priv->mcast_mtu = priv->admin_mtu = if_getmtu(priv->dev);989990result = ib_query_pkey(hca, port, 0, &priv->pkey);991if (result) {992printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",993hca->name, port, result);994goto device_init_failed;995}996997if (ipoib_set_dev_features(priv, hca))998goto device_init_failed;9991000/*1001* Set the full membership bit, so that we join the right1002* broadcast group, etc.1003*/1004priv->pkey |= 0x8000;10051006priv->broadcastaddr[8] = priv->pkey >> 8;1007priv->broadcastaddr[9] = priv->pkey & 0xff;10081009result = ib_query_gid(hca, port, 0, &priv->local_gid, NULL);1010if (result) {1011printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",1012hca->name, port, result);1013goto device_init_failed;1014}1015memcpy(if_getlladdr(priv->dev) + 4, priv->local_gid.raw, sizeof(union ib_gid));10161017result = ipoib_dev_init(priv, hca, port);1018if (result < 0) {1019printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",1020hca->name, port, result);1021goto device_init_failed;1022}1023if (ipoib_cm_admin_enabled(priv))1024if_setmtu(priv->dev, IPOIB_CM_MTU(ipoib_cm_max_mtu(priv)));10251026INIT_IB_EVENT_HANDLER(&priv->event_handler,1027priv->ca, ipoib_event);1028result = ib_register_event_handler(&priv->event_handler);1029if (result < 0) {1030printk(KERN_WARNING "%s: ib_register_event_handler failed for "1031"port %d (ret = %d)\n",1032hca->name, port, result);1033goto event_failed;1034}1035if_printf(priv->dev, "Attached to %s port %d\n", hca->name, port);10361037priv->gone = 0; /* ready */10381039return priv->dev;10401041event_failed:1042ipoib_dev_cleanup(priv);10431044device_init_failed:1045ipoib_ifdetach(priv);1046ipoib_detach(priv);10471048alloc_mem_failed:1049return ERR_PTR(result);1050}10511052static void1053ipoib_add_one(struct ib_device *device)1054{1055struct list_head *dev_list;1056if_t dev;1057struct ipoib_dev_priv *priv;1058int s, e, p;10591060if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)1061return;10621063dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);1064if (!dev_list)1065return;10661067INIT_LIST_HEAD(dev_list);10681069if (device->node_type == RDMA_NODE_IB_SWITCH) {1070s = 0;1071e = 0;1072} else {1073s = 1;1074e = device->phys_port_cnt;1075}10761077for (p = s; p <= e; ++p) {1078if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND)1079continue;1080dev = ipoib_add_port("ib", device, p);1081if (!IS_ERR(dev)) {1082priv = if_getsoftc(dev);1083list_add_tail(&priv->list, dev_list);1084}1085}10861087ib_set_client_data(device, &ipoib_client, dev_list);1088}10891090static void1091ipoib_remove_one(struct ib_device *device, void *client_data)1092{1093struct ipoib_dev_priv *priv, *tmp;1094struct list_head *dev_list = client_data;10951096if (!dev_list)1097return;10981099if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)1100return;11011102list_for_each_entry_safe(priv, tmp, dev_list, list) {1103if (rdma_port_get_link_layer(device, priv->port) != IB_LINK_LAYER_INFINIBAND)1104continue;11051106ipoib_ifdetach(priv);1107ipoib_stop(priv);11081109ib_unregister_event_handler(&priv->event_handler);11101111flush_workqueue(ipoib_workqueue);11121113ipoib_dev_cleanup(priv);1114ipoib_detach(priv);1115}11161117kfree(dev_list);1118}11191120static u_int1121ipoib_match_dev_addr_cb(void *arg, struct ifaddr *ifa, u_int count)1122{1123struct sockaddr *addr = arg;11241125/* If a match is already found, skip this. */1126if (count > 0)1127return (0);11281129if (ifa->ifa_addr->sa_len != addr->sa_len)1130return (0);11311132if (memcmp(ifa->ifa_addr, addr, addr->sa_len) == 0)1133return (1);11341135return (0);1136}11371138static int1139ipoib_match_dev_addr(const struct sockaddr *addr, if_t dev)1140{1141struct epoch_tracker et;1142int retval = 0;11431144NET_EPOCH_ENTER(et);1145retval = if_foreach_addr_type(dev, addr->sa_family,1146ipoib_match_dev_addr_cb, __DECONST(void *, addr));1147NET_EPOCH_EXIT(et);11481149return (retval);1150}11511152/*1153* ipoib_match_gid_pkey_addr - returns the number of IPoIB netdevs on1154* top a given ipoib device matching a pkey_index and address, if one1155* exists.1156*1157* @found_net_dev: contains a matching net_device if the return value1158* >= 1, with a reference held.1159*/1160static int1161ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv,1162const union ib_gid *gid, u16 pkey_index, const struct sockaddr *addr,1163if_t *found_net_dev)1164{1165struct ipoib_dev_priv *child_priv;1166int matches = 0;11671168if (priv->pkey_index == pkey_index &&1169(!gid || !memcmp(gid, &priv->local_gid, sizeof(*gid)))) {1170if (addr == NULL || ipoib_match_dev_addr(addr, priv->dev) != 0) {1171if (*found_net_dev == NULL) {1172if_t net_dev;11731174if (priv->parent != NULL)1175net_dev = priv->parent;1176else1177net_dev = priv->dev;1178*found_net_dev = net_dev;1179dev_hold(net_dev);1180}1181matches++;1182}1183}11841185/* Check child interfaces */1186mutex_lock(&priv->vlan_mutex);1187list_for_each_entry(child_priv, &priv->child_intfs, list) {1188matches += ipoib_match_gid_pkey_addr(child_priv, gid,1189pkey_index, addr, found_net_dev);1190if (matches > 1)1191break;1192}1193mutex_unlock(&priv->vlan_mutex);11941195return matches;1196}11971198/*1199* __ipoib_get_net_dev_by_params - returns the number of matching1200* net_devs found (between 0 and 2). Also return the matching1201* net_device in the @net_dev parameter, holding a reference to the1202* net_device, if the number of matches >= 11203*/1204static int1205__ipoib_get_net_dev_by_params(struct list_head *dev_list, u8 port,1206u16 pkey_index, const union ib_gid *gid,1207const struct sockaddr *addr, if_t *net_dev)1208{1209struct ipoib_dev_priv *priv;1210int matches = 0;12111212*net_dev = NULL;12131214list_for_each_entry(priv, dev_list, list) {1215if (priv->port != port)1216continue;12171218matches += ipoib_match_gid_pkey_addr(priv, gid, pkey_index,1219addr, net_dev);12201221if (matches > 1)1222break;1223}12241225return matches;1226}12271228static if_t1229ipoib_get_net_dev_by_params(struct ib_device *dev, u8 port, u16 pkey,1230const union ib_gid *gid, const struct sockaddr *addr, void *client_data)1231{1232if_t net_dev;1233struct list_head *dev_list = client_data;1234u16 pkey_index;1235int matches;1236int ret;12371238if (!rdma_protocol_ib(dev, port))1239return NULL;12401241ret = ib_find_cached_pkey(dev, port, pkey, &pkey_index);1242if (ret)1243return NULL;12441245if (!dev_list)1246return NULL;12471248/* See if we can find a unique device matching the L2 parameters */1249matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,1250gid, NULL, &net_dev);12511252switch (matches) {1253case 0:1254return NULL;1255case 1:1256return net_dev;1257}12581259dev_put(net_dev);12601261/* Couldn't find a unique device with L2 parameters only. Use L31262* address to uniquely match the net device */1263matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,1264gid, addr, &net_dev);1265switch (matches) {1266case 0:1267return NULL;1268default:1269dev_warn_ratelimited(&dev->dev,1270"duplicate IP address detected\n");1271/* Fall through */1272case 1:1273return net_dev;1274}1275}12761277static void1278ipoib_config_vlan(void *arg, if_t ifp, uint16_t vtag)1279{1280struct ipoib_dev_priv *parent;1281struct ipoib_dev_priv *priv;1282struct epoch_tracker et;1283if_t dev;1284uint16_t pkey;1285int error;12861287if (if_gettype(ifp) != IFT_INFINIBAND)1288return;1289NET_EPOCH_ENTER(et);1290dev = VLAN_DEVAT(ifp, vtag);1291NET_EPOCH_EXIT(et);1292if (dev == NULL)1293return;1294priv = NULL;1295error = 0;1296parent = if_getsoftc(ifp);1297/* We only support 15 bits of pkey. */1298if (vtag & 0x8000)1299return;1300pkey = vtag | 0x8000; /* Set full membership bit. */1301if (pkey == parent->pkey)1302return;1303/* Check for dups */1304mutex_lock(&parent->vlan_mutex);1305list_for_each_entry(priv, &parent->child_intfs, list) {1306if (priv->pkey == pkey) {1307priv = NULL;1308error = EBUSY;1309goto out;1310}1311}1312priv = ipoib_priv_alloc();1313priv->dev = dev;1314priv->max_ib_mtu = parent->max_ib_mtu;1315priv->mcast_mtu = priv->admin_mtu = if_getmtu(parent->dev);1316set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags);1317error = ipoib_set_dev_features(priv, parent->ca);1318if (error)1319goto out;1320priv->pkey = pkey;1321priv->broadcastaddr[8] = pkey >> 8;1322priv->broadcastaddr[9] = pkey & 0xff;1323if_setbroadcastaddr(dev, priv->broadcastaddr);1324error = ipoib_dev_init(priv, parent->ca, parent->port);1325if (error)1326goto out;1327priv->parent = parent->dev;1328list_add_tail(&priv->list, &parent->child_intfs);1329VLAN_SETCOOKIE(dev, priv);1330if_setstartfn(dev, ipoib_vlan_start);1331if_setdrvflagbits(dev, 0, IFF_DRV_RUNNING);1332if_setifheaderlen(dev, IPOIB_HEADER_LEN);1333if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)1334ipoib_open(priv);1335mutex_unlock(&parent->vlan_mutex);1336return;1337out:1338mutex_unlock(&parent->vlan_mutex);1339if (priv)1340free(priv, M_TEMP);1341if (error)1342ipoib_warn(parent,1343"failed to initialize subinterface: device %s, port %d vtag 0x%X",1344parent->ca->name, parent->port, vtag);1345return;1346}13471348static void1349ipoib_unconfig_vlan(void *arg, if_t ifp, uint16_t vtag)1350{1351struct ipoib_dev_priv *parent;1352struct ipoib_dev_priv *priv;1353struct epoch_tracker et;1354if_t dev;1355uint16_t pkey;13561357if (if_gettype(ifp) != IFT_INFINIBAND)1358return;13591360NET_EPOCH_ENTER(et);1361dev = VLAN_DEVAT(ifp, vtag);1362NET_EPOCH_EXIT(et);1363if (dev)1364VLAN_SETCOOKIE(dev, NULL);1365pkey = vtag | 0x8000;1366parent = if_getsoftc(ifp);1367mutex_lock(&parent->vlan_mutex);1368list_for_each_entry(priv, &parent->child_intfs, list) {1369if (priv->pkey == pkey) {1370ipoib_dev_cleanup(priv);1371list_del(&priv->list);1372break;1373}1374}1375mutex_unlock(&parent->vlan_mutex);1376}13771378eventhandler_tag ipoib_vlan_attach;1379eventhandler_tag ipoib_vlan_detach;13801381static int __init1382ipoib_init_module(void)1383{1384int ret;13851386ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);1387ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);1388ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);13891390ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);1391ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);1392ipoib_sendq_size = max(ipoib_sendq_size, max(2 * MAX_SEND_CQE,1393IPOIB_MIN_QUEUE_SIZE));1394#ifdef CONFIG_INFINIBAND_IPOIB_CM1395ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);1396#endif13971398ipoib_vlan_attach = EVENTHANDLER_REGISTER(vlan_config,1399ipoib_config_vlan, NULL, EVENTHANDLER_PRI_FIRST);1400ipoib_vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,1401ipoib_unconfig_vlan, NULL, EVENTHANDLER_PRI_FIRST);14021403/*1404* We create our own workqueue mainly because we want to be1405* able to flush it when devices are being removed. We can't1406* use schedule_work()/flush_scheduled_work() because both1407* unregister_netdev() and linkwatch_event take the rtnl lock,1408* so flush_scheduled_work() can deadlock during device1409* removal.1410*/1411ipoib_workqueue = create_singlethread_workqueue("ipoib");1412if (!ipoib_workqueue) {1413ret = -ENOMEM;1414goto err_fs;1415}14161417ib_sa_register_client(&ipoib_sa_client);14181419ret = ib_register_client(&ipoib_client);1420if (ret)1421goto err_sa;14221423return 0;14241425err_sa:1426ib_sa_unregister_client(&ipoib_sa_client);1427destroy_workqueue(ipoib_workqueue);14281429err_fs:1430return ret;1431}14321433static void __exit1434ipoib_cleanup_module(void)1435{14361437EVENTHANDLER_DEREGISTER(vlan_config, ipoib_vlan_attach);1438EVENTHANDLER_DEREGISTER(vlan_unconfig, ipoib_vlan_detach);1439ib_unregister_client(&ipoib_client);1440ib_sa_unregister_client(&ipoib_sa_client);1441destroy_workqueue(ipoib_workqueue);1442}1443module_init_order(ipoib_init_module, SI_ORDER_FIFTH);1444module_exit_order(ipoib_cleanup_module, SI_ORDER_FIFTH);14451446static int1447ipoib_evhand(module_t mod, int event, void *arg)1448{1449return (0);1450}14511452static moduledata_t ipoib_mod = {1453.name = "ipoib",1454.evhand = ipoib_evhand,1455};14561457DECLARE_MODULE(ipoib, ipoib_mod, SI_SUB_LAST, SI_ORDER_ANY);1458MODULE_DEPEND(ipoib, ibcore, 1, 1, 1);1459MODULE_DEPEND(ipoib, if_infiniband, 1, 1, 1);1460MODULE_DEPEND(ipoib, linuxkpi, 1, 1, 1);146114621463