/* Source: drivers/infiniband/ulp/ipoib/ipoib_main.c (Linux kernel, IPoIB ULP) */
/*1* Copyright (c) 2004 Topspin Communications. All rights reserved.2* Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.3* Copyright (c) 2004 Voltaire, Inc. All rights reserved.4*5* This software is available to you under a choice of one of two6* licenses. You may choose to be licensed under the terms of the GNU7* General Public License (GPL) Version 2, available from the file8* COPYING in the main directory of this source tree, or the9* OpenIB.org BSD license below:10*11* Redistribution and use in source and binary forms, with or12* without modification, are permitted provided that the following13* conditions are met:14*15* - Redistributions of source code must retain the above16* copyright notice, this list of conditions and the following17* disclaimer.18*19* - Redistributions in binary form must reproduce the above20* copyright notice, this list of conditions and the following21* disclaimer in the documentation and/or other materials22* provided with the distribution.23*24* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,25* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF26* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND27* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS28* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN29* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN30* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE31* SOFTWARE.32*/3334#include "ipoib.h"3536#include <linux/module.h>3738#include <linux/init.h>39#include <linux/slab.h>40#include <linux/kernel.h>41#include <linux/vmalloc.h>4243#include <linux/if_arp.h> /* For ARPHRD_xxx */4445#include <linux/ip.h>46#include <linux/in.h>4748#include <net/dst.h>4950MODULE_AUTHOR("Roland Dreier");51MODULE_DESCRIPTION("IP-over-InfiniBand net driver");52MODULE_LICENSE("Dual BSD/GPL");5354int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;55int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;5657module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);58MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");59module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);60MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");6162#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG63int ipoib_debug_level;6465module_param_named(debug_level, ipoib_debug_level, int, 0644);66MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");67#endif6869struct ipoib_path_iter {70struct net_device *dev;71struct ipoib_path path;72};7374static const u8 ipv4_bcast_addr[] = {750x00, 0xff, 0xff, 0xff,760xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,770x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff78};7980struct workqueue_struct *ipoib_workqueue;8182struct ib_sa_client ipoib_sa_client;8384static void ipoib_add_one(struct ib_device *device);85static void ipoib_remove_one(struct ib_device *device);8687static struct ib_client ipoib_client = {88.name = "ipoib",89.add = ipoib_add_one,90.remove = ipoib_remove_one91};9293int ipoib_open(struct net_device *dev)94{95struct ipoib_dev_priv *priv = netdev_priv(dev);9697ipoib_dbg(priv, "bringing up 
interface\n");9899set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);100101if (ipoib_pkey_dev_delay_open(dev))102return 0;103104if (ipoib_ib_dev_open(dev))105goto err_disable;106107if (ipoib_ib_dev_up(dev))108goto err_stop;109110if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {111struct ipoib_dev_priv *cpriv;112113/* Bring up any child interfaces too */114mutex_lock(&priv->vlan_mutex);115list_for_each_entry(cpriv, &priv->child_intfs, list) {116int flags;117118flags = cpriv->dev->flags;119if (flags & IFF_UP)120continue;121122dev_change_flags(cpriv->dev, flags | IFF_UP);123}124mutex_unlock(&priv->vlan_mutex);125}126127netif_start_queue(dev);128129return 0;130131err_stop:132ipoib_ib_dev_stop(dev, 1);133134err_disable:135clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);136137return -EINVAL;138}139140static int ipoib_stop(struct net_device *dev)141{142struct ipoib_dev_priv *priv = netdev_priv(dev);143144ipoib_dbg(priv, "stopping interface\n");145146clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);147148netif_stop_queue(dev);149150ipoib_ib_dev_down(dev, 0);151ipoib_ib_dev_stop(dev, 0);152153if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {154struct ipoib_dev_priv *cpriv;155156/* Bring down any child interfaces too */157mutex_lock(&priv->vlan_mutex);158list_for_each_entry(cpriv, &priv->child_intfs, list) {159int flags;160161flags = cpriv->dev->flags;162if (!(flags & IFF_UP))163continue;164165dev_change_flags(cpriv->dev, flags & ~IFF_UP);166}167mutex_unlock(&priv->vlan_mutex);168}169170return 0;171}172173static u32 ipoib_fix_features(struct net_device *dev, u32 features)174{175struct ipoib_dev_priv *priv = netdev_priv(dev);176177if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))178features &= ~(NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO);179180return features;181}182183static int ipoib_change_mtu(struct net_device *dev, int new_mtu)184{185struct ipoib_dev_priv *priv = netdev_priv(dev);186187/* dev->mtu > 2K ==> connected mode */188if (ipoib_cm_admin_enabled(dev)) {189if (new_mtu 
> ipoib_cm_max_mtu(dev))190return -EINVAL;191192if (new_mtu > priv->mcast_mtu)193ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",194priv->mcast_mtu);195196dev->mtu = new_mtu;197return 0;198}199200if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))201return -EINVAL;202203priv->admin_mtu = new_mtu;204205dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);206207return 0;208}209210static struct ipoib_path *__path_find(struct net_device *dev, void *gid)211{212struct ipoib_dev_priv *priv = netdev_priv(dev);213struct rb_node *n = priv->path_tree.rb_node;214struct ipoib_path *path;215int ret;216217while (n) {218path = rb_entry(n, struct ipoib_path, rb_node);219220ret = memcmp(gid, path->pathrec.dgid.raw,221sizeof (union ib_gid));222223if (ret < 0)224n = n->rb_left;225else if (ret > 0)226n = n->rb_right;227else228return path;229}230231return NULL;232}233234static int __path_add(struct net_device *dev, struct ipoib_path *path)235{236struct ipoib_dev_priv *priv = netdev_priv(dev);237struct rb_node **n = &priv->path_tree.rb_node;238struct rb_node *pn = NULL;239struct ipoib_path *tpath;240int ret;241242while (*n) {243pn = *n;244tpath = rb_entry(pn, struct ipoib_path, rb_node);245246ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw,247sizeof (union ib_gid));248if (ret < 0)249n = &pn->rb_left;250else if (ret > 0)251n = &pn->rb_right;252else253return -EEXIST;254}255256rb_link_node(&path->rb_node, pn, n);257rb_insert_color(&path->rb_node, &priv->path_tree);258259list_add_tail(&path->list, &priv->path_list);260261return 0;262}263264static void path_free(struct net_device *dev, struct ipoib_path *path)265{266struct ipoib_dev_priv *priv = netdev_priv(dev);267struct ipoib_neigh *neigh, *tn;268struct sk_buff *skb;269unsigned long flags;270271while ((skb = __skb_dequeue(&path->queue)))272dev_kfree_skb_irq(skb);273274spin_lock_irqsave(&priv->lock, flags);275276list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {277/*278* It's safe to call ipoib_put_ah() inside 
priv->lock279* here, because we know that path->ah will always280* hold one more reference, so ipoib_put_ah() will281* never do more than decrement the ref count.282*/283if (neigh->ah)284ipoib_put_ah(neigh->ah);285286ipoib_neigh_free(dev, neigh);287}288289spin_unlock_irqrestore(&priv->lock, flags);290291if (path->ah)292ipoib_put_ah(path->ah);293294kfree(path);295}296297#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG298299struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev)300{301struct ipoib_path_iter *iter;302303iter = kmalloc(sizeof *iter, GFP_KERNEL);304if (!iter)305return NULL;306307iter->dev = dev;308memset(iter->path.pathrec.dgid.raw, 0, 16);309310if (ipoib_path_iter_next(iter)) {311kfree(iter);312return NULL;313}314315return iter;316}317318int ipoib_path_iter_next(struct ipoib_path_iter *iter)319{320struct ipoib_dev_priv *priv = netdev_priv(iter->dev);321struct rb_node *n;322struct ipoib_path *path;323int ret = 1;324325spin_lock_irq(&priv->lock);326327n = rb_first(&priv->path_tree);328329while (n) {330path = rb_entry(n, struct ipoib_path, rb_node);331332if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw,333sizeof (union ib_gid)) < 0) {334iter->path = *path;335ret = 0;336break;337}338339n = rb_next(n);340}341342spin_unlock_irq(&priv->lock);343344return ret;345}346347void ipoib_path_iter_read(struct ipoib_path_iter *iter,348struct ipoib_path *path)349{350*path = iter->path;351}352353#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */354355void ipoib_mark_paths_invalid(struct net_device *dev)356{357struct ipoib_dev_priv *priv = netdev_priv(dev);358struct ipoib_path *path, *tp;359360spin_lock_irq(&priv->lock);361362list_for_each_entry_safe(path, tp, &priv->path_list, list) {363ipoib_dbg(priv, "mark path LID 0x%04x GID %pI6 invalid\n",364be16_to_cpu(path->pathrec.dlid),365path->pathrec.dgid.raw);366path->valid = 0;367}368369spin_unlock_irq(&priv->lock);370}371372void ipoib_flush_paths(struct net_device *dev)373{374struct ipoib_dev_priv *priv = 
netdev_priv(dev);375struct ipoib_path *path, *tp;376LIST_HEAD(remove_list);377unsigned long flags;378379netif_tx_lock_bh(dev);380spin_lock_irqsave(&priv->lock, flags);381382list_splice_init(&priv->path_list, &remove_list);383384list_for_each_entry(path, &remove_list, list)385rb_erase(&path->rb_node, &priv->path_tree);386387list_for_each_entry_safe(path, tp, &remove_list, list) {388if (path->query)389ib_sa_cancel_query(path->query_id, path->query);390spin_unlock_irqrestore(&priv->lock, flags);391netif_tx_unlock_bh(dev);392wait_for_completion(&path->done);393path_free(dev, path);394netif_tx_lock_bh(dev);395spin_lock_irqsave(&priv->lock, flags);396}397398spin_unlock_irqrestore(&priv->lock, flags);399netif_tx_unlock_bh(dev);400}401402static void path_rec_completion(int status,403struct ib_sa_path_rec *pathrec,404void *path_ptr)405{406struct ipoib_path *path = path_ptr;407struct net_device *dev = path->dev;408struct ipoib_dev_priv *priv = netdev_priv(dev);409struct ipoib_ah *ah = NULL;410struct ipoib_ah *old_ah = NULL;411struct ipoib_neigh *neigh, *tn;412struct sk_buff_head skqueue;413struct sk_buff *skb;414unsigned long flags;415416if (!status)417ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n",418be16_to_cpu(pathrec->dlid), pathrec->dgid.raw);419else420ipoib_dbg(priv, "PathRec status %d for GID %pI6\n",421status, path->pathrec.dgid.raw);422423skb_queue_head_init(&skqueue);424425if (!status) {426struct ib_ah_attr av;427428if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av))429ah = ipoib_create_ah(dev, priv->pd, &av);430}431432spin_lock_irqsave(&priv->lock, flags);433434if (ah) {435path->pathrec = *pathrec;436437old_ah = path->ah;438path->ah = ah;439440ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",441ah, be16_to_cpu(pathrec->dlid), pathrec->sl);442443while ((skb = __skb_dequeue(&path->queue)))444__skb_queue_tail(&skqueue, skb);445446list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {447if (neigh->ah) {448WARN_ON(neigh->ah != 
old_ah);449/*450* Dropping the ah reference inside451* priv->lock is safe here, because we452* will hold one more reference from453* the original value of path->ah (ie454* old_ah).455*/456ipoib_put_ah(neigh->ah);457}458kref_get(&path->ah->ref);459neigh->ah = path->ah;460memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,461sizeof(union ib_gid));462463if (ipoib_cm_enabled(dev, neigh->neighbour)) {464if (!ipoib_cm_get(neigh))465ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,466path,467neigh));468if (!ipoib_cm_get(neigh)) {469list_del(&neigh->list);470if (neigh->ah)471ipoib_put_ah(neigh->ah);472ipoib_neigh_free(dev, neigh);473continue;474}475}476477while ((skb = __skb_dequeue(&neigh->queue)))478__skb_queue_tail(&skqueue, skb);479}480path->valid = 1;481}482483path->query = NULL;484complete(&path->done);485486spin_unlock_irqrestore(&priv->lock, flags);487488if (old_ah)489ipoib_put_ah(old_ah);490491while ((skb = __skb_dequeue(&skqueue))) {492skb->dev = dev;493if (dev_queue_xmit(skb))494ipoib_warn(priv, "dev_queue_xmit failed "495"to requeue packet\n");496}497}498499static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid)500{501struct ipoib_dev_priv *priv = netdev_priv(dev);502struct ipoib_path *path;503504if (!priv->broadcast)505return NULL;506507path = kzalloc(sizeof *path, GFP_ATOMIC);508if (!path)509return NULL;510511path->dev = dev;512513skb_queue_head_init(&path->queue);514515INIT_LIST_HEAD(&path->neigh_list);516517memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid));518path->pathrec.sgid = priv->local_gid;519path->pathrec.pkey = cpu_to_be16(priv->pkey);520path->pathrec.numb_path = 1;521path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;522523return path;524}525526static int path_rec_start(struct net_device *dev,527struct ipoib_path *path)528{529struct ipoib_dev_priv *priv = netdev_priv(dev);530531ipoib_dbg(priv, "Start path record lookup for 
%pI6\n",532path->pathrec.dgid.raw);533534init_completion(&path->done);535536path->query_id =537ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port,538&path->pathrec,539IB_SA_PATH_REC_DGID |540IB_SA_PATH_REC_SGID |541IB_SA_PATH_REC_NUMB_PATH |542IB_SA_PATH_REC_TRAFFIC_CLASS |543IB_SA_PATH_REC_PKEY,5441000, GFP_ATOMIC,545path_rec_completion,546path, &path->query);547if (path->query_id < 0) {548ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id);549path->query = NULL;550complete(&path->done);551return path->query_id;552}553554return 0;555}556557static void neigh_add_path(struct sk_buff *skb, struct net_device *dev)558{559struct ipoib_dev_priv *priv = netdev_priv(dev);560struct ipoib_path *path;561struct ipoib_neigh *neigh;562unsigned long flags;563564neigh = ipoib_neigh_alloc(skb_dst(skb)->neighbour, skb->dev);565if (!neigh) {566++dev->stats.tx_dropped;567dev_kfree_skb_any(skb);568return;569}570571spin_lock_irqsave(&priv->lock, flags);572573path = __path_find(dev, skb_dst(skb)->neighbour->ha + 4);574if (!path) {575path = path_rec_create(dev, skb_dst(skb)->neighbour->ha + 4);576if (!path)577goto err_path;578579__path_add(dev, path);580}581582list_add_tail(&neigh->list, &path->neigh_list);583584if (path->ah) {585kref_get(&path->ah->ref);586neigh->ah = path->ah;587memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,588sizeof(union ib_gid));589590if (ipoib_cm_enabled(dev, neigh->neighbour)) {591if (!ipoib_cm_get(neigh))592ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));593if (!ipoib_cm_get(neigh)) {594list_del(&neigh->list);595if (neigh->ah)596ipoib_put_ah(neigh->ah);597ipoib_neigh_free(dev, neigh);598goto err_drop;599}600if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)601__skb_queue_tail(&neigh->queue, skb);602else {603ipoib_warn(priv, "queue length limit %d. 
Packet drop.\n",604skb_queue_len(&neigh->queue));605goto err_drop;606}607} else {608spin_unlock_irqrestore(&priv->lock, flags);609ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb_dst(skb)->neighbour->ha));610return;611}612} else {613neigh->ah = NULL;614615if (!path->query && path_rec_start(dev, path))616goto err_list;617618__skb_queue_tail(&neigh->queue, skb);619}620621spin_unlock_irqrestore(&priv->lock, flags);622return;623624err_list:625list_del(&neigh->list);626627err_path:628ipoib_neigh_free(dev, neigh);629err_drop:630++dev->stats.tx_dropped;631dev_kfree_skb_any(skb);632633spin_unlock_irqrestore(&priv->lock, flags);634}635636static void ipoib_path_lookup(struct sk_buff *skb, struct net_device *dev)637{638struct ipoib_dev_priv *priv = netdev_priv(skb->dev);639640/* Look up path record for unicasts */641if (skb_dst(skb)->neighbour->ha[4] != 0xff) {642neigh_add_path(skb, dev);643return;644}645646/* Add in the P_Key for multicasts */647skb_dst(skb)->neighbour->ha[8] = (priv->pkey >> 8) & 0xff;648skb_dst(skb)->neighbour->ha[9] = priv->pkey & 0xff;649ipoib_mcast_send(dev, skb_dst(skb)->neighbour->ha + 4, skb);650}651652static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,653struct ipoib_pseudoheader *phdr)654{655struct ipoib_dev_priv *priv = netdev_priv(dev);656struct ipoib_path *path;657unsigned long flags;658659spin_lock_irqsave(&priv->lock, flags);660661path = __path_find(dev, phdr->hwaddr + 4);662if (!path || !path->valid) {663int new_path = 0;664665if (!path) {666path = path_rec_create(dev, phdr->hwaddr + 4);667new_path = 1;668}669if (path) {670/* put pseudoheader back on for next time */671skb_push(skb, sizeof *phdr);672__skb_queue_tail(&path->queue, skb);673674if (!path->query && path_rec_start(dev, path)) {675spin_unlock_irqrestore(&priv->lock, flags);676if (new_path)677path_free(dev, path);678return;679} else680__path_add(dev, path);681} else {682++dev->stats.tx_dropped;683dev_kfree_skb_any(skb);684}685686spin_unlock_irqrestore(&priv->lock, 
flags);687return;688}689690if (path->ah) {691ipoib_dbg(priv, "Send unicast ARP to %04x\n",692be16_to_cpu(path->pathrec.dlid));693694spin_unlock_irqrestore(&priv->lock, flags);695ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr));696return;697} else if ((path->query || !path_rec_start(dev, path)) &&698skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {699/* put pseudoheader back on for next time */700skb_push(skb, sizeof *phdr);701__skb_queue_tail(&path->queue, skb);702} else {703++dev->stats.tx_dropped;704dev_kfree_skb_any(skb);705}706707spin_unlock_irqrestore(&priv->lock, flags);708}709710static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)711{712struct ipoib_dev_priv *priv = netdev_priv(dev);713struct ipoib_neigh *neigh;714unsigned long flags;715716if (likely(skb_dst(skb) && skb_dst(skb)->neighbour)) {717if (unlikely(!*to_ipoib_neigh(skb_dst(skb)->neighbour))) {718ipoib_path_lookup(skb, dev);719return NETDEV_TX_OK;720}721722neigh = *to_ipoib_neigh(skb_dst(skb)->neighbour);723724if (unlikely((memcmp(&neigh->dgid.raw,725skb_dst(skb)->neighbour->ha + 4,726sizeof(union ib_gid))) ||727(neigh->dev != dev))) {728spin_lock_irqsave(&priv->lock, flags);729/*730* It's safe to call ipoib_put_ah() inside731* priv->lock here, because we know that732* path->ah will always hold one more reference,733* so ipoib_put_ah() will never do more than734* decrement the ref count.735*/736if (neigh->ah)737ipoib_put_ah(neigh->ah);738list_del(&neigh->list);739ipoib_neigh_free(dev, neigh);740spin_unlock_irqrestore(&priv->lock, flags);741ipoib_path_lookup(skb, dev);742return NETDEV_TX_OK;743}744745if (ipoib_cm_get(neigh)) {746if (ipoib_cm_up(neigh)) {747ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));748return NETDEV_TX_OK;749}750} else if (neigh->ah) {751ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(skb_dst(skb)->neighbour->ha));752return NETDEV_TX_OK;753}754755if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {756spin_lock_irqsave(&priv->lock, 
flags);757__skb_queue_tail(&neigh->queue, skb);758spin_unlock_irqrestore(&priv->lock, flags);759} else {760++dev->stats.tx_dropped;761dev_kfree_skb_any(skb);762}763} else {764struct ipoib_pseudoheader *phdr =765(struct ipoib_pseudoheader *) skb->data;766skb_pull(skb, sizeof *phdr);767768if (phdr->hwaddr[4] == 0xff) {769/* Add in the P_Key for multicast*/770phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;771phdr->hwaddr[9] = priv->pkey & 0xff;772773ipoib_mcast_send(dev, phdr->hwaddr + 4, skb);774} else {775/* unicast GID -- should be ARP or RARP reply */776777if ((be16_to_cpup((__be16 *) skb->data) != ETH_P_ARP) &&778(be16_to_cpup((__be16 *) skb->data) != ETH_P_RARP)) {779ipoib_warn(priv, "Unicast, no %s: type %04x, QPN %06x %pI6\n",780skb_dst(skb) ? "neigh" : "dst",781be16_to_cpup((__be16 *) skb->data),782IPOIB_QPN(phdr->hwaddr),783phdr->hwaddr + 4);784dev_kfree_skb_any(skb);785++dev->stats.tx_dropped;786return NETDEV_TX_OK;787}788789unicast_arp_send(skb, dev, phdr);790}791}792793return NETDEV_TX_OK;794}795796static void ipoib_timeout(struct net_device *dev)797{798struct ipoib_dev_priv *priv = netdev_priv(dev);799800ipoib_warn(priv, "transmit timeout: latency %d msecs\n",801jiffies_to_msecs(jiffies - dev->trans_start));802ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n",803netif_queue_stopped(dev),804priv->tx_head, priv->tx_tail);805/* XXX reset QP, etc. 
*/806}807808static int ipoib_hard_header(struct sk_buff *skb,809struct net_device *dev,810unsigned short type,811const void *daddr, const void *saddr, unsigned len)812{813struct ipoib_header *header;814815header = (struct ipoib_header *) skb_push(skb, sizeof *header);816817header->proto = htons(type);818header->reserved = 0;819820/*821* If we don't have a neighbour structure, stuff the822* destination address onto the front of the skb so we can823* figure out where to send the packet later.824*/825if ((!skb_dst(skb) || !skb_dst(skb)->neighbour) && daddr) {826struct ipoib_pseudoheader *phdr =827(struct ipoib_pseudoheader *) skb_push(skb, sizeof *phdr);828memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN);829}830831return 0;832}833834static void ipoib_set_mcast_list(struct net_device *dev)835{836struct ipoib_dev_priv *priv = netdev_priv(dev);837838if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {839ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set");840return;841}842843queue_work(ipoib_workqueue, &priv->restart_task);844}845846static void ipoib_neigh_cleanup(struct neighbour *n)847{848struct ipoib_neigh *neigh;849struct ipoib_dev_priv *priv = netdev_priv(n->dev);850unsigned long flags;851struct ipoib_ah *ah = NULL;852853neigh = *to_ipoib_neigh(n);854if (neigh)855priv = netdev_priv(neigh->dev);856else857return;858ipoib_dbg(priv,859"neigh_cleanup for %06x %pI6\n",860IPOIB_QPN(n->ha),861n->ha + 4);862863spin_lock_irqsave(&priv->lock, flags);864865if (neigh->ah)866ah = neigh->ah;867list_del(&neigh->list);868ipoib_neigh_free(n->dev, neigh);869870spin_unlock_irqrestore(&priv->lock, flags);871872if (ah)873ipoib_put_ah(ah);874}875876struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour,877struct net_device *dev)878{879struct ipoib_neigh *neigh;880881neigh = kmalloc(sizeof *neigh, GFP_ATOMIC);882if (!neigh)883return NULL;884885neigh->neighbour = neighbour;886neigh->dev = dev;887memset(&neigh->dgid.raw, 0, sizeof (union ib_gid));888*to_ipoib_neigh(neighbour) = 
neigh;889skb_queue_head_init(&neigh->queue);890ipoib_cm_set(neigh, NULL);891892return neigh;893}894895void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh)896{897struct sk_buff *skb;898*to_ipoib_neigh(neigh->neighbour) = NULL;899while ((skb = __skb_dequeue(&neigh->queue))) {900++dev->stats.tx_dropped;901dev_kfree_skb_any(skb);902}903if (ipoib_cm_get(neigh))904ipoib_cm_destroy_tx(ipoib_cm_get(neigh));905kfree(neigh);906}907908static int ipoib_neigh_setup_dev(struct net_device *dev, struct neigh_parms *parms)909{910parms->neigh_cleanup = ipoib_neigh_cleanup;911912return 0;913}914915int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)916{917struct ipoib_dev_priv *priv = netdev_priv(dev);918919/* Allocate RX/TX "rings" to hold queued skbs */920priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,921GFP_KERNEL);922if (!priv->rx_ring) {923printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",924ca->name, ipoib_recvq_size);925goto out;926}927928priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring);929if (!priv->tx_ring) {930printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",931ca->name, ipoib_sendq_size);932goto out_rx_ring_cleanup;933}934935/* priv->tx_head, tx_tail & tx_outstanding are already 0 */936937if (ipoib_ib_dev_init(dev, ca, port))938goto out_tx_ring_cleanup;939940return 0;941942out_tx_ring_cleanup:943vfree(priv->tx_ring);944945out_rx_ring_cleanup:946kfree(priv->rx_ring);947948out:949return -ENOMEM;950}951952void ipoib_dev_cleanup(struct net_device *dev)953{954struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv;955956ipoib_delete_debug_files(dev);957958/* Delete any child interfaces first */959list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {960unregister_netdev(cpriv->dev);961ipoib_dev_cleanup(cpriv->dev);962free_netdev(cpriv->dev);963}964965ipoib_ib_dev_cleanup(dev);966967kfree(priv->rx_ring);968vfree(priv->tx_ring);969970priv->rx_ring 
= NULL;971priv->tx_ring = NULL;972}973974static const struct header_ops ipoib_header_ops = {975.create = ipoib_hard_header,976};977978static const struct net_device_ops ipoib_netdev_ops = {979.ndo_open = ipoib_open,980.ndo_stop = ipoib_stop,981.ndo_change_mtu = ipoib_change_mtu,982.ndo_fix_features = ipoib_fix_features,983.ndo_start_xmit = ipoib_start_xmit,984.ndo_tx_timeout = ipoib_timeout,985.ndo_set_multicast_list = ipoib_set_mcast_list,986.ndo_neigh_setup = ipoib_neigh_setup_dev,987};988989static void ipoib_setup(struct net_device *dev)990{991struct ipoib_dev_priv *priv = netdev_priv(dev);992993dev->netdev_ops = &ipoib_netdev_ops;994dev->header_ops = &ipoib_header_ops;995996ipoib_set_ethtool_ops(dev);997998netif_napi_add(dev, &priv->napi, ipoib_poll, 100);9991000dev->watchdog_timeo = HZ;10011002dev->flags |= IFF_BROADCAST | IFF_MULTICAST;10031004/*1005* We add in INFINIBAND_ALEN to allow for the destination1006* address "pseudoheader" for skbs without neighbour struct.1007*/1008dev->hard_header_len = IPOIB_ENCAP_LEN + INFINIBAND_ALEN;1009dev->addr_len = INFINIBAND_ALEN;1010dev->type = ARPHRD_INFINIBAND;1011dev->tx_queue_len = ipoib_sendq_size * 2;1012dev->features = (NETIF_F_VLAN_CHALLENGED |1013NETIF_F_HIGHDMA);1014dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;10151016memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);10171018netif_carrier_off(dev);10191020priv->dev = dev;10211022spin_lock_init(&priv->lock);10231024mutex_init(&priv->vlan_mutex);10251026INIT_LIST_HEAD(&priv->path_list);1027INIT_LIST_HEAD(&priv->child_intfs);1028INIT_LIST_HEAD(&priv->dead_ahs);1029INIT_LIST_HEAD(&priv->multicast_list);10301031INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);1032INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task);1033INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);1034INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light);1035INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal);1036INIT_WORK(&priv->flush_heavy, 
ipoib_ib_dev_flush_heavy);1037INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);1038INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);1039}10401041struct ipoib_dev_priv *ipoib_intf_alloc(const char *name)1042{1043struct net_device *dev;10441045dev = alloc_netdev((int) sizeof (struct ipoib_dev_priv), name,1046ipoib_setup);1047if (!dev)1048return NULL;10491050return netdev_priv(dev);1051}10521053static ssize_t show_pkey(struct device *dev,1054struct device_attribute *attr, char *buf)1055{1056struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));10571058return sprintf(buf, "0x%04x\n", priv->pkey);1059}1060static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL);10611062static ssize_t show_umcast(struct device *dev,1063struct device_attribute *attr, char *buf)1064{1065struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));10661067return sprintf(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags));1068}10691070static ssize_t set_umcast(struct device *dev,1071struct device_attribute *attr,1072const char *buf, size_t count)1073{1074struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));1075unsigned long umcast_val = simple_strtoul(buf, NULL, 0);10761077if (umcast_val > 0) {1078set_bit(IPOIB_FLAG_UMCAST, &priv->flags);1079ipoib_warn(priv, "ignoring multicast groups joined directly "1080"by userspace\n");1081} else1082clear_bit(IPOIB_FLAG_UMCAST, &priv->flags);10831084return count;1085}1086static DEVICE_ATTR(umcast, S_IWUSR | S_IRUGO, show_umcast, set_umcast);10871088int ipoib_add_umcast_attr(struct net_device *dev)1089{1090return device_create_file(&dev->dev, &dev_attr_umcast);1091}10921093static ssize_t create_child(struct device *dev,1094struct device_attribute *attr,1095const char *buf, size_t count)1096{1097int pkey;1098int ret;10991100if (sscanf(buf, "%i", &pkey) != 1)1101return -EINVAL;11021103if (pkey < 0 || pkey > 0xffff)1104return -EINVAL;11051106/*1107* Set the full membership bit, so that we join the right1108* broadcast group, 
etc.1109*/1110pkey |= 0x8000;11111112ret = ipoib_vlan_add(to_net_dev(dev), pkey);11131114return ret ? ret : count;1115}1116static DEVICE_ATTR(create_child, S_IWUSR, NULL, create_child);11171118static ssize_t delete_child(struct device *dev,1119struct device_attribute *attr,1120const char *buf, size_t count)1121{1122int pkey;1123int ret;11241125if (sscanf(buf, "%i", &pkey) != 1)1126return -EINVAL;11271128if (pkey < 0 || pkey > 0xffff)1129return -EINVAL;11301131ret = ipoib_vlan_delete(to_net_dev(dev), pkey);11321133return ret ? ret : count;11341135}1136static DEVICE_ATTR(delete_child, S_IWUSR, NULL, delete_child);11371138int ipoib_add_pkey_attr(struct net_device *dev)1139{1140return device_create_file(&dev->dev, &dev_attr_pkey);1141}11421143int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)1144{1145struct ib_device_attr *device_attr;1146int result = -ENOMEM;11471148device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL);1149if (!device_attr) {1150printk(KERN_WARNING "%s: allocation of %zu bytes failed\n",1151hca->name, sizeof *device_attr);1152return result;1153}11541155result = ib_query_device(hca, device_attr);1156if (result) {1157printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n",1158hca->name, result);1159kfree(device_attr);1160return result;1161}1162priv->hca_caps = device_attr->device_cap_flags;11631164kfree(device_attr);11651166if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {1167priv->dev->hw_features = NETIF_F_SG |1168NETIF_F_IP_CSUM | NETIF_F_RXCSUM;11691170if (priv->hca_caps & IB_DEVICE_UD_TSO)1171priv->dev->hw_features |= NETIF_F_TSO;11721173priv->dev->features |= priv->dev->hw_features;1174}11751176return 0;1177}11781179static struct net_device *ipoib_add_port(const char *format,1180struct ib_device *hca, u8 port)1181{1182struct ipoib_dev_priv *priv;1183struct ib_port_attr attr;1184int result = -ENOMEM;11851186priv = ipoib_intf_alloc(format);1187if (!priv)1188goto alloc_mem_failed;11891190SET_NETDEV_DEV(priv->dev, 
hca->dma_device);1191priv->dev->dev_id = port - 1;11921193if (!ib_query_port(hca, port, &attr))1194priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);1195else {1196printk(KERN_WARNING "%s: ib_query_port %d failed\n",1197hca->name, port);1198goto device_init_failed;1199}12001201/* MTU will be reset when mcast join happens */1202priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu);1203priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu;12041205result = ib_query_pkey(hca, port, 0, &priv->pkey);1206if (result) {1207printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",1208hca->name, port, result);1209goto device_init_failed;1210}12111212if (ipoib_set_dev_features(priv, hca))1213goto device_init_failed;12141215/*1216* Set the full membership bit, so that we join the right1217* broadcast group, etc.1218*/1219priv->pkey |= 0x8000;12201221priv->dev->broadcast[8] = priv->pkey >> 8;1222priv->dev->broadcast[9] = priv->pkey & 0xff;12231224result = ib_query_gid(hca, port, 0, &priv->local_gid);1225if (result) {1226printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",1227hca->name, port, result);1228goto device_init_failed;1229} else1230memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));12311232result = ipoib_dev_init(priv->dev, hca, port);1233if (result < 0) {1234printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",1235hca->name, port, result);1236goto device_init_failed;1237}12381239INIT_IB_EVENT_HANDLER(&priv->event_handler,1240priv->ca, ipoib_event);1241result = ib_register_event_handler(&priv->event_handler);1242if (result < 0) {1243printk(KERN_WARNING "%s: ib_register_event_handler failed for "1244"port %d (ret = %d)\n",1245hca->name, port, result);1246goto event_failed;1247}12481249result = register_netdev(priv->dev);1250if (result) {1251printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n",1252hca->name, port, result);1253goto 
register_failed;1254}12551256ipoib_create_debug_files(priv->dev);12571258if (ipoib_cm_add_mode_attr(priv->dev))1259goto sysfs_failed;1260if (ipoib_add_pkey_attr(priv->dev))1261goto sysfs_failed;1262if (ipoib_add_umcast_attr(priv->dev))1263goto sysfs_failed;1264if (device_create_file(&priv->dev->dev, &dev_attr_create_child))1265goto sysfs_failed;1266if (device_create_file(&priv->dev->dev, &dev_attr_delete_child))1267goto sysfs_failed;12681269return priv->dev;12701271sysfs_failed:1272ipoib_delete_debug_files(priv->dev);1273unregister_netdev(priv->dev);12741275register_failed:1276ib_unregister_event_handler(&priv->event_handler);1277flush_workqueue(ipoib_workqueue);12781279event_failed:1280ipoib_dev_cleanup(priv->dev);12811282device_init_failed:1283free_netdev(priv->dev);12841285alloc_mem_failed:1286return ERR_PTR(result);1287}12881289static void ipoib_add_one(struct ib_device *device)1290{1291struct list_head *dev_list;1292struct net_device *dev;1293struct ipoib_dev_priv *priv;1294int s, e, p;12951296if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)1297return;12981299dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);1300if (!dev_list)1301return;13021303INIT_LIST_HEAD(dev_list);13041305if (device->node_type == RDMA_NODE_IB_SWITCH) {1306s = 0;1307e = 0;1308} else {1309s = 1;1310e = device->phys_port_cnt;1311}13121313for (p = s; p <= e; ++p) {1314if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND)1315continue;1316dev = ipoib_add_port("ib%d", device, p);1317if (!IS_ERR(dev)) {1318priv = netdev_priv(dev);1319list_add_tail(&priv->list, dev_list);1320}1321}13221323ib_set_client_data(device, &ipoib_client, dev_list);1324}13251326static void ipoib_remove_one(struct ib_device *device)1327{1328struct ipoib_dev_priv *priv, *tmp;1329struct list_head *dev_list;13301331if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)1332return;13331334dev_list = ib_get_client_data(device, &ipoib_client);13351336list_for_each_entry_safe(priv, 
tmp, dev_list, list) {1337ib_unregister_event_handler(&priv->event_handler);13381339rtnl_lock();1340dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP);1341rtnl_unlock();13421343flush_workqueue(ipoib_workqueue);13441345unregister_netdev(priv->dev);1346ipoib_dev_cleanup(priv->dev);1347free_netdev(priv->dev);1348}13491350kfree(dev_list);1351}13521353static int __init ipoib_init_module(void)1354{1355int ret;13561357ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);1358ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);1359ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);13601361ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);1362ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);1363ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE);1364#ifdef CONFIG_INFINIBAND_IPOIB_CM1365ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);1366#endif13671368/*1369* When copying small received packets, we only copy from the1370* linear data part of the SKB, so we rely on this condition.1371*/1372BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE);13731374ret = ipoib_register_debugfs();1375if (ret)1376return ret;13771378/*1379* We create our own workqueue mainly because we want to be1380* able to flush it when devices are being removed. 
We can't1381* use schedule_work()/flush_scheduled_work() because both1382* unregister_netdev() and linkwatch_event take the rtnl lock,1383* so flush_scheduled_work() can deadlock during device1384* removal.1385*/1386ipoib_workqueue = create_singlethread_workqueue("ipoib");1387if (!ipoib_workqueue) {1388ret = -ENOMEM;1389goto err_fs;1390}13911392ib_sa_register_client(&ipoib_sa_client);13931394ret = ib_register_client(&ipoib_client);1395if (ret)1396goto err_sa;13971398return 0;13991400err_sa:1401ib_sa_unregister_client(&ipoib_sa_client);1402destroy_workqueue(ipoib_workqueue);14031404err_fs:1405ipoib_unregister_debugfs();14061407return ret;1408}14091410static void __exit ipoib_cleanup_module(void)1411{1412ib_unregister_client(&ipoib_client);1413ib_sa_unregister_client(&ipoib_sa_client);1414ipoib_unregister_debugfs();1415destroy_workqueue(ipoib_workqueue);1416}14171418module_init(ipoib_init_module);1419module_exit(ipoib_cleanup_module);142014211422