Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
awilliam
GitHub Repository: awilliam/linux-vfio
Path: blob/master/drivers/misc/sgi-xp/xpnet.c
15111 views
1
/*
2
* This file is subject to the terms and conditions of the GNU General Public
3
* License. See the file "COPYING" in the main directory of this archive
4
* for more details.
5
*
6
* Copyright (C) 1999-2009 Silicon Graphics, Inc. All rights reserved.
7
*/
8
9
/*
10
* Cross Partition Network Interface (XPNET) support
11
*
12
* XPNET provides a virtual network layered on top of the Cross
13
* Partition communication layer.
14
*
15
* XPNET provides direct point-to-point and broadcast-like support
16
* for an ethernet-like device. The ethernet broadcast medium is
17
* replaced with a point-to-point message structure which passes
18
* pointers to a DMA-capable block that a remote partition should
19
* retrieve and pass to the upper level networking layer.
20
*
21
*/
22
23
#include <linux/slab.h>
24
#include <linux/module.h>
25
#include <linux/netdevice.h>
26
#include <linux/etherdevice.h>
27
#include "xp.h"
28
29
/*
30
* The message payload transferred by XPC.
31
*
32
* buf_pa is the physical address where the DMA should pull from.
33
*
34
* NOTE: for performance reasons, buf_pa should _ALWAYS_ begin on a
35
* cacheline boundary. To accomplish this, we record the number of
36
* bytes from the beginning of the first cacheline to the first useful
37
* byte of the skb (leadin_ignore) and the number of bytes from the
38
* last useful byte of the skb to the end of the last cacheline
39
* (tailout_ignore).
40
*
41
* size is the number of bytes to transfer which includes the skb->len
42
* (useful bytes of the senders skb) plus the leadin and tailout
43
*/
44
struct xpnet_message {
45
u16 version; /* Version for this message */
46
u16 embedded_bytes; /* #of bytes embedded in XPC message */
47
u32 magic; /* Special number indicating this is xpnet */
48
unsigned long buf_pa; /* phys address of buffer to retrieve */
49
u32 size; /* #of bytes in buffer */
50
u8 leadin_ignore; /* #of bytes to ignore at the beginning */
51
u8 tailout_ignore; /* #of bytes to ignore at the end */
52
unsigned char data; /* body of small packets */
53
};
54
55
/*
 * Sizing of the XPC channel: the size of one message payload, how much of
 * that payload remains for embedded packet data once the xpnet_message
 * header fields are accounted for, and the number of messages we will
 * request from XPC.
 *
 * XPC expects each message to exist in an individual cacheline.
 */
#define XPNET_MSG_SIZE		XPC_MSG_PAYLOAD_MAX_SIZE
#define XPNET_MSG_DATA_MAX	\
	(XPNET_MSG_SIZE - offsetof(struct xpnet_message, data))
#define XPNET_MSG_NENTRIES	(PAGE_SIZE / XPC_MSG_MAX_SIZE)

/* kthread limits handed to xpc_connect(): one more than the entry count */
#define XPNET_MAX_KTHREADS	(XPNET_MSG_NENTRIES + 1)
#define XPNET_MAX_IDLE_KTHREADS	(XPNET_MSG_NENTRIES + 1)
68
69
/*
 * Version number of the XPNET implementation.  The minor number occupies
 * the low four bits and the major number the bits above them.  XPNET can
 * always talk to versions with the same major #, and never talk to
 * versions with a different major #.
 */
#define _XPNET_VERSION(_major, _minor)	(((_major) << 4) | (_minor))
#define XPNET_VERSION_MAJOR(_v)		((_v) >> 4)
#define XPNET_VERSION_MINOR(_v)		((_v) & 0xf)

#define XPNET_VERSION		_XPNET_VERSION(1, 0)	/* version 1.0 */
#define XPNET_VERSION_EMBED	_XPNET_VERSION(1, 1)	/* version 1.1 */
#define XPNET_MAGIC		0x88786984		/* "XNET" */

/*
 * Evaluates to non-zero when the message pointed to by _m carries our
 * major version number and the xpnet magic number.
 *
 * Only the macro argument is referenced (it is evaluated twice), and it is
 * parenthesized, so any pointer expression may be passed regardless of
 * what the caller's variable happens to be called.
 */
#define XPNET_VALID_MSG(_m)						\
	((XPNET_VERSION_MAJOR((_m)->version) ==				\
	  XPNET_VERSION_MAJOR(XPNET_VERSION)) &&			\
	 ((_m)->magic == XPNET_MAGIC))

#define XPNET_DEVICE_NAME	"xp0"
86
87
/*
88
* When messages are queued with xpc_send_notify, a kmalloc'd buffer
89
* of the following type is passed as a notification cookie. When the
90
* notification function is called, we use the cookie to decide
91
* whether all outstanding message sends have completed. The skb can
92
* then be released.
93
*/
94
struct xpnet_pending_msg {
95
struct sk_buff *skb;
96
atomic_t use_count;
97
};
98
99
/* The single network device this driver registers. */
struct net_device *xpnet_device;

/*
 * Bitmask of the partitions to which we broadcast.  A partition's bit is
 * set when we are notified that it has connected and cleared again when
 * the connection goes away.
 */
static unsigned long *xpnet_broadcast_partitions;
/* serializes updates of the bitmask above */
static DEFINE_SPINLOCK(xpnet_broadcast_lock);
108
109
/*
 * Since the Block Transfer Engine (BTE) is being used for the transfer
 * and it relies upon cache-line size transfers, we need to reserve at
 * least one cache-line for head and tail alignment.  The BTE is
 * limited to 8MB transfers.
 *
 * Testing has shown that changing MTU to greater than 64KB has no effect
 * on TCP as the two sides negotiate a Max Segment Size that is limited
 * to 64K.  Other protocols may use packets greater than this.  The
 * default MTU defined below is 32KB.
 */
#define XPNET_MAX_MTU	(0x800000UL - L1_CACHE_BYTES)
/* 32KB has been determined to be the ideal */
#define XPNET_DEF_MTU	(0x8000UL)

/*
 * The partid is encapsulated in the MAC address.  It occupies two octets,
 * the first of which is the octet with the index below.
 */
#define XPNET_PARTID_OCTET	2
129
130
/* Define the XPNET debug device structures to be used with dev_dbg() et al */
131
132
struct device_driver xpnet_dbg_name = {
133
.name = "xpnet"
134
};
135
136
struct device xpnet_dbg_subname = {
137
.init_name = "", /* set to "" */
138
.driver = &xpnet_dbg_name
139
};
140
141
struct device *xpnet = &xpnet_dbg_subname;
142
143
/*
144
* Packet was recevied by XPC and forwarded to us.
145
*/
146
static void
147
xpnet_receive(short partid, int channel, struct xpnet_message *msg)
148
{
149
struct sk_buff *skb;
150
void *dst;
151
enum xp_retval ret;
152
153
if (!XPNET_VALID_MSG(msg)) {
154
/*
155
* Packet with a different XPC version. Ignore.
156
*/
157
xpc_received(partid, channel, (void *)msg);
158
159
xpnet_device->stats.rx_errors++;
160
161
return;
162
}
163
dev_dbg(xpnet, "received 0x%lx, %d, %d, %d\n", msg->buf_pa, msg->size,
164
msg->leadin_ignore, msg->tailout_ignore);
165
166
/* reserve an extra cache line */
167
skb = dev_alloc_skb(msg->size + L1_CACHE_BYTES);
168
if (!skb) {
169
dev_err(xpnet, "failed on dev_alloc_skb(%d)\n",
170
msg->size + L1_CACHE_BYTES);
171
172
xpc_received(partid, channel, (void *)msg);
173
174
xpnet_device->stats.rx_errors++;
175
176
return;
177
}
178
179
/*
180
* The allocated skb has some reserved space.
181
* In order to use xp_remote_memcpy(), we need to get the
182
* skb->data pointer moved forward.
183
*/
184
skb_reserve(skb, (L1_CACHE_BYTES - ((u64)skb->data &
185
(L1_CACHE_BYTES - 1)) +
186
msg->leadin_ignore));
187
188
/*
189
* Update the tail pointer to indicate data actually
190
* transferred.
191
*/
192
skb_put(skb, (msg->size - msg->leadin_ignore - msg->tailout_ignore));
193
194
/*
195
* Move the data over from the other side.
196
*/
197
if ((XPNET_VERSION_MINOR(msg->version) == 1) &&
198
(msg->embedded_bytes != 0)) {
199
dev_dbg(xpnet, "copying embedded message. memcpy(0x%p, 0x%p, "
200
"%lu)\n", skb->data, &msg->data,
201
(size_t)msg->embedded_bytes);
202
203
skb_copy_to_linear_data(skb, &msg->data,
204
(size_t)msg->embedded_bytes);
205
} else {
206
dst = (void *)((u64)skb->data & ~(L1_CACHE_BYTES - 1));
207
dev_dbg(xpnet, "transferring buffer to the skb->data area;\n\t"
208
"xp_remote_memcpy(0x%p, 0x%p, %hu)\n", dst,
209
(void *)msg->buf_pa, msg->size);
210
211
ret = xp_remote_memcpy(xp_pa(dst), msg->buf_pa, msg->size);
212
if (ret != xpSuccess) {
213
/*
214
* !!! Need better way of cleaning skb. Currently skb
215
* !!! appears in_use and we can't just call
216
* !!! dev_kfree_skb.
217
*/
218
dev_err(xpnet, "xp_remote_memcpy(0x%p, 0x%p, 0x%hx) "
219
"returned error=0x%x\n", dst,
220
(void *)msg->buf_pa, msg->size, ret);
221
222
xpc_received(partid, channel, (void *)msg);
223
224
xpnet_device->stats.rx_errors++;
225
226
return;
227
}
228
}
229
230
dev_dbg(xpnet, "<skb->head=0x%p skb->data=0x%p skb->tail=0x%p "
231
"skb->end=0x%p skb->len=%d\n", (void *)skb->head,
232
(void *)skb->data, skb_tail_pointer(skb), skb_end_pointer(skb),
233
skb->len);
234
235
skb->protocol = eth_type_trans(skb, xpnet_device);
236
skb->ip_summed = CHECKSUM_UNNECESSARY;
237
238
dev_dbg(xpnet, "passing skb to network layer\n"
239
"\tskb->head=0x%p skb->data=0x%p skb->tail=0x%p "
240
"skb->end=0x%p skb->len=%d\n",
241
(void *)skb->head, (void *)skb->data, skb_tail_pointer(skb),
242
skb_end_pointer(skb), skb->len);
243
244
xpnet_device->stats.rx_packets++;
245
xpnet_device->stats.rx_bytes += skb->len + ETH_HLEN;
246
247
netif_rx_ni(skb);
248
xpc_received(partid, channel, (void *)msg);
249
}
250
251
/*
252
* This is the handler which XPC calls during any sort of change in
253
* state or message reception on a connection.
254
*/
255
static void
256
xpnet_connection_activity(enum xp_retval reason, short partid, int channel,
257
void *data, void *key)
258
{
259
DBUG_ON(partid < 0 || partid >= xp_max_npartitions);
260
DBUG_ON(channel != XPC_NET_CHANNEL);
261
262
switch (reason) {
263
case xpMsgReceived: /* message received */
264
DBUG_ON(data == NULL);
265
266
xpnet_receive(partid, channel, (struct xpnet_message *)data);
267
break;
268
269
case xpConnected: /* connection completed to a partition */
270
spin_lock_bh(&xpnet_broadcast_lock);
271
__set_bit(partid, xpnet_broadcast_partitions);
272
spin_unlock_bh(&xpnet_broadcast_lock);
273
274
netif_carrier_on(xpnet_device);
275
276
dev_dbg(xpnet, "%s connected to partition %d\n",
277
xpnet_device->name, partid);
278
break;
279
280
default:
281
spin_lock_bh(&xpnet_broadcast_lock);
282
__clear_bit(partid, xpnet_broadcast_partitions);
283
spin_unlock_bh(&xpnet_broadcast_lock);
284
285
if (bitmap_empty((unsigned long *)xpnet_broadcast_partitions,
286
xp_max_npartitions)) {
287
netif_carrier_off(xpnet_device);
288
}
289
290
dev_dbg(xpnet, "%s disconnected from partition %d\n",
291
xpnet_device->name, partid);
292
break;
293
}
294
}
295
296
static int
297
xpnet_dev_open(struct net_device *dev)
298
{
299
enum xp_retval ret;
300
301
dev_dbg(xpnet, "calling xpc_connect(%d, 0x%p, NULL, %ld, %ld, %ld, "
302
"%ld)\n", XPC_NET_CHANNEL, xpnet_connection_activity,
303
(unsigned long)XPNET_MSG_SIZE,
304
(unsigned long)XPNET_MSG_NENTRIES,
305
(unsigned long)XPNET_MAX_KTHREADS,
306
(unsigned long)XPNET_MAX_IDLE_KTHREADS);
307
308
ret = xpc_connect(XPC_NET_CHANNEL, xpnet_connection_activity, NULL,
309
XPNET_MSG_SIZE, XPNET_MSG_NENTRIES,
310
XPNET_MAX_KTHREADS, XPNET_MAX_IDLE_KTHREADS);
311
if (ret != xpSuccess) {
312
dev_err(xpnet, "ifconfig up of %s failed on XPC connect, "
313
"ret=%d\n", dev->name, ret);
314
315
return -ENOMEM;
316
}
317
318
dev_dbg(xpnet, "ifconfig up of %s; XPC connected\n", dev->name);
319
320
return 0;
321
}
322
323
static int
324
xpnet_dev_stop(struct net_device *dev)
325
{
326
xpc_disconnect(XPC_NET_CHANNEL);
327
328
dev_dbg(xpnet, "ifconfig down of %s; XPC disconnected\n", dev->name);
329
330
return 0;
331
}
332
333
static int
334
xpnet_dev_change_mtu(struct net_device *dev, int new_mtu)
335
{
336
/* 68 comes from min TCP+IP+MAC header */
337
if ((new_mtu < 68) || (new_mtu > XPNET_MAX_MTU)) {
338
dev_err(xpnet, "ifconfig %s mtu %d failed; value must be "
339
"between 68 and %ld\n", dev->name, new_mtu,
340
XPNET_MAX_MTU);
341
return -EINVAL;
342
}
343
344
dev->mtu = new_mtu;
345
dev_dbg(xpnet, "ifconfig %s mtu set to %d\n", dev->name, new_mtu);
346
return 0;
347
}
348
349
/*
350
* Notification that the other end has received the message and
351
* DMA'd the skb information. At this point, they are done with
352
* our side. When all recipients are done processing, we
353
* release the skb and then release our pending message structure.
354
*/
355
static void
356
xpnet_send_completed(enum xp_retval reason, short partid, int channel,
357
void *__qm)
358
{
359
struct xpnet_pending_msg *queued_msg = (struct xpnet_pending_msg *)__qm;
360
361
DBUG_ON(queued_msg == NULL);
362
363
dev_dbg(xpnet, "message to %d notified with reason %d\n",
364
partid, reason);
365
366
if (atomic_dec_return(&queued_msg->use_count) == 0) {
367
dev_dbg(xpnet, "all acks for skb->head=-x%p\n",
368
(void *)queued_msg->skb->head);
369
370
dev_kfree_skb_any(queued_msg->skb);
371
kfree(queued_msg);
372
}
373
}
374
375
static void
376
xpnet_send(struct sk_buff *skb, struct xpnet_pending_msg *queued_msg,
377
u64 start_addr, u64 end_addr, u16 embedded_bytes, int dest_partid)
378
{
379
u8 msg_buffer[XPNET_MSG_SIZE];
380
struct xpnet_message *msg = (struct xpnet_message *)&msg_buffer;
381
u16 msg_size = sizeof(struct xpnet_message);
382
enum xp_retval ret;
383
384
msg->embedded_bytes = embedded_bytes;
385
if (unlikely(embedded_bytes != 0)) {
386
msg->version = XPNET_VERSION_EMBED;
387
dev_dbg(xpnet, "calling memcpy(0x%p, 0x%p, 0x%lx)\n",
388
&msg->data, skb->data, (size_t)embedded_bytes);
389
skb_copy_from_linear_data(skb, &msg->data,
390
(size_t)embedded_bytes);
391
msg_size += embedded_bytes - 1;
392
} else {
393
msg->version = XPNET_VERSION;
394
}
395
msg->magic = XPNET_MAGIC;
396
msg->size = end_addr - start_addr;
397
msg->leadin_ignore = (u64)skb->data - start_addr;
398
msg->tailout_ignore = end_addr - (u64)skb_tail_pointer(skb);
399
msg->buf_pa = xp_pa((void *)start_addr);
400
401
dev_dbg(xpnet, "sending XPC message to %d:%d\n"
402
"msg->buf_pa=0x%lx, msg->size=%u, "
403
"msg->leadin_ignore=%u, msg->tailout_ignore=%u\n",
404
dest_partid, XPC_NET_CHANNEL, msg->buf_pa, msg->size,
405
msg->leadin_ignore, msg->tailout_ignore);
406
407
atomic_inc(&queued_msg->use_count);
408
409
ret = xpc_send_notify(dest_partid, XPC_NET_CHANNEL, XPC_NOWAIT, msg,
410
msg_size, xpnet_send_completed, queued_msg);
411
if (unlikely(ret != xpSuccess))
412
atomic_dec(&queued_msg->use_count);
413
}
414
415
/*
416
* Network layer has formatted a packet (skb) and is ready to place it
417
* "on the wire". Prepare and send an xpnet_message to all partitions
418
* which have connected with us and are targets of this packet.
419
*
420
* MAC-NOTE: For the XPNET driver, the MAC address contains the
421
* destination partid. If the destination partid octets are 0xffff,
422
* this packet is to be broadcast to all connected partitions.
423
*/
424
static int
425
xpnet_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
426
{
427
struct xpnet_pending_msg *queued_msg;
428
u64 start_addr, end_addr;
429
short dest_partid;
430
u16 embedded_bytes = 0;
431
432
dev_dbg(xpnet, ">skb->head=0x%p skb->data=0x%p skb->tail=0x%p "
433
"skb->end=0x%p skb->len=%d\n", (void *)skb->head,
434
(void *)skb->data, skb_tail_pointer(skb), skb_end_pointer(skb),
435
skb->len);
436
437
if (skb->data[0] == 0x33) {
438
dev_kfree_skb(skb);
439
return NETDEV_TX_OK; /* nothing needed to be done */
440
}
441
442
/*
443
* The xpnet_pending_msg tracks how many outstanding
444
* xpc_send_notifies are relying on this skb. When none
445
* remain, release the skb.
446
*/
447
queued_msg = kmalloc(sizeof(struct xpnet_pending_msg), GFP_ATOMIC);
448
if (queued_msg == NULL) {
449
dev_warn(xpnet, "failed to kmalloc %ld bytes; dropping "
450
"packet\n", sizeof(struct xpnet_pending_msg));
451
452
dev->stats.tx_errors++;
453
dev_kfree_skb(skb);
454
return NETDEV_TX_OK;
455
}
456
457
/* get the beginning of the first cacheline and end of last */
458
start_addr = ((u64)skb->data & ~(L1_CACHE_BYTES - 1));
459
end_addr = L1_CACHE_ALIGN((u64)skb_tail_pointer(skb));
460
461
/* calculate how many bytes to embed in the XPC message */
462
if (unlikely(skb->len <= XPNET_MSG_DATA_MAX)) {
463
/* skb->data does fit so embed */
464
embedded_bytes = skb->len;
465
}
466
467
/*
468
* Since the send occurs asynchronously, we set the count to one
469
* and begin sending. Any sends that happen to complete before
470
* we are done sending will not free the skb. We will be left
471
* with that task during exit. This also handles the case of
472
* a packet destined for a partition which is no longer up.
473
*/
474
atomic_set(&queued_msg->use_count, 1);
475
queued_msg->skb = skb;
476
477
if (skb->data[0] == 0xff) {
478
/* we are being asked to broadcast to all partitions */
479
for_each_set_bit(dest_partid, xpnet_broadcast_partitions,
480
xp_max_npartitions) {
481
482
xpnet_send(skb, queued_msg, start_addr, end_addr,
483
embedded_bytes, dest_partid);
484
}
485
} else {
486
dest_partid = (short)skb->data[XPNET_PARTID_OCTET + 1];
487
dest_partid |= (short)skb->data[XPNET_PARTID_OCTET + 0] << 8;
488
489
if (dest_partid >= 0 &&
490
dest_partid < xp_max_npartitions &&
491
test_bit(dest_partid, xpnet_broadcast_partitions) != 0) {
492
493
xpnet_send(skb, queued_msg, start_addr, end_addr,
494
embedded_bytes, dest_partid);
495
}
496
}
497
498
dev->stats.tx_packets++;
499
dev->stats.tx_bytes += skb->len;
500
501
if (atomic_dec_return(&queued_msg->use_count) == 0) {
502
dev_kfree_skb(skb);
503
kfree(queued_msg);
504
}
505
506
return NETDEV_TX_OK;
507
}
508
509
/*
510
* Deal with transmit timeouts coming from the network layer.
511
*/
512
static void
513
xpnet_dev_tx_timeout(struct net_device *dev)
514
{
515
dev->stats.tx_errors++;
516
}
517
518
static const struct net_device_ops xpnet_netdev_ops = {
519
.ndo_open = xpnet_dev_open,
520
.ndo_stop = xpnet_dev_stop,
521
.ndo_start_xmit = xpnet_dev_hard_start_xmit,
522
.ndo_change_mtu = xpnet_dev_change_mtu,
523
.ndo_tx_timeout = xpnet_dev_tx_timeout,
524
.ndo_set_mac_address = eth_mac_addr,
525
.ndo_validate_addr = eth_validate_addr,
526
};
527
528
static int __init
529
xpnet_init(void)
530
{
531
int result;
532
533
if (!is_shub() && !is_uv())
534
return -ENODEV;
535
536
dev_info(xpnet, "registering network device %s\n", XPNET_DEVICE_NAME);
537
538
xpnet_broadcast_partitions = kzalloc(BITS_TO_LONGS(xp_max_npartitions) *
539
sizeof(long), GFP_KERNEL);
540
if (xpnet_broadcast_partitions == NULL)
541
return -ENOMEM;
542
543
/*
544
* use ether_setup() to init the majority of our device
545
* structure and then override the necessary pieces.
546
*/
547
xpnet_device = alloc_netdev(0, XPNET_DEVICE_NAME, ether_setup);
548
if (xpnet_device == NULL) {
549
kfree(xpnet_broadcast_partitions);
550
return -ENOMEM;
551
}
552
553
netif_carrier_off(xpnet_device);
554
555
xpnet_device->netdev_ops = &xpnet_netdev_ops;
556
xpnet_device->mtu = XPNET_DEF_MTU;
557
558
/*
559
* Multicast assumes the LSB of the first octet is set for multicast
560
* MAC addresses. We chose the first octet of the MAC to be unlikely
561
* to collide with any vendor's officially issued MAC.
562
*/
563
xpnet_device->dev_addr[0] = 0x02; /* locally administered, no OUI */
564
565
xpnet_device->dev_addr[XPNET_PARTID_OCTET + 1] = xp_partition_id;
566
xpnet_device->dev_addr[XPNET_PARTID_OCTET + 0] = (xp_partition_id >> 8);
567
568
/*
569
* ether_setup() sets this to a multicast device. We are
570
* really not supporting multicast at this time.
571
*/
572
xpnet_device->flags &= ~IFF_MULTICAST;
573
574
/*
575
* No need to checksum as it is a DMA transfer. The BTE will
576
* report an error if the data is not retrievable and the
577
* packet will be dropped.
578
*/
579
xpnet_device->features = NETIF_F_NO_CSUM;
580
581
result = register_netdev(xpnet_device);
582
if (result != 0) {
583
free_netdev(xpnet_device);
584
kfree(xpnet_broadcast_partitions);
585
}
586
587
return result;
588
}
589
590
module_init(xpnet_init);
591
592
static void __exit
593
xpnet_exit(void)
594
{
595
dev_info(xpnet, "unregistering network device %s\n",
596
xpnet_device[0].name);
597
598
unregister_netdev(xpnet_device);
599
free_netdev(xpnet_device);
600
kfree(xpnet_broadcast_partitions);
601
}
602
603
module_exit(xpnet_exit);
604
605
MODULE_AUTHOR("Silicon Graphics, Inc.");
606
MODULE_DESCRIPTION("Cross Partition Network adapter (XPNET)");
607
MODULE_LICENSE("GPL");
608
609