GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2004 The FreeBSD Foundation.  All rights reserved.
 * Copyright (c) 2004-2008 Robert N. M. Watson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c
 */

/*
 *
 * Copyright (c) 2010 Isilon Systems, Inc.
 * Copyright (c) 2010 iX Systems, Inc.
 * Copyright (c) 2010 Panasas, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <sys/param.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/malloc.h>

#include "sdp.h"

#include <net/if.h>
#include <net/route.h>
#include <net/vnet.h>
#include <sys/sysctl.h>

uma_zone_t sdp_zone;
struct rwlock sdp_lock;
LIST_HEAD(, sdp_sock) sdp_list;

struct workqueue_struct *rx_comp_wq;

RW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock");
#define	SDP_LIST_WLOCK() rw_wlock(&sdp_lock)
#define	SDP_LIST_RLOCK() rw_rlock(&sdp_lock)
#define	SDP_LIST_WUNLOCK() rw_wunlock(&sdp_lock)
#define	SDP_LIST_RUNLOCK() rw_runlock(&sdp_lock)
#define	SDP_LIST_WLOCK_ASSERT() rw_assert(&sdp_lock, RW_WLOCKED)
#define	SDP_LIST_RLOCK_ASSERT() rw_assert(&sdp_lock, RW_RLOCKED)
#define	SDP_LIST_LOCK_ASSERT() rw_assert(&sdp_lock, RW_LOCKED)

MALLOC_DEFINE(M_SDP, "sdp", "Sockets Direct Protocol");

static void sdp_stop_keepalive_timer(struct socket *so);

/*
 * SDP protocol interface to socket abstraction.
 */
/*
 * sdp_sendspace and sdp_recvspace are the default send and receive window
 * sizes, respectively.
 */
u_long	sdp_sendspace = 1024*32;
u_long	sdp_recvspace = 1024*64;

static int	sdp_count;

/*
 * Disable async. CMA events for sockets which are being torn down.
 */
static void
sdp_destroy_cma(struct sdp_sock *ssk)
{

	if (ssk->id == NULL)
		return;
	rdma_destroy_id(ssk->id);
	ssk->id = NULL;
}

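/*
 * Bind the local IPv4 address and port through the RDMA CM, creating
 * the CM id on first use.  Called with the pcb write lock held; the
 * lock is dropped around rdma_bind_addr().
 */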
static int
sdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred)
{
	struct sockaddr_in *sin;
	struct sockaddr_in null;
	int error;

	SDP_WLOCK_ASSERT(ssk);

	if (ssk->lport != 0 || ssk->laddr != INADDR_ANY)
		return (EINVAL);
	/* rdma_bind_addr handles bind races.  */
	SDP_WUNLOCK(ssk);
	if (ssk->id == NULL)
		ssk->id = rdma_create_id(&init_net, sdp_cma_handler, ssk, RDMA_PS_SDP, IB_QPT_RC);
	if (ssk->id == NULL) {
		SDP_WLOCK(ssk);
		return (ENOMEM);
	}
	if (nam == NULL) {
		null.sin_family = AF_INET;
		null.sin_len = sizeof(null);
		null.sin_addr.s_addr = INADDR_ANY;
		null.sin_port = 0;
		bzero(&null.sin_zero, sizeof(null.sin_zero));
		nam = (struct sockaddr *)&null;
	}
	error = -rdma_bind_addr(ssk->id, nam);
	SDP_WLOCK(ssk);
	if (error == 0) {
		sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr;
		ssk->laddr = sin->sin_addr.s_addr;
		ssk->lport = sin->sin_port;
	} else
		sdp_destroy_cma(ssk);
	return (error);
}

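/*
 * Free a pcb that is no longer referenced: unlink it from the global
 * list and release its QP, rings, CM id and locks.
 */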
static void
sdp_pcbfree(struct sdp_sock *ssk)
{

	KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk));
	KASSERT((ssk->flags & SDP_DESTROY) == 0,
	    ("ssk %p already destroyed", ssk));

	sdp_dbg(ssk->socket, "Freeing pcb");
	SDP_WLOCK_ASSERT(ssk);
	ssk->flags |= SDP_DESTROY;
	SDP_WUNLOCK(ssk);
	SDP_LIST_WLOCK();
	sdp_count--;
	LIST_REMOVE(ssk, list);
	SDP_LIST_WUNLOCK();
	crfree(ssk->cred);
	ssk->qp_active = 0;
	if (ssk->qp) {
		ib_destroy_qp(ssk->qp);
		ssk->qp = NULL;
	}
	sdp_tx_ring_destroy(ssk);
	sdp_rx_ring_destroy(ssk);
	sdp_destroy_cma(ssk);
	rw_destroy(&ssk->rx_ring.destroyed_lock);
	rw_destroy(&ssk->lock);
	uma_zfree(sdp_zone, ssk);
}

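/* Return the locally bound address and port of the socket. */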
static int
sdp_getsockaddr(struct socket *so, struct sockaddr *sa)
{
	struct sdp_sock *ssk = sdp_sk(so);

	SDP_RLOCK(ssk);
	*(struct sockaddr_in *)sa = (struct sockaddr_in ){
		.sin_family = AF_INET,
		.sin_len = sizeof(struct sockaddr_in),
		.sin_addr.s_addr = ssk->laddr,
		.sin_port = ssk->lport,
	};
	SDP_RUNLOCK(ssk);

	return (0);
}

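/* Return the address and port of the connected peer. */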
static int
sdp_getpeeraddr(struct socket *so, struct sockaddr *sa)
{
	struct sdp_sock *ssk = sdp_sk(so);

	SDP_RLOCK(ssk);
	*(struct sockaddr_in *)sa = (struct sockaddr_in ){
		.sin_family = AF_INET,
		.sin_len = sizeof(struct sockaddr_in),
		.sin_addr.s_addr = ssk->faddr,
		.sin_port = ssk->fport,
	};
	SDP_RUNLOCK(ssk);

	return (0);
}

#if 0
static void
sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg)
{
	struct sdp_sock *ssk;

	SDP_LIST_RLOCK();
	LIST_FOREACH(ssk, &sdp_list, list) {
		SDP_WLOCK(ssk);
		func(ssk, arg);
		SDP_WUNLOCK(ssk);
	}
	SDP_LIST_RUNLOCK();
}
#endif

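/*
 * Issue an RDMA disconnect on the connection's CM id, if any, and
 * mark the pcb closed.
 */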
static void
sdp_output_reset(struct sdp_sock *ssk)
{
	struct rdma_cm_id *id;

	SDP_WLOCK_ASSERT(ssk);
	if (ssk->id) {
		id = ssk->id;
		ssk->qp_active = 0;
		SDP_WUNLOCK(ssk);
		rdma_disconnect(id);
		SDP_WLOCK(ssk);
	}
	ssk->state = TCPS_CLOSED;
}

/*
 * Attempt to close an SDP socket, marking it as dropped, and freeing
 * the socket if we hold the only reference.
 */
static struct sdp_sock *
sdp_closed(struct sdp_sock *ssk)
{
	struct socket *so;

	SDP_WLOCK_ASSERT(ssk);

	ssk->flags |= SDP_DROPPED;
	so = ssk->socket;
	soisdisconnected(so);
	if (ssk->flags & SDP_SOCKREF) {
		ssk->flags &= ~SDP_SOCKREF;
		SDP_WUNLOCK(ssk);
		sorele(so);
		return (NULL);
	}
	return (ssk);
}

/*
 * Perform timer-based shutdowns which cannot operate in
 * callout context.
 */
static void
sdp_shutdown_task(void *data, int pending)
{
	struct sdp_sock *ssk;

	ssk = data;
	SDP_WLOCK(ssk);
	/*
	 * I don't think this can race with another call to pcbfree()
	 * because SDP_TIMEWAIT protects it.  SDP_DESTROY may be redundant.
	 */
	if (ssk->flags & SDP_DESTROY)
		panic("sdp_shutdown_task: Racing with pcbfree for ssk %p",
		    ssk);
	if (ssk->flags & SDP_DISCON)
		sdp_output_reset(ssk);
	/* We have to clear this so sdp_detach() will call pcbfree(). */
	ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT);
	if ((ssk->flags & SDP_DROPPED) == 0 &&
	    sdp_closed(ssk) == NULL)
		return;
	if (ssk->socket == NULL) {
		sdp_pcbfree(ssk);
		return;
	}
	SDP_WUNLOCK(ssk);
}

/*
 * 2msl has expired, schedule the shutdown task.
 */
static void
sdp_2msl_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	/* Should be impossible, defensive programming. */
	if ((ssk->flags & SDP_TIMEWAIT) == 0)
		goto out;
	taskqueue_enqueue(taskqueue_thread, &ssk->shutdown_task);
out:
	SDP_WUNLOCK(ssk);
	return;
}

/*
 * Schedule the 2msl wait timer.
 */
static void
sdp_2msl_wait(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);
	ssk->flags |= SDP_TIMEWAIT;
	ssk->state = TCPS_TIME_WAIT;
	soisdisconnected(ssk->socket);
	callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk);
}

/*
 * Timed out waiting for the final fin/ack from rdma_disconnect().
 */
static void
sdp_dreq_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		goto out;
	/* Callout rescheduled, probably as a different timer. */
	if (callout_pending(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	if (ssk->state != TCPS_FIN_WAIT_1 && ssk->state != TCPS_LAST_ACK)
		goto out;
	if ((ssk->flags & SDP_DREQWAIT) == 0)
		goto out;
	ssk->flags &= ~SDP_DREQWAIT;
	ssk->flags |= SDP_DISCON;
	sdp_2msl_wait(ssk);
	ssk->qp_active = 0;
out:
	SDP_WUNLOCK(ssk);
}

/*
 * Received the final fin/ack.  Cancel the 2msl.
 */
void
sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk)
{
	sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n");
	ssk->flags &= ~SDP_DREQWAIT;
	sdp_2msl_wait(ssk);
}

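/*
 * Initialize the SDP-specific state of a newly attached pcb: the
 * 2msl/keepalive callout, the shutdown task and the receive ring.
 */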
static int
sdp_init_sock(struct socket *sk)
{
	struct sdp_sock *ssk = sdp_sk(sk);

	sdp_dbg(sk, "%s\n", __func__);

	callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED);
	TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk);
#ifdef SDP_ZCOPY
	INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout);
	ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */
	ssk->tx_ring.rdma_inflight = NULL;
#endif
	atomic_set(&ssk->mseq_ack, 0);
	sdp_rx_ring_init(ssk);
	ssk->tx_ring.buffer = NULL;

	return 0;
}

/*
 * Allocate an sdp_sock for the socket and reserve socket buffer space.
 */
static int
sdp_attach(struct socket *so, int proto, struct thread *td)
{
	struct sdp_sock *ssk;
	int error;

	ssk = sdp_sk(so);
	KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so));
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
		error = soreserve(so, sdp_sendspace, sdp_recvspace);
		if (error)
			return (error);
	}
	so->so_rcv.sb_flags |= SB_AUTOSIZE;
	so->so_snd.sb_flags |= SB_AUTOSIZE;
	ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO);
	if (ssk == NULL)
		return (ENOBUFS);
	rw_init(&ssk->lock, "sdpsock");
	ssk->socket = so;
	ssk->cred = crhold(so->so_cred);
	so->so_pcb = (caddr_t)ssk;
	sdp_init_sock(so);
	ssk->flags = 0;
	ssk->qp_active = 0;
	ssk->state = TCPS_CLOSED;
	mbufq_init(&ssk->rxctlq, INT_MAX);
	SDP_LIST_WLOCK();
	LIST_INSERT_HEAD(&sdp_list, ssk, list);
	sdp_count++;
	SDP_LIST_WUNLOCK();

	return (0);
}

/*
 * Detach SDP from the socket, potentially leaving it around for the
 * timewait to expire.
 */
static void
sdp_detach(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL"));
	ssk->socket->so_pcb = NULL;
	ssk->socket = NULL;
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT))
		SDP_WUNLOCK(ssk);
	else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT)
		sdp_pcbfree(ssk);
	else
		panic("sdp_detach: Unexpected state, ssk %p.\n", ssk);
}

/*
 * Allocate a local address for the socket.
 */
static int
sdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;
	struct sockaddr_in *sin;

	sin = (struct sockaddr_in *)nam;
	if (sin->sin_family != AF_INET)
		return (EAFNOSUPPORT);
	if (nam->sa_len != sizeof(*sin))
		return (EINVAL);
	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
		return (EAFNOSUPPORT);

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = EINVAL;
		goto out;
	}
	error = sdp_pcbbind(ssk, nam, td->td_ucred);
out:
	SDP_WUNLOCK(ssk);

	return (error);
}

/*
 * Prepare to accept connections.
 */
static int
sdp_listen(struct socket *so, int backlog, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = EINVAL;
		goto out;
	}
	if (error == 0 && ssk->lport == 0)
		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
	SOCK_LOCK(so);
	if (error == 0)
		error = solisten_proto_check(so);
	if (error == 0) {
		solisten_proto(so, backlog);
		ssk->state = TCPS_LISTEN;
	}
	SOCK_UNLOCK(so);

out:
	SDP_WUNLOCK(ssk);
	if (error == 0)
		error = -rdma_listen(ssk->id, backlog);
	return (error);
}

/*
 * Initiate an SDP connection to nam.
 */
static int
sdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td)
{
	struct sockaddr_in src;
	struct socket *so;
	int error;

	so = ssk->socket;

	SDP_WLOCK_ASSERT(ssk);
	if (ssk->lport == 0) {
		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
		if (error)
			return error;
	}
	src.sin_family = AF_INET;
	src.sin_len = sizeof(src);
	bzero(&src.sin_zero, sizeof(src.sin_zero));
	src.sin_port = ssk->lport;
	src.sin_addr.s_addr = ssk->laddr;
	soisconnecting(so);
	SDP_WUNLOCK(ssk);
	error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam,
	    SDP_RESOLVE_TIMEOUT);
	SDP_WLOCK(ssk);
	if (error == 0)
		ssk->state = TCPS_SYN_SENT;

	return 0;
}

/*
 * Initiate SDP connection.
 */
static int
sdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;
	struct sockaddr_in *sin;

	sin = (struct sockaddr_in *)nam;
	if (nam->sa_len != sizeof(*sin))
		return (EINVAL);
	if (sin->sin_family != AF_INET)
		return (EAFNOSUPPORT);
	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
		return (EAFNOSUPPORT);
	if ((error = prison_remote_ip4(td->td_ucred, &sin->sin_addr)) != 0)
		return (error);
	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED))
		error = EINVAL;
	else
		error = sdp_start_connect(ssk, nam, td);
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Drop an SDP socket, reporting the specified error.  If the
 * connection is synchronized, then send a RST to the peer.
 */
static struct sdp_sock *
sdp_drop(struct sdp_sock *ssk, int errno)
{
	struct socket *so;

	SDP_WLOCK_ASSERT(ssk);
	so = ssk->socket;
	if (TCPS_HAVERCVDSYN(ssk->state))
		sdp_output_reset(ssk);
	if (errno == ETIMEDOUT && ssk->softerror)
		errno = ssk->softerror;
	so->so_error = errno;
	return (sdp_closed(ssk));
}

/*
 * User issued close, and wish to trail through shutdown states:
 * if never received SYN, just forget it.  If got a SYN from peer,
 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 * If already got a FIN from peer, then almost done; go to LAST_ACK
 * state.  In all other cases, have already sent FIN to peer (e.g.
 * after PRU_SHUTDOWN), and just have to play tedious game waiting
 * for peer to send FIN or not respond to keep-alives, etc.
 * We can let the user exit from the close as soon as the FIN is acked.
 */
static void
sdp_usrclosed(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);

	switch (ssk->state) {
	case TCPS_LISTEN:
		ssk->state = TCPS_CLOSED;
		SDP_WUNLOCK(ssk);
		sdp_destroy_cma(ssk);
		SDP_WLOCK(ssk);
		/* FALLTHROUGH */
	case TCPS_CLOSED:
		ssk = sdp_closed(ssk);
		/*
		 * sdp_closed() should never return NULL here as the socket is
		 * still open.
		 */
		KASSERT(ssk != NULL,
		    ("sdp_usrclosed: sdp_closed() returned NULL"));
		break;

	case TCPS_SYN_SENT:
		/* FALLTHROUGH */
	case TCPS_SYN_RECEIVED:
		ssk->flags |= SDP_NEEDFIN;
		break;

	case TCPS_ESTABLISHED:
		ssk->flags |= SDP_NEEDFIN;
		ssk->state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		ssk->state = TCPS_LAST_ACK;
		break;
	}
	if (ssk->state >= TCPS_FIN_WAIT_2) {
		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
		if (ssk->state == TCPS_FIN_WAIT_2)
			sdp_2msl_wait(ssk);
		else
			soisdisconnected(ssk->socket);
	}
}

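/*
 * Start an orderly disconnect: mark the pcb as needing a FIN/disconnect
 * request, post it, and arm the timer that catches a peer which never
 * answers.
 */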
static void
sdp_output_disconnect(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);
	callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT,
	    sdp_dreq_timeout, ssk);
	ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT;
	sdp_post_sends(ssk, M_NOWAIT);
}

/*
 * Initiate or continue a disconnect.
 * If embryonic state, just send reset (once).
 * If in ``let data drain'' option and linger null, just drop.
 * Otherwise (hard), mark socket disconnecting and drop
 * current input data; switch states based on user close, and
 * send segment to peer (with FIN).
 */
static void
sdp_start_disconnect(struct sdp_sock *ssk)
{
	struct socket *so;
	int unread;

	so = ssk->socket;
	SDP_WLOCK_ASSERT(ssk);
	sdp_stop_keepalive_timer(so);
	/*
	 * Neither sdp_closed() nor sdp_drop() should return NULL, as the
	 * socket is still open.
	 */
	if (ssk->state < TCPS_ESTABLISHED) {
		ssk = sdp_closed(ssk);
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_close() returned NULL"));
	} else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
		ssk = sdp_drop(ssk, 0);
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_drop() returned NULL"));
	} else {
		soisdisconnecting(so);
		unread = sbused(&so->so_rcv);
		sbflush(&so->so_rcv);
		sdp_usrclosed(ssk);
		if (!(ssk->flags & SDP_DROPPED)) {
			if (unread)
				sdp_output_reset(ssk);
			else
				sdp_output_disconnect(ssk);
		}
	}
}

/*
 * User initiated disconnect.
 */
static int
sdp_disconnect(struct socket *so)
{
	struct sdp_sock *ssk;
	int error = 0;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	sdp_start_disconnect(ssk);
out:
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Accept a connection.  Essentially all the work is done at higher levels;
 * just return the address of the peer, storing through addr.
 *
 *
 * XXX This is broken XXX
 *
 * The rationale for acquiring the sdp lock here is somewhat complicated,
 * and is described in detail in the commit log entry for r175612.  Acquiring
 * it delays an accept(2) racing with sonewconn(), which inserts the socket
 * before the address/port fields are initialized.  A better fix would
 * prevent the socket from being placed in the listen queue until all fields
 * are fully initialized.
 */
static int
sdp_accept(struct socket *so, struct sockaddr *sa)
{
	struct sdp_sock *ssk = NULL;
	int error;

	if (so->so_state & SS_ISDISCONNECTED)
		return (ECONNABORTED);

	error = 0;
	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED))
		error = ECONNABORTED;
	else
		*(struct sockaddr_in *)sa = (struct sockaddr_in ){
			.sin_family = AF_INET,
			.sin_len = sizeof(struct sockaddr_in),
			.sin_addr.s_addr = ssk->faddr,
			.sin_port = ssk->fport,
		};
	SDP_WUNLOCK(ssk);

	return (error);
}

/*
 * Mark the connection as being incapable of further output.
 */
static int
sdp_shutdown(struct socket *so, enum shutdown_how how)
{
	struct sdp_sock *ssk = sdp_sk(so);
	int error = 0;

	SOCK_LOCK(so);
	if ((so->so_state &
	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
		SOCK_UNLOCK(so);
		return (ENOTCONN);
	}
	if (SOLISTENING(so)) {
		if (how != SHUT_WR) {
			so->so_error = ECONNABORTED;
			solisten_wakeup(so);	/* unlocks so */
		} else
			SOCK_UNLOCK(so);
		return (0);
	}
	SOCK_UNLOCK(so);

	switch (how) {
	case SHUT_RD:
		socantrcvmore(so);
		sbrelease(so, SO_RCV);
		break;
	case SHUT_RDWR:
		socantrcvmore(so);
		sbrelease(so, SO_RCV);
		/* FALLTHROUGH */
	case SHUT_WR:
		SDP_WLOCK(ssk);
		if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
			SDP_WUNLOCK(ssk);
			error = ECONNRESET;
			break;
		}
		socantsendmore(so);
		sdp_usrclosed(ssk);
		if (!(ssk->flags & SDP_DROPPED))
			sdp_output_disconnect(ssk);
		SDP_WUNLOCK(ssk);
	}
	wakeup(&so->so_timeo);

	return (error);
}

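/*
 * Append an mbuf chain that already carries an SDP header to the send
 * socket buffer, coalescing it with the previous record when both fit
 * into a single SDP packet.
 */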
static void
sdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt)
{
	struct mbuf *n;
	int ncnt;

	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	KASSERT(mb->m_flags & M_PKTHDR,
	    ("sdp_append: %p Missing packet header.\n", mb));
	n = sb->sb_lastrecord;
	/*
	 * If the queue is empty just set all pointers and proceed.
	 */
	if (n == NULL) {
		sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb;
		for (; mb; mb = mb->m_next) {
			sb->sb_mbtail = mb;
			sballoc(sb, mb);
		}
		return;
	}
	/*
	 * Count the number of mbufs in the current tail.
	 */
	for (ncnt = 0; n->m_next; n = n->m_next)
		ncnt++;
	n = sb->sb_lastrecord;
	/*
	 * If the two chains can fit in a single sdp packet and
	 * the last record has not been sent yet (WRITABLE) coalesce
	 * them.  The lastrecord remains the same but we must strip the
	 * packet header and then let sbcompress do the hard part.
	 */
	if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES &&
	    n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE <
	    ssk->xmit_size_goal) {
		m_adj(mb, SDP_HEAD_SIZE);
		n->m_pkthdr.len += mb->m_pkthdr.len;
		n->m_flags |= mb->m_flags & (M_PUSH | M_URG);
		m_demote(mb, 1, 0);
		sbcompress(sb, mb, sb->sb_mbtail);
		return;
	}
	/*
	 * Not compressible, just append to the end and adjust counters.
	 */
	sb->sb_lastrecord->m_flags |= M_PUSH;
	sb->sb_lastrecord->m_nextpkt = mb;
	sb->sb_lastrecord = mb;
	if (sb->sb_sndptr == NULL)
		sb->sb_sndptr = mb;
	for (; mb; mb = mb->m_next) {
		sb->sb_mbtail = mb;
		sballoc(sb, mb);
	}
}

/*
 * Do a send by putting data in output queue and updating urgent
 * marker if URG set.  Possibly send more data.  Unlike the other
 * pru_*() routines, the mbuf chains are our responsibility.  We
 * must either enqueue them or free them.  The other pru_* routines
 * generally are caller-frees.
 *
 * This comes from sendfile, normal sends will come from sdp_sosend().
 */
static int
sdp_send(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *nam, struct mbuf *control, struct thread *td)
{
	struct sdp_sock *ssk;
	struct mbuf *n;
	int error;
	int cnt;

	if (nam != NULL) {
		if (nam->sa_family != AF_INET) {
			if (control)
				m_freem(control);
			m_freem(m);
			return (EAFNOSUPPORT);
		}
		if (nam->sa_len != sizeof(struct sockaddr_in)) {
			if (control)
				m_freem(control);
			m_freem(m);
			return (EINVAL);
		}
	}

	error = 0;
	ssk = sdp_sk(so);
	KASSERT(m->m_flags & M_PKTHDR,
	    ("sdp_send: %p no packet header", m));
	M_PREPEND(m, SDP_HEAD_SIZE, M_WAITOK);
	mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA;
	for (n = m, cnt = 0; n->m_next; n = n->m_next)
		cnt++;
	if (cnt > SDP_MAX_SEND_SGES) {
		n = m_collapse(m, M_WAITOK, SDP_MAX_SEND_SGES);
		if (n == NULL) {
			m_freem(m);
			return (EMSGSIZE);
		}
		m = n;
		for (cnt = 0; n->m_next; n = n->m_next)
			cnt++;
	}
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		if (control)
			m_freem(control);
		if (m)
			m_freem(m);
		error = ECONNRESET;
		goto out;
	}
	if (control) {
		/* SDP doesn't support control messages. */
		if (control->m_len) {
			m_freem(control);
			if (m)
				m_freem(m);
			error = EINVAL;
			goto out;
		}
		m_freem(control);	/* empty control, just free it */
	}
	if (!(flags & PRUS_OOB)) {
		SOCKBUF_LOCK(&so->so_snd);
		sdp_append(ssk, &so->so_snd, m, cnt);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && ssk->state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected.
			 */
			error = sdp_start_connect(ssk, nam, td);
			if (error)
				goto out;
		}
		if (flags & PRUS_EOF) {
			/*
			 * Close the send side of the connection after
			 * the data is sent.
			 */
			socantsendmore(so);
			sdp_usrclosed(ssk);
			if (!(ssk->flags & SDP_DROPPED))
				sdp_output_disconnect(ssk);
		} else if (!(ssk->flags & SDP_DROPPED) &&
		    !(flags & PRUS_MORETOCOME))
			sdp_post_sends(ssk, M_NOWAIT);
		SDP_WUNLOCK(ssk);
		return (0);
	} else {
		SOCKBUF_LOCK(&so->so_snd);
		if (sbspace(&so->so_snd) < -512) {
			SOCKBUF_UNLOCK(&so->so_snd);
			m_freem(m);
			error = ENOBUFS;
			goto out;
		}
		/*
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section.
		 * Otherwise, snd_up should be one lower.
		 */
		m->m_flags |= M_URG | M_PUSH;
		sdp_append(ssk, &so->so_snd, m, cnt);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && ssk->state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected.
			 */
			error = sdp_start_connect(ssk, nam, td);
			if (error)
				goto out;
		}
		sdp_post_sends(ssk, M_NOWAIT);
		SDP_WUNLOCK(ssk);
		return (0);
	}
out:
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Send on a socket.  If send must go all at once and message is larger than
 * send buffering, then hard error.  Lock against other senders.  If must go
 * all at once and not enough room now, then inform user that this would
 * block and do nothing.  Otherwise, if nonblocking, send as much as
 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
 * in mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for short
 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
 * on return.
 */
static int
sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	struct sdp_sock *ssk;
	long space, resid;
	int atomic;
	int error;
	int copy;

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	atomic = top != NULL;
	if (control != NULL) {
		if (control->m_len) {
			m_freem(control);
			if (top)
				m_freem(top);
			return (EINVAL);
		}
		m_freem(control);
		control = NULL;
	}
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}
	if (td != NULL)
		td->td_ru.ru_msgsnd++;

	ssk = sdp_sk(so);
	error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
	if (error)
		goto out;

restart:
	do {
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			goto release;
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = ENOTCONN;
			goto release;
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EMSGSIZE;
			goto release;
		}
		if (space < resid &&
		    (atomic || space < so->so_snd.sb_lowat)) {
			if ((so->so_state & SS_NBIO) ||
			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EWOULDBLOCK;
				goto release;
			}
			error = sbwait(so, SO_SND);
			SOCKBUF_UNLOCK(&so->so_snd);
			if (error)
				goto release;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		do {
			if (uio == NULL) {
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				/*
				 * Copy the data from userland into a mbuf
				 * chain.  If no data is to be copied in,
				 * a single empty mbuf is returned.
				 */
				copy = min(space,
				    ssk->xmit_size_goal - SDP_HEAD_SIZE);
				top = m_uiotombuf(uio, M_WAITOK, copy,
				    0, M_PKTHDR |
				    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					/* only possible error */
					error = EFAULT;
					goto release;
				}
				space -= resid - uio->uio_resid;
				resid = uio->uio_resid;
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date after dropping the
			 * socket lock.
			 */
			error = sdp_send(so, (flags & MSG_OOB) ? PRUS_OOB :
			    /*
			     * Set EOF on the last send if the user specified
			     * MSG_EOF.
			     */
			    ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF :
			    /* If there is more to send set PRUS_MORETOCOME. */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, NULL, td);
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	SOCK_IO_SEND_UNLOCK(so);
out:
	if (top != NULL)
		m_freem(top);
	return (error);
}

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	m = m_get(M_WAITOK, MT_DATA);
	error = pr->pr_rcvoob(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
		error = uiomove(mtod(m, void *),
		    (int) min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Optimized version of soreceive() for stream (TCP) sockets.
 */
static int
sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	int len = 0, error = 0, flags, oresid;
	struct sockbuf *sb;
	struct mbuf *m, *n = NULL;
	struct sdp_sock *ssk;

	/* We only do stream sockets. */
	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		return (EINVAL);
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp0 != NULL)
		*mp0 = NULL;

	sb = &so->so_rcv;
	ssk = sdp_sk(so);

	/* Prevent other readers from entering the socket. */
	error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
	if (error)
		return (error);
	SOCKBUF_LOCK(sb);

	/* Easy one, no space to copyout anything. */
	if (uio->uio_resid == 0) {
		error = EINVAL;
		goto out;
	}
	oresid = uio->uio_resid;

	/* We will never ever get anything unless we are connected. */
	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
		/* When disconnecting there may be still some data left. */
		if (sbavail(sb))
			goto deliver;
		if (!(so->so_state & SS_ISDISCONNECTED))
			error = ENOTCONN;
		goto out;
	}

	/* Socket buffer is empty and we shall not block. */
	if (sbavail(sb) == 0 &&
	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
		error = EAGAIN;
		goto out;
	}

restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	/* Abort if socket has reported problems. */
	if (so->so_error) {
		if (sbavail(sb))
			goto deliver;
		if (oresid > uio->uio_resid)
			goto out;
		error = so->so_error;
		if (!(flags & MSG_PEEK))
			so->so_error = 0;
		goto out;
	}

	/* Door is closed.  Deliver what is left, if any. */
	if (sb->sb_state & SBS_CANTRCVMORE) {
		if (sbavail(sb))
			goto deliver;
		else
			goto out;
	}

	/* Socket buffer got some data that we shall deliver now. */
	if (sbavail(sb) && !(flags & MSG_WAITALL) &&
	    ((so->so_state & SS_NBIO) ||
	    (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
	    sbavail(sb) >= sb->sb_lowat ||
	    sbavail(sb) >= uio->uio_resid ||
	    sbavail(sb) >= sb->sb_hiwat) ) {
		goto deliver;
	}

	/* On MSG_WAITALL we must wait until all data or error arrives. */
	if ((flags & MSG_WAITALL) &&
	    (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_lowat))
		goto deliver;

	/*
	 * Wait and block until (more) data comes in.
	 * NB: Drops the sockbuf lock during wait.
	 */
	error = sbwait(so, SO_RCV);
	if (error)
		goto out;
	goto restart;

deliver:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	KASSERT(sbavail(sb), ("%s: sockbuf empty", __func__));
	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));

	/* Statistics. */
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;

	/* Fill uio until full or current end of socket buffer is reached. */
	len = min(uio->uio_resid, sbavail(sb));
	if (mp0 != NULL) {
		/* Dequeue as many mbufs as possible. */
		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
			for (*mp0 = m = sb->sb_mb;
			    m != NULL && m->m_len <= len;
			    m = m->m_next) {
				len -= m->m_len;
				uio->uio_resid -= m->m_len;
				sbfree(sb, m);
				n = m;
			}
			sb->sb_mb = m;
			if (sb->sb_mb == NULL)
				SB_EMPTY_FIXUP(sb);
			n->m_next = NULL;
		}
		/* Copy the remainder. */
		if (len > 0) {
			KASSERT(sb->sb_mb != NULL,
			    ("%s: len > 0 && sb->sb_mb empty", __func__));

			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
			if (m == NULL)
				len = 0;	/* Don't flush data from sockbuf. */
			else
				uio->uio_resid -= m->m_len;
			if (*mp0 != NULL)
				n->m_next = m;
			else
				*mp0 = m;
			if (*mp0 == NULL) {
				error = ENOBUFS;
				goto out;
			}
		}
	} else {
		/* NB: Must unlock socket buffer as uiomove may sleep. */
		SOCKBUF_UNLOCK(sb);
		error = m_mbuftouio(uio, sb->sb_mb, len);
		SOCKBUF_LOCK(sb);
		if (error)
			goto out;
	}
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);

	/*
	 * Remove the delivered data from the socket buffer unless we
	 * were only peeking.
	 */
	if (!(flags & MSG_PEEK)) {
		if (len > 0)
			sbdrop_locked(sb, len);

		/* Notify protocol that we drained some data. */
		SOCKBUF_UNLOCK(sb);
		SDP_WLOCK(ssk);
		sdp_do_posts(ssk);
		SDP_WUNLOCK(ssk);
		SOCKBUF_LOCK(sb);
	}

	/*
	 * For MSG_WAITALL we may have to loop again and wait for
	 * more data to come in.
	 */
	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
		goto restart;
out:
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);
	SOCKBUF_UNLOCK(sb);
	SOCK_IO_RECV_UNLOCK(so);
	return (error);
}

/*
 * Abort is used to teardown a connection typically while sitting in
 * the accept queue.
 */
void
sdp_abort(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	/*
	 * If we have not yet dropped, do it now.
	 */
	if (!(ssk->flags & SDP_TIMEWAIT) &&
	    !(ssk->flags & SDP_DROPPED))
		sdp_drop(ssk, ECONNABORTED);
	KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X",
	    ssk, ssk->flags));
	SDP_WUNLOCK(ssk);
}

/*
 * Close an SDP socket and initiate a friendly disconnect.
 */
static void
sdp_close(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	/*
	 * If we have not yet dropped, do it now.
	 */
	if (!(ssk->flags & SDP_TIMEWAIT) &&
	    !(ssk->flags & SDP_DROPPED))
		sdp_start_disconnect(ssk);

	/*
	 * If we've still not dropped let the socket layer know we're
	 * holding on to the socket and pcb for a while.
	 */
	if (!(ssk->flags & SDP_DROPPED)) {
		ssk->flags |= SDP_SOCKREF;
		soref(so);
	}
	SDP_WUNLOCK(ssk);
}

/*
 * User requests out-of-band data.
 */
static int
sdp_rcvoob(struct socket *so, struct mbuf *m, int flags)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (!rx_ring_trylock(&ssk->rx_ring)) {
		SDP_WUNLOCK(ssk);
		return (ECONNRESET);
	}
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	if ((so->so_oobmark == 0 &&
	    (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
	    so->so_options & SO_OOBINLINE ||
	    ssk->oobflags & SDP_HADOOB) {
		error = EINVAL;
		goto out;
	}
	if ((ssk->oobflags & SDP_HAVEOOB) == 0) {
		error = EWOULDBLOCK;
		goto out;
	}
	m->m_len = 1;
	*mtod(m, caddr_t) = ssk->iobc;
	if ((flags & MSG_PEEK) == 0)
		ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB);
out:
	rx_ring_unlock(&ssk->rx_ring);
	SDP_WUNLOCK(ssk);
	return (error);
}

void
sdp_urg(struct sdp_sock *ssk, struct mbuf *mb)
{
	struct mbuf *m;
	struct socket *so;

	so = ssk->socket;
	if (so == NULL)
		return;

	so->so_oobmark = sbused(&so->so_rcv) + mb->m_pkthdr.len - 1;
	sohasoutofband(so);
	ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB);
	if (!(so->so_options & SO_OOBINLINE)) {
		for (m = mb; m->m_next != NULL; m = m->m_next);
		ssk->iobc = *(mtod(m, char *) + m->m_len - 1);
		ssk->oobflags |= SDP_HAVEOOB;
		m->m_len--;
		mb->m_pkthdr.len--;
	}
}

/*
 * Notify an SDP socket of an asynchronous error.
 *
 * Do not wake up user since there currently is no mechanism for
 * reporting soft errors (yet - a kqueue filter may be added).
 */
struct sdp_sock *
sdp_notify(struct sdp_sock *ssk, int error)
{

	SDP_WLOCK_ASSERT(ssk);

	if ((ssk->flags & SDP_TIMEWAIT) ||
	    (ssk->flags & SDP_DROPPED))
		return (ssk);

	/*
	 * Ignore some errors if we are hooked up.
	 */
	if (ssk->state == TCPS_ESTABLISHED &&
	    (error == EHOSTUNREACH || error == ENETUNREACH ||
	    error == EHOSTDOWN))
		return (ssk);
	ssk->softerror = error;
	return sdp_drop(ssk, error);
}

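/*
 * Keepalive timer: post an SDP keepalive and re-arm, unless the socket
 * has been dropped or SO_KEEPALIVE was cleared.
 */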
static void
sdp_keepalive_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		return;
	/* Callout rescheduled as a different kind of timer. */
	if (callout_pending(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	if (ssk->flags & SDP_DROPPED ||
	    (ssk->socket->so_options & SO_KEEPALIVE) == 0)
		goto out;
	sdp_post_keepalive(ssk);
	callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
	    sdp_keepalive_timeout, ssk);
out:
	SDP_WUNLOCK(ssk);
}


void
sdp_start_keepalive_timer(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	if (!callout_pending(&ssk->keep2msl))
		callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
		    sdp_keepalive_timeout, ssk);
}

static void
sdp_stop_keepalive_timer(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	callout_stop(&ssk->keep2msl);
}

/*
 * sdp_ctloutput() must drop the inpcb lock before performing copyin on
 * socket option arguments.  When it re-acquires the lock after the copy, it
 * has to revalidate that the connection is still valid for the socket
 * option.
 */
#define	SDP_WLOCK_RECHECK(inp) do {					\
	SDP_WLOCK(ssk);							\
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {		\
		SDP_WUNLOCK(ssk);					\
		return (ECONNRESET);					\
	}								\
} while(0)

static int
sdp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int	error, opt, optval;
	struct sdp_sock *ssk;

	error = 0;
	ssk = sdp_sk(so);
	if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_KEEPALIVE) {
		SDP_WLOCK(ssk);
		if (so->so_options & SO_KEEPALIVE)
			sdp_start_keepalive_timer(so);
		else
			sdp_stop_keepalive_timer(so);
		SDP_WUNLOCK(ssk);
	}
	if (sopt->sopt_level != IPPROTO_TCP)
		return (error);

	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		SDP_WUNLOCK(ssk);
		return (ECONNRESET);
	}

	switch (sopt->sopt_dir) {
	case SOPT_SET:
		switch (sopt->sopt_name) {
		case TCP_NODELAY:
			SDP_WUNLOCK(ssk);
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				return (error);

			SDP_WLOCK_RECHECK(ssk);
			opt = SDP_NODELAY;
			if (optval)
				ssk->flags |= opt;
			else
				ssk->flags &= ~opt;
			sdp_do_posts(ssk);
			SDP_WUNLOCK(ssk);
			break;

		default:
			SDP_WUNLOCK(ssk);
			error = ENOPROTOOPT;
			break;
		}
		break;

	case SOPT_GET:
		switch (sopt->sopt_name) {
		case TCP_NODELAY:
			optval = ssk->flags & SDP_NODELAY;
			SDP_WUNLOCK(ssk);
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;
		default:
			SDP_WUNLOCK(ssk);
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}
#undef SDP_WLOCK_RECHECK

int sdp_mod_count = 0;
int sdp_mod_usec = 0;

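/*
 * Apply the global completion moderation settings to the receive CQ,
 * if moderation is enabled.
 */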
void
sdp_set_default_moderation(struct sdp_sock *ssk)
{
	if (sdp_mod_count <= 0 || sdp_mod_usec <= 0)
		return;
	ib_modify_cq(ssk->rx_ring.cq, sdp_mod_count, sdp_mod_usec);
}

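/*
 * IB client add callback: allocate a protection domain and FMR pool
 * for the new device and attach them as SDP client data.
 */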
static void
sdp_dev_add(struct ib_device *device)
{
	struct ib_fmr_pool_param param;
	struct sdp_device *sdp_dev;

	sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO);
	sdp_dev->pd = ib_alloc_pd(device, 0);
	if (IS_ERR(sdp_dev->pd))
		goto out_pd;
	memset(&param, 0, sizeof param);
	param.max_pages_per_fmr = SDP_FMR_SIZE;
	param.page_shift = PAGE_SHIFT;
	param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ);
	param.pool_size = SDP_FMR_POOL_SIZE;
	param.dirty_watermark = SDP_FMR_DIRTY_SIZE;
	param.cache = 1;
	sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, &param);
	if (IS_ERR(sdp_dev->fmr_pool))
		goto out_fmr;
	ib_set_client_data(device, &sdp_client, sdp_dev);
	return;

out_fmr:
	ib_dealloc_pd(sdp_dev->pd);
out_pd:
	free(sdp_dev, M_SDP);
}

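/*
 * IB client remove callback: reset every connection still using the
 * departing device, then release its FMR pool and protection domain.
 */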
static void
sdp_dev_rem(struct ib_device *device, void *client_data)
{
	struct sdp_device *sdp_dev;
	struct sdp_sock *ssk;

	SDP_LIST_WLOCK();
	LIST_FOREACH(ssk, &sdp_list, list) {
		if (ssk->ib_device != device)
			continue;
		SDP_WLOCK(ssk);
		if ((ssk->flags & SDP_DESTROY) == 0)
			ssk = sdp_notify(ssk, ECONNRESET);
		if (ssk)
			SDP_WUNLOCK(ssk);
	}
	SDP_LIST_WUNLOCK();
	/*
	 * XXX Do I need to wait between these two?
	 */
	sdp_dev = ib_get_client_data(device, &sdp_client);
	if (!sdp_dev)
		return;
	ib_flush_fmr_pool(sdp_dev->fmr_pool);
	ib_destroy_fmr_pool(sdp_dev->fmr_pool);
	ib_dealloc_pd(sdp_dev->pd);
	free(sdp_dev, M_SDP);
}

struct ib_client sdp_client =
    { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem };

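/*
 * Export the list of SDP connections as xtcpcb records so that
 * netstat(1) can display them alongside TCP connections.
 */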
static int
sdp_pcblist(SYSCTL_HANDLER_ARGS)
{
	int error, n, i;
	struct sdp_sock *ssk;
	struct xinpgen xig;

	/*
	 * The process of preparing the TCB list is too time-consuming and
	 * resource-intensive to repeat twice on every request.
	 */
	if (req->oldptr == NULL) {
		n = sdp_count;
		n += imax(n / 8, 10);
		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
		return (0);
	}

	if (req->newptr != NULL)
		return (EPERM);

	/*
	 * OK, now we're committed to doing something.
	 */
	SDP_LIST_RLOCK();
	n = sdp_count;
	SDP_LIST_RUNLOCK();

	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
	    + n * sizeof(struct xtcpcb));
	if (error != 0)
		return (error);

	bzero(&xig, sizeof(xig));
	xig.xig_len = sizeof xig;
	xig.xig_count = n;
	xig.xig_gen = 0;
	xig.xig_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xig, sizeof xig);
	if (error)
		return (error);

	SDP_LIST_RLOCK();
	for (ssk = LIST_FIRST(&sdp_list), i = 0;
	    ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) {
		struct xtcpcb xt;

		SDP_RLOCK(ssk);
		if (ssk->flags & SDP_TIMEWAIT) {
			if (ssk->cred != NULL)
				error = cr_cansee(req->td->td_ucred,
				    ssk->cred);
			else
				error = EINVAL;	/* Skip this inp. */
		} else if (ssk->socket)
			error = cr_canseesocket(req->td->td_ucred,
			    ssk->socket);
		else
			error = EINVAL;
		if (error) {
			error = 0;
			goto next;
		}

		bzero(&xt, sizeof(xt));
		xt.xt_len = sizeof xt;
		xt.xt_inp.inp_gencnt = 0;
		xt.xt_inp.inp_vflag = INP_IPV4;
		memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr));
		xt.xt_inp.inp_lport = ssk->lport;
		memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr));
		xt.xt_inp.inp_fport = ssk->fport;
		xt.t_state = ssk->state;
		if (ssk->socket != NULL)
			sotoxsocket(ssk->socket, &xt.xt_inp.xi_socket);
		xt.xt_inp.xi_socket.xso_protocol = IPPROTO_TCP;
		SDP_RUNLOCK(ssk);
		error = SYSCTL_OUT(req, &xt, sizeof xt);
		if (error)
			break;
		i++;
		continue;
next:
		SDP_RUNLOCK(ssk);
	}
	if (!error) {
		/*
		 * Give the user an updated idea of our state.
		 * If the generation differs from what we told
		 * her before, she knows that something happened
		 * while we were processing this request, and it
		 * might be necessary to retry.
		 */
		xig.xig_gen = 0;
		xig.xig_sogen = so_gencnt;
		xig.xig_count = sdp_count;
		error = SYSCTL_OUT(req, &xig, sizeof xig);
	}
	SDP_LIST_RUNLOCK();
	return (error);
}

SYSCTL_NODE(_net_inet, -1, sdp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "SDP");

SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist,
    CTLFLAG_RD | CTLTYPE_STRUCT | CTLFLAG_MPSAFE,
    0, 0, sdp_pcblist, "S,xtcpcb",
    "List of active SDP connections");

static void
sdp_zone_change(void *tag)
{

	uma_zone_set_max(sdp_zone, maxsockets);
}

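/*
 * Module initialization: create the pcb zone, the receive completion
 * workqueue and register as an IB client.
 */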
static void
sdp_init(void *arg __unused)
{

	LIST_INIT(&sdp_list);
	sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(sdp_zone, maxsockets);
	EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL,
	    EVENTHANDLER_PRI_ANY);
	rx_comp_wq = create_singlethread_workqueue("rx_comp_wq");
	ib_register_client(&sdp_client);
}
SYSINIT(sdp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, sdp_init, NULL);

#define	SDP_PROTOSW \
	.pr_type =		SOCK_STREAM, \
	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,\
	.pr_ctloutput =		sdp_ctloutput, \
	.pr_abort =		sdp_abort, \
	.pr_accept =		sdp_accept, \
	.pr_attach =		sdp_attach, \
	.pr_bind =		sdp_bind, \
	.pr_connect =		sdp_connect, \
	.pr_detach =		sdp_detach, \
	.pr_disconnect =	sdp_disconnect, \
	.pr_listen =		sdp_listen, \
	.pr_peeraddr =		sdp_getpeeraddr, \
	.pr_rcvoob =		sdp_rcvoob, \
	.pr_send =		sdp_send, \
	.pr_sosend =		sdp_sosend, \
	.pr_soreceive =		sdp_sorecv, \
	.pr_shutdown =		sdp_shutdown, \
	.pr_sockaddr =		sdp_getsockaddr, \
	.pr_close =		sdp_close


static struct protosw sdp_ip_protosw = {
	.pr_protocol =		IPPROTO_IP,
	SDP_PROTOSW
};
static struct protosw sdp_tcp_protosw = {
	.pr_protocol =		IPPROTO_TCP,
	SDP_PROTOSW
};

static struct domain sdpdomain = {
	.dom_family =		AF_INET_SDP,
	.dom_name =		"SDP",
	.dom_nprotosw =		2,
	.dom_protosw = {
		&sdp_ip_protosw,
		&sdp_tcp_protosw,
	},
};

DOMAIN_SET(sdp);

int sdp_debug_level = 1;
int sdp_data_debug_level = 0;