/* freebsd/freebsd-src: sys/netinet/in_pcb.c (blob/main) */
1
/*-
2
* SPDX-License-Identifier: BSD-3-Clause
3
*
4
* Copyright (c) 1982, 1986, 1991, 1993, 1995
5
* The Regents of the University of California.
6
* Copyright (c) 2007-2009 Robert N. M. Watson
7
* Copyright (c) 2010-2011 Juniper Networks, Inc.
8
* Copyright (c) 2021-2022 Gleb Smirnoff <[email protected]>
9
* All rights reserved.
10
*
11
* Portions of this software were developed by Robert N. M. Watson under
12
* contract to Juniper Networks, Inc.
13
*
14
* Redistribution and use in source and binary forms, with or without
15
* modification, are permitted provided that the following conditions
16
* are met:
17
* 1. Redistributions of source code must retain the above copyright
18
* notice, this list of conditions and the following disclaimer.
19
* 2. Redistributions in binary form must reproduce the above copyright
20
* notice, this list of conditions and the following disclaimer in the
21
* documentation and/or other materials provided with the distribution.
22
* 3. Neither the name of the University nor the names of its contributors
23
* may be used to endorse or promote products derived from this software
24
* without specific prior written permission.
25
*
26
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36
* SUCH DAMAGE.
37
*/
38
39
#include <sys/cdefs.h>
40
#include "opt_ddb.h"
41
#include "opt_ipsec.h"
42
#include "opt_inet.h"
43
#include "opt_inet6.h"
44
#include "opt_ratelimit.h"
45
#include "opt_route.h"
46
#include "opt_rss.h"
47
48
#include <sys/param.h>
49
#include <sys/hash.h>
50
#include <sys/systm.h>
51
#include <sys/libkern.h>
52
#include <sys/lock.h>
53
#include <sys/malloc.h>
54
#include <sys/mbuf.h>
55
#include <sys/eventhandler.h>
56
#include <sys/domain.h>
57
#include <sys/proc.h>
58
#include <sys/protosw.h>
59
#include <sys/smp.h>
60
#include <sys/smr.h>
61
#include <sys/socket.h>
62
#include <sys/socketvar.h>
63
#include <sys/sockio.h>
64
#include <sys/priv.h>
65
#include <sys/proc.h>
66
#include <sys/refcount.h>
67
#include <sys/jail.h>
68
#include <sys/kernel.h>
69
#include <sys/sysctl.h>
70
71
#ifdef DDB
72
#include <ddb/ddb.h>
73
#endif
74
75
#include <vm/uma.h>
76
#include <vm/vm.h>
77
78
#include <net/if.h>
79
#include <net/if_var.h>
80
#include <net/if_private.h>
81
#include <net/if_types.h>
82
#include <net/if_llatbl.h>
83
#include <net/route.h>
84
#include <net/rss_config.h>
85
#include <net/vnet.h>
86
87
#if defined(INET) || defined(INET6)
88
#include <netinet/in.h>
89
#include <netinet/in_pcb.h>
90
#include <netinet/in_pcb_var.h>
91
#include <netinet/tcp.h>
92
#ifdef INET
93
#include <netinet/in_var.h>
94
#include <netinet/in_fib.h>
95
#endif
96
#include <netinet/ip_var.h>
97
#ifdef INET6
98
#include <netinet/ip6.h>
99
#include <netinet6/in6_pcb.h>
100
#include <netinet6/in6_var.h>
101
#include <netinet6/ip6_var.h>
102
#endif /* INET6 */
103
#include <net/route/nhop.h>
104
#endif
105
106
#include <netipsec/ipsec_support.h>
107
108
#include <security/mac/mac_framework.h>
109
110
#define INPCBLBGROUP_SIZMIN 8
111
#define INPCBLBGROUP_SIZMAX 256
112
113
#define INP_FREED 0x00000200 /* Went through in_pcbfree(). */
114
#define INP_INLBGROUP 0x01000000 /* Inserted into inpcblbgroup. */
115
116
/*
117
* These configure the range of local port addresses assigned to
118
* "unspecified" outgoing connections/packets/whatever.
119
*/
120
VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */
121
VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */
122
VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */
123
VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */
124
VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */
125
VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */
126
127
/*
128
* Reserved ports accessible only to root. There are significant
129
* security considerations that must be accounted for when changing these,
130
* but the security benefits can be great. Please be careful.
131
*/
132
VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */
133
VNET_DEFINE(int, ipport_reservedlow);
134
135
/* Enable random ephemeral port allocation by default. */
136
VNET_DEFINE(int, ipport_randomized) = 1;
137
138
#ifdef INET
139
static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
140
struct in_addr faddr, u_int fport_arg,
141
struct in_addr laddr, u_int lport_arg,
142
int lookupflags, uint8_t numa_domain, int fib);
143
144
#define RANGECHK(var, min, max) \
145
if ((var) < (min)) { (var) = (min); } \
146
else if ((var) > (max)) { (var) = (max); }
147
148
static int
149
sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
150
{
151
int error;
152
153
error = sysctl_handle_int(oidp, arg1, arg2, req);
154
if (error == 0) {
155
RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
156
RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
157
RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
158
RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
159
RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
160
RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
161
}
162
return (error);
163
}
164
165
#undef RANGECHK
166
167
static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange,
168
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
169
"IP Ports");
170
171
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
172
CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
173
&VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I",
174
"");
175
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
176
CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
177
&VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I",
178
"");
179
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
180
CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
181
&VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I",
182
"");
183
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
184
CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
185
&VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I",
186
"");
187
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
188
CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
189
&VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I",
190
"");
191
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
192
CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
193
&VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I",
194
"");
195
SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
196
CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
197
&VNET_NAME(ipport_reservedhigh), 0, "");
198
SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
199
CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
200
SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
201
CTLFLAG_VNET | CTLFLAG_RW,
202
&VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
203
204
#ifdef RATELIMIT
205
counter_u64_t rate_limit_new;
206
counter_u64_t rate_limit_chg;
207
counter_u64_t rate_limit_active;
208
counter_u64_t rate_limit_alloc_fail;
209
counter_u64_t rate_limit_set_ok;
210
211
static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
212
"IP Rate Limiting");
213
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
214
&rate_limit_active, "Active rate limited connections");
215
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
216
&rate_limit_alloc_fail, "Rate limited connection failures");
217
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
218
&rate_limit_set_ok, "Rate limited setting succeeded");
219
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD,
220
&rate_limit_new, "Total Rate limit new attempts");
221
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD,
222
&rate_limit_chg, "Total Rate limited change attempts");
223
#endif /* RATELIMIT */
224
225
#endif /* INET */
226
227
/*
 * Per-VNET seed for the inpcb hash functions, initialized once from
 * arc4random() when the VNET's protocol domains come up.
 */
VNET_DEFINE(uint32_t, in_pcbhashseed);
static void
in_pcbhashseed_init(void)
{

	V_in_pcbhashseed = arc4random();
}
VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST,
    in_pcbhashseed_init, NULL);
236
237
#ifdef INET
238
VNET_DEFINE_STATIC(int, connect_inaddr_wild) = 0;
239
#define V_connect_inaddr_wild VNET(connect_inaddr_wild)
240
SYSCTL_INT(_net_inet_ip, OID_AUTO, connect_inaddr_wild,
241
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(connect_inaddr_wild), 0,
242
"Allow connecting to INADDR_ANY or INADDR_BROADCAST for connect(2)");
243
#endif
244
245
static void in_pcbremhash(struct inpcb *);
246
247
/*
248
* in_pcb.c: manage the Protocol Control Blocks.
249
*
250
* NOTE: It is assumed that most of these functions will be called with
251
* the pcbinfo lock held, and often, the inpcb lock held, as these utility
252
* functions often modify hash chains or addresses in pcbs.
253
*/
254
255
static struct inpcblbgroup *
256
in_pcblbgroup_alloc(struct ucred *cred, u_char vflag, uint16_t port,
257
const union in_dependaddr *addr, int size, uint8_t numa_domain, int fib)
258
{
259
struct inpcblbgroup *grp;
260
size_t bytes;
261
262
bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
263
grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
264
if (grp == NULL)
265
return (NULL);
266
LIST_INIT(&grp->il_pending);
267
grp->il_cred = crhold(cred);
268
grp->il_vflag = vflag;
269
grp->il_lport = port;
270
grp->il_numa_domain = numa_domain;
271
grp->il_fibnum = fib;
272
grp->il_dependladdr = *addr;
273
grp->il_inpsiz = size;
274
return (grp);
275
}
276
277
static void
278
in_pcblbgroup_free_deferred(epoch_context_t ctx)
279
{
280
struct inpcblbgroup *grp;
281
282
grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx);
283
crfree(grp->il_cred);
284
free(grp, M_PCB);
285
}
286
287
/*
 * Unlink a load-balance group from its hash chain and defer the actual
 * release to the end of the current network epoch, since lockless
 * lookups may still be traversing the group.
 */
static void
in_pcblbgroup_free(struct inpcblbgroup *grp)
{
	/* Callers must have migrated or removed all pending PCBs first. */
	KASSERT(LIST_EMPTY(&grp->il_pending),
	    ("local group %p still has pending inps", grp));

	CK_LIST_REMOVE(grp, il_list);
	NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx);
}
296
297
static struct inpcblbgroup *
298
in_pcblbgroup_find(struct inpcb *inp)
299
{
300
struct inpcbinfo *pcbinfo;
301
struct inpcblbgroup *grp;
302
struct inpcblbgrouphead *hdr;
303
304
INP_LOCK_ASSERT(inp);
305
306
pcbinfo = inp->inp_pcbinfo;
307
INP_HASH_LOCK_ASSERT(pcbinfo);
308
309
hdr = &pcbinfo->ipi_lbgrouphashbase[
310
INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
311
CK_LIST_FOREACH(grp, hdr, il_list) {
312
struct inpcb *inp1;
313
314
for (unsigned int i = 0; i < grp->il_inpcnt; i++) {
315
if (inp == grp->il_inp[i])
316
goto found;
317
}
318
LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) {
319
if (inp == inp1)
320
goto found;
321
}
322
}
323
found:
324
return (grp);
325
}
326
327
/*
 * Add "inp" to a load-balance group that has a free slot.  Protocols
 * that support listen() (e.g. TCP) keep not-yet-listening sockets on
 * the group's pending list rather than in the lookup array, so the
 * lookup path never selects a socket that cannot accept connections.
 */
static void
in_pcblbgroup_insert(struct inpcblbgroup *grp, struct inpcb *inp)
{
	KASSERT(grp->il_inpcnt < grp->il_inpsiz,
	    ("invalid local group size %d and count %d", grp->il_inpsiz,
	    grp->il_inpcnt));
	INP_WLOCK_ASSERT(inp);

	if (inp->inp_socket->so_proto->pr_listen != pr_listen_notsupp &&
	    !SOLISTENING(inp->inp_socket)) {
		/*
		 * If this is a TCP socket, it should not be visible to lbgroup
		 * lookups until listen() has been called.
		 */
		LIST_INSERT_HEAD(&grp->il_pending, inp, inp_lbgroup_list);
		grp->il_pendcnt++;
	} else {
		/* Fill the slot first, then publish the new count. */
		grp->il_inp[grp->il_inpcnt] = inp;

		/*
		 * Synchronize with in_pcblookup_lbgroup(): make sure that we
		 * don't expose a null slot to the lookup path.
		 */
		atomic_store_rel_int(&grp->il_inpcnt, grp->il_inpcnt + 1);
	}

	/* Mark membership so in_pcbremlbgrouphash() knows to undo it. */
	inp->inp_flags |= INP_INLBGROUP;
}
355
356
/*
 * Replace "old_grp" with a new group of capacity "size": copy the
 * active slots, steal the pending list, and swap the groups on the
 * hash chain.  The new group is linked in before the old one is freed,
 * so concurrent epoch readers always find one of the two.  Returns
 * NULL (leaving the old group intact) if allocation fails.
 */
static struct inpcblbgroup *
in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
    struct inpcblbgroup *old_grp, int size)
{
	struct inpcblbgroup *grp;
	int i;

	grp = in_pcblbgroup_alloc(old_grp->il_cred, old_grp->il_vflag,
	    old_grp->il_lport, &old_grp->il_dependladdr, size,
	    old_grp->il_numa_domain, old_grp->il_fibnum);
	if (grp == NULL)
		return (NULL);

	KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
	    ("invalid new local group size %d and old local group count %d",
	    grp->il_inpsiz, old_grp->il_inpcnt));

	for (i = 0; i < old_grp->il_inpcnt; ++i)
		grp->il_inp[i] = old_grp->il_inp[i];
	grp->il_inpcnt = old_grp->il_inpcnt;
	CK_LIST_INSERT_HEAD(hdr, grp, il_list);
	/* Move pending PCBs over; the old group must be empty when freed. */
	LIST_SWAP(&old_grp->il_pending, &grp->il_pending, inpcb,
	    inp_lbgroup_list);
	grp->il_pendcnt = old_grp->il_pendcnt;
	old_grp->il_pendcnt = 0;
	in_pcblbgroup_free(old_grp);
	return (grp);
}
384
385
/*
 * Add PCB to load balance group for SO_REUSEPORT_LB option.
 *
 * Finds (or creates) the group matching the PCB's jail, vflag, local
 * port/address, NUMA domain and FIB on the lbgroup hash, growing the
 * group (doubling, up to INPCBLBGROUP_SIZMAX) when it is full.
 * Returns 0 on success or when the PCB is intentionally not inserted,
 * ENOMEM on allocation failure.
 */
static int
in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
{
	/* Rate-limit the "limit reached" console message to once a minute. */
	const static struct timeval interval = { 60, 0 };
	static struct timeval lastprint;
	struct inpcbinfo *pcbinfo;
	struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	uint32_t idx;
	int fib;

	pcbinfo = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	/* Only constrain the group to one FIB if the socket is FIB-bound. */
	fib = (inp->inp_flags & INP_BOUNDFIB) != 0 ?
	    inp->inp_inc.inc_fibnum : RT_ALL_FIBS;

#ifdef INET6
	/*
	 * Don't allow IPv4 mapped INET6 wild socket.
	 */
	if ((inp->inp_vflag & INP_IPV4) &&
	    inp->inp_laddr.s_addr == INADDR_ANY &&
	    INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
		return (0);
	}
#endif

	idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask);
	hdr = &pcbinfo->ipi_lbgrouphashbase[idx];
	/* Look for an existing group with an identical matching key. */
	CK_LIST_FOREACH(grp, hdr, il_list) {
		if (grp->il_cred->cr_prison == inp->inp_cred->cr_prison &&
		    grp->il_vflag == inp->inp_vflag &&
		    grp->il_lport == inp->inp_lport &&
		    grp->il_numa_domain == numa_domain &&
		    grp->il_fibnum == fib &&
		    memcmp(&grp->il_dependladdr,
		    &inp->inp_inc.inc_ie.ie_dependladdr,
		    sizeof(grp->il_dependladdr)) == 0) {
			break;
		}
	}
	if (grp == NULL) {
		/* Create new load balance group. */
		grp = in_pcblbgroup_alloc(inp->inp_cred, inp->inp_vflag,
		    inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
		    INPCBLBGROUP_SIZMIN, numa_domain, fib);
		if (grp == NULL)
			return (ENOMEM);
		in_pcblbgroup_insert(grp, inp);
		CK_LIST_INSERT_HEAD(hdr, grp, il_list);
	} else if (grp->il_inpcnt + grp->il_pendcnt == grp->il_inpsiz) {
		if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
			if (ratecheck(&lastprint, &interval))
				printf("lb group port %d, limit reached\n",
				    ntohs(grp->il_lport));
			return (0);
		}

		/* Expand this local group. */
		grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
		if (grp == NULL)
			return (ENOMEM);
		in_pcblbgroup_insert(grp, inp);
	} else {
		in_pcblbgroup_insert(grp, inp);
	}
	return (0);
}
459
460
/*
 * Remove PCB from load balance group.
 *
 * The PCB may be either in the group's active slot array or on its
 * pending list.  The last member's removal frees the whole group.
 * Asserts (via __assert_unreachable) that a PCB flagged INP_INLBGROUP
 * is always found.
 */
static void
in_pcbremlbgrouphash(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo;
	struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	struct inpcb *inp1;
	int i;

	pcbinfo = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);
	MPASS(inp->inp_flags & INP_INLBGROUP);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		for (i = 0; i < grp->il_inpcnt; ++i) {
			if (grp->il_inp[i] != inp)
				continue;

			if (grp->il_inpcnt == 1 &&
			    LIST_EMPTY(&grp->il_pending)) {
				/* We are the last, free this local group. */
				in_pcblbgroup_free(grp);
			} else {
				/*
				 * Swap the last active slot into the vacated
				 * one, then publish the shrunken count.
				 */
				grp->il_inp[i] =
				    grp->il_inp[grp->il_inpcnt - 1];

				/*
				 * Synchronize with in_pcblookup_lbgroup().
				 */
				atomic_store_rel_int(&grp->il_inpcnt,
				    grp->il_inpcnt - 1);
			}
			inp->inp_flags &= ~INP_INLBGROUP;
			return;
		}
		/* Not in the active array; check the pending list. */
		LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) {
			if (inp == inp1) {
				LIST_REMOVE(inp, inp_lbgroup_list);
				grp->il_pendcnt--;
				inp->inp_flags &= ~INP_INLBGROUP;
				return;
			}
		}
	}
	__assert_unreachable();
}
513
514
int
515
in_pcblbgroup_numa(struct inpcb *inp, int arg)
516
{
517
struct inpcbinfo *pcbinfo;
518
int error;
519
uint8_t numa_domain;
520
521
switch (arg) {
522
case TCP_REUSPORT_LB_NUMA_NODOM:
523
numa_domain = M_NODOM;
524
break;
525
case TCP_REUSPORT_LB_NUMA_CURDOM:
526
numa_domain = PCPU_GET(domain);
527
break;
528
default:
529
if (arg < 0 || arg >= vm_ndomains)
530
return (EINVAL);
531
numa_domain = arg;
532
}
533
534
pcbinfo = inp->inp_pcbinfo;
535
INP_WLOCK_ASSERT(inp);
536
INP_HASH_WLOCK(pcbinfo);
537
if (in_pcblbgroup_find(inp) != NULL) {
538
/* Remove it from the old group. */
539
in_pcbremlbgrouphash(inp);
540
/* Add it to the new group based on numa domain. */
541
in_pcbinslbgrouphash(inp, numa_domain);
542
error = 0;
543
} else {
544
error = ENOENT;
545
}
546
INP_HASH_WUNLOCK(pcbinfo);
547
return (error);
548
}
549
550
/* Make sure it is safe to use hashinit(9) on CK_LIST. */
CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb)));

/*
 * Initialize an inpcbinfo - a per-VNET instance of connections db.
 */
void
in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor,
    u_int hash_nelements, u_int porthash_nelements)
{

	mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF);
	mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name,
	    NULL, MTX_DEF);
#ifdef VIMAGE
	pcbinfo->ipi_vnet = curvnet;
#endif
	CK_LIST_INIT(&pcbinfo->ipi_listhead);
	pcbinfo->ipi_count = 0;
	/*
	 * The exact and wildcard connection tables are the same size and
	 * share ipi_hashmask; the second hashinit() writes the same mask.
	 */
	pcbinfo->ipi_hash_exact = hashinit(hash_nelements, M_PCB,
	    &pcbinfo->ipi_hashmask);
	pcbinfo->ipi_hash_wild = hashinit(hash_nelements, M_PCB,
	    &pcbinfo->ipi_hashmask);
	/* There are at most IPPORT_MAX + 1 distinct local ports. */
	porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
	pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
	    &pcbinfo->ipi_porthashmask);
	pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
	    &pcbinfo->ipi_lbgrouphashmask);
	/* inpcbs come from the storage's SMR-protected UMA zone. */
	pcbinfo->ipi_zone = pcbstor->ips_zone;
	pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone);
}
581
582
/*
 * Destroy an inpcbinfo.
 *
 * All PCBs must already be gone (ipi_count == 0); tears down the four
 * hash tables and both mutexes created by in_pcbinfo_init().
 */
void
in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
{

	KASSERT(pcbinfo->ipi_count == 0,
	    ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));

	hashdestroy(pcbinfo->ipi_hash_exact, M_PCB, pcbinfo->ipi_hashmask);
	hashdestroy(pcbinfo->ipi_hash_wild, M_PCB, pcbinfo->ipi_hashmask);
	hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
	    pcbinfo->ipi_porthashmask);
	hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
	    pcbinfo->ipi_lbgrouphashmask);
	mtx_destroy(&pcbinfo->ipi_hash_lock);
	mtx_destroy(&pcbinfo->ipi_lock);
}
601
602
/*
 * Initialize a pcbstorage - per protocol zones to allocate inpcbs.
 */
static void inpcb_fini(void *, int);
void
in_pcbstorage_init(void *arg)
{
	struct inpcbstorage *pcbstor = arg;

	/* SMR zone: frees are deferred until readers are done. */
	pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name,
	    pcbstor->ips_size, NULL, NULL, pcbstor->ips_pcbinit,
	    inpcb_fini, UMA_ALIGN_CACHE, UMA_ZONE_SMR);
}
615
616
/*
 * Destroy a pcbstorage - used by unloadable protocols.
 */
void
in_pcbstorage_destroy(void *arg)
{
	struct inpcbstorage *pcbstor = arg;

	uma_zdestroy(pcbstor->ips_zone);
}
626
627
/*
 * Allocate a PCB and associate it with the socket.
 * On success return with the PCB locked.
 *
 * Returns ENOBUFS if the zone allocation fails, or the MAC/IPsec
 * policy initialization error; on those failure paths the credential
 * reference is dropped and the inpcb returned to the zone.
 */
int
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
{
	struct inpcb *inp;
#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
	int error;
#endif

	inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT);
	if (inp == NULL)
		return (ENOBUFS);
	/* Zone memory may be recycled; clear the reusable region. */
	bzero(&inp->inp_start_zero, inp_zero_size);
#ifdef NUMA
	inp->inp_numa_domain = M_NODOM;
#endif
	inp->inp_pcbinfo = pcbinfo;
	inp->inp_socket = so;
	inp->inp_cred = crhold(so->so_cred);
	inp->inp_inc.inc_fibnum = so->so_fibnum;
#ifdef MAC
	error = mac_inpcb_init(inp, M_NOWAIT);
	if (error != 0)
		goto out;
	mac_inpcb_create(so, inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	error = ipsec_init_pcbpolicy(inp);
	if (error != 0) {
#ifdef MAC
		/* Undo MAC setup before bailing out. */
		mac_inpcb_destroy(inp);
#endif
		goto out;
	}
#endif /*IPSEC*/
#ifdef INET6
	if (INP_SOCKAF(so) == AF_INET6) {
		inp->inp_vflag |= INP_IPV6PROTO | INP_IPV6;
		if (V_ip6_v6only)
			inp->inp_flags |= IN6P_IPV6_V6ONLY;
#ifdef INET
		else
			/* Dual-stack socket: also usable for IPv4. */
			inp->inp_vflag |= INP_IPV4;
#endif
		if (V_ip6_auto_flowlabel)
			inp->inp_flags |= IN6P_AUTOFLOWLABEL;
		inp->in6p_hops = -1;	/* use kernel default */
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
		inp->inp_vflag |= INP_IPV4;
#endif
	inp->inp_smr = SMR_SEQ_INVALID;

	/*
	 * Routes in inpcb's can cache L2 as well; they are guaranteed
	 * to be cleaned up.
	 */
	inp->inp_route.ro_flags = RT_LLE_CACHE;
	refcount_init(&inp->inp_refcount, 1);   /* Reference from socket. */
	INP_WLOCK(inp);
	INP_INFO_WLOCK(pcbinfo);
	pcbinfo->ipi_count++;
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list);
	INP_INFO_WUNLOCK(pcbinfo);
	so->so_pcb = inp;

	return (0);

#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
out:
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	uma_zfree_smr(pcbinfo->ipi_zone, inp);
	return (error);
#endif
}
713
714
#ifdef INET
715
int
716
in_pcbbind(struct inpcb *inp, struct sockaddr_in *sin, int flags,
717
struct ucred *cred)
718
{
719
int anonport, error;
720
721
KASSERT(sin == NULL || sin->sin_family == AF_INET,
722
("%s: invalid address family for %p", __func__, sin));
723
KASSERT(sin == NULL || sin->sin_len == sizeof(struct sockaddr_in),
724
("%s: invalid address length for %p", __func__, sin));
725
INP_WLOCK_ASSERT(inp);
726
INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
727
728
if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
729
return (EINVAL);
730
anonport = sin == NULL || sin->sin_port == 0;
731
error = in_pcbbind_setup(inp, sin, &inp->inp_laddr.s_addr,
732
&inp->inp_lport, flags, cred);
733
if (error)
734
return (error);
735
if (__predict_false((error = in_pcbinshash(inp)) != 0)) {
736
MPASS(inp->inp_socket->so_options & SO_REUSEPORT_LB);
737
inp->inp_laddr.s_addr = INADDR_ANY;
738
inp->inp_lport = 0;
739
inp->inp_flags &= ~INP_BOUNDFIB;
740
return (error);
741
}
742
if (anonport)
743
inp->inp_flags |= INP_ANONPORT;
744
return (0);
745
}
746
#endif
747
748
#if defined(INET) || defined(INET6)
/*
 * Assign a local port like in_pcb_lport(), but also used with connect()
 * and a foreign address and port.  If fsa is non-NULL, choose a local port
 * that is unused with those, otherwise one that is completely unused.
 * lsa can be NULL for IPv6.
 *
 * The candidate range (low / default / high) is selected by the
 * INP_LOWPORT / INP_HIGHPORT flags and the portrange sysctls; the scan
 * starts from a per-range cursor, optionally randomized.  Returns
 * EADDRNOTAVAIL when the whole range is in use, or the privilege-check
 * error for low ports.
 */
int
in_pcb_lport_dest(const struct inpcb *inp, struct sockaddr *lsa,
    u_short *lportp, struct sockaddr *fsa, u_short fport, struct ucred *cred,
    int lookupflags)
{
	struct inpcbinfo *pcbinfo;
	struct inpcb *tmpinp;
	unsigned short *lastport;
	int count, error;
	u_short aux, first, last, lport;
#ifdef INET
	struct in_addr laddr, faddr;
#endif
#ifdef INET6
	struct in6_addr *laddr6, *faddr6;
#endif

	pcbinfo = inp->inp_pcbinfo;

	/*
	 * Because no actual state changes occur here, a global write lock on
	 * the pcbinfo isn't required.
	 */
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(pcbinfo);

	if (inp->inp_flags & INP_HIGHPORT) {
		first = V_ipport_hifirstauto;	/* sysctl */
		last  = V_ipport_hilastauto;
		lastport = &pcbinfo->ipi_lasthi;
	} else if (inp->inp_flags & INP_LOWPORT) {
		error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT);
		if (error)
			return (error);
		first = V_ipport_lowfirstauto;	/* 1023 */
		last  = V_ipport_lowlastauto;	/* 600 */
		lastport = &pcbinfo->ipi_lastlow;
	} else {
		first = V_ipport_firstauto;	/* sysctl */
		last  = V_ipport_lastauto;
		lastport = &pcbinfo->ipi_lastport;
	}

	/*
	 * Instead of having two loops further down counting up or down
	 * make sure that first is always <= last and go with only one
	 * code path implementing all logic.
	 */
	if (first > last) {
		aux = first;
		first = last;
		last = aux;
	}

#ifdef INET
	laddr.s_addr = INADDR_ANY; /* used by INET6+INET below too */
	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
		if (lsa != NULL)
			laddr = ((struct sockaddr_in *)lsa)->sin_addr;
		if (fsa != NULL)
			faddr = ((struct sockaddr_in *)fsa)->sin_addr;
	}
#endif
#ifdef INET6
	laddr6 = NULL;
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		if (lsa != NULL)
			laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr;
		if (fsa != NULL)
			faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr;
	}
#endif

	tmpinp = NULL;

	/*
	 * Randomize the starting cursor.  Skip this when first == last:
	 * the sysctls allow a single-port range, and "% (last - first)"
	 * would then divide by zero.
	 */
	if (V_ipport_randomized && first != last)
		*lastport = first + (arc4random() % (last - first));

	count = last - first;

	do {
		if (count-- < 0)	/* completely used? */
			return (EADDRNOTAVAIL);
		++*lastport;
		if (*lastport < first || *lastport > last)
			*lastport = first;
		lport = htons(*lastport);

		if (fsa != NULL) {
			/* Check uniqueness of the full 4-tuple. */
#ifdef INET
			if (lsa->sa_family == AF_INET) {
				tmpinp = in_pcblookup_hash_locked(pcbinfo,
				    faddr, fport, laddr, lport, lookupflags,
				    M_NODOM, RT_ALL_FIBS);
			}
#endif
#ifdef INET6
			if (lsa->sa_family == AF_INET6) {
				tmpinp = in6_pcblookup_hash_locked(pcbinfo,
				    faddr6, fport, laddr6, lport, lookupflags,
				    M_NODOM, RT_ALL_FIBS);
			}
#endif
		} else {
			/* Port must be unused with any foreign address. */
#ifdef INET6
			if ((inp->inp_vflag & INP_IPV6) != 0) {
				tmpinp = in6_pcblookup_local(pcbinfo,
				    &inp->in6p_laddr, lport, RT_ALL_FIBS,
				    lookupflags, cred);
#ifdef INET
				if (tmpinp == NULL &&
				    (inp->inp_vflag & INP_IPV4))
					tmpinp = in_pcblookup_local(pcbinfo,
					    laddr, lport, RT_ALL_FIBS,
					    lookupflags, cred);
#endif
			}
#endif
#if defined(INET) && defined(INET6)
			else
#endif
#ifdef INET
				tmpinp = in_pcblookup_local(pcbinfo, laddr,
				    lport, RT_ALL_FIBS, lookupflags, cred);
#endif
		}
	} while (tmpinp != NULL);

	*lportp = lport;

	return (0);
}
887
888
/*
889
* Select a local port (number) to use.
890
*/
891
int
892
in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
893
struct ucred *cred, int lookupflags)
894
{
895
struct sockaddr_in laddr;
896
897
if (laddrp) {
898
bzero(&laddr, sizeof(laddr));
899
laddr.sin_family = AF_INET;
900
laddr.sin_addr = *laddrp;
901
}
902
return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr :
903
NULL, lportp, NULL, 0, cred, lookupflags));
904
}
905
#endif /* INET || INET6 */
906
907
#ifdef INET
/*
 * Determine whether the inpcb can be bound to the specified address/port tuple.
 *
 * Checks, in order: multicast SO_REUSEADDR/SO_REUSEPORT relaxation, that
 * a non-wildcard address is locally configured (unless INP_BINDANY),
 * reserved-port privilege, cross-user port sharing, and plain address
 * conflicts honoring the SO_REUSEPORT* options of the existing binder.
 * Returns 0 when the tuple is usable, else EADDRNOTAVAIL, EACCES or
 * EADDRINUSE.
 */
static int
in_pcbbind_avail(struct inpcb *inp, const struct in_addr laddr,
    const u_short lport, const int fib, int sooptions, int lookupflags,
    struct ucred *cred)
{
	int reuseport, reuseport_lb;

	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);

	reuseport = (sooptions & SO_REUSEPORT);
	reuseport_lb = (sooptions & SO_REUSEPORT_LB);

	if (IN_MULTICAST(ntohl(laddr.s_addr))) {
		/*
		 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
		 * allow complete duplication of binding if
		 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
		 * and a multicast address is bound on both
		 * new and duplicated sockets.
		 */
		if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT)) != 0)
			reuseport = SO_REUSEADDR | SO_REUSEPORT;
		/*
		 * XXX: How to deal with SO_REUSEPORT_LB here?
		 * Treat same as SO_REUSEPORT for now.
		 */
		if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT_LB)) != 0)
			reuseport_lb = SO_REUSEADDR | SO_REUSEPORT_LB;
	} else if (!in_nullhost(laddr)) {
		struct sockaddr_in sin;

		memset(&sin, 0, sizeof(sin));
		sin.sin_family = AF_INET;
		sin.sin_len = sizeof(sin);
		sin.sin_addr = laddr;

		/*
		 * Is the address a local IP address?
		 * If INP_BINDANY is set, then the socket may be bound
		 * to any endpoint address, local or not.
		 */
		if ((inp->inp_flags & INP_BINDANY) == 0 &&
		    ifa_ifwithaddr_check((const struct sockaddr *)&sin) == 0)
			return (EADDRNOTAVAIL);
	}

	if (lport != 0) {
		struct inpcb *t;

		/* Binding into the reserved window requires privilege. */
		if (ntohs(lport) <= V_ipport_reservedhigh &&
		    ntohs(lport) >= V_ipport_reservedlow &&
		    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
			return (EACCES);

		if (!IN_MULTICAST(ntohl(laddr.s_addr)) &&
		    priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
			/*
			 * If a socket owned by a different user is already
			 * bound to this port, fail. In particular, SO_REUSE*
			 * can only be used to share a port among sockets owned
			 * by the same user.
			 *
			 * However, we can share a port with a connected socket
			 * which has a unique 4-tuple.
			 */
			t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport,
			    RT_ALL_FIBS, INPLOOKUP_WILDCARD, cred);
			if (t != NULL &&
			    (inp->inp_socket->so_type != SOCK_STREAM ||
			    in_nullhost(t->inp_faddr)) &&
			    (inp->inp_cred->cr_uid != t->inp_cred->cr_uid))
				return (EADDRINUSE);
		}
		t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport, fib,
		    lookupflags, cred);
		if (t != NULL && ((reuseport | reuseport_lb) &
		    t->inp_socket->so_options) == 0) {
#ifdef INET6
			/*
			 * Allow an IPv4 wildcard bind to coexist with an
			 * IPv6-protocol PCB's wildcard (and vice versa).
			 */
			if (!in_nullhost(laddr) ||
			    !in_nullhost(t->inp_laddr) ||
			    (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
			    (t->inp_vflag & INP_IPV6PROTO) == 0)
#endif
				return (EADDRINUSE);
		}
	}
	return (0);
}
1000
1001
/*
 * Set up a bind operation on a PCB, performing port allocation
 * as required, but do not actually modify the PCB. Callers can
 * either complete the bind by setting inp_laddr/inp_lport and
 * calling in_pcbinshash(), or they can just use the resulting
 * port and address to authorise the sending of a once-off packet.
 *
 * On error, the values of *laddrp and *lportp are not changed.
 */
int
in_pcbbind_setup(struct inpcb *inp, struct sockaddr_in *sin, in_addr_t *laddrp,
    u_short *lportp, int flags, struct ucred *cred)
{
	struct socket *so = inp->inp_socket;
	struct in_addr laddr;
	u_short lport = 0;
	int error, fib, lookupflags, sooptions;

	/*
	 * No state changes, so read locks are sufficient here.
	 */
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);

	laddr.s_addr = *laddrp;
	/* Can't supply both an explicit sockaddr and a pre-set address. */
	if (sin != NULL && laddr.s_addr != INADDR_ANY)
		return (EINVAL);

	/* Without any SO_REUSE* option, conflict checks use wildcards. */
	lookupflags = 0;
	sooptions = atomic_load_int(&so->so_options);
	if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT | SO_REUSEPORT_LB)) == 0)
		lookupflags = INPLOOKUP_WILDCARD;
	if (sin == NULL) {
		if ((error = prison_local_ip4(cred, &laddr)) != 0)
			return (error);
	} else {
		KASSERT(sin->sin_family == AF_INET,
		    ("%s: invalid family for address %p", __func__, sin));
		KASSERT(sin->sin_len == sizeof(*sin),
		    ("%s: invalid length for address %p", __func__, sin));

		/* Jail may rewrite the address to the prison's IP. */
		error = prison_local_ip4(cred, &sin->sin_addr);
		if (error)
			return (error);
		if (sin->sin_port != *lportp) {
			/* Don't allow the port to change. */
			if (*lportp != 0)
				return (EINVAL);
			lport = sin->sin_port;
		}
		laddr = sin->sin_addr;

		fib = (flags & INPBIND_FIB) != 0 ? inp->inp_inc.inc_fibnum :
		    RT_ALL_FIBS;

		/* See if this address/port combo is available. */
		error = in_pcbbind_avail(inp, laddr, lport, fib, sooptions,
		    lookupflags, cred);
		if (error != 0)
			return (error);
	}
	if (*lportp != 0)
		lport = *lportp;
	if (lport == 0) {
		/* No port requested: have the allocator pick one. */
		error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
		if (error != 0)
			return (error);
	}
	*laddrp = laddr.s_addr;
	*lportp = lport;
	if ((flags & INPBIND_FIB) != 0)
		inp->inp_flags |= INP_BOUNDFIB;
	return (0);
}
1075
1076
/*
 * Connect from a socket to a specified address.
 * Both address and port must be specified in argument sin.
 * If don't have a local address for this socket yet,
 * then pick one.
 */
int
in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred)
{
	struct in_addr laddr, faddr;
	u_short lport;
	int error;
	bool anonport;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
	KASSERT(in_nullhost(inp->inp_faddr),
	    ("%s: inp is already connected", __func__));
	KASSERT(sin->sin_family == AF_INET,
	    ("%s: invalid address family for %p", __func__, sin));
	KASSERT(sin->sin_len == sizeof(*sin),
	    ("%s: invalid address length for %p", __func__, sin));

	if (sin->sin_port == 0)
		return (EADDRNOTAVAIL);

	/* If no local port is bound yet, one will be allocated below. */
	anonport = (inp->inp_lport == 0);

	if (__predict_false(in_broadcast(sin->sin_addr))) {
		if (!V_connect_inaddr_wild || CK_STAILQ_EMPTY(&V_in_ifaddrhead))
			return (ENETUNREACH);
		/*
		 * If the destination address is INADDR_ANY, use the primary
		 * local address. If the supplied address is INADDR_BROADCAST,
		 * and the primary interface supports broadcast, choose the
		 * broadcast address for that interface.
		 */
		if (in_nullhost(sin->sin_addr)) {
			faddr =
			    IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
			if ((error = prison_get_ip4(cred, &faddr)) != 0)
				return (error);
		} else if (sin->sin_addr.s_addr == INADDR_BROADCAST &&
		    CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags
		    & IFF_BROADCAST) {
			faddr = satosin(&CK_STAILQ_FIRST(
			    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
		} else
			faddr = sin->sin_addr;
	} else
		faddr = sin->sin_addr;

	/* Pick a local address if we don't have one yet. */
	if (in_nullhost(inp->inp_laddr)) {
		error = in_pcbladdr(inp, &faddr, &laddr, cred);
		if (error)
			return (error);
	} else
		laddr = inp->inp_laddr;

	if (anonport) {
		/*
		 * Allocate an ephemeral port taking the destination into
		 * account, so the (laddr, lport, faddr, fport) tuple is
		 * unique.
		 */
		struct sockaddr_in lsin = {
			.sin_family = AF_INET,
			.sin_addr = laddr,
		};
		struct sockaddr_in fsin = {
			.sin_family = AF_INET,
			.sin_addr = faddr,
		};

		error = in_pcb_lport_dest(inp, (struct sockaddr *)&lsin,
		    &lport, (struct sockaddr *)&fsin, sin->sin_port, cred,
		    INPLOOKUP_WILDCARD);
		if (error)
			return (error);
	} else if (in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr,
	    sin->sin_port, laddr, inp->inp_lport, 0, M_NODOM, RT_ALL_FIBS) !=
	    NULL)
		/* The 4-tuple is already taken by another connection. */
		return (EADDRINUSE);
	else
		lport = inp->inp_lport;

	MPASS(!in_nullhost(inp->inp_laddr) || inp->inp_lport != 0 ||
	    !(inp->inp_flags & INP_INHASHLIST));

	inp->inp_faddr = faddr;
	inp->inp_fport = sin->sin_port;
	inp->inp_laddr = laddr;
	inp->inp_lport = lport;

	/* (Re-)insert into the connection hash under the new tuple. */
	if ((inp->inp_flags & INP_INHASHLIST) == 0) {
		error = in_pcbinshash(inp);
		MPASS(error == 0);
	} else
		in_pcbrehash(inp);
#ifdef ROUTE_MPATH
	if (CALC_FLOWID_OUTBOUND) {
		uint32_t hash_val, hash_type;

		hash_val = fib4_calc_software_hash(inp->inp_laddr,
		    inp->inp_faddr, 0, sin->sin_port,
		    inp->inp_socket->so_proto->pr_protocol, &hash_type);

		inp->inp_flowid = hash_val;
		inp->inp_flowtype = hash_type;
	}
#endif
	if (anonport)
		inp->inp_flags |= INP_ANONPORT;
	return (0);
}
1186
1187
/*
 * Do proper source address selection on an unbound socket in case
 * of connect. Take jails into account as well.
 *
 * On success *laddr holds the selected source address; *faddr may be
 * consulted but is not modified here.  Runs inside a network epoch
 * section.
 */
int
in_pcbladdr(const struct inpcb *inp, struct in_addr *faddr,
    struct in_addr *laddr, struct ucred *cred)
{
	struct ifaddr *ifa;
	struct sockaddr *sa;
	struct sockaddr_in *sin, dst;
	struct nhop_object *nh;
	int error;

	NET_EPOCH_ASSERT();
	KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));

	/*
	 * Bypass source address selection and use the primary jail IP
	 * if requested.
	 */
	if (!prison_saddrsel_ip4(cred, laddr))
		return (0);

	/*
	 * If the destination address is multicast and an outgoing
	 * interface has been set as a multicast option, prefer the
	 * address of that interface as our source address.
	 */
	if (IN_MULTICAST(ntohl(faddr->s_addr)) && inp->inp_moptions != NULL &&
	    inp->inp_moptions->imo_multicast_ifp != NULL) {
		struct ifnet *ifp = inp->inp_moptions->imo_multicast_ifp;
		struct in_ifaddr *ia;

		CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
			if (ia->ia_ifp == ifp &&
			    prison_check_ip4(cred, &ia->ia_addr.sin_addr) == 0)
				break;
		}
		if (ia == NULL)
			return (EADDRNOTAVAIL);
		*laddr = ia->ia_addr.sin_addr;
		return (0);
	}

	error = 0;

	nh = NULL;
	bzero(&dst, sizeof(dst));
	sin = &dst;
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(struct sockaddr_in);
	sin->sin_addr.s_addr = faddr->s_addr;

	/*
	 * If route is known our src addr is taken from the i/f,
	 * else punt.
	 *
	 * Find out route to destination.
	 */
	if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
		nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr,
		    0, NHR_NONE, 0);

	/*
	 * If we found a route, use the address corresponding to
	 * the outgoing interface.
	 *
	 * Otherwise assume faddr is reachable on a directly connected
	 * network and try to find a corresponding interface to take
	 * the source address from.
	 */
	if (nh == NULL || nh->nh_ifp == NULL) {
		struct in_ifaddr *ia;
		struct ifnet *ifp;

		ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
		    inp->inp_socket->so_fibnum));
		if (ia == NULL) {
			ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
			    inp->inp_socket->so_fibnum));
		}
		if (ia == NULL) {
			error = ENETUNREACH;
			goto done;
		}

		/* Outside of a jail, the interface address is good enough. */
		if (!prison_flag(cred, PR_IP4)) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed: look for any jail-visible address on that
		 * interface. */
		ifp = ia->ia_ifp;
		ia = NULL;
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			sa = ifa->ifa_addr;
			if (sa->sa_family != AF_INET)
				continue;
			sin = (struct sockaddr_in *)sa;
			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
				ia = (struct in_ifaddr *)ifa;
				break;
			}
		}
		if (ia != NULL) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

	/*
	 * If the outgoing interface on the route found is not
	 * a loopback interface, use the address from that interface.
	 * In case of jails do those three steps:
	 * 1. check if the interface address belongs to the jail. If so use it.
	 * 2. check if we have any address on the outgoing interface
	 *    belonging to this jail. If so use it.
	 * 3. as a last resort return the 'default' jail address.
	 */
	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) {
		struct in_ifaddr *ia;
		struct ifnet *ifp;

		/* If not jailed, use the default returned. */
		if (!prison_flag(cred, PR_IP4)) {
			ia = (struct in_ifaddr *)nh->nh_ifa;
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed. */
		/* 1. Check if the iface address belongs to the jail. */
		sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
		if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
			ia = (struct in_ifaddr *)nh->nh_ifa;
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/*
		 * 2. Check if we have any address on the outgoing interface
		 *    belonging to this jail.
		 */
		ia = NULL;
		ifp = nh->nh_ifp;
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			sa = ifa->ifa_addr;
			if (sa->sa_family != AF_INET)
				continue;
			sin = (struct sockaddr_in *)sa;
			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
				ia = (struct in_ifaddr *)ifa;
				break;
			}
		}
		if (ia != NULL) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

	/*
	 * The outgoing interface is marked with 'loopback net', so a route
	 * to ourselves is here.
	 * Try to find the interface of the destination address and then
	 * take the address from there. That interface is not necessarily
	 * a loopback interface.
	 * In case of jails, check that it is an address of the jail
	 * and if we cannot find, fall back to the 'default' jail address.
	 */
	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) {
		struct in_ifaddr *ia;

		ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst),
		    inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0,
			    inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithaddr(sintosa(&dst)));

		if (!prison_flag(cred, PR_IP4)) {
			if (ia == NULL) {
				error = ENETUNREACH;
				goto done;
			}
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed. */
		if (ia != NULL) {
			struct ifnet *ifp;

			ifp = ia->ia_ifp;
			ia = NULL;
			CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
				sa = ifa->ifa_addr;
				if (sa->sa_family != AF_INET)
					continue;
				sin = (struct sockaddr_in *)sa;
				if (prison_check_ip4(cred,
				    &sin->sin_addr) == 0) {
					ia = (struct in_ifaddr *)ifa;
					break;
				}
			}
			if (ia != NULL) {
				laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
				goto done;
			}
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

done:
	/* Never hand back the unspecified address as a source. */
	if (error == 0 && laddr->s_addr == INADDR_ANY)
		return (EHOSTUNREACH);
	return (error);
}
1418
1419
/*
 * Disconnect a connected PCB: remove it from the connection hash and
 * clear its addresses and foreign port.  inp_lport is left untouched.
 */
void
in_pcbdisconnect(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
	    ("%s: inp %p was already disconnected", __func__, inp));

	in_pcbremhash_locked(inp);

	/* See the comment in in_pcbinshash(). */
	inp->inp_smr = smr_advance(inp->inp_pcbinfo->ipi_smr);
	inp->inp_laddr.s_addr = INADDR_ANY;
	inp->inp_faddr.s_addr = INADDR_ANY;
	inp->inp_fport = 0;
}
1436
#endif /* INET */
1437
1438
/*
 * Called when a socket starts listening.  If the inpcb is a member of
 * a load-balancing group, move it from the group's pending list into
 * the group proper (presumably making it eligible for lookups —
 * confirm against in_pcblbgroup_insert()).
 */
void
in_pcblisten(struct inpcb *inp)
{
	struct inpcblbgroup *grp;

	INP_WLOCK_ASSERT(inp);

	if ((inp->inp_flags & INP_INLBGROUP) != 0) {
		struct inpcbinfo *pcbinfo;

		pcbinfo = inp->inp_pcbinfo;
		INP_HASH_WLOCK(pcbinfo);
		grp = in_pcblbgroup_find(inp);
		LIST_REMOVE(inp, inp_lbgroup_list);
		grp->il_pendcnt--;
		in_pcblbgroup_insert(grp, inp);
		INP_HASH_WUNLOCK(pcbinfo);
	}
}
1457
1458
/*
1459
* inpcb hash lookups are protected by SMR section.
1460
*
1461
* Once desired pcb has been found, switching from SMR section to a pcb
1462
* lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK
1463
* here because SMR is a critical section.
1464
* In 99%+ cases inp_smr_lock() would obtain the lock immediately.
1465
*/
1466
void
1467
inp_lock(struct inpcb *inp, const inp_lookup_t lock)
1468
{
1469
1470
lock == INPLOOKUP_RLOCKPCB ?
1471
rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock);
1472
}
1473
1474
void
1475
inp_unlock(struct inpcb *inp, const inp_lookup_t lock)
1476
{
1477
1478
lock == INPLOOKUP_RLOCKPCB ?
1479
rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock);
1480
}
1481
1482
int
1483
inp_trylock(struct inpcb *inp, const inp_lookup_t lock)
1484
{
1485
1486
return (lock == INPLOOKUP_RLOCKPCB ?
1487
rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock));
1488
}
1489
1490
/*
 * Transition from the SMR section to holding the inpcb lock, skipping
 * pcbs whose flags intersect 'ignflags'.  Returns true with the pcb
 * locked and SMR exited, or false with SMR exited and nothing held.
 */
static inline bool
_inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock, const int ignflags)
{

	MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB);
	SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr);

	/* Fast path: the lock was uncontested. */
	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & ignflags)) {
			smr_exit(inp->inp_pcbinfo->ipi_smr);
			inp_unlock(inp, lock);
			return (false);
		}
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (true);
	}

	/*
	 * Slow path: pin the pcb with a reference so it survives us
	 * leaving the SMR section, then block on the lock.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		inp_lock(inp, lock);
		if (__predict_false(in_pcbrele(inp, lock)))
			return (false);
		/*
		 * inp acquired through refcount & lock for sure didn't go
		 * through uma_zfree(). However, it may have already gone
		 * through in_pcbfree() and has another reference, that
		 * prevented its release by our in_pcbrele().
		 */
		if (__predict_false(inp->inp_flags & ignflags)) {
			inp_unlock(inp, lock);
			return (false);
		}
		return (true);
	} else {
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (false);
	}
}
1528
1529
/*
 * Public wrapper around _inp_smr_lock() used by the in_pcblookup()
 * family: skip both freed and dropped entries.
 */
bool
inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
{

	/*
	 * in_pcblookup() family of functions ignore not only freed entries,
	 * that may be found due to lockless access to the hash, but dropped
	 * entries, too.
	 */
	return (_inp_smr_lock(inp, lock, INP_FREED | INP_DROPPED));
}
1540
1541
/*
1542
* inp_next() - inpcb hash/list traversal iterator
1543
*
1544
* Requires initialized struct inpcb_iterator for context.
1545
* The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR().
1546
*
1547
* - Iterator can have either write-lock or read-lock semantics, that can not
1548
* be changed later.
1549
* - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through
1550
* a single hash slot. Note: only rip_input() does the latter.
1551
* - Iterator may have optional bool matching function. The matching function
1552
* will be executed for each inpcb in the SMR context, so it can not acquire
1553
* locks and can safely access only immutable fields of inpcb.
1554
*
1555
* A fresh initialized iterator has NULL inpcb in its context and that
1556
* means that inp_next() call would return the very first inpcb on the list
1557
* locked with desired semantic. In all following calls the context pointer
1558
* shall hold the current inpcb pointer. The KPI user is not supposed to
1559
* unlock the current inpcb! Upon end of traversal inp_next() will return NULL
1560
* and write NULL to its context. After end of traversal an iterator can be
1561
* reused.
1562
*
1563
* List traversals have the following features/constraints:
1564
* - New entries won't be seen, as they are always added to the head of a list.
1565
* - Removed entries won't stop traversal as long as they are not added to
1566
* a different list. This is violated by in_pcbrehash().
1567
*/
1568
#define II_LIST_FIRST(ipi, hash) \
1569
(((hash) == INP_ALL_LIST) ? \
1570
CK_LIST_FIRST(&(ipi)->ipi_listhead) : \
1571
CK_LIST_FIRST(&(ipi)->ipi_hash_exact[(hash)]))
1572
#define II_LIST_NEXT(inp, hash) \
1573
(((hash) == INP_ALL_LIST) ? \
1574
CK_LIST_NEXT((inp), inp_list) : \
1575
CK_LIST_NEXT((inp), inp_hash_exact))
1576
#define II_LOCK_ASSERT(inp, lock) \
1577
rw_assert(&(inp)->inp_lock, \
1578
(lock) == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED )
1579
/*
 * Return the next inpcb on the iterated list, locked per the iterator's
 * semantics, or NULL at end of traversal.  See the block comment above
 * for the full contract.
 */
struct inpcb *
inp_next(struct inpcb_iterator *ii)
{
	const struct inpcbinfo *ipi = ii->ipi;
	inp_match_t *match = ii->match;
	void *ctx = ii->ctx;
	inp_lookup_t lock = ii->lock;
	int hash = ii->hash;
	struct inpcb *inp;

	if (ii->inp == NULL) {		/* First call. */
		smr_enter(ipi->ipi_smr);
		/* This is unrolled CK_LIST_FOREACH(). */
		for (inp = II_LIST_FIRST(ipi, hash);
		    inp != NULL;
		    inp = II_LIST_NEXT(inp, hash)) {
			if (match != NULL && (match)(inp, ctx) == false)
				continue;
			if (__predict_true(_inp_smr_lock(inp, lock, INP_FREED)))
				break;
			else {
				/*
				 * _inp_smr_lock() failed and exited SMR;
				 * re-enter and restart from the list head.
				 */
				smr_enter(ipi->ipi_smr);
				MPASS(inp != II_LIST_FIRST(ipi, hash));
				inp = II_LIST_FIRST(ipi, hash);
				if (inp == NULL)
					break;
			}
		}

		if (inp == NULL)
			smr_exit(ipi->ipi_smr);
		else
			ii->inp = inp;

		return (inp);
	}

	/* Not a first call. */
	smr_enter(ipi->ipi_smr);
restart:
	inp = ii->inp;
	II_LOCK_ASSERT(inp, lock);
next:
	inp = II_LIST_NEXT(inp, hash);
	if (inp == NULL) {
		smr_exit(ipi->ipi_smr);
		goto found;
	}

	if (match != NULL && (match)(inp, ctx) == false)
		goto next;

	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			/*
			 * Entries are never inserted in middle of a list, thus
			 * as long as we are in SMR, we can continue traversal.
			 * Jump to 'restart' should yield in the same result,
			 * but could produce unnecessary looping. Could this
			 * looping be unbound?
			 */
			inp_unlock(inp, lock);
			goto next;
		} else {
			smr_exit(ipi->ipi_smr);
			goto found;
		}
	}

	/*
	 * Can't obtain lock immediately, thus going hard. Once we exit the
	 * SMR section we can no longer jump to 'next', and our only stable
	 * anchoring point is ii->inp, which we keep locked for this case, so
	 * we jump to 'restart'.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(ipi->ipi_smr);
		inp_lock(inp, lock);
		if (__predict_false(in_pcbrele(inp, lock))) {
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
		/*
		 * See comment in inp_smr_lock().
		 */
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			inp_unlock(inp, lock);
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
	} else
		goto next;

found:
	/* Release the previous anchor and advance the iterator. */
	inp_unlock(ii->inp, lock);
	ii->inp = inp;

	return (ii->inp);
}
1678
1679
/*
1680
* in_pcbref() bumps the reference count on an inpcb in order to maintain
1681
* stability of an inpcb pointer despite the inpcb lock being released or
1682
* SMR section exited.
1683
*
1684
* To free a reference later in_pcbrele_(r|w)locked() must be performed.
1685
*/
1686
void
in_pcbref(struct inpcb *inp)
{
	u_int old __diagused;

	/* A reference may only be added on top of an existing one. */
	old = refcount_acquire(&inp->inp_refcount);
	KASSERT(old > 0, ("%s: refcount 0", __func__));
}
1694
1695
/*
1696
* Drop a refcount on an inpcb elevated using in_pcbref(), potentially
1697
* freeing the pcb, if the reference was very last.
1698
*/
1699
/*
 * Drop a reference on a read-locked inpcb.  Returns true if this was
 * the last reference, in which case the pcb has been unlocked and
 * freed; otherwise returns false with the lock still held.
 */
bool
in_pcbrele_rlocked(struct inpcb *inp)
{

	INP_RLOCK_ASSERT(inp);

	if (!refcount_release(&inp->inp_refcount))
		return (false);

	/* Last reference: the pcb must already be detached and marked. */
	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	INP_RUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}
1718
1719
/*
 * Write-locked variant of in_pcbrele_rlocked(): drop a reference,
 * freeing the pcb (and releasing the lock) if it was the last one.
 */
bool
in_pcbrele_wlocked(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);

	if (!refcount_release(&inp->inp_refcount))
		return (false);

	/* Last reference: the pcb must already be detached and marked. */
	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	INP_WUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}
1738
1739
bool
1740
in_pcbrele(struct inpcb *inp, const inp_lookup_t lock)
1741
{
1742
1743
return (lock == INPLOOKUP_RLOCKPCB ?
1744
in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp));
1745
}
1746
1747
/*
1748
* Dereference and rlock inp, for which the caller must own the
1749
* reference. Returns true if inp no longer usable, false otherwise.
1750
*/
1751
/*
 * Dereference and rlock inp, for which the caller must own the
 * reference. Returns true if inp no longer usable, false otherwise.
 * On a false return the pcb is left read-locked.
 */
bool
in_pcbrele_rlock(struct inpcb *inp)
{
	INP_RLOCK(inp);
	if (in_pcbrele_rlocked(inp))
		return (true);
	/* Our reference kept it alive, but it may have been freed since. */
	if ((inp->inp_flags & INP_FREED) != 0) {
		INP_RUNLOCK(inp);
		return (true);
	}
	return (false);
}
1763
1764
/*
1765
* Unconditionally schedule an inpcb to be freed by decrementing its
1766
* reference count, which should occur only after the inpcb has been detached
1767
* from its socket. If another thread holds a temporary reference (acquired
1768
* using in_pcbref()) then the free is deferred until that reference is
1769
* released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked.
1770
* Almost all work, including removal from global lists, is done in this
1771
* context, where the pcbinfo lock is held.
1772
*/
1773
/*
 * Schedule an inpcb for release: detach it from hashes, global lists
 * and its socket, free its ancillary data, and drop the socket's
 * reference.  If another thread still holds a reference the actual
 * uma_zfree happens later via in_pcbrele_*(); either way the pcb is
 * unlocked on return.
 */
void
in_pcbfree(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
#ifdef INET
	struct ip_moptions *imo;
#endif
#ifdef INET6
	struct ip6_moptions *im6o;
#endif

	INP_WLOCK_ASSERT(inp);
	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
	KASSERT((inp->inp_flags & INP_FREED) == 0,
	    ("%s: called twice for pcb %p", __func__, inp));

	/*
	 * in_pcblookup_local() and in6_pcblookup_local() may return an inpcb
	 * from the hash without acquiring inpcb lock, they rely on the hash
	 * lock, thus in_pcbremhash() should be the first action.
	 */
	if (inp->inp_flags & INP_INHASHLIST)
		in_pcbremhash(inp);
	INP_INFO_WLOCK(pcbinfo);
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	pcbinfo->ipi_count--;
	CK_LIST_REMOVE(inp, inp_list);
	INP_INFO_WUNLOCK(pcbinfo);

#ifdef RATELIMIT
	if (inp->inp_snd_tag != NULL)
		in_pcbdetach_txrtlmt(inp);
#endif
	/* Mark the pcb dead and sever the socket linkage both ways. */
	inp->inp_flags |= INP_FREED;
	inp->inp_socket->so_pcb = NULL;
	inp->inp_socket = NULL;

	RO_INVALIDATE_CACHE(&inp->inp_route);
#ifdef MAC
	mac_inpcb_destroy(inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	if (inp->inp_sp != NULL)
		ipsec_delete_pcbpolicy(inp);
#endif
#ifdef INET
	if (inp->inp_options)
		(void)m_free(inp->inp_options);
	DEBUG_POISON_POINTER(inp->inp_options);
	/* Multicast options are stashed and freed after the unlock below. */
	imo = inp->inp_moptions;
	DEBUG_POISON_POINTER(inp->inp_moptions);
#endif
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6PROTO) {
		ip6_freepcbopts(inp->in6p_outputopts);
		DEBUG_POISON_POINTER(inp->in6p_outputopts);
		im6o = inp->in6p_moptions;
		DEBUG_POISON_POINTER(inp->in6p_moptions);
	} else
		im6o = NULL;
#endif

	/* Drop the socket's reference; if others remain, just unlock. */
	if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
		INP_WUNLOCK(inp);
	}
#ifdef INET6
	ip6_freemoptions(im6o);
#endif
#ifdef INET
	inp_freemoptions(imo);
#endif
}
1845
1846
/*
1847
* Different protocols initialize their inpcbs differently - giving
1848
* different name to the lock. But they all are disposed the same.
1849
*/
1850
static void
inpcb_fini(void *mem, int size)
{
	struct inpcb *inp = mem;

	/* Only the embedded lock needs explicit destruction. */
	INP_LOCK_DESTROY(inp);
}
1857
1858
/*
1859
* in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
1860
* port reservation, and preventing it from being returned by inpcb lookups.
1861
*
1862
* It is used by TCP to mark an inpcb as unused and avoid future packet
1863
* delivery or event notification when a socket remains open but TCP has
1864
* closed. This might occur as a result of a shutdown()-initiated TCP close
1865
* or a RST on the wire, and allows the port binding to be reused while still
1866
* maintaining the invariant that so_pcb always points to a valid inpcb until
1867
* in_pcbdetach().
1868
*
1869
* XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
1870
* in_pcbpurgeif0()?
1871
*/
1872
void
in_pcbdrop(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);

	/*
	 * Mark the pcb dropped so lookups skip it, then release its hash
	 * slot (and thus its address/port reservation).
	 */
	inp->inp_flags |= INP_DROPPED;
	if (inp->inp_flags & INP_INHASHLIST)
		in_pcbremhash(inp);
}
1882
1883
#ifdef INET
1884
/*
1885
* Common routines to return the socket addresses associated with inpcbs.
1886
*/
1887
int
1888
in_getsockaddr(struct socket *so, struct sockaddr *sa)
1889
{
1890
struct inpcb *inp;
1891
1892
inp = sotoinpcb(so);
1893
KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
1894
1895
*(struct sockaddr_in *)sa = (struct sockaddr_in ){
1896
.sin_len = sizeof(struct sockaddr_in),
1897
.sin_family = AF_INET,
1898
.sin_port = inp->inp_lport,
1899
.sin_addr = inp->inp_laddr,
1900
};
1901
1902
return (0);
1903
}
1904
1905
int
1906
in_getpeeraddr(struct socket *so, struct sockaddr *sa)
1907
{
1908
struct inpcb *inp;
1909
1910
inp = sotoinpcb(so);
1911
KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
1912
1913
*(struct sockaddr_in *)sa = (struct sockaddr_in ){
1914
.sin_len = sizeof(struct sockaddr_in),
1915
.sin_family = AF_INET,
1916
.sin_port = inp->inp_fport,
1917
.sin_addr = inp->inp_faddr,
1918
};
1919
1920
return (0);
1921
}
1922
1923
static bool
1924
inp_v4_multi_match(const struct inpcb *inp, void *v __unused)
1925
{
1926
1927
if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL)
1928
return (true);
1929
else
1930
return (false);
1931
}
1932
1933
/*
 * Purge references to a detaching interface from every IPv4 pcb with
 * multicast state: clear it as the selected outgoing interface and
 * leave all groups joined through it.
 */
void
in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
{
	struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB,
	    inp_v4_multi_match, NULL);
	struct inpcb *inp;
	struct in_multi *inm;
	struct in_mfilter *imf;
	struct ip_moptions *imo;

	IN_MULTI_LOCK_ASSERT();

	while ((inp = inp_next(&inpi)) != NULL) {
		INP_WLOCK_ASSERT(inp);

		imo = inp->inp_moptions;
		/*
		 * Unselect the outgoing interface if it is being
		 * detached.
		 */
		if (imo->imo_multicast_ifp == ifp)
			imo->imo_multicast_ifp = NULL;

		/*
		 * Drop multicast group membership if we joined
		 * through the interface being detached.
		 *
		 * XXX This can all be deferred to an epoch_call
		 */
restart:
		/* Restart after each removal: the list is mutated in-loop. */
		IP_MFILTER_FOREACH(imf, &imo->imo_head) {
			if ((inm = imf->imf_inm) == NULL)
				continue;
			if (inm->inm_ifp != ifp)
				continue;
			ip_mfilter_remove(&imo->imo_head, imf);
			in_leavegroup_locked(inm, NULL);
			ip_mfilter_free(imf);
			goto restart;
		}
	}
}
1975
1976
/*
1977
* Lookup a PCB based on the local address and port. Caller must hold the
1978
* hash lock. No inpcb locks or references are acquired.
1979
*/
1980
#define INP_LOOKUP_MAPPED_PCB_COST 3
1981
/*
 * Lookup a PCB based on the local address and port. Caller must hold
 * the hash lock. No inpcb locks or references are acquired.  With
 * INPLOOKUP_WILDCARD, a best-fit search is performed where each
 * wildcard component adds to the match cost.
 */
struct inpcb *
in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int fib, int lookupflags, struct ucred *cred)
{
	struct inpcb *inp;
#ifdef INET6
	int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
#else
	int matchwild = 3;
#endif
	int wildcard;

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT(fib == RT_ALL_FIBS || (fib >= 0 && fib < V_rt_numfibs),
	    ("%s: invalid fib %d", __func__, fib));

	INP_HASH_LOCK_ASSERT(pcbinfo);

	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
		struct inpcbhead *head;
		/*
		 * Look for an unconnected (wildcard foreign addr) PCB that
		 * matches the local address and port we're looking for.
		 */
		head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
		    pcbinfo->ipi_hashmask)];
		CK_LIST_FOREACH(inp, head, inp_hash_wild) {
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (inp->inp_faddr.s_addr == INADDR_ANY &&
			    inp->inp_laddr.s_addr == laddr.s_addr &&
			    inp->inp_lport == lport && (fib == RT_ALL_FIBS ||
			    inp->inp_inc.inc_fibnum == fib)) {
				/*
				 * Found?
				 */
				if (prison_equal_ip4(cred->cr_prison,
				    inp->inp_cred->cr_prison))
					return (inp);
			}
		}
		/*
		 * Not found.
		 */
		return (NULL);
	} else {
		struct inpcbhead *porthash;
		struct inpcb *match = NULL;

		/*
		 * Port is in use by one or more PCBs. Look for best
		 * fit.
		 */
		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
		    pcbinfo->ipi_porthashmask)];
		CK_LIST_FOREACH(inp, porthash, inp_portlist) {
			if (inp->inp_lport != lport)
				continue;
			if (!prison_equal_ip4(inp->inp_cred->cr_prison,
			    cred->cr_prison))
				continue;
			if (fib != RT_ALL_FIBS &&
			    inp->inp_inc.inc_fibnum != fib)
				continue;
			wildcard = 0;
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
			/*
			 * We never select the PCB that has INP_IPV6 flag and
			 * is bound to :: if we have another PCB which is bound
			 * to 0.0.0.0. If a PCB has the INP_IPV6 flag, then we
			 * set its cost higher than IPv4 only PCBs.
			 *
			 * Note that the case only happens when a socket is
			 * bound to ::, under the condition that the use of the
			 * mapped address is allowed.
			 */
			if ((inp->inp_vflag & INP_IPV6) != 0)
				wildcard += INP_LOOKUP_MAPPED_PCB_COST;
#endif
			if (inp->inp_faddr.s_addr != INADDR_ANY)
				wildcard++;
			if (inp->inp_laddr.s_addr != INADDR_ANY) {
				if (laddr.s_addr == INADDR_ANY)
					wildcard++;
				else if (inp->inp_laddr.s_addr != laddr.s_addr)
					continue;
			} else {
				if (laddr.s_addr != INADDR_ANY)
					wildcard++;
			}
			/* Keep the candidate with the lowest cost so far. */
			if (wildcard < matchwild) {
				match = inp;
				matchwild = wildcard;
				if (matchwild == 0)
					break;
			}
		}
		return (match);
	}
}
2088
#undef INP_LOOKUP_MAPPED_PCB_COST
2089
2090
static bool
2091
in_pcblookup_lb_match(const struct inpcblbgroup *grp, int domain, int fib)
2092
{
2093
return ((domain == M_NODOM || domain == grp->il_numa_domain) &&
2094
(fib == RT_ALL_FIBS || fib == grp->il_fibnum));
2095
}
2096
2097
/*
 * Look up a SO_REUSEPORT_LB load-balancing group for an inbound packet
 * destined to { laddr:lport } and pick one member PCB using a hash of the
 * packet's 4-tuple.  Preference order: jailed exact > jailed wild >
 * non-jailed exact > non-jailed wild, with the NUMA domain and FIB acting
 * as tie-breakers within each class.  Returns the selected (unlocked) PCB
 * or NULL.  Runs under the pcbinfo hash lock within a net epoch section.
 */
static struct inpcb *
in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
    const struct in_addr *faddr, uint16_t fport, const struct in_addr *laddr,
    uint16_t lport, int domain, int fib)
{
	const struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild;
	struct inpcb *inp;
	u_int count;

	INP_HASH_LOCK_ASSERT(pcbinfo);
	NET_EPOCH_ASSERT();

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];

	/*
	 * Search for an LB group match based on the following criteria:
	 * - prefer jailed groups to non-jailed groups
	 * - prefer exact source address matches to wildcard matches
	 * - prefer groups bound to the specified NUMA domain
	 */
	jail_exact = jail_wild = local_exact = local_wild = NULL;
	CK_LIST_FOREACH(grp, hdr, il_list) {
		bool injail;

#ifdef INET6
		/* Skip groups that cannot receive IPv4 traffic. */
		if (!(grp->il_vflag & INP_IPV4))
			continue;
#endif
		if (grp->il_lport != lport)
			continue;

		injail = prison_flag(grp->il_cred, PR_IP4) != 0;
		/* A jailed group must be allowed to use laddr. */
		if (injail && prison_check_ip4_locked(grp->il_cred->cr_prison,
		    laddr) != 0)
			continue;

		if (grp->il_laddr.s_addr == laddr->s_addr) {
			if (injail) {
				jail_exact = grp;
				if (in_pcblookup_lb_match(grp, domain, fib))
					/* This is a perfect match. */
					goto out;
			} else if (local_exact == NULL ||
			    in_pcblookup_lb_match(grp, domain, fib)) {
				local_exact = grp;
			}
		} else if (grp->il_laddr.s_addr == INADDR_ANY) {
			if (injail) {
				if (jail_wild == NULL ||
				    in_pcblookup_lb_match(grp, domain, fib))
					jail_wild = grp;
			} else if (local_wild == NULL ||
			    in_pcblookup_lb_match(grp, domain, fib)) {
				local_wild = grp;
			}
		}
	}

	/* No perfect match found: fall back in preference order. */
	if (jail_exact != NULL)
		grp = jail_exact;
	else if (jail_wild != NULL)
		grp = jail_wild;
	else if (local_exact != NULL)
		grp = local_exact;
	else
		grp = local_wild;
	if (grp == NULL)
		return (NULL);

out:
	/*
	 * Synchronize with in_pcblbgroup_insert().
	 */
	count = atomic_load_acq_int(&grp->il_inpcnt);
	if (count == 0)
		return (NULL);
	/* Distribute connections across the group by 4-tuple hash. */
	inp = grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) % count];
	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
	return (inp);
}
2180
2181
static bool
2182
in_pcblookup_exact_match(const struct inpcb *inp, struct in_addr faddr,
2183
u_short fport, struct in_addr laddr, u_short lport)
2184
{
2185
#ifdef INET6
2186
/* XXX inp locking */
2187
if ((inp->inp_vflag & INP_IPV4) == 0)
2188
return (false);
2189
#endif
2190
if (inp->inp_faddr.s_addr == faddr.s_addr &&
2191
inp->inp_laddr.s_addr == laddr.s_addr &&
2192
inp->inp_fport == fport &&
2193
inp->inp_lport == lport)
2194
return (true);
2195
return (false);
2196
}
2197
2198
/*
 * Find the PCB connected exactly as { faddr:fport, laddr:lport } in the
 * exact-match hash table.  The caller holds the pcbinfo hash lock or is
 * inside an SMR read section.  Returns the (unlocked) PCB or NULL.
 */
static struct inpcb *
in_pcblookup_hash_exact(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_short fport, struct in_addr laddr, u_short lport)
{
	struct inpcbhead *head;
	struct inpcb *inp;

	INP_HASH_LOCK_ASSERT(pcbinfo);

	head = &pcbinfo->ipi_hash_exact[INP_PCBHASH(&faddr, lport, fport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash_exact) {
		if (in_pcblookup_exact_match(inp, faddr, fport, laddr, lport))
			return (inp);
	}
	return (NULL);
}
2215
2216
/*
 * Result of matching a PCB against a wildcard (unconnected) lookup:
 * NONE - no match; WILD - matched via an INADDR_ANY local binding;
 * LADDR - matched on an exact local address binding.
 */
typedef enum {
	INPLOOKUP_MATCH_NONE = 0,
	INPLOOKUP_MATCH_WILD = 1,
	INPLOOKUP_MATCH_LADDR = 2,
} inp_lookup_match_t;
2221
2222
/*
 * Classify how a PCB matches a wildcard lookup for { laddr:lport } in the
 * given FIB.  The PCB must be IPv4-capable, unconnected (wildcard foreign
 * address) and bound to lport; the guard order below is deliberate.
 */
static inp_lookup_match_t
in_pcblookup_wild_match(const struct inpcb *inp, struct in_addr laddr,
    u_short lport, int fib)
{
#ifdef INET6
	/* XXX inp locking */
	if ((inp->inp_vflag & INP_IPV4) == 0)
		return (INPLOOKUP_MATCH_NONE);
#endif
	/* Connected PCBs and port mismatches never match a wildcard lookup. */
	if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport)
		return (INPLOOKUP_MATCH_NONE);
	/* Honor a FIB restriction if the caller imposed one. */
	if (fib != RT_ALL_FIBS && inp->inp_inc.inc_fibnum != fib)
		return (INPLOOKUP_MATCH_NONE);
	if (inp->inp_laddr.s_addr == INADDR_ANY)
		return (INPLOOKUP_MATCH_WILD);
	if (inp->inp_laddr.s_addr == laddr.s_addr)
		return (INPLOOKUP_MATCH_LADDR);
	return (INPLOOKUP_MATCH_NONE);
}
2241
2242
#define INP_LOOKUP_AGAIN ((struct inpcb *)(uintptr_t)-1)
2243
2244
/*
 * Wildcard PCB lookup running in an SMR read section.  Only the first
 * candidate on the chain is considered: the insertion ordering maintained
 * by _in_pcbinshash_wild() guarantees that it is the highest-ranking
 * match.  Returns a locked PCB, NULL when nothing matches, or
 * INP_LOOKUP_AGAIN when the candidate could not be locked or revalidated,
 * in which case the caller must fall back to a serialized lookup.
 */
static struct inpcb *
in_pcblookup_hash_wild_smr(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int fib, const inp_lookup_t lockflags)
{
	struct inpcbhead *head;
	struct inpcb *inp;

	KASSERT(SMR_ENTERED(pcbinfo->ipi_smr),
	    ("%s: not in SMR read section", __func__));

	head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash_wild) {
		inp_lookup_match_t match;

		match = in_pcblookup_wild_match(inp, laddr, lport, fib);
		if (match == INPLOOKUP_MATCH_NONE)
			continue;

		if (__predict_true(inp_smr_lock(inp, lockflags))) {
			/* Re-check the match now that the PCB is locked. */
			match = in_pcblookup_wild_match(inp, laddr, lport, fib);
			if (match != INPLOOKUP_MATCH_NONE &&
			    prison_check_ip4_locked(inp->inp_cred->cr_prison,
			    &laddr) == 0)
				return (inp);
			inp_unlock(inp, lockflags);
		}

		/*
		 * The matching socket disappeared out from under us. Fall back
		 * to a serialized lookup.
		 */
		return (INP_LOOKUP_AGAIN);
	}
	return (NULL);
}
2280
2281
/*
 * Wildcard PCB lookup with the pcbinfo hash lock held.  Walks the whole
 * chain, ranking candidates; jailed exact matches win immediately, all
 * other classes are remembered and resolved after the walk.  Returns the
 * best (unlocked) match or NULL.
 */
static struct inpcb *
in_pcblookup_hash_wild_locked(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int fib)
{
	struct inpcbhead *head;
	struct inpcb *inp, *local_wild, *local_exact, *jail_wild;
#ifdef INET6
	struct inpcb *local_wild_mapped;
#endif

	INP_HASH_LOCK_ASSERT(pcbinfo);

	/*
	 * Order of socket selection - we always prefer jails.
	 * 1. jailed, non-wild.
	 * 2. jailed, wild.
	 * 3. non-jailed, non-wild.
	 * 4. non-jailed, wild.
	 */
	head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
	    pcbinfo->ipi_hashmask)];
	local_wild = local_exact = jail_wild = NULL;
#ifdef INET6
	local_wild_mapped = NULL;
#endif
	CK_LIST_FOREACH(inp, head, inp_hash_wild) {
		inp_lookup_match_t match;
		bool injail;

		match = in_pcblookup_wild_match(inp, laddr, lport, fib);
		if (match == INPLOOKUP_MATCH_NONE)
			continue;

		injail = prison_flag(inp->inp_cred, PR_IP4) != 0;
		if (injail) {
			/* The jail must be permitted to use laddr. */
			if (prison_check_ip4_locked(inp->inp_cred->cr_prison,
			    &laddr) != 0)
				continue;
		} else {
			/* A non-jailed PCB can never beat a held exact match. */
			if (local_exact != NULL)
				continue;
		}

		if (match == INPLOOKUP_MATCH_LADDR) {
			if (injail)
				return (inp);
			local_exact = inp;
		} else {
#ifdef INET6
			/* XXX inp locking, NULL check */
			if (inp->inp_vflag & INP_IPV6PROTO)
				local_wild_mapped = inp;
			else
#endif
			if (injail)
				jail_wild = inp;
			else
				local_wild = inp;
		}
	}
	/* Resolve the remembered candidates in preference order. */
	if (jail_wild != NULL)
		return (jail_wild);
	if (local_exact != NULL)
		return (local_exact);
	if (local_wild != NULL)
		return (local_wild);
#ifdef INET6
	/* v4-mapped v6 wildcard PCBs rank last. */
	if (local_wild_mapped != NULL)
		return (local_wild_mapped);
#endif
	return (NULL);
}
2353
2354
/*
2355
* Lookup PCB in hash list, using pcbinfo tables. This variation assumes
2356
* that the caller has either locked the hash list, which usually happens
2357
* for bind(2) operations, or is in SMR section, which happens when sorting
2358
* out incoming packets.
2359
*/
2360
/*
 * Full PCB lookup with the pcbinfo hash lock held: first an exact-match
 * search, then (if INPLOOKUP_WILDCARD) the LB groups and finally the
 * wildcard table.  Returns the best (unlocked) match or NULL.
 */
static struct inpcb *
in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
    uint8_t numa_domain, int fib)
{
	struct inpcb *inp;
	const u_short fport = fport_arg, lport = lport_arg;

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD | INPLOOKUP_FIB)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT(faddr.s_addr != INADDR_ANY,
	    ("%s: invalid foreign address", __func__));
	KASSERT(laddr.s_addr != INADDR_ANY,
	    ("%s: invalid local address", __func__));
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport);
	if (inp != NULL)
		return (inp);

	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		/* Prefer an LB group member over a plain wildcard PCB. */
		inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport,
		    &laddr, lport, numa_domain, fib);
		if (inp == NULL) {
			inp = in_pcblookup_hash_wild_locked(pcbinfo, laddr,
			    lport, fib);
		}
	}

	return (inp);
}
2391
2392
/*
 * Serialized lookup: take the pcbinfo hash write lock, find the PCB, then
 * acquire the requested PCB lock.  If the PCB lock cannot be taken without
 * blocking, a reference is held across dropping the hash lock so the PCB
 * cannot be freed while we sleep on its lock.  Returns a locked PCB or
 * NULL.
 */
static struct inpcb *
in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
    uint8_t numa_domain, int fib)
{
	struct inpcb *inp;
	const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK;

	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
	    ("%s: LOCKPCB not set", __func__));

	INP_HASH_WLOCK(pcbinfo);
	inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
	    lookupflags & ~INPLOOKUP_LOCKMASK, numa_domain, fib);
	if (inp != NULL && !inp_trylock(inp, lockflags)) {
		/* Pin the PCB, drop the hash lock, then block on the PCB. */
		in_pcbref(inp);
		INP_HASH_WUNLOCK(pcbinfo);
		inp_lock(inp, lockflags);
		if (in_pcbrele(inp, lockflags))
			/* XXX-MJ or retry until we get a negative match? */
			inp = NULL;
	} else {
		INP_HASH_WUNLOCK(pcbinfo);
	}
	return (inp);
}
2418
2419
/*
 * Lock-free lookup in an SMR read section.  Any candidate that cannot be
 * locked or that fails revalidation after locking causes a fall back to
 * the serialized in_pcblookup_hash().  On success the PCB is returned
 * locked and the SMR section is implicitly exited by inp_smr_lock();
 * otherwise smr_exit() is called here before returning NULL.
 */
static struct inpcb *
in_pcblookup_hash_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
    uint8_t numa_domain, int fib)
{
	struct inpcb *inp;
	const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK;
	const u_short fport = fport_arg, lport = lport_arg;

	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
	    ("%s: LOCKPCB not set", __func__));

	smr_enter(pcbinfo->ipi_smr);
	inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport);
	if (inp != NULL) {
		if (__predict_true(inp_smr_lock(inp, lockflags))) {
			/*
			 * Revalidate the 4-tuple, the socket could have been
			 * disconnected.
			 */
			if (__predict_true(in_pcblookup_exact_match(inp,
			    faddr, fport, laddr, lport)))
				return (inp);
			inp_unlock(inp, lockflags);
		}

		/*
		 * We failed to lock the inpcb, or its connection state changed
		 * out from under us. Fall back to a precise search.
		 */
		return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
		    lookupflags, numa_domain, fib));
	}

	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport,
		    &laddr, lport, numa_domain, fib);
		if (inp != NULL) {
			if (__predict_true(inp_smr_lock(inp, lockflags))) {
				/* The LB group member must still match. */
				if (__predict_true(in_pcblookup_wild_match(inp,
				    laddr, lport, fib) != INPLOOKUP_MATCH_NONE))
					return (inp);
				inp_unlock(inp, lockflags);
			}
			inp = INP_LOOKUP_AGAIN;
		} else {
			inp = in_pcblookup_hash_wild_smr(pcbinfo, laddr, lport,
			    fib, lockflags);
		}
		if (inp == INP_LOOKUP_AGAIN) {
			return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr,
			    lport, lookupflags, numa_domain, fib));
		}
	}

	if (inp == NULL)
		smr_exit(pcbinfo->ipi_smr);

	return (inp);
}
2481
2482
/*
2483
* Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
2484
* from which a pre-calculated hash value may be extracted.
2485
*/
2486
struct inpcb *
2487
in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
2488
struct in_addr laddr, u_int lport, int lookupflags,
2489
struct ifnet *ifp)
2490
{
2491
int fib;
2492
2493
fib = (lookupflags & INPLOOKUP_FIB) ? if_getfib(ifp) : RT_ALL_FIBS;
2494
return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport,
2495
lookupflags, M_NODOM, fib));
2496
}
2497
2498
struct inpcb *
2499
in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2500
u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
2501
struct ifnet *ifp __unused, struct mbuf *m)
2502
{
2503
int fib;
2504
2505
M_ASSERTPKTHDR(m);
2506
fib = (lookupflags & INPLOOKUP_FIB) ? M_GETFIB(m) : RT_ALL_FIBS;
2507
return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport,
2508
lookupflags, m->m_pkthdr.numa_domain, fib));
2509
}
2510
#endif /* INET */
2511
2512
static bool
2513
in_pcbjailed(const struct inpcb *inp, unsigned int flag)
2514
{
2515
return (prison_flag(inp->inp_cred, flag) != 0);
2516
}
2517
2518
/*
2519
* Insert the PCB into a hash chain using ordering rules which ensure that
2520
* in_pcblookup_hash_wild_*() always encounter the highest-ranking PCB first.
2521
*
2522
* Specifically, keep jailed PCBs in front of non-jailed PCBs, and keep PCBs
2523
* with exact local addresses ahead of wildcard PCBs. Unbound v4-mapped v6 PCBs
2524
* always appear last no matter whether they are jailed.
2525
*/
2526
/*
 * Insert the PCB into a hash chain using ordering rules which ensure that
 * in_pcblookup_hash_wild_*() always encounter the highest-ranking PCB first.
 *
 * Specifically, keep jailed PCBs in front of non-jailed PCBs, and keep PCBs
 * with exact local addresses ahead of wildcard PCBs. Unbound v4-mapped v6 PCBs
 * always appear last no matter whether they are jailed.
 */
static void
_in_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp)
{
	struct inpcb *last;
	bool bound, injail;

	INP_LOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	last = NULL;
	bound = inp->inp_laddr.s_addr != INADDR_ANY;
	/* Unbound v4-mapped v6 PCBs go to the very end of the chain. */
	if (!bound && (inp->inp_vflag & INP_IPV6PROTO) != 0) {
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
		/* Empty chain: the loop did not run. */
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}

	injail = in_pcbjailed(inp, PR_IP4);
	if (!injail) {
		/* Position "last" at the first non-jailed PCB, if any. */
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (!in_pcbjailed(last, PR_IP4))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	} else if (!CK_LIST_EMPTY(pcbhash) &&
	    !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP4)) {
		/* Jailed PCBs precede all non-jailed ones. */
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}
	if (!bound) {
		/* A wildcard PCB goes after all exact-address PCBs. */
		CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) {
			if (last->inp_laddr.s_addr == INADDR_ANY)
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	}
	if (last == NULL)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
	else
		CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild);
}
2578
2579
#ifdef INET6
2580
/*
2581
* See the comment above _in_pcbinshash_wild().
2582
*/
2583
/*
 * See the comment above _in_pcbinshash_wild().  Same ordering rules for
 * IPv6: jailed before non-jailed, exact local address before wildcard.
 */
static void
_in6_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp)
{
	struct inpcb *last;
	bool bound, injail;

	INP_LOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	last = NULL;
	bound = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr);
	injail = in_pcbjailed(inp, PR_IP6);
	if (!injail) {
		/* Position "last" at the first non-jailed PCB, if any. */
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (!in_pcbjailed(last, PR_IP6))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	} else if (!CK_LIST_EMPTY(pcbhash) &&
	    !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP6)) {
		/* Jailed PCBs precede all non-jailed ones. */
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}
	if (!bound) {
		/* A wildcard PCB goes after all exact-address PCBs. */
		CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) {
			if (IN6_IS_ADDR_UNSPECIFIED(&last->in6p_laddr))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	}
	if (last == NULL)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
	else
		CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild);
}
2624
#endif
2625
2626
/*
2627
* Insert PCB onto various hash lists.
2628
*
2629
* With normal sockets this function shall not fail, so it could return void.
2630
* But for SO_REUSEPORT_LB it may need to allocate memory with locks held,
2631
* that's the only condition when it can fail.
2632
*/
2633
/*
 * Insert PCB onto various hash lists.
 *
 * With normal sockets this function shall not fail, so it could return void.
 * But for SO_REUSEPORT_LB it may need to allocate memory with locks held,
 * that's the only condition when it can fail.
 */
int
in_pcbinshash(struct inpcb *inp)
{
	struct inpcbhead *pcbhash, *pcbporthash;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	uint32_t hash;
	bool connected;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);
	KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
	    ("in_pcbinshash: INP_INHASHLIST"));

	/* Compute the hash and whether the PCB is connected. */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr);
	} else
#endif
	{
		hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !in_nullhost(inp->inp_faddr);
	}

	if (connected)
		pcbhash = &pcbinfo->ipi_hash_exact[hash];
	else
		pcbhash = &pcbinfo->ipi_hash_wild[hash];

	pcbporthash = &pcbinfo->ipi_porthashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];

	/*
	 * Add entry to load balance group.
	 * Only do this if SO_REUSEPORT_LB is set.
	 */
	if ((inp->inp_socket->so_options & SO_REUSEPORT_LB) != 0) {
		int error = in_pcbinslbgrouphash(inp, M_NODOM);
		if (error != 0)
			return (error);
	}

	/*
	 * The PCB may have been disconnected in the past. Before we can safely
	 * make it visible in the hash table, we must wait for all readers which
	 * may be traversing this PCB to finish.
	 */
	if (inp->inp_smr != SMR_SEQ_INVALID) {
		smr_wait(pcbinfo->ipi_smr, inp->inp_smr);
		inp->inp_smr = SMR_SEQ_INVALID;
	}

	if (connected)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact);
	else {
		/* Wildcard chains are kept in lookup-preference order. */
#ifdef INET6
		if ((inp->inp_vflag & INP_IPV6) != 0)
			_in6_pcbinshash_wild(pcbhash, inp);
		else
#endif
			_in_pcbinshash_wild(pcbhash, inp);
	}
	CK_LIST_INSERT_HEAD(pcbporthash, inp, inp_portlist);
	inp->inp_flags |= INP_INHASHLIST;

	return (0);
}
2702
2703
/*
 * Remove the PCB from its hash chains (4-tuple hash, LB group, port list)
 * with the pcbinfo hash lock already held.  The PCB must currently be
 * hashed.
 */
void
in_pcbremhash_locked(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
	MPASS(inp->inp_flags & INP_INHASHLIST);

	if ((inp->inp_flags & INP_INLBGROUP) != 0)
		in_pcbremlbgrouphash(inp);
	/*
	 * A PCB lives on exactly one of the exact/wild lists, selected by
	 * whether its foreign address is specified; mirror that choice here.
	 */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))
			CK_LIST_REMOVE(inp, inp_hash_wild);
		else
			CK_LIST_REMOVE(inp, inp_hash_exact);
	} else
#endif
	{
		if (in_nullhost(inp->inp_faddr))
			CK_LIST_REMOVE(inp, inp_hash_wild);
		else
			CK_LIST_REMOVE(inp, inp_hash_exact);
	}
	CK_LIST_REMOVE(inp, inp_portlist);
	inp->inp_flags &= ~INP_INHASHLIST;
}
2730
2731
static void
2732
in_pcbremhash(struct inpcb *inp)
2733
{
2734
INP_HASH_WLOCK(inp->inp_pcbinfo);
2735
in_pcbremhash_locked(inp);
2736
INP_HASH_WUNLOCK(inp->inp_pcbinfo);
2737
}
2738
2739
/*
2740
* Move PCB to the proper hash bucket when { faddr, fport } have been
2741
* changed. NOTE: This does not handle the case of the lport changing (the
2742
* hashed port list would have to be updated as well), so the lport must
2743
* not change after in_pcbinshash() has been called.
2744
*/
2745
/*
 * Move PCB to the proper hash bucket when { faddr, fport } have been
 * changed. NOTE: This does not handle the case of the lport changing (the
 * hashed port list would have to be updated as well), so the lport must
 * not change after in_pcbinshash() has been called.
 */
void
in_pcbrehash(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbhead *head;
	uint32_t hash;
	bool connected;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);
	KASSERT(inp->inp_flags & INP_INHASHLIST,
	    ("%s: !INP_INHASHLIST", __func__));
	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
	    ("%s: inp was disconnected", __func__));

	/* Recompute the hash and connectedness from the new 4-tuple. */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr);
	} else
#endif
	{
		hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !in_nullhost(inp->inp_faddr);
	}

	/*
	 * When rehashing, the caller must ensure that either the new or the old
	 * foreign address was unspecified.
	 */
	/* Remove from the list reflecting the OLD state (opposite of new). */
	if (connected)
		CK_LIST_REMOVE(inp, inp_hash_wild);
	else
		CK_LIST_REMOVE(inp, inp_hash_exact);

	if (connected) {
		head = &pcbinfo->ipi_hash_exact[hash];
		CK_LIST_INSERT_HEAD(head, inp, inp_hash_exact);
	} else {
		head = &pcbinfo->ipi_hash_wild[hash];
		CK_LIST_INSERT_HEAD(head, inp, inp_hash_wild);
	}
}
2790
2791
/*
2792
* Check for alternatives when higher level complains
2793
* about service problems. For now, invalidate cached
2794
* routing information. If the route was created dynamically
2795
* (by a redirect), time to try a default gateway again.
2796
*/
2797
void
2798
in_losing(struct inpcb *inp)
2799
{
2800
2801
RO_INVALIDATE_CACHE(&inp->inp_route);
2802
return;
2803
}
2804
2805
/*
2806
* A set label operation has occurred at the socket layer, propagate the
2807
* label change into the in_pcb for the socket.
2808
*/
2809
/*
 * A set label operation has occurred at the socket layer, propagate the
 * label change into the in_pcb for the socket.
 */
void
in_pcbsosetlabel(struct socket *so)
{
#ifdef MAC
	struct inpcb *inp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));

	/* Lock order: PCB before socket. */
	INP_WLOCK(inp);
	SOCK_LOCK(so);
	mac_inpcb_sosetlabel(so, inp);
	SOCK_UNLOCK(so);
	INP_WUNLOCK(inp);
#endif
}
2825
2826
/*
 * Function wrapper around the INP_WLOCK() macro, for use by consumers
 * that cannot use the macro directly.
 */
void
inp_wlock(struct inpcb *inp)
{

	INP_WLOCK(inp);
}
2832
2833
/* Function wrapper around the INP_WUNLOCK() macro. */
void
inp_wunlock(struct inpcb *inp)
{

	INP_WUNLOCK(inp);
}
2839
2840
/* Function wrapper around the INP_RLOCK() macro. */
void
inp_rlock(struct inpcb *inp)
{

	INP_RLOCK(inp);
}
2846
2847
/* Function wrapper around the INP_RUNLOCK() macro. */
void
inp_runlock(struct inpcb *inp)
{

	INP_RUNLOCK(inp);
}
2853
2854
#ifdef INVARIANT_SUPPORT
2855
/* Assert that the PCB write lock is held (INVARIANTS kernels only). */
void
inp_lock_assert(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
}
2861
2862
/* Assert that the PCB lock is not held (INVARIANTS kernels only). */
void
inp_unlock_assert(struct inpcb *inp)
{

	INP_UNLOCK_ASSERT(inp);
}
2868
#endif
2869
2870
/*
 * Invoke func(inp, arg) on every PCB in pcbinfo.  Each PCB is passed to
 * the callback write-locked; the iterator handles locking and liveness.
 */
void
inp_apply_all(struct inpcbinfo *pcbinfo,
    void (*func)(struct inpcb *, void *), void *arg)
{
	struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
	    INPLOOKUP_WLOCKPCB);
	struct inpcb *inp;

	while ((inp = inp_next(&inpi)) != NULL)
		func(inp, arg);
}
2881
2882
/*
 * Return the socket backing a PCB.  The caller must hold the PCB write
 * lock to guarantee the socket pointer is stable.
 */
struct socket *
inp_inpcbtosocket(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	return (inp->inp_socket);
}
2889
2890
void
2891
inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
2892
uint32_t *faddr, uint16_t *fp)
2893
{
2894
2895
INP_LOCK_ASSERT(inp);
2896
*laddr = inp->inp_laddr.s_addr;
2897
*faddr = inp->inp_faddr.s_addr;
2898
*lp = inp->inp_lport;
2899
*fp = inp->inp_fport;
2900
}
2901
2902
/*
2903
* Create an external-format (``xinpcb'') structure using the information in
2904
* the kernel-format in_pcb structure pointed to by inp. This is done to
2905
* reduce the spew of irrelevant information over this interface, to isolate
2906
* user code from changes in the kernel structure, and potentially to provide
2907
* information-hiding if we decide that some of this information should be
2908
* hidden from users.
2909
*/
2910
/*
 * Create an external-format (``xinpcb'') structure using the information in
 * the kernel-format in_pcb structure pointed to by inp. This is done to
 * reduce the spew of irrelevant information over this interface, to isolate
 * user code from changes in the kernel structure, and potentially to provide
 * information-hiding if we decide that some of this information should be
 * hidden from users.
 */
void
in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi)
{

	bzero(xi, sizeof(*xi));
	xi->xi_len = sizeof(struct xinpcb);
	/* A PCB may briefly lack a socket, e.g. during teardown. */
	if (inp->inp_socket)
		sotoxsocket(inp->inp_socket, &xi->xi_socket);
	bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo));
	xi->inp_gencnt = inp->inp_gencnt;
	xi->inp_flow = inp->inp_flow;
	xi->inp_flowid = inp->inp_flowid;
	xi->inp_flowtype = inp->inp_flowtype;
	xi->inp_flags = inp->inp_flags;
	xi->inp_flags2 = inp->inp_flags2;
	xi->in6p_cksum = inp->in6p_cksum;
	xi->in6p_hops = inp->in6p_hops;
	xi->inp_ip_tos = inp->inp_ip_tos;
	xi->inp_vflag = inp->inp_vflag;
	xi->inp_ip_ttl = inp->inp_ip_ttl;
	xi->inp_ip_p = inp->inp_ip_p;
	xi->inp_ip_minttl = inp->inp_ip_minttl;
}
2933
2934
/*
 * Sysctl handler that applies a socket option to a specific PCB identified
 * by its generation count (sop_id) and connection info.  The request body
 * is a struct sockopt_parameters followed by the option value.  This is a
 * write-only interface: reads (oldptr/oldlen) are rejected.  Returns
 * ESRCH if no matching PCB exists, ECONNRESET if it was dropped.
 */
int
sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo,
    int (*ctloutput_set)(struct inpcb *, struct sockopt *))
{
	struct sockopt sopt;
	struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
	    INPLOOKUP_WLOCKPCB);
	struct inpcb *inp;
	struct sockopt_parameters *params;
	struct socket *so;
	int error;
	char buf[1024];

	if (req->oldptr != NULL || req->oldlen != 0)
		return (EINVAL);
	if (req->newptr == NULL)
		return (EPERM);
	if (req->newlen > sizeof(buf))
		return (ENOMEM);
	error = SYSCTL_IN(req, buf, req->newlen);
	if (error != 0)
		return (error);
	if (req->newlen < sizeof(struct sockopt_parameters))
		return (EINVAL);
	params = (struct sockopt_parameters *)buf;
	/* Build the sockopt from the user-supplied parameters. */
	sopt.sopt_level = params->sop_level;
	sopt.sopt_name = params->sop_optname;
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_val = params->sop_optval;
	sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters);
	sopt.sopt_td = NULL;
#ifdef INET6
	if (params->sop_inc.inc_flags & INC_ISIPV6) {
		/* Embed the scope zone ID into link-local addresses. */
		if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_laddr))
			params->sop_inc.inc6_laddr.s6_addr16[1] =
			    htons(params->sop_inc.inc6_zoneid & 0xffff);
		if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_faddr))
			params->sop_inc.inc6_faddr.s6_addr16[1] =
			    htons(params->sop_inc.inc6_zoneid & 0xffff);
	}
#endif
	/*
	 * If both ports are specified, restrict the iterator to a single
	 * hash chain instead of scanning every PCB.
	 */
	if (params->sop_inc.inc_lport != htons(0) &&
	    params->sop_inc.inc_fport != htons(0)) {
#ifdef INET6
		if (params->sop_inc.inc_flags & INC_ISIPV6)
			inpi.hash = INP6_PCBHASH(
			    &params->sop_inc.inc6_faddr,
			    params->sop_inc.inc_lport,
			    params->sop_inc.inc_fport,
			    pcbinfo->ipi_hashmask);
		else
#endif
			inpi.hash = INP_PCBHASH(
			    &params->sop_inc.inc_faddr,
			    params->sop_inc.inc_lport,
			    params->sop_inc.inc_fport,
			    pcbinfo->ipi_hashmask);
	}
	while ((inp = inp_next(&inpi)) != NULL)
		if (inp->inp_gencnt == params->sop_id) {
			if (inp->inp_flags & INP_DROPPED) {
				INP_WUNLOCK(inp);
				return (ECONNRESET);
			}
			so = inp->inp_socket;
			KASSERT(so != NULL, ("inp_socket == NULL"));
			/* Hold the socket across the option call. */
			soref(so);
			if (params->sop_level == SOL_SOCKET) {
				INP_WUNLOCK(inp);
				error = sosetopt(so, &sopt);
			} else
				/* ctloutput_set unlocks the PCB itself. */
				error = (*ctloutput_set)(inp, &sopt);
			sorele(so);
			break;
		}
	if (inp == NULL)
		error = ESRCH;
	return (error);
}
3013
3014
#ifdef DDB
3015
/* Emit `indent' spaces to the debugger console. */
static void
db_print_indent(int indent)
{
	while (indent-- > 0)
		db_printf(" ");
}
3023
3024
/*
 * Pretty-print a struct in_conninfo (addresses and ports, v4 or v6) for
 * the DDB "show inpcb" command.
 */
static void
db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
{
	/* 48 bytes is enough for the longest IPv6 presentation string. */
	char faddr_str[48], laddr_str[48];

	db_print_indent(indent);
	db_printf("%s at %p\n", name, inc);

	indent += 2;

#ifdef INET6
	if (inc->inc_flags & INC_ISIPV6) {
		/* IPv6. */
		ip6_sprintf(laddr_str, &inc->inc6_laddr);
		ip6_sprintf(faddr_str, &inc->inc6_faddr);
	} else
#endif
	{
		/* IPv4. */
		inet_ntoa_r(inc->inc_laddr, laddr_str);
		inet_ntoa_r(inc->inc_faddr, faddr_str);
	}
	db_print_indent(indent);
	db_printf("inc_laddr %s inc_lport %u\n", laddr_str,
	    ntohs(inc->inc_lport));
	db_print_indent(indent);
	db_printf("inc_faddr %s inc_fport %u\n", faddr_str,
	    ntohs(inc->inc_fport));
}
3053
3054
static void
3055
db_print_inpflags(int inp_flags)
3056
{
3057
int comma;
3058
3059
comma = 0;
3060
if (inp_flags & INP_RECVOPTS) {
3061
db_printf("%sINP_RECVOPTS", comma ? ", " : "");
3062
comma = 1;
3063
}
3064
if (inp_flags & INP_RECVRETOPTS) {
3065
db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
3066
comma = 1;
3067
}
3068
if (inp_flags & INP_RECVDSTADDR) {
3069
db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
3070
comma = 1;
3071
}
3072
if (inp_flags & INP_ORIGDSTADDR) {
3073
db_printf("%sINP_ORIGDSTADDR", comma ? ", " : "");
3074
comma = 1;
3075
}
3076
if (inp_flags & INP_HDRINCL) {
3077
db_printf("%sINP_HDRINCL", comma ? ", " : "");
3078
comma = 1;
3079
}
3080
if (inp_flags & INP_HIGHPORT) {
3081
db_printf("%sINP_HIGHPORT", comma ? ", " : "");
3082
comma = 1;
3083
}
3084
if (inp_flags & INP_LOWPORT) {
3085
db_printf("%sINP_LOWPORT", comma ? ", " : "");
3086
comma = 1;
3087
}
3088
if (inp_flags & INP_ANONPORT) {
3089
db_printf("%sINP_ANONPORT", comma ? ", " : "");
3090
comma = 1;
3091
}
3092
if (inp_flags & INP_RECVIF) {
3093
db_printf("%sINP_RECVIF", comma ? ", " : "");
3094
comma = 1;
3095
}
3096
if (inp_flags & INP_MTUDISC) {
3097
db_printf("%sINP_MTUDISC", comma ? ", " : "");
3098
comma = 1;
3099
}
3100
if (inp_flags & INP_RECVTTL) {
3101
db_printf("%sINP_RECVTTL", comma ? ", " : "");
3102
comma = 1;
3103
}
3104
if (inp_flags & INP_DONTFRAG) {
3105
db_printf("%sINP_DONTFRAG", comma ? ", " : "");
3106
comma = 1;
3107
}
3108
if (inp_flags & INP_RECVTOS) {
3109
db_printf("%sINP_RECVTOS", comma ? ", " : "");
3110
comma = 1;
3111
}
3112
if (inp_flags & IN6P_IPV6_V6ONLY) {
3113
db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
3114
comma = 1;
3115
}
3116
if (inp_flags & IN6P_PKTINFO) {
3117
db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
3118
comma = 1;
3119
}
3120
if (inp_flags & IN6P_HOPLIMIT) {
3121
db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
3122
comma = 1;
3123
}
3124
if (inp_flags & IN6P_HOPOPTS) {
3125
db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
3126
comma = 1;
3127
}
3128
if (inp_flags & IN6P_DSTOPTS) {
3129
db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
3130
comma = 1;
3131
}
3132
if (inp_flags & IN6P_RTHDR) {
3133
db_printf("%sIN6P_RTHDR", comma ? ", " : "");
3134
comma = 1;
3135
}
3136
if (inp_flags & IN6P_RTHDRDSTOPTS) {
3137
db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
3138
comma = 1;
3139
}
3140
if (inp_flags & IN6P_TCLASS) {
3141
db_printf("%sIN6P_TCLASS", comma ? ", " : "");
3142
comma = 1;
3143
}
3144
if (inp_flags & IN6P_AUTOFLOWLABEL) {
3145
db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
3146
comma = 1;
3147
}
3148
if (inp_flags & INP_ONESBCAST) {
3149
db_printf("%sINP_ONESBCAST", comma ? ", " : "");
3150
comma = 1;
3151
}
3152
if (inp_flags & INP_DROPPED) {
3153
db_printf("%sINP_DROPPED", comma ? ", " : "");
3154
comma = 1;
3155
}
3156
if (inp_flags & INP_SOCKREF) {
3157
db_printf("%sINP_SOCKREF", comma ? ", " : "");
3158
comma = 1;
3159
}
3160
if (inp_flags & IN6P_RFC2292) {
3161
db_printf("%sIN6P_RFC2292", comma ? ", " : "");
3162
comma = 1;
3163
}
3164
if (inp_flags & IN6P_MTU) {
3165
db_printf("IN6P_MTU%s", comma ? ", " : "");
3166
comma = 1;
3167
}
3168
}
3169
3170
static void
3171
db_print_inpvflag(u_char inp_vflag)
3172
{
3173
int comma;
3174
3175
comma = 0;
3176
if (inp_vflag & INP_IPV4) {
3177
db_printf("%sINP_IPV4", comma ? ", " : "");
3178
comma = 1;
3179
}
3180
if (inp_vflag & INP_IPV6) {
3181
db_printf("%sINP_IPV6", comma ? ", " : "");
3182
comma = 1;
3183
}
3184
if (inp_vflag & INP_IPV6PROTO) {
3185
db_printf("%sINP_IPV6PROTO", comma ? ", " : "");
3186
comma = 1;
3187
}
3188
}
3189
3190
/*
 * Dump one struct inpcb to the debugger console: connection info, flags,
 * and protocol-specific option pointers.
 */
static void
db_print_inpcb(struct inpcb *inp, const char *name, int indent)
{

	db_print_indent(indent);
	db_printf("%s at %p\n", name, inp);

	indent += 2;

	db_print_indent(indent);
	db_printf("inp_flow: 0x%x\n", inp->inp_flow);

	db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);

	db_print_indent(indent);
	db_printf("inp_label: %p inp_flags: 0x%x (",
	    inp->inp_label, inp->inp_flags);
	db_print_inpflags(inp->inp_flags);
	db_printf(")\n");

	db_print_indent(indent);
	db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp,
	    inp->inp_vflag);
	db_print_inpvflag(inp->inp_vflag);
	db_printf(")\n");

	db_print_indent(indent);
	db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n",
	    inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);

	db_print_indent(indent);
	/* Option pointers differ between the v4 and v6 views of the PCB. */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		db_printf("in6p_options: %p in6p_outputopts: %p "
		    "in6p_moptions: %p\n", inp->in6p_options,
		    inp->in6p_outputopts, inp->in6p_moptions);
		db_printf("in6p_icmp6filt: %p in6p_cksum %d "
		    "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
		    inp->in6p_hops);
	} else
#endif
	{
		db_printf("inp_ip_tos: %d inp_ip_options: %p "
		    "inp_ip_moptions: %p\n", inp->inp_ip_tos,
		    inp->inp_options, inp->inp_moptions);
	}

	db_print_indent(indent);
	db_printf("inp_gencnt: %ju\n", (uintmax_t)inp->inp_gencnt);
}
3240
3241
DB_SHOW_COMMAND(inpcb, db_show_inpcb)
3242
{
3243
struct inpcb *inp;
3244
3245
if (!have_addr) {
3246
db_printf("usage: show inpcb <addr>\n");
3247
return;
3248
}
3249
inp = (struct inpcb *)addr;
3250
3251
db_print_inpcb(inp, "inpcb", 0);
3252
}
3253
#endif /* DDB */
3254
3255
#ifdef RATELIMIT
3256
/*
3257
* Modify TX rate limit based on the existing "inp->inp_snd_tag",
3258
* if any.
3259
*/
3260
int
3261
in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
3262
{
3263
union if_snd_tag_modify_params params = {
3264
.rate_limit.max_rate = max_pacing_rate,
3265
.rate_limit.flags = M_NOWAIT,
3266
};
3267
struct m_snd_tag *mst;
3268
int error;
3269
3270
mst = inp->inp_snd_tag;
3271
if (mst == NULL)
3272
return (EINVAL);
3273
3274
if (mst->sw->snd_tag_modify == NULL) {
3275
error = EOPNOTSUPP;
3276
} else {
3277
error = mst->sw->snd_tag_modify(mst, &params);
3278
}
3279
return (error);
3280
}
3281
3282
/*
3283
* Query existing TX rate limit based on the existing
3284
* "inp->inp_snd_tag", if any.
3285
*/
3286
int
3287
in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
3288
{
3289
union if_snd_tag_query_params params = { };
3290
struct m_snd_tag *mst;
3291
int error;
3292
3293
mst = inp->inp_snd_tag;
3294
if (mst == NULL)
3295
return (EINVAL);
3296
3297
if (mst->sw->snd_tag_query == NULL) {
3298
error = EOPNOTSUPP;
3299
} else {
3300
error = mst->sw->snd_tag_query(mst, &params);
3301
if (error == 0 && p_max_pacing_rate != NULL)
3302
*p_max_pacing_rate = params.rate_limit.max_rate;
3303
}
3304
return (error);
3305
}
3306
3307
/*
3308
* Query existing TX queue level based on the existing
3309
* "inp->inp_snd_tag", if any.
3310
*/
3311
int
3312
in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level)
3313
{
3314
union if_snd_tag_query_params params = { };
3315
struct m_snd_tag *mst;
3316
int error;
3317
3318
mst = inp->inp_snd_tag;
3319
if (mst == NULL)
3320
return (EINVAL);
3321
3322
if (mst->sw->snd_tag_query == NULL)
3323
return (EOPNOTSUPP);
3324
3325
error = mst->sw->snd_tag_query(mst, &params);
3326
if (error == 0 && p_txqueue_level != NULL)
3327
*p_txqueue_level = params.rate_limit.queue_level;
3328
return (error);
3329
}
3330
3331
/*
3332
* Allocate a new TX rate limit send tag from the network interface
3333
* given by the "ifp" argument and save it in "inp->inp_snd_tag":
3334
*/
3335
int
3336
in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
3337
uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st)
3338
3339
{
3340
union if_snd_tag_alloc_params params = {
3341
.rate_limit.hdr.type = (max_pacing_rate == -1U) ?
3342
IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT,
3343
.rate_limit.hdr.flowid = flowid,
3344
.rate_limit.hdr.flowtype = flowtype,
3345
.rate_limit.hdr.numa_domain = inp->inp_numa_domain,
3346
.rate_limit.max_rate = max_pacing_rate,
3347
.rate_limit.flags = M_NOWAIT,
3348
};
3349
int error;
3350
3351
INP_WLOCK_ASSERT(inp);
3352
3353
/*
3354
* If there is already a send tag, or the INP is being torn
3355
* down, allocating a new send tag is not allowed. Else send
3356
* tags may leak.
3357
*/
3358
if (*st != NULL || (inp->inp_flags & INP_DROPPED) != 0)
3359
return (EINVAL);
3360
3361
error = m_snd_tag_alloc(ifp, &params, st);
3362
#ifdef INET
3363
if (error == 0) {
3364
counter_u64_add(rate_limit_set_ok, 1);
3365
counter_u64_add(rate_limit_active, 1);
3366
} else if (error != EOPNOTSUPP)
3367
counter_u64_add(rate_limit_alloc_fail, 1);
3368
#endif
3369
return (error);
3370
}
3371
3372
/*
 * Release a reference on a rate limit send tag and account for it in
 * the active-tag counter.
 */
void
in_pcbdetach_tag(struct m_snd_tag *mst)
{
#ifdef INET
	counter_u64_add(rate_limit_active, -1);
#endif
	m_snd_tag_rele(mst);
}
3381
3382
/*
3383
* Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
3384
* if any:
3385
*/
3386
void
3387
in_pcbdetach_txrtlmt(struct inpcb *inp)
3388
{
3389
struct m_snd_tag *mst;
3390
3391
INP_WLOCK_ASSERT(inp);
3392
3393
mst = inp->inp_snd_tag;
3394
inp->inp_snd_tag = NULL;
3395
3396
if (mst == NULL)
3397
return;
3398
3399
m_snd_tag_rele(mst);
3400
#ifdef INET
3401
counter_u64_add(rate_limit_active, -1);
3402
#endif
3403
}
3404
3405
/*
 * Attach, detach or modify the TX rate limit send tag for "inp" so it
 * matches "max_pacing_rate" and the output interface "ifp".  Must be
 * called with the inpcb write-locked.  Returns zero on success, EAGAIN
 * when no RSS hash is available yet, or a driver error.
 */
int
in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate)
{
	int error;

	/*
	 * If the existing send tag is for the wrong interface due to
	 * a route change, first drop the existing tag. Set the
	 * CHANGED flag so that we will keep trying to allocate a new
	 * tag if we fail to allocate one this time.
	 */
	if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) {
		in_pcbdetach_txrtlmt(inp);
		inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
	}

	/*
	 * NOTE: When attaching to a network interface a reference is
	 * made to ensure the network interface doesn't go away until
	 * all ratelimit connections are gone. The network interface
	 * pointers compared below represent valid network interfaces,
	 * except when comparing towards NULL.
	 */
	if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
		/* No pacing requested and no tag attached: nothing to do. */
		error = 0;
	} else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
		/* Interface doesn't support rate limiting; drop any tag. */
		if (inp->inp_snd_tag != NULL)
			in_pcbdetach_txrtlmt(inp);
		error = 0;
	} else if (inp->inp_snd_tag == NULL) {
		/*
		 * In order to utilize packet pacing with RSS, we need
		 * to wait until there is a valid RSS hash before we
		 * can proceed:
		 */
		if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
			error = EAGAIN;
		} else {
			error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
			    mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag);
		}
	} else {
		/* A tag for the right interface exists; adjust its rate. */
		error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
	}
	/* EOPNOTSUPP is final: retrying the same request cannot succeed. */
	if (error == 0 || error == EOPNOTSUPP)
		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;

	return (error);
}
3454
3455
/*
3456
* This function should be called when the INP_RATE_LIMIT_CHANGED flag
3457
* is set in the fast path and will attach/detach/modify the TX rate
3458
* limit send tag based on the socket's so_max_pacing_rate value.
3459
*/
3460
void
3461
in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
3462
{
3463
struct socket *socket;
3464
uint32_t max_pacing_rate;
3465
bool did_upgrade;
3466
3467
if (inp == NULL)
3468
return;
3469
3470
socket = inp->inp_socket;
3471
if (socket == NULL)
3472
return;
3473
3474
if (!INP_WLOCKED(inp)) {
3475
/*
3476
* NOTE: If the write locking fails, we need to bail
3477
* out and use the non-ratelimited ring for the
3478
* transmit until there is a new chance to get the
3479
* write lock.
3480
*/
3481
if (!INP_TRY_UPGRADE(inp))
3482
return;
3483
did_upgrade = 1;
3484
} else {
3485
did_upgrade = 0;
3486
}
3487
3488
/*
3489
* NOTE: The so_max_pacing_rate value is read unlocked,
3490
* because atomic updates are not required since the variable
3491
* is checked at every mbuf we send. It is assumed that the
3492
* variable read itself will be atomic.
3493
*/
3494
max_pacing_rate = socket->so_max_pacing_rate;
3495
3496
in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate);
3497
3498
if (did_upgrade)
3499
INP_DOWNGRADE(inp);
3500
}
3501
3502
/*
3503
* Track route changes for TX rate limiting.
3504
*/
3505
void
3506
in_pcboutput_eagain(struct inpcb *inp)
3507
{
3508
bool did_upgrade;
3509
3510
if (inp == NULL)
3511
return;
3512
3513
if (inp->inp_snd_tag == NULL)
3514
return;
3515
3516
if (!INP_WLOCKED(inp)) {
3517
/*
3518
* NOTE: If the write locking fails, we need to bail
3519
* out and use the non-ratelimited ring for the
3520
* transmit until there is a new chance to get the
3521
* write lock.
3522
*/
3523
if (!INP_TRY_UPGRADE(inp))
3524
return;
3525
did_upgrade = 1;
3526
} else {
3527
did_upgrade = 0;
3528
}
3529
3530
/* detach rate limiting */
3531
in_pcbdetach_txrtlmt(inp);
3532
3533
/* make sure new mbuf send tag allocation is made */
3534
inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
3535
3536
if (did_upgrade)
3537
INP_DOWNGRADE(inp);
3538
}
3539
3540
#ifdef INET
3541
static void
3542
rl_init(void *st)
3543
{
3544
rate_limit_new = counter_u64_alloc(M_WAITOK);
3545
rate_limit_chg = counter_u64_alloc(M_WAITOK);
3546
rate_limit_active = counter_u64_alloc(M_WAITOK);
3547
rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK);
3548
rate_limit_set_ok = counter_u64_alloc(M_WAITOK);
3549
}
3550
3551
SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL);
3552
#endif
3553
#endif /* RATELIMIT */
3554
3555