Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/netinet/in_pcb.c
105517 views
1
/*-
2
* SPDX-License-Identifier: BSD-3-Clause
3
*
4
* Copyright (c) 1982, 1986, 1991, 1993, 1995
5
* The Regents of the University of California.
6
* Copyright (c) 2007-2009 Robert N. M. Watson
7
* Copyright (c) 2010-2011 Juniper Networks, Inc.
8
* Copyright (c) 2021-2022 Gleb Smirnoff <[email protected]>
9
* All rights reserved.
10
*
11
* Portions of this software were developed by Robert N. M. Watson under
12
* contract to Juniper Networks, Inc.
13
*
14
* Redistribution and use in source and binary forms, with or without
15
* modification, are permitted provided that the following conditions
16
* are met:
17
* 1. Redistributions of source code must retain the above copyright
18
* notice, this list of conditions and the following disclaimer.
19
* 2. Redistributions in binary form must reproduce the above copyright
20
* notice, this list of conditions and the following disclaimer in the
21
* documentation and/or other materials provided with the distribution.
22
* 3. Neither the name of the University nor the names of its contributors
23
* may be used to endorse or promote products derived from this software
24
* without specific prior written permission.
25
*
26
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36
* SUCH DAMAGE.
37
*/
38
39
#include "opt_ddb.h"
40
#include "opt_ipsec.h"
41
#include "opt_inet.h"
42
#include "opt_inet6.h"
43
#include "opt_ratelimit.h"
44
#include "opt_route.h"
45
#include "opt_rss.h"
46
47
#include <sys/param.h>
48
#include <sys/hash.h>
49
#include <sys/systm.h>
50
#include <sys/libkern.h>
51
#include <sys/lock.h>
52
#include <sys/malloc.h>
53
#include <sys/mbuf.h>
54
#include <sys/eventhandler.h>
55
#include <sys/domain.h>
56
#include <sys/proc.h>
57
#include <sys/protosw.h>
58
#include <sys/smp.h>
59
#include <sys/smr.h>
60
#include <sys/socket.h>
61
#include <sys/socketvar.h>
62
#include <sys/sockio.h>
63
#include <sys/priv.h>
64
#include <sys/proc.h>
65
#include <sys/refcount.h>
66
#include <sys/jail.h>
67
#include <sys/kernel.h>
68
#include <sys/sysctl.h>
69
70
#ifdef DDB
71
#include <ddb/ddb.h>
72
#endif
73
74
#include <vm/uma.h>
75
#include <vm/vm.h>
76
77
#include <net/if.h>
78
#include <net/if_var.h>
79
#include <net/if_private.h>
80
#include <net/if_types.h>
81
#include <net/if_llatbl.h>
82
#include <net/route.h>
83
#include <net/rss_config.h>
84
#include <net/vnet.h>
85
86
#if defined(INET) || defined(INET6)
87
#include <netinet/in.h>
88
#include <netinet/in_pcb.h>
89
#include <netinet/in_pcb_var.h>
90
#include <netinet/tcp.h>
91
#ifdef INET
92
#include <netinet/in_var.h>
93
#include <netinet/in_fib.h>
94
#endif
95
#include <netinet/ip_var.h>
96
#ifdef INET6
97
#include <netinet/ip6.h>
98
#include <netinet6/in6_pcb.h>
99
#include <netinet6/in6_var.h>
100
#include <netinet6/ip6_var.h>
101
#endif /* INET6 */
102
#include <net/route/nhop.h>
103
#endif
104
105
#include <netipsec/ipsec_support.h>
106
107
#include <security/mac/mac_framework.h>
108
109
/* Initial and maximum number of member slots in a SO_REUSEPORT_LB group. */
#define INPCBLBGROUP_SIZMIN 8
#define INPCBLBGROUP_SIZMAX 256

/* inp_flags bits tested locally in this file. */
#define INP_FREED 0x00000200 /* Went through in_pcbfree(). */
#define INP_INLBGROUP 0x01000000 /* Inserted into inpcblbgroup. */
115
/*
116
* These configure the range of local port addresses assigned to
117
* "unspecified" outgoing connections/packets/whatever.
118
*/
119
VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */
120
VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */
121
VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */
122
VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */
123
VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */
124
VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */
125
126
/*
127
* Reserved ports accessible only to root. There are significant
128
* security considerations that must be accounted for when changing these,
129
* but the security benefits can be great. Please be careful.
130
*/
131
VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */
132
VNET_DEFINE(int, ipport_reservedlow);
133
134
/* Enable random ephemeral port allocation by default. */
135
VNET_DEFINE(int, ipport_randomized) = 1;
136
137
#ifdef INET
138
static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
139
struct in_addr faddr, u_int fport_arg,
140
struct in_addr laddr, u_int lport_arg,
141
int lookupflags, uint8_t numa_domain, int fib);
142
143
/*
 * Clamp a sysctl-settable port-range variable into [min, max].
 *
 * Wrapped in do { } while (0) so the macro expands to exactly one
 * statement: a bare if/else-if expansion followed by the caller's ';'
 * is an empty statement and breaks (dangling else) if the macro is
 * ever used as the body of an if/else (CERT PRE10-C).
 */
#define RANGECHK(var, min, max) do {					\
	if ((var) < (min)) { (var) = (min); }				\
	else if ((var) > (max)) { (var) = (max); }			\
} while (0)
146
147
/*
 * Sysctl handler shared by all net.inet.ip.portrange.* knobs: accept the
 * new value, then clamp every auto-port range bound back into its legal
 * window so the ranges can never be driven inconsistent by userland.
 */
static int
sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = sysctl_handle_int(oidp, arg1, arg2, req);
	if (error == 0) {
		RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
		RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
		RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
		RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
		RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
		RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
	}
	return (error);
}
163
164
#undef RANGECHK
165
166
static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange,
167
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
168
"IP Ports");
169
170
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
171
CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
172
&VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I",
173
"");
174
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
175
CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
176
&VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I",
177
"");
178
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
179
CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
180
&VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I",
181
"");
182
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
183
CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
184
&VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I",
185
"");
186
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
187
CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
188
&VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I",
189
"");
190
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
191
CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
192
&VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I",
193
"");
194
SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
195
CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
196
&VNET_NAME(ipport_reservedhigh), 0, "");
197
SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
198
CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
199
SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
200
CTLFLAG_VNET | CTLFLAG_RW,
201
&VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
202
203
#ifdef RATELIMIT
204
counter_u64_t rate_limit_new;
205
counter_u64_t rate_limit_chg;
206
counter_u64_t rate_limit_active;
207
counter_u64_t rate_limit_alloc_fail;
208
counter_u64_t rate_limit_set_ok;
209
210
static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
211
"IP Rate Limiting");
212
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
213
&rate_limit_active, "Active rate limited connections");
214
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
215
&rate_limit_alloc_fail, "Rate limited connection failures");
216
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
217
&rate_limit_set_ok, "Rate limited setting succeeded");
218
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD,
219
&rate_limit_new, "Total Rate limit new attempts");
220
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD,
221
&rate_limit_chg, "Total Rate limited change attempts");
222
#endif /* RATELIMIT */
223
224
#endif /* INET */
225
226
VNET_DEFINE(uint32_t, in_pcbhashseed);
227
/* Seed the per-VNET PCB hash with a random value at VNET bring-up. */
static void
in_pcbhashseed_init(void)
{

	V_in_pcbhashseed = arc4random();
}
VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST,
    in_pcbhashseed_init, NULL);
235
236
#ifdef INET
237
VNET_DEFINE_STATIC(int, connect_inaddr_wild) = 0;
238
#define V_connect_inaddr_wild VNET(connect_inaddr_wild)
239
SYSCTL_INT(_net_inet_ip, OID_AUTO, connect_inaddr_wild,
240
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(connect_inaddr_wild), 0,
241
"Allow connecting to INADDR_ANY or INADDR_BROADCAST for connect(2)");
242
#endif
243
244
static void in_pcbremhash(struct inpcb *);
245
246
/*
247
* in_pcb.c: manage the Protocol Control Blocks.
248
*
249
* NOTE: It is assumed that most of these functions will be called with
250
* the pcbinfo lock held, and often, the inpcb lock held, as these utility
251
* functions often modify hash chains or addresses in pcbs.
252
*/
253
254
/*
 * Allocate and initialize a load balance group able to hold up to "size"
 * member PCBs.  Takes a reference on "cred" (dropped later by
 * in_pcblbgroup_free_deferred()).  Returns NULL on allocation failure,
 * since the allocation is M_NOWAIT.
 */
static struct inpcblbgroup *
in_pcblbgroup_alloc(struct ucred *cred, u_char vflag, uint16_t port,
    const union in_dependaddr *addr, int size, uint8_t numa_domain, int fib)
{
	struct inpcblbgroup *grp;
	size_t bytes;

	/* il_inp[] is a trailing variable-length array; size for "size" slots. */
	bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
	grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
	if (grp == NULL)
		return (NULL);
	LIST_INIT(&grp->il_pending);
	grp->il_cred = crhold(cred);
	grp->il_vflag = vflag;
	grp->il_lport = port;
	grp->il_numa_domain = numa_domain;
	grp->il_fibnum = fib;
	grp->il_dependladdr = *addr;
	grp->il_inpsiz = size;
	return (grp);
}
275
276
/*
 * Epoch callback: actually release a load balance group (and its cred
 * reference) once no net-epoch reader can still be traversing it.
 */
static void
in_pcblbgroup_free_deferred(epoch_context_t ctx)
{
	struct inpcblbgroup *grp;

	grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx);
	crfree(grp->il_cred);
	free(grp, M_PCB);
}
285
286
/*
 * Unlink a load balance group from its hash chain and schedule its memory
 * for deferred, epoch-safe release.  The group must have no pending PCBs.
 */
static void
in_pcblbgroup_free(struct inpcblbgroup *grp)
{
	KASSERT(LIST_EMPTY(&grp->il_pending),
	    ("local group %p still has pending inps", grp));

	CK_LIST_REMOVE(grp, il_list);
	NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx);
}
295
296
/*
 * Find the load balance group that "inp" currently belongs to, checking
 * both the active member array and the pending list of every group hashed
 * to inp's local port.  Returns NULL when the PCB is in no group.
 */
static struct inpcblbgroup *
in_pcblbgroup_find(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo;
	struct inpcblbgroup *grp;
	struct inpcblbgrouphead *hdr;

	INP_LOCK_ASSERT(inp);

	pcbinfo = inp->inp_pcbinfo;
	INP_HASH_LOCK_ASSERT(pcbinfo);

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		struct inpcb *inp1;

		for (unsigned int i = 0; i < grp->il_inpcnt; i++) {
			if (inp == grp->il_inp[i])
				goto found;
		}
		LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) {
			if (inp == inp1)
				goto found;
		}
	}
found:
	/* grp is NULL here if the outer loop ran off the end of the chain. */
	return (grp);
}
325
326
/*
 * Add "inp" to a load balance group that has free capacity.  A TCP socket
 * that has not yet called listen() is parked on the group's pending list,
 * keeping it invisible to lookups; otherwise it is published into il_inp[]
 * with a release store so lockless readers never see an unfilled slot.
 */
static void
in_pcblbgroup_insert(struct inpcblbgroup *grp, struct inpcb *inp)
{
	KASSERT(grp->il_inpcnt < grp->il_inpsiz,
	    ("invalid local group size %d and count %d", grp->il_inpsiz,
	    grp->il_inpcnt));
	INP_WLOCK_ASSERT(inp);

	if (inp->inp_socket->so_proto->pr_listen != pr_listen_notsupp &&
	    !SOLISTENING(inp->inp_socket)) {
		/*
		 * If this is a TCP socket, it should not be visible to lbgroup
		 * lookups until listen() has been called.
		 */
		LIST_INSERT_HEAD(&grp->il_pending, inp, inp_lbgroup_list);
		grp->il_pendcnt++;
	} else {
		grp->il_inp[grp->il_inpcnt] = inp;

		/*
		 * Synchronize with in_pcblookup_lbgroup(): make sure that we
		 * don't expose a null slot to the lookup path.
		 */
		atomic_store_rel_int(&grp->il_inpcnt, grp->il_inpcnt + 1);
	}

	inp->inp_flags |= INP_INLBGROUP;
}
354
355
/*
 * Replace "old_grp" with a new group of capacity "size": copy the active
 * members, move the pending list over, link the new group into the chain
 * and free the old one epoch-safely.  Returns NULL (leaving old_grp fully
 * intact) if the new allocation fails.
 */
static struct inpcblbgroup *
in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
    struct inpcblbgroup *old_grp, int size)
{
	struct inpcblbgroup *grp;
	int i;

	grp = in_pcblbgroup_alloc(old_grp->il_cred, old_grp->il_vflag,
	    old_grp->il_lport, &old_grp->il_dependladdr, size,
	    old_grp->il_numa_domain, old_grp->il_fibnum);
	if (grp == NULL)
		return (NULL);

	KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
	    ("invalid new local group size %d and old local group count %d",
	    grp->il_inpsiz, old_grp->il_inpcnt));

	for (i = 0; i < old_grp->il_inpcnt; ++i)
		grp->il_inp[i] = old_grp->il_inp[i];
	grp->il_inpcnt = old_grp->il_inpcnt;
	CK_LIST_INSERT_HEAD(hdr, grp, il_list);
	LIST_SWAP(&old_grp->il_pending, &grp->il_pending, inpcb,
	    inp_lbgroup_list);
	grp->il_pendcnt = old_grp->il_pendcnt;
	old_grp->il_pendcnt = 0;
	in_pcblbgroup_free(old_grp);
	return (grp);
}
383
384
/*
 * Add PCB to load balance group for SO_REUSEPORT_LB option.
 *
 * A compatible group (same prison, vflag, local port, NUMA domain, FIB
 * and local address) is reused when present; otherwise a new one is
 * created.  A full group is doubled in capacity up to INPCBLBGROUP_SIZMAX,
 * beyond which the PCB is silently not added (with a rate-limited console
 * warning).  Returns 0 on success or ENOMEM.
 */
static int
in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
{
	const static struct timeval interval = { 60, 0 };
	static struct timeval lastprint;
	struct inpcbinfo *pcbinfo;
	struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	uint32_t idx;
	int fib;

	pcbinfo = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	fib = (inp->inp_flags & INP_BOUNDFIB) != 0 ?
	    inp->inp_inc.inc_fibnum : RT_ALL_FIBS;

#ifdef INET6
	/*
	 * Don't allow IPv4 mapped INET6 wild socket.
	 */
	if ((inp->inp_vflag & INP_IPV4) &&
	    inp->inp_laddr.s_addr == INADDR_ANY &&
	    INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
		return (0);
	}
#endif

	idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask);
	hdr = &pcbinfo->ipi_lbgrouphashbase[idx];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		if (grp->il_cred->cr_prison == inp->inp_cred->cr_prison &&
		    grp->il_vflag == inp->inp_vflag &&
		    grp->il_lport == inp->inp_lport &&
		    grp->il_numa_domain == numa_domain &&
		    grp->il_fibnum == fib &&
		    memcmp(&grp->il_dependladdr,
		    &inp->inp_inc.inc_ie.ie_dependladdr,
		    sizeof(grp->il_dependladdr)) == 0) {
			break;
		}
	}
	if (grp == NULL) {
		/* Create new load balance group. */
		grp = in_pcblbgroup_alloc(inp->inp_cred, inp->inp_vflag,
		    inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
		    INPCBLBGROUP_SIZMIN, numa_domain, fib);
		if (grp == NULL)
			return (ENOMEM);
		in_pcblbgroup_insert(grp, inp);
		CK_LIST_INSERT_HEAD(hdr, grp, il_list);
	} else if (grp->il_inpcnt + grp->il_pendcnt == grp->il_inpsiz) {
		if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
			if (ratecheck(&lastprint, &interval))
				printf("lb group port %d, limit reached\n",
				    ntohs(grp->il_lport));
			return (0);
		}

		/* Expand this local group. */
		grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
		if (grp == NULL)
			return (ENOMEM);
		in_pcblbgroup_insert(grp, inp);
	} else {
		in_pcblbgroup_insert(grp, inp);
	}
	return (0);
}
458
459
/*
 * Remove PCB from load balance group.
 *
 * The PCB must currently be a member (active or pending) of some group;
 * if it is the group's last member the group itself is freed.  Active
 * membership is retired by moving the last array slot into the vacated
 * one and shrinking il_inpcnt with a release store so lockless lookups
 * never index past valid entries.
 */
static void
in_pcbremlbgrouphash(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo;
	struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	struct inpcb *inp1;
	int i;

	pcbinfo = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);
	MPASS(inp->inp_flags & INP_INLBGROUP);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		for (i = 0; i < grp->il_inpcnt; ++i) {
			if (grp->il_inp[i] != inp)
				continue;

			if (grp->il_inpcnt == 1 &&
			    LIST_EMPTY(&grp->il_pending)) {
				/* We are the last, free this local group. */
				in_pcblbgroup_free(grp);
			} else {
				grp->il_inp[i] =
				    grp->il_inp[grp->il_inpcnt - 1];

				/*
				 * Synchronize with in_pcblookup_lbgroup().
				 */
				atomic_store_rel_int(&grp->il_inpcnt,
				    grp->il_inpcnt - 1);
			}
			inp->inp_flags &= ~INP_INLBGROUP;
			return;
		}
		LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) {
			if (inp == inp1) {
				LIST_REMOVE(inp, inp_lbgroup_list);
				grp->il_pendcnt--;
				inp->inp_flags &= ~INP_INLBGROUP;
				return;
			}
		}
	}
	/* INP_INLBGROUP was asserted above, so we must have found it. */
	__assert_unreachable();
}
512
513
int
514
in_pcblbgroup_numa(struct inpcb *inp, int arg)
515
{
516
struct inpcbinfo *pcbinfo;
517
int error;
518
uint8_t numa_domain;
519
520
switch (arg) {
521
case TCP_REUSPORT_LB_NUMA_NODOM:
522
numa_domain = M_NODOM;
523
break;
524
case TCP_REUSPORT_LB_NUMA_CURDOM:
525
numa_domain = PCPU_GET(domain);
526
break;
527
default:
528
if (arg < 0 || arg >= vm_ndomains)
529
return (EINVAL);
530
numa_domain = arg;
531
}
532
533
pcbinfo = inp->inp_pcbinfo;
534
INP_WLOCK_ASSERT(inp);
535
INP_HASH_WLOCK(pcbinfo);
536
if (in_pcblbgroup_find(inp) != NULL) {
537
/* Remove it from the old group. */
538
in_pcbremlbgrouphash(inp);
539
/* Add it to the new group based on numa domain. */
540
in_pcbinslbgrouphash(inp, numa_domain);
541
error = 0;
542
} else {
543
error = ENOENT;
544
}
545
INP_HASH_WUNLOCK(pcbinfo);
546
return (error);
547
}
548
549
/* Make sure it is safe to use hashinit(9) on CK_LIST. */
550
CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb)));
551
552
/*
 * Initialize an inpcbinfo - a per-VNET instance of connections db.
 *
 * Sets up the info and hash locks, the global PCB list, and four hash
 * tables: exact-match and wildcard connection hashes (sharing one mask),
 * the local-port hash and the SO_REUSEPORT_LB group hash (sharing one
 * size, clamped to the number of possible ports).
 */
void
in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor,
    u_int hash_nelements, u_int porthash_nelements)
{

	mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF);
	mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name,
	    NULL, MTX_DEF);
#ifdef VIMAGE
	pcbinfo->ipi_vnet = curvnet;
#endif
	CK_LIST_INIT(&pcbinfo->ipi_listhead);
	pcbinfo->ipi_count = 0;
	pcbinfo->ipi_hash_exact = hashinit(hash_nelements, M_PCB,
	    &pcbinfo->ipi_hashmask);
	pcbinfo->ipi_hash_wild = hashinit(hash_nelements, M_PCB,
	    &pcbinfo->ipi_hashmask);
	porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
	pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
	    &pcbinfo->ipi_porthashmask);
	pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
	    &pcbinfo->ipi_lbgrouphashmask);
	pcbinfo->ipi_zone = pcbstor->ips_zone;
	pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone);
}
580
581
/*
 * Destroy an inpcbinfo.  All PCBs must already have been freed
 * (asserted via ipi_count == 0) before tearing down the hashes and locks.
 */
void
in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
{

	KASSERT(pcbinfo->ipi_count == 0,
	    ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));

	hashdestroy(pcbinfo->ipi_hash_exact, M_PCB, pcbinfo->ipi_hashmask);
	hashdestroy(pcbinfo->ipi_hash_wild, M_PCB, pcbinfo->ipi_hashmask);
	hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
	    pcbinfo->ipi_porthashmask);
	hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
	    pcbinfo->ipi_lbgrouphashmask);
	mtx_destroy(&pcbinfo->ipi_hash_lock);
	mtx_destroy(&pcbinfo->ipi_lock);
}
600
601
/*
 * Initialize a pcbstorage - per protocol zones to allocate inpcbs.
 * The zone is SMR-enabled; inpcb_fini runs when items leave the zone.
 */
static void inpcb_fini(void *, int);
void
in_pcbstorage_init(void *arg)
{
	struct inpcbstorage *pcbstor = arg;

	pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name,
	    pcbstor->ips_size, NULL, NULL, pcbstor->ips_pcbinit,
	    inpcb_fini, UMA_ALIGN_CACHE, UMA_ZONE_SMR);
}
614
615
/*
 * Destroy a pcbstorage - used by unloadable protocols.
 * Releases the protocol's SMR-enabled UMA zone.
 */
void
in_pcbstorage_destroy(void *arg)
{
	struct inpcbstorage *pcbstor = arg;

	uma_zdestroy(pcbstor->ips_zone);
}
625
626
/*
 * Allocate a PCB and associate it with the socket.
 * On success return with the PCB locked.
 *
 * Returns ENOBUFS if the zone allocation fails, or a MAC/IPSEC policy
 * initialization error; on any failure the partially-initialized PCB is
 * released and the socket left without one.
 */
int
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
{
	struct inpcb *inp;
#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
	int error;
#endif

	inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT);
	if (inp == NULL)
		return (ENOBUFS);
	/* Zero everything past the fields preserved across zone reuse. */
	bzero(&inp->inp_start_zero, inp_zero_size);
#ifdef NUMA
	inp->inp_numa_domain = M_NODOM;
#endif
	inp->inp_pcbinfo = pcbinfo;
	inp->inp_socket = so;
	inp->inp_cred = crhold(so->so_cred);
	inp->inp_inc.inc_fibnum = so->so_fibnum;
#ifdef MAC
	error = mac_inpcb_init(inp, M_NOWAIT);
	if (error != 0)
		goto out;
	mac_inpcb_create(so, inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	error = ipsec_init_pcbpolicy(inp);
	if (error != 0) {
#ifdef MAC
		/* Undo the MAC initialization done above. */
		mac_inpcb_destroy(inp);
#endif
		goto out;
	}
#endif /*IPSEC*/
#ifdef INET6
	if (INP_SOCKAF(so) == AF_INET6) {
		inp->inp_vflag |= INP_IPV6PROTO | INP_IPV6;
		if (V_ip6_v6only)
			inp->inp_flags |= IN6P_IPV6_V6ONLY;
#ifdef INET
		else
			inp->inp_vflag |= INP_IPV4;
#endif
		if (V_ip6_auto_flowlabel)
			inp->inp_flags |= IN6P_AUTOFLOWLABEL;
		inp->in6p_hops = -1;	/* use kernel default */
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
		inp->inp_vflag |= INP_IPV4;
#endif
	inp->inp_smr = SMR_SEQ_INVALID;

	/*
	 * Routes in inpcb's can cache L2 as well; they are guaranteed
	 * to be cleaned up.
	 */
	inp->inp_route.ro_flags = RT_LLE_CACHE;
	refcount_init(&inp->inp_refcount, 1);	/* Reference from socket. */
	INP_WLOCK(inp);
	INP_INFO_WLOCK(pcbinfo);
	pcbinfo->ipi_count++;
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list);
	INP_INFO_WUNLOCK(pcbinfo);
	so->so_pcb = inp;

	return (0);

#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
out:
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	uma_zfree_smr(pcbinfo->ipi_zone, inp);
	return (error);
#endif
}
712
713
#ifdef INET
714
/*
 * Bind an inpcb to the address/port carried in "sin" (or to an anonymous
 * ephemeral port if sin is NULL or its port is 0), then insert the PCB
 * into the hash tables.  On hash-insertion failure the local binding is
 * rolled back.  Caller holds the inpcb and pcbinfo hash write locks.
 */
int
in_pcbbind(struct inpcb *inp, struct sockaddr_in *sin, int flags,
    struct ucred *cred)
{
	int anonport, error;

	KASSERT(sin == NULL || sin->sin_family == AF_INET,
	    ("%s: invalid address family for %p", __func__, sin));
	KASSERT(sin == NULL || sin->sin_len == sizeof(struct sockaddr_in),
	    ("%s: invalid address length for %p", __func__, sin));
	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	/* Refuse to re-bind an already bound PCB. */
	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
		return (EINVAL);
	anonport = sin == NULL || sin->sin_port == 0;
	error = in_pcbbind_setup(inp, sin, &inp->inp_laddr.s_addr,
	    &inp->inp_lport, flags, cred);
	if (error)
		return (error);
	if (__predict_false((error = in_pcbinshash(inp)) != 0)) {
		/* Hash insertion can only fail for SO_REUSEPORT_LB sockets. */
		MPASS(inp->inp_socket->so_options & SO_REUSEPORT_LB);
		inp->inp_laddr.s_addr = INADDR_ANY;
		inp->inp_lport = 0;
		inp->inp_flags &= ~INP_BOUNDFIB;
		return (error);
	}
	if (anonport)
		inp->inp_flags |= INP_ANONPORT;
	return (0);
}
745
#endif
746
747
#if defined(INET) || defined(INET6)
748
/*
 * Assign a local port like in_pcb_lport(), but also used with connect()
 * and a foreign address and port.  If fsa is non-NULL, choose a local port
 * that is unused with those, otherwise one that is completely unused.
 * lsa can be NULL for IPv6.
 *
 * The candidate range (low/default/high) is selected by the INP_LOWPORT /
 * INP_HIGHPORT flags; the starting point may be randomized.  Returns 0
 * with *lportp set (network byte order), EADDRNOTAVAIL when the whole
 * range is exhausted, or a privilege-check error for low ports.
 */
int
in_pcb_lport_dest(const struct inpcb *inp, struct sockaddr *lsa,
    u_short *lportp, struct sockaddr *fsa, u_short fport, struct ucred *cred,
    int lookupflags)
{
	struct inpcbinfo *pcbinfo;
	struct inpcb *tmpinp;
	unsigned short *lastport;
	int count, error;
	u_short aux, first, last, lport;
#ifdef INET
	struct in_addr laddr, faddr;
#endif
#ifdef INET6
	struct in6_addr *laddr6, *faddr6;
#endif

	pcbinfo = inp->inp_pcbinfo;

	/*
	 * Because no actual state changes occur here, a global write lock on
	 * the pcbinfo isn't required.
	 */
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(pcbinfo);

	if (inp->inp_flags & INP_HIGHPORT) {
		first = V_ipport_hifirstauto;	/* sysctl */
		last = V_ipport_hilastauto;
		lastport = &pcbinfo->ipi_lasthi;
	} else if (inp->inp_flags & INP_LOWPORT) {
		error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT);
		if (error)
			return (error);
		first = V_ipport_lowfirstauto;	/* 1023 */
		last = V_ipport_lowlastauto;	/* 600 */
		lastport = &pcbinfo->ipi_lastlow;
	} else {
		first = V_ipport_firstauto;	/* sysctl */
		last = V_ipport_lastauto;
		lastport = &pcbinfo->ipi_lastport;
	}

	/*
	 * Instead of having two loops further down counting up or down
	 * make sure that first is always <= last and go with only one
	 * code path implementing all logic.
	 */
	if (first > last) {
		aux = first;
		first = last;
		last = aux;
	}

#ifdef INET
	laddr.s_addr = INADDR_ANY;	/* used by INET6+INET below too */
	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
		if (lsa != NULL)
			laddr = ((struct sockaddr_in *)lsa)->sin_addr;
		if (fsa != NULL)
			faddr = ((struct sockaddr_in *)fsa)->sin_addr;
	}
#endif
#ifdef INET6
	laddr6 = NULL;
	/* faddr/faddr6 are only read below when fsa != NULL, matching the
	 * conditions under which they are assigned here. */
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		if (lsa != NULL)
			laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr;
		if (fsa != NULL)
			faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr;
	}
#endif

	tmpinp = NULL;

	if (V_ipport_randomized)
		*lastport = first + (arc4random() % (last - first));

	count = last - first;

	do {
		if (count-- < 0)	/* completely used? */
			return (EADDRNOTAVAIL);
		++*lastport;
		if (*lastport < first || *lastport > last)
			*lastport = first;
		lport = htons(*lastport);

		if (fsa != NULL) {
#ifdef INET
			if (lsa->sa_family == AF_INET) {
				tmpinp = in_pcblookup_hash_locked(pcbinfo,
				    faddr, fport, laddr, lport, lookupflags,
				    M_NODOM, RT_ALL_FIBS);
			}
#endif
#ifdef INET6
			if (lsa->sa_family == AF_INET6) {
				tmpinp = in6_pcblookup_hash_locked(pcbinfo,
				    faddr6, fport, laddr6, lport, lookupflags,
				    M_NODOM, RT_ALL_FIBS);
			}
#endif
		} else {
#ifdef INET6
			if ((inp->inp_vflag & INP_IPV6) != 0) {
				tmpinp = in6_pcblookup_local(pcbinfo,
				    &inp->in6p_laddr, lport, RT_ALL_FIBS,
				    lookupflags, cred);
#ifdef INET
				if (tmpinp == NULL &&
				    (inp->inp_vflag & INP_IPV4))
					tmpinp = in_pcblookup_local(pcbinfo,
					    laddr, lport, RT_ALL_FIBS,
					    lookupflags, cred);
#endif
			}
#endif
#if defined(INET) && defined(INET6)
			else
#endif
#ifdef INET
				tmpinp = in_pcblookup_local(pcbinfo, laddr,
				    lport, RT_ALL_FIBS, lookupflags, cred);
#endif
		}
	} while (tmpinp != NULL);

	*lportp = lport;

	return (0);
}
886
887
/*
888
* Select a local port (number) to use.
889
*/
890
int
891
in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
892
struct ucred *cred, int lookupflags)
893
{
894
struct sockaddr_in laddr;
895
896
if (laddrp) {
897
bzero(&laddr, sizeof(laddr));
898
laddr.sin_family = AF_INET;
899
laddr.sin_addr = *laddrp;
900
}
901
return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr :
902
NULL, lportp, NULL, 0, cred, lookupflags));
903
}
904
#endif /* INET || INET6 */
905
906
#ifdef INET
907
/*
 * Determine whether the inpcb can be bound to the specified address/port
 * tuple.  Enforces the multicast SO_REUSEADDR/SO_REUSEPORT equivalence,
 * local address ownership (unless INP_BINDANY), reserved-port privilege,
 * and per-user / reuse-option port sharing rules.  Returns 0 if the
 * binding is permitted, otherwise EADDRNOTAVAIL, EACCES or EADDRINUSE.
 */
static int
in_pcbbind_avail(struct inpcb *inp, const struct in_addr laddr,
    const u_short lport, const int fib, int sooptions, int lookupflags,
    struct ucred *cred)
{
	int reuseport, reuseport_lb;

	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);

	reuseport = (sooptions & SO_REUSEPORT);
	reuseport_lb = (sooptions & SO_REUSEPORT_LB);

	if (IN_MULTICAST(ntohl(laddr.s_addr))) {
		/*
		 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
		 * allow complete duplication of binding if
		 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
		 * and a multicast address is bound on both
		 * new and duplicated sockets.
		 */
		if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT)) != 0)
			reuseport = SO_REUSEADDR | SO_REUSEPORT;
		/*
		 * XXX: How to deal with SO_REUSEPORT_LB here?
		 * Treat same as SO_REUSEPORT for now.
		 */
		if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT_LB)) != 0)
			reuseport_lb = SO_REUSEADDR | SO_REUSEPORT_LB;
	} else if (!in_nullhost(laddr)) {
		struct sockaddr_in sin;

		memset(&sin, 0, sizeof(sin));
		sin.sin_family = AF_INET;
		sin.sin_len = sizeof(sin);
		sin.sin_addr = laddr;

		/*
		 * Is the address a local IP address?
		 * If INP_BINDANY is set, then the socket may be bound
		 * to any endpoint address, local or not.
		 */
		if ((inp->inp_flags & INP_BINDANY) == 0 &&
		    ifa_ifwithaddr_check((const struct sockaddr *)&sin) == 0)
			return (EADDRNOTAVAIL);
	}

	if (lport != 0) {
		struct inpcb *t;

		/* Binding into the reserved window requires privilege. */
		if (ntohs(lport) <= V_ipport_reservedhigh &&
		    ntohs(lport) >= V_ipport_reservedlow &&
		    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
			return (EACCES);

		if (!IN_MULTICAST(ntohl(laddr.s_addr)) &&
		    priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
			/*
			 * If a socket owned by a different user is already
			 * bound to this port, fail.  In particular, SO_REUSE*
			 * can only be used to share a port among sockets owned
			 * by the same user.
			 *
			 * However, we can share a port with a connected socket
			 * which has a unique 4-tuple.
			 */
			t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport,
			    RT_ALL_FIBS, INPLOOKUP_WILDCARD, cred);
			if (t != NULL &&
			    (inp->inp_socket->so_type != SOCK_STREAM ||
			    in_nullhost(t->inp_faddr)) &&
			    (inp->inp_cred->cr_uid != t->inp_cred->cr_uid))
				return (EADDRINUSE);
		}
		t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport, fib,
		    lookupflags, cred);
		if (t != NULL && ((reuseport | reuseport_lb) &
		    t->inp_socket->so_options) == 0) {
#ifdef INET6
			/* v4-mapped null-host conflicts are tolerated. */
			if (!in_nullhost(laddr) ||
			    !in_nullhost(t->inp_laddr) ||
			    (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
			    (t->inp_vflag & INP_IPV6PROTO) == 0)
#endif
				return (EADDRINUSE);
		}
	}
	return (0);
}
999
1000
/*
 * Set up a bind operation on a PCB, performing port allocation
 * as required, but do not actually modify the PCB.  Callers can
 * either complete the bind by setting inp_laddr/inp_lport and
 * calling in_pcbinshash(), or they can just use the resulting
 * port and address to authorise the sending of a once-off packet.
 *
 * On error, the values of *laddrp and *lportp are not changed.
 * "sin" may be NULL for a fully wildcard bind; requested address and
 * port are validated against jail policy and availability.
 */
int
in_pcbbind_setup(struct inpcb *inp, struct sockaddr_in *sin, in_addr_t *laddrp,
    u_short *lportp, int flags, struct ucred *cred)
{
	struct socket *so = inp->inp_socket;
	struct in_addr laddr;
	u_short lport = 0;
	int error, fib, lookupflags, sooptions;

	/*
	 * No state changes, so read locks are sufficient here.
	 */
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);

	laddr.s_addr = *laddrp;
	/* An explicit sin and a pre-set local address are exclusive. */
	if (sin != NULL && laddr.s_addr != INADDR_ANY)
		return (EINVAL);

	lookupflags = 0;
	sooptions = atomic_load_int(&so->so_options);
	if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT | SO_REUSEPORT_LB)) == 0)
		lookupflags = INPLOOKUP_WILDCARD;
	if (sin == NULL) {
		if ((error = prison_local_ip4(cred, &laddr)) != 0)
			return (error);
	} else {
		KASSERT(sin->sin_family == AF_INET,
		    ("%s: invalid family for address %p", __func__, sin));
		KASSERT(sin->sin_len == sizeof(*sin),
		    ("%s: invalid length for address %p", __func__, sin));

		error = prison_local_ip4(cred, &sin->sin_addr);
		if (error)
			return (error);
		if (sin->sin_port != *lportp) {
			/* Don't allow the port to change. */
			if (*lportp != 0)
				return (EINVAL);
			lport = sin->sin_port;
		}
		laddr = sin->sin_addr;

		fib = (flags & INPBIND_FIB) != 0 ? inp->inp_inc.inc_fibnum :
		    RT_ALL_FIBS;

		/* See if this address/port combo is available. */
		error = in_pcbbind_avail(inp, laddr, lport, fib, sooptions,
		    lookupflags, cred);
		if (error != 0)
			return (error);
	}
	if (*lportp != 0)
		lport = *lportp;
	if (lport == 0) {
		/* No port requested: pick an ephemeral one. */
		error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
		if (error != 0)
			return (error);
	}
	*laddrp = laddr.s_addr;
	*lportp = lport;
	if ((flags & INPBIND_FIB) != 0)
		inp->inp_flags |= INP_BOUNDFIB;
	return (0);
}
1074
1075
/*
 * Connect from a socket to a specified address.
 * Both address and port must be specified in argument sin.
 * If we don't have a local address for this socket yet,
 * then pick one.
 */
int
in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred)
{
	struct in_addr laddr, faddr;
	u_short lport;
	int error;
	bool anonport;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
	KASSERT(in_nullhost(inp->inp_faddr),
	    ("%s: inp is already connected", __func__));
	KASSERT(sin->sin_family == AF_INET,
	    ("%s: invalid address family for %p", __func__, sin));
	KASSERT(sin->sin_len == sizeof(*sin),
	    ("%s: invalid address length for %p", __func__, sin));

	if (sin->sin_port == 0)
		return (EADDRNOTAVAIL);

	/* Remember whether we still need to pick an ephemeral local port. */
	anonport = (inp->inp_lport == 0);

	if (__predict_false(in_broadcast(sin->sin_addr))) {
		if (!V_connect_inaddr_wild || CK_STAILQ_EMPTY(&V_in_ifaddrhead))
			return (ENETUNREACH);
		/*
		 * If the destination address is INADDR_ANY, use the primary
		 * local address.  If the supplied address is INADDR_BROADCAST,
		 * and the primary interface supports broadcast, choose the
		 * broadcast address for that interface.
		 */
		if (in_nullhost(sin->sin_addr)) {
			faddr =
			    IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
			if ((error = prison_get_ip4(cred, &faddr)) != 0)
				return (error);
		} else if (sin->sin_addr.s_addr == INADDR_BROADCAST &&
		    CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags
		    & IFF_BROADCAST) {
			faddr = satosin(&CK_STAILQ_FIRST(
			    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
		} else
			faddr = sin->sin_addr;
	} else
		faddr = sin->sin_addr;

	if (in_nullhost(inp->inp_laddr)) {
		/* Not yet bound to an address: do source address selection. */
		error = in_pcbladdr(inp, &faddr, &laddr, cred);
		if (error)
			return (error);
	} else
		laddr = inp->inp_laddr;

	if (anonport) {
		struct sockaddr_in lsin = {
			.sin_family = AF_INET,
			.sin_addr = laddr,
		};
		struct sockaddr_in fsin = {
			.sin_family = AF_INET,
			.sin_addr = faddr,
		};

		error = in_pcb_lport_dest(inp, (struct sockaddr *)&lsin,
		    &lport, (struct sockaddr *)&fsin, sin->sin_port, cred,
		    INPLOOKUP_WILDCARD);
		if (error)
			return (error);
	} else if (in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr,
	    sin->sin_port, laddr, inp->inp_lport, 0, M_NODOM, RT_ALL_FIBS) !=
	    NULL)
		/* The resulting 4-tuple is already in use. */
		return (EADDRINUSE);
	else
		lport = inp->inp_lport;

	MPASS(!in_nullhost(inp->inp_laddr) || inp->inp_lport != 0 ||
	    !(inp->inp_flags & INP_INHASHLIST));

	inp->inp_faddr = faddr;
	inp->inp_fport = sin->sin_port;
	inp->inp_laddr = laddr;
	inp->inp_lport = lport;

	/* Insert into, or move within, the connection hash. */
	if ((inp->inp_flags & INP_INHASHLIST) == 0) {
		error = in_pcbinshash(inp);
		MPASS(error == 0);
	} else
		in_pcbrehash(inp);
#ifdef ROUTE_MPATH
	if (CALC_FLOWID_OUTBOUND) {
		uint32_t hash_val, hash_type;

		hash_val = fib4_calc_software_hash(inp->inp_laddr,
		    inp->inp_faddr, 0, sin->sin_port,
		    inp->inp_socket->so_proto->pr_protocol, &hash_type);

		inp->inp_flowid = hash_val;
		inp->inp_flowtype = hash_type;
	}
#endif
	if (anonport)
		inp->inp_flags |= INP_ANONPORT;
	return (0);
}
1185
1186
/*
 * Do proper source address selection on an unbound socket in case
 * of connect.  Take jails into account as well.
 */
int
in_pcbladdr(const struct inpcb *inp, struct in_addr *faddr,
    struct in_addr *laddr, struct ucred *cred)
{
	struct ifaddr *ifa;
	struct sockaddr *sa;
	struct sockaddr_in *sin, dst;
	struct nhop_object *nh;
	int error;

	NET_EPOCH_ASSERT();
	KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));

	/*
	 * Bypass source address selection and use the primary jail IP
	 * if requested.
	 */
	if (!prison_saddrsel_ip4(cred, laddr))
		return (0);

	/*
	 * If the destination address is multicast and an outgoing
	 * interface has been set as a multicast option, prefer the
	 * address of that interface as our source address.
	 */
	if (IN_MULTICAST(ntohl(faddr->s_addr)) && inp->inp_moptions != NULL &&
	    inp->inp_moptions->imo_multicast_ifp != NULL) {
		struct ifnet *ifp = inp->inp_moptions->imo_multicast_ifp;
		struct in_ifaddr *ia;

		/* Pick the first jail-visible address on that interface. */
		CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
			if (ia->ia_ifp == ifp &&
			    prison_check_ip4(cred, &ia->ia_addr.sin_addr) == 0)
				break;
		}
		if (ia == NULL)
			return (EADDRNOTAVAIL);
		*laddr = ia->ia_addr.sin_addr;
		return (0);
	}

	error = 0;

	nh = NULL;
	bzero(&dst, sizeof(dst));
	sin = &dst;
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(struct sockaddr_in);
	sin->sin_addr.s_addr = faddr->s_addr;

	/*
	 * If route is known our src addr is taken from the i/f,
	 * else punt.
	 *
	 * Find out route to destination.
	 */
	if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
		nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr,
		    0, NHR_NONE, 0);

	/*
	 * If we found a route, use the address corresponding to
	 * the outgoing interface.
	 *
	 * Otherwise assume faddr is reachable on a directly connected
	 * network and try to find a corresponding interface to take
	 * the source address from.
	 */
	if (nh == NULL || nh->nh_ifp == NULL) {
		struct in_ifaddr *ia;
		struct ifnet *ifp;

		ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
		    inp->inp_socket->so_fibnum));
		if (ia == NULL) {
			ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
			    inp->inp_socket->so_fibnum));
		}
		if (ia == NULL) {
			error = ENETUNREACH;
			goto done;
		}

		/* If not jailed, the interface address is the answer. */
		if (!prison_flag(cred, PR_IP4)) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed: search the interface for a jail-visible address. */
		ifp = ia->ia_ifp;
		ia = NULL;
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			sa = ifa->ifa_addr;
			if (sa->sa_family != AF_INET)
				continue;
			sin = (struct sockaddr_in *)sa;
			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
				ia = (struct in_ifaddr *)ifa;
				break;
			}
		}
		if (ia != NULL) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

	/*
	 * If the outgoing interface on the route found is not
	 * a loopback interface, use the address from that interface.
	 * In case of jails do those three steps:
	 * 1. check if the interface address belongs to the jail. If so use it.
	 * 2. check if we have any address on the outgoing interface
	 *    belonging to this jail. If so use it.
	 * 3. as a last resort return the 'default' jail address.
	 */
	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) {
		struct in_ifaddr *ia;
		struct ifnet *ifp;

		/* If not jailed, use the default returned. */
		if (!prison_flag(cred, PR_IP4)) {
			ia = (struct in_ifaddr *)nh->nh_ifa;
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed. */
		/* 1. Check if the iface address belongs to the jail. */
		sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
		if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
			ia = (struct in_ifaddr *)nh->nh_ifa;
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/*
		 * 2. Check if we have any address on the outgoing interface
		 *    belonging to this jail.
		 */
		ia = NULL;
		ifp = nh->nh_ifp;
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			sa = ifa->ifa_addr;
			if (sa->sa_family != AF_INET)
				continue;
			sin = (struct sockaddr_in *)sa;
			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
				ia = (struct in_ifaddr *)ifa;
				break;
			}
		}
		if (ia != NULL) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

	/*
	 * The outgoing interface is marked with 'loopback net', so a route
	 * to ourselves is here.
	 * Try to find the interface of the destination address and then
	 * take the address from there.  That interface is not necessarily
	 * a loopback interface.
	 * In case of jails, check that it is an address of the jail
	 * and if we cannot find, fall back to the 'default' jail address.
	 */
	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) {
		struct in_ifaddr *ia;

		ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst),
		    inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0,
			    inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithaddr(sintosa(&dst)));

		if (!prison_flag(cred, PR_IP4)) {
			if (ia == NULL) {
				error = ENETUNREACH;
				goto done;
			}
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed. */
		if (ia != NULL) {
			struct ifnet *ifp;

			/* Look for a jail-visible address on that interface. */
			ifp = ia->ia_ifp;
			ia = NULL;
			CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
				sa = ifa->ifa_addr;
				if (sa->sa_family != AF_INET)
					continue;
				sin = (struct sockaddr_in *)sa;
				if (prison_check_ip4(cred,
				    &sin->sin_addr) == 0) {
					ia = (struct in_ifaddr *)ifa;
					break;
				}
			}
			if (ia != NULL) {
				laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
				goto done;
			}
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

done:
	/* A selected address of INADDR_ANY means nothing usable was found. */
	if (error == 0 && laddr->s_addr == INADDR_ANY)
		return (EHOSTUNREACH);
	return (error);
}
1417
1418
/*
 * Disconnect a connected PCB: remove it from the connection hash and
 * clear its foreign (and local) addressing so it may be reused.
 */
void
in_pcbdisconnect(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
	    ("%s: inp %p was already disconnected", __func__, inp));

	in_pcbremhash_locked(inp);

	/* See the comment in in_pcbinshash(). */
	inp->inp_smr = smr_advance(inp->inp_pcbinfo->ipi_smr);
	inp->inp_laddr.s_addr = INADDR_ANY;
	inp->inp_faddr.s_addr = INADDR_ANY;
	inp->inp_fport = 0;
}
1435
#endif /* INET */
1436
1437
/*
 * Transition a PCB in a load-balancing group from the pending to the
 * active list, so that it becomes eligible for lb-group lookups.
 * Called when the socket enters the listening state.
 */
void
in_pcblisten(struct inpcb *inp)
{
	struct inpcblbgroup *grp;

	INP_WLOCK_ASSERT(inp);

	if ((inp->inp_flags & INP_INLBGROUP) != 0) {
		struct inpcbinfo *pcbinfo;

		pcbinfo = inp->inp_pcbinfo;
		INP_HASH_WLOCK(pcbinfo);
		grp = in_pcblbgroup_find(inp);
		/* Move from the group's pending list into the group proper. */
		LIST_REMOVE(inp, inp_lbgroup_list);
		grp->il_pendcnt--;
		in_pcblbgroup_insert(grp, inp);
		INP_HASH_WUNLOCK(pcbinfo);
	}
}
1456
1457
/*
1458
* inpcb hash lookups are protected by SMR section.
1459
*
1460
* Once desired pcb has been found, switching from SMR section to a pcb
1461
* lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK
1462
* here because SMR is a critical section.
1463
* In 99%+ cases inp_smr_lock() would obtain the lock immediately.
1464
*/
1465
void
1466
inp_lock(struct inpcb *inp, const inp_lookup_t lock)
1467
{
1468
1469
lock == INPLOOKUP_RLOCKPCB ?
1470
rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock);
1471
}
1472
1473
void
1474
inp_unlock(struct inpcb *inp, const inp_lookup_t lock)
1475
{
1476
1477
lock == INPLOOKUP_RLOCKPCB ?
1478
rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock);
1479
}
1480
1481
int
1482
inp_trylock(struct inpcb *inp, const inp_lookup_t lock)
1483
{
1484
1485
return (lock == INPLOOKUP_RLOCKPCB ?
1486
rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock));
1487
}
1488
1489
/*
 * Transition from the SMR section to holding the inpcb lock, skipping
 * entries that carry any of 'ignflags'.  On success the SMR section has
 * been exited and the pcb is returned locked; on failure the SMR section
 * has been exited and no lock is held.
 */
static inline bool
_inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock, const int ignflags)
{

	MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB);
	SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr);

	/* Fast path: the lock is free and the pcb is still wanted. */
	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & ignflags)) {
			smr_exit(inp->inp_pcbinfo->ipi_smr);
			inp_unlock(inp, lock);
			return (false);
		}
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (true);
	}

	/*
	 * Slow path: pin the pcb with a reference, leave the critical SMR
	 * section, and block on the lock.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		inp_lock(inp, lock);
		if (__predict_false(in_pcbrele(inp, lock)))
			return (false);
		/*
		 * inp acquired through refcount & lock for sure didn't go
		 * through uma_zfree().  However, it may have already gone
		 * through in_pcbfree() and has another reference, that
		 * prevented its release by our in_pcbrele().
		 */
		if (__predict_false(inp->inp_flags & ignflags)) {
			inp_unlock(inp, lock);
			return (false);
		}
		return (true);
	} else {
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (false);
	}
}
1527
1528
/*
 * Public wrapper around _inp_smr_lock() used by the in_pcblookup()
 * family: skips both freed and dropped entries.
 */
bool
inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
{

	/*
	 * in_pcblookup() family of functions ignore not only freed entries,
	 * that may be found due to lockless access to the hash, but dropped
	 * entries, too.
	 */
	return (_inp_smr_lock(inp, lock, INP_FREED | INP_DROPPED));
}
1539
1540
/*
1541
* inp_next() - inpcb hash/list traversal iterator
1542
*
1543
* Requires initialized struct inpcb_iterator for context.
1544
* The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR().
1545
*
1546
* - Iterator can have either write-lock or read-lock semantics, that can not
1547
* be changed later.
1548
* - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through
1549
* a single hash slot. Note: only rip_input() does the latter.
1550
* - Iterator may have optional bool matching function. The matching function
1551
* will be executed for each inpcb in the SMR context, so it can not acquire
1552
* locks and can safely access only immutable fields of inpcb.
1553
*
1554
* A fresh initialized iterator has NULL inpcb in its context and that
1555
* means that inp_next() call would return the very first inpcb on the list
1556
* locked with desired semantic. In all following calls the context pointer
1557
* shall hold the current inpcb pointer. The KPI user is not supposed to
1558
* unlock the current inpcb! Upon end of traversal inp_next() will return NULL
1559
* and write NULL to its context. After end of traversal an iterator can be
1560
* reused.
1561
*
1562
* List traversals have the following features/constraints:
1563
* - New entries won't be seen, as they are always added to the head of a list.
1564
* - Removed entries won't stop traversal as long as they are not added to
1565
* a different list. This is violated by in_pcbrehash().
1566
*/
1567
/* Select the head of either the all-pcbs list or one exact-hash chain. */
#define	II_LIST_FIRST(ipi, hash)					\
	(((hash) == INP_ALL_LIST) ?					\
	    CK_LIST_FIRST(&(ipi)->ipi_listhead) :			\
	    CK_LIST_FIRST(&(ipi)->ipi_hash_exact[(hash)]))
/* Advance along whichever list the iterator is bound to. */
#define	II_LIST_NEXT(inp, hash)						\
	(((hash) == INP_ALL_LIST) ?					\
	    CK_LIST_NEXT((inp), inp_list) :				\
	    CK_LIST_NEXT((inp), inp_hash_exact))
/* Assert the pcb is held with the iterator's lock semantics. */
#define	II_LOCK_ASSERT(inp, lock)					\
	rw_assert(&(inp)->inp_lock,					\
	    (lock) == INPLOOKUP_RLOCKPCB ?  RA_RLOCKED : RA_WLOCKED )
/*
 * Return the next matching pcb on the iterator's list, locked per the
 * iterator's semantics; NULL at end of traversal.  See the block comment
 * above for the full contract.
 */
struct inpcb *
inp_next(struct inpcb_iterator *ii)
{
	const struct inpcbinfo *ipi = ii->ipi;
	inp_match_t *match = ii->match;
	void *ctx = ii->ctx;
	inp_lookup_t lock = ii->lock;
	int hash = ii->hash;
	struct inpcb *inp;

	if (ii->inp == NULL) {		/* First call. */
		smr_enter(ipi->ipi_smr);
		/* This is unrolled CK_LIST_FOREACH(). */
		for (inp = II_LIST_FIRST(ipi, hash);
		    inp != NULL;
		    inp = II_LIST_NEXT(inp, hash)) {
			if (match != NULL && (match)(inp, ctx) == false)
				continue;
			if (__predict_true(_inp_smr_lock(inp, lock, INP_FREED)))
				break;
			else {
				/*
				 * _inp_smr_lock() exited SMR; re-enter and
				 * restart from the (possibly changed) head.
				 */
				smr_enter(ipi->ipi_smr);
				MPASS(inp != II_LIST_FIRST(ipi, hash));
				inp = II_LIST_FIRST(ipi, hash);
				if (inp == NULL)
					break;
			}
		}

		if (inp == NULL)
			smr_exit(ipi->ipi_smr);
		else
			ii->inp = inp;

		return (inp);
	}

	/* Not a first call. */
	smr_enter(ipi->ipi_smr);
restart:
	inp = ii->inp;
	II_LOCK_ASSERT(inp, lock);
next:
	inp = II_LIST_NEXT(inp, hash);
	if (inp == NULL) {
		smr_exit(ipi->ipi_smr);
		goto found;
	}

	if (match != NULL && (match)(inp, ctx) == false)
		goto next;

	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			/*
			 * Entries are never inserted in middle of a list, thus
			 * as long as we are in SMR, we can continue traversal.
			 * Jump to 'restart' should yield in the same result,
			 * but could produce unnecessary looping.  Could this
			 * looping be unbound?
			 */
			inp_unlock(inp, lock);
			goto next;
		} else {
			smr_exit(ipi->ipi_smr);
			goto found;
		}
	}

	/*
	 * Can't obtain lock immediately, thus going hard.  Once we exit the
	 * SMR section we can no longer jump to 'next', and our only stable
	 * anchoring point is ii->inp, which we keep locked for this case, so
	 * we jump to 'restart'.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(ipi->ipi_smr);
		inp_lock(inp, lock);
		if (__predict_false(in_pcbrele(inp, lock))) {
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
		/*
		 * See comment in inp_smr_lock().
		 */
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			inp_unlock(inp, lock);
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
	} else
		goto next;

found:
	/* Hand the previous pcb's lock back before advancing the context. */
	inp_unlock(ii->inp, lock);
	ii->inp = inp;

	return (ii->inp);
}
1677
1678
/*
 * in_pcbref() bumps the reference count on an inpcb in order to maintain
 * stability of an inpcb pointer despite the inpcb lock being released or
 * SMR section exited.
 *
 * To free a reference later in_pcbrele_(r|w)locked() must be performed.
 */
void
in_pcbref(struct inpcb *inp)
{
	u_int old __diagused;

	/* Caller must already hold a reference; 0 -> 1 is never legal here. */
	old = refcount_acquire(&inp->inp_refcount);
	KASSERT(old > 0, ("%s: refcount 0", __func__));
}
1693
1694
/*
 * Drop a refcount on an inpcb elevated using in_pcbref(), potentially
 * freeing the pcb, if the reference was very last.  Returns true if the
 * pcb was freed (and the read lock released), false otherwise.
 */
bool
in_pcbrele_rlocked(struct inpcb *inp)
{

	INP_RLOCK_ASSERT(inp);

	if (!refcount_release(&inp->inp_refcount))
		return (false);

	/* Last reference: the pcb must already have been detached. */
	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	INP_RUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}
1717
1718
/*
 * Write-locked variant of in_pcbrele_rlocked(); returns true if the pcb
 * was freed (and the write lock released), false otherwise.
 */
bool
in_pcbrele_wlocked(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);

	if (!refcount_release(&inp->inp_refcount))
		return (false);

	/* Last reference: the pcb must already have been detached. */
	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	INP_WUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}
1737
1738
bool
1739
in_pcbrele(struct inpcb *inp, const inp_lookup_t lock)
1740
{
1741
1742
return (lock == INPLOOKUP_RLOCKPCB ?
1743
in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp));
1744
}
1745
1746
/*
 * Dereference and rlock inp, for which the caller must own the
 * reference.  Returns true if inp no longer usable, false otherwise.
 */
bool
in_pcbrele_rlock(struct inpcb *inp)
{
	INP_RLOCK(inp);
	if (in_pcbrele_rlocked(inp))
		/* That was the last reference; the pcb is gone. */
		return (true);
	if ((inp->inp_flags & INP_FREED) != 0) {
		/* Still referenced elsewhere, but already torn down. */
		INP_RUNLOCK(inp);
		return (true);
	}
	return (false);
}
1762
1763
/*
 * Unconditionally schedule an inpcb to be freed by decrementing its
 * reference count, which should occur only after the inpcb has been detached
 * from its socket.  If another thread holds a temporary reference (acquired
 * using in_pcbref()) then the free is deferred until that reference is
 * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked.
 * Almost all work, including removal from global lists, is done in this
 * context, where the pcbinfo lock is held.
 */
void
in_pcbfree(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
#ifdef INET
	struct ip_moptions *imo;
#endif
#ifdef INET6
	struct ip6_moptions *im6o;
#endif

	INP_WLOCK_ASSERT(inp);
	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
	KASSERT((inp->inp_flags & INP_FREED) == 0,
	    ("%s: called twice for pcb %p", __func__, inp));

	/*
	 * in_pcblookup_local() and in6_pcblookup_local() may return an inpcb
	 * from the hash without acquiring inpcb lock, they rely on the hash
	 * lock, thus in_pcbremhash() should be the first action.
	 */
	if (inp->inp_flags & INP_INHASHLIST)
		in_pcbremhash(inp);
	INP_INFO_WLOCK(pcbinfo);
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	pcbinfo->ipi_count--;
	CK_LIST_REMOVE(inp, inp_list);
	INP_INFO_WUNLOCK(pcbinfo);

#ifdef RATELIMIT
	if (inp->inp_snd_tag != NULL)
		in_pcbdetach_txrtlmt(inp);
#endif
	inp->inp_flags |= INP_FREED;
	/* Sever the socket <-> pcb linkage in both directions. */
	inp->inp_socket->so_pcb = NULL;
	inp->inp_socket = NULL;

	RO_INVALIDATE_CACHE(&inp->inp_route);
#ifdef MAC
	mac_inpcb_destroy(inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	if (inp->inp_sp != NULL)
		ipsec_delete_pcbpolicy(inp);
#endif
#ifdef INET
	if (inp->inp_options)
		(void)m_free(inp->inp_options);
	/* Stash the moptions pointer; it is freed after the pcb is released. */
	DEBUG_POISON_POINTER(inp->inp_options);
	imo = inp->inp_moptions;
	DEBUG_POISON_POINTER(inp->inp_moptions);
#endif
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6PROTO) {
		ip6_freepcbopts(inp->in6p_outputopts);
		DEBUG_POISON_POINTER(inp->in6p_outputopts);
		im6o = inp->in6p_moptions;
		DEBUG_POISON_POINTER(inp->in6p_moptions);
	} else
		im6o = NULL;
#endif

	/* Drop our reference; unlock manually if someone else still holds one. */
	if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
		INP_WUNLOCK(inp);
	}
#ifdef INET6
	ip6_freemoptions(im6o);
#endif
#ifdef INET
	inp_freemoptions(imo);
#endif
}
1844
1845
/*
 * Different protocols initialize their inpcbs differently - giving
 * different name to the lock.  But they all are disposed the same.
 */
static void
inpcb_fini(void *mem, int size)
{
	struct inpcb *inp = mem;

	INP_LOCK_DESTROY(inp);
}
1856
1857
/*
 * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
 * port reservation, and preventing it from being returned by inpcb lookups.
 *
 * It is used by TCP to mark an inpcb as unused and avoid future packet
 * delivery or event notification when a socket remains open but TCP has
 * closed.  This might occur as a result of a shutdown()-initiated TCP close
 * or a RST on the wire, and allows the port binding to be reused while still
 * maintaining the invariant that so_pcb always points to a valid inpcb until
 * in_pcbdetach().
 *
 * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
 * in_pcbpurgeif0()?
 */
void
in_pcbdrop(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);

	inp->inp_flags |= INP_DROPPED;
	if (inp->inp_flags & INP_INHASHLIST)
		in_pcbremhash(inp);
}
1881
1882
#ifdef INET
1883
/*
1884
* Common routines to return the socket addresses associated with inpcbs.
1885
*/
1886
int
1887
in_getsockaddr(struct socket *so, struct sockaddr *sa)
1888
{
1889
struct inpcb *inp;
1890
1891
inp = sotoinpcb(so);
1892
KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
1893
1894
*(struct sockaddr_in *)sa = (struct sockaddr_in ){
1895
.sin_len = sizeof(struct sockaddr_in),
1896
.sin_family = AF_INET,
1897
.sin_port = inp->inp_lport,
1898
.sin_addr = inp->inp_laddr,
1899
};
1900
1901
return (0);
1902
}
1903
1904
int
1905
in_getpeeraddr(struct socket *so, struct sockaddr *sa)
1906
{
1907
struct inpcb *inp;
1908
1909
inp = sotoinpcb(so);
1910
KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
1911
1912
*(struct sockaddr_in *)sa = (struct sockaddr_in ){
1913
.sin_len = sizeof(struct sockaddr_in),
1914
.sin_family = AF_INET,
1915
.sin_port = inp->inp_fport,
1916
.sin_addr = inp->inp_faddr,
1917
};
1918
1919
return (0);
1920
}
1921
1922
static bool
1923
inp_v4_multi_match(const struct inpcb *inp, void *v __unused)
1924
{
1925
1926
if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL)
1927
return (true);
1928
else
1929
return (false);
1930
}
1931
1932
/*
 * Detach all IPv4 multicast state referencing the interface 'ifp' from
 * every pcb in 'pcbinfo'; called when the interface goes away.
 */
void
in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
{
	struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB,
	    inp_v4_multi_match, NULL);
	struct inpcb *inp;
	struct in_multi *inm;
	struct in_mfilter *imf;
	struct ip_moptions *imo;

	IN_MULTI_LOCK_ASSERT();

	while ((inp = inp_next(&inpi)) != NULL) {
		INP_WLOCK_ASSERT(inp);

		imo = inp->inp_moptions;
		/*
		 * Unselect the outgoing interface if it is being
		 * detached.
		 */
		if (imo->imo_multicast_ifp == ifp)
			imo->imo_multicast_ifp = NULL;

		/*
		 * Drop multicast group membership if we joined
		 * through the interface being detached.
		 *
		 * XXX This can all be deferred to an epoch_call
		 */
restart:
		/* Restart after each removal: the list is mutated in place. */
		IP_MFILTER_FOREACH(imf, &imo->imo_head) {
			if ((inm = imf->imf_inm) == NULL)
				continue;
			if (inm->inm_ifp != ifp)
				continue;
			ip_mfilter_remove(&imo->imo_head, imf);
			in_leavegroup_locked(inm, NULL);
			ip_mfilter_free(imf);
			goto restart;
		}
	}
}
1974
1975
/*
 * Lookup a PCB based on the local address and port.  Caller must hold the
 * hash lock.  No inpcb locks or references are acquired.
 */
#define	INP_LOOKUP_MAPPED_PCB_COST	3
struct inpcb *
in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int fib, int lookupflags, struct ucred *cred)
{
	struct inpcb *inp;
#ifdef INET6
	int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
#else
	int matchwild = 3;
#endif
	int wildcard;

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT(fib == RT_ALL_FIBS || (fib >= 0 && fib < V_rt_numfibs),
	    ("%s: invalid fib %d", __func__, fib));

	INP_HASH_LOCK_ASSERT(pcbinfo);

	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
		struct inpcbhead *head;
		/*
		 * Look for an unconnected (wildcard foreign addr) PCB that
		 * matches the local address and port we're looking for.
		 */
		head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
		    pcbinfo->ipi_hashmask)];
		CK_LIST_FOREACH(inp, head, inp_hash_wild) {
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (inp->inp_faddr.s_addr == INADDR_ANY &&
			    inp->inp_laddr.s_addr == laddr.s_addr &&
			    inp->inp_lport == lport && (fib == RT_ALL_FIBS ||
			    inp->inp_inc.inc_fibnum == fib)) {
				/*
				 * Found, as long as the caller's prison
				 * matches the pcb owner's.
				 */
				if (prison_equal_ip4(cred->cr_prison,
				    inp->inp_cred->cr_prison))
					return (inp);
			}
		}
		/*
		 * Not found.
		 */
		return (NULL);
	} else {
		struct inpcbhead *porthash;
		struct inpcb *match = NULL;

		/*
		 * Port is in use by one or more PCBs.  Look for best
		 * fit: lowest wildcard cost wins; 0 is a perfect match.
		 */
		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
		    pcbinfo->ipi_porthashmask)];
		CK_LIST_FOREACH(inp, porthash, inp_portlist) {
			if (inp->inp_lport != lport)
				continue;
			if (!prison_equal_ip4(inp->inp_cred->cr_prison,
			    cred->cr_prison))
				continue;
			if (fib != RT_ALL_FIBS &&
			    inp->inp_inc.inc_fibnum != fib)
				continue;
			wildcard = 0;
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
			/*
			 * We never select the PCB that has INP_IPV6 flag and
			 * is bound to :: if we have another PCB which is bound
			 * to 0.0.0.0.  If a PCB has the INP_IPV6 flag, then we
			 * set its cost higher than IPv4 only PCBs.
			 *
			 * Note that the case only happens when a socket is
			 * bound to ::, under the condition that the use of the
			 * mapped address is allowed.
			 */
			if ((inp->inp_vflag & INP_IPV6) != 0)
				wildcard += INP_LOOKUP_MAPPED_PCB_COST;
#endif
			if (inp->inp_faddr.s_addr != INADDR_ANY)
				wildcard++;
			if (inp->inp_laddr.s_addr != INADDR_ANY) {
				if (laddr.s_addr == INADDR_ANY)
					wildcard++;
				else if (inp->inp_laddr.s_addr != laddr.s_addr)
					continue;
			} else {
				if (laddr.s_addr != INADDR_ANY)
					wildcard++;
			}
			if (wildcard < matchwild) {
				match = inp;
				matchwild = wildcard;
				if (matchwild == 0)
					break;
			}
		}
		return (match);
	}
}
#undef INP_LOOKUP_MAPPED_PCB_COST
2088
2089
static bool
2090
in_pcblookup_lb_match(const struct inpcblbgroup *grp, int domain, int fib)
2091
{
2092
return ((domain == M_NODOM || domain == grp->il_numa_domain) &&
2093
(fib == RT_ALL_FIBS || fib == grp->il_fibnum));
2094
}
2095
2096
/*
 * Look up a SO_REUSEPORT_LB group matching the given IPv4 endpoints and
 * select one member PCB by hashing the foreign endpoint.  Preference order
 * among candidate groups: jailed over non-jailed, exact local address over
 * wildcard, and matching NUMA domain/FIB over non-matching.  Returns NULL
 * if no group matches.
 */
static struct inpcb *
in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
    const struct in_addr *faddr, uint16_t fport, const struct in_addr *laddr,
    uint16_t lport, int domain, int fib)
{
	const struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild;
	struct inpcb *inp;
	u_int count;

	INP_HASH_LOCK_ASSERT(pcbinfo);
	NET_EPOCH_ASSERT();

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];

	/*
	 * Search for an LB group match based on the following criteria:
	 * - prefer jailed groups to non-jailed groups
	 * - prefer exact source address matches to wildcard matches
	 * - prefer groups bound to the specified NUMA domain
	 */
	jail_exact = jail_wild = local_exact = local_wild = NULL;
	CK_LIST_FOREACH(grp, hdr, il_list) {
		bool injail;

#ifdef INET6
		/* Ignore IPv6-only groups. */
		if (!(grp->il_vflag & INP_IPV4))
			continue;
#endif
		if (grp->il_lport != lport)
			continue;

		/* Jailed groups must be visible from the packet's laddr. */
		injail = prison_flag(grp->il_cred, PR_IP4) != 0;
		if (injail && prison_check_ip4_locked(grp->il_cred->cr_prison,
		    laddr) != 0)
			continue;

		if (grp->il_laddr.s_addr == laddr->s_addr) {
			if (injail) {
				jail_exact = grp;
				if (in_pcblookup_lb_match(grp, domain, fib))
					/* This is a perfect match. */
					goto out;
			} else if (local_exact == NULL ||
			    in_pcblookup_lb_match(grp, domain, fib)) {
				local_exact = grp;
			}
		} else if (grp->il_laddr.s_addr == INADDR_ANY) {
			if (injail) {
				if (jail_wild == NULL ||
				    in_pcblookup_lb_match(grp, domain, fib))
					jail_wild = grp;
			} else if (local_wild == NULL ||
			    in_pcblookup_lb_match(grp, domain, fib)) {
				local_wild = grp;
			}
		}
	}

	/* Pick the best candidate found during the scan. */
	if (jail_exact != NULL)
		grp = jail_exact;
	else if (jail_wild != NULL)
		grp = jail_wild;
	else if (local_exact != NULL)
		grp = local_exact;
	else
		grp = local_wild;
	if (grp == NULL)
		return (NULL);

out:
	/*
	 * Synchronize with in_pcblbgroup_insert().  The acquire load pairs
	 * with the insert path so that il_inp[] slots below "count" are
	 * guaranteed to be initialized.
	 */
	count = atomic_load_acq_int(&grp->il_inpcnt);
	if (count == 0)
		return (NULL);
	inp = grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) % count];
	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
	return (inp);
}
static bool
2181
in_pcblookup_exact_match(const struct inpcb *inp, struct in_addr faddr,
2182
u_short fport, struct in_addr laddr, u_short lport)
2183
{
2184
#ifdef INET6
2185
/* XXX inp locking */
2186
if ((inp->inp_vflag & INP_IPV4) == 0)
2187
return (false);
2188
#endif
2189
if (inp->inp_faddr.s_addr == faddr.s_addr &&
2190
inp->inp_laddr.s_addr == laddr.s_addr &&
2191
inp->inp_fport == fport &&
2192
inp->inp_lport == lport)
2193
return (true);
2194
return (false);
2195
}
2196
2197
/*
 * Look up a fully-specified (connected) PCB in the exact-match hash table.
 * Returns the first PCB whose 4-tuple matches, or NULL.
 */
static struct inpcb *
in_pcblookup_hash_exact(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_short fport, struct in_addr laddr, u_short lport)
{
	struct inpcbhead *head;
	struct inpcb *inp;

	INP_HASH_LOCK_ASSERT(pcbinfo);

	head = &pcbinfo->ipi_hash_exact[INP_PCBHASH(&faddr, lport, fport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash_exact) {
		if (in_pcblookup_exact_match(inp, faddr, fport, laddr, lport))
			return (inp);
	}
	return (NULL);
}
/*
 * Classification of how a PCB matched a wildcard lookup.
 */
typedef enum {
	INPLOOKUP_MATCH_NONE = 0,	/* no match */
	INPLOOKUP_MATCH_WILD = 1,	/* matched via wildcard local address */
	INPLOOKUP_MATCH_LADDR = 2,	/* matched the exact local address */
} inp_lookup_match_t;
static inp_lookup_match_t
2222
in_pcblookup_wild_match(const struct inpcb *inp, struct in_addr laddr,
2223
u_short lport, int fib)
2224
{
2225
#ifdef INET6
2226
/* XXX inp locking */
2227
if ((inp->inp_vflag & INP_IPV4) == 0)
2228
return (INPLOOKUP_MATCH_NONE);
2229
#endif
2230
if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport)
2231
return (INPLOOKUP_MATCH_NONE);
2232
if (fib != RT_ALL_FIBS && inp->inp_inc.inc_fibnum != fib)
2233
return (INPLOOKUP_MATCH_NONE);
2234
if (inp->inp_laddr.s_addr == INADDR_ANY)
2235
return (INPLOOKUP_MATCH_WILD);
2236
if (inp->inp_laddr.s_addr == laddr.s_addr)
2237
return (INPLOOKUP_MATCH_LADDR);
2238
return (INPLOOKUP_MATCH_NONE);
2239
}
2240
2241
#define INP_LOOKUP_AGAIN ((struct inpcb *)(uintptr_t)-1)
2242
2243
/*
 * Look up a wildcard-listening PCB from within an SMR read section.  Only
 * the first matching PCB is tried: on success it is returned locked as
 * requested; if it cannot be locked or no longer matches after locking,
 * INP_LOOKUP_AGAIN is returned and the caller must fall back to a
 * serialized lookup.  NULL means no PCB in the bucket matched at all.
 */
static struct inpcb *
in_pcblookup_hash_wild_smr(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int fib, const inp_lookup_t lockflags)
{
	struct inpcbhead *head;
	struct inpcb *inp;

	KASSERT(SMR_ENTERED(pcbinfo->ipi_smr),
	    ("%s: not in SMR read section", __func__));

	head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash_wild) {
		inp_lookup_match_t match;

		match = in_pcblookup_wild_match(inp, laddr, lport, fib);
		if (match == INPLOOKUP_MATCH_NONE)
			continue;

		if (__predict_true(inp_smr_lock(inp, lockflags))) {
			/* Re-check the match now that the PCB is locked. */
			match = in_pcblookup_wild_match(inp, laddr, lport, fib);
			if (match != INPLOOKUP_MATCH_NONE &&
			    prison_check_ip4_locked(inp->inp_cred->cr_prison,
			    &laddr) == 0)
				return (inp);
			inp_unlock(inp, lockflags);
		}

		/*
		 * The matching socket disappeared out from under us.  Fall
		 * back to a serialized lookup.
		 */
		return (INP_LOOKUP_AGAIN);
	}
	return (NULL);
}
/*
 * Serialized lookup of a wildcard-listening IPv4 PCB: scan the whole hash
 * bucket, remembering the best candidate in each preference class, and
 * return the highest-ranking one.
 */
static struct inpcb *
in_pcblookup_hash_wild_locked(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int fib)
{
	struct inpcbhead *head;
	struct inpcb *inp, *local_wild, *local_exact, *jail_wild;
#ifdef INET6
	struct inpcb *local_wild_mapped;
#endif

	INP_HASH_LOCK_ASSERT(pcbinfo);

	/*
	 * Order of socket selection - we always prefer jails.
	 * 1. jailed, non-wild.
	 * 2. jailed, wild.
	 * 3. non-jailed, non-wild.
	 * 4. non-jailed, wild.
	 */
	head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
	    pcbinfo->ipi_hashmask)];
	local_wild = local_exact = jail_wild = NULL;
#ifdef INET6
	local_wild_mapped = NULL;
#endif
	CK_LIST_FOREACH(inp, head, inp_hash_wild) {
		inp_lookup_match_t match;
		bool injail;

		match = in_pcblookup_wild_match(inp, laddr, lport, fib);
		if (match == INPLOOKUP_MATCH_NONE)
			continue;

		injail = prison_flag(inp->inp_cred, PR_IP4) != 0;
		if (injail) {
			/* The jail must be allowed to use this laddr. */
			if (prison_check_ip4_locked(inp->inp_cred->cr_prison,
			    &laddr) != 0)
				continue;
		} else {
			/* A non-jailed PCB cannot beat class 3 once found. */
			if (local_exact != NULL)
				continue;
		}

		if (match == INPLOOKUP_MATCH_LADDR) {
			if (injail)
				/* Class 1: jailed, non-wild - best match. */
				return (inp);
			local_exact = inp;
		} else {
#ifdef INET6
			/* XXX inp locking, NULL check */
			if (inp->inp_vflag & INP_IPV6PROTO)
				local_wild_mapped = inp;
			else
#endif
			if (injail)
				jail_wild = inp;
			else
				local_wild = inp;
		}
	}
	if (jail_wild != NULL)
		return (jail_wild);
	if (local_exact != NULL)
		return (local_exact);
	if (local_wild != NULL)
		return (local_wild);
#ifdef INET6
	/* A v4-mapped v6 wildcard socket is the last resort. */
	if (local_wild_mapped != NULL)
		return (local_wild_mapped);
#endif
	return (NULL);
}
/*
 * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
 * that the caller has locked the hash list, which usually happens for
 * bind(2) operations.  The exact table is consulted first; wildcard and
 * LB group matching is done only when INPLOOKUP_WILDCARD is set.
 */
static struct inpcb *
in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
    uint8_t numa_domain, int fib)
{
	struct inpcb *inp;
	const u_short fport = fport_arg, lport = lport_arg;

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD | INPLOOKUP_FIB)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT(faddr.s_addr != INADDR_ANY,
	    ("%s: invalid foreign address", __func__));
	KASSERT(laddr.s_addr != INADDR_ANY,
	    ("%s: invalid local address", __func__));
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	/* A connected PCB always wins over any listener. */
	inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport);
	if (inp != NULL)
		return (inp);

	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		/* Prefer a load-balance group over plain wildcard PCBs. */
		inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport,
		    &laddr, lport, numa_domain, fib);
		if (inp == NULL) {
			inp = in_pcblookup_hash_wild_locked(pcbinfo, laddr,
			    lport, fib);
		}
	}

	return (inp);
}
/*
 * Perform a serialized lookup with the pcbinfo hash write-locked, and
 * return the matching PCB locked as requested by the INPLOOKUP_*LOCKPCB
 * flag in "lookupflags" (or NULL).
 */
static struct inpcb *
in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
    uint8_t numa_domain, int fib)
{
	struct inpcb *inp;
	const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK;

	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
	    ("%s: LOCKPCB not set", __func__));

	INP_HASH_WLOCK(pcbinfo);
	inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
	    lookupflags & ~INPLOOKUP_LOCKMASK, numa_domain, fib);
	if (inp != NULL && !inp_trylock(inp, lockflags)) {
		/*
		 * The PCB lock could not be taken without blocking: hold a
		 * reference so the PCB cannot be freed, drop the hash lock,
		 * and sleep for the PCB lock.  in_pcbrele() tells us whether
		 * the PCB was freed while we slept.
		 */
		in_pcbref(inp);
		INP_HASH_WUNLOCK(pcbinfo);
		inp_lock(inp, lockflags);
		if (in_pcbrele(inp, lockflags))
			/* XXX-MJ or retry until we get a negative match? */
			inp = NULL;
	} else {
		INP_HASH_WUNLOCK(pcbinfo);
	}
	return (inp);
}
/*
 * Lockless lookup of an IPv4 PCB, performed within an SMR read section.
 * When an unlocked candidate cannot be locked and revalidated, fall back
 * to the serialized in_pcblookup_hash().
 */
static struct inpcb *
in_pcblookup_hash_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
    uint8_t numa_domain, int fib)
{
	struct inpcb *inp;
	const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK;
	const u_short fport = fport_arg, lport = lport_arg;

	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
	    ("%s: LOCKPCB not set", __func__));

	smr_enter(pcbinfo->ipi_smr);
	inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport);
	if (inp != NULL) {
		if (__predict_true(inp_smr_lock(inp, lockflags))) {
			/*
			 * Revalidate the 4-tuple, the socket could have been
			 * disconnected.
			 */
			if (__predict_true(in_pcblookup_exact_match(inp,
			    faddr, fport, laddr, lport)))
				return (inp);
			inp_unlock(inp, lockflags);
		}

		/*
		 * We failed to lock the inpcb, or its connection state changed
		 * out from under us. Fall back to a precise search.
		 */
		return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
		    lookupflags, numa_domain, fib));
	}

	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport,
		    &laddr, lport, numa_domain, fib);
		if (inp != NULL) {
			if (__predict_true(inp_smr_lock(inp, lockflags))) {
				if (__predict_true(in_pcblookup_wild_match(inp,
				    laddr, lport, fib) != INPLOOKUP_MATCH_NONE))
					return (inp);
				inp_unlock(inp, lockflags);
			}
			/* The LB group member vanished; retry serialized. */
			inp = INP_LOOKUP_AGAIN;
		} else {
			inp = in_pcblookup_hash_wild_smr(pcbinfo, laddr, lport,
			    fib, lockflags);
		}
		if (inp == INP_LOOKUP_AGAIN) {
			return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr,
			    lport, lookupflags, numa_domain, fib));
		}
	}

	/*
	 * NOTE(review): the SMR read section is exited here only on the NULL
	 * path; the locked-return paths presumably rely on inp_smr_lock()
	 * having left the SMR section itself - confirm against inp_smr_lock().
	 */
	if (inp == NULL)
		smr_exit(pcbinfo->ipi_smr);

	return (inp);
}
/*
2482
* Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
2483
* from which a pre-calculated hash value may be extracted.
2484
*/
2485
struct inpcb *
2486
in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
2487
struct in_addr laddr, u_int lport, int lookupflags,
2488
struct ifnet *ifp)
2489
{
2490
int fib;
2491
2492
fib = (lookupflags & INPLOOKUP_FIB) ? if_getfib(ifp) : RT_ALL_FIBS;
2493
return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport,
2494
lookupflags, M_NODOM, fib));
2495
}
2496
2497
struct inpcb *
2498
in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2499
u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
2500
struct ifnet *ifp __unused, struct mbuf *m)
2501
{
2502
int fib;
2503
2504
M_ASSERTPKTHDR(m);
2505
fib = (lookupflags & INPLOOKUP_FIB) ? M_GETFIB(m) : RT_ALL_FIBS;
2506
return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport,
2507
lookupflags, m->m_pkthdr.numa_domain, fib));
2508
}
2509
#endif /* INET */
2510
2511
static bool
2512
in_pcbjailed(const struct inpcb *inp, unsigned int flag)
2513
{
2514
return (prison_flag(inp->inp_cred, flag) != 0);
2515
}
2516
2517
/*
 * Insert the PCB into a hash chain using ordering rules which ensure that
 * in_pcblookup_hash_wild_*() always encounter the highest-ranking PCB first.
 *
 * Specifically, keep jailed PCBs in front of non-jailed PCBs, and keep PCBs
 * with exact local addresses ahead of wildcard PCBs. Unbound v4-mapped v6 PCBs
 * always appear last no matter whether they are jailed.
 */
static void
_in_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp)
{
	struct inpcb *last;
	bool bound, injail;

	INP_LOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	last = NULL;
	bound = inp->inp_laddr.s_addr != INADDR_ANY;
	if (!bound && (inp->inp_vflag & INP_IPV6PROTO) != 0) {
		/* Unbound v4-mapped PCBs rank last: append to the tail. */
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
		/* Empty chain. */
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}

	injail = in_pcbjailed(inp, PR_IP4);
	if (!injail) {
		/* Skip past the jailed PCBs at the front of the chain. */
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (!in_pcbjailed(last, PR_IP4))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	} else if (!CK_LIST_EMPTY(pcbhash) &&
	    !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP4)) {
		/* Jailed PCBs go in front of any non-jailed ones. */
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}
	if (!bound) {
		/* Wildcard PCBs follow the locally-bound ones. */
		CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) {
			if (last->inp_laddr.s_addr == INADDR_ANY)
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	}
	if (last == NULL)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
	else
		CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild);
}
#ifdef INET6
2579
/*
 * IPv6 counterpart of _in_pcbinshash_wild(); see the comment above that
 * function for the ordering invariants being maintained here.
 */
static void
_in6_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp)
{
	struct inpcb *last;
	bool bound, injail;

	INP_LOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	last = NULL;
	bound = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr);
	injail = in_pcbjailed(inp, PR_IP6);
	if (!injail) {
		/* Skip past the jailed PCBs at the front of the chain. */
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (!in_pcbjailed(last, PR_IP6))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	} else if (!CK_LIST_EMPTY(pcbhash) &&
	    !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP6)) {
		/* Jailed PCBs go in front of any non-jailed ones. */
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}
	if (!bound) {
		/* Wildcard PCBs follow the locally-bound ones. */
		CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) {
			if (IN6_IS_ADDR_UNSPECIFIED(&last->in6p_laddr))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	}
	if (last == NULL)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
	else
		CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild);
}
#endif
2624
2625
/*
 * Insert PCB onto various hash lists.
 *
 * With normal sockets this function shall not fail, so it could return void.
 * But for SO_REUSEPORT_LB it may need to allocate memory with locks held,
 * that's the only condition when it can fail.
 */
int
in_pcbinshash(struct inpcb *inp)
{
	struct inpcbhead *pcbhash, *pcbporthash;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	uint32_t hash;
	bool connected;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);
	KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
	    ("in_pcbinshash: INP_INHASHLIST"));

#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr);
	} else
#endif
	{
		hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !in_nullhost(inp->inp_faddr);
	}

	/* Connected PCBs go in the exact table, others in the wild table. */
	if (connected)
		pcbhash = &pcbinfo->ipi_hash_exact[hash];
	else
		pcbhash = &pcbinfo->ipi_hash_wild[hash];

	pcbporthash = &pcbinfo->ipi_porthashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];

	/*
	 * Ignore SO_REUSEPORT_LB if the socket is connected. Really this case
	 * should be an error, but for UDP sockets it is not, and some
	 * applications erroneously set it on connected UDP sockets, so we can't
	 * change this without breaking compatibility.
	 */
	if (!connected &&
	    (inp->inp_socket->so_options & SO_REUSEPORT_LB) != 0) {
		int error = in_pcbinslbgrouphash(inp, M_NODOM);
		if (error != 0)
			return (error);
	}

	/*
	 * The PCB may have been disconnected in the past. Before we can safely
	 * make it visible in the hash table, we must wait for all readers which
	 * may be traversing this PCB to finish.
	 */
	if (inp->inp_smr != SMR_SEQ_INVALID) {
		smr_wait(pcbinfo->ipi_smr, inp->inp_smr);
		inp->inp_smr = SMR_SEQ_INVALID;
	}

	if (connected)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact);
	else {
		/* Wildcard chains are kept in lookup-preference order. */
#ifdef INET6
		if ((inp->inp_vflag & INP_IPV6) != 0)
			_in6_pcbinshash_wild(pcbhash, inp);
		else
#endif
			_in_pcbinshash_wild(pcbhash, inp);
	}
	CK_LIST_INSERT_HEAD(pcbporthash, inp, inp_portlist);
	inp->inp_flags |= INP_INHASHLIST;

	return (0);
}
/*
 * Remove the PCB from the connection hash, the port hash and, if it is an
 * LB group member, the LB group hash.  Both the PCB and the pcbinfo hash
 * must be write-locked.
 */
void
in_pcbremhash_locked(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
	MPASS(inp->inp_flags & INP_INHASHLIST);

	if ((inp->inp_flags & INP_INLBGROUP) != 0)
		in_pcbremlbgrouphash(inp);
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		/* An unconnected PCB lives on the wild list. */
		if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))
			CK_LIST_REMOVE(inp, inp_hash_wild);
		else
			CK_LIST_REMOVE(inp, inp_hash_exact);
	} else
#endif
	{
		if (in_nullhost(inp->inp_faddr))
			CK_LIST_REMOVE(inp, inp_hash_wild);
		else
			CK_LIST_REMOVE(inp, inp_hash_exact);
	}
	CK_LIST_REMOVE(inp, inp_portlist);
	inp->inp_flags &= ~INP_INHASHLIST;
}
static void
2734
in_pcbremhash(struct inpcb *inp)
2735
{
2736
INP_HASH_WLOCK(inp->inp_pcbinfo);
2737
in_pcbremhash_locked(inp);
2738
INP_HASH_WUNLOCK(inp->inp_pcbinfo);
2739
}
2740
2741
/*
 * Move PCB to the proper hash bucket when { faddr, fport } have been
 * changed. NOTE: This does not handle the case of the lport changing (the
 * hashed port list would have to be updated as well), so the lport must
 * not change after in_pcbinshash() has been called.
 */
void
in_pcbrehash(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbhead *head;
	uint32_t hash;
	bool connected;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);
	KASSERT(inp->inp_flags & INP_INHASHLIST,
	    ("%s: !INP_INHASHLIST", __func__));
	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
	    ("%s: inp was disconnected", __func__));

#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr);
	} else
#endif
	{
		hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !in_nullhost(inp->inp_faddr);
	}

	/* See the comment in in_pcbinshash(). */
	if (connected && (inp->inp_flags & INP_INLBGROUP) != 0)
		in_pcbremlbgrouphash(inp);

	/*
	 * When rehashing, the caller must ensure that either the new or the old
	 * foreign address was unspecified.  Hence a now-connected PCB is
	 * removed from the wild list and vice versa.
	 */
	if (connected)
		CK_LIST_REMOVE(inp, inp_hash_wild);
	else
		CK_LIST_REMOVE(inp, inp_hash_exact);

	if (connected) {
		head = &pcbinfo->ipi_hash_exact[hash];
		CK_LIST_INSERT_HEAD(head, inp, inp_hash_exact);
	} else {
		head = &pcbinfo->ipi_hash_wild[hash];
		CK_LIST_INSERT_HEAD(head, inp, inp_hash_wild);
	}
}
/*
2798
* Check for alternatives when higher level complains
2799
* about service problems. For now, invalidate cached
2800
* routing information. If the route was created dynamically
2801
* (by a redirect), time to try a default gateway again.
2802
*/
2803
void
2804
in_losing(struct inpcb *inp)
2805
{
2806
2807
RO_INVALIDATE_CACHE(&inp->inp_route);
2808
return;
2809
}
2810
2811
/*
 * A set label operation has occurred at the socket layer, propagate the
 * label change into the in_pcb for the socket.
 */
void
in_pcbsosetlabel(struct socket *so)
{
#ifdef MAC
	struct inpcb *inp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));

	/* Hold both the PCB and socket locks while copying the label. */
	INP_WLOCK(inp);
	SOCK_LOCK(so);
	mac_inpcb_sosetlabel(so, inp);
	SOCK_UNLOCK(so);
	INP_WUNLOCK(inp);
#endif
}
2832
/* Function wrapper around the INP_WLOCK() macro. */
void
inp_wlock(struct inpcb *inp)
{

	INP_WLOCK(inp);
}
/* Function wrapper around the INP_WUNLOCK() macro. */
void
inp_wunlock(struct inpcb *inp)
{

	INP_WUNLOCK(inp);
}
/* Function wrapper around the INP_RLOCK() macro. */
void
inp_rlock(struct inpcb *inp)
{

	INP_RLOCK(inp);
}
/* Function wrapper around the INP_RUNLOCK() macro. */
void
inp_runlock(struct inpcb *inp)
{

	INP_RUNLOCK(inp);
}
#ifdef INVARIANT_SUPPORT
2861
/* Assert that the PCB is write-locked (INVARIANTS builds only). */
void
inp_lock_assert(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
}
/* Assert that the PCB is unlocked (INVARIANTS builds only). */
void
inp_unlock_assert(struct inpcb *inp)
{

	INP_UNLOCK_ASSERT(inp);
}
#endif
2875
2876
/*
 * Call "func" with "arg" on every PCB tracked by "pcbinfo".  The iterator
 * delivers each PCB write-locked (INPLOOKUP_WLOCKPCB).
 */
void
inp_apply_all(struct inpcbinfo *pcbinfo,
    void (*func)(struct inpcb *, void *), void *arg)
{
	struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
	    INPLOOKUP_WLOCKPCB);
	struct inpcb *inp;

	while ((inp = inp_next(&inpi)) != NULL)
		func(inp, arg);
}
/* Return the socket backing this PCB; the PCB must be write-locked. */
struct socket *
inp_inpcbtosocket(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	return (inp->inp_socket);
}
/*
 * Copy out the PCB's 4-tuple: local/foreign addresses and ports, exactly
 * as stored in the PCB (byte order unchanged).
 */
void
inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
    uint32_t *faddr, uint16_t *fp)
{

	INP_LOCK_ASSERT(inp);
	*laddr = inp->inp_laddr.s_addr;
	*faddr = inp->inp_faddr.s_addr;
	*lp = inp->inp_lport;
	*fp = inp->inp_fport;
}
/*
 * Create an external-format (``xinpcb'') structure using the information in
 * the kernel-format in_pcb structure pointed to by inp. This is done to
 * reduce the spew of irrelevant information over this interface, to isolate
 * user code from changes in the kernel structure, and potentially to provide
 * information-hiding if we decide that some of this information should be
 * hidden from users.
 */
void
in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi)
{

	/* Zero first so padding and unset fields read back as zero. */
	bzero(xi, sizeof(*xi));
	xi->xi_len = sizeof(struct xinpcb);
	if (inp->inp_socket)
		sotoxsocket(inp->inp_socket, &xi->xi_socket);
	bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo));
	xi->inp_gencnt = inp->inp_gencnt;
	xi->inp_flow = inp->inp_flow;
	xi->inp_flowid = inp->inp_flowid;
	xi->inp_flowtype = inp->inp_flowtype;
	xi->inp_flags = inp->inp_flags;
	xi->inp_flags2 = inp->inp_flags2;
	xi->in6p_cksum = inp->in6p_cksum;
	xi->in6p_hops = inp->in6p_hops;
	xi->inp_ip_tos = inp->inp_ip_tos;
	xi->inp_vflag = inp->inp_vflag;
	xi->inp_ip_ttl = inp->inp_ip_ttl;
	xi->inp_ip_p = inp->inp_ip_p;
	xi->inp_ip_minttl = inp->inp_ip_minttl;
}
/*
 * Apply a socket option, described by a "struct sockopt_parameters" in the
 * sysctl's new data (option value appended after the structure), to the
 * PCB identified by its generation count "sop_id".  Returns ESRCH if no
 * such PCB exists and ECONNRESET if it has been dropped.
 */
int
sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo,
    int (*ctloutput_set)(struct inpcb *, struct sockopt *))
{
	struct sockopt sopt;
	struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
	    INPLOOKUP_WLOCKPCB);
	struct inpcb *inp;
	struct sockopt_parameters *params;
	struct socket *so;
	int error;
	char buf[1024];

	/* This handler is write-only. */
	if (req->oldptr != NULL || req->oldlen != 0)
		return (EINVAL);
	if (req->newptr == NULL)
		return (EPERM);
	if (req->newlen > sizeof(buf))
		return (ENOMEM);
	error = SYSCTL_IN(req, buf, req->newlen);
	if (error != 0)
		return (error);
	if (req->newlen < sizeof(struct sockopt_parameters))
		return (EINVAL);
	params = (struct sockopt_parameters *)buf;
	sopt.sopt_level = params->sop_level;
	sopt.sopt_name = params->sop_optname;
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_val = params->sop_optval;
	sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters);
	sopt.sopt_td = NULL;
#ifdef INET6
	if (params->sop_inc.inc_flags & INC_ISIPV6) {
		/* Embed the scope zone id into link-local addresses. */
		if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_laddr))
			params->sop_inc.inc6_laddr.s6_addr16[1] =
			    htons(params->sop_inc.inc6_zoneid & 0xffff);
		if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_faddr))
			params->sop_inc.inc6_faddr.s6_addr16[1] =
			    htons(params->sop_inc.inc6_zoneid & 0xffff);
	}
#endif
	/*
	 * If both ports are specified, restrict the iteration to a single
	 * hash bucket; otherwise all PCBs must be scanned.
	 */
	if (params->sop_inc.inc_lport != htons(0) &&
	    params->sop_inc.inc_fport != htons(0)) {
#ifdef INET6
		if (params->sop_inc.inc_flags & INC_ISIPV6)
			inpi.hash = INP6_PCBHASH(
			    &params->sop_inc.inc6_faddr,
			    params->sop_inc.inc_lport,
			    params->sop_inc.inc_fport,
			    pcbinfo->ipi_hashmask);
		else
#endif
			inpi.hash = INP_PCBHASH(
			    &params->sop_inc.inc_faddr,
			    params->sop_inc.inc_lport,
			    params->sop_inc.inc_fport,
			    pcbinfo->ipi_hashmask);
	}
	while ((inp = inp_next(&inpi)) != NULL)
		if (inp->inp_gencnt == params->sop_id) {
			if (inp->inp_flags & INP_DROPPED) {
				INP_WUNLOCK(inp);
				return (ECONNRESET);
			}
			so = inp->inp_socket;
			KASSERT(so != NULL, ("inp_socket == NULL"));
			soref(so);
			/*
			 * Socket-level options go through sosetopt() with the
			 * PCB lock dropped; protocol-level options use the
			 * protocol's ctloutput handler with the PCB locked.
			 */
			if (params->sop_level == SOL_SOCKET) {
				INP_WUNLOCK(inp);
				error = sosetopt(so, &sopt);
			} else
				error = (*ctloutput_set)(inp, &sopt);
			sorele(so);
			break;
		}
	if (inp == NULL)
		error = ESRCH;
	return (error);
}
#ifdef DDB
3021
/* Emit "indent" space characters to the debugger console. */
static void
db_print_indent(int indent)
{
	int n;

	for (n = indent; n > 0; n--)
		db_printf(" ");
}
/*
 * Pretty-print the addresses and ports of a connection tuple to the DDB
 * console, indented by "indent" spaces.
 */
static void
db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
{
	char faddr_str[48], laddr_str[48];

	db_print_indent(indent);
	db_printf("%s at %p\n", name, inc);

	indent += 2;

#ifdef INET6
	if (inc->inc_flags & INC_ISIPV6) {
		/* IPv6. */
		ip6_sprintf(laddr_str, &inc->inc6_laddr);
		ip6_sprintf(faddr_str, &inc->inc6_faddr);
	} else
#endif
	{
		/* IPv4. */
		inet_ntoa_r(inc->inc_laddr, laddr_str);
		inet_ntoa_r(inc->inc_faddr, faddr_str);
	}
	db_print_indent(indent);
	db_printf("inc_laddr %s inc_lport %u\n", laddr_str,
	    ntohs(inc->inc_lport));
	db_print_indent(indent);
	db_printf("inc_faddr %s inc_fport %u\n", faddr_str,
	    ntohs(inc->inc_fport));
}
/*
 * Dump the contents of an inpcb to the DDB console, indented by "indent"
 * spaces.
 */
void
db_print_inpcb(struct inpcb *inp, const char *name, int indent)
{

	db_print_indent(indent);
	db_printf("%s at %p\n", name, inp);

	indent += 2;

	db_print_indent(indent);
	db_printf("inp_flow: 0x%x inp_label: %p\n", inp->inp_flow,
	    inp->inp_label);

	db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);

	db_print_indent(indent);
	db_printf("inp_flags: 0x%b\n", inp->inp_flags, INP_FLAGS_BITS);

	db_print_indent(indent);
	db_printf("inp_flags2: 0x%b\n", inp->inp_flags2, INP_FLAGS2_BITS);

	db_print_indent(indent);
	db_printf("inp_sp: %p inp_vflag: 0x%b\n", inp->inp_sp,
	    inp->inp_vflag, INP_VFLAGS_BITS);

	db_print_indent(indent);
	db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n",
	    inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);

#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		db_print_indent(indent);
		db_printf("in6p_options: %p in6p_outputopts: %p "
		    "in6p_moptions: %p\n", inp->in6p_options,
		    inp->in6p_outputopts, inp->in6p_moptions);
		db_print_indent(indent);
		db_printf("in6p_icmp6filt: %p in6p_cksum %d "
		    "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
		    inp->in6p_hops);
	} else
#endif
	{
		db_print_indent(indent);
		db_printf("inp_ip_tos: %d inp_ip_options: %p "
		    "inp_ip_moptions: %p\n", inp->inp_ip_tos,
		    inp->inp_options, inp->inp_moptions);
	}

	db_print_indent(indent);
	db_printf("inp_gencnt: %ju\n", (uintmax_t)inp->inp_gencnt);
}
/* DDB command "show inpcb <addr>": dump the inpcb at a kernel address. */
DB_SHOW_COMMAND(inpcb, db_show_inpcb)
{
	struct inpcb *inp;

	if (!have_addr) {
		db_printf("usage: show inpcb <addr>\n");
		return;
	}
	inp = (struct inpcb *)addr;

	db_print_inpcb(inp, "inpcb", 0);
}
#endif /* DDB */
3125
3126
#ifdef RATELIMIT
3127
/*
3128
* Modify TX rate limit based on the existing "inp->inp_snd_tag",
3129
* if any.
3130
*/
3131
int
3132
in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
3133
{
3134
union if_snd_tag_modify_params params = {
3135
.rate_limit.max_rate = max_pacing_rate,
3136
.rate_limit.flags = M_NOWAIT,
3137
};
3138
struct m_snd_tag *mst;
3139
int error;
3140
3141
mst = inp->inp_snd_tag;
3142
if (mst == NULL)
3143
return (EINVAL);
3144
3145
if (mst->sw->snd_tag_modify == NULL) {
3146
error = EOPNOTSUPP;
3147
} else {
3148
error = mst->sw->snd_tag_modify(mst, &params);
3149
}
3150
return (error);
3151
}
3152
3153
/*
3154
* Query existing TX rate limit based on the existing
3155
* "inp->inp_snd_tag", if any.
3156
*/
3157
int
3158
in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
3159
{
3160
union if_snd_tag_query_params params = { };
3161
struct m_snd_tag *mst;
3162
int error;
3163
3164
mst = inp->inp_snd_tag;
3165
if (mst == NULL)
3166
return (EINVAL);
3167
3168
if (mst->sw->snd_tag_query == NULL) {
3169
error = EOPNOTSUPP;
3170
} else {
3171
error = mst->sw->snd_tag_query(mst, &params);
3172
if (error == 0 && p_max_pacing_rate != NULL)
3173
*p_max_pacing_rate = params.rate_limit.max_rate;
3174
}
3175
return (error);
3176
}
3177
3178
/*
3179
* Query existing TX queue level based on the existing
3180
* "inp->inp_snd_tag", if any.
3181
*/
3182
int
3183
in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level)
3184
{
3185
union if_snd_tag_query_params params = { };
3186
struct m_snd_tag *mst;
3187
int error;
3188
3189
mst = inp->inp_snd_tag;
3190
if (mst == NULL)
3191
return (EINVAL);
3192
3193
if (mst->sw->snd_tag_query == NULL)
3194
return (EOPNOTSUPP);
3195
3196
error = mst->sw->snd_tag_query(mst, &params);
3197
if (error == 0 && p_txqueue_level != NULL)
3198
*p_txqueue_level = params.rate_limit.queue_level;
3199
return (error);
3200
}
3201
3202
/*
 * Allocate a new TX rate limit send tag from the network interface
 * given by the "ifp" argument and save it in "inp->inp_snd_tag":
 */
int
in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
    uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st)

{
	union if_snd_tag_alloc_params params = {
		/* A max_pacing_rate of -1U requests an unlimited tag. */
		.rate_limit.hdr.type = (max_pacing_rate == -1U) ?
		    IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT,
		.rate_limit.hdr.flowid = flowid,
		.rate_limit.hdr.flowtype = flowtype,
		.rate_limit.hdr.numa_domain = inp->inp_numa_domain,
		.rate_limit.max_rate = max_pacing_rate,
		.rate_limit.flags = M_NOWAIT,
	};
	int error;

	INP_WLOCK_ASSERT(inp);

	/*
	 * If there is already a send tag, or the INP is being torn
	 * down, allocating a new send tag is not allowed. Else send
	 * tags may leak.
	 */
	if (*st != NULL || (inp->inp_flags & INP_DROPPED) != 0)
		return (EINVAL);

	error = m_snd_tag_alloc(ifp, &params, st);
#ifdef INET
	/* Maintain the rate-limit statistics counters. */
	if (error == 0) {
		counter_u64_add(rate_limit_set_ok, 1);
		counter_u64_add(rate_limit_active, 1);
	} else if (error != EOPNOTSUPP)
		counter_u64_add(rate_limit_alloc_fail, 1);
#endif
	return (error);
}
/*
 * Drop a reference on a send tag that has already been detached from
 * its inpcb, decrementing the active rate limit counter to match the
 * increment made when the tag was allocated.
 */
void
in_pcbdetach_tag(struct m_snd_tag *mst)
{

	m_snd_tag_rele(mst);
#ifdef INET
	counter_u64_add(rate_limit_active, -1);
#endif
}
3252
3253
/*
3254
* Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
3255
* if any:
3256
*/
3257
void
3258
in_pcbdetach_txrtlmt(struct inpcb *inp)
3259
{
3260
struct m_snd_tag *mst;
3261
3262
INP_WLOCK_ASSERT(inp);
3263
3264
mst = inp->inp_snd_tag;
3265
inp->inp_snd_tag = NULL;
3266
3267
if (mst == NULL)
3268
return;
3269
3270
m_snd_tag_rele(mst);
3271
#ifdef INET
3272
counter_u64_add(rate_limit_active, -1);
3273
#endif
3274
}
3275
3276
/*
 * Reconcile the inpcb's TX rate limit send tag with the requested
 * "max_pacing_rate" and the current output interface "ifp", attaching,
 * detaching or modifying the tag as needed.  Caller holds the inpcb
 * write lock.  Returns zero on success or an errno value; EAGAIN means
 * a valid RSS hash is not yet available and the caller should retry.
 */
int
in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate)
{
	int error;

	/*
	 * If the existing send tag is for the wrong interface due to
	 * a route change, first drop the existing tag. Set the
	 * CHANGED flag so that we will keep trying to allocate a new
	 * tag if we fail to allocate one this time.
	 */
	if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) {
		in_pcbdetach_txrtlmt(inp);
		inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
	}

	/*
	 * NOTE: When attaching to a network interface a reference is
	 * made to ensure the network interface doesn't go away until
	 * all ratelimit connections are gone. The network interface
	 * pointers compared below represent valid network interfaces,
	 * except when comparing towards NULL.
	 */
	if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
		/* No pacing requested and no tag attached: nothing to do. */
		error = 0;
	} else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
		/* Interface cannot rate limit; drop any stale tag. */
		if (inp->inp_snd_tag != NULL)
			in_pcbdetach_txrtlmt(inp);
		error = 0;
	} else if (inp->inp_snd_tag == NULL) {
		/*
		 * In order to utilize packet pacing with RSS, we need
		 * to wait until there is a valid RSS hash before we
		 * can proceed:
		 */
		if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
			error = EAGAIN;
		} else {
			error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
			    mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag);
		}
	} else {
		/* A tag exists for this interface; just update its rate. */
		error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
	}
	/* EOPNOTSUPP also clears the flag: retrying cannot succeed. */
	if (error == 0 || error == EOPNOTSUPP)
		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;

	return (error);
}
3325
3326
/*
3327
* This function should be called when the INP_RATE_LIMIT_CHANGED flag
3328
* is set in the fast path and will attach/detach/modify the TX rate
3329
* limit send tag based on the socket's so_max_pacing_rate value.
3330
*/
3331
void
3332
in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
3333
{
3334
struct socket *socket;
3335
uint32_t max_pacing_rate;
3336
bool did_upgrade;
3337
3338
if (inp == NULL)
3339
return;
3340
3341
socket = inp->inp_socket;
3342
if (socket == NULL)
3343
return;
3344
3345
if (!INP_WLOCKED(inp)) {
3346
/*
3347
* NOTE: If the write locking fails, we need to bail
3348
* out and use the non-ratelimited ring for the
3349
* transmit until there is a new chance to get the
3350
* write lock.
3351
*/
3352
if (!INP_TRY_UPGRADE(inp))
3353
return;
3354
did_upgrade = 1;
3355
} else {
3356
did_upgrade = 0;
3357
}
3358
3359
/*
3360
* NOTE: The so_max_pacing_rate value is read unlocked,
3361
* because atomic updates are not required since the variable
3362
* is checked at every mbuf we send. It is assumed that the
3363
* variable read itself will be atomic.
3364
*/
3365
max_pacing_rate = socket->so_max_pacing_rate;
3366
3367
in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate);
3368
3369
if (did_upgrade)
3370
INP_DOWNGRADE(inp);
3371
}
3372
3373
/*
3374
* Track route changes for TX rate limiting.
3375
*/
3376
void
3377
in_pcboutput_eagain(struct inpcb *inp)
3378
{
3379
bool did_upgrade;
3380
3381
if (inp == NULL)
3382
return;
3383
3384
if (inp->inp_snd_tag == NULL)
3385
return;
3386
3387
if (!INP_WLOCKED(inp)) {
3388
/*
3389
* NOTE: If the write locking fails, we need to bail
3390
* out and use the non-ratelimited ring for the
3391
* transmit until there is a new chance to get the
3392
* write lock.
3393
*/
3394
if (!INP_TRY_UPGRADE(inp))
3395
return;
3396
did_upgrade = 1;
3397
} else {
3398
did_upgrade = 0;
3399
}
3400
3401
/* detach rate limiting */
3402
in_pcbdetach_txrtlmt(inp);
3403
3404
/* make sure new mbuf send tag allocation is made */
3405
inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
3406
3407
if (did_upgrade)
3408
INP_DOWNGRADE(inp);
3409
}
3410
3411
#ifdef INET
3412
static void
3413
rl_init(void *st)
3414
{
3415
rate_limit_new = counter_u64_alloc(M_WAITOK);
3416
rate_limit_chg = counter_u64_alloc(M_WAITOK);
3417
rate_limit_active = counter_u64_alloc(M_WAITOK);
3418
rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK);
3419
rate_limit_set_ok = counter_u64_alloc(M_WAITOK);
3420
}
3421
3422
SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL);
3423
#endif
3424
#endif /* RATELIMIT */
3425
3426