GitHub Repository: torvalds/linux
Path: blob/master/net/ipv4/inet_hashtables.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic INET transport hashtables
 *
 * Authors:	Lotsa people, from code originally in tcp
 */

#include <linux/module.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/vmalloc.h>
#include <linux/memblock.h>

#include <net/addrconf.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/inet6_hashtables.h>
#endif
#include <net/hotdata.h>
#include <net/ip.h>
#include <net/rps.h>
#include <net/secure_seq.h>
#include <net/sock_reuseport.h>
#include <net/tcp.h>

u32 inet_ehashfn(const struct net *net, const __be32 laddr,
                 const __u16 lport, const __be32 faddr,
                 const __be16 fport)
{
        net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));

        return lport + __inet_ehashfn(laddr, 0, faddr, fport,
                                      inet_ehash_secret + net_hash_mix(net));
}
EXPORT_SYMBOL_GPL(inet_ehashfn);
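
/* The hash above mixes a lazily seeded random secret (net_get_random_once)
 * with a per-namespace salt from net_hash_mix(), so established-table
 * placement is not predictable across boots or between network namespaces.
 * Callers pass the local (address, port) pair first and the foreign pair
 * second, as sk_ehashfn() below and __inet_lookup_established() do.
 */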

/* This function handles inet_sock, but also timewait and request sockets
 * for IPv4/IPv6.
 */
static u32 sk_ehashfn(const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
        if (sk->sk_family == AF_INET6 &&
            !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
                return inet6_ehashfn(sock_net(sk),
                                     &sk->sk_v6_rcv_saddr, sk->sk_num,
                                     &sk->sk_v6_daddr, sk->sk_dport);
#endif
        return inet_ehashfn(sock_net(sk),
                            sk->sk_rcv_saddr, sk->sk_num,
                            sk->sk_daddr, sk->sk_dport);
}

/*
 * Allocate and initialize a new local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 */
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
                                                 struct net *net,
                                                 struct inet_bind_hashbucket *head,
                                                 const unsigned short snum,
                                                 int l3mdev)
{
        struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

        if (tb) {
                write_pnet(&tb->ib_net, net);
                tb->l3mdev = l3mdev;
                tb->port = snum;
                tb->fastreuse = 0;
                tb->fastreuseport = 0;
                INIT_HLIST_HEAD(&tb->bhash2);
                hlist_add_head_rcu(&tb->node, &head->chain);
        }
        return tb;
}

/*
 * Caller must hold hashbucket lock for this tb with local BH disabled
 */
void inet_bind_bucket_destroy(struct inet_bind_bucket *tb)
{
        if (hlist_empty(&tb->bhash2)) {
                hlist_del_rcu(&tb->node);
                kfree_rcu(tb, rcu);
        }
}

bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net,
                            unsigned short port, int l3mdev)
{
        return net_eq(ib_net(tb), net) && tb->port == port &&
                tb->l3mdev == l3mdev;
}
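
/* Two bind tables exist: bhash is keyed by port alone (struct
 * inet_bind_bucket), while bhash2 is keyed by port plus the bound address
 * (struct inet_bind2_bucket).  The helpers below manage the per-address
 * buckets; each inet_bind2_bucket is also chained off its parent
 * inet_bind_bucket through tb->bhash2.
 */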

static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2,
                                   struct net *net,
                                   struct inet_bind_hashbucket *head,
                                   struct inet_bind_bucket *tb,
                                   const struct sock *sk)
{
        write_pnet(&tb2->ib_net, net);
        tb2->l3mdev = tb->l3mdev;
        tb2->port = tb->port;
#if IS_ENABLED(CONFIG_IPV6)
        BUILD_BUG_ON(USHRT_MAX < (IPV6_ADDR_ANY | IPV6_ADDR_MAPPED));
        if (sk->sk_family == AF_INET6) {
                tb2->addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
                tb2->v6_rcv_saddr = sk->sk_v6_rcv_saddr;
        } else {
                tb2->addr_type = IPV6_ADDR_MAPPED;
                ipv6_addr_set_v4mapped(sk->sk_rcv_saddr, &tb2->v6_rcv_saddr);
        }
#else
        tb2->rcv_saddr = sk->sk_rcv_saddr;
#endif
        INIT_HLIST_HEAD(&tb2->owners);
        hlist_add_head(&tb2->node, &head->chain);
        hlist_add_head(&tb2->bhash_node, &tb->bhash2);
}

struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep,
                                                   struct net *net,
                                                   struct inet_bind_hashbucket *head,
                                                   struct inet_bind_bucket *tb,
                                                   const struct sock *sk)
{
        struct inet_bind2_bucket *tb2 = kmem_cache_alloc(cachep, GFP_ATOMIC);

        if (tb2)
                inet_bind2_bucket_init(tb2, net, head, tb, sk);

        return tb2;
}

/* Caller must hold hashbucket lock for this tb with local BH disabled */
void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb)
{
        if (hlist_empty(&tb->owners)) {
                __hlist_del(&tb->node);
                __hlist_del(&tb->bhash_node);
                kmem_cache_free(cachep, tb);
        }
}

static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2,
                                         const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
        if (sk->sk_family == AF_INET6)
                return ipv6_addr_equal(&tb2->v6_rcv_saddr, &sk->sk_v6_rcv_saddr);

        if (tb2->addr_type != IPV6_ADDR_MAPPED)
                return false;
#endif
        return tb2->rcv_saddr == sk->sk_rcv_saddr;
}

void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
                    struct inet_bind2_bucket *tb2, unsigned short port)
{
        inet_sk(sk)->inet_num = port;
        inet_csk(sk)->icsk_bind_hash = tb;
        inet_csk(sk)->icsk_bind2_hash = tb2;
        sk_add_bind_node(sk, &tb2->owners);
}

/*
 * Get rid of any references to a local port held by the given sock.
 */
static void __inet_put_port(struct sock *sk)
{
        struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
        struct inet_bind_hashbucket *head, *head2;
        struct net *net = sock_net(sk);
        struct inet_bind_bucket *tb;
        int bhash;

        bhash = inet_bhashfn(net, inet_sk(sk)->inet_num, hashinfo->bhash_size);
        head = &hashinfo->bhash[bhash];
        head2 = inet_bhashfn_portaddr(hashinfo, sk, net, inet_sk(sk)->inet_num);

        spin_lock(&head->lock);
        tb = inet_csk(sk)->icsk_bind_hash;
        inet_csk(sk)->icsk_bind_hash = NULL;
        inet_sk(sk)->inet_num = 0;

        spin_lock(&head2->lock);
        if (inet_csk(sk)->icsk_bind2_hash) {
                struct inet_bind2_bucket *tb2 = inet_csk(sk)->icsk_bind2_hash;

                __sk_del_bind_node(sk);
                inet_csk(sk)->icsk_bind2_hash = NULL;
                inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
        }
        spin_unlock(&head2->lock);

        inet_bind_bucket_destroy(tb);
        spin_unlock(&head->lock);
}

void inet_put_port(struct sock *sk)
{
        local_bh_disable();
        __inet_put_port(sk);
        local_bh_enable();
}
EXPORT_SYMBOL(inet_put_port);
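
/* Lock ordering note: whenever both bind tables are touched, the bhash
 * bucket lock (head->lock) is taken before the bhash2 bucket lock
 * (head2->lock), as in __inet_put_port() above and __inet_inherit_port()
 * below.
 */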

int __inet_inherit_port(const struct sock *sk, struct sock *child)
{
        struct inet_hashinfo *table = tcp_get_hashinfo(sk);
        unsigned short port = inet_sk(child)->inet_num;
        struct inet_bind_hashbucket *head, *head2;
        bool created_inet_bind_bucket = false;
        struct net *net = sock_net(sk);
        bool update_fastreuse = false;
        struct inet_bind2_bucket *tb2;
        struct inet_bind_bucket *tb;
        int bhash, l3mdev;

        bhash = inet_bhashfn(net, port, table->bhash_size);
        head = &table->bhash[bhash];
        head2 = inet_bhashfn_portaddr(table, child, net, port);

        spin_lock(&head->lock);
        spin_lock(&head2->lock);
        tb = inet_csk(sk)->icsk_bind_hash;
        tb2 = inet_csk(sk)->icsk_bind2_hash;
        if (unlikely(!tb || !tb2)) {
                spin_unlock(&head2->lock);
                spin_unlock(&head->lock);
                return -ENOENT;
        }
        if (tb->port != port) {
                l3mdev = inet_sk_bound_l3mdev(sk);

                /* NOTE: using tproxy and redirecting skbs to a proxy
                 * on a different listener port breaks the assumption
                 * that the listener socket's icsk_bind_hash is the same
                 * as that of the child socket. We have to look up or
                 * create a new bind bucket for the child here. */
                inet_bind_bucket_for_each(tb, &head->chain) {
                        if (inet_bind_bucket_match(tb, net, port, l3mdev))
                                break;
                }
                if (!tb) {
                        tb = inet_bind_bucket_create(table->bind_bucket_cachep,
                                                     net, head, port, l3mdev);
                        if (!tb) {
                                spin_unlock(&head2->lock);
                                spin_unlock(&head->lock);
                                return -ENOMEM;
                        }
                        created_inet_bind_bucket = true;
                }
                update_fastreuse = true;

                goto bhash2_find;
        } else if (!inet_bind2_bucket_addr_match(tb2, child)) {
                l3mdev = inet_sk_bound_l3mdev(sk);

bhash2_find:
                tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, child);
                if (!tb2) {
                        tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep,
                                                       net, head2, tb, child);
                        if (!tb2)
                                goto error;
                }
        }
        if (update_fastreuse)
                inet_csk_update_fastreuse(tb, child);
        inet_bind_hash(child, tb, tb2, port);
        spin_unlock(&head2->lock);
        spin_unlock(&head->lock);

        return 0;

error:
        if (created_inet_bind_bucket)
                inet_bind_bucket_destroy(tb);
        spin_unlock(&head2->lock);
        spin_unlock(&head->lock);
        return -ENOMEM;
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);

static struct inet_listen_hashbucket *
inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
{
        u32 hash;

#if IS_ENABLED(CONFIG_IPV6)
        if (sk->sk_family == AF_INET6)
                hash = ipv6_portaddr_hash(sock_net(sk),
                                          &sk->sk_v6_rcv_saddr,
                                          inet_sk(sk)->inet_num);
        else
#endif
                hash = ipv4_portaddr_hash(sock_net(sk),
                                          inet_sk(sk)->inet_rcv_saddr,
                                          inet_sk(sk)->inet_num);
        return inet_lhash2_bucket(h, hash);
}
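
/* compute_score() ranks candidate listeners for an incoming packet: a
 * socket whose bound address or device does not match is rejected outright,
 * a device-bound socket outranks a wildcard one, an AF_INET socket gets one
 * extra point over a v4-mapped AF_INET6 one, and so does a socket whose
 * sk_incoming_cpu matches the current CPU.
 */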

static inline int compute_score(struct sock *sk, const struct net *net,
                                const unsigned short hnum, const __be32 daddr,
                                const int dif, const int sdif)
{
        int score = -1;

        if (net_eq(sock_net(sk), net) && sk->sk_num == hnum &&
            !ipv6_only_sock(sk)) {
                if (sk->sk_rcv_saddr != daddr)
                        return -1;

                if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
                        return -1;
                score = sk->sk_bound_dev_if ? 2 : 1;

                if (sk->sk_family == PF_INET)
                        score++;
                if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
                        score++;
        }
        return score;
}

/**
 * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary.
 * @net: network namespace.
 * @sk: AF_INET socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP.
 * @skb: context for a potential SK_REUSEPORT program.
 * @doff: header offset.
 * @saddr: source address.
 * @sport: source port.
 * @daddr: destination address.
 * @hnum: destination port in host byte order.
 * @ehashfn: hash function used to generate the fallback hash.
 *
 * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to
 *         the selected sock or an error.
 */
struct sock *inet_lookup_reuseport(const struct net *net, struct sock *sk,
                                   struct sk_buff *skb, int doff,
                                   __be32 saddr, __be16 sport,
                                   __be32 daddr, unsigned short hnum,
                                   inet_ehashfn_t *ehashfn)
{
        struct sock *reuse_sk = NULL;
        u32 phash;

        if (sk->sk_reuseport) {
                phash = INDIRECT_CALL_2(ehashfn, udp_ehashfn, inet_ehashfn,
                                        net, daddr, hnum, saddr, sport);
                reuse_sk = reuseport_select_sock(sk, phash, skb, doff);
        }
        return reuse_sk;
}
EXPORT_SYMBOL_GPL(inet_lookup_reuseport);

/*
 * Here are some nice properties to exploit here. The BSD API
 * does not allow a listening sock to specify the remote port nor the
 * remote address for the connection. So always assume those are both
 * wildcarded during the search since they can never be otherwise.
 */

/* called with rcu_read_lock() : No refcount taken on the socket */
static struct sock *inet_lhash2_lookup(const struct net *net,
                                       struct inet_listen_hashbucket *ilb2,
                                       struct sk_buff *skb, int doff,
                                       const __be32 saddr, __be16 sport,
                                       const __be32 daddr, const unsigned short hnum,
                                       const int dif, const int sdif)
{
        struct sock *sk, *result = NULL;
        struct hlist_nulls_node *node;
        int score, hiscore = 0;

        sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) {
                score = compute_score(sk, net, hnum, daddr, dif, sdif);
                if (score > hiscore) {
                        result = inet_lookup_reuseport(net, sk, skb, doff,
                                                       saddr, sport, daddr, hnum, inet_ehashfn);
                        if (result)
                                return result;

                        result = sk;
                        hiscore = score;
                }
        }

        return result;
}

struct sock *inet_lookup_run_sk_lookup(const struct net *net,
                                       int protocol,
                                       struct sk_buff *skb, int doff,
                                       __be32 saddr, __be16 sport,
                                       __be32 daddr, u16 hnum, const int dif,
                                       inet_ehashfn_t *ehashfn)
{
        struct sock *sk, *reuse_sk;
        bool no_reuseport;

        no_reuseport = bpf_sk_lookup_run_v4(net, protocol, saddr, sport,
                                            daddr, hnum, dif, &sk);
        if (no_reuseport || IS_ERR_OR_NULL(sk))
                return sk;

        reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum,
                                         ehashfn);
        if (reuse_sk)
                sk = reuse_sk;
        return sk;
}
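
/* Listener lookup proceeds in up to three passes: an optional BPF sk_lookup
 * redirect, then the lhash2 bucket for the exact destination address, and
 * finally the INADDR_ANY bucket.  The first pass that yields a socket wins.
 */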

struct sock *__inet_lookup_listener(const struct net *net,
                                    struct inet_hashinfo *hashinfo,
                                    struct sk_buff *skb, int doff,
                                    const __be32 saddr, __be16 sport,
                                    const __be32 daddr, const unsigned short hnum,
                                    const int dif, const int sdif)
{
        struct inet_listen_hashbucket *ilb2;
        struct sock *result = NULL;
        unsigned int hash2;

        /* Lookup redirect from BPF */
        if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
            hashinfo == net->ipv4.tcp_death_row.hashinfo) {
                result = inet_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff,
                                                   saddr, sport, daddr, hnum, dif,
                                                   inet_ehashfn);
                if (result)
                        goto done;
        }

        hash2 = ipv4_portaddr_hash(net, daddr, hnum);
        ilb2 = inet_lhash2_bucket(hashinfo, hash2);

        result = inet_lhash2_lookup(net, ilb2, skb, doff,
                                    saddr, sport, daddr, hnum,
                                    dif, sdif);
        if (result)
                goto done;

        /* Lookup lhash2 with INADDR_ANY */
        hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
        ilb2 = inet_lhash2_bucket(hashinfo, hash2);

        result = inet_lhash2_lookup(net, ilb2, skb, doff,
                                    saddr, sport, htonl(INADDR_ANY), hnum,
                                    dif, sdif);
done:
        if (IS_ERR(result))
                return NULL;
        return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);

/* All sockets share common refcount, but have different destructors */
void sock_gen_put(struct sock *sk)
{
        if (!refcount_dec_and_test(&sk->sk_refcnt))
                return;

        if (sk->sk_state == TCP_TIME_WAIT)
                inet_twsk_free(inet_twsk(sk));
        else if (sk->sk_state == TCP_NEW_SYN_RECV)
                reqsk_free(inet_reqsk(sk));
        else
                sk_free(sk);
}
EXPORT_SYMBOL_GPL(sock_gen_put);

void sock_edemux(struct sk_buff *skb)
{
        sock_gen_put(skb->sk);
}
EXPORT_SYMBOL(sock_edemux);
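
/* The established lookup below runs under RCU without taking the bucket
 * lock: a match first takes a reference with refcount_inc_not_zero() and
 * then re-checks the keys, and hitting an unexpected nulls marker means the
 * socket moved to another chain, so the walk restarts from the beginning.
 */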

struct sock *__inet_lookup_established(const struct net *net,
                                       struct inet_hashinfo *hashinfo,
                                       const __be32 saddr, const __be16 sport,
                                       const __be32 daddr, const u16 hnum,
                                       const int dif, const int sdif)
{
        INET_ADDR_COOKIE(acookie, saddr, daddr);
        const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
        struct sock *sk;
        const struct hlist_nulls_node *node;
        /* Optimize here for direct hit, only listening connections can
         * have wildcards anyways.
         */
        unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
        unsigned int slot = hash & hashinfo->ehash_mask;
        struct inet_ehash_bucket *head = &hashinfo->ehash[slot];

begin:
        sk_nulls_for_each_rcu(sk, node, &head->chain) {
                if (sk->sk_hash != hash)
                        continue;
                if (likely(inet_match(net, sk, acookie, ports, dif, sdif))) {
                        if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
                                goto out;
                        if (unlikely(!inet_match(net, sk, acookie,
                                                 ports, dif, sdif))) {
                                sock_gen_put(sk);
                                goto begin;
                        }
                        goto found;
                }
        }
        /*
         * if the nulls value we got at the end of this lookup is
         * not the expected one, we must restart lookup.
         * We probably met an item that was moved to another chain.
         */
        if (get_nulls_value(node) != slot)
                goto begin;
out:
        sk = NULL;
found:
        return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);
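
/* __inet_check_established() is the uniqueness check used while selecting a
 * source port for connect(): it returns -EADDRNOTAVAIL if the four-tuple is
 * already taken, except that a matching TIME_WAIT socket may be recycled
 * when tcp_twsk_unique() allows it.
 */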

/* called with local bh disabled */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
                                    struct sock *sk, __u16 lport,
                                    struct inet_timewait_sock **twp,
                                    bool rcu_lookup,
                                    u32 hash)
{
        struct inet_hashinfo *hinfo = death_row->hashinfo;
        struct inet_sock *inet = inet_sk(sk);
        __be32 daddr = inet->inet_rcv_saddr;
        __be32 saddr = inet->inet_daddr;
        int dif = sk->sk_bound_dev_if;
        struct net *net = sock_net(sk);
        int sdif = l3mdev_master_ifindex_by_index(net, dif);
        INET_ADDR_COOKIE(acookie, saddr, daddr);
        const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
        struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
        struct inet_timewait_sock *tw = NULL;
        const struct hlist_nulls_node *node;
        struct sock *sk2;
        spinlock_t *lock;

        if (rcu_lookup) {
                sk_nulls_for_each(sk2, node, &head->chain) {
                        if (sk2->sk_hash != hash ||
                            !inet_match(net, sk2, acookie, ports, dif, sdif))
                                continue;
                        if (sk2->sk_state == TCP_TIME_WAIT)
                                break;
                        return -EADDRNOTAVAIL;
                }
                return 0;
        }

        lock = inet_ehash_lockp(hinfo, hash);
        spin_lock(lock);

        sk_nulls_for_each(sk2, node, &head->chain) {
                if (sk2->sk_hash != hash)
                        continue;

                if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) {
                        if (sk2->sk_state == TCP_TIME_WAIT) {
                                tw = inet_twsk(sk2);
                                if (sk->sk_protocol == IPPROTO_TCP &&
                                    tcp_twsk_unique(sk, sk2, twp))
                                        break;
                        }
                        goto not_unique;
                }
        }

        /* Must record num and sport now. Otherwise we will see a socket
         * with a funny identity in the hash table.
         */
        inet->inet_num = lport;
        inet->inet_sport = htons(lport);
        sk->sk_hash = hash;
        WARN_ON(!sk_unhashed(sk));
        __sk_nulls_add_node_rcu(sk, &head->chain);
        if (tw) {
                sk_nulls_del_node_init_rcu((struct sock *)tw);
                __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED);
        }
        spin_unlock(lock);
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

        if (twp) {
                *twp = tw;
        } else if (tw) {
                /* Silly. Should hash-dance instead... */
                inet_twsk_deschedule_put(tw);
        }
        return 0;

not_unique:
        spin_unlock(lock);
        return -EADDRNOTAVAIL;
}

static u64 inet_sk_port_offset(const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);

        return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
                                          inet->inet_daddr,
                                          inet->inet_dport);
}

/* Searches for an existing socket in the ehash bucket list.
 * Returns true if found, false otherwise.
 */
static bool inet_ehash_lookup_by_sk(struct sock *sk,
                                    struct hlist_nulls_head *list)
{
        const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num);
        const int sdif = sk->sk_bound_dev_if;
        const int dif = sk->sk_bound_dev_if;
        const struct hlist_nulls_node *node;
        struct net *net = sock_net(sk);
        struct sock *esk;

        INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr);

        sk_nulls_for_each_rcu(esk, node, list) {
                if (esk->sk_hash != sk->sk_hash)
                        continue;
                if (sk->sk_family == AF_INET) {
                        if (unlikely(inet_match(net, esk, acookie,
                                                ports, dif, sdif))) {
                                return true;
                        }
                }
#if IS_ENABLED(CONFIG_IPV6)
                else if (sk->sk_family == AF_INET6) {
                        if (unlikely(inet6_match(net, esk,
                                                 &sk->sk_v6_daddr,
                                                 &sk->sk_v6_rcv_saddr,
                                                 ports, dif, sdif))) {
                                return true;
                        }
                }
#endif
        }
        return false;
}

/* Insert a socket into ehash, and eventually remove another one
 * (the other one can be a SYN_RECV or TIMEWAIT socket).
 * If a matching socket already exists, sk is not inserted and the
 * found_dup_sk parameter is set to true.
 */
bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
{
        struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
        struct inet_ehash_bucket *head;
        struct hlist_nulls_head *list;
        spinlock_t *lock;
        bool ret = true;

        WARN_ON_ONCE(!sk_unhashed(sk));

        sk->sk_hash = sk_ehashfn(sk);
        head = inet_ehash_bucket(hashinfo, sk->sk_hash);
        list = &head->chain;
        lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

        spin_lock(lock);
        if (osk) {
                WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
                ret = sk_nulls_del_node_init_rcu(osk);
        } else if (found_dup_sk) {
                *found_dup_sk = inet_ehash_lookup_by_sk(sk, list);
                if (*found_dup_sk)
                        ret = false;
        }

        if (ret)
                __sk_nulls_add_node_rcu(sk, list);

        spin_unlock(lock);

        return ret;
}

bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk)
{
        bool ok = inet_ehash_insert(sk, osk, found_dup_sk);

        if (ok) {
                sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
        } else {
                this_cpu_inc(*sk->sk_prot->orphan_count);
                inet_sk_set_state(sk, TCP_CLOSE);
                sock_set_flag(sk, SOCK_DEAD);
                inet_csk_destroy_sock(sk);
        }
        return ok;
}
EXPORT_IPV6_MOD(inet_ehash_nolisten);
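
/* inet_reuseport_add_sock() joins a listener to an existing SO_REUSEPORT
 * group when a compatible socket (same family, bound device, bind bucket
 * and owning uid) is already present in the listen bucket, and otherwise
 * allocates a new group for it.
 */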

static int inet_reuseport_add_sock(struct sock *sk,
                                   struct inet_listen_hashbucket *ilb)
{
        struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
        const struct hlist_nulls_node *node;
        kuid_t uid = sk_uid(sk);
        struct sock *sk2;

        sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) {
                if (sk2 != sk &&
                    sk2->sk_family == sk->sk_family &&
                    ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
                    sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
                    inet_csk(sk2)->icsk_bind_hash == tb &&
                    sk2->sk_reuseport && uid_eq(uid, sk_uid(sk2)) &&
                    inet_rcv_saddr_equal(sk, sk2, false))
                        return reuseport_add_sock(sk, sk2,
                                                  inet_rcv_saddr_any(sk));
        }

        return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
}

int __inet_hash(struct sock *sk, struct sock *osk)
{
        struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
        struct inet_listen_hashbucket *ilb2;
        int err = 0;

        if (sk->sk_state != TCP_LISTEN) {
                local_bh_disable();
                inet_ehash_nolisten(sk, osk, NULL);
                local_bh_enable();
                return 0;
        }
        WARN_ON(!sk_unhashed(sk));
        ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);

        spin_lock(&ilb2->lock);
        if (sk->sk_reuseport) {
                err = inet_reuseport_add_sock(sk, ilb2);
                if (err)
                        goto unlock;
        }
        sock_set_flag(sk, SOCK_RCU_FREE);
        if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
            sk->sk_family == AF_INET6)
                __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
        else
                __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
unlock:
        spin_unlock(&ilb2->lock);

        return err;
}
EXPORT_IPV6_MOD(__inet_hash);

int inet_hash(struct sock *sk)
{
        int err = 0;

        if (sk->sk_state != TCP_CLOSE)
                err = __inet_hash(sk, NULL);

        return err;
}

void inet_unhash(struct sock *sk)
{
        struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);

        if (sk_unhashed(sk))
                return;

        sock_rps_delete_flow(sk);
        if (sk->sk_state == TCP_LISTEN) {
                struct inet_listen_hashbucket *ilb2;

                ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);
                /* Don't disable bottom halves while acquiring the lock to
                 * avoid circular locking dependency on PREEMPT_RT.
                 */
                spin_lock(&ilb2->lock);
                if (sk_unhashed(sk)) {
                        spin_unlock(&ilb2->lock);
                        return;
                }

                if (rcu_access_pointer(sk->sk_reuseport_cb))
                        reuseport_stop_listen_sock(sk);

                __sk_nulls_del_node_init_rcu(sk);
                sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
                spin_unlock(&ilb2->lock);
        } else {
                spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

                spin_lock_bh(lock);
                if (sk_unhashed(sk)) {
                        spin_unlock_bh(lock);
                        return;
                }
                __sk_nulls_del_node_init_rcu(sk);
                sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
                spin_unlock_bh(lock);
        }
}
EXPORT_IPV6_MOD(inet_unhash);

static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb,
                                    const struct net *net, unsigned short port,
                                    int l3mdev, const struct sock *sk)
{
        if (!net_eq(ib2_net(tb), net) || tb->port != port ||
            tb->l3mdev != l3mdev)
                return false;

        return inet_bind2_bucket_addr_match(tb, sk);
}

bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net,
                                      unsigned short port, int l3mdev, const struct sock *sk)
{
        if (!net_eq(ib2_net(tb), net) || tb->port != port ||
            tb->l3mdev != l3mdev)
                return false;

#if IS_ENABLED(CONFIG_IPV6)
        if (tb->addr_type == IPV6_ADDR_ANY)
                return true;

        if (tb->addr_type != IPV6_ADDR_MAPPED)
                return false;

        if (sk->sk_family == AF_INET6 &&
            !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
                return false;
#endif
        return tb->rcv_saddr == 0;
}

/* The socket's bhash2 hashbucket spinlock must be held when this is called */
struct inet_bind2_bucket *
inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net,
                       unsigned short port, int l3mdev, const struct sock *sk)
{
        struct inet_bind2_bucket *bhash2 = NULL;

        inet_bind_bucket_for_each(bhash2, &head->chain)
                if (inet_bind2_bucket_match(bhash2, net, port, l3mdev, sk))
                        break;

        return bhash2;
}

struct inet_bind_hashbucket *
inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port)
{
        struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk);
        u32 hash;

#if IS_ENABLED(CONFIG_IPV6)
        if (sk->sk_family == AF_INET6)
                hash = ipv6_portaddr_hash(net, &in6addr_any, port);
        else
#endif
                hash = ipv4_portaddr_hash(net, 0, port);

        return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)];
}

static void inet_update_saddr(struct sock *sk, void *saddr, int family)
{
        if (family == AF_INET) {
                inet_sk(sk)->inet_saddr = *(__be32 *)saddr;
                sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr);
        }
#if IS_ENABLED(CONFIG_IPV6)
        else {
                sk->sk_v6_rcv_saddr = *(struct in6_addr *)saddr;
        }
#endif
}
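
/* Rehash a bound socket after its source address changes: the socket is
 * unlinked from its current (address, port) bucket, the address is updated,
 * and the socket is linked into the bucket matching the new address.  The
 * replacement bucket is allocated up front so the move cannot fail halfway.
 */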

static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset)
{
        struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk);
        struct inet_bind_hashbucket *head, *head2;
        struct inet_bind2_bucket *tb2, *new_tb2;
        int l3mdev = inet_sk_bound_l3mdev(sk);
        int port = inet_sk(sk)->inet_num;
        struct net *net = sock_net(sk);
        int bhash;

        if (!inet_csk(sk)->icsk_bind2_hash) {
                /* Not bind()ed before. */
                if (reset)
                        inet_reset_saddr(sk);
                else
                        inet_update_saddr(sk, saddr, family);

                return 0;
        }

        /* Allocate a bind2 bucket ahead of time to avoid permanently putting
         * the bhash2 table in an inconsistent state if a new tb2 bucket
         * allocation fails.
         */
        new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC);
        if (!new_tb2) {
                if (reset) {
                        /* The (INADDR_ANY, port) bucket might have already
                         * been freed, then we cannot fixup icsk_bind2_hash,
                         * so we give up and unlink sk from bhash/bhash2 not
                         * to leave inconsistency in bhash2.
                         */
                        inet_put_port(sk);
                        inet_reset_saddr(sk);
                }

                return -ENOMEM;
        }

        bhash = inet_bhashfn(net, port, hinfo->bhash_size);
        head = &hinfo->bhash[bhash];
        head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);

        /* If we change saddr locklessly, another thread
         * iterating over bhash might see corrupted address.
         */
        spin_lock_bh(&head->lock);

        spin_lock(&head2->lock);
        __sk_del_bind_node(sk);
        inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash);
        spin_unlock(&head2->lock);

        if (reset)
                inet_reset_saddr(sk);
        else
                inet_update_saddr(sk, saddr, family);

        head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);

        spin_lock(&head2->lock);
        tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
        if (!tb2) {
                tb2 = new_tb2;
                inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk);
        }
        inet_csk(sk)->icsk_bind2_hash = tb2;
        sk_add_bind_node(sk, &tb2->owners);
        spin_unlock(&head2->lock);

        spin_unlock_bh(&head->lock);

        if (tb2 != new_tb2)
                kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2);

        return 0;
}

int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family)
{
        return __inet_bhash2_update_saddr(sk, saddr, family, false);
}
EXPORT_IPV6_MOD(inet_bhash2_update_saddr);

void inet_bhash2_reset_saddr(struct sock *sk)
{
        if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
                __inet_bhash2_update_saddr(sk, NULL, 0, true);
}
EXPORT_IPV6_MOD(inet_bhash2_reset_saddr);

/* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm
 * Note that we use 32bit integers (vs RFC 'short integers')
 * because 2^16 is not a multiple of num_ephemeral and this
 * property might be used by a clever attacker.
 *
 * The RFC claims that using TABLE_LENGTH=10 buckets gives an improvement,
 * though attacks have since been demonstrated, so we use 65536 buckets by
 * default instead, to give more isolation and privacy, at the expense of
 * 256kB of kernel memory.
 */
#define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER)
static u32 *table_perturb;
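
/* Rough sketch of the ephemeral port search done by __inet_hash_connect()
 * below, assuming the default (non-local) port range:
 *
 *	index  = port_offset & (INET_TABLE_PERTURB_SIZE - 1);
 *	offset = (table_perturb[index] + (port_offset >> 32)) % remaining;
 *
 * then ports low + offset, low + offset + 2, ... of one parity are scanned,
 * followed by the other parity, skipping reserved ports and buckets created
 * by bind(), until check_established() accepts a candidate.
 */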

int __inet_hash_connect(struct inet_timewait_death_row *death_row,
                        struct sock *sk, u64 port_offset,
                        u32 hash_port0,
                        int (*check_established)(struct inet_timewait_death_row *,
                                struct sock *, __u16, struct inet_timewait_sock **,
                                bool rcu_lookup, u32 hash))
{
        struct inet_hashinfo *hinfo = death_row->hashinfo;
        struct inet_bind_hashbucket *head, *head2;
        struct inet_timewait_sock *tw = NULL;
        int port = inet_sk(sk)->inet_num;
        struct net *net = sock_net(sk);
        struct inet_bind2_bucket *tb2;
        struct inet_bind_bucket *tb;
        bool tb_created = false;
        u32 remaining, offset;
        int ret, i, low, high;
        bool local_ports;
        int step, l3mdev;
        u32 index;

        if (port) {
                local_bh_disable();
                ret = check_established(death_row, sk, port, NULL, false,
                                        hash_port0 + port);
                local_bh_enable();
                return ret;
        }

        l3mdev = inet_sk_bound_l3mdev(sk);

        local_ports = inet_sk_get_local_port_range(sk, &low, &high);
        step = local_ports ? 1 : 2;

        high++; /* [32768, 60999] -> [32768, 61000[ */
        remaining = high - low;
        if (!local_ports && remaining > 1)
                remaining &= ~1U;

        get_random_sleepable_once(table_perturb,
                                  INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb));
        index = port_offset & (INET_TABLE_PERTURB_SIZE - 1);

        offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32);
        offset %= remaining;

        /* In the first pass we try ports of @low parity.
         * inet_csk_get_port() makes the opposite choice.
         */
        if (!local_ports)
                offset &= ~1U;
other_parity_scan:
        port = low + offset;
        for (i = 0; i < remaining; i += step, port += step) {
                if (unlikely(port >= high))
                        port -= remaining;
                if (inet_is_local_reserved_port(net, port))
                        continue;
                head = &hinfo->bhash[inet_bhashfn(net, port,
                                                  hinfo->bhash_size)];
                rcu_read_lock();
                hlist_for_each_entry_rcu(tb, &head->chain, node) {
                        if (!inet_bind_bucket_match(tb, net, port, l3mdev))
                                continue;
                        if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) {
                                rcu_read_unlock();
                                goto next_port;
                        }
                        if (!check_established(death_row, sk, port, &tw, true,
                                               hash_port0 + port))
                                break;
                        rcu_read_unlock();
                        goto next_port;
                }
                rcu_read_unlock();

                spin_lock_bh(&head->lock);

                /* Does not bother with rcv_saddr checks, because
                 * the established check is already unique enough.
                 */
                inet_bind_bucket_for_each(tb, &head->chain) {
                        if (inet_bind_bucket_match(tb, net, port, l3mdev)) {
                                if (tb->fastreuse >= 0 ||
                                    tb->fastreuseport >= 0)
                                        goto next_port_unlock;
                                WARN_ON(hlist_empty(&tb->bhash2));
                                if (!check_established(death_row, sk,
                                                       port, &tw, false,
                                                       hash_port0 + port))
                                        goto ok;
                                goto next_port_unlock;
                        }
                }

                tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
                                             net, head, port, l3mdev);
                if (!tb) {
                        spin_unlock_bh(&head->lock);
                        return -ENOMEM;
                }
                tb_created = true;
                tb->fastreuse = -1;
                tb->fastreuseport = -1;
                goto ok;
next_port_unlock:
                spin_unlock_bh(&head->lock);
next_port:
                cond_resched();
        }

        if (!local_ports) {
                offset++;
                if ((offset & 1) && remaining > 1)
                        goto other_parity_scan;
        }
        return -EADDRNOTAVAIL;

ok:
        /* Find the corresponding tb2 bucket since we need to
         * add the socket to the bhash2 table as well
         */
        head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
        spin_lock(&head2->lock);

        tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
        if (!tb2) {
                tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net,
                                               head2, tb, sk);
                if (!tb2)
                        goto error;
        }

        /* Here we want to add a little bit of randomness to the next source
         * port that will be chosen. We use a max() with a random here so that
         * on low contention the randomness is maximal and on high contention
         * it may be nonexistent.
         */
        i = max_t(int, i, get_random_u32_below(8) * step);
        WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + step);

        /* Head lock still held and bh's disabled */
        inet_bind_hash(sk, tb, tb2, port);

        if (sk_unhashed(sk)) {
                inet_sk(sk)->inet_sport = htons(port);
                inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
        }
        if (tw)
                inet_twsk_bind_unhash(tw, hinfo);

        spin_unlock(&head2->lock);
        spin_unlock(&head->lock);

        if (tw)
                inet_twsk_deschedule_put(tw);
        local_bh_enable();
        return 0;

error:
        if (sk_hashed(sk)) {
                spinlock_t *lock = inet_ehash_lockp(hinfo, sk->sk_hash);

                sock_prot_inuse_add(net, sk->sk_prot, -1);

                spin_lock(lock);
                __sk_nulls_del_node_init_rcu(sk);
                spin_unlock(lock);

                sk->sk_hash = 0;
                inet_sk(sk)->inet_sport = 0;
                inet_sk(sk)->inet_num = 0;

                if (tw)
                        inet_twsk_bind_unhash(tw, hinfo);
        }

        spin_unlock(&head2->lock);
        if (tb_created)
                inet_bind_bucket_destroy(tb);
        spin_unlock(&head->lock);

        if (tw)
                inet_twsk_deschedule_put(tw);

        local_bh_enable();

        return -ENOMEM;
}

/*
 * Bind a port for a connect operation and hash it.
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
                      struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct net *net = sock_net(sk);
        u64 port_offset = 0;
        u32 hash_port0;

        if (!inet_sk(sk)->inet_num)
                port_offset = inet_sk_port_offset(sk);

        hash_port0 = inet_ehashfn(net, inet->inet_rcv_saddr, 0,
                                  inet->inet_daddr, inet->inet_dport);

        return __inet_hash_connect(death_row, sk, port_offset, hash_port0,
                                   __inet_check_established);
}

static void init_hashinfo_lhash2(struct inet_hashinfo *h)
{
        int i;

        for (i = 0; i <= h->lhash2_mask; i++) {
                spin_lock_init(&h->lhash2[i].lock);
                INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head,
                                      i + LISTENING_NULLS_BASE);
        }
}

void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
                                unsigned long numentries, int scale,
                                unsigned long low_limit,
                                unsigned long high_limit)
{
        h->lhash2 = alloc_large_system_hash(name,
                                            sizeof(*h->lhash2),
                                            numentries,
                                            scale,
                                            0,
                                            NULL,
                                            &h->lhash2_mask,
                                            low_limit,
                                            high_limit);
        init_hashinfo_lhash2(h);

        /* this one is used for source ports of outgoing connections */
        table_perturb = alloc_large_system_hash("Table-perturb",
                                                sizeof(*table_perturb),
                                                INET_TABLE_PERTURB_SIZE,
                                                0, 0, NULL, NULL,
                                                INET_TABLE_PERTURB_SIZE,
                                                INET_TABLE_PERTURB_SIZE);
}

int inet_hashinfo2_init_mod(struct inet_hashinfo *h)
{
        h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL);
        if (!h->lhash2)
                return -ENOMEM;

        h->lhash2_mask = INET_LHTABLE_SIZE - 1;
        /* INET_LHTABLE_SIZE must be a power of 2 */
        BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask);

        init_hashinfo_lhash2(h);
        return 0;
}
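
/* The ehash spinlocks are striped rather than per bucket: the array below
 * is sized from the CPU and NUMA node counts, rounded up to a power of two
 * and capped at one lock per hash bucket; ehash_locks_mask then maps a hash
 * value to its lock.
 */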

int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
{
        unsigned int locksz = sizeof(spinlock_t);
        unsigned int i, nblocks = 1;
        spinlock_t *ptr = NULL;

        if (locksz == 0)
                goto set_mask;

        /* Allocate 2 cache lines or at least one spinlock per cpu. */
        nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U) * num_possible_cpus();

        /* At least one page per NUMA node. */
        nblocks = max(nblocks, num_online_nodes() * PAGE_SIZE / locksz);

        nblocks = roundup_pow_of_two(nblocks);

        /* No more locks than number of hash buckets. */
        nblocks = min(nblocks, hashinfo->ehash_mask + 1);

        if (num_online_nodes() > 1) {
                /* Use vmalloc() to allow NUMA policy to spread pages
                 * on all available nodes if desired.
                 */
                ptr = vmalloc_array(nblocks, locksz);
        }
        if (!ptr) {
                ptr = kvmalloc_array(nblocks, locksz, GFP_KERNEL);
                if (!ptr)
                        return -ENOMEM;
        }
        for (i = 0; i < nblocks; i++)
                spin_lock_init(&ptr[i]);
        hashinfo->ehash_locks = ptr;
set_mask:
        hashinfo->ehash_locks_mask = nblocks - 1;
        return 0;
}

struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo,
                                                 unsigned int ehash_entries)
{
        struct inet_hashinfo *new_hashinfo;
        int i;

        new_hashinfo = kmemdup(hashinfo, sizeof(*hashinfo), GFP_KERNEL);
        if (!new_hashinfo)
                goto err;

        new_hashinfo->ehash = vmalloc_huge(ehash_entries * sizeof(struct inet_ehash_bucket),
                                           GFP_KERNEL_ACCOUNT);
        if (!new_hashinfo->ehash)
                goto free_hashinfo;

        new_hashinfo->ehash_mask = ehash_entries - 1;

        if (inet_ehash_locks_alloc(new_hashinfo))
                goto free_ehash;

        for (i = 0; i < ehash_entries; i++)
                INIT_HLIST_NULLS_HEAD(&new_hashinfo->ehash[i].chain, i);

        new_hashinfo->pernet = true;

        return new_hashinfo;

free_ehash:
        vfree(new_hashinfo->ehash);
free_hashinfo:
        kfree(new_hashinfo);
err:
        return NULL;
}

void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo)
{
        if (!hashinfo->pernet)
                return;

        inet_ehash_locks_free(hashinfo);
        vfree(hashinfo->ehash);
        kfree(hashinfo);
}