GitHub Repository: torvalds/linux
Path: blob/master/net/mptcp/pm.c
// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2019, Intel Corporation.
 */
#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/rculist.h>
#include <linux/spinlock.h>
#include "protocol.h"
#include "mib.h"

#define ADD_ADDR_RETRANS_MAX	3

struct mptcp_pm_add_entry {
	struct list_head	list;
	struct mptcp_addr_info	addr;
	u8			retrans_times;
	struct timer_list	add_timer;
	struct mptcp_sock	*sock;
};

static DEFINE_SPINLOCK(mptcp_pm_list_lock);
static LIST_HEAD(mptcp_pm_list);

/* path manager helpers */

/* if sk is ipv4 or ipv6_only allows only same-family local and remote addresses,
 * otherwise allow any matching local/remote pair
 */
bool mptcp_pm_addr_families_match(const struct sock *sk,
				  const struct mptcp_addr_info *loc,
				  const struct mptcp_addr_info *rem)
{
	bool mptcp_is_v4 = sk->sk_family == AF_INET;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	bool loc_is_v4 = loc->family == AF_INET || ipv6_addr_v4mapped(&loc->addr6);
	bool rem_is_v4 = rem->family == AF_INET || ipv6_addr_v4mapped(&rem->addr6);

	if (mptcp_is_v4)
		return loc_is_v4 && rem_is_v4;

	if (ipv6_only_sock(sk))
		return !loc_is_v4 && !rem_is_v4;

	return loc_is_v4 == rem_is_v4;
#else
	return mptcp_is_v4 && loc->family == AF_INET && rem->family == AF_INET;
#endif
}

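/* Compare two MPTCP address info structs: an IPv4 address and the matching
 * v4-mapped IPv6 address are treated as equal; ports are only compared when
 * use_port is true.
 */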
bool mptcp_addresses_equal(const struct mptcp_addr_info *a,
			   const struct mptcp_addr_info *b, bool use_port)
{
	bool addr_equals = false;

	if (a->family == b->family) {
		if (a->family == AF_INET)
			addr_equals = a->addr.s_addr == b->addr.s_addr;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
		else
			addr_equals = ipv6_addr_equal(&a->addr6, &b->addr6);
	} else if (a->family == AF_INET) {
		if (ipv6_addr_v4mapped(&b->addr6))
			addr_equals = a->addr.s_addr == b->addr6.s6_addr32[3];
	} else if (b->family == AF_INET) {
		if (ipv6_addr_v4mapped(&a->addr6))
			addr_equals = a->addr6.s6_addr32[3] == b->addr.s_addr;
#endif
	}

	if (!addr_equals)
		return false;
	if (!use_port)
		return true;

	return a->port == b->port;
}

void mptcp_local_address(const struct sock_common *skc,
			 struct mptcp_addr_info *addr)
{
	addr->family = skc->skc_family;
	addr->port = htons(skc->skc_num);
	if (addr->family == AF_INET)
		addr->addr.s_addr = skc->skc_rcv_saddr;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	else if (addr->family == AF_INET6)
		addr->addr6 = skc->skc_v6_rcv_saddr;
#endif
}

void mptcp_remote_address(const struct sock_common *skc,
			  struct mptcp_addr_info *addr)
{
	addr->family = skc->skc_family;
	addr->port = skc->skc_dport;
	if (addr->family == AF_INET)
		addr->addr.s_addr = skc->skc_daddr;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	else if (addr->family == AF_INET6)
		addr->addr6 = skc->skc_v6_daddr;
#endif
}

static bool mptcp_pm_is_init_remote_addr(struct mptcp_sock *msk,
					 const struct mptcp_addr_info *remote)
{
	struct mptcp_addr_info mpc_remote;

	mptcp_remote_address((struct sock_common *)msk, &mpc_remote);
	return mptcp_addresses_equal(&mpc_remote, remote, remote->port);
}

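/* Return true if any subflow on the given conn_list is bound to the local
 * address @saddr (port included).
 */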
bool mptcp_lookup_subflow_by_saddr(const struct list_head *list,
				   const struct mptcp_addr_info *saddr)
{
	struct mptcp_subflow_context *subflow;
	struct mptcp_addr_info cur;
	struct sock_common *skc;

	list_for_each_entry(subflow, list, node) {
		skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow);

		mptcp_local_address(skc, &cur);
		if (mptcp_addresses_equal(&cur, saddr, saddr->port))
			return true;
	}

	return false;
}

static struct mptcp_pm_add_entry *
mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk,
				const struct mptcp_addr_info *addr)
{
	struct mptcp_pm_add_entry *entry;

	lockdep_assert_held(&msk->pm.lock);

	list_for_each_entry(entry, &msk->pm.anno_list, list) {
		if (mptcp_addresses_equal(&entry->addr, addr, true))
			return entry;
	}

	return NULL;
}

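/* Remove the anno_list entry matching @addr, stopping its ADD_ADDR
 * retransmission timer; return true if an entry was found and freed.
 */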
bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk,
				     const struct mptcp_addr_info *addr)
{
	struct mptcp_pm_add_entry *entry;
	bool ret;

	entry = mptcp_pm_del_add_timer(msk, addr, false);
	ret = entry;
	kfree(entry);

	return ret;
}

bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk)
{
	struct mptcp_pm_add_entry *entry;
	struct mptcp_addr_info saddr;
	bool ret = false;

	mptcp_local_address((struct sock_common *)sk, &saddr);

	spin_lock_bh(&msk->pm.lock);
	list_for_each_entry(entry, &msk->pm.anno_list, list) {
		if (mptcp_addresses_equal(&entry->addr, &saddr, true)) {
			ret = true;
			goto out;
		}
	}

out:
	spin_unlock_bh(&msk->pm.lock);
	return ret;
}

static void __mptcp_pm_send_ack(struct mptcp_sock *msk,
				struct mptcp_subflow_context *subflow,
				bool prio, bool backup)
{
	struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
	bool slow;

	pr_debug("send ack for %s\n",
		 prio ? "mp_prio" :
		 (mptcp_pm_should_add_signal(msk) ? "add_addr" : "rm_addr"));

	slow = lock_sock_fast(ssk);
	if (prio) {
		subflow->send_mp_prio = 1;
		subflow->request_bkup = backup;
	}

	__mptcp_subflow_send_ack(ssk);
	unlock_sock_fast(ssk, slow);
}

void mptcp_pm_send_ack(struct mptcp_sock *msk,
		       struct mptcp_subflow_context *subflow,
		       bool prio, bool backup)
{
	spin_unlock_bh(&msk->pm.lock);
	__mptcp_pm_send_ack(msk, subflow, prio, backup);
	spin_lock_bh(&msk->pm.lock);
}

void mptcp_pm_addr_send_ack(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow, *alt = NULL;

	msk_owned_by_me(msk);
	lockdep_assert_held(&msk->pm.lock);

	if (!mptcp_pm_should_add_signal(msk) &&
	    !mptcp_pm_should_rm_signal(msk))
		return;

	mptcp_for_each_subflow(msk, subflow) {
		if (__mptcp_subflow_active(subflow)) {
			if (!subflow->stale) {
				mptcp_pm_send_ack(msk, subflow, false, false);
				return;
			}

			if (!alt)
				alt = subflow;
		}
	}

	if (alt)
		mptcp_pm_send_ack(msk, alt, false, false);
}

int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk,
			      struct mptcp_addr_info *addr,
			      struct mptcp_addr_info *rem,
			      u8 bkup)
{
	struct mptcp_subflow_context *subflow;

	pr_debug("bkup=%d\n", bkup);

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		struct mptcp_addr_info local, remote;

		mptcp_local_address((struct sock_common *)ssk, &local);
		if (!mptcp_addresses_equal(&local, addr, addr->port))
			continue;

		if (rem && rem->family != AF_UNSPEC) {
			mptcp_remote_address((struct sock_common *)ssk, &remote);
			if (!mptcp_addresses_equal(&remote, rem, rem->port))
				continue;
		}

		__mptcp_pm_send_ack(msk, subflow, true, bkup);
		return 0;
	}

	return -EINVAL;
}

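/* Timer callback for a pending ADD_ADDR announcement: re-announce the address
 * and re-arm the timer until ADD_ADDR_RETRANS_MAX retransmissions have been
 * sent.
 */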
static void mptcp_pm_add_timer(struct timer_list *timer)
{
	struct mptcp_pm_add_entry *entry = timer_container_of(entry, timer,
							       add_timer);
	struct mptcp_sock *msk = entry->sock;
	struct sock *sk = (struct sock *)msk;
	unsigned int timeout;

	pr_debug("msk=%p\n", msk);

	if (!msk)
		return;

	if (inet_sk_state_load(sk) == TCP_CLOSE)
		return;

	if (!entry->addr.id)
		return;

	if (mptcp_pm_should_add_signal_addr(msk)) {
		sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8);
		goto out;
	}

	timeout = mptcp_get_add_addr_timeout(sock_net(sk));
	if (!timeout)
		goto out;

	spin_lock_bh(&msk->pm.lock);

	if (!mptcp_pm_should_add_signal_addr(msk)) {
		pr_debug("retransmit ADD_ADDR id=%d\n", entry->addr.id);
		mptcp_pm_announce_addr(msk, &entry->addr, false);
		mptcp_pm_add_addr_send_ack(msk);
		entry->retrans_times++;
	}

	if (entry->retrans_times < ADD_ADDR_RETRANS_MAX)
		sk_reset_timer(sk, timer,
			       jiffies + timeout);

	spin_unlock_bh(&msk->pm.lock);

	if (entry->retrans_times == ADD_ADDR_RETRANS_MAX)
		mptcp_pm_subflow_established(msk);

out:
	__sock_put(sk);
}

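/* Stop the ADD_ADDR retransmit timer for the anno_list entry matching @addr
 * (and the address id, when check_id is set). Only when check_id is false is
 * the entry also unlinked from the list; the found entry is returned.
 */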
struct mptcp_pm_add_entry *
mptcp_pm_del_add_timer(struct mptcp_sock *msk,
		       const struct mptcp_addr_info *addr, bool check_id)
{
	struct mptcp_pm_add_entry *entry;
	struct sock *sk = (struct sock *)msk;
	struct timer_list *add_timer = NULL;

	spin_lock_bh(&msk->pm.lock);
	entry = mptcp_lookup_anno_list_by_saddr(msk, addr);
	if (entry && (!check_id || entry->addr.id == addr->id)) {
		entry->retrans_times = ADD_ADDR_RETRANS_MAX;
		add_timer = &entry->add_timer;
	}
	if (!check_id && entry)
		list_del(&entry->list);
	spin_unlock_bh(&msk->pm.lock);

	/* no lock, because sk_stop_timer_sync() is calling timer_delete_sync() */
	if (add_timer)
		sk_stop_timer_sync(sk, add_timer);

	return entry;
}

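/* Add (or re-arm) an anno_list entry for @addr: the associated timer drives
 * ADD_ADDR retransmissions while the announcement stays unacknowledged.
 */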
bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
			      const struct mptcp_addr_info *addr)
{
	struct mptcp_pm_add_entry *add_entry = NULL;
	struct sock *sk = (struct sock *)msk;
	struct net *net = sock_net(sk);
	unsigned int timeout;

	lockdep_assert_held(&msk->pm.lock);

	add_entry = mptcp_lookup_anno_list_by_saddr(msk, addr);

	if (add_entry) {
		if (WARN_ON_ONCE(mptcp_pm_is_kernel(msk)))
			return false;

		goto reset_timer;
	}

	add_entry = kmalloc(sizeof(*add_entry), GFP_ATOMIC);
	if (!add_entry)
		return false;

	list_add(&add_entry->list, &msk->pm.anno_list);

	add_entry->addr = *addr;
	add_entry->sock = msk;
	add_entry->retrans_times = 0;

	timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0);
reset_timer:
	timeout = mptcp_get_add_addr_timeout(net);
	if (timeout)
		sk_reset_timer(sk, &add_entry->add_timer, jiffies + timeout);

	return true;
}

static void mptcp_pm_free_anno_list(struct mptcp_sock *msk)
{
	struct mptcp_pm_add_entry *entry, *tmp;
	struct sock *sk = (struct sock *)msk;
	LIST_HEAD(free_list);

	pr_debug("msk=%p\n", msk);

	spin_lock_bh(&msk->pm.lock);
	list_splice_init(&msk->pm.anno_list, &free_list);
	spin_unlock_bh(&msk->pm.lock);

	list_for_each_entry_safe(entry, tmp, &free_list, list) {
		sk_stop_timer_sync(sk, &entry->add_timer);
		kfree(entry);
	}
}

/* path manager command handlers */

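/* Record an ADD_ADDR announcement (or its echo) to be signalled on the next
 * opportunity: store the address in pm.local/pm.remote and set the matching
 * bit in pm.addr_signal. Fails if an announcement of that kind is already
 * pending.
 */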
int mptcp_pm_announce_addr(struct mptcp_sock *msk,
			   const struct mptcp_addr_info *addr,
			   bool echo)
{
	u8 add_addr = READ_ONCE(msk->pm.addr_signal);

	pr_debug("msk=%p, local_id=%d, echo=%d\n", msk, addr->id, echo);

	lockdep_assert_held(&msk->pm.lock);

	if (add_addr &
	    (echo ? BIT(MPTCP_ADD_ADDR_ECHO) : BIT(MPTCP_ADD_ADDR_SIGNAL))) {
		MPTCP_INC_STATS(sock_net((struct sock *)msk),
				echo ? MPTCP_MIB_ECHOADDTXDROP : MPTCP_MIB_ADDADDRTXDROP);
		return -EINVAL;
	}

	if (echo) {
		msk->pm.remote = *addr;
		add_addr |= BIT(MPTCP_ADD_ADDR_ECHO);
	} else {
		msk->pm.local = *addr;
		add_addr |= BIT(MPTCP_ADD_ADDR_SIGNAL);
	}
	WRITE_ONCE(msk->pm.addr_signal, add_addr);
	return 0;
}

int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list)
{
	u8 rm_addr = READ_ONCE(msk->pm.addr_signal);

	pr_debug("msk=%p, rm_list_nr=%d\n", msk, rm_list->nr);

	if (rm_addr) {
		MPTCP_ADD_STATS(sock_net((struct sock *)msk),
				MPTCP_MIB_RMADDRTXDROP, rm_list->nr);
		return -EINVAL;
	}

	msk->pm.rm_list_tx = *rm_list;
	rm_addr |= BIT(MPTCP_RM_ADDR_SIGNAL);
	WRITE_ONCE(msk->pm.addr_signal, rm_addr);
	mptcp_pm_addr_send_ack(msk);
	return 0;
}

/* path manager event handlers */

void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side)
{
	struct mptcp_pm_data *pm = &msk->pm;

	pr_debug("msk=%p, token=%u side=%d\n", msk, READ_ONCE(msk->token), server_side);

	WRITE_ONCE(pm->server_side, server_side);
	mptcp_event(MPTCP_EVENT_CREATED, msk, ssk, GFP_ATOMIC);
}

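/* Called when a new subflow (MP_JOIN) request comes in: decide whether the
 * connection may accept it, enforcing the configured subflow limit for the
 * in-kernel path manager and updating the per-connection subflow counter.
 */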
bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)
{
	struct mptcp_pm_data *pm = &msk->pm;
	unsigned int subflows_max;
	int ret = 0;

	if (mptcp_pm_is_userspace(msk)) {
		if (mptcp_userspace_pm_active(msk)) {
			spin_lock_bh(&pm->lock);
			pm->subflows++;
			spin_unlock_bh(&pm->lock);
			return true;
		}
		return false;
	}

	subflows_max = mptcp_pm_get_subflows_max(msk);

	pr_debug("msk=%p subflows=%d max=%d allow=%d\n", msk, pm->subflows,
		 subflows_max, READ_ONCE(pm->accept_subflow));

	/* try to avoid acquiring the lock below */
	if (!READ_ONCE(pm->accept_subflow))
		return false;

	spin_lock_bh(&pm->lock);
	if (READ_ONCE(pm->accept_subflow)) {
		ret = pm->subflows < subflows_max;
		if (ret && ++pm->subflows == subflows_max)
			WRITE_ONCE(pm->accept_subflow, false);
	}
	spin_unlock_bh(&pm->lock);

	return ret;
}

/* return true if the new status bit is currently cleared, that is, this event
 * can be served, eventually by an already scheduled work
 */
static bool mptcp_pm_schedule_work(struct mptcp_sock *msk,
				   enum mptcp_pm_status new_status)
{
	pr_debug("msk=%p status=%x new=%lx\n", msk, msk->pm.status,
		 BIT(new_status));
	if (msk->pm.status & BIT(new_status))
		return false;

	msk->pm.status |= BIT(new_status);
	mptcp_schedule_work((struct sock *)msk);
	return true;
}

void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk)
{
	struct mptcp_pm_data *pm = &msk->pm;
	bool announce = false;

	pr_debug("msk=%p\n", msk);

	spin_lock_bh(&pm->lock);

	/* mptcp_pm_fully_established() can be invoked by multiple
	 * racing paths - accept() and check_fully_established():
	 * be sure to serve this event only once.
	 */
	if (READ_ONCE(pm->work_pending) &&
	    !(pm->status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)))
		mptcp_pm_schedule_work(msk, MPTCP_PM_ESTABLISHED);

	if ((pm->status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)) == 0)
		announce = true;

	pm->status |= BIT(MPTCP_PM_ALREADY_ESTABLISHED);
	spin_unlock_bh(&pm->lock);

	if (announce)
		mptcp_event(MPTCP_EVENT_ESTABLISHED, msk, ssk, GFP_ATOMIC);
}

void mptcp_pm_connection_closed(struct mptcp_sock *msk)
{
	pr_debug("msk=%p\n", msk);

	if (msk->token)
		mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL);
}

void mptcp_pm_subflow_established(struct mptcp_sock *msk)
{
	struct mptcp_pm_data *pm = &msk->pm;

	pr_debug("msk=%p\n", msk);

	if (!READ_ONCE(pm->work_pending))
		return;

	spin_lock_bh(&pm->lock);

	if (READ_ONCE(pm->work_pending))
		mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED);

	spin_unlock_bh(&pm->lock);
}

void mptcp_pm_subflow_check_next(struct mptcp_sock *msk,
				 const struct mptcp_subflow_context *subflow)
{
	struct mptcp_pm_data *pm = &msk->pm;
	bool update_subflows;

	update_subflows = subflow->request_join || subflow->mp_join;
	if (mptcp_pm_is_userspace(msk)) {
		if (update_subflows) {
			spin_lock_bh(&pm->lock);
			pm->subflows--;
			spin_unlock_bh(&pm->lock);
		}
		return;
	}

	if (!READ_ONCE(pm->work_pending) && !update_subflows)
		return;

	spin_lock_bh(&pm->lock);
	if (update_subflows)
		__mptcp_pm_close_subflow(msk);

	/* Even if this subflow is not really established, tell the PM to try
	 * to pick the next ones, if possible.
	 */
	if (mptcp_pm_nl_check_work_pending(msk))
		mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED);

	spin_unlock_bh(&pm->lock);
}

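/* Handle an ADD_ADDR option received on subflow @ssk: either echo it back
 * right away, or record the remote address and schedule the PM worker via
 * MPTCP_PM_ADD_ADDR_RECEIVED, dropping the announcement if neither applies.
 */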
void mptcp_pm_add_addr_received(const struct sock *ssk,
				const struct mptcp_addr_info *addr)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
	struct mptcp_pm_data *pm = &msk->pm;

	pr_debug("msk=%p remote_id=%d accept=%d\n", msk, addr->id,
		 READ_ONCE(pm->accept_addr));

	mptcp_event_addr_announced(ssk, addr);

	spin_lock_bh(&pm->lock);

	if (mptcp_pm_is_userspace(msk)) {
		if (mptcp_userspace_pm_active(msk)) {
			mptcp_pm_announce_addr(msk, addr, true);
			mptcp_pm_add_addr_send_ack(msk);
		} else {
			__MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP);
		}
	/* id0 should not have a different address */
	} else if ((addr->id == 0 && !mptcp_pm_is_init_remote_addr(msk, addr)) ||
		   (addr->id > 0 && !READ_ONCE(pm->accept_addr))) {
		mptcp_pm_announce_addr(msk, addr, true);
		mptcp_pm_add_addr_send_ack(msk);
	} else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) {
		pm->remote = *addr;
	} else {
		__MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP);
	}

	spin_unlock_bh(&pm->lock);
}

void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk,
			      const struct mptcp_addr_info *addr)
{
	struct mptcp_pm_data *pm = &msk->pm;

	pr_debug("msk=%p\n", msk);

	if (!READ_ONCE(pm->work_pending))
		return;

	spin_lock_bh(&pm->lock);

	if (mptcp_lookup_anno_list_by_saddr(msk, addr) && READ_ONCE(pm->work_pending))
		mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED);

	spin_unlock_bh(&pm->lock);
}

void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk)
{
	if (!mptcp_pm_should_add_signal(msk))
		return;

	mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_SEND_ACK);
}

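/* Close every subflow matching an id in @rm_list: for RM_ADDR events the ids
 * are matched against the remote address id, for subflow removal against the
 * local id. MIB counters and the PM state are updated accordingly.
 */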
static void mptcp_pm_rm_addr_or_subflow(struct mptcp_sock *msk,
					const struct mptcp_rm_list *rm_list,
					enum linux_mptcp_mib_field rm_type)
{
	struct mptcp_subflow_context *subflow, *tmp;
	struct sock *sk = (struct sock *)msk;
	u8 i;

	pr_debug("%s rm_list_nr %d\n",
		 rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", rm_list->nr);

	msk_owned_by_me(msk);

	if (sk->sk_state == TCP_LISTEN)
		return;

	if (!rm_list->nr)
		return;

	if (list_empty(&msk->conn_list))
		return;

	for (i = 0; i < rm_list->nr; i++) {
		u8 rm_id = rm_list->ids[i];
		bool removed = false;

		mptcp_for_each_subflow_safe(msk, subflow, tmp) {
			struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
			u8 remote_id = READ_ONCE(subflow->remote_id);
			int how = RCV_SHUTDOWN | SEND_SHUTDOWN;
			u8 id = subflow_get_local_id(subflow);

			if ((1 << inet_sk_state_load(ssk)) &
			    (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING | TCPF_CLOSE))
				continue;
			if (rm_type == MPTCP_MIB_RMADDR && remote_id != rm_id)
				continue;
			if (rm_type == MPTCP_MIB_RMSUBFLOW && id != rm_id)
				continue;

			pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u mpc_id=%u\n",
				 rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow",
				 i, rm_id, id, remote_id, msk->mpc_endpoint_id);
			spin_unlock_bh(&msk->pm.lock);
			mptcp_subflow_shutdown(sk, ssk, how);
			removed |= subflow->request_join;

			/* the following takes care of updating the subflows counter */
			mptcp_close_ssk(sk, ssk, subflow);
			spin_lock_bh(&msk->pm.lock);

			if (rm_type == MPTCP_MIB_RMSUBFLOW)
				__MPTCP_INC_STATS(sock_net(sk), rm_type);
		}

		if (rm_type == MPTCP_MIB_RMADDR) {
			__MPTCP_INC_STATS(sock_net(sk), rm_type);
			if (removed && mptcp_pm_is_kernel(msk))
				mptcp_pm_nl_rm_addr(msk, rm_id);
		}
	}
}

static void mptcp_pm_rm_addr_recv(struct mptcp_sock *msk)
{
	mptcp_pm_rm_addr_or_subflow(msk, &msk->pm.rm_list_rx, MPTCP_MIB_RMADDR);
}

void mptcp_pm_rm_subflow(struct mptcp_sock *msk,
			 const struct mptcp_rm_list *rm_list)
{
	mptcp_pm_rm_addr_or_subflow(msk, rm_list, MPTCP_MIB_RMSUBFLOW);
}

void mptcp_pm_rm_addr_received(struct mptcp_sock *msk,
			       const struct mptcp_rm_list *rm_list)
{
	struct mptcp_pm_data *pm = &msk->pm;
	u8 i;

	pr_debug("msk=%p remote_ids_nr=%d\n", msk, rm_list->nr);

	for (i = 0; i < rm_list->nr; i++)
		mptcp_event_addr_removed(msk, rm_list->ids[i]);

	spin_lock_bh(&pm->lock);
	if (mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED))
		pm->rm_list_rx = *rm_list;
	else
		__MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_RMADDRDROP);
	spin_unlock_bh(&pm->lock);
}

void mptcp_pm_mp_prio_received(struct sock *ssk, u8 bkup)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct sock *sk = subflow->conn;
	struct mptcp_sock *msk;

	pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup);
	msk = mptcp_sk(sk);
	if (subflow->backup != bkup)
		subflow->backup = bkup;

	mptcp_event(MPTCP_EVENT_SUB_PRIORITY, msk, ssk, GFP_ATOMIC);
}

void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);

	pr_debug("fail_seq=%llu\n", fail_seq);

	/* After accepting the fail, we can't create any other subflows */
	spin_lock_bh(&msk->fallback_lock);
	if (!msk->allow_infinite_fallback) {
		spin_unlock_bh(&msk->fallback_lock);
		return;
	}
	msk->allow_subflows = false;
	spin_unlock_bh(&msk->fallback_lock);

	if (!subflow->fail_tout) {
		pr_debug("send MP_FAIL response and infinite map\n");

		subflow->send_mp_fail = 1;
		subflow->send_infinite_map = 1;
		tcp_send_ack(sk);
	} else {
		pr_debug("MP_FAIL response received\n");
		WRITE_ONCE(subflow->fail_tout, 0);
	}
}

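/* Called from the option-writing path: if an ADD_ADDR (or echo) is pending
 * and the remaining option space is large enough, copy the address to be
 * advertised into @addr and clear the corresponding pm.addr_signal bit.
 */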
bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb,
			      unsigned int opt_size, unsigned int remaining,
			      struct mptcp_addr_info *addr, bool *echo,
			      bool *drop_other_suboptions)
{
	int ret = false;
	u8 add_addr;
	u8 family;
	bool port;

	spin_lock_bh(&msk->pm.lock);

	/* double check after the lock is acquired */
	if (!mptcp_pm_should_add_signal(msk))
		goto out_unlock;

	/* always drop every other options for pure ack ADD_ADDR; this is a
	 * plain dup-ack from TCP perspective. The other MPTCP-relevant info,
	 * if any, will be carried by the 'original' TCP ack
	 */
	if (skb && skb_is_tcp_pure_ack(skb)) {
		remaining += opt_size;
		*drop_other_suboptions = true;
	}

	*echo = mptcp_pm_should_add_signal_echo(msk);
	port = !!(*echo ? msk->pm.remote.port : msk->pm.local.port);

	family = *echo ? msk->pm.remote.family : msk->pm.local.family;
	if (remaining < mptcp_add_addr_len(family, *echo, port))
		goto out_unlock;

	if (*echo) {
		*addr = msk->pm.remote;
		add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_ECHO);
	} else {
		*addr = msk->pm.local;
		add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_SIGNAL);
	}
	WRITE_ONCE(msk->pm.addr_signal, add_addr);
	ret = true;

out_unlock:
	spin_unlock_bh(&msk->pm.lock);
	return ret;
}

bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
			     struct mptcp_rm_list *rm_list)
{
	int ret = false, len;
	u8 rm_addr;

	spin_lock_bh(&msk->pm.lock);

	/* double check after the lock is acquired */
	if (!mptcp_pm_should_rm_signal(msk))
		goto out_unlock;

	rm_addr = msk->pm.addr_signal & ~BIT(MPTCP_RM_ADDR_SIGNAL);
	len = mptcp_rm_addr_len(&msk->pm.rm_list_tx);
	if (len < 0) {
		WRITE_ONCE(msk->pm.addr_signal, rm_addr);
		goto out_unlock;
	}
	if (remaining < len)
		goto out_unlock;

	*rm_list = msk->pm.rm_list_tx;
	WRITE_ONCE(msk->pm.addr_signal, rm_addr);
	ret = true;

out_unlock:
	spin_unlock_bh(&msk->pm.lock);
	return ret;
}

int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
{
	struct mptcp_pm_addr_entry skc_local = { 0 };
	struct mptcp_addr_info msk_local;

	if (WARN_ON_ONCE(!msk))
		return -1;

	/* The 0 ID mapping is defined by the first subflow, copied into the msk
	 * addr
	 */
	mptcp_local_address((struct sock_common *)msk, &msk_local);
	mptcp_local_address((struct sock_common *)skc, &skc_local.addr);
	if (mptcp_addresses_equal(&msk_local, &skc_local.addr, false))
		return 0;

	skc_local.addr.id = 0;
	skc_local.flags = MPTCP_PM_ADDR_FLAG_IMPLICIT;

	if (mptcp_pm_is_userspace(msk))
		return mptcp_userspace_pm_get_local_id(msk, &skc_local);
	return mptcp_pm_nl_get_local_id(msk, &skc_local);
}

bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc)
{
	struct mptcp_addr_info skc_local;

	mptcp_local_address((struct sock_common *)skc, &skc_local);

	if (mptcp_pm_is_userspace(msk))
		return mptcp_userspace_pm_is_backup(msk, &skc_local);

	return mptcp_pm_nl_is_backup(msk, &skc_local);
}

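/* @ssk has not made progress for a while: if at least one other healthy
 * subflow is available, mark @ssk as stale, re-inject its pending data and
 * try to push more data out on the remaining subflows.
 */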
static void mptcp_pm_subflows_chk_stale(const struct mptcp_sock *msk, struct sock *ssk)
{
	struct mptcp_subflow_context *iter, *subflow = mptcp_subflow_ctx(ssk);
	struct sock *sk = (struct sock *)msk;
	unsigned int active_max_loss_cnt;
	struct net *net = sock_net(sk);
	unsigned int stale_loss_cnt;
	bool slow;

	stale_loss_cnt = mptcp_stale_loss_cnt(net);
	if (subflow->stale || !stale_loss_cnt || subflow->stale_count <= stale_loss_cnt)
		return;

	/* look for another available subflow not in loss state */
	active_max_loss_cnt = max_t(int, stale_loss_cnt - 1, 1);
	mptcp_for_each_subflow(msk, iter) {
		if (iter != subflow && mptcp_subflow_active(iter) &&
		    iter->stale_count < active_max_loss_cnt) {
			/* we have some alternatives, try to mark this subflow as idle ...*/
			slow = lock_sock_fast(ssk);
			if (!tcp_rtx_and_write_queues_empty(ssk)) {
				subflow->stale = 1;
				__mptcp_retransmit_pending_data(sk);
				MPTCP_INC_STATS(net, MPTCP_MIB_SUBFLOWSTALE);
			}
			unlock_sock_fast(ssk, slow);

			/* always try to push the pending data regardless of re-injections:
			 * we can possibly use backup subflows now, and subflow selection
			 * is cheap under the msk socket lock
			 */
			__mptcp_push_pending(sk, 0);
			return;
		}
	}
}

void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	u32 rcv_tstamp = READ_ONCE(tcp_sk(ssk)->rcv_tstamp);

	/* keep track of rtx periods with no progress */
	if (!subflow->stale_count) {
		subflow->stale_rcv_tstamp = rcv_tstamp;
		subflow->stale_count++;
	} else if (subflow->stale_rcv_tstamp == rcv_tstamp) {
		if (subflow->stale_count < U8_MAX)
			subflow->stale_count++;
		mptcp_pm_subflows_chk_stale(msk, ssk);
	} else {
		subflow->stale_count = 0;
		mptcp_subflow_set_active(subflow);
	}
}

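/* Main PM work deferred to the MPTCP worker: handle pending ADD_ADDR acks and
 * received RM_ADDR lists, then let the in-kernel PM process its own events.
 */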
void mptcp_pm_worker(struct mptcp_sock *msk)
{
	struct mptcp_pm_data *pm = &msk->pm;

	msk_owned_by_me(msk);

	if (!(pm->status & MPTCP_PM_WORK_MASK))
		return;

	spin_lock_bh(&msk->pm.lock);

	pr_debug("msk=%p status=%x\n", msk, pm->status);
	if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) {
		pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK);
		mptcp_pm_addr_send_ack(msk);
	}
	if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) {
		pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED);
		mptcp_pm_rm_addr_recv(msk);
	}
	__mptcp_pm_kernel_worker(msk);

	spin_unlock_bh(&msk->pm.lock);
}

void mptcp_pm_destroy(struct mptcp_sock *msk)
{
	mptcp_pm_free_anno_list(msk);

	if (mptcp_pm_is_userspace(msk))
		mptcp_userspace_pm_free_local_addr_list(msk);
}

void mptcp_pm_data_reset(struct mptcp_sock *msk)
{
	u8 pm_type = mptcp_get_pm_type(sock_net((struct sock *)msk));
	struct mptcp_pm_data *pm = &msk->pm;

	memset(&pm->reset, 0, sizeof(pm->reset));
	pm->rm_list_tx.nr = 0;
	pm->rm_list_rx.nr = 0;
	WRITE_ONCE(pm->pm_type, pm_type);

	if (pm_type == MPTCP_PM_TYPE_KERNEL) {
		bool subflows_allowed = !!mptcp_pm_get_subflows_max(msk);

		/* pm->work_pending must only be set to 'true' when
		 * pm->pm_type is set to MPTCP_PM_TYPE_KERNEL
		 */
		WRITE_ONCE(pm->work_pending,
			   (!!mptcp_pm_get_local_addr_max(msk) &&
			    subflows_allowed) ||
			   !!mptcp_pm_get_add_addr_signal_max(msk));
		WRITE_ONCE(pm->accept_addr,
			   !!mptcp_pm_get_add_addr_accept_max(msk) &&
			   subflows_allowed);
		WRITE_ONCE(pm->accept_subflow, subflows_allowed);

		bitmap_fill(pm->id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
	}
}

void mptcp_pm_data_init(struct mptcp_sock *msk)
{
	spin_lock_init(&msk->pm.lock);
	INIT_LIST_HEAD(&msk->pm.anno_list);
	INIT_LIST_HEAD(&msk->pm.userspace_pm_local_addr_list);
	mptcp_pm_data_reset(msk);
}

void __init mptcp_pm_init(void)
{
	mptcp_pm_kernel_register();
	mptcp_pm_userspace_register();
	mptcp_pm_nl_init();
}

/* Must be called with rcu read lock held */
struct mptcp_pm_ops *mptcp_pm_find(const char *name)
{
	struct mptcp_pm_ops *pm_ops;

	list_for_each_entry_rcu(pm_ops, &mptcp_pm_list, list) {
		if (!strcmp(pm_ops->name, name))
			return pm_ops;
	}

	return NULL;
}

int mptcp_pm_validate(struct mptcp_pm_ops *pm_ops)
{
	return 0;
}

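/* Register a path manager implementation, after validating it and making sure
 * no other entry with the same name is already on mptcp_pm_list.
 */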
int mptcp_pm_register(struct mptcp_pm_ops *pm_ops)
{
	int ret;

	ret = mptcp_pm_validate(pm_ops);
	if (ret)
		return ret;

	spin_lock(&mptcp_pm_list_lock);
	if (mptcp_pm_find(pm_ops->name)) {
		spin_unlock(&mptcp_pm_list_lock);
		return -EEXIST;
	}
	list_add_tail_rcu(&pm_ops->list, &mptcp_pm_list);
	spin_unlock(&mptcp_pm_list_lock);

	pr_debug("%s registered\n", pm_ops->name);
	return 0;
}

void mptcp_pm_unregister(struct mptcp_pm_ops *pm_ops)
{
	/* skip unregistering the default path manager */
	if (WARN_ON_ONCE(pm_ops == &mptcp_pm_kernel))
		return;

	spin_lock(&mptcp_pm_list_lock);
	list_del_rcu(&pm_ops->list);
	spin_unlock(&mptcp_pm_list_lock);
}

/* Build string with list of available path manager values.
 * Similar to tcp_get_available_congestion_control()
 */
void mptcp_pm_get_available(char *buf, size_t maxlen)
{
	struct mptcp_pm_ops *pm_ops;
	size_t offs = 0;

	rcu_read_lock();
	list_for_each_entry_rcu(pm_ops, &mptcp_pm_list, list) {
		offs += snprintf(buf + offs, maxlen - offs, "%s%s",
				 offs == 0 ? "" : " ", pm_ops->name);

		if (WARN_ON_ONCE(offs >= maxlen))
			break;
	}
	rcu_read_unlock();
}