GitHub Repository: torvalds/linux
Path: blob/master/net/ipv6/ioam6_iptunnel.c

// SPDX-License-Identifier: GPL-2.0+
/*
 * IPv6 IOAM Lightweight Tunnel implementation
 *
 * Author:
 * Justin Iurman <[email protected]>
 */

#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/net.h>
#include <linux/in6.h>
#include <linux/ioam6.h>
#include <linux/ioam6_iptunnel.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/lwtunnel.h>
#include <net/ioam6.h>
#include <net/netlink.h>
#include <net/ipv6.h>
#include <net/dst_cache.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>

#define IOAM6_MASK_SHORT_FIELDS 0xff100000
#define IOAM6_MASK_WIDE_FIELDS 0xe00000

struct ioam6_lwt_encap {
	struct ipv6_hopopt_hdr eh;
	u8 pad[2];	/* 2-octet padding for 4n-alignment */
	struct ioam6_hdr ioamh;
	struct ioam6_trace_hdr traceh;
} __packed;
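
/* On-wire layout of this prebuilt Hop-by-Hop block (an illustrative sketch;
 * octet counts assume the current struct definitions in <uapi/linux/ioam6.h>):
 *
 *	eh     - Hop-by-Hop header (next header + hdrlen)  : 2 octets
 *	pad    - PadN option for 4n alignment of the IOAM
 *	         option                                    : 2 octets
 *	ioamh  - IOAM option header (pre-allocated trace)  : 4 octets
 *	traceh - trace header + preallocated data[]        : 8 + remlen * 4
 *
 * For example, with remlen = 3 the trace data is 12 octets, padded to
 * len_aligned = ALIGN(12, 8) = 16, so the whole extension header spans
 * 16 + 16 = 32 octets and eh.hdrlen = (32 >> 3) - 1 = 3 (computed in
 * ioam6_build_state() below).
 */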

struct ioam6_lwt_freq {
	u32 k;
	u32 n;
};

struct ioam6_lwt {
	struct dst_entry null_dst;
	struct dst_cache cache;
	struct ioam6_lwt_freq freq;
	atomic_t pkt_cnt;
	u8 mode;
	bool has_tunsrc;
	struct in6_addr tunsrc;
	struct in6_addr tundst;
	struct ioam6_lwt_encap tuninfo;
};

static const struct netlink_range_validation freq_range = {
	.min = IOAM6_IPTUNNEL_FREQ_MIN,
	.max = IOAM6_IPTUNNEL_FREQ_MAX,
};

static struct ioam6_lwt *ioam6_lwt_state(struct lwtunnel_state *lwt)
{
	return (struct ioam6_lwt *)lwt->data;
}

static struct ioam6_lwt_encap *ioam6_lwt_info(struct lwtunnel_state *lwt)
{
	return &ioam6_lwt_state(lwt)->tuninfo;
}

static struct ioam6_trace_hdr *ioam6_lwt_trace(struct lwtunnel_state *lwt)
{
	return &(ioam6_lwt_state(lwt)->tuninfo.traceh);
}

static const struct nla_policy ioam6_iptunnel_policy[IOAM6_IPTUNNEL_MAX + 1] = {
	[IOAM6_IPTUNNEL_FREQ_K] = NLA_POLICY_FULL_RANGE(NLA_U32, &freq_range),
	[IOAM6_IPTUNNEL_FREQ_N] = NLA_POLICY_FULL_RANGE(NLA_U32, &freq_range),
	[IOAM6_IPTUNNEL_MODE]	= NLA_POLICY_RANGE(NLA_U8,
						   IOAM6_IPTUNNEL_MODE_MIN,
						   IOAM6_IPTUNNEL_MODE_MAX),
	[IOAM6_IPTUNNEL_SRC]	= NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
	[IOAM6_IPTUNNEL_DST]	= NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
	[IOAM6_IPTUNNEL_TRACE]	= NLA_POLICY_EXACT_LEN(
					sizeof(struct ioam6_trace_hdr)),
};
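
/* These attributes map onto iproute2's "encap ioam6" route attribute. A
 * typical configuration (a sketch based on the kernel selftests; exact
 * keywords may vary across iproute2 versions, and "db02::/64" / "veth0"
 * are placeholders):
 *
 *	ip -6 route add db02::/64 encap ioam6 freq 2/5 mode inline \
 *		trace prealloc type 0x800000 ns 1 size 12 dev veth0
 *
 * This requests IOAM insertion on 2 packets out of every 5, in inline
 * mode, with a 12-octet pre-allocated trace for namespace 1 and trace
 * type bit 0 (hop_limit + node id) set.
 */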

static bool ioam6_validate_trace_hdr(struct ioam6_trace_hdr *trace)
{
	u32 fields;

	if (!trace->type_be32 || !trace->remlen ||
	    trace->remlen > IOAM6_TRACE_DATA_SIZE_MAX / 4 ||
	    trace->type.bit12 | trace->type.bit13 | trace->type.bit14 |
	    trace->type.bit15 | trace->type.bit16 | trace->type.bit17 |
	    trace->type.bit18 | trace->type.bit19 | trace->type.bit20 |
	    trace->type.bit21 | trace->type.bit23)
		return false;

	trace->nodelen = 0;
	fields = be32_to_cpu(trace->type_be32);

	trace->nodelen += hweight32(fields & IOAM6_MASK_SHORT_FIELDS)
			  * (sizeof(__be32) / 4);
	trace->nodelen += hweight32(fields & IOAM6_MASK_WIDE_FIELDS)
			  * (sizeof(__be64) / 4);

	return true;
}
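
/* Worked example (a sketch): a trace type with bit 0 (hop_limit + node id,
 * a 4-octet field) and bit 8 (its 8-octet wide variant) set gives
 * fields = 0x80800000. IOAM6_MASK_SHORT_FIELDS (0xff100000) matches one
 * set bit and IOAM6_MASK_WIDE_FIELDS (0xe00000) matches one set bit, so
 * nodelen = 1 * (4 / 4) + 1 * (8 / 4) = 3, i.e. each node on the path
 * appends 3 * 4 = 12 octets of telemetry (nodelen is in 4-octet units).
 */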

static int ioam6_build_state(struct net *net, struct nlattr *nla,
			     unsigned int family, const void *cfg,
			     struct lwtunnel_state **ts,
			     struct netlink_ext_ack *extack)
{
	struct nlattr *tb[IOAM6_IPTUNNEL_MAX + 1];
	struct ioam6_lwt_encap *tuninfo;
	struct ioam6_trace_hdr *trace;
	struct lwtunnel_state *lwt;
	struct ioam6_lwt *ilwt;
	int len_aligned, err;
	u32 freq_k, freq_n;
	u8 mode;

	if (family != AF_INET6)
		return -EINVAL;

	err = nla_parse_nested(tb, IOAM6_IPTUNNEL_MAX, nla,
			       ioam6_iptunnel_policy, extack);
	if (err < 0)
		return err;

	if ((!tb[IOAM6_IPTUNNEL_FREQ_K] && tb[IOAM6_IPTUNNEL_FREQ_N]) ||
	    (tb[IOAM6_IPTUNNEL_FREQ_K] && !tb[IOAM6_IPTUNNEL_FREQ_N])) {
		NL_SET_ERR_MSG(extack, "freq: missing parameter");
		return -EINVAL;
	} else if (!tb[IOAM6_IPTUNNEL_FREQ_K] && !tb[IOAM6_IPTUNNEL_FREQ_N]) {
		freq_k = IOAM6_IPTUNNEL_FREQ_MIN;
		freq_n = IOAM6_IPTUNNEL_FREQ_MIN;
	} else {
		freq_k = nla_get_u32(tb[IOAM6_IPTUNNEL_FREQ_K]);
		freq_n = nla_get_u32(tb[IOAM6_IPTUNNEL_FREQ_N]);

		if (freq_k > freq_n) {
			NL_SET_ERR_MSG(extack, "freq: k > n is forbidden");
			return -EINVAL;
		}
	}

	mode = nla_get_u8_default(tb[IOAM6_IPTUNNEL_MODE],
				  IOAM6_IPTUNNEL_MODE_INLINE);

	if (tb[IOAM6_IPTUNNEL_SRC] && mode == IOAM6_IPTUNNEL_MODE_INLINE) {
		NL_SET_ERR_MSG(extack, "no tunnel src expected with this mode");
		return -EINVAL;
	}

	if (!tb[IOAM6_IPTUNNEL_DST] && mode != IOAM6_IPTUNNEL_MODE_INLINE) {
		NL_SET_ERR_MSG(extack, "this mode needs a tunnel destination");
		return -EINVAL;
	}

	if (!tb[IOAM6_IPTUNNEL_TRACE]) {
		NL_SET_ERR_MSG(extack, "missing trace");
		return -EINVAL;
	}

	trace = nla_data(tb[IOAM6_IPTUNNEL_TRACE]);
	if (!ioam6_validate_trace_hdr(trace)) {
		NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_TRACE],
				    "invalid trace validation");
		return -EINVAL;
	}

	len_aligned = ALIGN(trace->remlen * 4, 8);
	lwt = lwtunnel_state_alloc(sizeof(*ilwt) + len_aligned);
	if (!lwt)
		return -ENOMEM;

	ilwt = ioam6_lwt_state(lwt);
	err = dst_cache_init(&ilwt->cache, GFP_ATOMIC);
	if (err)
		goto free_lwt;

	/* This "fake" dst_entry will be stored in a dst_cache, which will call
	 * dst_hold() and dst_release() on it. We must ensure that dst_destroy()
	 * will never be called. For that, its initial refcount is 1 and +1 when
	 * it is stored in the cache. Then, +1/-1 each time we read the cache
	 * and release it. Long story short, we're fine.
	 */
	dst_init(&ilwt->null_dst, NULL, NULL, DST_OBSOLETE_NONE, DST_NOCOUNT);

	atomic_set(&ilwt->pkt_cnt, 0);
	ilwt->freq.k = freq_k;
	ilwt->freq.n = freq_n;

	ilwt->mode = mode;

	if (!tb[IOAM6_IPTUNNEL_SRC]) {
		ilwt->has_tunsrc = false;
	} else {
		ilwt->has_tunsrc = true;
		ilwt->tunsrc = nla_get_in6_addr(tb[IOAM6_IPTUNNEL_SRC]);

		if (ipv6_addr_any(&ilwt->tunsrc)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_SRC],
					    "invalid tunnel source address");
			err = -EINVAL;
			goto free_cache;
		}
	}

	if (tb[IOAM6_IPTUNNEL_DST]) {
		ilwt->tundst = nla_get_in6_addr(tb[IOAM6_IPTUNNEL_DST]);

		if (ipv6_addr_any(&ilwt->tundst)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_DST],
					    "invalid tunnel dest address");
			err = -EINVAL;
			goto free_cache;
		}
	}

	tuninfo = ioam6_lwt_info(lwt);
	tuninfo->eh.hdrlen = ((sizeof(*tuninfo) + len_aligned) >> 3) - 1;
	tuninfo->pad[0] = IPV6_TLV_PADN;
	tuninfo->ioamh.type = IOAM6_TYPE_PREALLOC;
	tuninfo->ioamh.opt_type = IPV6_TLV_IOAM;
	tuninfo->ioamh.opt_len = sizeof(tuninfo->ioamh) - 2 + sizeof(*trace)
				 + trace->remlen * 4;

	memcpy(&tuninfo->traceh, trace, sizeof(*trace));

	if (len_aligned - trace->remlen * 4) {
		tuninfo->traceh.data[trace->remlen * 4] = IPV6_TLV_PADN;
		tuninfo->traceh.data[trace->remlen * 4 + 1] = 2;
	}

	lwt->type = LWTUNNEL_ENCAP_IOAM6;
	lwt->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;

	*ts = lwt;

	return 0;
free_cache:
	dst_cache_destroy(&ilwt->cache);
free_lwt:
	kfree(lwt);
	return err;
}
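
/* Sizing sketch for the remlen = 3 example above (len_aligned = 16;
 * octet counts assume the current struct definitions): the extension
 * header is sizeof(*tuninfo) + len_aligned = 16 + 16 = 32 octets, so
 * eh.hdrlen = (32 >> 3) - 1 = 3. The IOAM option value covers the last
 * 2 octets of ioam6_hdr, the 8-octet trace header and the trace data:
 * opt_len = (4 - 2) + 8 + 12 = 22 octets. The remaining 4 octets of
 * alignment padding are filled with a PadN option of data length 2.
 */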

static int ioam6_do_fill(struct net *net, struct sk_buff *skb)
{
	struct ioam6_trace_hdr *trace;
	struct ioam6_namespace *ns;

	trace = (struct ioam6_trace_hdr *)(skb_transport_header(skb)
					   + sizeof(struct ipv6_hopopt_hdr) + 2
					   + sizeof(struct ioam6_hdr));

	ns = ioam6_namespace(net, trace->namespace_id);
	if (ns)
		ioam6_fill_trace_data(skb, ns, trace, false);

	return 0;
}

static int ioam6_do_inline(struct net *net, struct sk_buff *skb,
			   struct ioam6_lwt_encap *tuninfo,
			   struct dst_entry *cache_dst)
{
	struct ipv6hdr *oldhdr, *hdr;
	int hdrlen, err;

	hdrlen = (tuninfo->eh.hdrlen + 1) << 3;

	err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb));
	if (unlikely(err))
		return err;

	oldhdr = ipv6_hdr(skb);
	skb_pull(skb, sizeof(*oldhdr));
	skb_postpull_rcsum(skb, skb_network_header(skb), sizeof(*oldhdr));

	skb_push(skb, sizeof(*oldhdr) + hdrlen);
	skb_reset_network_header(skb);
	skb_mac_header_rebuild(skb);

	hdr = ipv6_hdr(skb);
	memmove(hdr, oldhdr, sizeof(*oldhdr));
	tuninfo->eh.nexthdr = hdr->nexthdr;

	skb_set_transport_header(skb, sizeof(*hdr));
	skb_postpush_rcsum(skb, hdr, sizeof(*hdr) + hdrlen);

	memcpy(skb_transport_header(skb), (u8 *)tuninfo, hdrlen);

	hdr->nexthdr = NEXTHDR_HOP;
	hdr->payload_len = cpu_to_be16(skb->len - sizeof(*hdr));

	return ioam6_do_fill(net, skb);
}
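
/* Inline transformation, schematically:
 *
 *	before:	| IPv6 hdr | payload |
 *	after:	| IPv6 hdr | HbH + IOAM option | payload |
 *
 * The original IPv6 header is moved hdrlen bytes towards the head of the
 * skb, its Next Header becomes NEXTHDR_HOP, and the prebuilt tuninfo
 * block inherits the old Next Header value.
 */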

static int ioam6_do_encap(struct net *net, struct sk_buff *skb,
			  struct ioam6_lwt_encap *tuninfo,
			  bool has_tunsrc,
			  struct in6_addr *tunsrc,
			  struct in6_addr *tundst,
			  struct dst_entry *cache_dst)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr, *inner_hdr;
	int hdrlen, len, err;

	hdrlen = (tuninfo->eh.hdrlen + 1) << 3;
	len = sizeof(*hdr) + hdrlen;

	err = skb_cow_head(skb, len + dst_dev_overhead(cache_dst, skb));
	if (unlikely(err))
		return err;

	inner_hdr = ipv6_hdr(skb);

	skb_push(skb, len);
	skb_reset_network_header(skb);
	skb_mac_header_rebuild(skb);
	skb_set_transport_header(skb, sizeof(*hdr));

	tuninfo->eh.nexthdr = NEXTHDR_IPV6;
	memcpy(skb_transport_header(skb), (u8 *)tuninfo, hdrlen);

	hdr = ipv6_hdr(skb);
	memcpy(hdr, inner_hdr, sizeof(*hdr));

	hdr->nexthdr = NEXTHDR_HOP;
	hdr->payload_len = cpu_to_be16(skb->len - sizeof(*hdr));
	hdr->daddr = *tundst;

	if (has_tunsrc)
		memcpy(&hdr->saddr, tunsrc, sizeof(*tunsrc));
	else
		ipv6_dev_get_saddr(net, dst_dev(dst), &hdr->daddr,
				   IPV6_PREFER_SRC_PUBLIC, &hdr->saddr);

	skb_postpush_rcsum(skb, hdr, len);

	return ioam6_do_fill(net, skb);
}
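
/* Encap (ip6ip6) transformation, schematically:
 *
 *	before:	| IPv6 hdr | payload |
 *	after:	| outer IPv6 hdr | HbH + IOAM option | IPv6 hdr | payload |
 *
 * The outer header is cloned from the inner one, then rewritten: daddr
 * becomes the configured tunnel destination, and saddr is either the
 * configured tunnel source or a source address selected on the egress
 * device.
 */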

static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *orig_dst = skb_dst(skb);
	struct dst_entry *dst = NULL;
	struct ioam6_lwt *ilwt;
	int err = -EINVAL;
	u32 pkt_cnt;

	if (skb->protocol != htons(ETH_P_IPV6))
		goto drop;

	ilwt = ioam6_lwt_state(orig_dst->lwtstate);

	/* Check for insertion frequency (i.e., "k over n" insertions) */
	pkt_cnt = atomic_fetch_inc(&ilwt->pkt_cnt);
	if (pkt_cnt % ilwt->freq.n >= ilwt->freq.k)
		goto out;

	local_bh_disable();
	dst = dst_cache_get(&ilwt->cache);
	local_bh_enable();

	/* This is how we notify that the destination does not change after
	 * transformation and that we need to use orig_dst instead of the cache
	 */
	if (dst == &ilwt->null_dst) {
		dst_release(dst);

		dst = orig_dst;
		/* keep refcount balance: dst_release() is called at the end */
		dst_hold(dst);
	}

	switch (ilwt->mode) {
	case IOAM6_IPTUNNEL_MODE_INLINE:
do_inline:
		/* Direct insertion - if there is no Hop-by-Hop yet */
		if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP)
			goto out;

		err = ioam6_do_inline(net, skb, &ilwt->tuninfo, dst);
		if (unlikely(err))
			goto drop;

		break;
	case IOAM6_IPTUNNEL_MODE_ENCAP:
do_encap:
		/* Encapsulation (ip6ip6) */
		err = ioam6_do_encap(net, skb, &ilwt->tuninfo,
				     ilwt->has_tunsrc, &ilwt->tunsrc,
				     &ilwt->tundst, dst);
		if (unlikely(err))
			goto drop;

		break;
	case IOAM6_IPTUNNEL_MODE_AUTO:
		/* Automatic (RFC8200 compliant):
		 * - local packets -> INLINE mode
		 * - in-transit packets -> ENCAP mode
		 */
		if (!skb->dev)
			goto do_inline;

		goto do_encap;
	default:
		goto drop;
	}

	if (unlikely(!dst)) {
		struct ipv6hdr *hdr = ipv6_hdr(skb);
		struct flowi6 fl6;

		memset(&fl6, 0, sizeof(fl6));
		fl6.daddr = hdr->daddr;
		fl6.saddr = hdr->saddr;
		fl6.flowlabel = ip6_flowinfo(hdr);
		fl6.flowi6_mark = skb->mark;
		fl6.flowi6_proto = hdr->nexthdr;

		dst = ip6_route_output(net, NULL, &fl6);
		if (dst->error) {
			err = dst->error;
			goto drop;
		}

		/* If the destination is the same after transformation (which is
		 * a valid use case for IOAM), then we don't want to add it to
		 * the cache in order to avoid a reference loop. Instead, we add
		 * our fake dst_entry to the cache as a way to detect this case.
		 * Otherwise, we add the resolved destination to the cache.
		 */
		local_bh_disable();
		if (orig_dst->lwtstate == dst->lwtstate)
			dst_cache_set_ip6(&ilwt->cache,
					  &ilwt->null_dst, &fl6.saddr);
		else
			dst_cache_set_ip6(&ilwt->cache, dst, &fl6.saddr);
		local_bh_enable();

		err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst)));
		if (unlikely(err))
			goto drop;
	}

	/* avoid lwtunnel_output() reentry loop when destination is the same
	 * after transformation (e.g., with the inline mode)
	 */
	if (orig_dst->lwtstate != dst->lwtstate) {
		skb_dst_drop(skb);
		skb_dst_set(skb, dst);
		return dst_output(net, sk, skb);
	}
out:
	dst_release(dst);
	return orig_dst->lwtstate->orig_output(net, sk, skb);
drop:
	dst_release(dst);
	kfree_skb(skb);
	return err;
}
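
/* "k over n" sampling sketch: with freq.k = 2 and freq.n = 5, the check
 * above (pkt_cnt % 5 >= 2) processes packets whose counter value is 0 or
 * 1 and skips those at 2, 3 and 4, i.e. the first 2 packets of every
 * window of 5 carry IOAM data. The default k = n = IOAM6_IPTUNNEL_FREQ_MIN
 * (1) processes every packet.
 */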

static void ioam6_destroy_state(struct lwtunnel_state *lwt)
{
	/* Since the refcount of per-cpu dst_entry caches will never be 0 (see
	 * why above) when our "fake" dst_entry is used, it is not necessary to
	 * remove them before calling dst_cache_destroy()
	 */
	dst_cache_destroy(&ioam6_lwt_state(lwt)->cache);
}

static int ioam6_fill_encap_info(struct sk_buff *skb,
				 struct lwtunnel_state *lwtstate)
{
	struct ioam6_lwt *ilwt = ioam6_lwt_state(lwtstate);
	int err;

	err = nla_put_u32(skb, IOAM6_IPTUNNEL_FREQ_K, ilwt->freq.k);
	if (err)
		goto ret;

	err = nla_put_u32(skb, IOAM6_IPTUNNEL_FREQ_N, ilwt->freq.n);
	if (err)
		goto ret;

	err = nla_put_u8(skb, IOAM6_IPTUNNEL_MODE, ilwt->mode);
	if (err)
		goto ret;

	if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE) {
		if (ilwt->has_tunsrc) {
			err = nla_put_in6_addr(skb, IOAM6_IPTUNNEL_SRC,
					       &ilwt->tunsrc);
			if (err)
				goto ret;
		}

		err = nla_put_in6_addr(skb, IOAM6_IPTUNNEL_DST, &ilwt->tundst);
		if (err)
			goto ret;
	}

	err = nla_put(skb, IOAM6_IPTUNNEL_TRACE, sizeof(ilwt->tuninfo.traceh),
		      &ilwt->tuninfo.traceh);
ret:
	return err;
}

static int ioam6_encap_nlsize(struct lwtunnel_state *lwtstate)
{
	struct ioam6_lwt *ilwt = ioam6_lwt_state(lwtstate);
	int nlsize;

	nlsize = nla_total_size(sizeof(ilwt->freq.k)) +
		 nla_total_size(sizeof(ilwt->freq.n)) +
		 nla_total_size(sizeof(ilwt->mode)) +
		 nla_total_size(sizeof(ilwt->tuninfo.traceh));

	if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE) {
		if (ilwt->has_tunsrc)
			nlsize += nla_total_size(sizeof(ilwt->tunsrc));

		nlsize += nla_total_size(sizeof(ilwt->tundst));
	}

	return nlsize;
}

static int ioam6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
{
	struct ioam6_trace_hdr *trace_a = ioam6_lwt_trace(a);
	struct ioam6_trace_hdr *trace_b = ioam6_lwt_trace(b);
	struct ioam6_lwt *ilwt_a = ioam6_lwt_state(a);
	struct ioam6_lwt *ilwt_b = ioam6_lwt_state(b);

	return (ilwt_a->freq.k != ilwt_b->freq.k ||
		ilwt_a->freq.n != ilwt_b->freq.n ||
		ilwt_a->mode != ilwt_b->mode ||
		ilwt_a->has_tunsrc != ilwt_b->has_tunsrc ||
		(ilwt_a->mode != IOAM6_IPTUNNEL_MODE_INLINE &&
		 !ipv6_addr_equal(&ilwt_a->tundst, &ilwt_b->tundst)) ||
		(ilwt_a->mode != IOAM6_IPTUNNEL_MODE_INLINE &&
		 ilwt_a->has_tunsrc &&
		 !ipv6_addr_equal(&ilwt_a->tunsrc, &ilwt_b->tunsrc)) ||
		trace_a->namespace_id != trace_b->namespace_id);
}

static const struct lwtunnel_encap_ops ioam6_iptun_ops = {
	.build_state	= ioam6_build_state,
	.destroy_state	= ioam6_destroy_state,
	.output		= ioam6_output,
	.fill_encap	= ioam6_fill_encap_info,
	.get_encap_size	= ioam6_encap_nlsize,
	.cmp_encap	= ioam6_encap_cmp,
	.owner		= THIS_MODULE,
};

int __init ioam6_iptunnel_init(void)
{
	return lwtunnel_encap_add_ops(&ioam6_iptun_ops, LWTUNNEL_ENCAP_IOAM6);
}

void ioam6_iptunnel_exit(void)
{
	lwtunnel_encap_del_ops(&ioam6_iptun_ops, LWTUNNEL_ENCAP_IOAM6);
}