GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/netinet/in_fib_dxr.c
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2012-2024 Marko Zec
 * Copyright (c) 2005, 2018 University of Zagreb
 * Copyright (c) 2005 International Computer Science Institute
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * An implementation of DXR, a simple IPv4 LPM scheme with compact lookup
 * structures and a trivial search procedure.  More significant bits of
 * the search key are used to directly index a two-stage trie, while the
 * remaining bits are used for finding the next hop in a sorted array.
 * More details in:
 *
 * M. Zec, L. Rizzo, M. Mikuc, DXR: towards a billion routing lookups per
 * second in software, ACM SIGCOMM Computer Communication Review, September
 * 2012
 *
 * M. Zec, M. Mikuc, Pushing the envelope: beyond two billion IP routing
 * lookups per second on commodity CPUs, IEEE SoftCOM, September 2017, Split
 */
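
/*
 * Editorial note (not part of the upstream file): with the default
 * DXR_TRIE_BITS = 20, a destination such as 192.0.2.1 (0xc0000201)
 * splits into a 20-bit trie index, 0xc0000, which selects a
 * direct_entry, and a 12-bit remainder, 0x201, which is binary
 * searched among the range fragments that entry describes.
 */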

#include "opt_inet.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/ctype.h>
#include <sys/epoch.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>

#include <vm/uma.h>

#include <netinet/in.h>
#include <netinet/in_fib.h>

#include <net/route.h>
#include <net/route/route_ctl.h>
#include <net/route/fib_algo.h>

#define DXR_TRIE_BITS		20

CTASSERT(DXR_TRIE_BITS >= 16 && DXR_TRIE_BITS <= 24);

#if DXR_TRIE_BITS > 16
#define DXR_D			16
#else
#define DXR_D			(DXR_TRIE_BITS - 1)
#endif

#define D_TBL_SIZE		(1 << DXR_D)
#define DIRECT_TBL_SIZE		(1 << DXR_TRIE_BITS)
#define DXR_RANGE_MASK		(0xffffffffU >> DXR_TRIE_BITS)
#define DXR_RANGE_SHIFT		(32 - DXR_TRIE_BITS)

#define DESC_BASE_BITS		22
#define DESC_FRAGMENTS_BITS	(32 - DESC_BASE_BITS)
#define BASE_MAX		((1 << DESC_BASE_BITS) - 1)
#define RTBL_SIZE_INCR		(BASE_MAX / 64)

#if DXR_TRIE_BITS < 24
#define FRAGS_MASK_SHORT	((1 << (23 - DXR_TRIE_BITS)) - 1)
#else
#define FRAGS_MASK_SHORT	0
#endif
#define FRAGS_PREF_SHORT	(((1 << DESC_FRAGMENTS_BITS) - 1) & \
				    ~FRAGS_MASK_SHORT)
#define FRAGS_MARK_XL		(FRAGS_PREF_SHORT - 1)
#define FRAGS_MARK_HIT		(FRAGS_PREF_SHORT - 2)

#define IS_SHORT_FORMAT(x)	((x & FRAGS_PREF_SHORT) == FRAGS_PREF_SHORT)
#define IS_LONG_FORMAT(x)	((x & FRAGS_PREF_SHORT) != FRAGS_PREF_SHORT)
#define IS_XL_FORMAT(x)		(x == FRAGS_MARK_XL)

#define RE_SHORT_MAX_NH		((1 << (DXR_TRIE_BITS - 8)) - 1)

#define CHUNK_HASH_BITS		16
#define CHUNK_HASH_SIZE		(1 << CHUNK_HASH_BITS)
#define CHUNK_HASH_MASK		(CHUNK_HASH_SIZE - 1)

#define TRIE_HASH_BITS		16
#define TRIE_HASH_SIZE		(1 << TRIE_HASH_BITS)
#define TRIE_HASH_MASK		(TRIE_HASH_SIZE - 1)

#define XTBL_SIZE_INCR		(DIRECT_TBL_SIZE / 16)

#define UNUSED_BUCKETS		8

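/*
 * Editorial note (not part of the upstream file): with DXR_TRIE_BITS
 * = 20 the 10-bit "fragments" field of a direct_entry encodes its
 * chunk format.  Values with the FRAGS_PREF_SHORT bits (0x3f8) all
 * set carry a short-format fragment count in the low 3 bits,
 * FRAGS_MARK_XL (0x3f7) flags an extra-long chunk whose fragment
 * count lives in the range table itself, FRAGS_MARK_HIT (0x3f6) means
 * the entry resolves directly to a next hop, and smaller values are
 * plain long-format fragment counts.
 */
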
/* Lookup structure elements */

struct direct_entry {
	uint32_t	fragments: DESC_FRAGMENTS_BITS,
			base: DESC_BASE_BITS;
};

struct range_entry_long {
	uint32_t	start: DXR_RANGE_SHIFT,
			nexthop: DXR_TRIE_BITS;
};

#if DXR_TRIE_BITS < 24
struct range_entry_short {
	uint16_t	start: DXR_RANGE_SHIFT - 8,
			nexthop: DXR_TRIE_BITS - 8;
};
#endif

/* Auxiliary structures */

struct heap_entry {
	uint32_t	start;
	uint32_t	end;
	uint32_t	preflen;
	uint32_t	nexthop;
};

struct chunk_desc {
	LIST_ENTRY(chunk_desc)	cd_all_le;
	LIST_ENTRY(chunk_desc)	cd_hash_le;
	uint32_t		cd_hash;
	uint32_t		cd_refcnt;
	uint32_t		cd_base;
	uint32_t		cd_cur_size;
	uint32_t		cd_max_size;
};

struct trie_desc {
	LIST_ENTRY(trie_desc)	td_all_le;
	LIST_ENTRY(trie_desc)	td_hash_le;
	uint32_t		td_hash;
	uint32_t		td_index;
	uint32_t		td_refcnt;
};

struct dxr_aux {
	/* Glue to external state */
	struct fib_data		*fd;
	uint32_t		fibnum;
	int			refcnt;

	/* Auxiliary build-time tables */
	struct direct_entry	direct_tbl[DIRECT_TBL_SIZE];
	uint16_t		d_tbl[D_TBL_SIZE];
	struct direct_entry	*x_tbl;
	union {
		struct range_entry_long	re;
		uint32_t		fragments;
	}			*range_tbl;

	/* Auxiliary internal state */
	uint32_t		updates_mask[DIRECT_TBL_SIZE / 32];
	struct trie_desc	*trietbl[D_TBL_SIZE];
	LIST_HEAD(, chunk_desc)	chunk_hashtbl[CHUNK_HASH_SIZE];
	LIST_HEAD(, chunk_desc)	all_chunks;
	LIST_HEAD(, chunk_desc)	unused_chunks[UNUSED_BUCKETS];
	LIST_HEAD(, trie_desc)	trie_hashtbl[TRIE_HASH_SIZE];
	LIST_HEAD(, trie_desc)	all_trie;
	LIST_HEAD(, trie_desc)	unused_trie; /* abuses hash link entry */
	struct sockaddr_in	dst;
	struct sockaddr_in	mask;
	struct heap_entry	heap[33];
	uint32_t		prefixes;
	uint32_t		updates_low;
	uint32_t		updates_high;
	uint32_t		unused_chunks_size;
	uint32_t		xtbl_size;
	uint32_t		all_trie_cnt;
	uint32_t		unused_trie_cnt;
	uint32_t		trie_rebuilt_prefixes;
	uint32_t		heap_index;
	uint32_t		d_bits;
	uint32_t		rtbl_size;
	uint32_t		rtbl_top;
	uint32_t		rtbl_work_frags;
	uint32_t		work_chunk;
};

/* Main lookup structure container */

struct dxr {
	/* Lookup tables */
	void		*d;
	void		*x;
	void		*r;
	struct nhop_object **nh_tbl;

	/* Glue to external state */
	struct dxr_aux	*aux;
	struct fib_data	*fd;
	struct epoch_context	epoch_ctx;
	uint32_t	fibnum;
	uint16_t	d_shift;
	uint16_t	x_shift;
	uint32_t	x_mask;
};

static MALLOC_DEFINE(M_DXRLPM, "dxr", "DXR LPM");
static MALLOC_DEFINE(M_DXRAUX, "dxr aux", "DXR auxiliary");

uma_zone_t chunk_zone;
uma_zone_t trie_zone;

VNET_DEFINE_STATIC(int, frag_limit) = 100;
#define V_frag_limit	VNET(frag_limit)

/* Binary search for a matching address range */
#define DXR_LOOKUP_STAGE					\
	if (masked_dst < range[middle].start) {			\
		upperbound = middle;				\
		middle = (middle + lowerbound) / 2;		\
	} else if (masked_dst < range[middle + 1].start)	\
		return (range[middle].nexthop);			\
	else {							\
		lowerbound = middle + 1;			\
		middle = (upperbound + middle + 1) / 2;		\
	}							\
	if (upperbound == lowerbound)				\
		return (range[lowerbound].nexthop);

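/*
 * Editorial note (not part of the upstream file): each expansion of
 * DXR_LOOKUP_STAGE performs a single halving step of the binary
 * search, and the infinite loops below expand it twice per iteration,
 * i.e. the search is 2x unrolled; the only exits are the return
 * statements inside the macro.
 */
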
static int
range_lookup(struct range_entry_long *rt, struct direct_entry de, uint32_t dst)
{
	uint32_t base;
	uint32_t upperbound;
	uint32_t middle;
	uint32_t lowerbound;
	uint32_t masked_dst;

	base = de.base;
	lowerbound = 0;
	masked_dst = dst & DXR_RANGE_MASK;

#if DXR_TRIE_BITS < 24
	if (__predict_true(IS_SHORT_FORMAT(de.fragments))) {
		upperbound = de.fragments & FRAGS_MASK_SHORT;
		struct range_entry_short *range =
		    (struct range_entry_short *) &rt[base];

		masked_dst >>= 8;
		middle = upperbound;
		upperbound = upperbound * 2 + 1;

		for (;;) {
			DXR_LOOKUP_STAGE
			DXR_LOOKUP_STAGE
		}
	}
#endif

	upperbound = de.fragments;
	middle = upperbound / 2;
	struct range_entry_long *range = &rt[base];
	if (__predict_false(IS_XL_FORMAT(de.fragments))) {
		upperbound = *((uint32_t *) range);
		range++;
		middle = upperbound / 2;
	}

	for (;;) {
		DXR_LOOKUP_STAGE
		DXR_LOOKUP_STAGE
	}
}

#define DXR_LOOKUP_DEFINE(D)						\
	static int inline						\
	dxr_lookup_##D(struct dxr *dxr, uint32_t dst)			\
	{								\
		struct direct_entry de;					\
		uint16_t *dt = dxr->d;					\
		struct direct_entry *xt = dxr->x;			\
									\
		de = xt[(dt[dst >> (32 - (D))] << (DXR_TRIE_BITS - (D))) \
		    + ((dst >> DXR_RANGE_SHIFT) &			\
		    (0xffffffffU >> (32 - DXR_TRIE_BITS + (D))))];	\
		if (__predict_true(de.fragments == FRAGS_MARK_HIT))	\
			return (de.base);				\
		return (range_lookup(dxr->r, de, dst));			\
	}								\
									\
	static struct nhop_object *					\
	dxr_fib_lookup_##D(void *algo_data,				\
	    const struct flm_lookup_key key, uint32_t scopeid __unused) \
	{								\
		struct dxr *dxr = algo_data;				\
									\
		return (dxr->nh_tbl[dxr_lookup_##D(dxr,			\
		    ntohl(key.addr4.s_addr))]);				\
	}

#if DXR_TRIE_BITS > 16
DXR_LOOKUP_DEFINE(16)
#endif
DXR_LOOKUP_DEFINE(15)
DXR_LOOKUP_DEFINE(14)
DXR_LOOKUP_DEFINE(13)
DXR_LOOKUP_DEFINE(12)
DXR_LOOKUP_DEFINE(11)
DXR_LOOKUP_DEFINE(10)
DXR_LOOKUP_DEFINE(9)

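/*
 * Editorial note (not part of the upstream file): the per-D lookup
 * functions generated above turn the direct table shift and mask
 * amounts into compile-time constants for every supported d_bits
 * value; choose_lookup_fn() below picks the matching variant and
 * falls back to the generic dxr_lookup(), which uses run-time shifts,
 * for any other split.
 */
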
static int inline
dxr_lookup(struct dxr *dxr, uint32_t dst)
{
	struct direct_entry de;
	uint16_t *dt = dxr->d;
	struct direct_entry *xt = dxr->x;

	de = xt[(dt[dst >> dxr->d_shift] << dxr->x_shift) +
	    ((dst >> DXR_RANGE_SHIFT) & dxr->x_mask)];
	if (__predict_true(de.fragments == FRAGS_MARK_HIT))
		return (de.base);
	return (range_lookup(dxr->r, de, dst));
}

static void
initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk)
{
	struct heap_entry *fhp = &da->heap[0];
	struct rtentry *rt;
	struct route_nhop_data rnd;

	da->heap_index = 0;
	da->dst.sin_addr.s_addr = htonl(dst_u32);
	rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED,
	    &rnd);
	if (rt != NULL) {
		struct in_addr addr;
		uint32_t scopeid;

		rt_get_inet_prefix_plen(rt, &addr, &fhp->preflen, &scopeid);
		fhp->start = ntohl(addr.s_addr);
		fhp->end = fhp->start;
		if (fhp->preflen < 32)
			fhp->end |= (0xffffffffU >> fhp->preflen);
		fhp->nexthop = fib_get_nhop_idx(da->fd, rnd.rnd_nhop);
	} else {
		fhp->preflen = fhp->nexthop = fhp->start = 0;
		fhp->end = 0xffffffffU;
	}
}

static uint32_t
chunk_size(struct dxr_aux *da, struct direct_entry *fdesc)
{

	if (IS_SHORT_FORMAT(fdesc->fragments))
		return ((fdesc->fragments & FRAGS_MASK_SHORT) + 1);
	else if (IS_XL_FORMAT(fdesc->fragments))
		return (da->range_tbl[fdesc->base].fragments + 2);
	else /* if (IS_LONG_FORMAT(fdesc->fragments)) */
		return (fdesc->fragments + 1);
}

static uint32_t
chunk_hash(struct dxr_aux *da, struct direct_entry *fdesc)
{
	uint32_t size = chunk_size(da, fdesc);
	uint32_t *p = (uint32_t *) &da->range_tbl[fdesc->base];
	uint32_t *l = (uint32_t *) &da->range_tbl[fdesc->base + size];
	uint32_t hash = fdesc->fragments;

	for (; p < l; p++)
		hash = (hash << 7) + (hash >> 13) + *p;

	return (hash + (hash >> 16));
}

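/*
 * Editorial note (not part of the upstream file): identical range
 * chunks are deduplicated.  chunk_ref() first searches the hash table
 * for a chunk with matching contents and, on a hit, bumps its
 * refcount and rolls back rtbl_top to discard the freshly built
 * working copy; otherwise it recycles a best-fit block from the
 * unused_chunks lists or claims fresh space at the top of the range
 * table heap.
 */
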
static int
chunk_ref(struct dxr_aux *da, uint32_t chunk)
{
	struct direct_entry *fdesc = &da->direct_tbl[chunk];
	struct chunk_desc *cdp, *empty_cdp;
	uint32_t base = fdesc->base;
	uint32_t size = chunk_size(da, fdesc);
	uint32_t hash = chunk_hash(da, fdesc);
	int i;

	/* Find an existing descriptor */
	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
	    cd_hash_le) {
		if (cdp->cd_hash != hash || cdp->cd_cur_size != size ||
		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
		    sizeof(struct range_entry_long) * size))
			continue;
		da->rtbl_top = fdesc->base;
		fdesc->base = cdp->cd_base;
		cdp->cd_refcnt++;
		return (0);
	}

	/* No matching chunks found.  Find an empty one to recycle. */
	for (cdp = NULL, i = size; cdp == NULL && i < UNUSED_BUCKETS; i++)
		cdp = LIST_FIRST(&da->unused_chunks[i]);

	if (cdp == NULL)
		LIST_FOREACH(empty_cdp, &da->unused_chunks[0], cd_hash_le)
			if (empty_cdp->cd_max_size >= size && (cdp == NULL ||
			    empty_cdp->cd_max_size < cdp->cd_max_size)) {
				cdp = empty_cdp;
				if (empty_cdp->cd_max_size == size)
					break;
			}

	if (cdp != NULL) {
		/* Copy from heap into the recycled chunk */
		bcopy(&da->range_tbl[fdesc->base], &da->range_tbl[cdp->cd_base],
		    size * sizeof(struct range_entry_long));
		fdesc->base = cdp->cd_base;
		da->rtbl_top -= size;
		da->unused_chunks_size -= cdp->cd_max_size;
		if (cdp->cd_max_size > size) {
			/* Split the range in two, need a new descriptor */
			empty_cdp = uma_zalloc(chunk_zone, M_NOWAIT);
			if (empty_cdp == NULL)
				return (1);
			LIST_INSERT_BEFORE(cdp, empty_cdp, cd_all_le);
			empty_cdp->cd_base = cdp->cd_base + size;
			empty_cdp->cd_cur_size = 0;
			empty_cdp->cd_max_size = cdp->cd_max_size - size;

			i = empty_cdp->cd_max_size;
			if (i >= UNUSED_BUCKETS)
				i = 0;
			LIST_INSERT_HEAD(&da->unused_chunks[i], empty_cdp,
			    cd_hash_le);

			da->unused_chunks_size += empty_cdp->cd_max_size;
			cdp->cd_max_size = size;
		}
		LIST_REMOVE(cdp, cd_hash_le);
	} else {
		/* Alloc a new descriptor at the top of the heap */
		cdp = uma_zalloc(chunk_zone, M_NOWAIT);
		if (cdp == NULL)
			return (1);
		cdp->cd_max_size = size;
		cdp->cd_base = fdesc->base;
		LIST_INSERT_HEAD(&da->all_chunks, cdp, cd_all_le);
		MPASS(cdp->cd_base + cdp->cd_max_size == da->rtbl_top);
	}

	cdp->cd_hash = hash;
	cdp->cd_refcnt = 1;
	cdp->cd_cur_size = size;
	LIST_INSERT_HEAD(&da->chunk_hashtbl[hash & CHUNK_HASH_MASK], cdp,
	    cd_hash_le);
	if (da->rtbl_top >= da->rtbl_size) {
		if (da->rtbl_top >= BASE_MAX) {
			FIB_PRINTF(LOG_ERR, da->fd,
			    "structural limit exceeded at %d "
			    "range table elements", da->rtbl_top);
			return (1);
		}
		da->rtbl_size += RTBL_SIZE_INCR;
		i = (BASE_MAX - da->rtbl_top) * LOG_DEBUG / BASE_MAX;
		FIB_PRINTF(i, da->fd, "range table at %d%% structural limit",
		    da->rtbl_top * 100 / BASE_MAX);
		da->range_tbl = realloc(da->range_tbl,
		    sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT,
		    M_DXRAUX, M_NOWAIT);
		if (da->range_tbl == NULL) {
			FIB_PRINTF(LOG_NOTICE, da->fd,
			    "Unable to allocate DXR range table");
			return (1);
		}
	}

	return (0);
}

static void
chunk_unref(struct dxr_aux *da, uint32_t chunk)
{
	struct direct_entry *fdesc = &da->direct_tbl[chunk];
	struct chunk_desc *cdp, *cdp2;
	uint32_t base = fdesc->base;
	uint32_t size = chunk_size(da, fdesc);
	uint32_t hash = chunk_hash(da, fdesc);
	int i;

	/* Find the corresponding descriptor */
	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
	    cd_hash_le)
		if (cdp->cd_hash == hash && cdp->cd_cur_size == size &&
		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
		    sizeof(struct range_entry_long) * size) == 0)
			break;

	MPASS(cdp != NULL);
	if (--cdp->cd_refcnt > 0)
		return;

	LIST_REMOVE(cdp, cd_hash_le);
	da->unused_chunks_size += cdp->cd_max_size;
	cdp->cd_cur_size = 0;

	/* Attempt to merge with the preceding chunk, if empty */
	cdp2 = LIST_NEXT(cdp, cd_all_le);
	if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
		MPASS(cdp2->cd_base + cdp2->cd_max_size == cdp->cd_base);
		LIST_REMOVE(cdp, cd_all_le);
		LIST_REMOVE(cdp2, cd_hash_le);
		cdp2->cd_max_size += cdp->cd_max_size;
		uma_zfree(chunk_zone, cdp);
		cdp = cdp2;
	}

	/* Attempt to merge with the subsequent chunk, if empty */
	cdp2 = LIST_PREV(cdp, &da->all_chunks, chunk_desc, cd_all_le);
	if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
		MPASS(cdp->cd_base + cdp->cd_max_size == cdp2->cd_base);
		LIST_REMOVE(cdp, cd_all_le);
		LIST_REMOVE(cdp2, cd_hash_le);
		cdp2->cd_max_size += cdp->cd_max_size;
		cdp2->cd_base = cdp->cd_base;
		uma_zfree(chunk_zone, cdp);
		cdp = cdp2;
	}

	if (cdp->cd_base + cdp->cd_max_size == da->rtbl_top) {
		/* Free the chunk on the top of the range heap, trim the heap */
		MPASS(cdp == LIST_FIRST(&da->all_chunks));
		da->rtbl_top -= cdp->cd_max_size;
		da->unused_chunks_size -= cdp->cd_max_size;
		LIST_REMOVE(cdp, cd_all_le);
		uma_zfree(chunk_zone, cdp);
		return;
	}

	i = cdp->cd_max_size;
	if (i >= UNUSED_BUCKETS)
		i = 0;
	LIST_INSERT_HEAD(&da->unused_chunks[i], cdp, cd_hash_le);
}

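/*
 * Editorial note (not part of the upstream file): the functions below
 * maintain the second, "DXR2" trie stage (cf. the dxr2_try_squeeze
 * label in dxr_build()): the direct table is cut into blocks of
 * 2^dxr_x entries, identical blocks are deduplicated into x_tbl, and
 * d_tbl maps each block index to its x_tbl slot.
 */
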
static uint32_t
trie_hash(struct dxr_aux *da, uint32_t dxr_x, uint32_t index)
{
	uint32_t i, *val;
	uint32_t hash = 0;

	for (i = 0; i < (1 << dxr_x); i++) {
		hash = (hash << 3) ^ (hash >> 3);
		val = (uint32_t *)
		    (void *) &da->direct_tbl[(index << dxr_x) + i];
		hash += (*val << 5);
		hash += (*val >> 5);
	}

	return (hash + (hash >> 16));
}

static int
trie_ref(struct dxr_aux *da, uint32_t index)
{
	struct trie_desc *tp;
	uint32_t dxr_d = da->d_bits;
	uint32_t dxr_x = DXR_TRIE_BITS - dxr_d;
	uint32_t hash = trie_hash(da, dxr_x, index);

	/* Find an existing descriptor */
	LIST_FOREACH(tp, &da->trie_hashtbl[hash & TRIE_HASH_MASK], td_hash_le)
		if (tp->td_hash == hash &&
		    memcmp(&da->direct_tbl[index << dxr_x],
		    &da->x_tbl[tp->td_index << dxr_x],
		    sizeof(*da->x_tbl) << dxr_x) == 0) {
			tp->td_refcnt++;
			da->trietbl[index] = tp;
			return (tp->td_index);
		}

	tp = LIST_FIRST(&da->unused_trie);
	if (tp != NULL) {
		LIST_REMOVE(tp, td_hash_le);
		da->unused_trie_cnt--;
	} else {
		tp = uma_zalloc(trie_zone, M_NOWAIT);
		if (tp == NULL)
			return (-1);
		LIST_INSERT_HEAD(&da->all_trie, tp, td_all_le);
		tp->td_index = da->all_trie_cnt++;
	}

	tp->td_hash = hash;
	tp->td_refcnt = 1;
	LIST_INSERT_HEAD(&da->trie_hashtbl[hash & TRIE_HASH_MASK], tp,
	    td_hash_le);
	memcpy(&da->x_tbl[tp->td_index << dxr_x],
	    &da->direct_tbl[index << dxr_x], sizeof(*da->x_tbl) << dxr_x);
	da->trietbl[index] = tp;
	if (da->all_trie_cnt >= da->xtbl_size >> dxr_x) {
		da->xtbl_size += XTBL_SIZE_INCR;
		da->x_tbl = realloc(da->x_tbl,
		    sizeof(*da->x_tbl) * da->xtbl_size, M_DXRAUX, M_NOWAIT);
		if (da->x_tbl == NULL) {
			FIB_PRINTF(LOG_NOTICE, da->fd,
			    "Unable to allocate DXR extension table");
			return (-1);
		}
	}
	return (tp->td_index);
}

static void
trie_unref(struct dxr_aux *da, uint32_t index)
{
	struct trie_desc *tp = da->trietbl[index];

	if (tp == NULL)
		return;
	da->trietbl[index] = NULL;
	if (--tp->td_refcnt > 0)
		return;

	LIST_REMOVE(tp, td_hash_le);
	da->unused_trie_cnt++;
	if (tp->td_index != da->all_trie_cnt - 1) {
		LIST_INSERT_HEAD(&da->unused_trie, tp, td_hash_le);
		return;
	}

	do {
		da->all_trie_cnt--;
		da->unused_trie_cnt--;
		LIST_REMOVE(tp, td_all_le);
		uma_zfree(trie_zone, tp);
		LIST_FOREACH(tp, &da->unused_trie, td_hash_le)
			if (tp->td_index == da->all_trie_cnt - 1) {
				LIST_REMOVE(tp, td_hash_le);
				break;
			}
	} while (tp != NULL);
}

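/*
 * Editorial note (not part of the upstream file): da->heap is kept
 * sorted by ascending prefix length, so its topmost entry is always
 * the most specific prefix covering the current position in the
 * address space; heap_inject() is a plain insertion-sort step, and
 * duplicate prefix lengths are silently ignored.
 */
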
static void
heap_inject(struct dxr_aux *da, uint32_t start, uint32_t end, uint32_t preflen,
    uint32_t nh)
{
	struct heap_entry *fhp;
	int i;

	for (i = da->heap_index; i >= 0; i--) {
		if (preflen > da->heap[i].preflen)
			break;
		else if (preflen < da->heap[i].preflen)
			da->heap[i + 1] = da->heap[i];
		else
			return;
	}

	fhp = &da->heap[i + 1];
	fhp->preflen = preflen;
	fhp->start = start;
	fhp->end = end;
	fhp->nexthop = nh;
	da->heap_index++;
}

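/*
 * Editorial note (not part of the upstream file): dxr_walk() is the
 * rib_walk_from() callback invoked once per prefix, in lexicographic
 * order, while a chunk is being rebuilt.  It linearizes the prefixes
 * overlapping the current chunk into consecutive range fragments,
 * emitting a new fragment only where the effective next hop changes.
 */
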
static int
dxr_walk(struct rtentry *rt, void *arg)
{
	struct dxr_aux *da = arg;
	uint32_t chunk = da->work_chunk;
	uint32_t first = chunk << DXR_RANGE_SHIFT;
	uint32_t last = first | DXR_RANGE_MASK;
	struct range_entry_long *fp =
	    &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
	struct heap_entry *fhp = &da->heap[da->heap_index];
	uint32_t preflen, nh, start, end, scopeid;
	struct in_addr addr;

	rt_get_inet_prefix_plen(rt, &addr, &preflen, &scopeid);
	start = ntohl(addr.s_addr);
	if (start > last)
		return (-1);	/* Beyond chunk boundaries, we are done */
	if (start < first)
		return (0);	/* Skip this route */

	end = start;
	if (preflen < 32)
		end |= (0xffffffffU >> preflen);
	nh = fib_get_nhop_idx(da->fd, rt_get_raw_nhop(rt));

	if (start == fhp->start)
		heap_inject(da, start, end, preflen, nh);
	else {
		/* start > fhp->start */
		while (start > fhp->end) {
			uint32_t oend = fhp->end;

			if (da->heap_index > 0) {
				fhp--;
				da->heap_index--;
			} else
				initheap(da, fhp->end + 1, chunk);
			if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
				fp++;
				da->rtbl_work_frags++;
				fp->start = (oend + 1) & DXR_RANGE_MASK;
				fp->nexthop = fhp->nexthop;
			}
		}
		if (start > ((chunk << DXR_RANGE_SHIFT) | fp->start) &&
		    nh != fp->nexthop) {
			fp++;
			da->rtbl_work_frags++;
			fp->start = start & DXR_RANGE_MASK;
		} else if (da->rtbl_work_frags) {
			if ((--fp)->nexthop == nh)
				da->rtbl_work_frags--;
			else
				fp++;
		}
		fp->nexthop = nh;
		heap_inject(da, start, end, preflen, nh);
	}

	return (0);
}

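/*
 * Editorial note (not part of the upstream file): update_chunk()
 * rebuilds the range fragments for a single direct table chunk.  When
 * every fragment starts on a 256-address boundary and all next hops
 * fit in RE_SHORT_MAX_NH, the chunk is re-encoded in the half-sized
 * short format; chunks with FRAGS_MARK_HIT or more fragments fall
 * back to the XL format, with the true fragment count stored in the
 * first range table slot.
 */
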
static int
update_chunk(struct dxr_aux *da, uint32_t chunk)
{
	struct range_entry_long *fp;
#if DXR_TRIE_BITS < 24
	struct range_entry_short *fps;
	uint32_t start, nh, i;
#endif
	struct heap_entry *fhp;
	uint32_t first = chunk << DXR_RANGE_SHIFT;
	uint32_t last = first | DXR_RANGE_MASK;

	if (da->direct_tbl[chunk].fragments != FRAGS_MARK_HIT)
		chunk_unref(da, chunk);

	initheap(da, first, chunk);

	fp = &da->range_tbl[da->rtbl_top].re;
	da->rtbl_work_frags = 0;
	fp->start = first & DXR_RANGE_MASK;
	fp->nexthop = da->heap[0].nexthop;

	da->dst.sin_addr.s_addr = htonl(first);
	da->mask.sin_addr.s_addr = htonl(~DXR_RANGE_MASK);

	da->work_chunk = chunk;
	rib_walk_from(da->fibnum, AF_INET, RIB_FLAG_LOCKED,
	    (struct sockaddr *) &da->dst, (struct sockaddr *) &da->mask,
	    dxr_walk, da);

	/* Flush any remaining objects on the heap */
	fp = &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
	fhp = &da->heap[da->heap_index];
	while (fhp->preflen > DXR_TRIE_BITS) {
		uint32_t oend = fhp->end;

		if (da->heap_index > 0) {
			fhp--;
			da->heap_index--;
		} else
			initheap(da, fhp->end + 1, chunk);
		if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
			/* Have we crossed the upper chunk boundary? */
			if (oend >= last)
				break;
			fp++;
			da->rtbl_work_frags++;
			fp->start = (oend + 1) & DXR_RANGE_MASK;
			fp->nexthop = fhp->nexthop;
		}
	}

	/* Direct hit if the chunk contains only a single fragment */
	if (da->rtbl_work_frags == 0) {
		da->direct_tbl[chunk].base = fp->nexthop;
		da->direct_tbl[chunk].fragments = FRAGS_MARK_HIT;
		return (0);
	}

	da->direct_tbl[chunk].base = da->rtbl_top;
	da->direct_tbl[chunk].fragments = da->rtbl_work_frags;

#if DXR_TRIE_BITS < 24
	/* Check whether the chunk can be more compactly encoded */
	fp = &da->range_tbl[da->rtbl_top].re;
	for (i = 0; i <= da->rtbl_work_frags; i++, fp++)
		if ((fp->start & 0xff) != 0 || fp->nexthop > RE_SHORT_MAX_NH)
			break;
	if (i == da->rtbl_work_frags + 1) {
		fp = &da->range_tbl[da->rtbl_top].re;
		fps = (void *) fp;
		for (i = 0; i <= da->rtbl_work_frags; i++, fp++, fps++) {
			start = fp->start;
			nh = fp->nexthop;
			fps->start = start >> 8;
			fps->nexthop = nh;
		}
		fps->start = start >> 8;
		fps->nexthop = nh;
		da->rtbl_work_frags >>= 1;
		da->direct_tbl[chunk].fragments =
		    da->rtbl_work_frags | FRAGS_PREF_SHORT;
	} else
#endif
	if (da->rtbl_work_frags >= FRAGS_MARK_HIT) {
		da->direct_tbl[chunk].fragments = FRAGS_MARK_XL;
		memmove(&da->range_tbl[da->rtbl_top + 1],
		    &da->range_tbl[da->rtbl_top],
		    (da->rtbl_work_frags + 1) * sizeof(*da->range_tbl));
		da->range_tbl[da->rtbl_top].fragments = da->rtbl_work_frags;
		da->rtbl_work_frags++;
	}
	da->rtbl_top += (da->rtbl_work_frags + 1);
	return (chunk_ref(da, chunk));
}

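/*
 * Editorial note (not part of the upstream file): dxr_build()
 * assembles a complete lookup snapshot.  Once the range table is in
 * place it repeatedly re-cuts the trie at dxr2_try_squeeze,
 * decrementing d_bits while the combined d + x + r size keeps
 * shrinking and then stepping back one notch, before copying the
 * winning layout into a single contiguous allocation.
 */
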
static void
dxr_build(struct dxr *dxr)
{
	struct dxr_aux *da = dxr->aux;
	struct chunk_desc *cdp;
	struct rib_rtable_info rinfo;
	struct timeval t0, t1, t2, t3;
	uint32_t r_size, dxr_tot_size;
	uint32_t i, m, range_rebuild = 0;
	uint32_t range_frag;
	struct trie_desc *tp;
	uint32_t d_tbl_size, dxr_x, d_size, x_size;
	uint32_t trie_rebuild = 0, prev_size = 0;
	uint32_t trie_frag;
	int ti;

	MPASS(dxr->d == NULL);

	if (da == NULL) {
		da = malloc(sizeof(*dxr->aux), M_DXRAUX, M_NOWAIT);
		if (da == NULL) {
			FIB_PRINTF(LOG_NOTICE, dxr->fd,
			    "Unable to allocate DXR aux struct");
			return;
		}
		dxr->aux = da;
		da->fibnum = dxr->fibnum;
		da->fd = dxr->fd;
		da->refcnt = 1;
		LIST_INIT(&da->all_chunks);
		LIST_INIT(&da->all_trie);
		da->rtbl_size = RTBL_SIZE_INCR;
		da->range_tbl = NULL;
		da->xtbl_size = XTBL_SIZE_INCR;
		da->x_tbl = NULL;
		bzero(&da->dst, sizeof(da->dst));
		bzero(&da->mask, sizeof(da->mask));
		da->dst.sin_len = sizeof(da->dst);
		da->mask.sin_len = sizeof(da->mask);
		da->dst.sin_family = AF_INET;
		da->mask.sin_family = AF_INET;
	}
	if (da->range_tbl == NULL) {
		da->range_tbl = malloc(sizeof(*da->range_tbl) * da->rtbl_size
		    + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT);
		if (da->range_tbl == NULL) {
			FIB_PRINTF(LOG_NOTICE, da->fd,
			    "Unable to allocate DXR range table");
			return;
		}
		range_rebuild = 1;
	}
	if (da->x_tbl == NULL) {
		da->x_tbl = malloc(sizeof(*da->x_tbl) * da->xtbl_size,
		    M_DXRAUX, M_NOWAIT);
		if (da->x_tbl == NULL) {
			FIB_PRINTF(LOG_NOTICE, da->fd,
			    "Unable to allocate DXR extension table");
			return;
		}
		trie_rebuild = 1;
	}

	microuptime(&t0);

	dxr->nh_tbl = fib_get_nhop_array(da->fd);
	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);

	if (da->updates_low > da->updates_high)
		range_rebuild = 1;

range_build:
	if (range_rebuild) {
		/* Bulk cleanup */
		bzero(da->chunk_hashtbl, sizeof(da->chunk_hashtbl));
		while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
			LIST_REMOVE(cdp, cd_all_le);
			uma_zfree(chunk_zone, cdp);
		}
		for (i = 0; i < UNUSED_BUCKETS; i++)
			LIST_INIT(&da->unused_chunks[i]);
		da->unused_chunks_size = 0;
		da->rtbl_top = 0;
		da->updates_low = 0;
		da->updates_high = DIRECT_TBL_SIZE - 1;
		memset(da->updates_mask, 0xff, sizeof(da->updates_mask));
		for (i = 0; i < DIRECT_TBL_SIZE; i++) {
			da->direct_tbl[i].fragments = FRAGS_MARK_HIT;
			da->direct_tbl[i].base = 0;
		}
	}
	da->prefixes = rinfo.num_prefixes;

	/* DXR: construct direct & range table */
	for (i = da->updates_low; i <= da->updates_high; i++) {
		m = da->updates_mask[i >> 5] >> (i & 0x1f);
		if (m == 0)
			i |= 0x1f;
		else if (m & 1 && update_chunk(da, i) != 0)
			return;
	}

	range_frag = 0;
	if (da->rtbl_top)
		range_frag = da->unused_chunks_size * 10000ULL / da->rtbl_top;
	if (range_frag > V_frag_limit) {
		range_rebuild = 1;
		goto range_build;
	}

	r_size = sizeof(*da->range_tbl) * da->rtbl_top;
	microuptime(&t1);

	if (range_rebuild ||
	    abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1)
		trie_rebuild = 1;

trie_build:
	if (trie_rebuild) {
		da->trie_rebuilt_prefixes = da->prefixes;
		da->d_bits = DXR_D;
		da->updates_low = 0;
		da->updates_high = DIRECT_TBL_SIZE - 1;
		if (!range_rebuild)
			memset(da->updates_mask, 0xff,
			    sizeof(da->updates_mask));
	}

dxr2_try_squeeze:
	if (trie_rebuild) {
		/* Bulk cleanup */
		bzero(da->trietbl, sizeof(da->trietbl));
		bzero(da->trie_hashtbl, sizeof(da->trie_hashtbl));
		while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
			LIST_REMOVE(tp, td_all_le);
			uma_zfree(trie_zone, tp);
		}
		LIST_INIT(&da->unused_trie);
		da->all_trie_cnt = da->unused_trie_cnt = 0;
	}

	/* Populate d_tbl, x_tbl */
	dxr_x = DXR_TRIE_BITS - da->d_bits;
	d_tbl_size = (1 << da->d_bits);

	for (i = da->updates_low >> dxr_x; i <= da->updates_high >> dxr_x;
	    i++) {
		if (!trie_rebuild) {
			m = 0;
			for (int j = 0; j < (1 << dxr_x); j += 32)
				m |= da->updates_mask[((i << dxr_x) + j) >> 5];
			if (m == 0)
				continue;
			trie_unref(da, i);
		}
		ti = trie_ref(da, i);
		if (ti < 0)
			return;
		da->d_tbl[i] = ti;
	}

	trie_frag = 0;
	if (da->all_trie_cnt)
		trie_frag = da->unused_trie_cnt * 10000ULL / da->all_trie_cnt;
	if (trie_frag > V_frag_limit) {
		trie_rebuild = 1;
		goto trie_build;
	}

	d_size = sizeof(*da->d_tbl) * d_tbl_size;
	x_size = sizeof(*da->x_tbl) * DIRECT_TBL_SIZE / d_tbl_size
	    * da->all_trie_cnt;
	dxr_tot_size = d_size + x_size + r_size;

	if (trie_rebuild == 1) {
		/* Try to find a more compact D/X split */
		if (prev_size == 0 || dxr_tot_size <= prev_size)
			da->d_bits--;
		else {
			da->d_bits++;
			trie_rebuild = 2;
		}
		prev_size = dxr_tot_size;
		goto dxr2_try_squeeze;
	}
	microuptime(&t2);

	dxr->d = malloc(dxr_tot_size, M_DXRLPM, M_NOWAIT);
	if (dxr->d == NULL) {
		FIB_PRINTF(LOG_NOTICE, da->fd,
		    "Unable to allocate DXR lookup table");
		return;
	}
	memcpy(dxr->d, da->d_tbl, d_size);
	dxr->x = ((char *) dxr->d) + d_size;
	memcpy(dxr->x, da->x_tbl, x_size);
	dxr->r = ((char *) dxr->x) + x_size;
	dxr->d_shift = 32 - da->d_bits;
	dxr->x_shift = dxr_x;
	dxr->x_mask = 0xffffffffU >> (32 - dxr_x);
	memcpy(dxr->r, da->range_tbl, r_size);

	if (da->updates_low <= da->updates_high)
		bzero(&da->updates_mask[da->updates_low / 32],
		    (da->updates_high - da->updates_low) / 8 + 1);
	da->updates_low = DIRECT_TBL_SIZE - 1;
	da->updates_high = 0;
	microuptime(&t3);

	FIB_PRINTF(LOG_INFO, da->fd, "D%dX%dR, %d prefixes, %d nhops (max)",
	    da->d_bits, dxr_x, rinfo.num_prefixes, rinfo.num_nhops);
	i = dxr_tot_size * 100;
	if (rinfo.num_prefixes)
		i /= rinfo.num_prefixes;
	FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d KBytes, %d.%02d Bytes/prefix",
	    dxr_tot_size / 1024, dxr_tot_size * 100 / 1024 % 100,
	    i / 100, i % 100);
	FIB_PRINTF(LOG_INFO, da->fd,
	    "%d.%02d%% trie, %d.%02d%% range fragmentation",
	    trie_frag / 100, trie_frag % 100,
	    range_frag / 100, range_frag % 100);
	i = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "range table %s in %u.%03u ms",
	    range_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
	i = (t2.tv_sec - t1.tv_sec) * 1000000 + t2.tv_usec - t1.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "trie %s in %u.%03u ms",
	    trie_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
	i = (t3.tv_sec - t2.tv_sec) * 1000000 + t3.tv_usec - t2.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "snapshot forked in %u.%03u ms",
	    i / 1000, i % 1000);
}

/*
 * Glue functions for attaching to the FIB_ALGO infrastructure.
 */

static struct nhop_object *
dxr_fib_lookup(void *algo_data, const struct flm_lookup_key key,
    uint32_t scopeid)
{
	struct dxr *dxr = algo_data;

	return (dxr->nh_tbl[dxr_lookup(dxr, ntohl(key.addr4.s_addr))]);
}

static enum flm_op_result
dxr_init(uint32_t fibnum, struct fib_data *fd, void *old_data, void **data)
{
	struct dxr *old_dxr = old_data;
	struct dxr_aux *da = NULL;
	struct dxr *dxr;

	dxr = malloc(sizeof(*dxr), M_DXRAUX, M_NOWAIT);
	if (dxr == NULL) {
		FIB_PRINTF(LOG_NOTICE, fd,
		    "Unable to allocate DXR container struct");
		return (FLM_REBUILD);
	}

	/* Check whether we may reuse the old auxiliary structures */
	if (old_dxr != NULL && old_dxr->aux != NULL &&
	    old_dxr->aux->fd == fd) {
		da = old_dxr->aux;
		atomic_add_int(&da->refcnt, 1);
	}

	dxr->aux = da;
	dxr->d = NULL;
	dxr->fd = fd;
	dxr->fibnum = fibnum;
	*data = dxr;

	return (FLM_SUCCESS);
}

static void
dxr_destroy(void *data)
{
	struct dxr *dxr = data;
	struct dxr_aux *da = dxr->aux;
	struct chunk_desc *cdp;
	struct trie_desc *tp;

	free(dxr->d, M_DXRLPM);
	free(dxr, M_DXRAUX);

	if (da == NULL || atomic_fetchadd_int(&da->refcnt, -1) > 1)
		return;

	/* Release auxiliary structures */
	while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
		LIST_REMOVE(cdp, cd_all_le);
		uma_zfree(chunk_zone, cdp);
	}
	while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
		LIST_REMOVE(tp, td_all_le);
		uma_zfree(trie_zone, tp);
	}
	free(da->range_tbl, M_DXRAUX);
	free(da->x_tbl, M_DXRAUX);
	free(da, M_DXRAUX);
}

static void
epoch_dxr_destroy(epoch_context_t ctx)
{
	struct dxr *dxr = __containerof(ctx, struct dxr, epoch_ctx);

	dxr_destroy(dxr);
}

static void *
choose_lookup_fn(struct dxr_aux *da)
{

	switch (da->d_bits) {
#if DXR_TRIE_BITS > 16
	case 16:
		return (dxr_fib_lookup_16);
#endif
	case 15:
		return (dxr_fib_lookup_15);
	case 14:
		return (dxr_fib_lookup_14);
	case 13:
		return (dxr_fib_lookup_13);
	case 12:
		return (dxr_fib_lookup_12);
	case 11:
		return (dxr_fib_lookup_11);
	case 10:
		return (dxr_fib_lookup_10);
	case 9:
		return (dxr_fib_lookup_9);
	}
	return (dxr_fib_lookup);
}

static enum flm_op_result
dxr_dump_end(void *data, struct fib_dp *dp)
{
	struct dxr *dxr = data;
	struct dxr_aux *da;

	dxr_build(dxr);

	da = dxr->aux;
	if (da == NULL || dxr->d == NULL)
		return (FLM_REBUILD);

	if (da->rtbl_top >= BASE_MAX)
		return (FLM_ERROR);

	dp->f = choose_lookup_fn(da);
	dp->arg = dxr;

	return (FLM_SUCCESS);
}

static enum flm_op_result
dxr_dump_rib_item(struct rtentry *rt, void *data)
{

	return (FLM_SUCCESS);
}

static enum flm_op_result
dxr_change_rib_item(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *data)
{

	return (FLM_BATCH);
}

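/*
 * Editorial note (not part of the upstream file): batched RIB changes
 * are not patched into the lookup tables in place.  The loop below
 * only marks the affected direct table chunks in updates_mask; a new
 * dxr snapshot is then built, reusing unchanged chunks, and swapped
 * into the datapath, after which the old snapshot is reclaimed via
 * fib_epoch_call().
 */
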
static enum flm_op_result
dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q,
    void *data)
{
	struct dxr *dxr = data;
	struct dxr *new_dxr;
	struct dxr_aux *da;
	struct fib_dp new_dp;
	enum flm_op_result res;
	uint32_t ip, plen, hmask, start, end, i, ui;
#ifdef INVARIANTS
	struct rib_rtable_info rinfo;
	int update_delta = 0;
#endif

	MPASS(data != NULL);
	MPASS(q != NULL);
	MPASS(q->count < q->size);

	da = dxr->aux;
	MPASS(da != NULL);
	MPASS(da->fd == dxr->fd);
	MPASS(da->refcnt > 0);

	FIB_PRINTF(LOG_INFO, da->fd, "processing %d update(s)", q->count);
	for (ui = 0; ui < q->count; ui++) {
#ifdef INVARIANTS
		if (q->entries[ui].nh_new != NULL)
			update_delta++;
		if (q->entries[ui].nh_old != NULL)
			update_delta--;
#endif
		plen = q->entries[ui].plen;
		ip = ntohl(q->entries[ui].addr4.s_addr);
		if (plen < 32)
			hmask = 0xffffffffU >> plen;
		else
			hmask = 0;
		start = (ip & ~hmask) >> DXR_RANGE_SHIFT;
		end = (ip | hmask) >> DXR_RANGE_SHIFT;

		if ((start & 0x1f) == 0 && (end & 0x1f) == 0x1f)
			for (i = start >> 5; i <= end >> 5; i++)
				da->updates_mask[i] = 0xffffffffU;
		else
			for (i = start; i <= end; i++)
				da->updates_mask[i >> 5] |= (1 << (i & 0x1f));
		if (start < da->updates_low)
			da->updates_low = start;
		if (end > da->updates_high)
			da->updates_high = end;
	}

#ifdef INVARIANTS
	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
	MPASS(da->prefixes + update_delta == rinfo.num_prefixes);
#endif

	res = dxr_init(0, dxr->fd, data, (void **) &new_dxr);
	if (res != FLM_SUCCESS)
		return (res);

	dxr_build(new_dxr);

	/* Structural limit exceeded, hard error */
	if (da->rtbl_top >= BASE_MAX) {
		dxr_destroy(new_dxr);
		return (FLM_ERROR);
	}

	if (new_dxr->d == NULL) {
		dxr_destroy(new_dxr);
		return (FLM_REBUILD);
	}

	new_dp.f = choose_lookup_fn(da);
	new_dp.arg = new_dxr;
	if (fib_set_datapath_ptr(dxr->fd, &new_dp)) {
		fib_set_algo_ptr(dxr->fd, new_dxr);
		fib_epoch_call(epoch_dxr_destroy, &dxr->epoch_ctx);
		return (FLM_SUCCESS);
	}

	FIB_PRINTF(LOG_NOTICE, dxr->fd, "fib_set_datapath_ptr() failed");
	dxr_destroy(new_dxr);
	return (FLM_REBUILD);
}

static uint8_t
dxr_get_pref(const struct rib_rtable_info *rinfo)
{

	/* Below bsearch4 up to 10 prefixes.  Always supersedes dpdk_lpm4. */
	return (251);
}

SYSCTL_DECL(_net_route_algo);
SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "DXR tunables");

static int
sysctl_dxr_frag_limit(SYSCTL_HANDLER_ARGS)
{
	char buf[8];
	int error, new, i;

	snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
	    V_frag_limit % 100);
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (!isdigit(*buf) && *buf != '.')
		return (EINVAL);
	for (i = 0, new = 0; isdigit(buf[i]) && i < sizeof(buf); i++)
		new = new * 10 + buf[i] - '0';
	new *= 100;
	if (buf[i++] == '.') {
		if (!isdigit(buf[i]))
			return (EINVAL);
		new += (buf[i++] - '0') * 10;
		if (isdigit(buf[i]))
			new += buf[i++] - '0';
	}
	if (new > 1000)
		return (EINVAL);
	V_frag_limit = new;
	snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
	    V_frag_limit % 100);
	return (0);
}

SYSCTL_PROC(_net_route_algo_dxr, OID_AUTO, frag_limit,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_VNET,
    0, 0, sysctl_dxr_frag_limit, "A",
    "Fragmentation threshold to full rebuild");

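/*
 * Usage note (not part of the upstream file): the handler above backs
 * the net.route.algo.dxr.frag_limit sysctl and accepts a percentage
 * with up to two decimals, e.g. setting "0.50" stores 50 in
 * V_frag_limit, which counts in hundredths of a percent.
 */
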
static struct fib_lookup_module fib_dxr_mod = {
	.flm_name = "dxr",
	.flm_family = AF_INET,
	.flm_init_cb = dxr_init,
	.flm_destroy_cb = dxr_destroy,
	.flm_dump_rib_item_cb = dxr_dump_rib_item,
	.flm_dump_end_cb = dxr_dump_end,
	.flm_change_rib_item_cb = dxr_change_rib_item,
	.flm_change_rib_items_cb = dxr_change_rib_batch,
	.flm_get_pref = dxr_get_pref,
};

static int
dxr_modevent(module_t mod, int type, void *unused)
{
	int error;

	switch (type) {
	case MOD_LOAD:
		chunk_zone = uma_zcreate("dxr chunk", sizeof(struct chunk_desc),
		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
		trie_zone = uma_zcreate("dxr trie", sizeof(struct trie_desc),
		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
		fib_module_register(&fib_dxr_mod);
		return (0);
	case MOD_UNLOAD:
		error = fib_module_unregister(&fib_dxr_mod);
		if (error)
			return (error);
		uma_zdestroy(chunk_zone);
		uma_zdestroy(trie_zone);
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}

static moduledata_t dxr_mod = {"fib_dxr", dxr_modevent, 0};

DECLARE_MODULE(fib_dxr, dxr_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(fib_dxr, 1);