GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/netinet/in_fib_dxr.c

/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2012-2024 Marko Zec
 * Copyright (c) 2005, 2018 University of Zagreb
 * Copyright (c) 2005 International Computer Science Institute
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * An implementation of DXR, a simple IPv4 LPM scheme with compact lookup
 * structures and a trivial search procedure. More significant bits of
 * the search key are used to directly index a two-stage trie, while the
 * remaining bits are used for finding the next hop in a sorted array.
 * More details in:
 *
 * M. Zec, L. Rizzo, M. Mikuc, DXR: towards a billion routing lookups per
 * second in software, ACM SIGCOMM Computer Communication Review, September
 * 2012
 *
 * M. Zec, M. Mikuc, Pushing the envelope: beyond two billion IP routing
 * lookups per second on commodity CPUs, IEEE SoftCOM, September 2017, Split
 */
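
/*
 * Added note (not part of the upstream file): with the default
 * DXR_TRIE_BITS of 20 defined below, a 32-bit lookup key is consumed
 * in two pieces:
 *
 *	chunk  = dst >> DXR_RANGE_SHIFT;	(upper 20 bits, trie index)
 *	offset = dst & DXR_RANGE_MASK;		(lower 12 bits, range key)
 *
 * e.g. dst = 0x0a000105 (10.0.1.5) yields chunk 0xa000 and offset 0x105.
 * The chunk selects a direct_entry; unless that entry marks a direct hit,
 * the offset is binary-searched within the chunk's sorted range array.
 */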

#include <sys/cdefs.h>
#include "opt_inet.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/ctype.h>
#include <sys/epoch.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>

#include <vm/uma.h>

#include <netinet/in.h>
#include <netinet/in_fib.h>

#include <net/route.h>
#include <net/route/route_ctl.h>
#include <net/route/fib_algo.h>

#define DXR_TRIE_BITS 20

CTASSERT(DXR_TRIE_BITS >= 16 && DXR_TRIE_BITS <= 24);

#if DXR_TRIE_BITS > 16
#define DXR_D 16
#else
#define DXR_D (DXR_TRIE_BITS - 1)
#endif

#define D_TBL_SIZE (1 << DXR_D)
#define DIRECT_TBL_SIZE (1 << DXR_TRIE_BITS)
#define DXR_RANGE_MASK (0xffffffffU >> DXR_TRIE_BITS)
#define DXR_RANGE_SHIFT (32 - DXR_TRIE_BITS)

#define DESC_BASE_BITS 22
#define DESC_FRAGMENTS_BITS (32 - DESC_BASE_BITS)
#define BASE_MAX ((1 << DESC_BASE_BITS) - 1)
#define RTBL_SIZE_INCR (BASE_MAX / 64)

#if DXR_TRIE_BITS < 24
#define FRAGS_MASK_SHORT ((1 << (23 - DXR_TRIE_BITS)) - 1)
#else
#define FRAGS_MASK_SHORT 0
#endif
#define FRAGS_PREF_SHORT (((1 << DESC_FRAGMENTS_BITS) - 1) & \
    ~FRAGS_MASK_SHORT)
#define FRAGS_MARK_XL (FRAGS_PREF_SHORT - 1)
#define FRAGS_MARK_HIT (FRAGS_PREF_SHORT - 2)

#define IS_SHORT_FORMAT(x) ((x & FRAGS_PREF_SHORT) == FRAGS_PREF_SHORT)
#define IS_LONG_FORMAT(x) ((x & FRAGS_PREF_SHORT) != FRAGS_PREF_SHORT)
#define IS_XL_FORMAT(x) (x == FRAGS_MARK_XL)

#define RE_SHORT_MAX_NH ((1 << (DXR_TRIE_BITS - 8)) - 1)

#define CHUNK_HASH_BITS 16
#define CHUNK_HASH_SIZE (1 << CHUNK_HASH_BITS)
#define CHUNK_HASH_MASK (CHUNK_HASH_SIZE - 1)

#define TRIE_HASH_BITS 16
#define TRIE_HASH_SIZE (1 << TRIE_HASH_BITS)
#define TRIE_HASH_MASK (TRIE_HASH_SIZE - 1)

#define XTBL_SIZE_INCR (DIRECT_TBL_SIZE / 16)

#define UNUSED_BUCKETS 8

/* Lookup structure elements */

struct direct_entry {
	uint32_t fragments: DESC_FRAGMENTS_BITS,
	    base: DESC_BASE_BITS;
};

struct range_entry_long {
	uint32_t start: DXR_RANGE_SHIFT,
	    nexthop: DXR_TRIE_BITS;
};

#if DXR_TRIE_BITS < 24
struct range_entry_short {
	uint16_t start: DXR_RANGE_SHIFT - 8,
	    nexthop: DXR_TRIE_BITS - 8;
};
#endif
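
/*
 * Added note (derived from the definitions above, not upstream text):
 * with DXR_TRIE_BITS at 20, a range_entry_long packs a 12-bit start
 * offset and a 20-bit nexthop index into 32 bits, while a
 * range_entry_short packs a 4-bit start (offsets divisible by 256 only)
 * and a 12-bit nexthop into 16 bits, halving the per-fragment cost.
 * The fragments field of a direct_entry either counts the fragments in
 * a chunk or holds a reserved marker: FRAGS_MARK_HIT (single next hop,
 * stored directly in base) or FRAGS_MARK_XL (fragment count too large
 * for the bitfield, kept in the chunk's first range table slot instead).
 */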

/* Auxiliary structures */

struct heap_entry {
	uint32_t start;
	uint32_t end;
	uint32_t preflen;
	uint32_t nexthop;
};

struct chunk_desc {
	LIST_ENTRY(chunk_desc) cd_all_le;
	LIST_ENTRY(chunk_desc) cd_hash_le;
	uint32_t cd_hash;
	uint32_t cd_refcnt;
	uint32_t cd_base;
	uint32_t cd_cur_size;
	uint32_t cd_max_size;
};

struct trie_desc {
	LIST_ENTRY(trie_desc) td_all_le;
	LIST_ENTRY(trie_desc) td_hash_le;
	uint32_t td_hash;
	uint32_t td_index;
	uint32_t td_refcnt;
};

struct dxr_aux {
	/* Glue to external state */
	struct fib_data *fd;
	uint32_t fibnum;
	int refcnt;

	/* Auxiliary build-time tables */
	struct direct_entry direct_tbl[DIRECT_TBL_SIZE];
	uint16_t d_tbl[D_TBL_SIZE];
	struct direct_entry *x_tbl;
	union {
		struct range_entry_long re;
		uint32_t fragments;
	} *range_tbl;

	/* Auxiliary internal state */
	uint32_t updates_mask[DIRECT_TBL_SIZE / 32];
	struct trie_desc *trietbl[D_TBL_SIZE];
	LIST_HEAD(, chunk_desc) chunk_hashtbl[CHUNK_HASH_SIZE];
	LIST_HEAD(, chunk_desc) all_chunks;
	LIST_HEAD(, chunk_desc) unused_chunks[UNUSED_BUCKETS];
	LIST_HEAD(, trie_desc) trie_hashtbl[TRIE_HASH_SIZE];
	LIST_HEAD(, trie_desc) all_trie;
	LIST_HEAD(, trie_desc) unused_trie; /* abuses hash link entry */
	struct sockaddr_in dst;
	struct sockaddr_in mask;
	struct heap_entry heap[33];
	uint32_t prefixes;
	uint32_t updates_low;
	uint32_t updates_high;
	uint32_t unused_chunks_size;
	uint32_t xtbl_size;
	uint32_t all_trie_cnt;
	uint32_t unused_trie_cnt;
	uint32_t trie_rebuilt_prefixes;
	uint32_t heap_index;
	uint32_t d_bits;
	uint32_t rtbl_size;
	uint32_t rtbl_top;
	uint32_t rtbl_work_frags;
	uint32_t work_chunk;
};

/* Main lookup structure container */

struct dxr {
	/* Lookup tables */
	void *d;
	void *x;
	void *r;
	struct nhop_object **nh_tbl;

	/* Glue to external state */
	struct dxr_aux *aux;
	struct fib_data *fd;
	struct epoch_context epoch_ctx;
	uint32_t fibnum;
	uint16_t d_shift;
	uint16_t x_shift;
	uint32_t x_mask;
};

static MALLOC_DEFINE(M_DXRLPM, "dxr", "DXR LPM");
static MALLOC_DEFINE(M_DXRAUX, "dxr aux", "DXR auxiliary");

uma_zone_t chunk_zone;
uma_zone_t trie_zone;

VNET_DEFINE_STATIC(int, frag_limit) = 100;
#define V_frag_limit VNET(frag_limit)

/* Binary search for a matching address range */
#define DXR_LOOKUP_STAGE \
	if (masked_dst < range[middle].start) { \
		upperbound = middle; \
		middle = (middle + lowerbound) / 2; \
	} else if (masked_dst < range[middle + 1].start) \
		return (range[middle].nexthop); \
	else { \
		lowerbound = middle + 1; \
		middle = (upperbound + middle + 1) / 2; \
	} \
	if (upperbound == lowerbound) \
		return (range[lowerbound].nexthop);
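
/*
 * Added note (not upstream text): each expansion of DXR_LOOKUP_STAGE
 * performs one halving step of a binary search over
 * [lowerbound, upperbound] and returns as soon as the matching range is
 * found.  The callers below expand it twice per loop iteration, a small
 * manual unroll that saves a branch back to the loop head.
 */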

static int
range_lookup(struct range_entry_long *rt, struct direct_entry de, uint32_t dst)
{
	uint32_t base;
	uint32_t upperbound;
	uint32_t middle;
	uint32_t lowerbound;
	uint32_t masked_dst;

	base = de.base;
	lowerbound = 0;
	masked_dst = dst & DXR_RANGE_MASK;

#if DXR_TRIE_BITS < 24
	if (__predict_true(IS_SHORT_FORMAT(de.fragments))) {
		upperbound = de.fragments & FRAGS_MASK_SHORT;
		struct range_entry_short *range =
		    (struct range_entry_short *) &rt[base];

		masked_dst >>= 8;
		middle = upperbound;
		upperbound = upperbound * 2 + 1;

		for (;;) {
			DXR_LOOKUP_STAGE
			DXR_LOOKUP_STAGE
		}
	}
#endif

	upperbound = de.fragments;
	middle = upperbound / 2;
	struct range_entry_long *range = &rt[base];
	if (__predict_false(IS_XL_FORMAT(de.fragments))) {
		upperbound = *((uint32_t *) range);
		range++;
		middle = upperbound / 2;
	}

	for (;;) {
		DXR_LOOKUP_STAGE
		DXR_LOOKUP_STAGE
	}
}

#define DXR_LOOKUP_DEFINE(D) \
static int inline \
dxr_lookup_##D(struct dxr *dxr, uint32_t dst) \
{ \
	struct direct_entry de; \
	uint16_t *dt = dxr->d; \
	struct direct_entry *xt = dxr->x; \
	\
	de = xt[(dt[dst >> (32 - (D))] << (DXR_TRIE_BITS - (D))) \
	    + ((dst >> DXR_RANGE_SHIFT) & \
	    (0xffffffffU >> (32 - DXR_TRIE_BITS + (D))))]; \
	if (__predict_true(de.fragments == FRAGS_MARK_HIT)) \
		return (de.base); \
	return (range_lookup(dxr->r, de, dst)); \
} \
\
static struct nhop_object * \
dxr_fib_lookup_##D(void *algo_data, \
    const struct flm_lookup_key key, uint32_t scopeid __unused) \
{ \
	struct dxr *dxr = algo_data; \
	\
	return (dxr->nh_tbl[dxr_lookup_##D(dxr, \
	    ntohl(key.addr4.s_addr))]); \
}

#if DXR_TRIE_BITS > 16
DXR_LOOKUP_DEFINE(16)
#endif
DXR_LOOKUP_DEFINE(15)
DXR_LOOKUP_DEFINE(14)
DXR_LOOKUP_DEFINE(13)
DXR_LOOKUP_DEFINE(12)
DXR_LOOKUP_DEFINE(11)
DXR_LOOKUP_DEFINE(10)
DXR_LOOKUP_DEFINE(9)

static int inline
dxr_lookup(struct dxr *dxr, uint32_t dst)
{
	struct direct_entry de;
	uint16_t *dt = dxr->d;
	struct direct_entry *xt = dxr->x;

	de = xt[(dt[dst >> dxr->d_shift] << dxr->x_shift) +
	    ((dst >> DXR_RANGE_SHIFT) & dxr->x_mask)];
	if (__predict_true(de.fragments == FRAGS_MARK_HIT))
		return (de.base);
	return (range_lookup(dxr->r, de, dst));
}
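
/*
 * Added note (not upstream text): dxr_lookup() above is the generic
 * form of the lookup, parameterized at run time by d_shift, x_shift and
 * x_mask.  The DXR_LOOKUP_DEFINE() instances compile the same
 * computation with the direct-table width D hard-coded, letting
 * choose_lookup_fn() hand the datapath a constant-folded variant that
 * matches the d_bits value chosen by the most recent build.
 */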

static void
initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk)
{
	struct heap_entry *fhp = &da->heap[0];
	struct rtentry *rt;
	struct route_nhop_data rnd;

	da->heap_index = 0;
	da->dst.sin_addr.s_addr = htonl(dst_u32);
	rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED,
	    &rnd);
	if (rt != NULL) {
		struct in_addr addr;
		uint32_t scopeid;

		rt_get_inet_prefix_plen(rt, &addr, &fhp->preflen, &scopeid);
		fhp->start = ntohl(addr.s_addr);
		fhp->end = fhp->start;
		if (fhp->preflen < 32)
			fhp->end |= (0xffffffffU >> fhp->preflen);
		fhp->nexthop = fib_get_nhop_idx(da->fd, rnd.rnd_nhop);
	} else {
		fhp->preflen = fhp->nexthop = fhp->start = 0;
		fhp->end = 0xffffffffU;
	}
}

static uint32_t
chunk_size(struct dxr_aux *da, struct direct_entry *fdesc)
{

	if (IS_SHORT_FORMAT(fdesc->fragments))
		return ((fdesc->fragments & FRAGS_MASK_SHORT) + 1);
	else if (IS_XL_FORMAT(fdesc->fragments))
		return (da->range_tbl[fdesc->base].fragments + 2);
	else /* if (IS_LONG_FORMAT(fdesc->fragments)) */
		return (fdesc->fragments + 1);
}

static uint32_t
chunk_hash(struct dxr_aux *da, struct direct_entry *fdesc)
{
	uint32_t size = chunk_size(da, fdesc);
	uint32_t *p = (uint32_t *) &da->range_tbl[fdesc->base];
	uint32_t *l = (uint32_t *) &da->range_tbl[fdesc->base + size];
	uint32_t hash = fdesc->fragments;

	for (; p < l; p++)
		hash = (hash << 7) + (hash >> 13) + *p;

	return (hash + (hash >> 16));
}
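
/*
 * Added note (not upstream text): chunk_ref() below uses chunk_hash()
 * above to implement chunk deduplication.  A freshly built chunk
 * sitting at the top of the range table is hashed and compared against
 * existing chunks; on a match the new copy is discarded (rtbl_top is
 * rolled back) and the direct table entry points at the shared copy,
 * whose reference count is bumped.
 */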

static int
chunk_ref(struct dxr_aux *da, uint32_t chunk)
{
	struct direct_entry *fdesc = &da->direct_tbl[chunk];
	struct chunk_desc *cdp, *empty_cdp;
	uint32_t base = fdesc->base;
	uint32_t size = chunk_size(da, fdesc);
	uint32_t hash = chunk_hash(da, fdesc);
	int i;

	/* Find an existing descriptor */
	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
	    cd_hash_le) {
		if (cdp->cd_hash != hash || cdp->cd_cur_size != size ||
		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
		    sizeof(struct range_entry_long) * size))
			continue;
		da->rtbl_top = fdesc->base;
		fdesc->base = cdp->cd_base;
		cdp->cd_refcnt++;
		return (0);
	}

	/* No matching chunks found. Find an empty one to recycle. */
	for (cdp = NULL, i = size; cdp == NULL && i < UNUSED_BUCKETS; i++)
		cdp = LIST_FIRST(&da->unused_chunks[i]);

	if (cdp == NULL)
		LIST_FOREACH(empty_cdp, &da->unused_chunks[0], cd_hash_le)
			if (empty_cdp->cd_max_size >= size && (cdp == NULL ||
			    empty_cdp->cd_max_size < cdp->cd_max_size)) {
				cdp = empty_cdp;
				if (empty_cdp->cd_max_size == size)
					break;
			}

	if (cdp != NULL) {
		/* Copy from heap into the recycled chunk */
		bcopy(&da->range_tbl[fdesc->base], &da->range_tbl[cdp->cd_base],
		    size * sizeof(struct range_entry_long));
		fdesc->base = cdp->cd_base;
		da->rtbl_top -= size;
		da->unused_chunks_size -= cdp->cd_max_size;
		if (cdp->cd_max_size > size) {
			/* Split the range in two, need a new descriptor */
			empty_cdp = uma_zalloc(chunk_zone, M_NOWAIT);
			if (empty_cdp == NULL)
				return (1);
			LIST_INSERT_BEFORE(cdp, empty_cdp, cd_all_le);
			empty_cdp->cd_base = cdp->cd_base + size;
			empty_cdp->cd_cur_size = 0;
			empty_cdp->cd_max_size = cdp->cd_max_size - size;

			i = empty_cdp->cd_max_size;
			if (i >= UNUSED_BUCKETS)
				i = 0;
			LIST_INSERT_HEAD(&da->unused_chunks[i], empty_cdp,
			    cd_hash_le);

			da->unused_chunks_size += empty_cdp->cd_max_size;
			cdp->cd_max_size = size;
		}
		LIST_REMOVE(cdp, cd_hash_le);
	} else {
		/* Alloc a new descriptor at the top of the heap*/
		cdp = uma_zalloc(chunk_zone, M_NOWAIT);
		if (cdp == NULL)
			return (1);
		cdp->cd_max_size = size;
		cdp->cd_base = fdesc->base;
		LIST_INSERT_HEAD(&da->all_chunks, cdp, cd_all_le);
		MPASS(cdp->cd_base + cdp->cd_max_size == da->rtbl_top);
	}

	cdp->cd_hash = hash;
	cdp->cd_refcnt = 1;
	cdp->cd_cur_size = size;
	LIST_INSERT_HEAD(&da->chunk_hashtbl[hash & CHUNK_HASH_MASK], cdp,
	    cd_hash_le);
	if (da->rtbl_top >= da->rtbl_size) {
		if (da->rtbl_top >= BASE_MAX) {
			FIB_PRINTF(LOG_ERR, da->fd,
			    "structural limit exceeded at %d "
			    "range table elements", da->rtbl_top);
			return (1);
		}
		da->rtbl_size += RTBL_SIZE_INCR;
		i = (BASE_MAX - da->rtbl_top) * LOG_DEBUG / BASE_MAX;
		FIB_PRINTF(i, da->fd, "range table at %d%% structural limit",
		    da->rtbl_top * 100 / BASE_MAX);
		da->range_tbl = realloc(da->range_tbl,
		    sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT,
		    M_DXRAUX, M_NOWAIT);
		if (da->range_tbl == NULL) {
			FIB_PRINTF(LOG_NOTICE, da->fd,
			    "Unable to allocate DXR range table");
			return (1);
		}
	}

	return (0);
}

static void
chunk_unref(struct dxr_aux *da, uint32_t chunk)
{
	struct direct_entry *fdesc = &da->direct_tbl[chunk];
	struct chunk_desc *cdp, *cdp2;
	uint32_t base = fdesc->base;
	uint32_t size = chunk_size(da, fdesc);
	uint32_t hash = chunk_hash(da, fdesc);
	int i;

	/* Find the corresponding descriptor */
	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
	    cd_hash_le)
		if (cdp->cd_hash == hash && cdp->cd_cur_size == size &&
		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
		    sizeof(struct range_entry_long) * size) == 0)
			break;

	MPASS(cdp != NULL);
	if (--cdp->cd_refcnt > 0)
		return;

	LIST_REMOVE(cdp, cd_hash_le);
	da->unused_chunks_size += cdp->cd_max_size;
	cdp->cd_cur_size = 0;

	/* Attempt to merge with the preceding chunk, if empty */
	cdp2 = LIST_NEXT(cdp, cd_all_le);
	if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
		MPASS(cdp2->cd_base + cdp2->cd_max_size == cdp->cd_base);
		LIST_REMOVE(cdp, cd_all_le);
		LIST_REMOVE(cdp2, cd_hash_le);
		cdp2->cd_max_size += cdp->cd_max_size;
		uma_zfree(chunk_zone, cdp);
		cdp = cdp2;
	}

	/* Attempt to merge with the subsequent chunk, if empty */
	cdp2 = LIST_PREV(cdp, &da->all_chunks, chunk_desc, cd_all_le);
	if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
		MPASS(cdp->cd_base + cdp->cd_max_size == cdp2->cd_base);
		LIST_REMOVE(cdp, cd_all_le);
		LIST_REMOVE(cdp2, cd_hash_le);
		cdp2->cd_max_size += cdp->cd_max_size;
		cdp2->cd_base = cdp->cd_base;
		uma_zfree(chunk_zone, cdp);
		cdp = cdp2;
	}

	if (cdp->cd_base + cdp->cd_max_size == da->rtbl_top) {
		/* Free the chunk on the top of the range heap, trim the heap */
		MPASS(cdp == LIST_FIRST(&da->all_chunks));
		da->rtbl_top -= cdp->cd_max_size;
		da->unused_chunks_size -= cdp->cd_max_size;
		LIST_REMOVE(cdp, cd_all_le);
		uma_zfree(chunk_zone, cdp);
		return;
	}

	i = cdp->cd_max_size;
	if (i >= UNUSED_BUCKETS)
		i = 0;
	LIST_INSERT_HEAD(&da->unused_chunks[i], cdp, cd_hash_le);
}
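
/*
 * Added note (not upstream text): when a chunk's reference count drops
 * to zero, chunk_unref() parks its space on a per-size free list
 * (unused_chunks[]), merging it with empty neighbours and trimming the
 * top of the range table where possible, so that later chunk_ref()
 * calls can recycle the space instead of growing the table.
 */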

static uint32_t
trie_hash(struct dxr_aux *da, uint32_t dxr_x, uint32_t index)
{
	uint32_t i, *val;
	uint32_t hash = 0;

	for (i = 0; i < (1 << dxr_x); i++) {
		hash = (hash << 3) ^ (hash >> 3);
		val = (uint32_t *)
		    (void *) &da->direct_tbl[(index << dxr_x) + i];
		hash += (*val << 5);
		hash += (*val >> 5);
	}

	return (hash + (hash >> 16));
}
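
/*
 * Added note (not upstream text): the same deduplication idea is
 * applied one level up.  trie_ref() below hashes a group of 2^dxr_x
 * consecutive direct_tbl entries and shares identical groups through
 * x_tbl, while d_tbl stores the 16-bit index of the shared group for
 * each of the 2^d_bits first-level slots.
 */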

static int
trie_ref(struct dxr_aux *da, uint32_t index)
{
	struct trie_desc *tp;
	uint32_t dxr_d = da->d_bits;
	uint32_t dxr_x = DXR_TRIE_BITS - dxr_d;
	uint32_t hash = trie_hash(da, dxr_x, index);

	/* Find an existing descriptor */
	LIST_FOREACH(tp, &da->trie_hashtbl[hash & TRIE_HASH_MASK], td_hash_le)
		if (tp->td_hash == hash &&
		    memcmp(&da->direct_tbl[index << dxr_x],
		    &da->x_tbl[tp->td_index << dxr_x],
		    sizeof(*da->x_tbl) << dxr_x) == 0) {
			tp->td_refcnt++;
			da->trietbl[index] = tp;
			return(tp->td_index);
		}

	tp = LIST_FIRST(&da->unused_trie);
	if (tp != NULL) {
		LIST_REMOVE(tp, td_hash_le);
		da->unused_trie_cnt--;
	} else {
		tp = uma_zalloc(trie_zone, M_NOWAIT);
		if (tp == NULL)
			return (-1);
		LIST_INSERT_HEAD(&da->all_trie, tp, td_all_le);
		tp->td_index = da->all_trie_cnt++;
	}

	tp->td_hash = hash;
	tp->td_refcnt = 1;
	LIST_INSERT_HEAD(&da->trie_hashtbl[hash & TRIE_HASH_MASK], tp,
	    td_hash_le);
	memcpy(&da->x_tbl[tp->td_index << dxr_x],
	    &da->direct_tbl[index << dxr_x], sizeof(*da->x_tbl) << dxr_x);
	da->trietbl[index] = tp;
	if (da->all_trie_cnt >= da->xtbl_size >> dxr_x) {
		da->xtbl_size += XTBL_SIZE_INCR;
		da->x_tbl = realloc(da->x_tbl,
		    sizeof(*da->x_tbl) * da->xtbl_size, M_DXRAUX, M_NOWAIT);
		if (da->x_tbl == NULL) {
			FIB_PRINTF(LOG_NOTICE, da->fd,
			    "Unable to allocate DXR extension table");
			return (-1);
		}
	}
	return(tp->td_index);
}

static void
trie_unref(struct dxr_aux *da, uint32_t index)
{
	struct trie_desc *tp = da->trietbl[index];

	if (tp == NULL)
		return;
	da->trietbl[index] = NULL;
	if (--tp->td_refcnt > 0)
		return;

	LIST_REMOVE(tp, td_hash_le);
	da->unused_trie_cnt++;
	if (tp->td_index != da->all_trie_cnt - 1) {
		LIST_INSERT_HEAD(&da->unused_trie, tp, td_hash_le);
		return;
	}

	do {
		da->all_trie_cnt--;
		da->unused_trie_cnt--;
		LIST_REMOVE(tp, td_all_le);
		uma_zfree(trie_zone, tp);
		LIST_FOREACH(tp, &da->unused_trie, td_hash_le)
			if (tp->td_index == da->all_trie_cnt - 1) {
				LIST_REMOVE(tp, td_hash_le);
				break;
			}
	} while (tp != NULL);
}

static void
heap_inject(struct dxr_aux *da, uint32_t start, uint32_t end, uint32_t preflen,
    uint32_t nh)
{
	struct heap_entry *fhp;
	int i;

	for (i = da->heap_index; i >= 0; i--) {
		if (preflen > da->heap[i].preflen)
			break;
		else if (preflen < da->heap[i].preflen)
			da->heap[i + 1] = da->heap[i];
		else
			return;
	}

	fhp = &da->heap[i + 1];
	fhp->preflen = preflen;
	fhp->start = start;
	fhp->end = end;
	fhp->nexthop = nh;
	da->heap_index++;
}
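
/*
 * Added note (not upstream text): heap[] acts as a stack of currently
 * open (nested) prefixes ordered by prefix length, with room for /0
 * through /32, i.e. at most 33 entries.  heap_inject() places a prefix
 * at its position in that order; dxr_walk() relies on this while
 * sweeping a chunk's address range upwards, so the topmost entry always
 * describes the most specific prefix covering the current address.
 */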

static int
dxr_walk(struct rtentry *rt, void *arg)
{
	struct dxr_aux *da = arg;
	uint32_t chunk = da->work_chunk;
	uint32_t first = chunk << DXR_RANGE_SHIFT;
	uint32_t last = first | DXR_RANGE_MASK;
	struct range_entry_long *fp =
	    &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
	struct heap_entry *fhp = &da->heap[da->heap_index];
	uint32_t preflen, nh, start, end, scopeid;
	struct in_addr addr;

	rt_get_inet_prefix_plen(rt, &addr, &preflen, &scopeid);
	start = ntohl(addr.s_addr);
	if (start > last)
		return (-1); /* Beyond chunk boundaries, we are done */
	if (start < first)
		return (0); /* Skip this route */

	end = start;
	if (preflen < 32)
		end |= (0xffffffffU >> preflen);
	nh = fib_get_nhop_idx(da->fd, rt_get_raw_nhop(rt));

	if (start == fhp->start)
		heap_inject(da, start, end, preflen, nh);
	else {
		/* start > fhp->start */
		while (start > fhp->end) {
			uint32_t oend = fhp->end;

			if (da->heap_index > 0) {
				fhp--;
				da->heap_index--;
			} else
				initheap(da, fhp->end + 1, chunk);
			if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
				fp++;
				da->rtbl_work_frags++;
				fp->start = (oend + 1) & DXR_RANGE_MASK;
				fp->nexthop = fhp->nexthop;
			}
		}
		if (start > ((chunk << DXR_RANGE_SHIFT) | fp->start) &&
		    nh != fp->nexthop) {
			fp++;
			da->rtbl_work_frags++;
			fp->start = start & DXR_RANGE_MASK;
		} else if (da->rtbl_work_frags) {
			if ((--fp)->nexthop == nh)
				da->rtbl_work_frags--;
			else
				fp++;
		}
		fp->nexthop = nh;
		heap_inject(da, start, end, preflen, nh);
	}

	return (0);
}

static int
update_chunk(struct dxr_aux *da, uint32_t chunk)
{
	struct range_entry_long *fp;
#if DXR_TRIE_BITS < 24
	struct range_entry_short *fps;
	uint32_t start, nh, i;
#endif
	struct heap_entry *fhp;
	uint32_t first = chunk << DXR_RANGE_SHIFT;
	uint32_t last = first | DXR_RANGE_MASK;

	if (da->direct_tbl[chunk].fragments != FRAGS_MARK_HIT)
		chunk_unref(da, chunk);

	initheap(da, first, chunk);

	fp = &da->range_tbl[da->rtbl_top].re;
	da->rtbl_work_frags = 0;
	fp->start = first & DXR_RANGE_MASK;
	fp->nexthop = da->heap[0].nexthop;

	da->dst.sin_addr.s_addr = htonl(first);
	da->mask.sin_addr.s_addr = htonl(~DXR_RANGE_MASK);

	da->work_chunk = chunk;
	rib_walk_from(da->fibnum, AF_INET, RIB_FLAG_LOCKED,
	    (struct sockaddr *) &da->dst, (struct sockaddr *) &da->mask,
	    dxr_walk, da);

	/* Flush any remaining objects on the heap */
	fp = &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
	fhp = &da->heap[da->heap_index];
	while (fhp->preflen > DXR_TRIE_BITS) {
		uint32_t oend = fhp->end;

		if (da->heap_index > 0) {
			fhp--;
			da->heap_index--;
		} else
			initheap(da, fhp->end + 1, chunk);
		if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
			/* Have we crossed the upper chunk boundary? */
			if (oend >= last)
				break;
			fp++;
			da->rtbl_work_frags++;
			fp->start = (oend + 1) & DXR_RANGE_MASK;
			fp->nexthop = fhp->nexthop;
		}
	}

	/* Direct hit if the chunk contains only a single fragment */
	if (da->rtbl_work_frags == 0) {
		da->direct_tbl[chunk].base = fp->nexthop;
		da->direct_tbl[chunk].fragments = FRAGS_MARK_HIT;
		return (0);
	}

	da->direct_tbl[chunk].base = da->rtbl_top;
	da->direct_tbl[chunk].fragments = da->rtbl_work_frags;

#if DXR_TRIE_BITS < 24
	/* Check whether the chunk can be more compactly encoded */
	fp = &da->range_tbl[da->rtbl_top].re;
	for (i = 0; i <= da->rtbl_work_frags; i++, fp++)
		if ((fp->start & 0xff) != 0 || fp->nexthop > RE_SHORT_MAX_NH)
			break;
	if (i == da->rtbl_work_frags + 1) {
		fp = &da->range_tbl[da->rtbl_top].re;
		fps = (void *) fp;
		for (i = 0; i <= da->rtbl_work_frags; i++, fp++, fps++) {
			start = fp->start;
			nh = fp->nexthop;
			fps->start = start >> 8;
			fps->nexthop = nh;
		}
		fps->start = start >> 8;
		fps->nexthop = nh;
		da->rtbl_work_frags >>= 1;
		da->direct_tbl[chunk].fragments =
		    da->rtbl_work_frags | FRAGS_PREF_SHORT;
	} else
#endif
	if (da->rtbl_work_frags >= FRAGS_MARK_HIT) {
		da->direct_tbl[chunk].fragments = FRAGS_MARK_XL;
		memmove(&da->range_tbl[da->rtbl_top + 1],
		    &da->range_tbl[da->rtbl_top],
		    (da->rtbl_work_frags + 1) * sizeof(*da->range_tbl));
		da->range_tbl[da->rtbl_top].fragments = da->rtbl_work_frags;
		da->rtbl_work_frags++;
	}
	da->rtbl_top += (da->rtbl_work_frags + 1);
	return (chunk_ref(da, chunk));
}
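
/*
 * Added note (not upstream text): update_chunk() above picks the
 * cheapest encoding for the chunk it has just rebuilt: a single next
 * hop becomes a direct hit (FRAGS_MARK_HIT, no range table entries),
 * fragments which all start on 256-address boundaries and whose
 * nexthops fit RE_SHORT_MAX_NH are rewritten in the 16-bit short format
 * (halving rtbl_work_frags), oversized chunks are tagged FRAGS_MARK_XL
 * with the true count stored in the first range table slot, and
 * everything else stays in the 32-bit long format.
 */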

static void
dxr_build(struct dxr *dxr)
{
	struct dxr_aux *da = dxr->aux;
	struct chunk_desc *cdp;
	struct rib_rtable_info rinfo;
	struct timeval t0, t1, t2, t3;
	uint32_t r_size, dxr_tot_size;
	uint32_t i, m, range_rebuild = 0;
	uint32_t range_frag;
	struct trie_desc *tp;
	uint32_t d_tbl_size, dxr_x, d_size, x_size;
	uint32_t ti, trie_rebuild = 0, prev_size = 0;
	uint32_t trie_frag;

	MPASS(dxr->d == NULL);

	if (da == NULL) {
		da = malloc(sizeof(*dxr->aux), M_DXRAUX, M_NOWAIT);
		if (da == NULL) {
			FIB_PRINTF(LOG_NOTICE, dxr->fd,
			    "Unable to allocate DXR aux struct");
			return;
		}
		dxr->aux = da;
		da->fibnum = dxr->fibnum;
		da->fd = dxr->fd;
		da->refcnt = 1;
		LIST_INIT(&da->all_chunks);
		LIST_INIT(&da->all_trie);
		da->rtbl_size = RTBL_SIZE_INCR;
		da->range_tbl = NULL;
		da->xtbl_size = XTBL_SIZE_INCR;
		da->x_tbl = NULL;
		bzero(&da->dst, sizeof(da->dst));
		bzero(&da->mask, sizeof(da->mask));
		da->dst.sin_len = sizeof(da->dst);
		da->mask.sin_len = sizeof(da->mask);
		da->dst.sin_family = AF_INET;
		da->mask.sin_family = AF_INET;
	}
	if (da->range_tbl == NULL) {
		da->range_tbl = malloc(sizeof(*da->range_tbl) * da->rtbl_size
		    + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT);
		if (da->range_tbl == NULL) {
			FIB_PRINTF(LOG_NOTICE, da->fd,
			    "Unable to allocate DXR range table");
			return;
		}
		range_rebuild = 1;
	}
	if (da->x_tbl == NULL) {
		da->x_tbl = malloc(sizeof(*da->x_tbl) * da->xtbl_size,
		    M_DXRAUX, M_NOWAIT);
		if (da->x_tbl == NULL) {
			FIB_PRINTF(LOG_NOTICE, da->fd,
			    "Unable to allocate DXR extension table");
			return;
		}
		trie_rebuild = 1;
	}

	microuptime(&t0);

	dxr->nh_tbl = fib_get_nhop_array(da->fd);
	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);

	if (da->updates_low > da->updates_high)
		range_rebuild = 1;

range_build:
	if (range_rebuild) {
		/* Bulk cleanup */
		bzero(da->chunk_hashtbl, sizeof(da->chunk_hashtbl));
		while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
			LIST_REMOVE(cdp, cd_all_le);
			uma_zfree(chunk_zone, cdp);
		}
		for (i = 0; i < UNUSED_BUCKETS; i++)
			LIST_INIT(&da->unused_chunks[i]);
		da->unused_chunks_size = 0;
		da->rtbl_top = 0;
		da->updates_low = 0;
		da->updates_high = DIRECT_TBL_SIZE - 1;
		memset(da->updates_mask, 0xff, sizeof(da->updates_mask));
		for (i = 0; i < DIRECT_TBL_SIZE; i++) {
			da->direct_tbl[i].fragments = FRAGS_MARK_HIT;
			da->direct_tbl[i].base = 0;
		}
	}
	da->prefixes = rinfo.num_prefixes;

	/* DXR: construct direct & range table */
	for (i = da->updates_low; i <= da->updates_high; i++) {
		m = da->updates_mask[i >> 5] >> (i & 0x1f);
		if (m == 0)
			i |= 0x1f;
		else if (m & 1 && update_chunk(da, i) != 0)
			return;
	}

	range_frag = 0;
	if (da->rtbl_top)
		range_frag = da->unused_chunks_size * 10000ULL / da->rtbl_top;
	if (range_frag > V_frag_limit) {
		range_rebuild = 1;
		goto range_build;
	}

	r_size = sizeof(*da->range_tbl) * da->rtbl_top;
	microuptime(&t1);

	if (range_rebuild ||
	    abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1)
		trie_rebuild = 1;

trie_build:
	if (trie_rebuild) {
		da->trie_rebuilt_prefixes = da->prefixes;
		da->d_bits = DXR_D;
		da->updates_low = 0;
		da->updates_high = DIRECT_TBL_SIZE - 1;
		if (!range_rebuild)
			memset(da->updates_mask, 0xff,
			    sizeof(da->updates_mask));
	}

dxr2_try_squeeze:
	if (trie_rebuild) {
		/* Bulk cleanup */
		bzero(da->trietbl, sizeof(da->trietbl));
		bzero(da->trie_hashtbl, sizeof(da->trie_hashtbl));
		while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
			LIST_REMOVE(tp, td_all_le);
			uma_zfree(trie_zone, tp);
		}
		LIST_INIT(&da->unused_trie);
		da->all_trie_cnt = da->unused_trie_cnt = 0;
	}

	/* Populate d_tbl, x_tbl */
	dxr_x = DXR_TRIE_BITS - da->d_bits;
	d_tbl_size = (1 << da->d_bits);

	for (i = da->updates_low >> dxr_x; i <= da->updates_high >> dxr_x;
	    i++) {
		if (!trie_rebuild) {
			m = 0;
			for (int j = 0; j < (1 << dxr_x); j += 32)
				m |= da->updates_mask[((i << dxr_x) + j) >> 5];
			if (m == 0)
				continue;
			trie_unref(da, i);
		}
		ti = trie_ref(da, i);
		if (ti < 0)
			return;
		da->d_tbl[i] = ti;
	}

	trie_frag = 0;
	if (da->all_trie_cnt)
		trie_frag = da->unused_trie_cnt * 10000ULL / da->all_trie_cnt;
	if (trie_frag > V_frag_limit) {
		trie_rebuild = 1;
		goto trie_build;
	}

	d_size = sizeof(*da->d_tbl) * d_tbl_size;
	x_size = sizeof(*da->x_tbl) * DIRECT_TBL_SIZE / d_tbl_size
	    * da->all_trie_cnt;
	dxr_tot_size = d_size + x_size + r_size;

	if (trie_rebuild == 1) {
		/* Try to find a more compact D/X split */
		if (prev_size == 0 || dxr_tot_size <= prev_size)
			da->d_bits--;
		else {
			da->d_bits++;
			trie_rebuild = 2;
		}
		prev_size = dxr_tot_size;
		goto dxr2_try_squeeze;
	}
	microuptime(&t2);

	dxr->d = malloc(dxr_tot_size, M_DXRLPM, M_NOWAIT);
	if (dxr->d == NULL) {
		FIB_PRINTF(LOG_NOTICE, da->fd,
		    "Unable to allocate DXR lookup table");
		return;
	}
	memcpy(dxr->d, da->d_tbl, d_size);
	dxr->x = ((char *) dxr->d) + d_size;
	memcpy(dxr->x, da->x_tbl, x_size);
	dxr->r = ((char *) dxr->x) + x_size;
	dxr->d_shift = 32 - da->d_bits;
	dxr->x_shift = dxr_x;
	dxr->x_mask = 0xffffffffU >> (32 - dxr_x);
	memcpy(dxr->r, da->range_tbl, r_size);

	if (da->updates_low <= da->updates_high)
		bzero(&da->updates_mask[da->updates_low / 32],
		    (da->updates_high - da->updates_low) / 8 + 1);
	da->updates_low = DIRECT_TBL_SIZE - 1;
	da->updates_high = 0;
	microuptime(&t3);

	FIB_PRINTF(LOG_INFO, da->fd, "D%dX%dR, %d prefixes, %d nhops (max)",
	    da->d_bits, dxr_x, rinfo.num_prefixes, rinfo.num_nhops);
	i = dxr_tot_size * 100;
	if (rinfo.num_prefixes)
		i /= rinfo.num_prefixes;
	FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d KBytes, %d.%02d Bytes/prefix",
	    dxr_tot_size / 1024, dxr_tot_size * 100 / 1024 % 100,
	    i / 100, i % 100);
	FIB_PRINTF(LOG_INFO, da->fd,
	    "%d.%02d%% trie, %d.%02d%% range fragmentation",
	    trie_frag / 100, trie_frag % 100,
	    range_frag / 100, range_frag % 100);
	i = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "range table %s in %u.%03u ms",
	    range_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
	i = (t2.tv_sec - t1.tv_sec) * 1000000 + t2.tv_usec - t1.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "trie %s in %u.%03u ms",
	    trie_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
	i = (t3.tv_sec - t2.tv_sec) * 1000000 + t3.tv_usec - t2.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "snapshot forked in %u.%03u ms",
	    i / 1000, i % 1000);
}
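
/*
 * Added note (not upstream text): dxr_build() runs in three timed
 * phases: (re)building the range table for every chunk flagged in
 * updates_mask, (re)building the d_tbl/x_tbl split while stepping
 * through candidate d_bits values to find the smallest total footprint
 * (the dxr2_try_squeeze loop), and finally copying the result into a
 * single contiguous allocation consumed by the lookup side.  Excessive
 * fragmentation in either stage, measured against V_frag_limit, forces
 * a full rebuild of that stage.
 */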

/*
 * Glue functions for attaching to the FIB_ALGO infrastructure.
 */

static struct nhop_object *
dxr_fib_lookup(void *algo_data, const struct flm_lookup_key key,
    uint32_t scopeid)
{
	struct dxr *dxr = algo_data;

	return (dxr->nh_tbl[dxr_lookup(dxr, ntohl(key.addr4.s_addr))]);
}

static enum flm_op_result
dxr_init(uint32_t fibnum, struct fib_data *fd, void *old_data, void **data)
{
	struct dxr *old_dxr = old_data;
	struct dxr_aux *da = NULL;
	struct dxr *dxr;

	dxr = malloc(sizeof(*dxr), M_DXRAUX, M_NOWAIT);
	if (dxr == NULL) {
		FIB_PRINTF(LOG_NOTICE, fd,
		    "Unable to allocate DXR container struct");
		return (FLM_REBUILD);
	}

	/* Check whether we may reuse the old auxiliary structures */
	if (old_dxr != NULL && old_dxr->aux != NULL &&
	    old_dxr->aux->fd == fd) {
		da = old_dxr->aux;
		atomic_add_int(&da->refcnt, 1);
	}

	dxr->aux = da;
	dxr->d = NULL;
	dxr->fd = fd;
	dxr->fibnum = fibnum;
	*data = dxr;

	return (FLM_SUCCESS);
}

static void
dxr_destroy(void *data)
{
	struct dxr *dxr = data;
	struct dxr_aux *da = dxr->aux;
	struct chunk_desc *cdp;
	struct trie_desc *tp;

	free(dxr->d, M_DXRLPM);
	free(dxr, M_DXRAUX);

	if (da == NULL || atomic_fetchadd_int(&da->refcnt, -1) > 1)
		return;

	/* Release auxiliary structures */
	while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
		LIST_REMOVE(cdp, cd_all_le);
		uma_zfree(chunk_zone, cdp);
	}
	while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
		LIST_REMOVE(tp, td_all_le);
		uma_zfree(trie_zone, tp);
	}
	free(da->range_tbl, M_DXRAUX);
	free(da->x_tbl, M_DXRAUX);
	free(da, M_DXRAUX);
}

static void
epoch_dxr_destroy(epoch_context_t ctx)
{
	struct dxr *dxr = __containerof(ctx, struct dxr, epoch_ctx);

	dxr_destroy(dxr);
}

static void *
choose_lookup_fn(struct dxr_aux *da)
{

	switch (da->d_bits) {
#if DXR_TRIE_BITS > 16
	case 16:
		return (dxr_fib_lookup_16);
#endif
	case 15:
		return (dxr_fib_lookup_15);
	case 14:
		return (dxr_fib_lookup_14);
	case 13:
		return (dxr_fib_lookup_13);
	case 12:
		return (dxr_fib_lookup_12);
	case 11:
		return (dxr_fib_lookup_11);
	case 10:
		return (dxr_fib_lookup_10);
	case 9:
		return (dxr_fib_lookup_9);
	}
	return (dxr_fib_lookup);
}
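
/*
 * Added note (not upstream text): with the default DXR_TRIE_BITS of 20,
 * the datapath pointer handed back to the FIB_ALGO glue is one of the
 * specialized dxr_fib_lookup_<D>() functions whenever the build settles
 * on 9 <= d_bits <= 16; the run-time parameterized dxr_fib_lookup()
 * only serves as a fallback for any other split.
 */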

static enum flm_op_result
dxr_dump_end(void *data, struct fib_dp *dp)
{
	struct dxr *dxr = data;
	struct dxr_aux *da;

	dxr_build(dxr);

	da = dxr->aux;
	if (da == NULL || dxr->d == NULL)
		return (FLM_REBUILD);

	if (da->rtbl_top >= BASE_MAX)
		return (FLM_ERROR);

	dp->f = choose_lookup_fn(da);
	dp->arg = dxr;

	return (FLM_SUCCESS);
}

static enum flm_op_result
dxr_dump_rib_item(struct rtentry *rt, void *data)
{

	return (FLM_SUCCESS);
}

static enum flm_op_result
dxr_change_rib_item(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *data)
{

	return (FLM_BATCH);
}

static enum flm_op_result
dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q,
    void *data)
{
	struct dxr *dxr = data;
	struct dxr *new_dxr;
	struct dxr_aux *da;
	struct fib_dp new_dp;
	enum flm_op_result res;
	uint32_t ip, plen, hmask, start, end, i, ui;
#ifdef INVARIANTS
	struct rib_rtable_info rinfo;
	int update_delta = 0;
#endif

	MPASS(data != NULL);
	MPASS(q != NULL);
	MPASS(q->count < q->size);

	da = dxr->aux;
	MPASS(da != NULL);
	MPASS(da->fd == dxr->fd);
	MPASS(da->refcnt > 0);

	FIB_PRINTF(LOG_INFO, da->fd, "processing %d update(s)", q->count);
	for (ui = 0; ui < q->count; ui++) {
#ifdef INVARIANTS
		if (q->entries[ui].nh_new != NULL)
			update_delta++;
		if (q->entries[ui].nh_old != NULL)
			update_delta--;
#endif
		plen = q->entries[ui].plen;
		ip = ntohl(q->entries[ui].addr4.s_addr);
		if (plen < 32)
			hmask = 0xffffffffU >> plen;
		else
			hmask = 0;
		start = (ip & ~hmask) >> DXR_RANGE_SHIFT;
		end = (ip | hmask) >> DXR_RANGE_SHIFT;

		if ((start & 0x1f) == 0 && (end & 0x1f) == 0x1f)
			for (i = start >> 5; i <= end >> 5; i++)
				da->updates_mask[i] = 0xffffffffU;
		else
			for (i = start; i <= end; i++)
				da->updates_mask[i >> 5] |= (1 << (i & 0x1f));
		if (start < da->updates_low)
			da->updates_low = start;
		if (end > da->updates_high)
			da->updates_high = end;
	}

#ifdef INVARIANTS
	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
	MPASS(da->prefixes + update_delta == rinfo.num_prefixes);
#endif

	res = dxr_init(0, dxr->fd, data, (void **) &new_dxr);
	if (res != FLM_SUCCESS)
		return (res);

	dxr_build(new_dxr);

	/* Structural limit exceeded, hard error */
	if (da->rtbl_top >= BASE_MAX) {
		dxr_destroy(new_dxr);
		return (FLM_ERROR);
	}

	if (new_dxr->d == NULL) {
		dxr_destroy(new_dxr);
		return (FLM_REBUILD);
	}

	new_dp.f = choose_lookup_fn(da);
	new_dp.arg = new_dxr;
	if (fib_set_datapath_ptr(dxr->fd, &new_dp)) {
		fib_set_algo_ptr(dxr->fd, new_dxr);
		fib_epoch_call(epoch_dxr_destroy, &dxr->epoch_ctx);
		return (FLM_SUCCESS);
	}

	FIB_PRINTF(LOG_NOTICE, dxr->fd, "fib_set_datapath_ptr() failed");
	dxr_destroy(new_dxr);
	return (FLM_REBUILD);
}

static uint8_t
dxr_get_pref(const struct rib_rtable_info *rinfo)
{

	/* Below bsearch4 up to 10 prefixes. Always supersedes dpdk_lpm4. */
	return (251);
}

SYSCTL_DECL(_net_route_algo);
SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "DXR tunables");

static int
sysctl_dxr_frag_limit(SYSCTL_HANDLER_ARGS)
{
	char buf[8];
	int error, new, i;

	snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
	    V_frag_limit % 100);
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (!isdigit(*buf) && *buf != '.')
		return (EINVAL);
	for (i = 0, new = 0; isdigit(buf[i]) && i < sizeof(buf); i++)
		new = new * 10 + buf[i] - '0';
	new *= 100;
	if (buf[i++] == '.') {
		if (!isdigit(buf[i]))
			return (EINVAL);
		new += (buf[i++] - '0') * 10;
		if (isdigit(buf[i]))
			new += buf[i++] - '0';
	}
	if (new > 1000)
		return (EINVAL);
	V_frag_limit = new;
	snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
	    V_frag_limit % 100);
	return (0);
}
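
/*
 * Added usage sketch (not upstream text, assuming the standard
 * sysctl(8) tool): the handler above accepts the threshold as a
 * percentage string with up to two decimals, e.g.
 *
 *	sysctl net.route.algo.dxr.frag_limit=0.50
 *
 * which stores 50 in V_frag_limit (hundredths of a percent), the value
 * dxr_build() compares the computed range and trie fragmentation
 * against before deciding on a full rebuild.
 */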

SYSCTL_PROC(_net_route_algo_dxr, OID_AUTO, frag_limit,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_VNET,
    0, 0, sysctl_dxr_frag_limit, "A",
    "Fragmentation threshold to full rebuild");

static struct fib_lookup_module fib_dxr_mod = {
	.flm_name = "dxr",
	.flm_family = AF_INET,
	.flm_init_cb = dxr_init,
	.flm_destroy_cb = dxr_destroy,
	.flm_dump_rib_item_cb = dxr_dump_rib_item,
	.flm_dump_end_cb = dxr_dump_end,
	.flm_change_rib_item_cb = dxr_change_rib_item,
	.flm_change_rib_items_cb = dxr_change_rib_batch,
	.flm_get_pref = dxr_get_pref,
};

static int
dxr_modevent(module_t mod, int type, void *unused)
{
	int error;

	switch (type) {
	case MOD_LOAD:
		chunk_zone = uma_zcreate("dxr chunk", sizeof(struct chunk_desc),
		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
		trie_zone = uma_zcreate("dxr trie", sizeof(struct trie_desc),
		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
		fib_module_register(&fib_dxr_mod);
		return(0);
	case MOD_UNLOAD:
		error = fib_module_unregister(&fib_dxr_mod);
		if (error)
			return (error);
		uma_zdestroy(chunk_zone);
		uma_zdestroy(trie_zone);
		return(0);
	default:
		return(EOPNOTSUPP);
	}
}

static moduledata_t dxr_mod = {"fib_dxr", dxr_modevent, 0};

DECLARE_MODULE(fib_dxr, dxr_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(fib_dxr, 1);