GitHub Repository: torvalds/linux
Path: blob/master/kernel/bpf/hashtab.c
1
// SPDX-License-Identifier: GPL-2.0-only
2
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
3
* Copyright (c) 2016 Facebook
4
*/
5
#include <linux/bpf.h>
6
#include <linux/btf.h>
7
#include <linux/jhash.h>
8
#include <linux/filter.h>
9
#include <linux/rculist_nulls.h>
10
#include <linux/rcupdate_wait.h>
11
#include <linux/random.h>
12
#include <uapi/linux/btf.h>
13
#include <linux/rcupdate_trace.h>
14
#include <linux/btf_ids.h>
15
#include "percpu_freelist.h"
16
#include "bpf_lru_list.h"
17
#include "map_in_map.h"
18
#include <linux/bpf_mem_alloc.h>
19
#include <asm/rqspinlock.h>
20
21
#define HTAB_CREATE_FLAG_MASK \
22
(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
23
BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)
24
25
#define BATCH_OPS(_name) \
26
.map_lookup_batch = \
27
_name##_map_lookup_batch, \
28
.map_lookup_and_delete_batch = \
29
_name##_map_lookup_and_delete_batch, \
30
.map_update_batch = \
31
generic_map_update_batch, \
32
.map_delete_batch = \
33
generic_map_delete_batch
34
35
/*
36
* The bucket lock has two protection scopes:
37
*
38
* 1) Serializing concurrent operations from BPF programs on different
39
* CPUs
40
*
41
* 2) Serializing concurrent operations from BPF programs and sys_bpf()
42
*
43
* BPF programs can execute in any context including perf, kprobes and
44
* tracing. As there are almost no limits on where perf, kprobes and tracing
45
* can be invoked from, the lock operations need to be protected against
46
* deadlocks. Deadlocks can be caused by recursion and by an invocation in
47
* the lock held section when functions which acquire this lock are invoked
48
* from sys_bpf(). BPF recursion is prevented by incrementing the per CPU
49
* variable bpf_prog_active, which prevents BPF programs attached to perf
50
* events, kprobes and tracing from being invoked before the prior invocation
51
* from one of these contexts has completed. sys_bpf() uses the same mechanism
52
* by pinning the task to the current CPU and incrementing the recursion
53
* protection across the map operation.
54
*
55
* This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
56
* operations like memory allocations (even with GFP_ATOMIC) from atomic
57
* contexts. This is required because even with GFP_ATOMIC the memory
58
* allocator calls into code paths which acquire locks with long held lock
59
* sections. To ensure the deterministic behaviour these locks are regular
60
* spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
61
* true atomic contexts on an RT kernel are the low level hardware
62
* handling, scheduling, low level interrupt handling, NMIs etc. None of
63
* these contexts should ever do memory allocations.
64
*
65
* As regular device interrupt handlers and soft interrupts are forced into
66
* thread context, the existing code which does
67
* spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
68
* just works.
69
*
70
* In theory the BPF locks could be converted to regular spinlocks as well,
71
* but the bucket locks and percpu_freelist locks can be taken from
72
* arbitrary contexts (perf, kprobes, tracepoints) which are required to be
73
* atomic contexts even on RT. Before the introduction of bpf_mem_alloc,
74
* it was only safe to use a raw spinlock for a preallocated hash map on an RT
75
* kernel, because there was no memory allocation within the lock held sections.
76
* However, after the hash map was fully converted to use bpf_mem_alloc, memory
77
* allocation for a non-preallocated hash map is non-synchronous, so it is
78
* safe to always use a raw spinlock for the bucket lock.
79
*/
80
struct bucket {
81
struct hlist_nulls_head head;
82
rqspinlock_t raw_lock;
83
};
84
85
#define HASHTAB_MAP_LOCK_COUNT 8
86
#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1)
87
88
struct bpf_htab {
89
struct bpf_map map;
90
struct bpf_mem_alloc ma;
91
struct bpf_mem_alloc pcpu_ma;
92
struct bucket *buckets;
93
void *elems;
94
union {
95
struct pcpu_freelist freelist;
96
struct bpf_lru lru;
97
};
98
struct htab_elem *__percpu *extra_elems;
99
/* the number of elements in a non-preallocated hashtable is kept
100
* in either pcount or count
101
*/
102
struct percpu_counter pcount;
103
atomic_t count;
104
bool use_percpu_counter;
105
u32 n_buckets; /* number of hash buckets */
106
u32 elem_size; /* size of each element in bytes */
107
u32 hashrnd;
108
};
109
110
/* each htab element is struct htab_elem + key + value */
111
struct htab_elem {
112
union {
113
struct hlist_nulls_node hash_node;
114
struct {
115
void *padding;
116
union {
117
struct pcpu_freelist_node fnode;
118
struct htab_elem *batch_flink;
119
};
120
};
121
};
122
union {
123
/* pointer to per-cpu pointer */
124
void *ptr_to_pptr;
125
struct bpf_lru_node lru_node;
126
};
127
u32 hash;
128
char key[] __aligned(8);
129
};
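/*
* Worked layout example (illustration only, not used by the code): for a
* non-percpu map with key_size = 4 and value_size = 16, htab_map_alloc()
* below sizes each element as
*
*    elem_size = sizeof(struct htab_elem) + round_up(4, 8) + round_up(16, 8)
*              = sizeof(struct htab_elem) + 8 + 16
*
* and htab_elem_value() returns l->key + 8. For percpu maps the value area
* instead holds a single pointer to the per-cpu allocation.
*/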
130
131
static inline bool htab_is_prealloc(const struct bpf_htab *htab)
132
{
133
return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
134
}
135
136
static void htab_init_buckets(struct bpf_htab *htab)
137
{
138
unsigned int i;
139
140
for (i = 0; i < htab->n_buckets; i++) {
141
INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
142
raw_res_spin_lock_init(&htab->buckets[i].raw_lock);
143
cond_resched();
144
}
145
}
146
147
static inline int htab_lock_bucket(struct bucket *b, unsigned long *pflags)
148
{
149
unsigned long flags;
150
int ret;
151
152
ret = raw_res_spin_lock_irqsave(&b->raw_lock, flags);
153
if (ret)
154
return ret;
155
*pflags = flags;
156
return 0;
157
}
158
159
static inline void htab_unlock_bucket(struct bucket *b, unsigned long flags)
160
{
161
raw_res_spin_unlock_irqrestore(&b->raw_lock, flags);
162
}
163
164
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
165
166
static bool htab_is_lru(const struct bpf_htab *htab)
167
{
168
return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
169
htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
170
}
171
172
static bool htab_is_percpu(const struct bpf_htab *htab)
173
{
174
return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
175
htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
176
}
177
178
static inline bool is_fd_htab(const struct bpf_htab *htab)
179
{
180
return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS;
181
}
182
183
static inline void *htab_elem_value(struct htab_elem *l, u32 key_size)
184
{
185
return l->key + round_up(key_size, 8);
186
}
187
188
static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
189
void __percpu *pptr)
190
{
191
*(void __percpu **)htab_elem_value(l, key_size) = pptr;
192
}
193
194
static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
195
{
196
return *(void __percpu **)htab_elem_value(l, key_size);
197
}
198
199
static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
200
{
201
return *(void **)htab_elem_value(l, map->key_size);
202
}
203
204
static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
205
{
206
return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size);
207
}
208
209
/* Both percpu and fd htab support in-place update, so no need for
210
* extra elem. LRU itself can remove the least used element, so
211
* there is no need for an extra elem during map_update.
212
*/
213
static bool htab_has_extra_elems(struct bpf_htab *htab)
214
{
215
return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab);
216
}
217
218
static void htab_free_prealloced_internal_structs(struct bpf_htab *htab)
219
{
220
u32 num_entries = htab->map.max_entries;
221
int i;
222
223
if (htab_has_extra_elems(htab))
224
num_entries += num_possible_cpus();
225
226
for (i = 0; i < num_entries; i++) {
227
struct htab_elem *elem;
228
229
elem = get_htab_elem(htab, i);
230
bpf_map_free_internal_structs(&htab->map,
231
htab_elem_value(elem, htab->map.key_size));
232
cond_resched();
233
}
234
}
235
236
static void htab_free_prealloced_fields(struct bpf_htab *htab)
237
{
238
u32 num_entries = htab->map.max_entries;
239
int i;
240
241
if (IS_ERR_OR_NULL(htab->map.record))
242
return;
243
if (htab_has_extra_elems(htab))
244
num_entries += num_possible_cpus();
245
for (i = 0; i < num_entries; i++) {
246
struct htab_elem *elem;
247
248
elem = get_htab_elem(htab, i);
249
if (htab_is_percpu(htab)) {
250
void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
251
int cpu;
252
253
for_each_possible_cpu(cpu) {
254
bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
255
cond_resched();
256
}
257
} else {
258
bpf_obj_free_fields(htab->map.record,
259
htab_elem_value(elem, htab->map.key_size));
260
cond_resched();
261
}
262
cond_resched();
263
}
264
}
265
266
static void htab_free_elems(struct bpf_htab *htab)
267
{
268
int i;
269
270
if (!htab_is_percpu(htab))
271
goto free_elems;
272
273
for (i = 0; i < htab->map.max_entries; i++) {
274
void __percpu *pptr;
275
276
pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
277
htab->map.key_size);
278
free_percpu(pptr);
279
cond_resched();
280
}
281
free_elems:
282
bpf_map_area_free(htab->elems);
283
}
284
285
/* The LRU list has a lock (lru_lock). Each htab bucket has a lock
286
* (bucket_lock). If both locks need to be acquired together, the lock
287
* order is always lru_lock -> bucket_lock and this only happens in
288
* bpf_lru_list.c logic. For example, certain code path of
289
* bpf_lru_pop_free(), which is called by function prealloc_lru_pop(),
290
* will acquire lru_lock first followed by acquiring bucket_lock.
291
*
292
* In hashtab.c, to avoid deadlock, lock acquisition of
293
* bucket_lock followed by lru_lock is not allowed. In such cases,
294
* bucket_lock needs to be released first before acquiring lru_lock.
295
*/
296
static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
297
u32 hash)
298
{
299
struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
300
struct htab_elem *l;
301
302
if (node) {
303
bpf_map_inc_elem_count(&htab->map);
304
l = container_of(node, struct htab_elem, lru_node);
305
memcpy(l->key, key, htab->map.key_size);
306
return l;
307
}
308
309
return NULL;
310
}
311
312
static int prealloc_init(struct bpf_htab *htab)
313
{
314
u32 num_entries = htab->map.max_entries;
315
int err = -ENOMEM, i;
316
317
if (htab_has_extra_elems(htab))
318
num_entries += num_possible_cpus();
319
320
htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries,
321
htab->map.numa_node);
322
if (!htab->elems)
323
return -ENOMEM;
324
325
if (!htab_is_percpu(htab))
326
goto skip_percpu_elems;
327
328
for (i = 0; i < num_entries; i++) {
329
u32 size = round_up(htab->map.value_size, 8);
330
void __percpu *pptr;
331
332
pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
333
GFP_USER | __GFP_NOWARN);
334
if (!pptr)
335
goto free_elems;
336
htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
337
pptr);
338
cond_resched();
339
}
340
341
skip_percpu_elems:
342
if (htab_is_lru(htab))
343
err = bpf_lru_init(&htab->lru,
344
htab->map.map_flags & BPF_F_NO_COMMON_LRU,
345
offsetof(struct htab_elem, hash) -
346
offsetof(struct htab_elem, lru_node),
347
htab_lru_map_delete_node,
348
htab);
349
else
350
err = pcpu_freelist_init(&htab->freelist);
351
352
if (err)
353
goto free_elems;
354
355
if (htab_is_lru(htab))
356
bpf_lru_populate(&htab->lru, htab->elems,
357
offsetof(struct htab_elem, lru_node),
358
htab->elem_size, num_entries);
359
else
360
pcpu_freelist_populate(&htab->freelist,
361
htab->elems + offsetof(struct htab_elem, fnode),
362
htab->elem_size, num_entries);
363
364
return 0;
365
366
free_elems:
367
htab_free_elems(htab);
368
return err;
369
}
370
371
static void prealloc_destroy(struct bpf_htab *htab)
372
{
373
htab_free_elems(htab);
374
375
if (htab_is_lru(htab))
376
bpf_lru_destroy(&htab->lru);
377
else
378
pcpu_freelist_destroy(&htab->freelist);
379
}
380
381
static int alloc_extra_elems(struct bpf_htab *htab)
382
{
383
struct htab_elem *__percpu *pptr, *l_new;
384
struct pcpu_freelist_node *l;
385
int cpu;
386
387
pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8,
388
GFP_USER | __GFP_NOWARN);
389
if (!pptr)
390
return -ENOMEM;
391
392
for_each_possible_cpu(cpu) {
393
l = pcpu_freelist_pop(&htab->freelist);
394
/* pop will succeed, since prealloc_init()
395
* preallocated extra num_possible_cpus elements
396
*/
397
l_new = container_of(l, struct htab_elem, fnode);
398
*per_cpu_ptr(pptr, cpu) = l_new;
399
}
400
htab->extra_elems = pptr;
401
return 0;
402
}
403
404
/* Called from syscall */
405
static int htab_map_alloc_check(union bpf_attr *attr)
406
{
407
bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
408
attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
409
bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
410
attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
411
/* percpu_lru means each cpu has its own LRU list.
412
* it is different from BPF_MAP_TYPE_PERCPU_HASH where
413
* the map's value itself is percpu. percpu_lru has
414
* nothing to do with the map's value.
415
*/
416
bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
417
bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
418
bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
419
int numa_node = bpf_map_attr_numa_node(attr);
420
421
BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
422
offsetof(struct htab_elem, hash_node.pprev));
423
424
if (zero_seed && !capable(CAP_SYS_ADMIN))
425
/* Guard against local DoS, and discourage production use. */
426
return -EPERM;
427
428
if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK ||
429
!bpf_map_flags_access_ok(attr->map_flags))
430
return -EINVAL;
431
432
if (!lru && percpu_lru)
433
return -EINVAL;
434
435
if (lru && !prealloc)
436
return -ENOTSUPP;
437
438
if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
439
return -EINVAL;
440
441
/* check sanity of attributes.
442
* value_size == 0 may be allowed in the future to use map as a set
443
*/
444
if (attr->max_entries == 0 || attr->key_size == 0 ||
445
attr->value_size == 0)
446
return -EINVAL;
447
448
if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE -
449
sizeof(struct htab_elem))
450
/* if key_size + value_size is bigger, the user space won't be
451
* able to access the elements via bpf syscall. This check
452
* also makes sure that the elem_size doesn't overflow and it's
453
* kmalloc-able later in htab_map_update_elem()
454
*/
455
return -E2BIG;
456
/* percpu map value size is bound by PCPU_MIN_UNIT_SIZE */
457
if (percpu && round_up(attr->value_size, 8) > PCPU_MIN_UNIT_SIZE)
458
return -E2BIG;
459
460
return 0;
461
}
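/*
* For illustration, a minimal user-space sketch that exercises the checks
* above. It assumes libbpf's bpf_map_create() and bpf_map_create_opts; the
* map name and sizes are arbitrary:
*
*    LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC);
*    int fd = bpf_map_create(BPF_MAP_TYPE_HASH, "example_htab",
*                            sizeof(__u32), sizeof(__u64), 1024, &opts);
*
* Requesting BPF_MAP_TYPE_LRU_HASH together with BPF_F_NO_PREALLOC is rejected
* with -ENOTSUPP, and BPF_F_ZERO_SEED without CAP_SYS_ADMIN is rejected with
* -EPERM, matching the checks in htab_map_alloc_check().
*/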
462
463
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
464
{
465
bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
466
attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
467
/* percpu_lru means each cpu has its own LRU list.
468
* it is different from BPF_MAP_TYPE_PERCPU_HASH where
469
* the map's value itself is percpu. percpu_lru has
470
* nothing to do with the map's value.
471
*/
472
bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
473
bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
474
struct bpf_htab *htab;
475
int err;
476
477
htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
478
if (!htab)
479
return ERR_PTR(-ENOMEM);
480
481
bpf_map_init_from_attr(&htab->map, attr);
482
483
if (percpu_lru) {
484
/* ensure each CPU's lru list has >= 1 element.
485
* since we are at it, make each lru list have the same
486
* number of elements.
487
*/
488
htab->map.max_entries = roundup(attr->max_entries,
489
num_possible_cpus());
490
if (htab->map.max_entries < attr->max_entries)
491
htab->map.max_entries = rounddown(attr->max_entries,
492
num_possible_cpus());
493
}
494
495
/* hash table size must be power of 2; roundup_pow_of_two() can overflow
496
* into UB on 32-bit arches, so check that first
497
*/
498
err = -E2BIG;
499
if (htab->map.max_entries > 1UL << 31)
500
goto free_htab;
501
502
htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
503
504
htab->elem_size = sizeof(struct htab_elem) +
505
round_up(htab->map.key_size, 8);
506
if (percpu)
507
htab->elem_size += sizeof(void *);
508
else
509
htab->elem_size += round_up(htab->map.value_size, 8);
510
511
/* check for u32 overflow */
512
if (htab->n_buckets > U32_MAX / sizeof(struct bucket))
513
goto free_htab;
514
515
err = bpf_map_init_elem_count(&htab->map);
516
if (err)
517
goto free_htab;
518
519
err = -ENOMEM;
520
htab->buckets = bpf_map_area_alloc(htab->n_buckets *
521
sizeof(struct bucket),
522
htab->map.numa_node);
523
if (!htab->buckets)
524
goto free_elem_count;
525
526
if (htab->map.map_flags & BPF_F_ZERO_SEED)
527
htab->hashrnd = 0;
528
else
529
htab->hashrnd = get_random_u32();
530
531
htab_init_buckets(htab);
532
533
/* compute_batch_value() computes batch value as num_online_cpus() * 2
534
* and __percpu_counter_compare() needs
535
* htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus()
536
* for percpu_counter to be faster than atomic_t. In practice the average bpf
537
* hash map size is 10k, which means that a system with 64 cpus will fill
538
* hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore
539
* define our own batch count as 32, so a 10k hash map can be filled up to 80%:
540
* 10k - 8k > 32 _batch_ * 64 _cpus_
541
* and __percpu_counter_compare() will still be fast. At that point hash map
542
* collisions will dominate its performance anyway. Assume that hash map filled
543
* to 50+% isn't going to be O(1) and use the following formula to choose
544
* between percpu_counter and atomic_t.
545
*/
546
#define PERCPU_COUNTER_BATCH 32
547
if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH)
548
htab->use_percpu_counter = true;
549
550
if (htab->use_percpu_counter) {
551
err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL);
552
if (err)
553
goto free_map_locked;
554
}
555
556
if (prealloc) {
557
err = prealloc_init(htab);
558
if (err)
559
goto free_map_locked;
560
561
if (htab_has_extra_elems(htab)) {
562
err = alloc_extra_elems(htab);
563
if (err)
564
goto free_prealloc;
565
}
566
} else {
567
err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
568
if (err)
569
goto free_map_locked;
570
if (percpu) {
571
err = bpf_mem_alloc_init(&htab->pcpu_ma,
572
round_up(htab->map.value_size, 8), true);
573
if (err)
574
goto free_map_locked;
575
}
576
}
577
578
return &htab->map;
579
580
free_prealloc:
581
prealloc_destroy(htab);
582
free_map_locked:
583
if (htab->use_percpu_counter)
584
percpu_counter_destroy(&htab->pcount);
585
bpf_map_area_free(htab->buckets);
586
bpf_mem_alloc_destroy(&htab->pcpu_ma);
587
bpf_mem_alloc_destroy(&htab->ma);
588
free_elem_count:
589
bpf_map_free_elem_count(&htab->map);
590
free_htab:
591
bpf_map_area_free(htab);
592
return ERR_PTR(err);
593
}
594
595
static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
596
{
597
if (likely(key_len % 4 == 0))
598
return jhash2(key, key_len / 4, hashrnd);
599
return jhash(key, key_len, hashrnd);
600
}
601
602
static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
603
{
604
return &htab->buckets[hash & (htab->n_buckets - 1)];
605
}
606
607
static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
608
{
609
return &__select_bucket(htab, hash)->head;
610
}
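/*
* Bucket selection sketch: n_buckets is always a power of two (see
* htab_map_alloc() above), so masking with (n_buckets - 1) is equivalent to
* hash % n_buckets. For example, with n_buckets = 1024 and hash = 0x9e3779b9
* the element lands in bucket 0x9e3779b9 & 0x3ff = 0x1b9.
*/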
611
612
/* this lookup function can only be called with bucket lock taken */
613
static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
614
void *key, u32 key_size)
615
{
616
struct hlist_nulls_node *n;
617
struct htab_elem *l;
618
619
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
620
if (l->hash == hash && !memcmp(&l->key, key, key_size))
621
return l;
622
623
return NULL;
624
}
625
626
/* can be called without bucket lock. it will repeat the loop in
627
* the unlikely event when elements moved from one bucket into another
628
* while link list is being walked
629
*/
630
static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
631
u32 hash, void *key,
632
u32 key_size, u32 n_buckets)
633
{
634
struct hlist_nulls_node *n;
635
struct htab_elem *l;
636
637
again:
638
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
639
if (l->hash == hash && !memcmp(&l->key, key, key_size))
640
return l;
641
642
if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
643
goto again;
644
645
return NULL;
646
}
647
648
/* Called from syscall or from eBPF program directly, so
649
* arguments have to match bpf_map_lookup_elem() exactly.
650
* The return value is adjusted by BPF instructions
651
* in htab_map_gen_lookup().
652
*/
653
static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
654
{
655
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
656
struct hlist_nulls_head *head;
657
struct htab_elem *l;
658
u32 hash, key_size;
659
660
WARN_ON_ONCE(!bpf_rcu_lock_held());
661
662
key_size = map->key_size;
663
664
hash = htab_map_hash(key, key_size, htab->hashrnd);
665
666
head = select_bucket(htab, hash);
667
668
l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
669
670
return l;
671
}
672
673
static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
674
{
675
struct htab_elem *l = __htab_map_lookup_elem(map, key);
676
677
if (l)
678
return htab_elem_value(l, map->key_size);
679
680
return NULL;
681
}
682
683
/* inline bpf_map_lookup_elem() call.
684
* Instead of:
685
* bpf_prog
686
* bpf_map_lookup_elem
687
* map->ops->map_lookup_elem
688
* htab_map_lookup_elem
689
* __htab_map_lookup_elem
690
* do:
691
* bpf_prog
692
* __htab_map_lookup_elem
693
*/
694
static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
695
{
696
struct bpf_insn *insn = insn_buf;
697
const int ret = BPF_REG_0;
698
699
BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
700
(void *(*)(struct bpf_map *map, void *key))NULL));
701
*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
702
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
703
*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
704
offsetof(struct htab_elem, key) +
705
round_up(map->key_size, 8));
706
return insn - insn_buf;
707
}
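/*
* In C terms, the instruction sequence emitted above behaves roughly like the
* hypothetical helper below; it is only a sketch, since the verifier inlines
* the logic as BPF instructions instead of emitting a call:
*
*    static void *htab_map_lookup_elem_inlined(struct bpf_map *map, void *key)
*    {
*        struct htab_elem *l = __htab_map_lookup_elem(map, key);
*
*        if (!l)
*            return NULL;
*        return l->key + round_up(map->key_size, 8);
*    }
*/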
708
709
static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
710
void *key, const bool mark)
711
{
712
struct htab_elem *l = __htab_map_lookup_elem(map, key);
713
714
if (l) {
715
if (mark)
716
bpf_lru_node_set_ref(&l->lru_node);
717
return htab_elem_value(l, map->key_size);
718
}
719
720
return NULL;
721
}
722
723
static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
724
{
725
return __htab_lru_map_lookup_elem(map, key, true);
726
}
727
728
static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
729
{
730
return __htab_lru_map_lookup_elem(map, key, false);
731
}
732
733
static int htab_lru_map_gen_lookup(struct bpf_map *map,
734
struct bpf_insn *insn_buf)
735
{
736
struct bpf_insn *insn = insn_buf;
737
const int ret = BPF_REG_0;
738
const int ref_reg = BPF_REG_1;
739
740
BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
741
(void *(*)(struct bpf_map *map, void *key))NULL));
742
*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
743
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
744
*insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
745
offsetof(struct htab_elem, lru_node) +
746
offsetof(struct bpf_lru_node, ref));
747
*insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1);
748
*insn++ = BPF_ST_MEM(BPF_B, ret,
749
offsetof(struct htab_elem, lru_node) +
750
offsetof(struct bpf_lru_node, ref),
751
1);
752
*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
753
offsetof(struct htab_elem, key) +
754
round_up(map->key_size, 8));
755
return insn - insn_buf;
756
}
757
758
static void check_and_free_fields(struct bpf_htab *htab,
759
struct htab_elem *elem)
760
{
761
if (IS_ERR_OR_NULL(htab->map.record))
762
return;
763
764
if (htab_is_percpu(htab)) {
765
void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
766
int cpu;
767
768
for_each_possible_cpu(cpu)
769
bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
770
} else {
771
void *map_value = htab_elem_value(elem, htab->map.key_size);
772
773
bpf_obj_free_fields(htab->map.record, map_value);
774
}
775
}
776
777
/* It is called from the bpf_lru_list when the LRU needs to delete
778
* older elements from the htab.
779
*/
780
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
781
{
782
struct bpf_htab *htab = arg;
783
struct htab_elem *l = NULL, *tgt_l;
784
struct hlist_nulls_head *head;
785
struct hlist_nulls_node *n;
786
unsigned long flags;
787
struct bucket *b;
788
int ret;
789
790
tgt_l = container_of(node, struct htab_elem, lru_node);
791
b = __select_bucket(htab, tgt_l->hash);
792
head = &b->head;
793
794
ret = htab_lock_bucket(b, &flags);
795
if (ret)
796
return false;
797
798
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
799
if (l == tgt_l) {
800
hlist_nulls_del_rcu(&l->hash_node);
801
bpf_map_dec_elem_count(&htab->map);
802
break;
803
}
804
805
htab_unlock_bucket(b, flags);
806
807
if (l == tgt_l)
808
check_and_free_fields(htab, l);
809
return l == tgt_l;
810
}
811
812
/* Called from syscall */
813
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
814
{
815
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
816
struct hlist_nulls_head *head;
817
struct htab_elem *l, *next_l;
818
u32 hash, key_size;
819
int i = 0;
820
821
WARN_ON_ONCE(!rcu_read_lock_held());
822
823
key_size = map->key_size;
824
825
if (!key)
826
goto find_first_elem;
827
828
hash = htab_map_hash(key, key_size, htab->hashrnd);
829
830
head = select_bucket(htab, hash);
831
832
/* lookup the key */
833
l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
834
835
if (!l)
836
goto find_first_elem;
837
838
/* key was found, get next key in the same bucket */
839
next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
840
struct htab_elem, hash_node);
841
842
if (next_l) {
843
/* if next elem in this hash list is non-zero, just return it */
844
memcpy(next_key, next_l->key, key_size);
845
return 0;
846
}
847
848
/* no more elements in this hash list, go to the next bucket */
849
i = hash & (htab->n_buckets - 1);
850
i++;
851
852
find_first_elem:
853
/* iterate over buckets */
854
for (; i < htab->n_buckets; i++) {
855
head = select_bucket(htab, i);
856
857
/* pick first element in the bucket */
858
next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
859
struct htab_elem, hash_node);
860
if (next_l) {
861
/* if it's not empty, just return it */
862
memcpy(next_key, next_l->key, key_size);
863
return 0;
864
}
865
}
866
867
/* iterated over all buckets and all elements */
868
return -ENOENT;
869
}
870
871
static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
872
{
873
check_and_free_fields(htab, l);
874
875
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
876
bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
877
bpf_mem_cache_free(&htab->ma, l);
878
}
879
880
static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
881
{
882
struct bpf_map *map = &htab->map;
883
void *ptr;
884
885
if (map->ops->map_fd_put_ptr) {
886
ptr = fd_htab_map_get_ptr(map, l);
887
map->ops->map_fd_put_ptr(map, ptr, true);
888
}
889
}
890
891
static bool is_map_full(struct bpf_htab *htab)
892
{
893
if (htab->use_percpu_counter)
894
return __percpu_counter_compare(&htab->pcount, htab->map.max_entries,
895
PERCPU_COUNTER_BATCH) >= 0;
896
return atomic_read(&htab->count) >= htab->map.max_entries;
897
}
898
899
static void inc_elem_count(struct bpf_htab *htab)
900
{
901
bpf_map_inc_elem_count(&htab->map);
902
903
if (htab->use_percpu_counter)
904
percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
905
else
906
atomic_inc(&htab->count);
907
}
908
909
static void dec_elem_count(struct bpf_htab *htab)
910
{
911
bpf_map_dec_elem_count(&htab->map);
912
913
if (htab->use_percpu_counter)
914
percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
915
else
916
atomic_dec(&htab->count);
917
}
918
919
920
static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
921
{
922
htab_put_fd_value(htab, l);
923
924
if (htab_is_prealloc(htab)) {
925
bpf_map_dec_elem_count(&htab->map);
926
check_and_free_fields(htab, l);
927
pcpu_freelist_push(&htab->freelist, &l->fnode);
928
} else {
929
dec_elem_count(htab);
930
htab_elem_free(htab, l);
931
}
932
}
933
934
static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
935
void *value, bool onallcpus)
936
{
937
void *ptr;
938
939
if (!onallcpus) {
940
/* copy true value_size bytes */
941
ptr = this_cpu_ptr(pptr);
942
copy_map_value(&htab->map, ptr, value);
943
bpf_obj_free_fields(htab->map.record, ptr);
944
} else {
945
u32 size = round_up(htab->map.value_size, 8);
946
int off = 0, cpu;
947
948
for_each_possible_cpu(cpu) {
949
ptr = per_cpu_ptr(pptr, cpu);
950
copy_map_value_long(&htab->map, ptr, value + off);
951
bpf_obj_free_fields(htab->map.record, ptr);
952
off += size;
953
}
954
}
955
}
956
957
static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
958
void *value, bool onallcpus)
959
{
960
/* When not setting the initial value on all cpus, zero-fill element
961
* values for other cpus. Otherwise, the bpf program has no way to ensure
963
* known initial values for cpus other than the current one
963
* (onallcpus=false always when coming from bpf prog).
964
*/
965
if (!onallcpus) {
966
int current_cpu = raw_smp_processor_id();
967
int cpu;
968
969
for_each_possible_cpu(cpu) {
970
if (cpu == current_cpu)
971
copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value);
972
else /* Since elem is preallocated, we cannot touch special fields */
973
zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu));
974
}
975
} else {
976
pcpu_copy_value(htab, pptr, value, onallcpus);
977
}
978
}
979
980
static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
981
{
982
return is_fd_htab(htab) && BITS_PER_LONG == 64;
983
}
984
985
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
986
void *value, u32 key_size, u32 hash,
987
bool percpu, bool onallcpus,
988
struct htab_elem *old_elem)
989
{
990
u32 size = htab->map.value_size;
991
bool prealloc = htab_is_prealloc(htab);
992
struct htab_elem *l_new, **pl_new;
993
void __percpu *pptr;
994
995
if (prealloc) {
996
if (old_elem) {
997
/* if we're updating the existing element,
998
* use per-cpu extra elems to avoid freelist_pop/push
999
*/
1000
pl_new = this_cpu_ptr(htab->extra_elems);
1001
l_new = *pl_new;
1002
*pl_new = old_elem;
1003
} else {
1004
struct pcpu_freelist_node *l;
1005
1006
l = __pcpu_freelist_pop(&htab->freelist);
1007
if (!l)
1008
return ERR_PTR(-E2BIG);
1009
l_new = container_of(l, struct htab_elem, fnode);
1010
bpf_map_inc_elem_count(&htab->map);
1011
}
1012
} else {
1013
if (is_map_full(htab))
1014
if (!old_elem)
1015
/* when map is full and update() is replacing
1016
* old element, it's ok to allocate, since
1017
* old element will be freed immediately.
1018
* Otherwise return an error
1019
*/
1020
return ERR_PTR(-E2BIG);
1021
inc_elem_count(htab);
1022
l_new = bpf_mem_cache_alloc(&htab->ma);
1023
if (!l_new) {
1024
l_new = ERR_PTR(-ENOMEM);
1025
goto dec_count;
1026
}
1027
}
1028
1029
memcpy(l_new->key, key, key_size);
1030
if (percpu) {
1031
if (prealloc) {
1032
pptr = htab_elem_get_ptr(l_new, key_size);
1033
} else {
1034
/* alloc_percpu zero-fills */
1035
void *ptr = bpf_mem_cache_alloc(&htab->pcpu_ma);
1036
1037
if (!ptr) {
1038
bpf_mem_cache_free(&htab->ma, l_new);
1039
l_new = ERR_PTR(-ENOMEM);
1040
goto dec_count;
1041
}
1042
l_new->ptr_to_pptr = ptr;
1043
pptr = *(void __percpu **)ptr;
1044
}
1045
1046
pcpu_init_value(htab, pptr, value, onallcpus);
1047
1048
if (!prealloc)
1049
htab_elem_set_ptr(l_new, key_size, pptr);
1050
} else if (fd_htab_map_needs_adjust(htab)) {
1051
size = round_up(size, 8);
1052
memcpy(htab_elem_value(l_new, key_size), value, size);
1053
} else {
1054
copy_map_value(&htab->map, htab_elem_value(l_new, key_size), value);
1055
}
1056
1057
l_new->hash = hash;
1058
return l_new;
1059
dec_count:
1060
dec_elem_count(htab);
1061
return l_new;
1062
}
1063
1064
static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
1065
u64 map_flags)
1066
{
1067
if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
1068
/* elem already exists */
1069
return -EEXIST;
1070
1071
if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
1072
/* elem doesn't exist, cannot update it */
1073
return -ENOENT;
1074
1075
return 0;
1076
}
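/*
* check_flags() implements the usual update-flag semantics. A user-space
* sketch, assuming libbpf's bpf_map_update_elem() and a freshly created map
* (fd, key and val are placeholders):
*
*    __u32 key = 1;
*    __u64 val = 42;
*
*    bpf_map_update_elem(fd, &key, &val, BPF_NOEXIST); (ok, key is new)
*    bpf_map_update_elem(fd, &key, &val, BPF_NOEXIST); (fails with -EEXIST)
*    bpf_map_update_elem(fd, &key, &val, BPF_EXIST);   (ok, key now exists)
*    bpf_map_update_elem(fd, &key, &val, BPF_ANY);     (create or update)
*/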
1077
1078
/* Called from syscall or from eBPF program */
1079
static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
1080
u64 map_flags)
1081
{
1082
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1083
struct htab_elem *l_new, *l_old;
1084
struct hlist_nulls_head *head;
1085
unsigned long flags;
1086
struct bucket *b;
1087
u32 key_size, hash;
1088
int ret;
1089
1090
if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
1091
/* unknown flags */
1092
return -EINVAL;
1093
1094
WARN_ON_ONCE(!bpf_rcu_lock_held());
1095
1096
key_size = map->key_size;
1097
1098
hash = htab_map_hash(key, key_size, htab->hashrnd);
1099
1100
b = __select_bucket(htab, hash);
1101
head = &b->head;
1102
1103
if (unlikely(map_flags & BPF_F_LOCK)) {
1104
if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK)))
1105
return -EINVAL;
1106
/* find an element without taking the bucket lock */
1107
l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
1108
htab->n_buckets);
1109
ret = check_flags(htab, l_old, map_flags);
1110
if (ret)
1111
return ret;
1112
if (l_old) {
1113
/* grab the element lock and update value in place */
1114
copy_map_value_locked(map,
1115
htab_elem_value(l_old, key_size),
1116
value, false);
1117
return 0;
1118
}
1119
/* fall through, grab the bucket lock and lookup again.
1120
* 99.9% chance that the element won't be found,
1121
* but second lookup under lock has to be done.
1122
*/
1123
}
1124
1125
ret = htab_lock_bucket(b, &flags);
1126
if (ret)
1127
return ret;
1128
1129
l_old = lookup_elem_raw(head, hash, key, key_size);
1130
1131
ret = check_flags(htab, l_old, map_flags);
1132
if (ret)
1133
goto err;
1134
1135
if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
1136
/* first lookup without the bucket lock didn't find the element,
1137
* but second lookup with the bucket lock found it.
1138
* This case is highly unlikely, but has to be dealt with:
1139
* grab the element lock in addition to the bucket lock
1140
* and update element in place
1141
*/
1142
copy_map_value_locked(map,
1143
htab_elem_value(l_old, key_size),
1144
value, false);
1145
ret = 0;
1146
goto err;
1147
}
1148
1149
l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
1150
l_old);
1151
if (IS_ERR(l_new)) {
1152
/* all pre-allocated elements are in use or memory exhausted */
1153
ret = PTR_ERR(l_new);
1154
goto err;
1155
}
1156
1157
/* add new element to the head of the list, so that
1158
* concurrent search will find it before old elem
1159
*/
1160
hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1161
if (l_old) {
1162
hlist_nulls_del_rcu(&l_old->hash_node);
1163
1164
/* l_old has already been stashed in htab->extra_elems, free
1165
* its special fields before it is available for reuse.
1166
*/
1167
if (htab_is_prealloc(htab))
1168
check_and_free_fields(htab, l_old);
1169
}
1170
htab_unlock_bucket(b, flags);
1171
if (l_old && !htab_is_prealloc(htab))
1172
free_htab_elem(htab, l_old);
1173
return 0;
1174
err:
1175
htab_unlock_bucket(b, flags);
1176
return ret;
1177
}
1178
1179
static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
1180
{
1181
check_and_free_fields(htab, elem);
1182
bpf_map_dec_elem_count(&htab->map);
1183
bpf_lru_push_free(&htab->lru, &elem->lru_node);
1184
}
1185
1186
static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
1187
u64 map_flags)
1188
{
1189
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1190
struct htab_elem *l_new, *l_old = NULL;
1191
struct hlist_nulls_head *head;
1192
unsigned long flags;
1193
struct bucket *b;
1194
u32 key_size, hash;
1195
int ret;
1196
1197
if (unlikely(map_flags > BPF_EXIST))
1198
/* unknown flags */
1199
return -EINVAL;
1200
1201
WARN_ON_ONCE(!bpf_rcu_lock_held());
1202
1203
key_size = map->key_size;
1204
1205
hash = htab_map_hash(key, key_size, htab->hashrnd);
1206
1207
b = __select_bucket(htab, hash);
1208
head = &b->head;
1209
1210
/* For LRU, we need to alloc before taking bucket's
1211
* spinlock because getting free nodes from LRU may need
1212
* to remove older elements from htab and this removal
1213
* operation will need a bucket lock.
1214
*/
1215
l_new = prealloc_lru_pop(htab, key, hash);
1216
if (!l_new)
1217
return -ENOMEM;
1218
copy_map_value(&htab->map, htab_elem_value(l_new, map->key_size), value);
1219
1220
ret = htab_lock_bucket(b, &flags);
1221
if (ret)
1222
goto err_lock_bucket;
1223
1224
l_old = lookup_elem_raw(head, hash, key, key_size);
1225
1226
ret = check_flags(htab, l_old, map_flags);
1227
if (ret)
1228
goto err;
1229
1230
/* add new element to the head of the list, so that
1231
* concurrent search will find it before old elem
1232
*/
1233
hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1234
if (l_old) {
1235
bpf_lru_node_set_ref(&l_new->lru_node);
1236
hlist_nulls_del_rcu(&l_old->hash_node);
1237
}
1238
ret = 0;
1239
1240
err:
1241
htab_unlock_bucket(b, flags);
1242
1243
err_lock_bucket:
1244
if (ret)
1245
htab_lru_push_free(htab, l_new);
1246
else if (l_old)
1247
htab_lru_push_free(htab, l_old);
1248
1249
return ret;
1250
}
1251
1252
static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
1253
void *value, u64 map_flags,
1254
bool percpu, bool onallcpus)
1255
{
1256
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1257
struct htab_elem *l_new, *l_old;
1258
struct hlist_nulls_head *head;
1259
void *old_map_ptr = NULL;
1260
unsigned long flags;
1261
struct bucket *b;
1262
u32 key_size, hash;
1263
int ret;
1264
1265
if (unlikely(map_flags > BPF_EXIST))
1266
/* unknown flags */
1267
return -EINVAL;
1268
1269
WARN_ON_ONCE(!bpf_rcu_lock_held());
1270
1271
key_size = map->key_size;
1272
1273
hash = htab_map_hash(key, key_size, htab->hashrnd);
1274
1275
b = __select_bucket(htab, hash);
1276
head = &b->head;
1277
1278
ret = htab_lock_bucket(b, &flags);
1279
if (ret)
1280
return ret;
1281
1282
l_old = lookup_elem_raw(head, hash, key, key_size);
1283
1284
ret = check_flags(htab, l_old, map_flags);
1285
if (ret)
1286
goto err;
1287
1288
if (l_old) {
1289
/* Update value in-place */
1290
if (percpu) {
1291
pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
1292
value, onallcpus);
1293
} else {
1294
void **inner_map_pptr = htab_elem_value(l_old, key_size);
1295
1296
old_map_ptr = *inner_map_pptr;
1297
WRITE_ONCE(*inner_map_pptr, *(void **)value);
1298
}
1299
} else {
1300
l_new = alloc_htab_elem(htab, key, value, key_size,
1301
hash, percpu, onallcpus, NULL);
1302
if (IS_ERR(l_new)) {
1303
ret = PTR_ERR(l_new);
1304
goto err;
1305
}
1306
hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1307
}
1308
err:
1309
htab_unlock_bucket(b, flags);
1310
if (old_map_ptr)
1311
map->ops->map_fd_put_ptr(map, old_map_ptr, true);
1312
return ret;
1313
}
1314
1315
static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
1316
void *value, u64 map_flags,
1317
bool onallcpus)
1318
{
1319
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1320
struct htab_elem *l_new = NULL, *l_old;
1321
struct hlist_nulls_head *head;
1322
unsigned long flags;
1323
struct bucket *b;
1324
u32 key_size, hash;
1325
int ret;
1326
1327
if (unlikely(map_flags > BPF_EXIST))
1328
/* unknown flags */
1329
return -EINVAL;
1330
1331
WARN_ON_ONCE(!bpf_rcu_lock_held());
1332
1333
key_size = map->key_size;
1334
1335
hash = htab_map_hash(key, key_size, htab->hashrnd);
1336
1337
b = __select_bucket(htab, hash);
1338
head = &b->head;
1339
1340
/* For LRU, we need to alloc before taking bucket's
1341
* spinlock because LRU's elem alloc may need
1342
* to remove older elem from htab and this removal
1343
* operation will need a bucket lock.
1344
*/
1345
if (map_flags != BPF_EXIST) {
1346
l_new = prealloc_lru_pop(htab, key, hash);
1347
if (!l_new)
1348
return -ENOMEM;
1349
}
1350
1351
ret = htab_lock_bucket(b, &flags);
1352
if (ret)
1353
goto err_lock_bucket;
1354
1355
l_old = lookup_elem_raw(head, hash, key, key_size);
1356
1357
ret = check_flags(htab, l_old, map_flags);
1358
if (ret)
1359
goto err;
1360
1361
if (l_old) {
1362
bpf_lru_node_set_ref(&l_old->lru_node);
1363
1364
/* per-cpu hash map can update value in-place */
1365
pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
1366
value, onallcpus);
1367
} else {
1368
pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
1369
value, onallcpus);
1370
hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1371
l_new = NULL;
1372
}
1373
ret = 0;
1374
err:
1375
htab_unlock_bucket(b, flags);
1376
err_lock_bucket:
1377
if (l_new) {
1378
bpf_map_dec_elem_count(&htab->map);
1379
bpf_lru_push_free(&htab->lru, &l_new->lru_node);
1380
}
1381
return ret;
1382
}
1383
1384
static long htab_percpu_map_update_elem(struct bpf_map *map, void *key,
1385
void *value, u64 map_flags)
1386
{
1387
return htab_map_update_elem_in_place(map, key, value, map_flags, true, false);
1388
}
1389
1390
static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
1391
void *value, u64 map_flags)
1392
{
1393
return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
1394
false);
1395
}
1396
1397
/* Called from syscall or from eBPF program */
1398
static long htab_map_delete_elem(struct bpf_map *map, void *key)
1399
{
1400
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1401
struct hlist_nulls_head *head;
1402
struct bucket *b;
1403
struct htab_elem *l;
1404
unsigned long flags;
1405
u32 hash, key_size;
1406
int ret;
1407
1408
WARN_ON_ONCE(!bpf_rcu_lock_held());
1409
1410
key_size = map->key_size;
1411
1412
hash = htab_map_hash(key, key_size, htab->hashrnd);
1413
b = __select_bucket(htab, hash);
1414
head = &b->head;
1415
1416
ret = htab_lock_bucket(b, &flags);
1417
if (ret)
1418
return ret;
1419
1420
l = lookup_elem_raw(head, hash, key, key_size);
1421
if (l)
1422
hlist_nulls_del_rcu(&l->hash_node);
1423
else
1424
ret = -ENOENT;
1425
1426
htab_unlock_bucket(b, flags);
1427
1428
if (l)
1429
free_htab_elem(htab, l);
1430
return ret;
1431
}
1432
1433
static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
1434
{
1435
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1436
struct hlist_nulls_head *head;
1437
struct bucket *b;
1438
struct htab_elem *l;
1439
unsigned long flags;
1440
u32 hash, key_size;
1441
int ret;
1442
1443
WARN_ON_ONCE(!bpf_rcu_lock_held());
1444
1445
key_size = map->key_size;
1446
1447
hash = htab_map_hash(key, key_size, htab->hashrnd);
1448
b = __select_bucket(htab, hash);
1449
head = &b->head;
1450
1451
ret = htab_lock_bucket(b, &flags);
1452
if (ret)
1453
return ret;
1454
1455
l = lookup_elem_raw(head, hash, key, key_size);
1456
1457
if (l)
1458
hlist_nulls_del_rcu(&l->hash_node);
1459
else
1460
ret = -ENOENT;
1461
1462
htab_unlock_bucket(b, flags);
1463
if (l)
1464
htab_lru_push_free(htab, l);
1465
return ret;
1466
}
1467
1468
static void delete_all_elements(struct bpf_htab *htab)
1469
{
1470
int i;
1471
1472
/* It's called from a worker thread and migration has been disabled,
1473
* therefore, it is OK to invoke bpf_mem_cache_free() directly.
1474
*/
1475
for (i = 0; i < htab->n_buckets; i++) {
1476
struct hlist_nulls_head *head = select_bucket(htab, i);
1477
struct hlist_nulls_node *n;
1478
struct htab_elem *l;
1479
1480
hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
1481
hlist_nulls_del_rcu(&l->hash_node);
1482
htab_elem_free(htab, l);
1483
}
1484
cond_resched();
1485
}
1486
}
1487
1488
static void htab_free_malloced_internal_structs(struct bpf_htab *htab)
1489
{
1490
int i;
1491
1492
rcu_read_lock();
1493
for (i = 0; i < htab->n_buckets; i++) {
1494
struct hlist_nulls_head *head = select_bucket(htab, i);
1495
struct hlist_nulls_node *n;
1496
struct htab_elem *l;
1497
1498
hlist_nulls_for_each_entry(l, n, head, hash_node) {
1499
/* We only free internal structs on uref dropping to zero */
1500
bpf_map_free_internal_structs(&htab->map,
1501
htab_elem_value(l, htab->map.key_size));
1502
}
1503
cond_resched_rcu();
1504
}
1505
rcu_read_unlock();
1506
}
1507
1508
static void htab_map_free_internal_structs(struct bpf_map *map)
1509
{
1510
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1511
1512
/* We only free internal structs on uref dropping to zero */
1513
if (!bpf_map_has_internal_structs(map))
1514
return;
1515
1516
if (htab_is_prealloc(htab))
1517
htab_free_prealloced_internal_structs(htab);
1518
else
1519
htab_free_malloced_internal_structs(htab);
1520
}
1521
1522
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
1523
static void htab_map_free(struct bpf_map *map)
1524
{
1525
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1526
1527
/* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
1528
* bpf_free_used_maps() is called after bpf prog is no longer executing.
1529
* There is no need to synchronize_rcu() here to protect map elements.
1530
*/
1531
1532
/* htab no longer uses call_rcu() directly. bpf_mem_alloc does it
1533
* underneath and is responsible for waiting for callbacks to finish
1534
* during bpf_mem_alloc_destroy().
1535
*/
1536
if (!htab_is_prealloc(htab)) {
1537
delete_all_elements(htab);
1538
} else {
1539
htab_free_prealloced_fields(htab);
1540
prealloc_destroy(htab);
1541
}
1542
1543
bpf_map_free_elem_count(map);
1544
free_percpu(htab->extra_elems);
1545
bpf_map_area_free(htab->buckets);
1546
bpf_mem_alloc_destroy(&htab->pcpu_ma);
1547
bpf_mem_alloc_destroy(&htab->ma);
1548
if (htab->use_percpu_counter)
1549
percpu_counter_destroy(&htab->pcount);
1550
bpf_map_area_free(htab);
1551
}
1552
1553
static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
1554
struct seq_file *m)
1555
{
1556
void *value;
1557
1558
rcu_read_lock();
1559
1560
value = htab_map_lookup_elem(map, key);
1561
if (!value) {
1562
rcu_read_unlock();
1563
return;
1564
}
1565
1566
btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
1567
seq_puts(m, ": ");
1568
btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
1569
seq_putc(m, '\n');
1570
1571
rcu_read_unlock();
1572
}
1573
1574
static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
1575
void *value, bool is_lru_map,
1576
bool is_percpu, u64 flags)
1577
{
1578
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1579
struct hlist_nulls_head *head;
1580
unsigned long bflags;
1581
struct htab_elem *l;
1582
u32 hash, key_size;
1583
struct bucket *b;
1584
int ret;
1585
1586
key_size = map->key_size;
1587
1588
hash = htab_map_hash(key, key_size, htab->hashrnd);
1589
b = __select_bucket(htab, hash);
1590
head = &b->head;
1591
1592
ret = htab_lock_bucket(b, &bflags);
1593
if (ret)
1594
return ret;
1595
1596
l = lookup_elem_raw(head, hash, key, key_size);
1597
if (!l) {
1598
ret = -ENOENT;
1599
goto out_unlock;
1600
}
1601
1602
if (is_percpu) {
1603
u32 roundup_value_size = round_up(map->value_size, 8);
1604
void __percpu *pptr;
1605
int off = 0, cpu;
1606
1607
pptr = htab_elem_get_ptr(l, key_size);
1608
for_each_possible_cpu(cpu) {
1609
copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
1610
check_and_init_map_value(&htab->map, value + off);
1611
off += roundup_value_size;
1612
}
1613
} else {
1614
void *src = htab_elem_value(l, map->key_size);
1615
1616
if (flags & BPF_F_LOCK)
1617
copy_map_value_locked(map, value, src, true);
1618
else
1619
copy_map_value(map, value, src);
1620
/* Zeroing special fields in the temp buffer */
1621
check_and_init_map_value(map, value);
1622
}
1623
hlist_nulls_del_rcu(&l->hash_node);
1624
1625
out_unlock:
1626
htab_unlock_bucket(b, bflags);
1627
1628
if (l) {
1629
if (is_lru_map)
1630
htab_lru_push_free(htab, l);
1631
else
1632
free_htab_elem(htab, l);
1633
}
1634
1635
return ret;
1636
}
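/*
* For per-cpu maps the path above copies one value per possible CPU, packed
* at round_up(value_size, 8) strides. A user-space sketch of reading such a
* map, assuming libbpf's libbpf_num_possible_cpus() and a value_size of 8
* (fd, key and use() are placeholders):
*
*    int ncpus = libbpf_num_possible_cpus();
*    __u64 *vals = calloc(ncpus, sizeof(__u64));
*
*    if (!bpf_map_lookup_elem(fd, &key, vals))
*        for (int cpu = 0; cpu < ncpus; cpu++)
*            use(vals[cpu]);
*/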
1637
1638
static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
1639
void *value, u64 flags)
1640
{
1641
return __htab_map_lookup_and_delete_elem(map, key, value, false, false,
1642
flags);
1643
}
1644
1645
static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
1646
void *key, void *value,
1647
u64 flags)
1648
{
1649
return __htab_map_lookup_and_delete_elem(map, key, value, false, true,
1650
flags);
1651
}
1652
1653
static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
1654
void *value, u64 flags)
1655
{
1656
return __htab_map_lookup_and_delete_elem(map, key, value, true, false,
1657
flags);
1658
}
1659
1660
static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
1661
void *key, void *value,
1662
u64 flags)
1663
{
1664
return __htab_map_lookup_and_delete_elem(map, key, value, true, true,
1665
flags);
1666
}
1667
1668
static int
1669
__htab_map_lookup_and_delete_batch(struct bpf_map *map,
1670
const union bpf_attr *attr,
1671
union bpf_attr __user *uattr,
1672
bool do_delete, bool is_lru_map,
1673
bool is_percpu)
1674
{
1675
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1676
void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
1677
void __user *uvalues = u64_to_user_ptr(attr->batch.values);
1678
void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
1679
void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
1680
u32 batch, max_count, size, bucket_size, map_id;
1681
u32 bucket_cnt, total, key_size, value_size;
1682
struct htab_elem *node_to_free = NULL;
1683
u64 elem_map_flags, map_flags;
1684
struct hlist_nulls_head *head;
1685
struct hlist_nulls_node *n;
1686
unsigned long flags = 0;
1687
bool locked = false;
1688
struct htab_elem *l;
1689
struct bucket *b;
1690
int ret = 0;
1691
1692
elem_map_flags = attr->batch.elem_flags;
1693
if ((elem_map_flags & ~BPF_F_LOCK) ||
1694
((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
1695
return -EINVAL;
1696
1697
map_flags = attr->batch.flags;
1698
if (map_flags)
1699
return -EINVAL;
1700
1701
max_count = attr->batch.count;
1702
if (!max_count)
1703
return 0;
1704
1705
if (put_user(0, &uattr->batch.count))
1706
return -EFAULT;
1707
1708
batch = 0;
1709
if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
1710
return -EFAULT;
1711
1712
if (batch >= htab->n_buckets)
1713
return -ENOENT;
1714
1715
key_size = htab->map.key_size;
1716
value_size = htab->map.value_size;
1717
size = round_up(value_size, 8);
1718
if (is_percpu)
1719
value_size = size * num_possible_cpus();
1720
total = 0;
1721
/* while experimenting with hash tables with sizes ranging from 10 to
1722
* 1000, it was observed that a bucket can have up to 5 entries.
1723
*/
1724
bucket_size = 5;
1725
1726
alloc:
1727
/* We cannot do copy_from_user or copy_to_user inside
1728
* the rcu_read_lock. Allocate enough space here.
1729
*/
1730
keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN);
1731
values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN);
1732
if (!keys || !values) {
1733
ret = -ENOMEM;
1734
goto after_loop;
1735
}
1736
1737
again:
1738
bpf_disable_instrumentation();
1739
rcu_read_lock();
1740
again_nocopy:
1741
dst_key = keys;
1742
dst_val = values;
1743
b = &htab->buckets[batch];
1744
head = &b->head;
1745
/* do not grab the lock unless we need it (bucket_cnt > 0). */
1746
if (locked) {
1747
ret = htab_lock_bucket(b, &flags);
1748
if (ret) {
1749
rcu_read_unlock();
1750
bpf_enable_instrumentation();
1751
goto after_loop;
1752
}
1753
}
1754
1755
bucket_cnt = 0;
1756
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
1757
bucket_cnt++;
1758
1759
if (bucket_cnt && !locked) {
1760
locked = true;
1761
goto again_nocopy;
1762
}
1763
1764
if (bucket_cnt > (max_count - total)) {
1765
if (total == 0)
1766
ret = -ENOSPC;
1767
/* Note that since bucket_cnt > 0 here, it is implicit
1768
* that the lock was grabbed, so release it.
1769
*/
1770
htab_unlock_bucket(b, flags);
1771
rcu_read_unlock();
1772
bpf_enable_instrumentation();
1773
goto after_loop;
1774
}
1775
1776
if (bucket_cnt > bucket_size) {
1777
bucket_size = bucket_cnt;
1778
/* Note that since bucket_cnt > 0 here, it is implicit
1779
* that the lock was grabbed, so release it.
1780
*/
1781
htab_unlock_bucket(b, flags);
1782
rcu_read_unlock();
1783
bpf_enable_instrumentation();
1784
kvfree(keys);
1785
kvfree(values);
1786
goto alloc;
1787
}
1788
1789
/* Next block is only safe to run if you have grabbed the lock */
1790
if (!locked)
1791
goto next_batch;
1792
1793
hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
1794
memcpy(dst_key, l->key, key_size);
1795
1796
if (is_percpu) {
1797
int off = 0, cpu;
1798
void __percpu *pptr;
1799
1800
pptr = htab_elem_get_ptr(l, map->key_size);
1801
for_each_possible_cpu(cpu) {
1802
copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu));
1803
check_and_init_map_value(&htab->map, dst_val + off);
1804
off += size;
1805
}
1806
} else {
1807
value = htab_elem_value(l, key_size);
1808
if (is_fd_htab(htab)) {
1809
struct bpf_map **inner_map = value;
1810
1811
/* Actual value is the id of the inner map */
1812
map_id = map->ops->map_fd_sys_lookup_elem(*inner_map);
1813
value = &map_id;
1814
}
1815
1816
if (elem_map_flags & BPF_F_LOCK)
1817
copy_map_value_locked(map, dst_val, value,
1818
true);
1819
else
1820
copy_map_value(map, dst_val, value);
1821
/* Zeroing special fields in the temp buffer */
1822
check_and_init_map_value(map, dst_val);
1823
}
1824
if (do_delete) {
1825
hlist_nulls_del_rcu(&l->hash_node);
1826
1827
/* bpf_lru_push_free() will acquire lru_lock, which
1828
* may cause deadlock. See comments in function
1829
* prealloc_lru_pop(). Let us do bpf_lru_push_free()
1830
* after releasing the bucket lock.
1831
*
1832
* For htab of maps, htab_put_fd_value() in
1833
* free_htab_elem() may acquire a spinlock with bucket
1834
* lock being held and it violates the lock rule, so
1835
* invoke free_htab_elem() after unlock as well.
1836
*/
1837
l->batch_flink = node_to_free;
1838
node_to_free = l;
1839
}
1840
dst_key += key_size;
1841
dst_val += value_size;
1842
}
1843
1844
htab_unlock_bucket(b, flags);
1845
locked = false;
1846
1847
while (node_to_free) {
1848
l = node_to_free;
1849
node_to_free = node_to_free->batch_flink;
1850
if (is_lru_map)
1851
htab_lru_push_free(htab, l);
1852
else
1853
free_htab_elem(htab, l);
1854
}
1855
1856
next_batch:
1857
/* If we are not copying data, we can go to the next bucket and avoid
1858
* dropping the RCU read lock.
1859
*/
1860
if (!bucket_cnt && (batch + 1 < htab->n_buckets)) {
1861
batch++;
1862
goto again_nocopy;
1863
}
1864
1865
rcu_read_unlock();
1866
bpf_enable_instrumentation();
1867
if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
1868
key_size * bucket_cnt) ||
1869
copy_to_user(uvalues + total * value_size, values,
1870
value_size * bucket_cnt))) {
1871
ret = -EFAULT;
1872
goto after_loop;
1873
}
1874
1875
total += bucket_cnt;
1876
batch++;
1877
if (batch >= htab->n_buckets) {
1878
ret = -ENOENT;
1879
goto after_loop;
1880
}
1881
goto again;
1882
1883
after_loop:
1884
if (ret == -EFAULT)
1885
goto out;
1886
1887
/* copy # of entries and next batch */
1888
ubatch = u64_to_user_ptr(attr->batch.out_batch);
1889
if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
1890
put_user(total, &uattr->batch.count))
1891
ret = -EFAULT;
1892
1893
out:
1894
kvfree(keys);
1895
kvfree(values);
1896
return ret;
1897
}
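/*
* A user-space sketch of driving the batch path above, assuming libbpf's
* bpf_map_lookup_batch() and a plain (non-percpu) hash map with u32 keys and
* u64 values; the "batch" cursor is the bucket index carried via
* attr->batch.in_batch/out_batch:
*
*    __u32 batch = 0, count = 128;
*    __u32 keys[128];
*    __u64 vals[128];
*    int err;
*
*    err = bpf_map_lookup_batch(fd, NULL, &batch, keys, vals, &count, NULL);
*    while (!err) {
*        (consume "count" entries, then continue from "batch")
*        count = 128;
*        err = bpf_map_lookup_batch(fd, &batch, &batch, keys, vals,
*                                   &count, NULL);
*    }
*    (-ENOENT here means all buckets have been walked)
*/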
1898
1899
static int
1900
htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
1901
union bpf_attr __user *uattr)
1902
{
1903
return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1904
false, true);
1905
}
1906
1907
static int
1908
htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
1909
const union bpf_attr *attr,
1910
union bpf_attr __user *uattr)
1911
{
1912
return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1913
false, true);
1914
}
1915
1916
static int
1917
htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
1918
union bpf_attr __user *uattr)
1919
{
1920
return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1921
false, false);
1922
}
1923
1924
static int
1925
htab_map_lookup_and_delete_batch(struct bpf_map *map,
1926
const union bpf_attr *attr,
1927
union bpf_attr __user *uattr)
1928
{
1929
return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1930
false, false);
1931
}
1932
1933
static int
1934
htab_lru_percpu_map_lookup_batch(struct bpf_map *map,
1935
const union bpf_attr *attr,
1936
union bpf_attr __user *uattr)
1937
{
1938
return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1939
true, true);
1940
}
1941
1942
static int
1943
htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
1944
const union bpf_attr *attr,
1945
union bpf_attr __user *uattr)
1946
{
1947
return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1948
true, true);
1949
}
1950
1951
static int
1952
htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
1953
union bpf_attr __user *uattr)
1954
{
1955
return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1956
true, false);
1957
}
1958
1959
static int
1960
htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
1961
const union bpf_attr *attr,
1962
union bpf_attr __user *uattr)
1963
{
1964
return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1965
true, false);
1966
}
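/*
 * Illustrative sketch, not part of this file: the batch ops above are
 * normally driven from user space through libbpf's bpf_map_lookup_batch()
 * and bpf_map_lookup_and_delete_batch(). Buffer sizing and the error
 * convention (negative errno vs. -1 plus errno) depend on the libbpf
 * version, so treat this as a hedged example only; 'map_fd', 'keys' and
 * 'values' are assumed to be prepared by the caller.
 *
 *	__u32 out_batch, count, total = 0;
 *	void *in_batch = NULL;
 *	int err;
 *
 *	do {
 *		count = 64;	// max number of elements per call
 *		err = bpf_map_lookup_batch(map_fd, in_batch, &out_batch,
 *					   keys, values, &count, NULL);
 *		// process 'count' elements from keys/values here; they are
 *		// valid even when err is -ENOENT, which signals that the
 *		// last bucket has been visited (see the n_buckets check
 *		// above)
 *		total += count;
 *		in_batch = &out_batch;
 *	} while (!err);
 */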
1967
1968
struct bpf_iter_seq_hash_map_info {
1969
struct bpf_map *map;
1970
struct bpf_htab *htab;
1971
void *percpu_value_buf; // non-NULL means percpu hash
1972
u32 bucket_id;
1973
u32 skip_elems;
1974
};
1975
1976
static struct htab_elem *
1977
bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,
1978
struct htab_elem *prev_elem)
1979
{
1980
const struct bpf_htab *htab = info->htab;
1981
u32 skip_elems = info->skip_elems;
1982
u32 bucket_id = info->bucket_id;
1983
struct hlist_nulls_head *head;
1984
struct hlist_nulls_node *n;
1985
struct htab_elem *elem;
1986
struct bucket *b;
1987
u32 i, count;
1988
1989
if (bucket_id >= htab->n_buckets)
1990
return NULL;
1991
1992
/* try to find next elem in the same bucket */
1993
if (prev_elem) {
1994
/* no update/deletion on this bucket, prev_elem should still be valid
1995
* and we won't skip elements.
1996
*/
1997
n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node));
1998
elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node);
1999
if (elem)
2000
return elem;
2001
2002
/* not found, unlock and go to the next bucket */
2003
b = &htab->buckets[bucket_id++];
2004
rcu_read_unlock();
2005
skip_elems = 0;
2006
}
2007
2008
for (i = bucket_id; i < htab->n_buckets; i++) {
2009
b = &htab->buckets[i];
2010
rcu_read_lock();
2011
2012
count = 0;
2013
head = &b->head;
2014
hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
2015
if (count >= skip_elems) {
2016
info->bucket_id = i;
2017
info->skip_elems = count;
2018
return elem;
2019
}
2020
count++;
2021
}
2022
2023
rcu_read_unlock();
2024
skip_elems = 0;
2025
}
2026
2027
info->bucket_id = i;
2028
info->skip_elems = 0;
2029
return NULL;
2030
}
2031
2032
static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos)
2033
{
2034
struct bpf_iter_seq_hash_map_info *info = seq->private;
2035
struct htab_elem *elem;
2036
2037
elem = bpf_hash_map_seq_find_next(info, NULL);
2038
if (!elem)
2039
return NULL;
2040
2041
if (*pos == 0)
2042
++*pos;
2043
return elem;
2044
}
2045
2046
static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2047
{
2048
struct bpf_iter_seq_hash_map_info *info = seq->private;
2049
2050
++*pos;
2051
++info->skip_elems;
2052
return bpf_hash_map_seq_find_next(info, v);
2053
}
2054
2055
static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
2056
{
2057
struct bpf_iter_seq_hash_map_info *info = seq->private;
2058
struct bpf_iter__bpf_map_elem ctx = {};
2059
struct bpf_map *map = info->map;
2060
struct bpf_iter_meta meta;
2061
int ret = 0, off = 0, cpu;
2062
u32 roundup_value_size;
2063
struct bpf_prog *prog;
2064
void __percpu *pptr;
2065
2066
meta.seq = seq;
2067
prog = bpf_iter_get_info(&meta, elem == NULL);
2068
if (prog) {
2069
ctx.meta = &meta;
2070
ctx.map = info->map;
2071
if (elem) {
2072
ctx.key = elem->key;
2073
if (!info->percpu_value_buf) {
2074
ctx.value = htab_elem_value(elem, map->key_size);
2075
} else {
2076
roundup_value_size = round_up(map->value_size, 8);
2077
pptr = htab_elem_get_ptr(elem, map->key_size);
2078
for_each_possible_cpu(cpu) {
2079
copy_map_value_long(map, info->percpu_value_buf + off,
2080
per_cpu_ptr(pptr, cpu));
2081
check_and_init_map_value(map, info->percpu_value_buf + off);
2082
off += roundup_value_size;
2083
}
2084
ctx.value = info->percpu_value_buf;
2085
}
2086
}
2087
ret = bpf_iter_run_prog(prog, &ctx);
2088
}
2089
2090
return ret;
2091
}
2092
2093
static int bpf_hash_map_seq_show(struct seq_file *seq, void *v)
2094
{
2095
return __bpf_hash_map_seq_show(seq, v);
2096
}
2097
2098
static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v)
2099
{
2100
if (!v)
2101
(void)__bpf_hash_map_seq_show(seq, NULL);
2102
else
2103
rcu_read_unlock();
2104
}
2105
2106
static int bpf_iter_init_hash_map(void *priv_data,
2107
struct bpf_iter_aux_info *aux)
2108
{
2109
struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
2110
struct bpf_map *map = aux->map;
2111
void *value_buf;
2112
u32 buf_size;
2113
2114
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
2115
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
2116
buf_size = round_up(map->value_size, 8) * num_possible_cpus();
2117
value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
2118
if (!value_buf)
2119
return -ENOMEM;
2120
2121
seq_info->percpu_value_buf = value_buf;
2122
}
2123
2124
bpf_map_inc_with_uref(map);
2125
seq_info->map = map;
2126
seq_info->htab = container_of(map, struct bpf_htab, map);
2127
return 0;
2128
}
2129
2130
static void bpf_iter_fini_hash_map(void *priv_data)
2131
{
2132
struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
2133
2134
bpf_map_put_with_uref(seq_info->map);
2135
kfree(seq_info->percpu_value_buf);
2136
}
2137
2138
static const struct seq_operations bpf_hash_map_seq_ops = {
2139
.start = bpf_hash_map_seq_start,
2140
.next = bpf_hash_map_seq_next,
2141
.stop = bpf_hash_map_seq_stop,
2142
.show = bpf_hash_map_seq_show,
2143
};
2144
2145
static const struct bpf_iter_seq_info iter_seq_info = {
2146
.seq_ops = &bpf_hash_map_seq_ops,
2147
.init_seq_private = bpf_iter_init_hash_map,
2148
.fini_seq_private = bpf_iter_fini_hash_map,
2149
.seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info),
2150
};
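/*
 * Illustrative sketch, not part of this file: the seq_ops and
 * iter_seq_info above back the "bpf_map_elem" iterator for hash maps,
 * which feeds each element to a BPF program as a
 * struct bpf_iter__bpf_map_elem. A minimal dumper for a map with u32 keys
 * and u64 values (the key/value types are assumptions here) could look
 * like:
 *
 *	SEC("iter/bpf_map_elem")
 *	int dump_htab(struct bpf_iter__bpf_map_elem *ctx)
 *	{
 *		struct seq_file *seq = ctx->meta->seq;
 *		__u32 *key = ctx->key;
 *		__u64 *val = ctx->value;
 *
 *		// ctx->key/ctx->value are NULL on the final call made by
 *		// bpf_hash_map_seq_stop() above
 *		if (!key || !val)
 *			return 0;
 *		BPF_SEQ_PRINTF(seq, "%u: %llu\n", *key, *val);
 *		return 0;
 *	}
 *
 * For per-cpu hash maps, ctx->value points at the percpu_value_buf filled
 * in __bpf_hash_map_seq_show(), i.e. one value per possible CPU, each
 * rounded up to 8 bytes.
 */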
2151
2152
static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
2153
void *callback_ctx, u64 flags)
2154
{
2155
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2156
struct hlist_nulls_head *head;
2157
struct hlist_nulls_node *n;
2158
struct htab_elem *elem;
2159
int i, num_elems = 0;
2160
void __percpu *pptr;
2161
struct bucket *b;
2162
void *key, *val;
2163
bool is_percpu;
2164
u64 ret = 0;
2165
2166
cant_migrate();
2167
2168
if (flags != 0)
2169
return -EINVAL;
2170
2171
is_percpu = htab_is_percpu(htab);
2172
2173
/* migration has been disabled, so the percpu value prepared here will be
2174
* the same as the one seen by the bpf program with
2175
* bpf_map_lookup_elem().
2176
*/
2177
for (i = 0; i < htab->n_buckets; i++) {
2178
b = &htab->buckets[i];
2179
rcu_read_lock();
2180
head = &b->head;
2181
hlist_nulls_for_each_entry_safe(elem, n, head, hash_node) {
2182
key = elem->key;
2183
if (is_percpu) {
2184
/* current cpu value for percpu map */
2185
pptr = htab_elem_get_ptr(elem, map->key_size);
2186
val = this_cpu_ptr(pptr);
2187
} else {
2188
val = htab_elem_value(elem, map->key_size);
2189
}
2190
num_elems++;
2191
ret = callback_fn((u64)(long)map, (u64)(long)key,
2192
(u64)(long)val, (u64)(long)callback_ctx, 0);
2193
/* return value: 0 - continue, 1 - stop and return */
2194
if (ret) {
2195
rcu_read_unlock();
2196
goto out;
2197
}
2198
}
2199
rcu_read_unlock();
2200
}
2201
out:
2202
return num_elems;
2203
}
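/*
 * Illustrative sketch, not part of this file: bpf_for_each_hash_elem() is
 * the hash map backend of the bpf_for_each_map_elem() helper. From a BPF
 * program it might be used as below; the key and value types and the map
 * name are assumptions, while the callback contract (return 0 to continue,
 * 1 to stop; flags must be 0; the helper returns the number of elements
 * visited) follows from the code above.
 *
 *	struct cb_ctx { __u64 sum; };
 *
 *	static long sum_values(struct bpf_map *map, __u32 *key,
 *			       __u64 *val, struct cb_ctx *cb)
 *	{
 *		cb->sum += *val;
 *		return 0;	// 0 - continue, 1 - stop iterating
 *	}
 *
 *	// somewhere in a BPF program:
 *	//	struct cb_ctx cb = {};
 *	//	long visited = bpf_for_each_map_elem(&my_htab, sum_values,
 *	//					     &cb, 0);
 */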
2204
2205
static u64 htab_map_mem_usage(const struct bpf_map *map)
2206
{
2207
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2208
u32 value_size = round_up(htab->map.value_size, 8);
2209
bool prealloc = htab_is_prealloc(htab);
2210
bool percpu = htab_is_percpu(htab);
2211
bool lru = htab_is_lru(htab);
2212
u64 num_entries;
2213
u64 usage = sizeof(struct bpf_htab);
2214
2215
usage += sizeof(struct bucket) * htab->n_buckets;
2216
usage += sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT;
2217
if (prealloc) {
2218
num_entries = map->max_entries;
2219
if (htab_has_extra_elems(htab))
2220
num_entries += num_possible_cpus();
2221
2222
usage += htab->elem_size * num_entries;
2223
2224
if (percpu)
2225
usage += value_size * num_possible_cpus() * num_entries;
2226
else if (!lru)
2227
usage += sizeof(struct htab_elem *) * num_possible_cpus();
2228
} else {
2229
#define LLIST_NODE_SZ sizeof(struct llist_node)
2230
2231
num_entries = htab->use_percpu_counter ?
2232
percpu_counter_sum(&htab->pcount) :
2233
atomic_read(&htab->count);
2234
usage += (htab->elem_size + LLIST_NODE_SZ) * num_entries;
2235
if (percpu) {
2236
usage += (LLIST_NODE_SZ + sizeof(void *)) * num_entries;
2237
usage += value_size * num_possible_cpus() * num_entries;
2238
}
2239
}
2240
return usage;
2241
}
2242
2243
BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab)
2244
const struct bpf_map_ops htab_map_ops = {
2245
.map_meta_equal = bpf_map_meta_equal,
2246
.map_alloc_check = htab_map_alloc_check,
2247
.map_alloc = htab_map_alloc,
2248
.map_free = htab_map_free,
2249
.map_get_next_key = htab_map_get_next_key,
2250
.map_release_uref = htab_map_free_internal_structs,
2251
.map_lookup_elem = htab_map_lookup_elem,
2252
.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
2253
.map_update_elem = htab_map_update_elem,
2254
.map_delete_elem = htab_map_delete_elem,
2255
.map_gen_lookup = htab_map_gen_lookup,
2256
.map_seq_show_elem = htab_map_seq_show_elem,
2257
.map_set_for_each_callback_args = map_set_for_each_callback_args,
2258
.map_for_each_callback = bpf_for_each_hash_elem,
2259
.map_mem_usage = htab_map_mem_usage,
2260
BATCH_OPS(htab),
2261
.map_btf_id = &htab_map_btf_ids[0],
2262
.iter_seq_info = &iter_seq_info,
2263
};
2264
2265
const struct bpf_map_ops htab_lru_map_ops = {
2266
.map_meta_equal = bpf_map_meta_equal,
2267
.map_alloc_check = htab_map_alloc_check,
2268
.map_alloc = htab_map_alloc,
2269
.map_free = htab_map_free,
2270
.map_get_next_key = htab_map_get_next_key,
2271
.map_release_uref = htab_map_free_internal_structs,
2272
.map_lookup_elem = htab_lru_map_lookup_elem,
2273
.map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
2274
.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
2275
.map_update_elem = htab_lru_map_update_elem,
2276
.map_delete_elem = htab_lru_map_delete_elem,
2277
.map_gen_lookup = htab_lru_map_gen_lookup,
2278
.map_seq_show_elem = htab_map_seq_show_elem,
2279
.map_set_for_each_callback_args = map_set_for_each_callback_args,
2280
.map_for_each_callback = bpf_for_each_hash_elem,
2281
.map_mem_usage = htab_map_mem_usage,
2282
BATCH_OPS(htab_lru),
2283
.map_btf_id = &htab_map_btf_ids[0],
2284
.iter_seq_info = &iter_seq_info,
2285
};
2286
2287
/* Called from eBPF program */
2288
static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
2289
{
2290
struct htab_elem *l = __htab_map_lookup_elem(map, key);
2291
2292
if (l)
2293
return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
2294
else
2295
return NULL;
2296
}
2297
2298
/* inline bpf_map_lookup_elem() call for per-CPU hashmap */
2299
static int htab_percpu_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
2300
{
2301
struct bpf_insn *insn = insn_buf;
2302
2303
if (!bpf_jit_supports_percpu_insn())
2304
return -EOPNOTSUPP;
2305
2306
BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
2307
(void *(*)(struct bpf_map *map, void *key))NULL));
2308
*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
2309
*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3);
2310
*insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0,
2311
offsetof(struct htab_elem, key) + roundup(map->key_size, 8));
2312
*insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0);
2313
*insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
2314
2315
return insn - insn_buf;
2316
}
2317
2318
static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
2319
{
2320
struct htab_elem *l;
2321
2322
if (cpu >= nr_cpu_ids)
2323
return NULL;
2324
2325
l = __htab_map_lookup_elem(map, key);
2326
if (l)
2327
return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
2328
else
2329
return NULL;
2330
}
2331
2332
static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
2333
{
2334
struct htab_elem *l = __htab_map_lookup_elem(map, key);
2335
2336
if (l) {
2337
bpf_lru_node_set_ref(&l->lru_node);
2338
return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
2339
}
2340
2341
return NULL;
2342
}
2343
2344
static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
2345
{
2346
struct htab_elem *l;
2347
2348
if (cpu >= nr_cpu_ids)
2349
return NULL;
2350
2351
l = __htab_map_lookup_elem(map, key);
2352
if (l) {
2353
bpf_lru_node_set_ref(&l->lru_node);
2354
return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
2355
}
2356
2357
return NULL;
2358
}
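/*
 * Illustrative sketch, not part of this file: the two
 * *_lookup_percpu_elem() callbacks above implement the
 * bpf_map_lookup_percpu_elem() helper, which lets a BPF program read a
 * specific CPU's slot instead of its own. A hedged example; the value
 * type and map name are assumptions:
 *
 *	__u32 key = 0, cpu = 1;
 *	__u64 *val;
 *
 *	val = bpf_map_lookup_percpu_elem(&my_percpu_htab, &key, cpu);
 *	if (val)	// NULL when the key is missing or cpu >= nr_cpu_ids
 *		bpf_printk("cpu%u: %llu", cpu, *val);
 */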
2359
2360
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
2361
{
2362
struct htab_elem *l;
2363
void __percpu *pptr;
2364
int ret = -ENOENT;
2365
int cpu, off = 0;
2366
u32 size;
2367
2368
/* per_cpu areas are zero-filled and bpf programs can only
2369
* access 'value_size' of them, so copying rounded areas
2370
* will not leak any kernel data
2371
*/
2372
size = round_up(map->value_size, 8);
2373
rcu_read_lock();
2374
l = __htab_map_lookup_elem(map, key);
2375
if (!l)
2376
goto out;
2377
/* We do not mark the LRU map element here in order to not mess up
2378
* eviction heuristics when user space does a map walk.
2379
*/
2380
pptr = htab_elem_get_ptr(l, map->key_size);
2381
for_each_possible_cpu(cpu) {
2382
copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
2383
check_and_init_map_value(map, value + off);
2384
off += size;
2385
}
2386
ret = 0;
2387
out:
2388
rcu_read_unlock();
2389
return ret;
2390
}
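/*
 * Illustrative sketch, not part of this file: bpf_percpu_hash_copy() is
 * what services a syscall-side lookup on a per-cpu hash map, so the user
 * buffer must provide round_up(value_size, 8) bytes for every possible
 * CPU. A hedged libbpf-based example; 'map_fd' and the 8-byte value type
 * are assumptions:
 *
 *	int ncpus = libbpf_num_possible_cpus();
 *	__u64 *values = calloc(ncpus, sizeof(__u64));	// value_size == 8
 *	__u32 key = 0;
 *
 *	if (values && !bpf_map_lookup_elem(map_fd, &key, values)) {
 *		for (int cpu = 0; cpu < ncpus; cpu++)
 *			printf("cpu%d: %llu\n", cpu,
 *			       (unsigned long long)values[cpu]);
 *	}
 *	free(values);
 */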
2391
2392
int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
2393
u64 map_flags)
2394
{
2395
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2396
int ret;
2397
2398
rcu_read_lock();
2399
if (htab_is_lru(htab))
2400
ret = __htab_lru_percpu_map_update_elem(map, key, value,
2401
map_flags, true);
2402
else
2403
ret = htab_map_update_elem_in_place(map, key, value, map_flags,
2404
true, true);
2405
rcu_read_unlock();
2406
2407
return ret;
2408
}
2409
2410
static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
2411
struct seq_file *m)
2412
{
2413
struct htab_elem *l;
2414
void __percpu *pptr;
2415
int cpu;
2416
2417
rcu_read_lock();
2418
2419
l = __htab_map_lookup_elem(map, key);
2420
if (!l) {
2421
rcu_read_unlock();
2422
return;
2423
}
2424
2425
btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
2426
seq_puts(m, ": {\n");
2427
pptr = htab_elem_get_ptr(l, map->key_size);
2428
for_each_possible_cpu(cpu) {
2429
seq_printf(m, "\tcpu%d: ", cpu);
2430
btf_type_seq_show(map->btf, map->btf_value_type_id,
2431
per_cpu_ptr(pptr, cpu), m);
2432
seq_putc(m, '\n');
2433
}
2434
seq_puts(m, "}\n");
2435
2436
rcu_read_unlock();
2437
}
2438
2439
const struct bpf_map_ops htab_percpu_map_ops = {
2440
.map_meta_equal = bpf_map_meta_equal,
2441
.map_alloc_check = htab_map_alloc_check,
2442
.map_alloc = htab_map_alloc,
2443
.map_free = htab_map_free,
2444
.map_get_next_key = htab_map_get_next_key,
2445
.map_lookup_elem = htab_percpu_map_lookup_elem,
2446
.map_gen_lookup = htab_percpu_map_gen_lookup,
2447
.map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
2448
.map_update_elem = htab_percpu_map_update_elem,
2449
.map_delete_elem = htab_map_delete_elem,
2450
.map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem,
2451
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
2452
.map_set_for_each_callback_args = map_set_for_each_callback_args,
2453
.map_for_each_callback = bpf_for_each_hash_elem,
2454
.map_mem_usage = htab_map_mem_usage,
2455
BATCH_OPS(htab_percpu),
2456
.map_btf_id = &htab_map_btf_ids[0],
2457
.iter_seq_info = &iter_seq_info,
2458
};
2459
2460
const struct bpf_map_ops htab_lru_percpu_map_ops = {
2461
.map_meta_equal = bpf_map_meta_equal,
2462
.map_alloc_check = htab_map_alloc_check,
2463
.map_alloc = htab_map_alloc,
2464
.map_free = htab_map_free,
2465
.map_get_next_key = htab_map_get_next_key,
2466
.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
2467
.map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem,
2468
.map_update_elem = htab_lru_percpu_map_update_elem,
2469
.map_delete_elem = htab_lru_map_delete_elem,
2470
.map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem,
2471
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
2472
.map_set_for_each_callback_args = map_set_for_each_callback_args,
2473
.map_for_each_callback = bpf_for_each_hash_elem,
2474
.map_mem_usage = htab_map_mem_usage,
2475
BATCH_OPS(htab_lru_percpu),
2476
.map_btf_id = &htab_map_btf_ids[0],
2477
.iter_seq_info = &iter_seq_info,
2478
};
2479
2480
static int fd_htab_map_alloc_check(union bpf_attr *attr)
2481
{
2482
if (attr->value_size != sizeof(u32))
2483
return -EINVAL;
2484
return htab_map_alloc_check(attr);
2485
}
2486
2487
static void fd_htab_map_free(struct bpf_map *map)
2488
{
2489
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2490
struct hlist_nulls_node *n;
2491
struct hlist_nulls_head *head;
2492
struct htab_elem *l;
2493
int i;
2494
2495
for (i = 0; i < htab->n_buckets; i++) {
2496
head = select_bucket(htab, i);
2497
2498
hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
2499
void *ptr = fd_htab_map_get_ptr(map, l);
2500
2501
map->ops->map_fd_put_ptr(map, ptr, false);
2502
}
2503
}
2504
2505
htab_map_free(map);
2506
}
2507
2508
/* only called from syscall */
2509
int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
2510
{
2511
void **ptr;
2512
int ret = 0;
2513
2514
if (!map->ops->map_fd_sys_lookup_elem)
2515
return -ENOTSUPP;
2516
2517
rcu_read_lock();
2518
ptr = htab_map_lookup_elem(map, key);
2519
if (ptr)
2520
*value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr));
2521
else
2522
ret = -ENOENT;
2523
rcu_read_unlock();
2524
2525
return ret;
2526
}
2527
2528
/* Only called from syscall */
2529
int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
2530
void *key, void *value, u64 map_flags)
2531
{
2532
void *ptr;
2533
int ret;
2534
2535
ptr = map->ops->map_fd_get_ptr(map, map_file, *(int *)value);
2536
if (IS_ERR(ptr))
2537
return PTR_ERR(ptr);
2538
2539
/* The htab bucket lock is always held during update operations in fd
2540
* htab map, and the following rcu_read_lock() is only used to avoid
2541
* the WARN_ON_ONCE in htab_map_update_elem_in_place().
2542
*/
2543
rcu_read_lock();
2544
ret = htab_map_update_elem_in_place(map, key, &ptr, map_flags, false, false);
2545
rcu_read_unlock();
2546
if (ret)
2547
map->ops->map_fd_put_ptr(map, ptr, false);
2548
2549
return ret;
2550
}
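/*
 * Illustrative sketch, not part of this file: for BPF_MAP_TYPE_HASH_OF_MAPS
 * the value stored through the syscall path above is an fd of the inner
 * map (value_size must be sizeof(u32), see fd_htab_map_alloc_check()),
 * while syscall-side lookups return the inner map id. A hedged user-space
 * example; 'outer_fd' and 'inner_fd' are assumed to exist:
 *
 *	__u32 key = 1;
 *	__u32 inner_id;
 *
 *	// store: the value passed in is the inner map's fd
 *	bpf_map_update_elem(outer_fd, &key, &inner_fd, BPF_ANY);
 *
 *	// lookup: the value returned is the inner map's id, not an fd
 *	if (!bpf_map_lookup_elem(outer_fd, &key, &inner_id))
 *		printf("inner map id %u\n", inner_id);
 */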
2551
2552
static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
2553
{
2554
struct bpf_map *map, *inner_map_meta;
2555
2556
inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
2557
if (IS_ERR(inner_map_meta))
2558
return inner_map_meta;
2559
2560
map = htab_map_alloc(attr);
2561
if (IS_ERR(map)) {
2562
bpf_map_meta_free(inner_map_meta);
2563
return map;
2564
}
2565
2566
map->inner_map_meta = inner_map_meta;
2567
2568
return map;
2569
}
2570
2571
static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
2572
{
2573
struct bpf_map **inner_map = htab_map_lookup_elem(map, key);
2574
2575
if (!inner_map)
2576
return NULL;
2577
2578
return READ_ONCE(*inner_map);
2579
}
2580
2581
static int htab_of_map_gen_lookup(struct bpf_map *map,
2582
struct bpf_insn *insn_buf)
2583
{
2584
struct bpf_insn *insn = insn_buf;
2585
const int ret = BPF_REG_0;
2586
2587
BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
2588
(void *(*)(struct bpf_map *map, void *key))NULL));
2589
*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
2590
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
2591
*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
2592
offsetof(struct htab_elem, key) +
2593
round_up(map->key_size, 8));
2594
*insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
2595
2596
return insn - insn_buf;
2597
}
2598
2599
static void htab_of_map_free(struct bpf_map *map)
2600
{
2601
bpf_map_meta_free(map->inner_map_meta);
2602
fd_htab_map_free(map);
2603
}
2604
2605
const struct bpf_map_ops htab_of_maps_map_ops = {
2606
.map_alloc_check = fd_htab_map_alloc_check,
2607
.map_alloc = htab_of_map_alloc,
2608
.map_free = htab_of_map_free,
2609
.map_get_next_key = htab_map_get_next_key,
2610
.map_lookup_elem = htab_of_map_lookup_elem,
2611
.map_delete_elem = htab_map_delete_elem,
2612
.map_fd_get_ptr = bpf_map_fd_get_ptr,
2613
.map_fd_put_ptr = bpf_map_fd_put_ptr,
2614
.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
2615
.map_gen_lookup = htab_of_map_gen_lookup,
2616
.map_check_btf = map_check_no_btf,
2617
.map_mem_usage = htab_map_mem_usage,
2618
BATCH_OPS(htab),
2619
.map_btf_id = &htab_map_btf_ids[0],
2620
};