GitHub Repository: torvalds/linux
Path: blob/master/kernel/bpf/hashtab.c
1
// SPDX-License-Identifier: GPL-2.0-only
2
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
3
* Copyright (c) 2016 Facebook
4
*/
5
#include <linux/bpf.h>
6
#include <linux/btf.h>
7
#include <linux/jhash.h>
8
#include <linux/filter.h>
9
#include <linux/rculist_nulls.h>
10
#include <linux/rcupdate_wait.h>
11
#include <linux/random.h>
12
#include <uapi/linux/btf.h>
13
#include <linux/rcupdate_trace.h>
14
#include <linux/btf_ids.h>
15
#include "percpu_freelist.h"
16
#include "bpf_lru_list.h"
17
#include "map_in_map.h"
18
#include <linux/bpf_mem_alloc.h>
19
#include <asm/rqspinlock.h>
20
21
#define HTAB_CREATE_FLAG_MASK \
22
(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
23
BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)
24
25
#define BATCH_OPS(_name) \
26
.map_lookup_batch = \
27
_name##_map_lookup_batch, \
28
.map_lookup_and_delete_batch = \
29
_name##_map_lookup_and_delete_batch, \
30
.map_update_batch = \
31
generic_map_update_batch, \
32
.map_delete_batch = \
33
generic_map_delete_batch
34
35
/*
36
* The bucket lock has two protection scopes:
37
*
38
* 1) Serializing concurrent operations from BPF programs on different
39
* CPUs
40
*
41
* 2) Serializing concurrent operations from BPF programs and sys_bpf()
42
*
43
* BPF programs can execute in any context including perf, kprobes and
44
* tracing. As there are almost no limits on where perf, kprobes and tracing
45
* can be invoked from, the lock operations need to be protected against
46
* deadlocks. Deadlocks can be caused by recursion and by an invocation in
47
* the lock held section when functions which acquire this lock are invoked
48
* from sys_bpf(). BPF recursion is prevented by incrementing the per CPU
49
* variable bpf_prog_active, which prevents BPF programs attached to perf
50
* events, kprobes and tracing from being invoked before the prior invocation
51
* from one of these contexts has completed. sys_bpf() uses the same mechanism
52
* by pinning the task to the current CPU and incrementing the recursion
53
* protection across the map operation.
54
*
55
* This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
56
* operations like memory allocations (even with GFP_ATOMIC) from atomic
57
* contexts. This is required because even with GFP_ATOMIC the memory
58
* allocator calls into code paths which acquire locks with long held lock
59
* sections. To ensure the deterministic behaviour these locks are regular
60
* spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
61
* true atomic contexts on an RT kernel are the low level hardware
62
* handling, scheduling, low level interrupt handling, NMIs etc. None of
63
* these contexts should ever do memory allocations.
64
*
65
* As regular device interrupt handlers and soft interrupts are forced into
66
* thread context, the existing code which does
67
* spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
68
* just works.
69
*
70
* In theory the BPF locks could be converted to regular spinlocks as well,
71
* but the bucket locks and percpu_freelist locks can be taken from
72
* arbitrary contexts (perf, kprobes, tracepoints) which are required to be
73
* atomic contexts even on RT. Before the introduction of bpf_mem_alloc,
74
* it was only safe to use a raw spinlock for a preallocated hash map on an RT
75
* kernel, because there was no memory allocation within the lock held sections.
76
* However, after the hash map was fully converted to use bpf_mem_alloc, memory
77
* allocation for a non-preallocated hash map is non-synchronous, so it is
78
* safe to always use a raw spinlock for the bucket lock.
79
*/
80
struct bucket {
81
struct hlist_nulls_head head;
82
rqspinlock_t raw_lock;
83
};
84
85
#define HASHTAB_MAP_LOCK_COUNT 8
86
#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1)
87
88
struct bpf_htab {
89
struct bpf_map map;
90
struct bpf_mem_alloc ma;
91
struct bpf_mem_alloc pcpu_ma;
92
struct bucket *buckets;
93
void *elems;
94
union {
95
struct pcpu_freelist freelist;
96
struct bpf_lru lru;
97
};
98
struct htab_elem *__percpu *extra_elems;
99
/* the number of elements in a non-preallocated hashtable is kept
100
* in either pcount or count
101
*/
102
struct percpu_counter pcount;
103
atomic_t count;
104
bool use_percpu_counter;
105
u32 n_buckets; /* number of hash buckets */
106
u32 elem_size; /* size of each element in bytes */
107
u32 hashrnd;
108
};
109
110
/* each htab element is struct htab_elem + key + value */
111
struct htab_elem {
112
union {
113
struct hlist_nulls_node hash_node;
114
struct {
115
void *padding;
116
union {
117
struct pcpu_freelist_node fnode;
118
struct htab_elem *batch_flink;
119
};
120
};
121
};
122
union {
123
/* pointer to per-cpu pointer */
124
void *ptr_to_pptr;
125
struct bpf_lru_node lru_node;
126
};
127
u32 hash;
128
char key[] __aligned(8);
129
};
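/*
* Worked layout example (illustration only, not used by the code): for a
* non-percpu map with key_size = 4 and value_size = 16, htab_map_alloc()
* below sizes each element as
*
*    elem_size = sizeof(struct htab_elem) + round_up(4, 8) + round_up(16, 8)
*              = sizeof(struct htab_elem) + 8 + 16
*
* and htab_elem_value() returns l->key + 8. For percpu maps the value area
* instead holds a single pointer to the per-cpu allocation.
*/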
130
131
static inline bool htab_is_prealloc(const struct bpf_htab *htab)
132
{
133
return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
134
}
135
136
static void htab_init_buckets(struct bpf_htab *htab)
137
{
138
unsigned int i;
139
140
for (i = 0; i < htab->n_buckets; i++) {
141
INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
142
raw_res_spin_lock_init(&htab->buckets[i].raw_lock);
143
cond_resched();
144
}
145
}
146
147
static inline int htab_lock_bucket(struct bucket *b, unsigned long *pflags)
148
{
149
unsigned long flags;
150
int ret;
151
152
ret = raw_res_spin_lock_irqsave(&b->raw_lock, flags);
153
if (ret)
154
return ret;
155
*pflags = flags;
156
return 0;
157
}
158
159
static inline void htab_unlock_bucket(struct bucket *b, unsigned long flags)
160
{
161
raw_res_spin_unlock_irqrestore(&b->raw_lock, flags);
162
}
163
164
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
165
166
static bool htab_is_lru(const struct bpf_htab *htab)
167
{
168
return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
169
htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
170
}
171
172
static bool htab_is_percpu(const struct bpf_htab *htab)
173
{
174
return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
175
htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
176
}
177
178
static inline bool is_fd_htab(const struct bpf_htab *htab)
179
{
180
return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS;
181
}
182
183
static inline void *htab_elem_value(struct htab_elem *l, u32 key_size)
184
{
185
return l->key + round_up(key_size, 8);
186
}
187
188
static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
189
void __percpu *pptr)
190
{
191
*(void __percpu **)htab_elem_value(l, key_size) = pptr;
192
}
193
194
static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
195
{
196
return *(void __percpu **)htab_elem_value(l, key_size);
197
}
198
199
static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
200
{
201
return *(void **)htab_elem_value(l, map->key_size);
202
}
203
204
static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
205
{
206
return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size);
207
}
208
209
/* Both percpu and fd htab support in-place update, so no need for
210
* extra elem. LRU itself can remove the least used element, so
211
* there is no need for an extra elem during map_update.
212
*/
213
static bool htab_has_extra_elems(struct bpf_htab *htab)
214
{
215
return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab);
216
}
217
218
static void htab_free_prealloced_internal_structs(struct bpf_htab *htab)
219
{
220
u32 num_entries = htab->map.max_entries;
221
int i;
222
223
if (htab_has_extra_elems(htab))
224
num_entries += num_possible_cpus();
225
226
for (i = 0; i < num_entries; i++) {
227
struct htab_elem *elem;
228
229
elem = get_htab_elem(htab, i);
230
bpf_map_free_internal_structs(&htab->map,
231
htab_elem_value(elem, htab->map.key_size));
232
cond_resched();
233
}
234
}
235
236
static void htab_free_prealloced_fields(struct bpf_htab *htab)
237
{
238
u32 num_entries = htab->map.max_entries;
239
int i;
240
241
if (IS_ERR_OR_NULL(htab->map.record))
242
return;
243
if (htab_has_extra_elems(htab))
244
num_entries += num_possible_cpus();
245
for (i = 0; i < num_entries; i++) {
246
struct htab_elem *elem;
247
248
elem = get_htab_elem(htab, i);
249
if (htab_is_percpu(htab)) {
250
void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
251
int cpu;
252
253
for_each_possible_cpu(cpu) {
254
bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
255
cond_resched();
256
}
257
} else {
258
bpf_obj_free_fields(htab->map.record,
259
htab_elem_value(elem, htab->map.key_size));
260
cond_resched();
261
}
262
cond_resched();
263
}
264
}
265
266
static void htab_free_elems(struct bpf_htab *htab)
267
{
268
int i;
269
270
if (!htab_is_percpu(htab))
271
goto free_elems;
272
273
for (i = 0; i < htab->map.max_entries; i++) {
274
void __percpu *pptr;
275
276
pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
277
htab->map.key_size);
278
free_percpu(pptr);
279
cond_resched();
280
}
281
free_elems:
282
bpf_map_area_free(htab->elems);
283
}
284
285
/* The LRU list has a lock (lru_lock). Each htab bucket has a lock
286
* (bucket_lock). If both locks need to be acquired together, the lock
287
* order is always lru_lock -> bucket_lock and this only happens in
288
* bpf_lru_list.c logic. For example, certain code path of
289
* bpf_lru_pop_free(), which is called by function prealloc_lru_pop(),
290
* will acquire lru_lock first followed by acquiring bucket_lock.
291
*
292
* In hashtab.c, to avoid deadlock, lock acquisition of
293
* bucket_lock followed by lru_lock is not allowed. In such cases,
294
* bucket_lock needs to be released first before acquiring lru_lock.
295
*/
296
static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
297
u32 hash)
298
{
299
struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
300
struct htab_elem *l;
301
302
if (node) {
303
bpf_map_inc_elem_count(&htab->map);
304
l = container_of(node, struct htab_elem, lru_node);
305
memcpy(l->key, key, htab->map.key_size);
306
return l;
307
}
308
309
return NULL;
310
}
311
312
static int prealloc_init(struct bpf_htab *htab)
313
{
314
u32 num_entries = htab->map.max_entries;
315
int err = -ENOMEM, i;
316
317
if (htab_has_extra_elems(htab))
318
num_entries += num_possible_cpus();
319
320
htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries,
321
htab->map.numa_node);
322
if (!htab->elems)
323
return -ENOMEM;
324
325
if (!htab_is_percpu(htab))
326
goto skip_percpu_elems;
327
328
for (i = 0; i < num_entries; i++) {
329
u32 size = round_up(htab->map.value_size, 8);
330
void __percpu *pptr;
331
332
pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
333
GFP_USER | __GFP_NOWARN);
334
if (!pptr)
335
goto free_elems;
336
htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
337
pptr);
338
cond_resched();
339
}
340
341
skip_percpu_elems:
342
if (htab_is_lru(htab))
343
err = bpf_lru_init(&htab->lru,
344
htab->map.map_flags & BPF_F_NO_COMMON_LRU,
345
offsetof(struct htab_elem, hash) -
346
offsetof(struct htab_elem, lru_node),
347
htab_lru_map_delete_node,
348
htab);
349
else
350
err = pcpu_freelist_init(&htab->freelist);
351
352
if (err)
353
goto free_elems;
354
355
if (htab_is_lru(htab))
356
bpf_lru_populate(&htab->lru, htab->elems,
357
offsetof(struct htab_elem, lru_node),
358
htab->elem_size, num_entries);
359
else
360
pcpu_freelist_populate(&htab->freelist,
361
htab->elems + offsetof(struct htab_elem, fnode),
362
htab->elem_size, num_entries);
363
364
return 0;
365
366
free_elems:
367
htab_free_elems(htab);
368
return err;
369
}
370
371
static void prealloc_destroy(struct bpf_htab *htab)
372
{
373
htab_free_elems(htab);
374
375
if (htab_is_lru(htab))
376
bpf_lru_destroy(&htab->lru);
377
else
378
pcpu_freelist_destroy(&htab->freelist);
379
}
380
381
static int alloc_extra_elems(struct bpf_htab *htab)
382
{
383
struct htab_elem *__percpu *pptr, *l_new;
384
struct pcpu_freelist_node *l;
385
int cpu;
386
387
pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8,
388
GFP_USER | __GFP_NOWARN);
389
if (!pptr)
390
return -ENOMEM;
391
392
for_each_possible_cpu(cpu) {
393
l = pcpu_freelist_pop(&htab->freelist);
394
/* pop will succeed, since prealloc_init()
395
* preallocated extra num_possible_cpus elements
396
*/
397
l_new = container_of(l, struct htab_elem, fnode);
398
*per_cpu_ptr(pptr, cpu) = l_new;
399
}
400
htab->extra_elems = pptr;
401
return 0;
402
}
403
404
/* Called from syscall */
405
static int htab_map_alloc_check(union bpf_attr *attr)
406
{
407
bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
408
attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
409
bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
410
attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
411
/* percpu_lru means each cpu has its own LRU list.
412
* it is different from BPF_MAP_TYPE_PERCPU_HASH where
413
* the map's value itself is percpu. percpu_lru has
414
* nothing to do with the map's value.
415
*/
416
bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
417
bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
418
bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
419
int numa_node = bpf_map_attr_numa_node(attr);
420
421
BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
422
offsetof(struct htab_elem, hash_node.pprev));
423
424
if (zero_seed && !capable(CAP_SYS_ADMIN))
425
/* Guard against local DoS, and discourage production use. */
426
return -EPERM;
427
428
if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK ||
429
!bpf_map_flags_access_ok(attr->map_flags))
430
return -EINVAL;
431
432
if (!lru && percpu_lru)
433
return -EINVAL;
434
435
if (lru && !prealloc)
436
return -ENOTSUPP;
437
438
if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
439
return -EINVAL;
440
441
/* check sanity of attributes.
442
* value_size == 0 may be allowed in the future to use map as a set
443
*/
444
if (attr->max_entries == 0 || attr->key_size == 0 ||
445
attr->value_size == 0)
446
return -EINVAL;
447
448
if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE -
449
sizeof(struct htab_elem))
450
/* if key_size + value_size is bigger, the user space won't be
451
* able to access the elements via bpf syscall. This check
452
* also makes sure that the elem_size doesn't overflow and it's
453
* kmalloc-able later in htab_map_update_elem()
454
*/
455
return -E2BIG;
456
/* percpu map value size is bound by PCPU_MIN_UNIT_SIZE */
457
if (percpu && round_up(attr->value_size, 8) > PCPU_MIN_UNIT_SIZE)
458
return -E2BIG;
459
460
return 0;
461
}
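/*
* For illustration, a minimal user-space sketch that exercises the checks
* above. It assumes libbpf's bpf_map_create() and bpf_map_create_opts; the
* map name and sizes are arbitrary:
*
*    LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC);
*    int fd = bpf_map_create(BPF_MAP_TYPE_HASH, "example_htab",
*                            sizeof(__u32), sizeof(__u64), 1024, &opts);
*
* Requesting BPF_MAP_TYPE_LRU_HASH together with BPF_F_NO_PREALLOC is rejected
* with -ENOTSUPP, and BPF_F_ZERO_SEED without CAP_SYS_ADMIN is rejected with
* -EPERM, matching the checks in htab_map_alloc_check().
*/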
462
463
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
464
{
465
bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
466
attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
467
/* percpu_lru means each cpu has its own LRU list.
468
* it is different from BPF_MAP_TYPE_PERCPU_HASH where
469
* the map's value itself is percpu. percpu_lru has
470
* nothing to do with the map's value.
471
*/
472
bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
473
bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
474
struct bpf_htab *htab;
475
int err;
476
477
htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
478
if (!htab)
479
return ERR_PTR(-ENOMEM);
480
481
bpf_map_init_from_attr(&htab->map, attr);
482
483
if (percpu_lru) {
484
/* ensure each CPU's lru list has >= 1 element.
485
* since we are at it, make each lru list have the same
486
* number of elements.
487
*/
488
htab->map.max_entries = roundup(attr->max_entries,
489
num_possible_cpus());
490
if (htab->map.max_entries < attr->max_entries)
491
htab->map.max_entries = rounddown(attr->max_entries,
492
num_possible_cpus());
493
}
494
495
/* hash table size must be power of 2; roundup_pow_of_two() can overflow
496
* into UB on 32-bit arches, so check that first
497
*/
498
err = -E2BIG;
499
if (htab->map.max_entries > 1UL << 31)
500
goto free_htab;
501
502
htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
503
504
htab->elem_size = sizeof(struct htab_elem) +
505
round_up(htab->map.key_size, 8);
506
if (percpu)
507
htab->elem_size += sizeof(void *);
508
else
509
htab->elem_size += round_up(htab->map.value_size, 8);
510
511
/* check for u32 overflow */
512
if (htab->n_buckets > U32_MAX / sizeof(struct bucket))
513
goto free_htab;
514
515
err = bpf_map_init_elem_count(&htab->map);
516
if (err)
517
goto free_htab;
518
519
err = -ENOMEM;
520
htab->buckets = bpf_map_area_alloc(htab->n_buckets *
521
sizeof(struct bucket),
522
htab->map.numa_node);
523
if (!htab->buckets)
524
goto free_elem_count;
525
526
if (htab->map.map_flags & BPF_F_ZERO_SEED)
527
htab->hashrnd = 0;
528
else
529
htab->hashrnd = get_random_u32();
530
531
htab_init_buckets(htab);
532
533
/* compute_batch_value() computes batch value as num_online_cpus() * 2
534
* and __percpu_counter_compare() needs
535
* htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus()
536
* for percpu_counter to be faster than atomic_t. In practice the average bpf
537
* hash map size is 10k, which means that a system with 64 cpus will fill
538
* hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore
539
* define our own batch count as 32, so a 10k hash map can be filled up to 80%:
540
* 10k - 8k > 32 _batch_ * 64 _cpus_
541
* and __percpu_counter_compare() will still be fast. At that point hash map
542
* collisions will dominate its performance anyway. Assume that hash map filled
543
* to 50+% isn't going to be O(1) and use the following formula to choose
544
* between percpu_counter and atomic_t.
545
*/
546
#define PERCPU_COUNTER_BATCH 32
547
if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH)
548
htab->use_percpu_counter = true;
549
550
if (htab->use_percpu_counter) {
551
err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL);
552
if (err)
553
goto free_map_locked;
554
}
555
556
if (prealloc) {
557
err = prealloc_init(htab);
558
if (err)
559
goto free_map_locked;
560
561
if (htab_has_extra_elems(htab)) {
562
err = alloc_extra_elems(htab);
563
if (err)
564
goto free_prealloc;
565
}
566
} else {
567
err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
568
if (err)
569
goto free_map_locked;
570
if (percpu) {
571
err = bpf_mem_alloc_init(&htab->pcpu_ma,
572
round_up(htab->map.value_size, 8), true);
573
if (err)
574
goto free_map_locked;
575
}
576
}
577
578
return &htab->map;
579
580
free_prealloc:
581
prealloc_destroy(htab);
582
free_map_locked:
583
if (htab->use_percpu_counter)
584
percpu_counter_destroy(&htab->pcount);
585
bpf_map_area_free(htab->buckets);
586
bpf_mem_alloc_destroy(&htab->pcpu_ma);
587
bpf_mem_alloc_destroy(&htab->ma);
588
free_elem_count:
589
bpf_map_free_elem_count(&htab->map);
590
free_htab:
591
bpf_map_area_free(htab);
592
return ERR_PTR(err);
593
}
594
595
static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
596
{
597
if (likely(key_len % 4 == 0))
598
return jhash2(key, key_len / 4, hashrnd);
599
return jhash(key, key_len, hashrnd);
600
}
601
602
static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
603
{
604
return &htab->buckets[hash & (htab->n_buckets - 1)];
605
}
606
607
static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
608
{
609
return &__select_bucket(htab, hash)->head;
610
}
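/*
* Bucket selection sketch: n_buckets is always a power of two (see
* htab_map_alloc() above), so masking with (n_buckets - 1) is equivalent to
* hash % n_buckets. For example, with n_buckets = 1024 and hash = 0x9e3779b9
* the element lands in bucket 0x9e3779b9 & 0x3ff = 0x1b9.
*/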
611
612
/* this lookup function can only be called with bucket lock taken */
613
static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
614
void *key, u32 key_size)
615
{
616
struct hlist_nulls_node *n;
617
struct htab_elem *l;
618
619
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
620
if (l->hash == hash && !memcmp(&l->key, key, key_size))
621
return l;
622
623
return NULL;
624
}
625
626
/* can be called without bucket lock. it will repeat the loop in
627
* the unlikely event when elements moved from one bucket into another
628
* while link list is being walked
629
*/
630
static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
631
u32 hash, void *key,
632
u32 key_size, u32 n_buckets)
633
{
634
struct hlist_nulls_node *n;
635
struct htab_elem *l;
636
637
again:
638
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
639
if (l->hash == hash && !memcmp(&l->key, key, key_size))
640
return l;
641
642
if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
643
goto again;
644
645
return NULL;
646
}
647
648
/* Called from syscall or from eBPF program directly, so
649
* arguments have to match bpf_map_lookup_elem() exactly.
650
* The return value is adjusted by BPF instructions
651
* in htab_map_gen_lookup().
652
*/
653
static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
654
{
655
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
656
struct hlist_nulls_head *head;
657
struct htab_elem *l;
658
u32 hash, key_size;
659
660
WARN_ON_ONCE(!bpf_rcu_lock_held());
661
662
key_size = map->key_size;
663
664
hash = htab_map_hash(key, key_size, htab->hashrnd);
665
666
head = select_bucket(htab, hash);
667
668
l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
669
670
return l;
671
}
672
673
static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
674
{
675
struct htab_elem *l = __htab_map_lookup_elem(map, key);
676
677
if (l)
678
return htab_elem_value(l, map->key_size);
679
680
return NULL;
681
}
682
683
/* inline bpf_map_lookup_elem() call.
684
* Instead of:
685
* bpf_prog
686
* bpf_map_lookup_elem
687
* map->ops->map_lookup_elem
688
* htab_map_lookup_elem
689
* __htab_map_lookup_elem
690
* do:
691
* bpf_prog
692
* __htab_map_lookup_elem
693
*/
694
static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
695
{
696
struct bpf_insn *insn = insn_buf;
697
const int ret = BPF_REG_0;
698
699
BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
700
(void *(*)(struct bpf_map *map, void *key))NULL));
701
*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
702
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
703
*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
704
offsetof(struct htab_elem, key) +
705
round_up(map->key_size, 8));
706
return insn - insn_buf;
707
}
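/*
* In C terms, the instruction sequence emitted above behaves roughly like the
* hypothetical helper below; it is only a sketch, since the verifier inlines
* the logic as BPF instructions instead of emitting a call:
*
*    static void *htab_map_lookup_elem_inlined(struct bpf_map *map, void *key)
*    {
*        struct htab_elem *l = __htab_map_lookup_elem(map, key);
*
*        if (!l)
*            return NULL;
*        return l->key + round_up(map->key_size, 8);
*    }
*/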
708
709
static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
710
void *key, const bool mark)
711
{
712
struct htab_elem *l = __htab_map_lookup_elem(map, key);
713
714
if (l) {
715
if (mark)
716
bpf_lru_node_set_ref(&l->lru_node);
717
return htab_elem_value(l, map->key_size);
718
}
719
720
return NULL;
721
}
722
723
static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
724
{
725
return __htab_lru_map_lookup_elem(map, key, true);
726
}
727
728
static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
729
{
730
return __htab_lru_map_lookup_elem(map, key, false);
731
}
732
733
static int htab_lru_map_gen_lookup(struct bpf_map *map,
734
struct bpf_insn *insn_buf)
735
{
736
struct bpf_insn *insn = insn_buf;
737
const int ret = BPF_REG_0;
738
const int ref_reg = BPF_REG_1;
739
740
BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
741
(void *(*)(struct bpf_map *map, void *key))NULL));
742
*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
743
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
744
*insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
745
offsetof(struct htab_elem, lru_node) +
746
offsetof(struct bpf_lru_node, ref));
747
*insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1);
748
*insn++ = BPF_ST_MEM(BPF_B, ret,
749
offsetof(struct htab_elem, lru_node) +
750
offsetof(struct bpf_lru_node, ref),
751
1);
752
*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
753
offsetof(struct htab_elem, key) +
754
round_up(map->key_size, 8));
755
return insn - insn_buf;
756
}
757
758
static void check_and_free_fields(struct bpf_htab *htab,
759
struct htab_elem *elem)
760
{
761
if (IS_ERR_OR_NULL(htab->map.record))
762
return;
763
764
if (htab_is_percpu(htab)) {
765
void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
766
int cpu;
767
768
for_each_possible_cpu(cpu)
769
bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
770
} else {
771
void *map_value = htab_elem_value(elem, htab->map.key_size);
772
773
bpf_obj_free_fields(htab->map.record, map_value);
774
}
775
}
776
777
/* It is called from the bpf_lru_list when the LRU needs to delete
778
* older elements from the htab.
779
*/
780
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
781
{
782
struct bpf_htab *htab = arg;
783
struct htab_elem *l = NULL, *tgt_l;
784
struct hlist_nulls_head *head;
785
struct hlist_nulls_node *n;
786
unsigned long flags;
787
struct bucket *b;
788
int ret;
789
790
tgt_l = container_of(node, struct htab_elem, lru_node);
791
b = __select_bucket(htab, tgt_l->hash);
792
head = &b->head;
793
794
ret = htab_lock_bucket(b, &flags);
795
if (ret)
796
return false;
797
798
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
799
if (l == tgt_l) {
800
hlist_nulls_del_rcu(&l->hash_node);
801
bpf_map_dec_elem_count(&htab->map);
802
break;
803
}
804
805
htab_unlock_bucket(b, flags);
806
807
if (l == tgt_l)
808
check_and_free_fields(htab, l);
809
return l == tgt_l;
810
}
811
812
/* Called from syscall */
813
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
814
{
815
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
816
struct hlist_nulls_head *head;
817
struct htab_elem *l, *next_l;
818
u32 hash, key_size;
819
int i = 0;
820
821
WARN_ON_ONCE(!rcu_read_lock_held());
822
823
key_size = map->key_size;
824
825
if (!key)
826
goto find_first_elem;
827
828
hash = htab_map_hash(key, key_size, htab->hashrnd);
829
830
head = select_bucket(htab, hash);
831
832
/* lookup the key */
833
l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
834
835
if (!l)
836
goto find_first_elem;
837
838
/* key was found, get next key in the same bucket */
839
next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
840
struct htab_elem, hash_node);
841
842
if (next_l) {
843
/* if next elem in this hash list is non-zero, just return it */
844
memcpy(next_key, next_l->key, key_size);
845
return 0;
846
}
847
848
/* no more elements in this hash list, go to the next bucket */
849
i = hash & (htab->n_buckets - 1);
850
i++;
851
852
find_first_elem:
853
/* iterate over buckets */
854
for (; i < htab->n_buckets; i++) {
855
head = select_bucket(htab, i);
856
857
/* pick first element in the bucket */
858
next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
859
struct htab_elem, hash_node);
860
if (next_l) {
861
/* if it's not empty, just return it */
862
memcpy(next_key, next_l->key, key_size);
863
return 0;
864
}
865
}
866
867
/* iterated over all buckets and all elements */
868
return -ENOENT;
869
}
870
871
static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
872
{
873
check_and_free_fields(htab, l);
874
875
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
876
bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
877
bpf_mem_cache_free(&htab->ma, l);
878
}
879
880
static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
881
{
882
struct bpf_map *map = &htab->map;
883
void *ptr;
884
885
if (map->ops->map_fd_put_ptr) {
886
ptr = fd_htab_map_get_ptr(map, l);
887
map->ops->map_fd_put_ptr(map, ptr, true);
888
}
889
}
890
891
static bool is_map_full(struct bpf_htab *htab)
892
{
893
if (htab->use_percpu_counter)
894
return __percpu_counter_compare(&htab->pcount, htab->map.max_entries,
895
PERCPU_COUNTER_BATCH) >= 0;
896
return atomic_read(&htab->count) >= htab->map.max_entries;
897
}
898
899
static void inc_elem_count(struct bpf_htab *htab)
900
{
901
bpf_map_inc_elem_count(&htab->map);
902
903
if (htab->use_percpu_counter)
904
percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
905
else
906
atomic_inc(&htab->count);
907
}
908
909
static void dec_elem_count(struct bpf_htab *htab)
910
{
911
bpf_map_dec_elem_count(&htab->map);
912
913
if (htab->use_percpu_counter)
914
percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
915
else
916
atomic_dec(&htab->count);
917
}
918
919
920
static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
921
{
922
htab_put_fd_value(htab, l);
923
924
if (htab_is_prealloc(htab)) {
925
bpf_map_dec_elem_count(&htab->map);
926
check_and_free_fields(htab, l);
927
pcpu_freelist_push(&htab->freelist, &l->fnode);
928
} else {
929
dec_elem_count(htab);
930
htab_elem_free(htab, l);
931
}
932
}
933
934
static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
935
void *value, bool onallcpus)
936
{
937
void *ptr;
938
939
if (!onallcpus) {
940
/* copy true value_size bytes */
941
ptr = this_cpu_ptr(pptr);
942
copy_map_value(&htab->map, ptr, value);
943
bpf_obj_free_fields(htab->map.record, ptr);
944
} else {
945
u32 size = round_up(htab->map.value_size, 8);
946
int off = 0, cpu;
947
948
for_each_possible_cpu(cpu) {
949
ptr = per_cpu_ptr(pptr, cpu);
950
copy_map_value_long(&htab->map, ptr, value + off);
951
bpf_obj_free_fields(htab->map.record, ptr);
952
off += size;
953
}
954
}
955
}
956
957
static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
958
void *value, bool onallcpus)
959
{
960
/* When not setting the initial value on all cpus, zero-fill element
961
* values for other cpus. Otherwise, the bpf program has no way to ensure
963
* known initial values for cpus other than the current one
963
* (onallcpus=false always when coming from bpf prog).
964
*/
965
if (!onallcpus) {
966
int current_cpu = raw_smp_processor_id();
967
int cpu;
968
969
for_each_possible_cpu(cpu) {
970
if (cpu == current_cpu)
971
copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value);
972
else /* Since elem is preallocated, we cannot touch special fields */
973
zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu));
974
}
975
} else {
976
pcpu_copy_value(htab, pptr, value, onallcpus);
977
}
978
}
979
980
static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
981
{
982
return is_fd_htab(htab) && BITS_PER_LONG == 64;
983
}
984
985
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
986
void *value, u32 key_size, u32 hash,
987
bool percpu, bool onallcpus,
988
struct htab_elem *old_elem)
989
{
990
u32 size = htab->map.value_size;
991
bool prealloc = htab_is_prealloc(htab);
992
struct htab_elem *l_new, **pl_new;
993
void __percpu *pptr;
994
995
if (prealloc) {
996
if (old_elem) {
997
/* if we're updating the existing element,
998
* use per-cpu extra elems to avoid freelist_pop/push
999
*/
1000
pl_new = this_cpu_ptr(htab->extra_elems);
1001
l_new = *pl_new;
1002
*pl_new = old_elem;
1003
} else {
1004
struct pcpu_freelist_node *l;
1005
1006
l = __pcpu_freelist_pop(&htab->freelist);
1007
if (!l)
1008
return ERR_PTR(-E2BIG);
1009
l_new = container_of(l, struct htab_elem, fnode);
1010
bpf_map_inc_elem_count(&htab->map);
1011
}
1012
} else {
1013
if (is_map_full(htab))
1014
if (!old_elem)
1015
/* when map is full and update() is replacing
1016
* old element, it's ok to allocate, since
1017
* old element will be freed immediately.
1018
* Otherwise return an error
1019
*/
1020
return ERR_PTR(-E2BIG);
1021
inc_elem_count(htab);
1022
l_new = bpf_mem_cache_alloc(&htab->ma);
1023
if (!l_new) {
1024
l_new = ERR_PTR(-ENOMEM);
1025
goto dec_count;
1026
}
1027
}
1028
1029
memcpy(l_new->key, key, key_size);
1030
if (percpu) {
1031
if (prealloc) {
1032
pptr = htab_elem_get_ptr(l_new, key_size);
1033
} else {
1034
/* alloc_percpu zero-fills */
1035
void *ptr = bpf_mem_cache_alloc(&htab->pcpu_ma);
1036
1037
if (!ptr) {
1038
bpf_mem_cache_free(&htab->ma, l_new);
1039
l_new = ERR_PTR(-ENOMEM);
1040
goto dec_count;
1041
}
1042
l_new->ptr_to_pptr = ptr;
1043
pptr = *(void __percpu **)ptr;
1044
}
1045
1046
pcpu_init_value(htab, pptr, value, onallcpus);
1047
1048
if (!prealloc)
1049
htab_elem_set_ptr(l_new, key_size, pptr);
1050
} else if (fd_htab_map_needs_adjust(htab)) {
1051
size = round_up(size, 8);
1052
memcpy(htab_elem_value(l_new, key_size), value, size);
1053
} else {
1054
copy_map_value(&htab->map, htab_elem_value(l_new, key_size), value);
1055
}
1056
1057
l_new->hash = hash;
1058
return l_new;
1059
dec_count:
1060
dec_elem_count(htab);
1061
return l_new;
1062
}
1063
1064
static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
1065
u64 map_flags)
1066
{
1067
if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
1068
/* elem already exists */
1069
return -EEXIST;
1070
1071
if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
1072
/* elem doesn't exist, cannot update it */
1073
return -ENOENT;
1074
1075
return 0;
1076
}
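/*
* check_flags() implements the usual update-flag semantics. A user-space
* sketch, assuming libbpf's bpf_map_update_elem() and a freshly created map
* (fd, key and val are placeholders):
*
*    __u32 key = 1;
*    __u64 val = 42;
*
*    bpf_map_update_elem(fd, &key, &val, BPF_NOEXIST); (ok, key is new)
*    bpf_map_update_elem(fd, &key, &val, BPF_NOEXIST); (fails with -EEXIST)
*    bpf_map_update_elem(fd, &key, &val, BPF_EXIST);   (ok, key now exists)
*    bpf_map_update_elem(fd, &key, &val, BPF_ANY);     (create or update)
*/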
1077
1078
/* Called from syscall or from eBPF program */
1079
static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
1080
u64 map_flags)
1081
{
1082
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1083
struct htab_elem *l_new, *l_old;
1084
struct hlist_nulls_head *head;
1085
unsigned long flags;
1086
struct bucket *b;
1087
u32 key_size, hash;
1088
int ret;
1089
1090
if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
1091
/* unknown flags */
1092
return -EINVAL;
1093
1094
WARN_ON_ONCE(!bpf_rcu_lock_held());
1095
1096
key_size = map->key_size;
1097
1098
hash = htab_map_hash(key, key_size, htab->hashrnd);
1099
1100
b = __select_bucket(htab, hash);
1101
head = &b->head;
1102
1103
if (unlikely(map_flags & BPF_F_LOCK)) {
1104
if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK)))
1105
return -EINVAL;
1106
/* find an element without taking the bucket lock */
1107
l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
1108
htab->n_buckets);
1109
ret = check_flags(htab, l_old, map_flags);
1110
if (ret)
1111
return ret;
1112
if (l_old) {
1113
/* grab the element lock and update value in place */
1114
copy_map_value_locked(map,
1115
htab_elem_value(l_old, key_size),
1116
value, false);
1117
return 0;
1118
}
1119
/* fall through, grab the bucket lock and lookup again.
1120
* 99.9% chance that the element won't be found,
1121
* but second lookup under lock has to be done.
1122
*/
1123
}
1124
1125
ret = htab_lock_bucket(b, &flags);
1126
if (ret)
1127
return ret;
1128
1129
l_old = lookup_elem_raw(head, hash, key, key_size);
1130
1131
ret = check_flags(htab, l_old, map_flags);
1132
if (ret)
1133
goto err;
1134
1135
if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
1136
/* first lookup without the bucket lock didn't find the element,
1137
* but second lookup with the bucket lock found it.
1138
* This case is highly unlikely, but has to be dealt with:
1139
* grab the element lock in addition to the bucket lock
1140
* and update element in place
1141
*/
1142
copy_map_value_locked(map,
1143
htab_elem_value(l_old, key_size),
1144
value, false);
1145
ret = 0;
1146
goto err;
1147
}
1148
1149
l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
1150
l_old);
1151
if (IS_ERR(l_new)) {
1152
/* all pre-allocated elements are in use or memory exhausted */
1153
ret = PTR_ERR(l_new);
1154
goto err;
1155
}
1156
1157
/* add new element to the head of the list, so that
1158
* concurrent search will find it before old elem
1159
*/
1160
hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1161
if (l_old) {
1162
hlist_nulls_del_rcu(&l_old->hash_node);
1163
1164
/* l_old has already been stashed in htab->extra_elems, free
1165
* its special fields before it is available for reuse.
1166
*/
1167
if (htab_is_prealloc(htab))
1168
check_and_free_fields(htab, l_old);
1169
}
1170
htab_unlock_bucket(b, flags);
1171
if (l_old && !htab_is_prealloc(htab))
1172
free_htab_elem(htab, l_old);
1173
return 0;
1174
err:
1175
htab_unlock_bucket(b, flags);
1176
return ret;
1177
}
1178
1179
static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
1180
{
1181
check_and_free_fields(htab, elem);
1182
bpf_map_dec_elem_count(&htab->map);
1183
bpf_lru_push_free(&htab->lru, &elem->lru_node);
1184
}
1185
1186
static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
1187
u64 map_flags)
1188
{
1189
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1190
struct htab_elem *l_new, *l_old = NULL;
1191
struct hlist_nulls_head *head;
1192
unsigned long flags;
1193
struct bucket *b;
1194
u32 key_size, hash;
1195
int ret;
1196
1197
if (unlikely(map_flags > BPF_EXIST))
1198
/* unknown flags */
1199
return -EINVAL;
1200
1201
WARN_ON_ONCE(!bpf_rcu_lock_held());
1202
1203
key_size = map->key_size;
1204
1205
hash = htab_map_hash(key, key_size, htab->hashrnd);
1206
1207
b = __select_bucket(htab, hash);
1208
head = &b->head;
1209
1210
/* For LRU, we need to alloc before taking bucket's
1211
* spinlock because getting free nodes from LRU may need
1212
* to remove older elements from htab and this removal
1213
* operation will need a bucket lock.
1214
*/
1215
l_new = prealloc_lru_pop(htab, key, hash);
1216
if (!l_new)
1217
return -ENOMEM;
1218
copy_map_value(&htab->map, htab_elem_value(l_new, map->key_size), value);
1219
1220
ret = htab_lock_bucket(b, &flags);
1221
if (ret)
1222
goto err_lock_bucket;
1223
1224
l_old = lookup_elem_raw(head, hash, key, key_size);
1225
1226
ret = check_flags(htab, l_old, map_flags);
1227
if (ret)
1228
goto err;
1229
1230
/* add new element to the head of the list, so that
1231
* concurrent search will find it before old elem
1232
*/
1233
hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1234
if (l_old) {
1235
bpf_lru_node_set_ref(&l_new->lru_node);
1236
hlist_nulls_del_rcu(&l_old->hash_node);
1237
}
1238
ret = 0;
1239
1240
err:
1241
htab_unlock_bucket(b, flags);
1242
1243
err_lock_bucket:
1244
if (ret)
1245
htab_lru_push_free(htab, l_new);
1246
else if (l_old)
1247
htab_lru_push_free(htab, l_old);
1248
1249
return ret;
1250
}
1251
1252
static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
1253
void *value, u64 map_flags,
1254
bool percpu, bool onallcpus)
1255
{
1256
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1257
struct htab_elem *l_new, *l_old;
1258
struct hlist_nulls_head *head;
1259
void *old_map_ptr = NULL;
1260
unsigned long flags;
1261
struct bucket *b;
1262
u32 key_size, hash;
1263
int ret;
1264
1265
if (unlikely(map_flags > BPF_EXIST))
1266
/* unknown flags */
1267
return -EINVAL;
1268
1269
WARN_ON_ONCE(!bpf_rcu_lock_held());
1270
1271
key_size = map->key_size;
1272
1273
hash = htab_map_hash(key, key_size, htab->hashrnd);
1274
1275
b = __select_bucket(htab, hash);
1276
head = &b->head;
1277
1278
ret = htab_lock_bucket(b, &flags);
1279
if (ret)
1280
return ret;
1281
1282
l_old = lookup_elem_raw(head, hash, key, key_size);
1283
1284
ret = check_flags(htab, l_old, map_flags);
1285
if (ret)
1286
goto err;
1287
1288
if (l_old) {
1289
/* Update value in-place */
1290
if (percpu) {
1291
pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
1292
value, onallcpus);
1293
} else {
1294
void **inner_map_pptr = htab_elem_value(l_old, key_size);
1295
1296
old_map_ptr = *inner_map_pptr;
1297
WRITE_ONCE(*inner_map_pptr, *(void **)value);
1298
}
1299
} else {
1300
l_new = alloc_htab_elem(htab, key, value, key_size,
1301
hash, percpu, onallcpus, NULL);
1302
if (IS_ERR(l_new)) {
1303
ret = PTR_ERR(l_new);
1304
goto err;
1305
}
1306
hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1307
}
1308
err:
1309
htab_unlock_bucket(b, flags);
1310
if (old_map_ptr)
1311
map->ops->map_fd_put_ptr(map, old_map_ptr, true);
1312
return ret;
1313
}
1314
1315
static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
1316
void *value, u64 map_flags,
1317
bool onallcpus)
1318
{
1319
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1320
struct htab_elem *l_new = NULL, *l_old;
1321
struct hlist_nulls_head *head;
1322
unsigned long flags;
1323
struct bucket *b;
1324
u32 key_size, hash;
1325
int ret;
1326
1327
if (unlikely(map_flags > BPF_EXIST))
1328
/* unknown flags */
1329
return -EINVAL;
1330
1331
WARN_ON_ONCE(!bpf_rcu_lock_held());
1332
1333
key_size = map->key_size;
1334
1335
hash = htab_map_hash(key, key_size, htab->hashrnd);
1336
1337
b = __select_bucket(htab, hash);
1338
head = &b->head;
1339
1340
/* For LRU, we need to alloc before taking bucket's
1341
* spinlock because LRU's elem alloc may need
1342
* to remove older elem from htab and this removal
1343
* operation will need a bucket lock.
1344
*/
1345
if (map_flags != BPF_EXIST) {
1346
l_new = prealloc_lru_pop(htab, key, hash);
1347
if (!l_new)
1348
return -ENOMEM;
1349
}
1350
1351
ret = htab_lock_bucket(b, &flags);
1352
if (ret)
1353
goto err_lock_bucket;
1354
1355
l_old = lookup_elem_raw(head, hash, key, key_size);
1356
1357
ret = check_flags(htab, l_old, map_flags);
1358
if (ret)
1359
goto err;
1360
1361
if (l_old) {
1362
bpf_lru_node_set_ref(&l_old->lru_node);
1363
1364
/* per-cpu hash map can update value in-place */
1365
pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
1366
value, onallcpus);
1367
} else {
1368
pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
1369
value, onallcpus);
1370
hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1371
l_new = NULL;
1372
}
1373
ret = 0;
1374
err:
1375
htab_unlock_bucket(b, flags);
1376
err_lock_bucket:
1377
if (l_new) {
1378
bpf_map_dec_elem_count(&htab->map);
1379
bpf_lru_push_free(&htab->lru, &l_new->lru_node);
1380
}
1381
return ret;
1382
}
1383
1384
static long htab_percpu_map_update_elem(struct bpf_map *map, void *key,
1385
void *value, u64 map_flags)
1386
{
1387
return htab_map_update_elem_in_place(map, key, value, map_flags, true, false);
1388
}
1389
1390
static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
1391
void *value, u64 map_flags)
1392
{
1393
return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
1394
false);
1395
}
1396
1397
/* Called from syscall or from eBPF program */
1398
static long htab_map_delete_elem(struct bpf_map *map, void *key)
1399
{
1400
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1401
struct hlist_nulls_head *head;
1402
struct bucket *b;
1403
struct htab_elem *l;
1404
unsigned long flags;
1405
u32 hash, key_size;
1406
int ret;
1407
1408
WARN_ON_ONCE(!bpf_rcu_lock_held());
1409
1410
key_size = map->key_size;
1411
1412
hash = htab_map_hash(key, key_size, htab->hashrnd);
1413
b = __select_bucket(htab, hash);
1414
head = &b->head;
1415
1416
ret = htab_lock_bucket(b, &flags);
1417
if (ret)
1418
return ret;
1419
1420
l = lookup_elem_raw(head, hash, key, key_size);
1421
if (l)
1422
hlist_nulls_del_rcu(&l->hash_node);
1423
else
1424
ret = -ENOENT;
1425
1426
htab_unlock_bucket(b, flags);
1427
1428
if (l)
1429
free_htab_elem(htab, l);
1430
return ret;
1431
}
1432
1433
static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
1434
{
1435
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1436
struct hlist_nulls_head *head;
1437
struct bucket *b;
1438
struct htab_elem *l;
1439
unsigned long flags;
1440
u32 hash, key_size;
1441
int ret;
1442
1443
WARN_ON_ONCE(!bpf_rcu_lock_held());
1444
1445
key_size = map->key_size;
1446
1447
hash = htab_map_hash(key, key_size, htab->hashrnd);
1448
b = __select_bucket(htab, hash);
1449
head = &b->head;
1450
1451
ret = htab_lock_bucket(b, &flags);
1452
if (ret)
1453
return ret;
1454
1455
l = lookup_elem_raw(head, hash, key, key_size);
1456
1457
if (l)
1458
hlist_nulls_del_rcu(&l->hash_node);
1459
else
1460
ret = -ENOENT;
1461
1462
htab_unlock_bucket(b, flags);
1463
if (l)
1464
htab_lru_push_free(htab, l);
1465
return ret;
1466
}
1467
1468
static void delete_all_elements(struct bpf_htab *htab)
1469
{
1470
int i;
1471
1472
/* It's called from a worker thread and migration has been disabled,
1473
* therefore, it is OK to invoke bpf_mem_cache_free() directly.
1474
*/
1475
for (i = 0; i < htab->n_buckets; i++) {
1476
struct hlist_nulls_head *head = select_bucket(htab, i);
1477
struct hlist_nulls_node *n;
1478
struct htab_elem *l;
1479
1480
hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
1481
hlist_nulls_del_rcu(&l->hash_node);
1482
htab_elem_free(htab, l);
1483
}
1484
cond_resched();
1485
}
1486
}
1487
1488
static void htab_free_malloced_internal_structs(struct bpf_htab *htab)
1489
{
1490
int i;
1491
1492
rcu_read_lock();
1493
for (i = 0; i < htab->n_buckets; i++) {
1494
struct hlist_nulls_head *head = select_bucket(htab, i);
1495
struct hlist_nulls_node *n;
1496
struct htab_elem *l;
1497
1498
hlist_nulls_for_each_entry(l, n, head, hash_node) {
1499
/* We only free internal structs on uref dropping to zero */
1500
bpf_map_free_internal_structs(&htab->map,
1501
htab_elem_value(l, htab->map.key_size));
1502
}
1503
cond_resched_rcu();
1504
}
1505
rcu_read_unlock();
1506
}
1507
1508
static void htab_map_free_internal_structs(struct bpf_map *map)
1509
{
1510
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1511
1512
/* We only free internal structs on uref dropping to zero */
1513
if (!bpf_map_has_internal_structs(map))
1514
return;
1515
1516
if (htab_is_prealloc(htab))
1517
htab_free_prealloced_internal_structs(htab);
1518
else
1519
htab_free_malloced_internal_structs(htab);
1520
}
1521
1522
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
1523
static void htab_map_free(struct bpf_map *map)
1524
{
1525
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1526
1527
/* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
1528
* bpf_free_used_maps() is called after bpf prog is no longer executing.
1529
* There is no need to synchronize_rcu() here to protect map elements.
1530
*/
1531
1532
/* htab no longer uses call_rcu() directly. bpf_mem_alloc does it
1533
* underneath and is responsible for waiting for callbacks to finish
1534
* during bpf_mem_alloc_destroy().
1535
*/
1536
if (!htab_is_prealloc(htab)) {
1537
delete_all_elements(htab);
1538
} else {
1539
htab_free_prealloced_fields(htab);
1540
prealloc_destroy(htab);
1541
}
1542
1543
bpf_map_free_elem_count(map);
1544
free_percpu(htab->extra_elems);
1545
bpf_map_area_free(htab->buckets);
1546
bpf_mem_alloc_destroy(&htab->pcpu_ma);
1547
bpf_mem_alloc_destroy(&htab->ma);
1548
if (htab->use_percpu_counter)
1549
percpu_counter_destroy(&htab->pcount);
1550
bpf_map_area_free(htab);
1551
}
1552
1553
static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
1554
struct seq_file *m)
1555
{
1556
void *value;
1557
1558
rcu_read_lock();
1559
1560
value = htab_map_lookup_elem(map, key);
1561
if (!value) {
1562
rcu_read_unlock();
1563
return;
1564
}
1565
1566
btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
1567
seq_puts(m, ": ");
1568
btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
1569
seq_putc(m, '\n');
1570
1571
rcu_read_unlock();
1572
}
1573
1574
static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
1575
void *value, bool is_lru_map,
1576
bool is_percpu, u64 flags)
1577
{
1578
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1579
struct hlist_nulls_head *head;
1580
unsigned long bflags;
1581
struct htab_elem *l;
1582
u32 hash, key_size;
1583
struct bucket *b;
1584
int ret;
1585
1586
key_size = map->key_size;
1587
1588
hash = htab_map_hash(key, key_size, htab->hashrnd);
1589
b = __select_bucket(htab, hash);
1590
head = &b->head;
1591
1592
ret = htab_lock_bucket(b, &bflags);
1593
if (ret)
1594
return ret;
1595
1596
l = lookup_elem_raw(head, hash, key, key_size);
1597
if (!l) {
1598
ret = -ENOENT;
1599
goto out_unlock;
1600
}
1601
1602
if (is_percpu) {
1603
u32 roundup_value_size = round_up(map->value_size, 8);
1604
void __percpu *pptr;
1605
int off = 0, cpu;
1606
1607
pptr = htab_elem_get_ptr(l, key_size);
1608
for_each_possible_cpu(cpu) {
1609
copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
1610
check_and_init_map_value(&htab->map, value + off);
1611
off += roundup_value_size;
1612
}
1613
} else {
1614
void *src = htab_elem_value(l, map->key_size);
1615
1616
if (flags & BPF_F_LOCK)
1617
copy_map_value_locked(map, value, src, true);
1618
else
1619
copy_map_value(map, value, src);
1620
/* Zeroing special fields in the temp buffer */
1621
check_and_init_map_value(map, value);
1622
}
1623
hlist_nulls_del_rcu(&l->hash_node);
1624
1625
out_unlock:
1626
htab_unlock_bucket(b, bflags);
1627
1628
if (l) {
1629
if (is_lru_map)
1630
htab_lru_push_free(htab, l);
1631
else
1632
free_htab_elem(htab, l);
1633
}
1634
1635
return ret;
1636
}
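/*
* For per-cpu maps the path above copies one value per possible CPU, packed
* at round_up(value_size, 8) strides. A user-space sketch of reading such a
* map, assuming libbpf's libbpf_num_possible_cpus() and a value_size of 8
* (fd, key and use() are placeholders):
*
*    int ncpus = libbpf_num_possible_cpus();
*    __u64 *vals = calloc(ncpus, sizeof(__u64));
*
*    if (!bpf_map_lookup_elem(fd, &key, vals))
*        for (int cpu = 0; cpu < ncpus; cpu++)
*            use(vals[cpu]);
*/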
1637
1638
static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
1639
void *value, u64 flags)
1640
{
1641
return __htab_map_lookup_and_delete_elem(map, key, value, false, false,
1642
flags);
1643
}
1644
1645
static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
1646
void *key, void *value,
1647
u64 flags)
1648
{
1649
return __htab_map_lookup_and_delete_elem(map, key, value, false, true,
1650
flags);
1651
}
1652
1653
static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
1654
void *value, u64 flags)
1655
{
1656
return __htab_map_lookup_and_delete_elem(map, key, value, true, false,
1657
flags);
1658
}
1659
1660
static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
1661
void *key, void *value,
1662
u64 flags)
1663
{
1664
return __htab_map_lookup_and_delete_elem(map, key, value, true, true,
1665
flags);
1666
}
1667
1668
static int
1669
__htab_map_lookup_and_delete_batch(struct bpf_map *map,
1670
const union bpf_attr *attr,
1671
union bpf_attr __user *uattr,
1672
bool do_delete, bool is_lru_map,
1673
bool is_percpu)
1674
{
1675
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1676
void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
1677
void __user *uvalues = u64_to_user_ptr(attr->batch.values);
1678
void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
1679
void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
1680
u32 batch, max_count, size, bucket_size, map_id;
1681
u32 bucket_cnt, total, key_size, value_size;
1682
struct htab_elem *node_to_free = NULL;
1683
u64 elem_map_flags, map_flags;
1684
struct hlist_nulls_head *head;
1685
struct hlist_nulls_node *n;
1686
unsigned long flags = 0;
1687
bool locked = false;
1688
struct htab_elem *l;
1689
struct bucket *b;
1690
int ret = 0;
1691
1692
elem_map_flags = attr->batch.elem_flags;
1693
if ((elem_map_flags & ~BPF_F_LOCK) ||
1694
((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
1695
return -EINVAL;
1696
1697
map_flags = attr->batch.flags;
1698
if (map_flags)
1699
return -EINVAL;
1700
1701
max_count = attr->batch.count;
1702
if (!max_count)
1703
return 0;
1704
1705
if (put_user(0, &uattr->batch.count))
1706
return -EFAULT;
1707
1708
batch = 0;
1709
if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
1710
return -EFAULT;
1711
1712
if (batch >= htab->n_buckets)
1713
return -ENOENT;
1714
1715
key_size = htab->map.key_size;
1716
value_size = htab->map.value_size;
1717
size = round_up(value_size, 8);
1718
if (is_percpu)
1719
value_size = size * num_possible_cpus();
1720
total = 0;
1721
/* while experimenting with hash tables with sizes ranging from 10 to
1722
* 1000, it was observed that a bucket can have up to 5 entries.
1723
*/
1724
bucket_size = 5;
1725
1726
alloc:
1727
/* We cannot do copy_from_user or copy_to_user inside
1728
* the rcu_read_lock. Allocate enough space here.
1729
*/
1730
keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN);
1731
values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN);
1732
if (!keys || !values) {
1733
ret = -ENOMEM;
1734
goto after_loop;
1735
}
1736
1737
again:
1738
bpf_disable_instrumentation();
1739
rcu_read_lock();
1740
again_nocopy:
1741
dst_key = keys;
1742
dst_val = values;
1743
b = &htab->buckets[batch];
1744
head = &b->head;
1745
/* do not grab the lock unless we need it (bucket_cnt > 0). */
1746
if (locked) {
1747
ret = htab_lock_bucket(b, &flags);
1748
if (ret) {
1749
rcu_read_unlock();
1750
bpf_enable_instrumentation();
1751
goto after_loop;
1752
}
1753
}
1754
1755
bucket_cnt = 0;
1756
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
1757
bucket_cnt++;
1758
1759
if (bucket_cnt && !locked) {
1760
locked = true;
1761
goto again_nocopy;
1762
}
1763
1764
if (bucket_cnt > (max_count - total)) {
1765
if (total == 0)
1766
ret = -ENOSPC;
1767
/* Note that since bucket_cnt > 0 here, it is implicit
1768
* that the lock was grabbed, so release it.
1769
*/
1770
htab_unlock_bucket(b, flags);
1771
rcu_read_unlock();
1772
bpf_enable_instrumentation();
1773
goto after_loop;
1774
}
1775
1776
if (bucket_cnt > bucket_size) {
1777
bucket_size = bucket_cnt;
1778
/* Note that since bucket_cnt > 0 here, it is implicit
1779
* that the lock was grabbed, so release it.
1780
*/
1781
htab_unlock_bucket(b, flags);
1782
rcu_read_unlock();
1783
bpf_enable_instrumentation();
1784
kvfree(keys);
1785
kvfree(values);
1786
goto alloc;
1787
}
1788
1789
/* Next block is only safe to run if you have grabbed the lock */
1790
if (!locked)
1791
goto next_batch;
1792
1793
hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
1794
memcpy(dst_key, l->key, key_size);
1795
1796
if (is_percpu) {
1797
int off = 0, cpu;
1798
void __percpu *pptr;
1799
1800
pptr = htab_elem_get_ptr(l, map->key_size);
1801
for_each_possible_cpu(cpu) {
1802
copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu));
1803
check_and_init_map_value(&htab->map, dst_val + off);
1804
off += size;
1805
}
1806
} else {
1807
value = htab_elem_value(l, key_size);
1808
if (is_fd_htab(htab)) {
1809
struct bpf_map **inner_map = value;
1810
1811
/* Actual value is the id of the inner map */
1812
map_id = map->ops->map_fd_sys_lookup_elem(*inner_map);
1813
value = &map_id;
1814
}
1815
1816
if (elem_map_flags & BPF_F_LOCK)
1817
copy_map_value_locked(map, dst_val, value,
1818
true);
1819
else
1820
copy_map_value(map, dst_val, value);
1821
/* Zeroing special fields in the temp buffer */
1822
check_and_init_map_value(map, dst_val);
1823
}
1824
if (do_delete) {
1825
hlist_nulls_del_rcu(&l->hash_node);
1826
1827
/* bpf_lru_push_free() will acquire lru_lock, which
1828
* may cause deadlock. See comments in function
1829
* prealloc_lru_pop(). Let us do bpf_lru_push_free()
1830
* after releasing the bucket lock.
1831
*
1832
* For htab of maps, htab_put_fd_value() in
1833
* free_htab_elem() may acquire a spinlock with bucket
1834
* lock being held and it violates the lock rule, so
1835
* invoke free_htab_elem() after unlock as well.
1836
*/
1837
l->batch_flink = node_to_free;
1838
node_to_free = l;
1839
}
1840
dst_key += key_size;
1841
dst_val += value_size;
1842
}
1843
1844
htab_unlock_bucket(b, flags);
1845
locked = false;
1846
1847
while (node_to_free) {
1848
l = node_to_free;
1849
node_to_free = node_to_free->batch_flink;
1850
if (is_lru_map)
1851
htab_lru_push_free(htab, l);
1852
else
1853
free_htab_elem(htab, l);
1854
}
1855
1856
next_batch:
1857
/* If we are not copying data, we can go to the next bucket and avoid
1858
* dropping the RCU read lock.
1859
*/
1860
if (!bucket_cnt && (batch + 1 < htab->n_buckets)) {
1861
batch++;
1862
goto again_nocopy;
1863
}
1864
1865
rcu_read_unlock();
1866
bpf_enable_instrumentation();
1867
if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
1868
key_size * bucket_cnt) ||
1869
copy_to_user(uvalues + total * value_size, values,
1870
value_size * bucket_cnt))) {
1871
ret = -EFAULT;
1872
goto after_loop;
1873
}
1874
1875
total += bucket_cnt;
1876
batch++;
1877
if (batch >= htab->n_buckets) {
1878
ret = -ENOENT;
1879
goto after_loop;
1880
}
1881
goto again;
1882
1883
after_loop:
1884
if (ret == -EFAULT)
1885
goto out;
1886
1887
/* copy # of entries and next batch */
1888
ubatch = u64_to_user_ptr(attr->batch.out_batch);
1889
if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
1890
put_user(total, &uattr->batch.count))
1891
ret = -EFAULT;
1892
1893
out:
1894
kvfree(keys);
1895
kvfree(values);
1896
return ret;
1897
}
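/*
* A user-space sketch of driving the batch path above, assuming libbpf's
* bpf_map_lookup_batch() and a plain (non-percpu) hash map with u32 keys and
* u64 values; the "batch" cursor is the bucket index carried via
* attr->batch.in_batch/out_batch:
*
*    __u32 batch = 0, count = 128;
*    __u32 keys[128];
*    __u64 vals[128];
*    int err;
*
*    err = bpf_map_lookup_batch(fd, NULL, &batch, keys, vals, &count, NULL);
*    while (!err) {
*        (consume "count" entries, then continue from "batch")
*        count = 128;
*        err = bpf_map_lookup_batch(fd, &batch, &batch, keys, vals,
*                                   &count, NULL);
*    }
*    (-ENOENT here means all buckets have been walked)
*/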
1898
1899
static int
1900
htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
1901
union bpf_attr __user *uattr)
1902
{
1903
return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1904
false, true);
1905
}
1906
1907
static int
1908
htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
1909
const union bpf_attr *attr,
1910
union bpf_attr __user *uattr)
1911
{
1912
return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1913
false, true);
1914
}
1915
1916
static int
1917
htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
1918
union bpf_attr __user *uattr)
1919
{
1920
return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1921
false, false);
1922
}
1923
1924
static int
1925
htab_map_lookup_and_delete_batch(struct bpf_map *map,
1926
const union bpf_attr *attr,
1927
union bpf_attr __user *uattr)
1928
{
1929
return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1930
false, false);
1931
}
1932
1933
static int
1934
htab_lru_percpu_map_lookup_batch(struct bpf_map *map,
1935
const union bpf_attr *attr,
1936
union bpf_attr __user *uattr)
1937
{
1938
return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1939
true, true);
1940
}
1941
1942
static int
1943
htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
1944
const union bpf_attr *attr,
1945
union bpf_attr __user *uattr)
1946
{
1947
return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1948
true, true);
1949
}
1950
1951
static int
1952
htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
1953
union bpf_attr __user *uattr)
1954
{
1955
return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1956
true, false);
1957
}
1958
1959
static int
1960
htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
1961
const union bpf_attr *attr,
1962
union bpf_attr __user *uattr)
1963
{
1964
return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1965
true, false);
1966
}
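/*
 * Illustrative sketch, not part of this file: the batch ops above are
 * normally driven from user space through libbpf's bpf_map_lookup_batch()
 * and bpf_map_lookup_and_delete_batch(). Buffer sizing and the error
 * convention (negative errno vs. -1 plus errno) depend on the libbpf
 * version, so treat this as a hedged example only; 'map_fd', 'keys' and
 * 'values' are assumed to be prepared by the caller.
 *
 *	__u32 out_batch, count, total = 0;
 *	void *in_batch = NULL;
 *	int err;
 *
 *	do {
 *		count = 64;	// max number of elements per call
 *		err = bpf_map_lookup_batch(map_fd, in_batch, &out_batch,
 *					   keys, values, &count, NULL);
 *		// process 'count' elements from keys/values here; they are
 *		// valid even when err is -ENOENT, which signals that the
 *		// last bucket has been visited (see the n_buckets check
 *		// above)
 *		total += count;
 *		in_batch = &out_batch;
 *	} while (!err);
 */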
1967
1968
struct bpf_iter_seq_hash_map_info {
1969
struct bpf_map *map;
1970
struct bpf_htab *htab;
1971
void *percpu_value_buf; // non-NULL means percpu hash
1972
u32 bucket_id;
1973
u32 skip_elems;
1974
};
1975
1976
static struct htab_elem *
1977
bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,
1978
struct htab_elem *prev_elem)
1979
{
1980
const struct bpf_htab *htab = info->htab;
1981
u32 skip_elems = info->skip_elems;
1982
u32 bucket_id = info->bucket_id;
1983
struct hlist_nulls_head *head;
1984
struct hlist_nulls_node *n;
1985
struct htab_elem *elem;
1986
struct bucket *b;
1987
u32 i, count;
1988
1989
if (bucket_id >= htab->n_buckets)
1990
return NULL;
1991
1992
/* try to find next elem in the same bucket */
1993
if (prev_elem) {
1994
/* no update/deletion on this bucket, prev_elem should still be valid
1995
* and we won't skip elements.
1996
*/
1997
n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node));
1998
elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node);
1999
if (elem)
2000
return elem;
2001
2002
/* not found, unlock and go to the next bucket */
2003
b = &htab->buckets[bucket_id++];
2004
rcu_read_unlock();
2005
skip_elems = 0;
2006
}
2007
2008
for (i = bucket_id; i < htab->n_buckets; i++) {
2009
b = &htab->buckets[i];
2010
rcu_read_lock();
2011
2012
count = 0;
2013
head = &b->head;
2014
hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
2015
if (count >= skip_elems) {
2016
info->bucket_id = i;
2017
info->skip_elems = count;
2018
return elem;
2019
}
2020
count++;
2021
}
2022
2023
rcu_read_unlock();
2024
skip_elems = 0;
2025
}
2026
2027
info->bucket_id = i;
2028
info->skip_elems = 0;
2029
return NULL;
2030
}
2031
2032
static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos)
2033
{
2034
struct bpf_iter_seq_hash_map_info *info = seq->private;
2035
struct htab_elem *elem;
2036
2037
elem = bpf_hash_map_seq_find_next(info, NULL);
2038
if (!elem)
2039
return NULL;
2040
2041
if (*pos == 0)
2042
++*pos;
2043
return elem;
2044
}
2045
2046
static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2047
{
2048
struct bpf_iter_seq_hash_map_info *info = seq->private;
2049
2050
++*pos;
2051
++info->skip_elems;
2052
return bpf_hash_map_seq_find_next(info, v);
2053
}
2054
2055
static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
2056
{
2057
struct bpf_iter_seq_hash_map_info *info = seq->private;
2058
struct bpf_iter__bpf_map_elem ctx = {};
2059
struct bpf_map *map = info->map;
2060
struct bpf_iter_meta meta;
2061
int ret = 0, off = 0, cpu;
2062
u32 roundup_value_size;
2063
struct bpf_prog *prog;
2064
void __percpu *pptr;
2065
2066
meta.seq = seq;
2067
prog = bpf_iter_get_info(&meta, elem == NULL);
2068
if (prog) {
2069
ctx.meta = &meta;
2070
ctx.map = info->map;
2071
if (elem) {
2072
ctx.key = elem->key;
2073
if (!info->percpu_value_buf) {
2074
ctx.value = htab_elem_value(elem, map->key_size);
2075
} else {
2076
roundup_value_size = round_up(map->value_size, 8);
2077
pptr = htab_elem_get_ptr(elem, map->key_size);
2078
for_each_possible_cpu(cpu) {
2079
copy_map_value_long(map, info->percpu_value_buf + off,
2080
per_cpu_ptr(pptr, cpu));
2081
check_and_init_map_value(map, info->percpu_value_buf + off);
2082
off += roundup_value_size;
2083
}
2084
ctx.value = info->percpu_value_buf;
2085
}
2086
}
2087
ret = bpf_iter_run_prog(prog, &ctx);
2088
}
2089
2090
return ret;
2091
}
2092
2093
static int bpf_hash_map_seq_show(struct seq_file *seq, void *v)
2094
{
2095
return __bpf_hash_map_seq_show(seq, v);
2096
}
2097
2098
static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v)
2099
{
2100
if (!v)
2101
(void)__bpf_hash_map_seq_show(seq, NULL);
2102
else
2103
rcu_read_unlock();
2104
}
2105
2106
static int bpf_iter_init_hash_map(void *priv_data,
2107
struct bpf_iter_aux_info *aux)
2108
{
2109
struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
2110
struct bpf_map *map = aux->map;
2111
void *value_buf;
2112
u32 buf_size;
2113
2114
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
2115
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
2116
buf_size = round_up(map->value_size, 8) * num_possible_cpus();
2117
value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
2118
if (!value_buf)
2119
return -ENOMEM;
2120
2121
seq_info->percpu_value_buf = value_buf;
2122
}
2123
2124
bpf_map_inc_with_uref(map);
2125
seq_info->map = map;
2126
seq_info->htab = container_of(map, struct bpf_htab, map);
2127
return 0;
2128
}
2129
2130
static void bpf_iter_fini_hash_map(void *priv_data)
2131
{
2132
struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
2133
2134
bpf_map_put_with_uref(seq_info->map);
2135
kfree(seq_info->percpu_value_buf);
2136
}
2137
2138
static const struct seq_operations bpf_hash_map_seq_ops = {
2139
.start = bpf_hash_map_seq_start,
2140
.next = bpf_hash_map_seq_next,
2141
.stop = bpf_hash_map_seq_stop,
2142
.show = bpf_hash_map_seq_show,
2143
};
2144
2145
static const struct bpf_iter_seq_info iter_seq_info = {
2146
.seq_ops = &bpf_hash_map_seq_ops,
2147
.init_seq_private = bpf_iter_init_hash_map,
2148
.fini_seq_private = bpf_iter_fini_hash_map,
2149
.seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info),
2150
};
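/*
 * Illustrative sketch, not part of this file: the seq_ops and
 * iter_seq_info above back the "bpf_map_elem" iterator for hash maps,
 * which feeds each element to a BPF program as a
 * struct bpf_iter__bpf_map_elem. A minimal dumper for a map with u32 keys
 * and u64 values (the key/value types are assumptions here) could look
 * like:
 *
 *	SEC("iter/bpf_map_elem")
 *	int dump_htab(struct bpf_iter__bpf_map_elem *ctx)
 *	{
 *		struct seq_file *seq = ctx->meta->seq;
 *		__u32 *key = ctx->key;
 *		__u64 *val = ctx->value;
 *
 *		// ctx->key/ctx->value are NULL on the final call made by
 *		// bpf_hash_map_seq_stop() above
 *		if (!key || !val)
 *			return 0;
 *		BPF_SEQ_PRINTF(seq, "%u: %llu\n", *key, *val);
 *		return 0;
 *	}
 *
 * For per-cpu hash maps, ctx->value points at the percpu_value_buf filled
 * in __bpf_hash_map_seq_show(), i.e. one value per possible CPU, each
 * rounded up to 8 bytes.
 */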
2151
2152
static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
2153
void *callback_ctx, u64 flags)
2154
{
2155
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2156
struct hlist_nulls_head *head;
2157
struct hlist_nulls_node *n;
2158
struct htab_elem *elem;
2159
int i, num_elems = 0;
2160
void __percpu *pptr;
2161
struct bucket *b;
2162
void *key, *val;
2163
bool is_percpu;
2164
u64 ret = 0;
2165
2166
cant_migrate();
2167
2168
if (flags != 0)
2169
return -EINVAL;
2170
2171
is_percpu = htab_is_percpu(htab);
2172
2173
/* migration has been disabled, so the percpu value prepared here will be
2174
* the same as the one seen by the bpf program with
2175
* bpf_map_lookup_elem().
2176
*/
2177
for (i = 0; i < htab->n_buckets; i++) {
2178
b = &htab->buckets[i];
2179
rcu_read_lock();
2180
head = &b->head;
2181
hlist_nulls_for_each_entry_safe(elem, n, head, hash_node) {
2182
key = elem->key;
2183
if (is_percpu) {
2184
/* current cpu value for percpu map */
2185
pptr = htab_elem_get_ptr(elem, map->key_size);
2186
val = this_cpu_ptr(pptr);
2187
} else {
2188
val = htab_elem_value(elem, map->key_size);
2189
}
2190
num_elems++;
2191
ret = callback_fn((u64)(long)map, (u64)(long)key,
2192
(u64)(long)val, (u64)(long)callback_ctx, 0);
2193
/* return value: 0 - continue, 1 - stop and return */
2194
if (ret) {
2195
rcu_read_unlock();
2196
goto out;
2197
}
2198
}
2199
rcu_read_unlock();
2200
}
2201
out:
2202
return num_elems;
2203
}
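/*
 * Illustrative sketch, not part of this file: bpf_for_each_hash_elem() is
 * the hash map backend of the bpf_for_each_map_elem() helper. From a BPF
 * program it might be used as below; the key and value types and the map
 * name are assumptions, while the callback contract (return 0 to continue,
 * 1 to stop; flags must be 0; the helper returns the number of elements
 * visited) follows from the code above.
 *
 *	struct cb_ctx { __u64 sum; };
 *
 *	static long sum_values(struct bpf_map *map, __u32 *key,
 *			       __u64 *val, struct cb_ctx *cb)
 *	{
 *		cb->sum += *val;
 *		return 0;	// 0 - continue, 1 - stop iterating
 *	}
 *
 *	// somewhere in a BPF program:
 *	//	struct cb_ctx cb = {};
 *	//	long visited = bpf_for_each_map_elem(&my_htab, sum_values,
 *	//					     &cb, 0);
 */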
2204
2205
static u64 htab_map_mem_usage(const struct bpf_map *map)
2206
{
2207
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2208
u32 value_size = round_up(htab->map.value_size, 8);
2209
bool prealloc = htab_is_prealloc(htab);
2210
bool percpu = htab_is_percpu(htab);
2211
bool lru = htab_is_lru(htab);
2212
u64 num_entries;
2213
u64 usage = sizeof(struct bpf_htab);
2214
2215
usage += sizeof(struct bucket) * htab->n_buckets;
2216
usage += sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT;
2217
if (prealloc) {
2218
num_entries = map->max_entries;
2219
if (htab_has_extra_elems(htab))
2220
num_entries += num_possible_cpus();
2221
2222
usage += htab->elem_size * num_entries;
2223
2224
if (percpu)
2225
usage += value_size * num_possible_cpus() * num_entries;
2226
else if (!lru)
2227
usage += sizeof(struct htab_elem *) * num_possible_cpus();
2228
} else {
2229
#define LLIST_NODE_SZ sizeof(struct llist_node)
2230
2231
num_entries = htab->use_percpu_counter ?
2232
percpu_counter_sum(&htab->pcount) :
2233
atomic_read(&htab->count);
2234
usage += (htab->elem_size + LLIST_NODE_SZ) * num_entries;
2235
if (percpu) {
2236
usage += (LLIST_NODE_SZ + sizeof(void *)) * num_entries;
2237
usage += value_size * num_possible_cpus() * num_entries;
2238
}
2239
}
2240
return usage;
2241
}
2242
2243
BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab)
2244
const struct bpf_map_ops htab_map_ops = {
2245
.map_meta_equal = bpf_map_meta_equal,
2246
.map_alloc_check = htab_map_alloc_check,
2247
.map_alloc = htab_map_alloc,
2248
.map_free = htab_map_free,
2249
.map_get_next_key = htab_map_get_next_key,
2250
.map_release_uref = htab_map_free_internal_structs,
2251
.map_lookup_elem = htab_map_lookup_elem,
2252
.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
2253
.map_update_elem = htab_map_update_elem,
2254
.map_delete_elem = htab_map_delete_elem,
2255
.map_gen_lookup = htab_map_gen_lookup,
2256
.map_seq_show_elem = htab_map_seq_show_elem,
2257
.map_set_for_each_callback_args = map_set_for_each_callback_args,
2258
.map_for_each_callback = bpf_for_each_hash_elem,
2259
.map_mem_usage = htab_map_mem_usage,
2260
BATCH_OPS(htab),
2261
.map_btf_id = &htab_map_btf_ids[0],
2262
.iter_seq_info = &iter_seq_info,
2263
};
2264
2265
const struct bpf_map_ops htab_lru_map_ops = {
2266
.map_meta_equal = bpf_map_meta_equal,
2267
.map_alloc_check = htab_map_alloc_check,
2268
.map_alloc = htab_map_alloc,
2269
.map_free = htab_map_free,
2270
.map_get_next_key = htab_map_get_next_key,
2271
.map_release_uref = htab_map_free_internal_structs,
2272
.map_lookup_elem = htab_lru_map_lookup_elem,
2273
.map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
2274
.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
2275
.map_update_elem = htab_lru_map_update_elem,
2276
.map_delete_elem = htab_lru_map_delete_elem,
2277
.map_gen_lookup = htab_lru_map_gen_lookup,
2278
.map_seq_show_elem = htab_map_seq_show_elem,
2279
.map_set_for_each_callback_args = map_set_for_each_callback_args,
2280
.map_for_each_callback = bpf_for_each_hash_elem,
2281
.map_mem_usage = htab_map_mem_usage,
2282
BATCH_OPS(htab_lru),
2283
.map_btf_id = &htab_map_btf_ids[0],
2284
.iter_seq_info = &iter_seq_info,
2285
};
2286
2287
/* Called from eBPF program */
2288
static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
2289
{
2290
struct htab_elem *l = __htab_map_lookup_elem(map, key);
2291
2292
if (l)
2293
return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
2294
else
2295
return NULL;
2296
}
2297
2298
/* inline bpf_map_lookup_elem() call for per-CPU hashmap */
2299
static int htab_percpu_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
2300
{
2301
struct bpf_insn *insn = insn_buf;
2302
2303
if (!bpf_jit_supports_percpu_insn())
2304
return -EOPNOTSUPP;
2305
2306
BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
2307
(void *(*)(struct bpf_map *map, void *key))NULL));
2308
*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
2309
*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3);
2310
*insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0,
2311
offsetof(struct htab_elem, key) + roundup(map->key_size, 8));
2312
*insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0);
2313
*insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
2314
2315
return insn - insn_buf;
2316
}
2317
2318
static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
2319
{
2320
struct htab_elem *l;
2321
2322
if (cpu >= nr_cpu_ids)
2323
return NULL;
2324
2325
l = __htab_map_lookup_elem(map, key);
2326
if (l)
2327
return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
2328
else
2329
return NULL;
2330
}
2331
2332
static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
2333
{
2334
struct htab_elem *l = __htab_map_lookup_elem(map, key);
2335
2336
if (l) {
2337
bpf_lru_node_set_ref(&l->lru_node);
2338
return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
2339
}
2340
2341
return NULL;
2342
}
2343
2344
static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
2345
{
2346
struct htab_elem *l;
2347
2348
if (cpu >= nr_cpu_ids)
2349
return NULL;
2350
2351
l = __htab_map_lookup_elem(map, key);
2352
if (l) {
2353
bpf_lru_node_set_ref(&l->lru_node);
2354
return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
2355
}
2356
2357
return NULL;
2358
}
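/*
 * Illustrative sketch, not part of this file: the two
 * *_lookup_percpu_elem() callbacks above implement the
 * bpf_map_lookup_percpu_elem() helper, which lets a BPF program read a
 * specific CPU's slot instead of its own. A hedged example; the value
 * type and map name are assumptions:
 *
 *	__u32 key = 0, cpu = 1;
 *	__u64 *val;
 *
 *	val = bpf_map_lookup_percpu_elem(&my_percpu_htab, &key, cpu);
 *	if (val)	// NULL when the key is missing or cpu >= nr_cpu_ids
 *		bpf_printk("cpu%u: %llu", cpu, *val);
 */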
2359
2360
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
2361
{
2362
struct htab_elem *l;
2363
void __percpu *pptr;
2364
int ret = -ENOENT;
2365
int cpu, off = 0;
2366
u32 size;
2367
2368
/* per_cpu areas are zero-filled and bpf programs can only
2369
* access 'value_size' of them, so copying rounded areas
2370
* will not leak any kernel data
2371
*/
2372
size = round_up(map->value_size, 8);
2373
rcu_read_lock();
2374
l = __htab_map_lookup_elem(map, key);
2375
if (!l)
2376
goto out;
2377
/* We do not mark the LRU map element here in order to not mess up
2378
* eviction heuristics when user space does a map walk.
2379
*/
2380
pptr = htab_elem_get_ptr(l, map->key_size);
2381
for_each_possible_cpu(cpu) {
2382
copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
2383
check_and_init_map_value(map, value + off);
2384
off += size;
2385
}
2386
ret = 0;
2387
out:
2388
rcu_read_unlock();
2389
return ret;
2390
}
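/*
 * Illustrative sketch, not part of this file: bpf_percpu_hash_copy() is
 * what services a syscall-side lookup on a per-cpu hash map, so the user
 * buffer must provide round_up(value_size, 8) bytes for every possible
 * CPU. A hedged libbpf-based example; 'map_fd' and the 8-byte value type
 * are assumptions:
 *
 *	int ncpus = libbpf_num_possible_cpus();
 *	__u64 *values = calloc(ncpus, sizeof(__u64));	// value_size == 8
 *	__u32 key = 0;
 *
 *	if (values && !bpf_map_lookup_elem(map_fd, &key, values)) {
 *		for (int cpu = 0; cpu < ncpus; cpu++)
 *			printf("cpu%d: %llu\n", cpu,
 *			       (unsigned long long)values[cpu]);
 *	}
 *	free(values);
 */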
2391
2392
int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
2393
u64 map_flags)
2394
{
2395
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2396
int ret;
2397
2398
rcu_read_lock();
2399
if (htab_is_lru(htab))
2400
ret = __htab_lru_percpu_map_update_elem(map, key, value,
2401
map_flags, true);
2402
else
2403
ret = htab_map_update_elem_in_place(map, key, value, map_flags,
2404
true, true);
2405
rcu_read_unlock();
2406
2407
return ret;
2408
}
2409
2410
static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
2411
struct seq_file *m)
2412
{
2413
struct htab_elem *l;
2414
void __percpu *pptr;
2415
int cpu;
2416
2417
rcu_read_lock();
2418
2419
l = __htab_map_lookup_elem(map, key);
2420
if (!l) {
2421
rcu_read_unlock();
2422
return;
2423
}
2424
2425
btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
2426
seq_puts(m, ": {\n");
2427
pptr = htab_elem_get_ptr(l, map->key_size);
2428
for_each_possible_cpu(cpu) {
2429
seq_printf(m, "\tcpu%d: ", cpu);
2430
btf_type_seq_show(map->btf, map->btf_value_type_id,
2431
per_cpu_ptr(pptr, cpu), m);
2432
seq_putc(m, '\n');
2433
}
2434
seq_puts(m, "}\n");
2435
2436
rcu_read_unlock();
2437
}
2438
2439
const struct bpf_map_ops htab_percpu_map_ops = {
2440
.map_meta_equal = bpf_map_meta_equal,
2441
.map_alloc_check = htab_map_alloc_check,
2442
.map_alloc = htab_map_alloc,
2443
.map_free = htab_map_free,
2444
.map_get_next_key = htab_map_get_next_key,
2445
.map_lookup_elem = htab_percpu_map_lookup_elem,
2446
.map_gen_lookup = htab_percpu_map_gen_lookup,
2447
.map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
2448
.map_update_elem = htab_percpu_map_update_elem,
2449
.map_delete_elem = htab_map_delete_elem,
2450
.map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem,
2451
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
2452
.map_set_for_each_callback_args = map_set_for_each_callback_args,
2453
.map_for_each_callback = bpf_for_each_hash_elem,
2454
.map_mem_usage = htab_map_mem_usage,
2455
BATCH_OPS(htab_percpu),
2456
.map_btf_id = &htab_map_btf_ids[0],
2457
.iter_seq_info = &iter_seq_info,
2458
};
2459
2460
const struct bpf_map_ops htab_lru_percpu_map_ops = {
2461
.map_meta_equal = bpf_map_meta_equal,
2462
.map_alloc_check = htab_map_alloc_check,
2463
.map_alloc = htab_map_alloc,
2464
.map_free = htab_map_free,
2465
.map_get_next_key = htab_map_get_next_key,
2466
.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
2467
.map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem,
2468
.map_update_elem = htab_lru_percpu_map_update_elem,
2469
.map_delete_elem = htab_lru_map_delete_elem,
2470
.map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem,
2471
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
2472
.map_set_for_each_callback_args = map_set_for_each_callback_args,
2473
.map_for_each_callback = bpf_for_each_hash_elem,
2474
.map_mem_usage = htab_map_mem_usage,
2475
BATCH_OPS(htab_lru_percpu),
2476
.map_btf_id = &htab_map_btf_ids[0],
2477
.iter_seq_info = &iter_seq_info,
2478
};
2479
2480
static int fd_htab_map_alloc_check(union bpf_attr *attr)
2481
{
2482
if (attr->value_size != sizeof(u32))
2483
return -EINVAL;
2484
return htab_map_alloc_check(attr);
2485
}
2486
2487
static void fd_htab_map_free(struct bpf_map *map)
2488
{
2489
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2490
struct hlist_nulls_node *n;
2491
struct hlist_nulls_head *head;
2492
struct htab_elem *l;
2493
int i;
2494
2495
for (i = 0; i < htab->n_buckets; i++) {
2496
head = select_bucket(htab, i);
2497
2498
hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
2499
void *ptr = fd_htab_map_get_ptr(map, l);
2500
2501
map->ops->map_fd_put_ptr(map, ptr, false);
2502
}
2503
}
2504
2505
htab_map_free(map);
2506
}
2507
2508
/* only called from syscall */
2509
int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
2510
{
2511
void **ptr;
2512
int ret = 0;
2513
2514
if (!map->ops->map_fd_sys_lookup_elem)
2515
return -ENOTSUPP;
2516
2517
rcu_read_lock();
2518
ptr = htab_map_lookup_elem(map, key);
2519
if (ptr)
2520
*value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr));
2521
else
2522
ret = -ENOENT;
2523
rcu_read_unlock();
2524
2525
return ret;
2526
}
2527
2528
/* Only called from syscall */
2529
int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
2530
void *key, void *value, u64 map_flags)
2531
{
2532
void *ptr;
2533
int ret;
2534
2535
ptr = map->ops->map_fd_get_ptr(map, map_file, *(int *)value);
2536
if (IS_ERR(ptr))
2537
return PTR_ERR(ptr);
2538
2539
/* The htab bucket lock is always held during update operations in fd
2540
* htab map, and the following rcu_read_lock() is only used to avoid
2541
* the WARN_ON_ONCE in htab_map_update_elem_in_place().
2542
*/
2543
rcu_read_lock();
2544
ret = htab_map_update_elem_in_place(map, key, &ptr, map_flags, false, false);
2545
rcu_read_unlock();
2546
if (ret)
2547
map->ops->map_fd_put_ptr(map, ptr, false);
2548
2549
return ret;
2550
}
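/*
 * Illustrative sketch, not part of this file: for BPF_MAP_TYPE_HASH_OF_MAPS
 * the value stored through the syscall path above is an fd of the inner
 * map (value_size must be sizeof(u32), see fd_htab_map_alloc_check()),
 * while syscall-side lookups return the inner map id. A hedged user-space
 * example; 'outer_fd' and 'inner_fd' are assumed to exist:
 *
 *	__u32 key = 1;
 *	__u32 inner_id;
 *
 *	// store: the value passed in is the inner map's fd
 *	bpf_map_update_elem(outer_fd, &key, &inner_fd, BPF_ANY);
 *
 *	// lookup: the value returned is the inner map's id, not an fd
 *	if (!bpf_map_lookup_elem(outer_fd, &key, &inner_id))
 *		printf("inner map id %u\n", inner_id);
 */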
2551
2552
static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
2553
{
2554
struct bpf_map *map, *inner_map_meta;
2555
2556
inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
2557
if (IS_ERR(inner_map_meta))
2558
return inner_map_meta;
2559
2560
map = htab_map_alloc(attr);
2561
if (IS_ERR(map)) {
2562
bpf_map_meta_free(inner_map_meta);
2563
return map;
2564
}
2565
2566
map->inner_map_meta = inner_map_meta;
2567
2568
return map;
2569
}
2570
2571
static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
2572
{
2573
struct bpf_map **inner_map = htab_map_lookup_elem(map, key);
2574
2575
if (!inner_map)
2576
return NULL;
2577
2578
return READ_ONCE(*inner_map);
2579
}
2580
2581
static int htab_of_map_gen_lookup(struct bpf_map *map,
2582
struct bpf_insn *insn_buf)
2583
{
2584
struct bpf_insn *insn = insn_buf;
2585
const int ret = BPF_REG_0;
2586
2587
BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
2588
(void *(*)(struct bpf_map *map, void *key))NULL));
2589
*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
2590
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
2591
*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
2592
offsetof(struct htab_elem, key) +
2593
round_up(map->key_size, 8));
2594
*insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
2595
2596
return insn - insn_buf;
2597
}
2598
2599
static void htab_of_map_free(struct bpf_map *map)
2600
{
2601
bpf_map_meta_free(map->inner_map_meta);
2602
fd_htab_map_free(map);
2603
}
2604
2605
const struct bpf_map_ops htab_of_maps_map_ops = {
2606
.map_alloc_check = fd_htab_map_alloc_check,
2607
.map_alloc = htab_of_map_alloc,
2608
.map_free = htab_of_map_free,
2609
.map_get_next_key = htab_map_get_next_key,
2610
.map_lookup_elem = htab_of_map_lookup_elem,
2611
.map_delete_elem = htab_map_delete_elem,
2612
.map_fd_get_ptr = bpf_map_fd_get_ptr,
2613
.map_fd_put_ptr = bpf_map_fd_put_ptr,
2614
.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
2615
.map_gen_lookup = htab_of_map_gen_lookup,
2616
.map_check_btf = map_check_no_btf,
2617
.map_mem_usage = htab_map_mem_usage,
2618
BATCH_OPS(htab),
2619
.map_btf_id = &htab_map_btf_ids[0],
2620
};