GitHub Repository: torvalds/linux
Path: blob/master/kernel/bpf/core.c
1
// SPDX-License-Identifier: GPL-2.0-or-later
2
/*
3
* Linux Socket Filter - Kernel level socket filtering
4
*
5
* Based on the design of the Berkeley Packet Filter. The new
6
* internal format has been designed by PLUMgrid:
7
*
8
* Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
9
*
10
* Authors:
11
*
12
* Jay Schulist <[email protected]>
13
* Alexei Starovoitov <[email protected]>
14
* Daniel Borkmann <[email protected]>
15
*
16
* Andi Kleen - Fix a few bad bugs and races.
17
* Kris Katterjohn - Added many additional checks in bpf_check_classic()
18
*/
19
20
#include <uapi/linux/btf.h>
21
#include <crypto/sha1.h>
22
#include <linux/filter.h>
23
#include <linux/skbuff.h>
24
#include <linux/vmalloc.h>
25
#include <linux/prandom.h>
26
#include <linux/bpf.h>
27
#include <linux/btf.h>
28
#include <linux/objtool.h>
29
#include <linux/overflow.h>
30
#include <linux/rbtree_latch.h>
31
#include <linux/kallsyms.h>
32
#include <linux/rcupdate.h>
33
#include <linux/perf_event.h>
34
#include <linux/extable.h>
35
#include <linux/log2.h>
36
#include <linux/bpf_verifier.h>
37
#include <linux/nodemask.h>
38
#include <linux/nospec.h>
39
#include <linux/bpf_mem_alloc.h>
40
#include <linux/memcontrol.h>
41
#include <linux/execmem.h>
42
#include <crypto/sha2.h>
43
44
#include <asm/barrier.h>
45
#include <linux/unaligned.h>
46
47
/* Registers */
48
#define BPF_R0 regs[BPF_REG_0]
49
#define BPF_R1 regs[BPF_REG_1]
50
#define BPF_R2 regs[BPF_REG_2]
51
#define BPF_R3 regs[BPF_REG_3]
52
#define BPF_R4 regs[BPF_REG_4]
53
#define BPF_R5 regs[BPF_REG_5]
54
#define BPF_R6 regs[BPF_REG_6]
55
#define BPF_R7 regs[BPF_REG_7]
56
#define BPF_R8 regs[BPF_REG_8]
57
#define BPF_R9 regs[BPF_REG_9]
58
#define BPF_R10 regs[BPF_REG_10]
59
60
/* Named registers */
61
#define DST regs[insn->dst_reg]
62
#define SRC regs[insn->src_reg]
63
#define FP regs[BPF_REG_FP]
64
#define AX regs[BPF_REG_AX]
65
#define ARG1 regs[BPF_REG_ARG1]
66
#define CTX regs[BPF_REG_CTX]
67
#define OFF insn->off
68
#define IMM insn->imm
69
70
struct bpf_mem_alloc bpf_global_ma;
71
bool bpf_global_ma_set;
72
73
/* No hurry in this branch
74
*
75
* Exported for the bpf jit load helper.
76
*/
77
void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size)
78
{
79
u8 *ptr = NULL;
80
81
if (k >= SKF_NET_OFF) {
82
ptr = skb_network_header(skb) + k - SKF_NET_OFF;
83
} else if (k >= SKF_LL_OFF) {
84
if (unlikely(!skb_mac_header_was_set(skb)))
85
return NULL;
86
ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
87
}
88
if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
89
return ptr;
90
91
return NULL;
92
}
93
94
/* tell bpf programs that include vmlinux.h the kernel's PAGE_SIZE */
95
enum page_size_enum {
96
__PAGE_SIZE = PAGE_SIZE
97
};
98
99
struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
100
{
101
gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
102
struct bpf_prog_aux *aux;
103
struct bpf_prog *fp;
104
105
size = round_up(size, __PAGE_SIZE);
106
fp = __vmalloc(size, gfp_flags);
107
if (fp == NULL)
108
return NULL;
109
110
aux = kzalloc(sizeof(*aux), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
111
if (aux == NULL) {
112
vfree(fp);
113
return NULL;
114
}
115
fp->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
116
if (!fp->active) {
117
vfree(fp);
118
kfree(aux);
119
return NULL;
120
}
121
122
fp->pages = size / PAGE_SIZE;
123
fp->aux = aux;
124
fp->aux->main_prog_aux = aux;
125
fp->aux->prog = fp;
126
fp->jit_requested = ebpf_jit_enabled();
127
fp->blinding_requested = bpf_jit_blinding_enabled(fp);
128
#ifdef CONFIG_CGROUP_BPF
129
aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID;
130
#endif
131
132
INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode);
133
#ifdef CONFIG_FINEIBT
134
INIT_LIST_HEAD_RCU(&fp->aux->ksym_prefix.lnode);
135
#endif
136
mutex_init(&fp->aux->used_maps_mutex);
137
mutex_init(&fp->aux->ext_mutex);
138
mutex_init(&fp->aux->dst_mutex);
139
140
#ifdef CONFIG_BPF_SYSCALL
141
bpf_prog_stream_init(fp);
142
#endif
143
144
return fp;
145
}
146
147
struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
148
{
149
gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
150
struct bpf_prog *prog;
151
int cpu;
152
153
prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags);
154
if (!prog)
155
return NULL;
156
157
prog->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags);
158
if (!prog->stats) {
159
free_percpu(prog->active);
160
kfree(prog->aux);
161
vfree(prog);
162
return NULL;
163
}
164
165
for_each_possible_cpu(cpu) {
166
struct bpf_prog_stats *pstats;
167
168
pstats = per_cpu_ptr(prog->stats, cpu);
169
u64_stats_init(&pstats->syncp);
170
}
171
return prog;
172
}
173
EXPORT_SYMBOL_GPL(bpf_prog_alloc);
174
175
int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
176
{
177
if (!prog->aux->nr_linfo || !prog->jit_requested)
178
return 0;
179
180
prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo,
181
sizeof(*prog->aux->jited_linfo),
182
bpf_memcg_flags(GFP_KERNEL | __GFP_NOWARN));
183
if (!prog->aux->jited_linfo)
184
return -ENOMEM;
185
186
return 0;
187
}
188
189
void bpf_prog_jit_attempt_done(struct bpf_prog *prog)
190
{
191
if (prog->aux->jited_linfo &&
192
(!prog->jited || !prog->aux->jited_linfo[0])) {
193
kvfree(prog->aux->jited_linfo);
194
prog->aux->jited_linfo = NULL;
195
}
196
197
kfree(prog->aux->kfunc_tab);
198
prog->aux->kfunc_tab = NULL;
199
}
200
201
/* The jit engine is responsible to provide an array
202
* for insn_off to the jited_off mapping (insn_to_jit_off).
203
*
204
* The idx to this array is the insn_off. Hence, the insn_off
205
* here is relative to the prog itself instead of the main prog.
206
* This array has one entry for each xlated bpf insn.
207
*
208
* jited_off is the byte off to the end of the jited insn.
209
*
210
* Hence, with
211
* insn_start:
212
* The first bpf insn off of the prog. The insn off
213
* here is relative to the main prog.
214
* e.g. if prog is a subprog, insn_start > 0
215
* linfo_idx:
216
* The prog's idx to prog->aux->linfo and jited_linfo
217
*
218
* jited_linfo[linfo_idx] = prog->bpf_func
219
*
220
* For i > linfo_idx,
221
*
222
* jited_linfo[i] = prog->bpf_func +
223
* insn_to_jit_off[linfo[i].insn_off - insn_start - 1]
224
*/
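/* Editorial note: a worked example of the mapping above (numbers are
 * illustrative only, not taken from a real program). Suppose a subprog
 * starts at main-prog insn 10 (insn_start = 10) and linfo[i].insn_off = 12
 * for some i > linfo_idx. Then
 *   jited_linfo[i] = prog->bpf_func + insn_to_jit_off[12 - 10 - 1],
 * i.e. the jited address right after the jited code of the subprog's
 * instruction at prog-relative offset 1 (main-prog insn 11).
 */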
225
void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
226
const u32 *insn_to_jit_off)
227
{
228
u32 linfo_idx, insn_start, insn_end, nr_linfo, i;
229
const struct bpf_line_info *linfo;
230
void **jited_linfo;
231
232
if (!prog->aux->jited_linfo || prog->aux->func_idx > prog->aux->func_cnt)
233
/* Userspace did not provide linfo */
234
return;
235
236
linfo_idx = prog->aux->linfo_idx;
237
linfo = &prog->aux->linfo[linfo_idx];
238
insn_start = linfo[0].insn_off;
239
insn_end = insn_start + prog->len;
240
241
jited_linfo = &prog->aux->jited_linfo[linfo_idx];
242
jited_linfo[0] = prog->bpf_func;
243
244
nr_linfo = prog->aux->nr_linfo - linfo_idx;
245
246
for (i = 1; i < nr_linfo && linfo[i].insn_off < insn_end; i++)
247
/* The verifier ensures that linfo[i].insn_off is
248
* strictly increasing
249
*/
250
jited_linfo[i] = prog->bpf_func +
251
insn_to_jit_off[linfo[i].insn_off - insn_start - 1];
252
}
253
254
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
255
gfp_t gfp_extra_flags)
256
{
257
gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
258
struct bpf_prog *fp;
259
u32 pages;
260
261
size = round_up(size, PAGE_SIZE);
262
pages = size / PAGE_SIZE;
263
if (pages <= fp_old->pages)
264
return fp_old;
265
266
fp = __vmalloc(size, gfp_flags);
267
if (fp) {
268
memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
269
fp->pages = pages;
270
fp->aux->prog = fp;
271
272
/* We keep fp->aux from fp_old around in the new
273
* reallocated structure.
274
*/
275
fp_old->aux = NULL;
276
fp_old->stats = NULL;
277
fp_old->active = NULL;
278
__bpf_prog_free(fp_old);
279
}
280
281
return fp;
282
}
283
284
void __bpf_prog_free(struct bpf_prog *fp)
285
{
286
if (fp->aux) {
287
mutex_destroy(&fp->aux->used_maps_mutex);
288
mutex_destroy(&fp->aux->dst_mutex);
289
kfree(fp->aux->poke_tab);
290
kfree(fp->aux);
291
}
292
free_percpu(fp->stats);
293
free_percpu(fp->active);
294
vfree(fp);
295
}
296
297
int bpf_prog_calc_tag(struct bpf_prog *fp)
298
{
299
size_t size = bpf_prog_insn_size(fp);
300
struct bpf_insn *dst;
301
bool was_ld_map;
302
u32 i;
303
304
dst = vmalloc(size);
305
if (!dst)
306
return -ENOMEM;
307
308
/* We need to take out the map fd for the digest calculation
309
* since they are unstable from user space side.
310
*/
311
for (i = 0, was_ld_map = false; i < fp->len; i++) {
312
dst[i] = fp->insnsi[i];
313
if (!was_ld_map &&
314
dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) &&
315
(dst[i].src_reg == BPF_PSEUDO_MAP_FD ||
316
dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) {
317
was_ld_map = true;
318
dst[i].imm = 0;
319
} else if (was_ld_map &&
320
dst[i].code == 0 &&
321
dst[i].dst_reg == 0 &&
322
dst[i].src_reg == 0 &&
323
dst[i].off == 0) {
324
was_ld_map = false;
325
dst[i].imm = 0;
326
} else {
327
was_ld_map = false;
328
}
329
}
330
sha256((u8 *)dst, size, fp->digest);
331
vfree(dst);
332
return 0;
333
}
334
335
static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old,
336
s32 end_new, s32 curr, const bool probe_pass)
337
{
338
const s64 imm_min = S32_MIN, imm_max = S32_MAX;
339
s32 delta = end_new - end_old;
340
s64 imm = insn->imm;
341
342
if (curr < pos && curr + imm + 1 >= end_old)
343
imm += delta;
344
else if (curr >= end_new && curr + imm + 1 < end_new)
345
imm -= delta;
346
if (imm < imm_min || imm > imm_max)
347
return -ERANGE;
348
if (!probe_pass)
349
insn->imm = imm;
350
return 0;
351
}
352
353
static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
354
s32 end_new, s32 curr, const bool probe_pass)
355
{
356
s64 off_min, off_max, off;
357
s32 delta = end_new - end_old;
358
359
if (insn->code == (BPF_JMP32 | BPF_JA)) {
360
off = insn->imm;
361
off_min = S32_MIN;
362
off_max = S32_MAX;
363
} else {
364
off = insn->off;
365
off_min = S16_MIN;
366
off_max = S16_MAX;
367
}
368
369
if (curr < pos && curr + off + 1 >= end_old)
370
off += delta;
371
else if (curr >= end_new && curr + off + 1 < end_new)
372
off -= delta;
373
if (off < off_min || off > off_max)
374
return -ERANGE;
375
if (!probe_pass) {
376
if (insn->code == (BPF_JMP32 | BPF_JA))
377
insn->imm = off;
378
else
379
insn->off = off;
380
}
381
return 0;
382
}
383
384
static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old,
385
s32 end_new, const bool probe_pass)
386
{
387
u32 i, insn_cnt = prog->len + (probe_pass ? end_new - end_old : 0);
388
struct bpf_insn *insn = prog->insnsi;
389
int ret = 0;
390
391
for (i = 0; i < insn_cnt; i++, insn++) {
392
u8 code;
393
394
/* In the probing pass we still operate on the original,
395
* unpatched image in order to check overflows before we
396
* do any other adjustments. Therefore skip the patchlet.
397
*/
398
if (probe_pass && i == pos) {
399
i = end_new;
400
insn = prog->insnsi + end_old;
401
}
402
if (bpf_pseudo_func(insn)) {
403
ret = bpf_adj_delta_to_imm(insn, pos, end_old,
404
end_new, i, probe_pass);
405
if (ret)
406
return ret;
407
continue;
408
}
409
code = insn->code;
410
if ((BPF_CLASS(code) != BPF_JMP &&
411
BPF_CLASS(code) != BPF_JMP32) ||
412
BPF_OP(code) == BPF_EXIT)
413
continue;
414
/* Adjust offset of jmps if we cross patch boundaries. */
415
if (BPF_OP(code) == BPF_CALL) {
416
if (insn->src_reg != BPF_PSEUDO_CALL)
417
continue;
418
ret = bpf_adj_delta_to_imm(insn, pos, end_old,
419
end_new, i, probe_pass);
420
} else {
421
ret = bpf_adj_delta_to_off(insn, pos, end_old,
422
end_new, i, probe_pass);
423
}
424
if (ret)
425
break;
426
}
427
428
return ret;
429
}
430
431
static void bpf_adj_linfo(struct bpf_prog *prog, u32 off, u32 delta)
432
{
433
struct bpf_line_info *linfo;
434
u32 i, nr_linfo;
435
436
nr_linfo = prog->aux->nr_linfo;
437
if (!nr_linfo || !delta)
438
return;
439
440
linfo = prog->aux->linfo;
441
442
for (i = 0; i < nr_linfo; i++)
443
if (off < linfo[i].insn_off)
444
break;
445
446
/* Push all off < linfo[i].insn_off by delta */
447
for (; i < nr_linfo; i++)
448
linfo[i].insn_off += delta;
449
}
450
451
struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
452
const struct bpf_insn *patch, u32 len)
453
{
454
u32 insn_adj_cnt, insn_rest, insn_delta = len - 1;
455
const u32 cnt_max = S16_MAX;
456
struct bpf_prog *prog_adj;
457
int err;
458
459
/* Since our patchlet doesn't expand the image, we're done. */
460
if (insn_delta == 0) {
461
memcpy(prog->insnsi + off, patch, sizeof(*patch));
462
return prog;
463
}
464
465
insn_adj_cnt = prog->len + insn_delta;
466
467
/* Reject anything that would potentially let the insn->off
468
* target overflow when we have excessive program expansions.
469
* We need to probe here before we do any reallocation where
470
* we afterwards may not fail anymore.
471
*/
472
if (insn_adj_cnt > cnt_max &&
473
(err = bpf_adj_branches(prog, off, off + 1, off + len, true)))
474
return ERR_PTR(err);
475
476
/* Several new instructions need to be inserted. Make room
477
* for them. Likely, there's no need for a new allocation as
478
* last page could have large enough tailroom.
479
*/
480
prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt),
481
GFP_USER);
482
if (!prog_adj)
483
return ERR_PTR(-ENOMEM);
484
485
prog_adj->len = insn_adj_cnt;
486
487
/* Patching happens in 3 steps:
488
*
489
* 1) Move over tail of insnsi from next instruction onwards,
490
* so we can patch the single target insn with one or more
491
* new ones (patching is always from 1 to n insns, n > 0).
492
* 2) Inject new instructions at the target location.
493
* 3) Adjust branch offsets if necessary.
494
*/
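/* Editorial note, an illustrative example of the three steps above:
 * with off = 5 and a patch of len = 3, insn_delta = 2. The tail that
 * used to start at insn 6 is moved to start at insn 8, the three patch
 * insns are copied into slots 5..7, and any jump crossing the patched
 * region is widened by delta = 2 in bpf_adj_branches().
 */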
495
insn_rest = insn_adj_cnt - off - len;
496
497
memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1,
498
sizeof(*patch) * insn_rest);
499
memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len);
500
501
/* We are guaranteed to not fail at this point, otherwise
502
* the ship has sailed to reverse to the original state. An
503
* overflow cannot happen at this point.
504
*/
505
BUG_ON(bpf_adj_branches(prog_adj, off, off + 1, off + len, false));
506
507
bpf_adj_linfo(prog_adj, off, insn_delta);
508
509
return prog_adj;
510
}
511
512
int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt)
513
{
514
int err;
515
516
/* Branch offsets can't overflow when program is shrinking, no need
517
* to call bpf_adj_branches(..., true) here
518
*/
519
memmove(prog->insnsi + off, prog->insnsi + off + cnt,
520
sizeof(struct bpf_insn) * (prog->len - off - cnt));
521
prog->len -= cnt;
522
523
err = bpf_adj_branches(prog, off, off + cnt, off, false);
524
WARN_ON_ONCE(err);
525
return err;
526
}
527
528
static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
529
{
530
int i;
531
532
for (i = 0; i < fp->aux->real_func_cnt; i++)
533
bpf_prog_kallsyms_del(fp->aux->func[i]);
534
}
535
536
void bpf_prog_kallsyms_del_all(struct bpf_prog *fp)
537
{
538
bpf_prog_kallsyms_del_subprogs(fp);
539
bpf_prog_kallsyms_del(fp);
540
}
541
542
#ifdef CONFIG_BPF_JIT
543
/* All BPF JIT sysctl knobs here. */
544
int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
545
int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
546
int bpf_jit_harden __read_mostly;
547
long bpf_jit_limit __read_mostly;
548
long bpf_jit_limit_max __read_mostly;
549
550
static void
551
bpf_prog_ksym_set_addr(struct bpf_prog *prog)
552
{
553
WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog));
554
555
prog->aux->ksym.start = (unsigned long) prog->bpf_func;
556
prog->aux->ksym.end = prog->aux->ksym.start + prog->jited_len;
557
}
558
559
static void
560
bpf_prog_ksym_set_name(struct bpf_prog *prog)
561
{
562
char *sym = prog->aux->ksym.name;
563
const char *end = sym + KSYM_NAME_LEN;
564
const struct btf_type *type;
565
const char *func_name;
566
567
BUILD_BUG_ON(sizeof("bpf_prog_") +
568
sizeof(prog->tag) * 2 +
569
/* name has been null terminated.
570
* We would need +1 for the '_' preceding
571
* the name. However, the null character
572
* is double counted between the name and the
573
* sizeof("bpf_prog_") above, so we omit
574
* the +1 here.
575
*/
576
sizeof(prog->aux->name) > KSYM_NAME_LEN);
577
578
sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
579
sym = bin2hex(sym, prog->tag, sizeof(prog->tag));
580
581
/* prog->aux->name will be ignored if full btf name is available */
582
if (prog->aux->func_info_cnt && prog->aux->func_idx < prog->aux->func_info_cnt) {
583
type = btf_type_by_id(prog->aux->btf,
584
prog->aux->func_info[prog->aux->func_idx].type_id);
585
func_name = btf_name_by_offset(prog->aux->btf, type->name_off);
586
snprintf(sym, (size_t)(end - sym), "_%s", func_name);
587
return;
588
}
589
590
if (prog->aux->name[0])
591
snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name);
592
else
593
*sym = 0;
594
}
595
596
static unsigned long bpf_get_ksym_start(struct latch_tree_node *n)
597
{
598
return container_of(n, struct bpf_ksym, tnode)->start;
599
}
600
601
static __always_inline bool bpf_tree_less(struct latch_tree_node *a,
602
struct latch_tree_node *b)
603
{
604
return bpf_get_ksym_start(a) < bpf_get_ksym_start(b);
605
}
606
607
static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n)
608
{
609
unsigned long val = (unsigned long)key;
610
const struct bpf_ksym *ksym;
611
612
ksym = container_of(n, struct bpf_ksym, tnode);
613
614
if (val < ksym->start)
615
return -1;
616
/* Ensure that we detect return addresses as part of the program, when
617
* the final instruction is a call for a program part of the stack
618
* trace. Therefore, do val > ksym->end instead of val >= ksym->end.
619
*/
620
if (val > ksym->end)
621
return 1;
622
623
return 0;
624
}
625
626
static const struct latch_tree_ops bpf_tree_ops = {
627
.less = bpf_tree_less,
628
.comp = bpf_tree_comp,
629
};
630
631
static DEFINE_SPINLOCK(bpf_lock);
632
static LIST_HEAD(bpf_kallsyms);
633
static struct latch_tree_root bpf_tree __cacheline_aligned;
634
635
void bpf_ksym_add(struct bpf_ksym *ksym)
636
{
637
spin_lock_bh(&bpf_lock);
638
WARN_ON_ONCE(!list_empty(&ksym->lnode));
639
list_add_tail_rcu(&ksym->lnode, &bpf_kallsyms);
640
latch_tree_insert(&ksym->tnode, &bpf_tree, &bpf_tree_ops);
641
spin_unlock_bh(&bpf_lock);
642
}
643
644
static void __bpf_ksym_del(struct bpf_ksym *ksym)
645
{
646
if (list_empty(&ksym->lnode))
647
return;
648
649
latch_tree_erase(&ksym->tnode, &bpf_tree, &bpf_tree_ops);
650
list_del_rcu(&ksym->lnode);
651
}
652
653
void bpf_ksym_del(struct bpf_ksym *ksym)
654
{
655
spin_lock_bh(&bpf_lock);
656
__bpf_ksym_del(ksym);
657
spin_unlock_bh(&bpf_lock);
658
}
659
660
static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
661
{
662
return fp->jited && !bpf_prog_was_classic(fp);
663
}
664
665
void bpf_prog_kallsyms_add(struct bpf_prog *fp)
666
{
667
if (!bpf_prog_kallsyms_candidate(fp) ||
668
!bpf_token_capable(fp->aux->token, CAP_BPF))
669
return;
670
671
bpf_prog_ksym_set_addr(fp);
672
bpf_prog_ksym_set_name(fp);
673
fp->aux->ksym.prog = true;
674
675
bpf_ksym_add(&fp->aux->ksym);
676
677
#ifdef CONFIG_FINEIBT
678
/*
679
* When FineIBT, code in the __cfi_foo() symbols can get executed
680
* and hence unwinder needs help.
681
*/
682
if (cfi_mode != CFI_FINEIBT)
683
return;
684
685
snprintf(fp->aux->ksym_prefix.name, KSYM_NAME_LEN,
686
"__cfi_%s", fp->aux->ksym.name);
687
688
fp->aux->ksym_prefix.start = (unsigned long) fp->bpf_func - 16;
689
fp->aux->ksym_prefix.end = (unsigned long) fp->bpf_func;
690
691
bpf_ksym_add(&fp->aux->ksym_prefix);
692
#endif
693
}
694
695
void bpf_prog_kallsyms_del(struct bpf_prog *fp)
696
{
697
if (!bpf_prog_kallsyms_candidate(fp))
698
return;
699
700
bpf_ksym_del(&fp->aux->ksym);
701
#ifdef CONFIG_FINEIBT
702
if (cfi_mode != CFI_FINEIBT)
703
return;
704
bpf_ksym_del(&fp->aux->ksym_prefix);
705
#endif
706
}
707
708
static struct bpf_ksym *bpf_ksym_find(unsigned long addr)
709
{
710
struct latch_tree_node *n;
711
712
n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops);
713
return n ? container_of(n, struct bpf_ksym, tnode) : NULL;
714
}
715
716
int __bpf_address_lookup(unsigned long addr, unsigned long *size,
717
unsigned long *off, char *sym)
718
{
719
struct bpf_ksym *ksym;
720
int ret = 0;
721
722
rcu_read_lock();
723
ksym = bpf_ksym_find(addr);
724
if (ksym) {
725
unsigned long symbol_start = ksym->start;
726
unsigned long symbol_end = ksym->end;
727
728
ret = strscpy(sym, ksym->name, KSYM_NAME_LEN);
729
730
if (size)
731
*size = symbol_end - symbol_start;
732
if (off)
733
*off = addr - symbol_start;
734
}
735
rcu_read_unlock();
736
737
return ret;
738
}
739
740
bool is_bpf_text_address(unsigned long addr)
741
{
742
bool ret;
743
744
rcu_read_lock();
745
ret = bpf_ksym_find(addr) != NULL;
746
rcu_read_unlock();
747
748
return ret;
749
}
750
751
struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
752
{
753
struct bpf_ksym *ksym;
754
755
WARN_ON_ONCE(!rcu_read_lock_held());
756
ksym = bpf_ksym_find(addr);
757
758
return ksym && ksym->prog ?
759
container_of(ksym, struct bpf_prog_aux, ksym)->prog :
760
NULL;
761
}
762
763
bool bpf_has_frame_pointer(unsigned long ip)
764
{
765
struct bpf_ksym *ksym;
766
unsigned long offset;
767
768
guard(rcu)();
769
770
ksym = bpf_ksym_find(ip);
771
if (!ksym || !ksym->fp_start || !ksym->fp_end)
772
return false;
773
774
offset = ip - ksym->start;
775
776
return offset >= ksym->fp_start && offset < ksym->fp_end;
777
}
778
779
const struct exception_table_entry *search_bpf_extables(unsigned long addr)
780
{
781
const struct exception_table_entry *e = NULL;
782
struct bpf_prog *prog;
783
784
rcu_read_lock();
785
prog = bpf_prog_ksym_find(addr);
786
if (!prog)
787
goto out;
788
if (!prog->aux->num_exentries)
789
goto out;
790
791
e = search_extable(prog->aux->extable, prog->aux->num_exentries, addr);
792
out:
793
rcu_read_unlock();
794
return e;
795
}
796
797
int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
798
char *sym)
799
{
800
struct bpf_ksym *ksym;
801
unsigned int it = 0;
802
int ret = -ERANGE;
803
804
if (!bpf_jit_kallsyms_enabled())
805
return ret;
806
807
rcu_read_lock();
808
list_for_each_entry_rcu(ksym, &bpf_kallsyms, lnode) {
809
if (it++ != symnum)
810
continue;
811
812
strscpy(sym, ksym->name, KSYM_NAME_LEN);
813
814
*value = ksym->start;
815
*type = BPF_SYM_ELF_TYPE;
816
817
ret = 0;
818
break;
819
}
820
rcu_read_unlock();
821
822
return ret;
823
}
824
825
int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
826
struct bpf_jit_poke_descriptor *poke)
827
{
828
struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab;
829
static const u32 poke_tab_max = 1024;
830
u32 slot = prog->aux->size_poke_tab;
831
u32 size = slot + 1;
832
833
if (size > poke_tab_max)
834
return -ENOSPC;
835
if (poke->tailcall_target || poke->tailcall_target_stable ||
836
poke->tailcall_bypass || poke->adj_off || poke->bypass_addr)
837
return -EINVAL;
838
839
switch (poke->reason) {
840
case BPF_POKE_REASON_TAIL_CALL:
841
if (!poke->tail_call.map)
842
return -EINVAL;
843
break;
844
default:
845
return -EINVAL;
846
}
847
848
tab = krealloc_array(tab, size, sizeof(*poke), GFP_KERNEL);
849
if (!tab)
850
return -ENOMEM;
851
852
memcpy(&tab[slot], poke, sizeof(*poke));
853
prog->aux->size_poke_tab = size;
854
prog->aux->poke_tab = tab;
855
856
return slot;
857
}
858
859
/*
860
* BPF program pack allocator.
861
*
862
* Most BPF programs are pretty small. Allocating a whole page for each
863
* program is sometimes a waste. Many small BPF programs also add pressure
864
* to the instruction TLB. To solve this issue, we introduce a BPF program pack
865
* allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86)
866
* to host BPF programs.
867
*/
868
#define BPF_PROG_CHUNK_SHIFT 6
869
#define BPF_PROG_CHUNK_SIZE (1 << BPF_PROG_CHUNK_SHIFT)
870
#define BPF_PROG_CHUNK_MASK (~(BPF_PROG_CHUNK_SIZE - 1))
871
872
struct bpf_prog_pack {
873
struct list_head list;
874
void *ptr;
875
unsigned long bitmap[];
876
};
877
878
void bpf_jit_fill_hole_with_zero(void *area, unsigned int size)
879
{
880
memset(area, 0, size);
881
}
882
883
#define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
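/* Editorial note, an illustrative example of the chunk accounting:
 * with BPF_PROG_CHUNK_SIZE = 64 bytes, a 100-byte image needs
 * BPF_PROG_SIZE_TO_NBITS(100) = round_up(100, 64) / 64 = 2 bitmap bits,
 * i.e. two contiguous 64-byte chunks inside a pack.
 */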
884
885
static DEFINE_MUTEX(pack_mutex);
886
static LIST_HEAD(pack_list);
887
888
/* PMD_SIZE is not available in some special config, e.g. ARCH=arm with
889
* CONFIG_MMU=n. Use PAGE_SIZE in these cases.
890
*/
891
#ifdef PMD_SIZE
892
/* PMD_SIZE is really big for some archs. It doesn't make sense to
893
* reserve too much memory in one allocation. Hardcode BPF_PROG_PACK_SIZE to
894
* 2MiB * num_possible_nodes(). On most architectures PMD_SIZE will be
895
* greater than or equal to 2MB.
896
*/
897
#define BPF_PROG_PACK_SIZE (SZ_2M * num_possible_nodes())
898
#else
899
#define BPF_PROG_PACK_SIZE PAGE_SIZE
900
#endif
901
902
#define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE)
903
904
static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns)
905
{
906
struct bpf_prog_pack *pack;
907
int err;
908
909
pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)),
910
GFP_KERNEL);
911
if (!pack)
912
return NULL;
913
pack->ptr = bpf_jit_alloc_exec(BPF_PROG_PACK_SIZE);
914
if (!pack->ptr)
915
goto out;
916
bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE);
917
bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE);
918
919
set_vm_flush_reset_perms(pack->ptr);
920
err = set_memory_rox((unsigned long)pack->ptr,
921
BPF_PROG_PACK_SIZE / PAGE_SIZE);
922
if (err)
923
goto out;
924
list_add_tail(&pack->list, &pack_list);
925
return pack;
926
927
out:
928
bpf_jit_free_exec(pack->ptr);
929
kfree(pack);
930
return NULL;
931
}
932
933
void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
934
{
935
unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
936
struct bpf_prog_pack *pack;
937
unsigned long pos;
938
void *ptr = NULL;
939
940
mutex_lock(&pack_mutex);
941
if (size > BPF_PROG_PACK_SIZE) {
942
size = round_up(size, PAGE_SIZE);
943
ptr = bpf_jit_alloc_exec(size);
944
if (ptr) {
945
int err;
946
947
bpf_fill_ill_insns(ptr, size);
948
set_vm_flush_reset_perms(ptr);
949
err = set_memory_rox((unsigned long)ptr,
950
size / PAGE_SIZE);
951
if (err) {
952
bpf_jit_free_exec(ptr);
953
ptr = NULL;
954
}
955
}
956
goto out;
957
}
958
list_for_each_entry(pack, &pack_list, list) {
959
pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
960
nbits, 0);
961
if (pos < BPF_PROG_CHUNK_COUNT)
962
goto found_free_area;
963
}
964
965
pack = alloc_new_pack(bpf_fill_ill_insns);
966
if (!pack)
967
goto out;
968
969
pos = 0;
970
971
found_free_area:
972
bitmap_set(pack->bitmap, pos, nbits);
973
ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT);
974
975
out:
976
mutex_unlock(&pack_mutex);
977
return ptr;
978
}
979
980
void bpf_prog_pack_free(void *ptr, u32 size)
981
{
982
struct bpf_prog_pack *pack = NULL, *tmp;
983
unsigned int nbits;
984
unsigned long pos;
985
986
mutex_lock(&pack_mutex);
987
if (size > BPF_PROG_PACK_SIZE) {
988
bpf_jit_free_exec(ptr);
989
goto out;
990
}
991
992
list_for_each_entry(tmp, &pack_list, list) {
993
if (ptr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > ptr) {
994
pack = tmp;
995
break;
996
}
997
}
998
999
if (WARN_ONCE(!pack, "bpf_prog_pack bug\n"))
1000
goto out;
1001
1002
nbits = BPF_PROG_SIZE_TO_NBITS(size);
1003
pos = ((unsigned long)ptr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT;
1004
1005
WARN_ONCE(bpf_arch_text_invalidate(ptr, size),
1006
"bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n");
1007
1008
bitmap_clear(pack->bitmap, pos, nbits);
1009
if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
1010
BPF_PROG_CHUNK_COUNT, 0) == 0) {
1011
list_del(&pack->list);
1012
bpf_jit_free_exec(pack->ptr);
1013
kfree(pack);
1014
}
1015
out:
1016
mutex_unlock(&pack_mutex);
1017
}
1018
1019
static atomic_long_t bpf_jit_current;
1020
1021
/* Can be overridden by an arch's JIT compiler if it has a custom,
1022
* dedicated BPF backend memory area, or if neither of the two
1023
* below apply.
1024
*/
1025
u64 __weak bpf_jit_alloc_exec_limit(void)
1026
{
1027
#if defined(MODULES_VADDR)
1028
return MODULES_END - MODULES_VADDR;
1029
#else
1030
return VMALLOC_END - VMALLOC_START;
1031
#endif
1032
}
1033
1034
static int __init bpf_jit_charge_init(void)
1035
{
1036
/* Only used as heuristic here to derive limit. */
1037
bpf_jit_limit_max = bpf_jit_alloc_exec_limit();
1038
bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 1,
1039
PAGE_SIZE), LONG_MAX);
1040
return 0;
1041
}
1042
pure_initcall(bpf_jit_charge_init);
1043
1044
int bpf_jit_charge_modmem(u32 size)
1045
{
1046
if (atomic_long_add_return(size, &bpf_jit_current) > READ_ONCE(bpf_jit_limit)) {
1047
if (!bpf_capable()) {
1048
atomic_long_sub(size, &bpf_jit_current);
1049
return -EPERM;
1050
}
1051
}
1052
1053
return 0;
1054
}
1055
1056
void bpf_jit_uncharge_modmem(u32 size)
1057
{
1058
atomic_long_sub(size, &bpf_jit_current);
1059
}
1060
1061
void *__weak bpf_jit_alloc_exec(unsigned long size)
1062
{
1063
return execmem_alloc(EXECMEM_BPF, size);
1064
}
1065
1066
void __weak bpf_jit_free_exec(void *addr)
1067
{
1068
execmem_free(addr);
1069
}
1070
1071
struct bpf_binary_header *
1072
bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
1073
unsigned int alignment,
1074
bpf_jit_fill_hole_t bpf_fill_ill_insns)
1075
{
1076
struct bpf_binary_header *hdr;
1077
u32 size, hole, start;
1078
1079
WARN_ON_ONCE(!is_power_of_2(alignment) ||
1080
alignment > BPF_IMAGE_ALIGNMENT);
1081
1082
/* Most of BPF filters are really small, but if some of them
1083
* fill a page, allow at least 128 extra bytes to insert a
1084
* random section of illegal instructions.
1085
*/
1086
size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
1087
1088
if (bpf_jit_charge_modmem(size))
1089
return NULL;
1090
hdr = bpf_jit_alloc_exec(size);
1091
if (!hdr) {
1092
bpf_jit_uncharge_modmem(size);
1093
return NULL;
1094
}
1095
1096
/* Fill space with illegal/arch-dep instructions. */
1097
bpf_fill_ill_insns(hdr, size);
1098
1099
hdr->size = size;
1100
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
1101
PAGE_SIZE - sizeof(*hdr));
1102
start = get_random_u32_below(hole) & ~(alignment - 1);
1103
1104
/* Leave a random number of instructions before BPF code. */
1105
*image_ptr = &hdr->image[start];
1106
1107
return hdr;
1108
}
1109
1110
void bpf_jit_binary_free(struct bpf_binary_header *hdr)
1111
{
1112
u32 size = hdr->size;
1113
1114
bpf_jit_free_exec(hdr);
1115
bpf_jit_uncharge_modmem(size);
1116
}
1117
1118
/* Allocate jit binary from bpf_prog_pack allocator.
1119
* Since the allocated memory is RO+X, the JIT engine cannot write directly
1120
* to the memory. To solve this problem, a RW buffer is also allocated at
1121
* the same time. The JIT engine should calculate offsets based on the
1122
* RO memory address, but write the JITed program to the RW buffer. Once the
1123
* JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies
1124
* the JITed program to the RO memory.
1125
*/
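/* Editorial note: the intended call sequence for a JIT backend, sketched
 * from the comments in this file rather than stated as a prescriptive API
 * contract: bpf_jit_binary_pack_alloc() -> emit code into *rw_image ->
 * bpf_jit_binary_pack_finalize() on success, or
 * bpf_jit_binary_pack_free() if the JIT attempt fails.
 */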
1126
struct bpf_binary_header *
1127
bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
1128
unsigned int alignment,
1129
struct bpf_binary_header **rw_header,
1130
u8 **rw_image,
1131
bpf_jit_fill_hole_t bpf_fill_ill_insns)
1132
{
1133
struct bpf_binary_header *ro_header;
1134
u32 size, hole, start;
1135
1136
WARN_ON_ONCE(!is_power_of_2(alignment) ||
1137
alignment > BPF_IMAGE_ALIGNMENT);
1138
1139
/* add 16 bytes for a random section of illegal instructions */
1140
size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE);
1141
1142
if (bpf_jit_charge_modmem(size))
1143
return NULL;
1144
ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns);
1145
if (!ro_header) {
1146
bpf_jit_uncharge_modmem(size);
1147
return NULL;
1148
}
1149
1150
*rw_header = kvmalloc(size, GFP_KERNEL);
1151
if (!*rw_header) {
1152
bpf_prog_pack_free(ro_header, size);
1153
bpf_jit_uncharge_modmem(size);
1154
return NULL;
1155
}
1156
1157
/* Fill space with illegal/arch-dep instructions. */
1158
bpf_fill_ill_insns(*rw_header, size);
1159
(*rw_header)->size = size;
1160
1161
hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
1162
BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
1163
start = get_random_u32_below(hole) & ~(alignment - 1);
1164
1165
*image_ptr = &ro_header->image[start];
1166
*rw_image = &(*rw_header)->image[start];
1167
1168
return ro_header;
1169
}
1170
1171
/* Copy JITed text from rw_header to its final location, the ro_header. */
1172
int bpf_jit_binary_pack_finalize(struct bpf_binary_header *ro_header,
1173
struct bpf_binary_header *rw_header)
1174
{
1175
void *ptr;
1176
1177
ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size);
1178
1179
kvfree(rw_header);
1180
1181
if (IS_ERR(ptr)) {
1182
bpf_prog_pack_free(ro_header, ro_header->size);
1183
return PTR_ERR(ptr);
1184
}
1185
return 0;
1186
}
1187
1188
/* bpf_jit_binary_pack_free is called in two different scenarios:
1189
* 1) when the program is freed after the JIT has completed;
1190
* 2) when the JIT engine fails (before bpf_jit_binary_pack_finalize).
1191
* For case 2), we need to free both the RO memory and the RW buffer.
1192
*
1193
* bpf_jit_binary_pack_free requires proper ro_header->size. However,
1194
* bpf_jit_binary_pack_alloc does not set it. Therefore, ro_header->size
1195
* must be set with either bpf_jit_binary_pack_finalize (normal path) or
1196
* bpf_arch_text_copy (when jit fails).
1197
*/
1198
void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
1199
struct bpf_binary_header *rw_header)
1200
{
1201
u32 size = ro_header->size;
1202
1203
bpf_prog_pack_free(ro_header, size);
1204
kvfree(rw_header);
1205
bpf_jit_uncharge_modmem(size);
1206
}
1207
1208
struct bpf_binary_header *
1209
bpf_jit_binary_pack_hdr(const struct bpf_prog *fp)
1210
{
1211
unsigned long real_start = (unsigned long)fp->bpf_func;
1212
unsigned long addr;
1213
1214
addr = real_start & BPF_PROG_CHUNK_MASK;
1215
return (void *)addr;
1216
}
1217
1218
static inline struct bpf_binary_header *
1219
bpf_jit_binary_hdr(const struct bpf_prog *fp)
1220
{
1221
unsigned long real_start = (unsigned long)fp->bpf_func;
1222
unsigned long addr;
1223
1224
addr = real_start & PAGE_MASK;
1225
return (void *)addr;
1226
}
1227
1228
/* This symbol is only overridden by archs that have different
1229
* requirements than the usual eBPF JITs, f.e. when they only
1230
* implement cBPF JIT, do not set images read-only, etc.
1231
*/
1232
void __weak bpf_jit_free(struct bpf_prog *fp)
1233
{
1234
if (fp->jited) {
1235
struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
1236
1237
bpf_jit_binary_free(hdr);
1238
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
1239
}
1240
1241
bpf_prog_unlock_free(fp);
1242
}
1243
1244
int bpf_jit_get_func_addr(const struct bpf_prog *prog,
1245
const struct bpf_insn *insn, bool extra_pass,
1246
u64 *func_addr, bool *func_addr_fixed)
1247
{
1248
s16 off = insn->off;
1249
s32 imm = insn->imm;
1250
u8 *addr;
1251
int err;
1252
1253
*func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL;
1254
if (!*func_addr_fixed) {
1255
/* Place-holder address till the last pass has collected
1256
* all addresses for JITed subprograms in which case we
1257
* can pick them up from prog->aux.
1258
*/
1259
if (!extra_pass)
1260
addr = NULL;
1261
else if (prog->aux->func &&
1262
off >= 0 && off < prog->aux->real_func_cnt)
1263
addr = (u8 *)prog->aux->func[off]->bpf_func;
1264
else
1265
return -EINVAL;
1266
} else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
1267
bpf_jit_supports_far_kfunc_call()) {
1268
err = bpf_get_kfunc_addr(prog, insn->imm, insn->off, &addr);
1269
if (err)
1270
return err;
1271
} else {
1272
/* Address of a BPF helper call. Since part of the core
1273
* kernel, it's always at a fixed location. __bpf_call_base
1274
* and the helper with imm relative to it are both in core
1275
* kernel.
1276
*/
1277
addr = (u8 *)__bpf_call_base + imm;
1278
}
1279
1280
*func_addr = (unsigned long)addr;
1281
return 0;
1282
}
1283
1284
const char *bpf_jit_get_prog_name(struct bpf_prog *prog)
1285
{
1286
if (prog->aux->ksym.prog)
1287
return prog->aux->ksym.name;
1288
return prog->aux->name;
1289
}
1290
1291
static int bpf_jit_blind_insn(const struct bpf_insn *from,
1292
const struct bpf_insn *aux,
1293
struct bpf_insn *to_buff,
1294
bool emit_zext)
1295
{
1296
struct bpf_insn *to = to_buff;
1297
u32 imm_rnd = get_random_u32();
1298
s16 off;
1299
1300
BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG);
1301
BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG);
1302
1303
/* Constraints on AX register:
1304
*
1305
* AX register is inaccessible from user space. It is mapped in
1306
* all JITs, and used here for constant blinding rewrites. It is
1307
* typically "stateless" meaning its contents are only valid within
1308
* the executed instruction, but not across several instructions.
1309
* There are a few exceptions however which are further detailed
1310
* below.
1311
*
1312
* Constant blinding is only used by JITs, not in the interpreter.
1313
* The interpreter uses AX in some occasions as a local temporary
1314
* register e.g. in DIV or MOD instructions.
1315
*
1316
* In restricted circumstances, the verifier can also use the AX
1317
* register for rewrites as long as they do not interfere with
1318
* the above cases!
1319
*/
1320
if (from->dst_reg == BPF_REG_AX || from->src_reg == BPF_REG_AX)
1321
goto out;
1322
1323
if (from->imm == 0 &&
1324
(from->code == (BPF_ALU | BPF_MOV | BPF_K) ||
1325
from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) {
1326
*to++ = BPF_ALU64_REG(BPF_XOR, from->dst_reg, from->dst_reg);
1327
goto out;
1328
}
1329
1330
switch (from->code) {
1331
case BPF_ALU | BPF_ADD | BPF_K:
1332
case BPF_ALU | BPF_SUB | BPF_K:
1333
case BPF_ALU | BPF_AND | BPF_K:
1334
case BPF_ALU | BPF_OR | BPF_K:
1335
case BPF_ALU | BPF_XOR | BPF_K:
1336
case BPF_ALU | BPF_MUL | BPF_K:
1337
case BPF_ALU | BPF_MOV | BPF_K:
1338
case BPF_ALU | BPF_DIV | BPF_K:
1339
case BPF_ALU | BPF_MOD | BPF_K:
1340
*to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
1341
*to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
1342
*to++ = BPF_ALU32_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off);
1343
break;
1344
1345
case BPF_ALU64 | BPF_ADD | BPF_K:
1346
case BPF_ALU64 | BPF_SUB | BPF_K:
1347
case BPF_ALU64 | BPF_AND | BPF_K:
1348
case BPF_ALU64 | BPF_OR | BPF_K:
1349
case BPF_ALU64 | BPF_XOR | BPF_K:
1350
case BPF_ALU64 | BPF_MUL | BPF_K:
1351
case BPF_ALU64 | BPF_MOV | BPF_K:
1352
case BPF_ALU64 | BPF_DIV | BPF_K:
1353
case BPF_ALU64 | BPF_MOD | BPF_K:
1354
*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
1355
*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
1356
*to++ = BPF_ALU64_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off);
1357
break;
1358
1359
case BPF_JMP | BPF_JEQ | BPF_K:
1360
case BPF_JMP | BPF_JNE | BPF_K:
1361
case BPF_JMP | BPF_JGT | BPF_K:
1362
case BPF_JMP | BPF_JLT | BPF_K:
1363
case BPF_JMP | BPF_JGE | BPF_K:
1364
case BPF_JMP | BPF_JLE | BPF_K:
1365
case BPF_JMP | BPF_JSGT | BPF_K:
1366
case BPF_JMP | BPF_JSLT | BPF_K:
1367
case BPF_JMP | BPF_JSGE | BPF_K:
1368
case BPF_JMP | BPF_JSLE | BPF_K:
1369
case BPF_JMP | BPF_JSET | BPF_K:
1370
/* Accommodate for extra offset in case of a backjump. */
1371
off = from->off;
1372
if (off < 0)
1373
off -= 2;
1374
*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
1375
*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
1376
*to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off);
1377
break;
1378
1379
case BPF_JMP32 | BPF_JEQ | BPF_K:
1380
case BPF_JMP32 | BPF_JNE | BPF_K:
1381
case BPF_JMP32 | BPF_JGT | BPF_K:
1382
case BPF_JMP32 | BPF_JLT | BPF_K:
1383
case BPF_JMP32 | BPF_JGE | BPF_K:
1384
case BPF_JMP32 | BPF_JLE | BPF_K:
1385
case BPF_JMP32 | BPF_JSGT | BPF_K:
1386
case BPF_JMP32 | BPF_JSLT | BPF_K:
1387
case BPF_JMP32 | BPF_JSGE | BPF_K:
1388
case BPF_JMP32 | BPF_JSLE | BPF_K:
1389
case BPF_JMP32 | BPF_JSET | BPF_K:
1390
/* Accommodate for extra offset in case of a backjump. */
1391
off = from->off;
1392
if (off < 0)
1393
off -= 2;
1394
*to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
1395
*to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
1396
*to++ = BPF_JMP32_REG(from->code, from->dst_reg, BPF_REG_AX,
1397
off);
1398
break;
1399
1400
case BPF_LD | BPF_IMM | BPF_DW:
1401
*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm);
1402
*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
1403
*to++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
1404
*to++ = BPF_ALU64_REG(BPF_MOV, aux[0].dst_reg, BPF_REG_AX);
1405
break;
1406
case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */
1407
*to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm);
1408
*to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
1409
if (emit_zext)
1410
*to++ = BPF_ZEXT_REG(BPF_REG_AX);
1411
*to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX);
1412
break;
1413
1414
case BPF_ST | BPF_MEM | BPF_DW:
1415
case BPF_ST | BPF_MEM | BPF_W:
1416
case BPF_ST | BPF_MEM | BPF_H:
1417
case BPF_ST | BPF_MEM | BPF_B:
1418
*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
1419
*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
1420
*to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off);
1421
break;
1422
}
1423
out:
1424
return to - to_buff;
1425
}
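/* Editorial note: an illustrative example of the blinding rewrite above
 * (values are made up). For BPF_ALU64_IMM(BPF_ADD, R1, 42) and a random
 * imm_rnd, the single insn becomes:
 *   BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ 42);
 *   BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
 *   BPF_ALU64_REG(BPF_ADD, R1, BPF_REG_AX);
 * so the user-supplied constant 42 never appears verbatim in the image.
 */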
1426
1427
static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,
1428
gfp_t gfp_extra_flags)
1429
{
1430
gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
1431
struct bpf_prog *fp;
1432
1433
fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags);
1434
if (fp != NULL) {
1435
/* aux->prog still points to the fp_other one, so
1436
* when promoting the clone to the real program,
1437
* this still needs to be adapted.
1438
*/
1439
memcpy(fp, fp_other, fp_other->pages * PAGE_SIZE);
1440
}
1441
1442
return fp;
1443
}
1444
1445
static void bpf_prog_clone_free(struct bpf_prog *fp)
1446
{
1447
/* aux was stolen by the other clone, so we cannot free
1448
* it from this path! It will be freed eventually by the
1449
* other program on release.
1450
*
1451
* At this point, we don't need a deferred release since
1452
* clone is guaranteed to not be locked.
1453
*/
1454
fp->aux = NULL;
1455
fp->stats = NULL;
1456
fp->active = NULL;
1457
__bpf_prog_free(fp);
1458
}
1459
1460
void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other)
1461
{
1462
/* We have to repoint aux->prog to self, as we don't
1463
* know whether fp here is the clone or the original.
1464
*/
1465
fp->aux->prog = fp;
1466
bpf_prog_clone_free(fp_other);
1467
}
1468
1469
static void adjust_insn_arrays(struct bpf_prog *prog, u32 off, u32 len)
1470
{
1471
#ifdef CONFIG_BPF_SYSCALL
1472
struct bpf_map *map;
1473
int i;
1474
1475
if (len <= 1)
1476
return;
1477
1478
for (i = 0; i < prog->aux->used_map_cnt; i++) {
1479
map = prog->aux->used_maps[i];
1480
if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY)
1481
bpf_insn_array_adjust(map, off, len);
1482
}
1483
#endif
1484
}
1485
1486
struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
1487
{
1488
struct bpf_insn insn_buff[16], aux[2];
1489
struct bpf_prog *clone, *tmp;
1490
int insn_delta, insn_cnt;
1491
struct bpf_insn *insn;
1492
int i, rewritten;
1493
1494
if (!prog->blinding_requested || prog->blinded)
1495
return prog;
1496
1497
clone = bpf_prog_clone_create(prog, GFP_USER);
1498
if (!clone)
1499
return ERR_PTR(-ENOMEM);
1500
1501
insn_cnt = clone->len;
1502
insn = clone->insnsi;
1503
1504
for (i = 0; i < insn_cnt; i++, insn++) {
1505
if (bpf_pseudo_func(insn)) {
1506
/* ld_imm64 with an address of bpf subprog is not
1507
* a user controlled constant. Don't randomize it,
1508
* since it will conflict with jit_subprogs() logic.
1509
*/
1510
insn++;
1511
i++;
1512
continue;
1513
}
1514
1515
/* We temporarily need to hold the original ld64 insn
1516
* so that we can still access the first part in the
1517
* second blinding run.
1518
*/
1519
if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW) &&
1520
insn[1].code == 0)
1521
memcpy(aux, insn, sizeof(aux));
1522
1523
rewritten = bpf_jit_blind_insn(insn, aux, insn_buff,
1524
clone->aux->verifier_zext);
1525
if (!rewritten)
1526
continue;
1527
1528
tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten);
1529
if (IS_ERR(tmp)) {
1530
/* Patching may have repointed aux->prog during
1531
* realloc from the original one, so we need to
1532
* fix it up here on error.
1533
*/
1534
bpf_jit_prog_release_other(prog, clone);
1535
return tmp;
1536
}
1537
1538
clone = tmp;
1539
insn_delta = rewritten - 1;
1540
1541
/* Instructions arrays must be updated using absolute xlated offsets */
1542
adjust_insn_arrays(clone, prog->aux->subprog_start + i, rewritten);
1543
1544
/* Walk new program and skip insns we just inserted. */
1545
insn = clone->insnsi + i + insn_delta;
1546
insn_cnt += insn_delta;
1547
i += insn_delta;
1548
}
1549
1550
clone->blinded = 1;
1551
return clone;
1552
}
1553
#endif /* CONFIG_BPF_JIT */
1554
1555
/* Base function for offset calculation. Needs to go into .text section,
1556
* therefore keeping it non-static as well; will also be used by JITs
1557
* anyway later on, so do not let the compiler omit it. This also needs
1558
* to go into kallsyms for correlation from e.g. bpftool, so naming
1559
* must not change.
1560
*/
1561
noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
1562
{
1563
return 0;
1564
}
1565
EXPORT_SYMBOL_GPL(__bpf_call_base);
1566
1567
/* All UAPI available opcodes. */
1568
#define BPF_INSN_MAP(INSN_2, INSN_3) \
1569
/* 32 bit ALU operations. */ \
1570
/* Register based. */ \
1571
INSN_3(ALU, ADD, X), \
1572
INSN_3(ALU, SUB, X), \
1573
INSN_3(ALU, AND, X), \
1574
INSN_3(ALU, OR, X), \
1575
INSN_3(ALU, LSH, X), \
1576
INSN_3(ALU, RSH, X), \
1577
INSN_3(ALU, XOR, X), \
1578
INSN_3(ALU, MUL, X), \
1579
INSN_3(ALU, MOV, X), \
1580
INSN_3(ALU, ARSH, X), \
1581
INSN_3(ALU, DIV, X), \
1582
INSN_3(ALU, MOD, X), \
1583
INSN_2(ALU, NEG), \
1584
INSN_3(ALU, END, TO_BE), \
1585
INSN_3(ALU, END, TO_LE), \
1586
/* Immediate based. */ \
1587
INSN_3(ALU, ADD, K), \
1588
INSN_3(ALU, SUB, K), \
1589
INSN_3(ALU, AND, K), \
1590
INSN_3(ALU, OR, K), \
1591
INSN_3(ALU, LSH, K), \
1592
INSN_3(ALU, RSH, K), \
1593
INSN_3(ALU, XOR, K), \
1594
INSN_3(ALU, MUL, K), \
1595
INSN_3(ALU, MOV, K), \
1596
INSN_3(ALU, ARSH, K), \
1597
INSN_3(ALU, DIV, K), \
1598
INSN_3(ALU, MOD, K), \
1599
/* 64 bit ALU operations. */ \
1600
/* Register based. */ \
1601
INSN_3(ALU64, ADD, X), \
1602
INSN_3(ALU64, SUB, X), \
1603
INSN_3(ALU64, AND, X), \
1604
INSN_3(ALU64, OR, X), \
1605
INSN_3(ALU64, LSH, X), \
1606
INSN_3(ALU64, RSH, X), \
1607
INSN_3(ALU64, XOR, X), \
1608
INSN_3(ALU64, MUL, X), \
1609
INSN_3(ALU64, MOV, X), \
1610
INSN_3(ALU64, ARSH, X), \
1611
INSN_3(ALU64, DIV, X), \
1612
INSN_3(ALU64, MOD, X), \
1613
INSN_2(ALU64, NEG), \
1614
INSN_3(ALU64, END, TO_LE), \
1615
/* Immediate based. */ \
1616
INSN_3(ALU64, ADD, K), \
1617
INSN_3(ALU64, SUB, K), \
1618
INSN_3(ALU64, AND, K), \
1619
INSN_3(ALU64, OR, K), \
1620
INSN_3(ALU64, LSH, K), \
1621
INSN_3(ALU64, RSH, K), \
1622
INSN_3(ALU64, XOR, K), \
1623
INSN_3(ALU64, MUL, K), \
1624
INSN_3(ALU64, MOV, K), \
1625
INSN_3(ALU64, ARSH, K), \
1626
INSN_3(ALU64, DIV, K), \
1627
INSN_3(ALU64, MOD, K), \
1628
/* Call instruction. */ \
1629
INSN_2(JMP, CALL), \
1630
/* Exit instruction. */ \
1631
INSN_2(JMP, EXIT), \
1632
/* 32-bit Jump instructions. */ \
1633
/* Register based. */ \
1634
INSN_3(JMP32, JEQ, X), \
1635
INSN_3(JMP32, JNE, X), \
1636
INSN_3(JMP32, JGT, X), \
1637
INSN_3(JMP32, JLT, X), \
1638
INSN_3(JMP32, JGE, X), \
1639
INSN_3(JMP32, JLE, X), \
1640
INSN_3(JMP32, JSGT, X), \
1641
INSN_3(JMP32, JSLT, X), \
1642
INSN_3(JMP32, JSGE, X), \
1643
INSN_3(JMP32, JSLE, X), \
1644
INSN_3(JMP32, JSET, X), \
1645
/* Immediate based. */ \
1646
INSN_3(JMP32, JEQ, K), \
1647
INSN_3(JMP32, JNE, K), \
1648
INSN_3(JMP32, JGT, K), \
1649
INSN_3(JMP32, JLT, K), \
1650
INSN_3(JMP32, JGE, K), \
1651
INSN_3(JMP32, JLE, K), \
1652
INSN_3(JMP32, JSGT, K), \
1653
INSN_3(JMP32, JSLT, K), \
1654
INSN_3(JMP32, JSGE, K), \
1655
INSN_3(JMP32, JSLE, K), \
1656
INSN_3(JMP32, JSET, K), \
1657
/* Jump instructions. */ \
1658
/* Register based. */ \
1659
INSN_3(JMP, JEQ, X), \
1660
INSN_3(JMP, JNE, X), \
1661
INSN_3(JMP, JGT, X), \
1662
INSN_3(JMP, JLT, X), \
1663
INSN_3(JMP, JGE, X), \
1664
INSN_3(JMP, JLE, X), \
1665
INSN_3(JMP, JSGT, X), \
1666
INSN_3(JMP, JSLT, X), \
1667
INSN_3(JMP, JSGE, X), \
1668
INSN_3(JMP, JSLE, X), \
1669
INSN_3(JMP, JSET, X), \
1670
/* Immediate based. */ \
1671
INSN_3(JMP, JEQ, K), \
1672
INSN_3(JMP, JNE, K), \
1673
INSN_3(JMP, JGT, K), \
1674
INSN_3(JMP, JLT, K), \
1675
INSN_3(JMP, JGE, K), \
1676
INSN_3(JMP, JLE, K), \
1677
INSN_3(JMP, JSGT, K), \
1678
INSN_3(JMP, JSLT, K), \
1679
INSN_3(JMP, JSGE, K), \
1680
INSN_3(JMP, JSLE, K), \
1681
INSN_3(JMP, JSET, K), \
1682
INSN_2(JMP, JA), \
1683
INSN_2(JMP32, JA), \
1684
/* Atomic operations. */ \
1685
INSN_3(STX, ATOMIC, B), \
1686
INSN_3(STX, ATOMIC, H), \
1687
INSN_3(STX, ATOMIC, W), \
1688
INSN_3(STX, ATOMIC, DW), \
1689
/* Store instructions. */ \
1690
/* Register based. */ \
1691
INSN_3(STX, MEM, B), \
1692
INSN_3(STX, MEM, H), \
1693
INSN_3(STX, MEM, W), \
1694
INSN_3(STX, MEM, DW), \
1695
/* Immediate based. */ \
1696
INSN_3(ST, MEM, B), \
1697
INSN_3(ST, MEM, H), \
1698
INSN_3(ST, MEM, W), \
1699
INSN_3(ST, MEM, DW), \
1700
/* Load instructions. */ \
1701
/* Register based. */ \
1702
INSN_3(LDX, MEM, B), \
1703
INSN_3(LDX, MEM, H), \
1704
INSN_3(LDX, MEM, W), \
1705
INSN_3(LDX, MEM, DW), \
1706
INSN_3(LDX, MEMSX, B), \
1707
INSN_3(LDX, MEMSX, H), \
1708
INSN_3(LDX, MEMSX, W), \
1709
/* Immediate based. */ \
1710
INSN_3(LD, IMM, DW)
1711
1712
bool bpf_opcode_in_insntable(u8 code)
1713
{
1714
#define BPF_INSN_2_TBL(x, y) [BPF_##x | BPF_##y] = true
1715
#define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true
1716
static const bool public_insntable[256] = {
1717
[0 ... 255] = false,
1718
/* Now overwrite non-defaults ... */
1719
BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL),
1720
/* UAPI exposed, but rewritten opcodes. cBPF carry-over. */
1721
[BPF_LD | BPF_ABS | BPF_B] = true,
1722
[BPF_LD | BPF_ABS | BPF_H] = true,
1723
[BPF_LD | BPF_ABS | BPF_W] = true,
1724
[BPF_LD | BPF_IND | BPF_B] = true,
1725
[BPF_LD | BPF_IND | BPF_H] = true,
1726
[BPF_LD | BPF_IND | BPF_W] = true,
1727
[BPF_JMP | BPF_JA | BPF_X] = true,
1728
[BPF_JMP | BPF_JCOND] = true,
1729
};
1730
#undef BPF_INSN_3_TBL
1731
#undef BPF_INSN_2_TBL
1732
return public_insntable[code];
1733
}
1734
1735
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
1736
/**
1737
* ___bpf_prog_run - run eBPF program on a given context
1738
* @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
1739
* @insn: is the array of eBPF instructions
1740
*
1741
* Decode and execute eBPF instructions.
1742
*
1743
* Return: whatever value is in %BPF_R0 at program exit
1744
*/
1745
static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
1746
{
1747
#define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y
1748
#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
1749
static const void * const jumptable[256] __annotate_jump_table = {
1750
[0 ... 255] = &&default_label,
1751
/* Now overwrite non-defaults ... */
1752
BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL),
1753
/* Non-UAPI available opcodes. */
1754
[BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS,
1755
[BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
1756
[BPF_ST | BPF_NOSPEC] = &&ST_NOSPEC,
1757
[BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B,
1758
[BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H,
1759
[BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W,
1760
[BPF_LDX | BPF_PROBE_MEM | BPF_DW] = &&LDX_PROBE_MEM_DW,
1761
[BPF_LDX | BPF_PROBE_MEMSX | BPF_B] = &&LDX_PROBE_MEMSX_B,
1762
[BPF_LDX | BPF_PROBE_MEMSX | BPF_H] = &&LDX_PROBE_MEMSX_H,
1763
[BPF_LDX | BPF_PROBE_MEMSX | BPF_W] = &&LDX_PROBE_MEMSX_W,
1764
};
1765
#undef BPF_INSN_3_LBL
1766
#undef BPF_INSN_2_LBL
1767
u32 tail_call_cnt = 0;
1768
1769
#define CONT ({ insn++; goto select_insn; })
1770
#define CONT_JMP ({ insn++; goto select_insn; })
1771
1772
select_insn:
1773
goto *jumptable[insn->code];
1774
1775
/* Explicitly mask the register-based shift amounts with 63 or 31
1776
* to avoid undefined behavior. Normally this won't affect the
1777
* generated code, for example, in case of native 64 bit archs such
1778
* as x86-64 or arm64, the compiler is optimizing the AND away for
1779
* the interpreter. In case of JITs, each of the JIT backends compiles
1780
* the BPF shift operations to machine instructions which produce
1781
* implementation-defined results in such a case; the resulting
1782
* contents of the register may be arbitrary, but program behaviour
1783
* as a whole remains defined. In other words, in case of JIT backends,
1784
* the AND must /not/ be added to the emitted LSH/RSH/ARSH translation.
1785
*/
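/* Editorial note, a small worked example of the masking above: for an
 * ALU64 LSH with a runtime SRC value of 70, the interpreter computes
 * DST << (70 & 63) = DST << 6, keeping the shift amount in range
 * instead of invoking undefined behaviour in C.
 */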
1786
/* ALU (shifts) */
1787
#define SHT(OPCODE, OP) \
1788
ALU64_##OPCODE##_X: \
1789
DST = DST OP (SRC & 63); \
1790
CONT; \
1791
ALU_##OPCODE##_X: \
1792
DST = (u32) DST OP ((u32) SRC & 31); \
1793
CONT; \
1794
ALU64_##OPCODE##_K: \
1795
DST = DST OP IMM; \
1796
CONT; \
1797
ALU_##OPCODE##_K: \
1798
DST = (u32) DST OP (u32) IMM; \
1799
CONT;
1800
/* ALU (rest) */
1801
#define ALU(OPCODE, OP) \
1802
ALU64_##OPCODE##_X: \
1803
DST = DST OP SRC; \
1804
CONT; \
1805
ALU_##OPCODE##_X: \
1806
DST = (u32) DST OP (u32) SRC; \
1807
CONT; \
1808
ALU64_##OPCODE##_K: \
1809
DST = DST OP IMM; \
1810
CONT; \
1811
ALU_##OPCODE##_K: \
1812
DST = (u32) DST OP (u32) IMM; \
1813
CONT;
1814
ALU(ADD, +)
1815
ALU(SUB, -)
1816
ALU(AND, &)
1817
ALU(OR, |)
1818
ALU(XOR, ^)
1819
ALU(MUL, *)
1820
SHT(LSH, <<)
1821
SHT(RSH, >>)
1822
#undef SHT
1823
#undef ALU
	ALU_NEG:
		DST = (u32) -DST;
		CONT;
	ALU64_NEG:
		DST = -DST;
		CONT;
	ALU_MOV_X:
		switch (OFF) {
		case 0:
			DST = (u32) SRC;
			break;
		case 8:
			DST = (u32)(s8) SRC;
			break;
		case 16:
			DST = (u32)(s16) SRC;
			break;
		}
		CONT;
	ALU_MOV_K:
		DST = (u32) IMM;
		CONT;
	ALU64_MOV_X:
		switch (OFF) {
		case 0:
			DST = SRC;
			break;
		case 8:
			DST = (s8) SRC;
			break;
		case 16:
			DST = (s16) SRC;
			break;
		case 32:
			DST = (s32) SRC;
			break;
		}
		CONT;
	ALU64_MOV_K:
		DST = IMM;
		CONT;
	LD_IMM_DW:
		DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
		insn++;
		CONT;
	ALU_ARSH_X:
		DST = (u64) (u32) (((s32) DST) >> (SRC & 31));
		CONT;
	ALU_ARSH_K:
		DST = (u64) (u32) (((s32) DST) >> IMM);
		CONT;
	ALU64_ARSH_X:
		(*(s64 *) &DST) >>= (SRC & 63);
		CONT;
	ALU64_ARSH_K:
		(*(s64 *) &DST) >>= IMM;
		CONT;
	ALU64_MOD_X:
		switch (OFF) {
		case 0:
			div64_u64_rem(DST, SRC, &AX);
			DST = AX;
			break;
		case 1:
			AX = div64_s64(DST, SRC);
			DST = DST - AX * SRC;
			break;
		}
		CONT;
	ALU_MOD_X:
		switch (OFF) {
		case 0:
			AX = (u32) DST;
			DST = do_div(AX, (u32) SRC);
			break;
		case 1:
			AX = abs((s32)DST);
			AX = do_div(AX, abs((s32)SRC));
			if ((s32)DST < 0)
				DST = (u32)-AX;
			else
				DST = (u32)AX;
			break;
		}
		CONT;
	ALU64_MOD_K:
		switch (OFF) {
		case 0:
			div64_u64_rem(DST, IMM, &AX);
			DST = AX;
			break;
		case 1:
			AX = div64_s64(DST, IMM);
			DST = DST - AX * IMM;
			break;
		}
		CONT;
	ALU_MOD_K:
		switch (OFF) {
		case 0:
			AX = (u32) DST;
			DST = do_div(AX, (u32) IMM);
			break;
		case 1:
			AX = abs((s32)DST);
			AX = do_div(AX, abs((s32)IMM));
			if ((s32)DST < 0)
				DST = (u32)-AX;
			else
				DST = (u32)AX;
			break;
		}
		CONT;
	ALU64_DIV_X:
		switch (OFF) {
		case 0:
			DST = div64_u64(DST, SRC);
			break;
		case 1:
			DST = div64_s64(DST, SRC);
			break;
		}
		CONT;
	ALU_DIV_X:
		switch (OFF) {
		case 0:
			AX = (u32) DST;
			do_div(AX, (u32) SRC);
			DST = (u32) AX;
			break;
		case 1:
			AX = abs((s32)DST);
			do_div(AX, abs((s32)SRC));
			if (((s32)DST < 0) == ((s32)SRC < 0))
				DST = (u32)AX;
			else
				DST = (u32)-AX;
			break;
		}
		CONT;
	ALU64_DIV_K:
		switch (OFF) {
		case 0:
			DST = div64_u64(DST, IMM);
			break;
		case 1:
			DST = div64_s64(DST, IMM);
			break;
		}
		CONT;
	ALU_DIV_K:
		switch (OFF) {
		case 0:
			AX = (u32) DST;
			do_div(AX, (u32) IMM);
			DST = (u32) AX;
			break;
		case 1:
			AX = abs((s32)DST);
			do_div(AX, abs((s32)IMM));
			if (((s32)DST < 0) == ((s32)IMM < 0))
				DST = (u32)AX;
			else
				DST = (u32)-AX;
			break;
		}
		CONT;
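
	/* Illustrative sketch (not part of the interpreter, never compiled):
	 * with OFF == 1 the 32-bit handlers above emulate signed division on
	 * top of do_div() by dividing absolute values and fixing up the sign:
	 * quotients truncate toward zero and the remainder takes the sign of
	 * the dividend. Values are hypothetical.
	 */
#if 0
	{
		s32 example_dividend = -7, example_divisor = 2;
		s32 example_quot = example_dividend / example_divisor;	/* -3, truncated toward zero */
		s32 example_rem = example_dividend % example_divisor;	/* -1, sign follows the dividend */

		(void)example_quot;
		(void)example_rem;
	}
#endif
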
	ALU_END_TO_BE:
		switch (IMM) {
		case 16:
			DST = (__force u16) cpu_to_be16(DST);
			break;
		case 32:
			DST = (__force u32) cpu_to_be32(DST);
			break;
		case 64:
			DST = (__force u64) cpu_to_be64(DST);
			break;
		}
		CONT;
	ALU_END_TO_LE:
		switch (IMM) {
		case 16:
			DST = (__force u16) cpu_to_le16(DST);
			break;
		case 32:
			DST = (__force u32) cpu_to_le32(DST);
			break;
		case 64:
			DST = (__force u64) cpu_to_le64(DST);
			break;
		}
		CONT;
	ALU64_END_TO_LE:
		switch (IMM) {
		case 16:
			DST = (__force u16) __swab16(DST);
			break;
		case 32:
			DST = (__force u32) __swab32(DST);
			break;
		case 64:
			DST = (__force u64) __swab64(DST);
			break;
		}
		CONT;
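
	/* Illustrative sketch (not part of the interpreter, never compiled):
	 * the 16- and 32-bit conversions above assign the converted value
	 * back through DST, so the upper register bits end up zeroed. The
	 * value is hypothetical and assumes a little-endian host.
	 */
#if 0
	{
		u64 example_reg = 0x1122334455667788ULL;

		/* BPF_END | BPF_TO_BE, imm == 16: only the low half-word survives */
		example_reg = (__force u16) cpu_to_be16(example_reg);	/* 0x8877 here */
	}
#endif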

	/* CALL */
	JMP_CALL:
		/* Function call scratches BPF_R1-BPF_R5 registers,
		 * preserves BPF_R6-BPF_R9, and stores return value
		 * into BPF_R0.
		 */
		BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
						       BPF_R4, BPF_R5);
		CONT;

	JMP_CALL_ARGS:
		BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2,
							    BPF_R3, BPF_R4,
							    BPF_R5,
							    insn + insn->off + 1);
		CONT;

	JMP_TAIL_CALL: {
		struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
		struct bpf_array *array = container_of(map, struct bpf_array, map);
		struct bpf_prog *prog;
		u32 index = BPF_R3;

		if (unlikely(index >= array->map.max_entries))
			goto out;

		if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT))
			goto out;

		tail_call_cnt++;

		prog = READ_ONCE(array->ptrs[index]);
		if (!prog)
			goto out;

		/* ARG1 at this point is guaranteed to point to CTX from
		 * the verifier side due to the fact that the tail call is
		 * handled like a helper, that is, bpf_tail_call_proto,
		 * where arg1_type is ARG_PTR_TO_CTX.
		 */
		insn = prog->insnsi;
		goto select_insn;
out:
		CONT;
	}
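
	/* Illustrative sketch (not part of the interpreter, never compiled):
	 * the block above services the bpf_tail_call() helper, so an empty or
	 * out-of-range slot, or an exhausted MAX_TAIL_CALL_CNT budget, simply
	 * falls through to the next instruction. A hypothetical BPF-side
	 * caller (assuming libbpf's bpf_helpers.h and a BPF_MAP_TYPE_PROG_ARRAY
	 * named jmp_table) would look like:
	 */
#if 0
	SEC("xdp")
	int example_dispatch(struct xdp_md *ctx)
	{
		bpf_tail_call(ctx, &jmp_table, 0);
		/* Reached only when the tail call fell through, i.e. the
		 * "goto out" paths above.
		 */
		return XDP_PASS;
	}
#endif
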
	JMP_JA:
		insn += insn->off;
		CONT;
	JMP32_JA:
		insn += insn->imm;
		CONT;
	JMP_EXIT:
		return BPF_R0;
	/* JMP */
#define COND_JMP(SIGN, OPCODE, CMP_OP) \
	JMP_##OPCODE##_X: \
		if ((SIGN##64) DST CMP_OP (SIGN##64) SRC) { \
			insn += insn->off; \
			CONT_JMP; \
		} \
		CONT; \
	JMP32_##OPCODE##_X: \
		if ((SIGN##32) DST CMP_OP (SIGN##32) SRC) { \
			insn += insn->off; \
			CONT_JMP; \
		} \
		CONT; \
	JMP_##OPCODE##_K: \
		if ((SIGN##64) DST CMP_OP (SIGN##64) IMM) { \
			insn += insn->off; \
			CONT_JMP; \
		} \
		CONT; \
	JMP32_##OPCODE##_K: \
		if ((SIGN##32) DST CMP_OP (SIGN##32) IMM) { \
			insn += insn->off; \
			CONT_JMP; \
		} \
		CONT;
	COND_JMP(u, JEQ, ==)
	COND_JMP(u, JNE, !=)
	COND_JMP(u, JGT, >)
	COND_JMP(u, JLT, <)
	COND_JMP(u, JGE, >=)
	COND_JMP(u, JLE, <=)
	COND_JMP(u, JSET, &)
	COND_JMP(s, JSGT, >)
	COND_JMP(s, JSLT, <)
	COND_JMP(s, JSGE, >=)
	COND_JMP(s, JSLE, <=)
#undef COND_JMP
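
	/* Illustrative sketch (not part of the interpreter, never compiled):
	 * the SIGN parameter above picks unsigned (u64/u32) or signed
	 * (s64/s32) comparisons, so the same register bits can branch
	 * differently. Values are hypothetical.
	 */
#if 0
	{
		u64 example_reg = 0xffffffff;	/* -1 when viewed as s32 */
		bool example_jgt = (u32) example_reg > (u32) 1;		/* BPF_JGT (32-bit): true */
		bool example_jsgt = (s32) example_reg > (s32) 1;	/* BPF_JSGT (32-bit): false */

		(void)example_jgt;
		(void)example_jsgt;
	}
#endif
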
	/* ST, STX and LDX */
	ST_NOSPEC:
		/* Speculation barrier for mitigating Speculative Store Bypass,
		 * Bounds-Check Bypass and Type Confusion. In case of arm64, we
		 * rely on the firmware mitigation as controlled via the ssbd
		 * kernel parameter. Whenever the mitigation is enabled, it
		 * works for all of the kernel code with no need to provide any
		 * additional instructions here. In case of x86, we use 'lfence'
		 * insn for mitigation. We reuse preexisting logic from Spectre
		 * v1 mitigation that happens to produce the required code on
		 * x86 for v4 as well.
		 */
		barrier_nospec();
		CONT;
#define LDST(SIZEOP, SIZE) \
	STX_MEM_##SIZEOP: \
		*(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
		CONT; \
	ST_MEM_##SIZEOP: \
		*(SIZE *)(unsigned long) (DST + insn->off) = IMM; \
		CONT; \
	LDX_MEM_##SIZEOP: \
		DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
		CONT; \
	LDX_PROBE_MEM_##SIZEOP: \
		bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \
			(const void *)(long) (SRC + insn->off)); \
		DST = *((SIZE *)&DST); \
		CONT;

	LDST(B, u8)
	LDST(H, u16)
	LDST(W, u32)
	LDST(DW, u64)
#undef LDST

#define LDSX(SIZEOP, SIZE) \
	LDX_MEMSX_##SIZEOP: \
		DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
		CONT; \
	LDX_PROBE_MEMSX_##SIZEOP: \
		bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \
			(const void *)(long) (SRC + insn->off)); \
		DST = *((SIZE *)&DST); \
		CONT;

	LDSX(B, s8)
	LDSX(H, s16)
	LDSX(W, s32)
#undef LDSX
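
	/* Illustrative sketch (not part of the interpreter, never compiled):
	 * the LDST() loads above zero-extend into the 64-bit register, while
	 * the LDSX() variants sign-extend the fetched value. The value is
	 * hypothetical.
	 */
#if 0
	{
		s8 example_byte = -5;
		u64 example_zext = *(u8 *)&example_byte;	/* BPF_LDX | BPF_MEM:   0xfb */
		u64 example_sext = *(s8 *)&example_byte;	/* BPF_LDX | BPF_MEMSX: 0xfffffffffffffffb */

		(void)example_zext;
		(void)example_sext;
	}
#endif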

#define ATOMIC_ALU_OP(BOP, KOP) \
		case BOP: \
			if (BPF_SIZE(insn->code) == BPF_W) \
				atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \
					     (DST + insn->off)); \
			else if (BPF_SIZE(insn->code) == BPF_DW) \
				atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \
					       (DST + insn->off)); \
			else \
				goto default_label; \
			break; \
		case BOP | BPF_FETCH: \
			if (BPF_SIZE(insn->code) == BPF_W) \
				SRC = (u32) atomic_fetch_##KOP( \
					(u32) SRC, \
					(atomic_t *)(unsigned long) (DST + insn->off)); \
			else if (BPF_SIZE(insn->code) == BPF_DW) \
				SRC = (u64) atomic64_fetch_##KOP( \
					(u64) SRC, \
					(atomic64_t *)(unsigned long) (DST + insn->off)); \
			else \
				goto default_label; \
			break;

	STX_ATOMIC_DW:
	STX_ATOMIC_W:
	STX_ATOMIC_H:
	STX_ATOMIC_B:
		switch (IMM) {
		/* Atomic read-modify-write instructions support only W and DW
		 * size modifiers.
		 */
		ATOMIC_ALU_OP(BPF_ADD, add)
		ATOMIC_ALU_OP(BPF_AND, and)
		ATOMIC_ALU_OP(BPF_OR, or)
		ATOMIC_ALU_OP(BPF_XOR, xor)
#undef ATOMIC_ALU_OP

		case BPF_XCHG:
			if (BPF_SIZE(insn->code) == BPF_W)
				SRC = (u32) atomic_xchg(
					(atomic_t *)(unsigned long) (DST + insn->off),
					(u32) SRC);
			else if (BPF_SIZE(insn->code) == BPF_DW)
				SRC = (u64) atomic64_xchg(
					(atomic64_t *)(unsigned long) (DST + insn->off),
					(u64) SRC);
			else
				goto default_label;
			break;
		case BPF_CMPXCHG:
			if (BPF_SIZE(insn->code) == BPF_W)
				BPF_R0 = (u32) atomic_cmpxchg(
					(atomic_t *)(unsigned long) (DST + insn->off),
					(u32) BPF_R0, (u32) SRC);
			else if (BPF_SIZE(insn->code) == BPF_DW)
				BPF_R0 = (u64) atomic64_cmpxchg(
					(atomic64_t *)(unsigned long) (DST + insn->off),
					(u64) BPF_R0, (u64) SRC);
			else
				goto default_label;
			break;
		/* Atomic load and store instructions support all size
		 * modifiers.
		 */
		case BPF_LOAD_ACQ:
			switch (BPF_SIZE(insn->code)) {
#define LOAD_ACQUIRE(SIZEOP, SIZE) \
			case BPF_##SIZEOP: \
				DST = (SIZE)smp_load_acquire( \
					(SIZE *)(unsigned long)(SRC + insn->off)); \
				break;
			LOAD_ACQUIRE(B, u8)
			LOAD_ACQUIRE(H, u16)
			LOAD_ACQUIRE(W, u32)
#ifdef CONFIG_64BIT
			LOAD_ACQUIRE(DW, u64)
#endif
#undef LOAD_ACQUIRE
			default:
				goto default_label;
			}
			break;
		case BPF_STORE_REL:
			switch (BPF_SIZE(insn->code)) {
#define STORE_RELEASE(SIZEOP, SIZE) \
			case BPF_##SIZEOP: \
				smp_store_release( \
					(SIZE *)(unsigned long)(DST + insn->off), (SIZE)SRC); \
				break;
			STORE_RELEASE(B, u8)
			STORE_RELEASE(H, u16)
			STORE_RELEASE(W, u32)
#ifdef CONFIG_64BIT
			STORE_RELEASE(DW, u64)
#endif
#undef STORE_RELEASE
			default:
				goto default_label;
			}
			break;

		default:
			goto default_label;
		}
		CONT;
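
	/* Illustrative sketch (not part of the interpreter, never compiled):
	 * plain atomic ALU ops above discard the old value, the BPF_FETCH
	 * variants return it in the source register, and BPF_CMPXCHG always
	 * leaves the old memory value in R0. Values are hypothetical.
	 */
#if 0
	{
		atomic64_t example_mem = ATOMIC64_INIT(10);
		s64 example_old;

		/* BPF_ATOMIC | BPF_ADD | BPF_FETCH on a u64 location */
		example_old = atomic64_fetch_add(5, &example_mem);	/* returns 10, memory now 15 */
		/* BPF_ATOMIC | BPF_CMPXCHG: succeeds only if memory == 15 */
		example_old = atomic64_cmpxchg(&example_mem, 15, 20);	/* returns 15, memory now 20 */

		(void)example_old;
	}
#endif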

	default_label:
		/* If we ever reach this, we have a bug somewhere. Die hard here
		 * instead of just returning 0; we could be somewhere in a subprog,
		 * so execution could continue otherwise which we do /not/ want.
		 *
		 * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable().
		 */
		pr_warn("BPF interpreter: unknown opcode %02x (imm: 0x%x)\n",
			insn->code, insn->imm);
		BUG_ON(1);
		return 0;
}

#define PROG_NAME(stack_size) __bpf_prog_run##stack_size
#define DEFINE_BPF_PROG_RUN(stack_size) \
static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \
{ \
	u64 stack[stack_size / sizeof(u64)]; \
	u64 regs[MAX_BPF_EXT_REG] = {}; \
\
	kmsan_unpoison_memory(stack, sizeof(stack)); \
	FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
	ARG1 = (u64) (unsigned long) ctx; \
	return ___bpf_prog_run(regs, insn); \
}

#define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size
#define DEFINE_BPF_PROG_RUN_ARGS(stack_size) \
static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \
				      const struct bpf_insn *insn) \
{ \
	u64 stack[stack_size / sizeof(u64)]; \
	u64 regs[MAX_BPF_EXT_REG]; \
\
	kmsan_unpoison_memory(stack, sizeof(stack)); \
	FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
	BPF_R1 = r1; \
	BPF_R2 = r2; \
	BPF_R3 = r3; \
	BPF_R4 = r4; \
	BPF_R5 = r5; \
	return ___bpf_prog_run(regs, insn); \
}

#define EVAL1(FN, X) FN(X)
#define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y)
#define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y)
#define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y)
#define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y)
#define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y)

EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192);
EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384);
EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512);

EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 32, 64, 96, 128, 160, 192);
EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 224, 256, 288, 320, 352, 384);
EVAL4(DEFINE_BPF_PROG_RUN_ARGS, 416, 448, 480, 512);

#define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size),

static unsigned int (*interpreters[])(const void *ctx,
				      const struct bpf_insn *insn) = {
EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
};
#undef PROG_NAME_LIST
#define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size),
static __maybe_unused
u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5,
			   const struct bpf_insn *insn) = {
EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
};
#undef PROG_NAME_LIST

#ifdef CONFIG_BPF_SYSCALL
void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth)
{
	stack_depth = max_t(u32, stack_depth, 1);
	insn->off = (s16) insn->imm;
	insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] -
		__bpf_call_base_args;
	insn->code = BPF_JMP | BPF_CALL_ARGS;
}
#endif
#endif

static unsigned int __bpf_prog_ret0_warn(const void *ctx,
					 const struct bpf_insn *insn)
{
	/* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON
	 * is not working properly, so warn about it!
	 */
	WARN_ON_ONCE(1);
	return 0;
}

static bool __bpf_prog_map_compatible(struct bpf_map *map,
				      const struct bpf_prog *fp)
{
	enum bpf_prog_type prog_type = resolve_prog_type(fp);
	struct bpf_prog_aux *aux = fp->aux;
	enum bpf_cgroup_storage_type i;
	bool ret = false;
	u64 cookie;

	if (fp->kprobe_override)
		return ret;

	spin_lock(&map->owner_lock);
	/* There's no owner yet where we could check for compatibility. */
	if (!map->owner) {
		map->owner = bpf_map_owner_alloc(map);
		if (!map->owner)
			goto err;
		map->owner->type = prog_type;
		map->owner->jited = fp->jited;
		map->owner->xdp_has_frags = aux->xdp_has_frags;
		map->owner->expected_attach_type = fp->expected_attach_type;
		map->owner->attach_func_proto = aux->attach_func_proto;
		for_each_cgroup_storage_type(i) {
			map->owner->storage_cookie[i] =
				aux->cgroup_storage[i] ?
				aux->cgroup_storage[i]->cookie : 0;
		}
		ret = true;
	} else {
		ret = map->owner->type == prog_type &&
		      map->owner->jited == fp->jited &&
		      map->owner->xdp_has_frags == aux->xdp_has_frags;
		if (ret &&
		    map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
		    map->owner->expected_attach_type != fp->expected_attach_type)
			ret = false;
		for_each_cgroup_storage_type(i) {
			if (!ret)
				break;
			cookie = aux->cgroup_storage[i] ?
				 aux->cgroup_storage[i]->cookie : 0;
			ret = map->owner->storage_cookie[i] == cookie ||
			      !cookie;
		}
		if (ret &&
		    map->owner->attach_func_proto != aux->attach_func_proto) {
			switch (prog_type) {
			case BPF_PROG_TYPE_TRACING:
			case BPF_PROG_TYPE_LSM:
			case BPF_PROG_TYPE_EXT:
			case BPF_PROG_TYPE_STRUCT_OPS:
				ret = false;
				break;
			default:
				break;
			}
		}
	}
err:
	spin_unlock(&map->owner_lock);
	return ret;
}

bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp)
{
	/* XDP programs inserted into maps are not guaranteed to run on
	 * a particular netdev (and can run outside driver context entirely
	 * in the case of devmap and cpumap). Until device checks
	 * are implemented, prohibit adding dev-bound programs to program maps.
	 */
	if (bpf_prog_is_dev_bound(fp->aux))
		return false;

	return __bpf_prog_map_compatible(map, fp);
}

static int bpf_check_tail_call(const struct bpf_prog *fp)
{
	struct bpf_prog_aux *aux = fp->aux;
	int i, ret = 0;

	mutex_lock(&aux->used_maps_mutex);
	for (i = 0; i < aux->used_map_cnt; i++) {
		struct bpf_map *map = aux->used_maps[i];

		if (!map_type_contains_progs(map))
			continue;

		if (!__bpf_prog_map_compatible(map, fp)) {
			ret = -EINVAL;
			goto out;
		}
	}

out:
	mutex_unlock(&aux->used_maps_mutex);
	return ret;
}

static bool bpf_prog_select_interpreter(struct bpf_prog *fp)
{
	bool select_interpreter = false;
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
	u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
	u32 idx = (round_up(stack_depth, 32) / 32) - 1;

	/* may_goto may cause stack size > 512, leading to idx out-of-bounds.
	 * But for non-JITed programs, we don't need bpf_func, so no bounds
	 * check needed.
	 */
	if (idx < ARRAY_SIZE(interpreters)) {
		fp->bpf_func = interpreters[idx];
		select_interpreter = true;
	} else {
		fp->bpf_func = __bpf_prog_ret0_warn;
	}
#else
	fp->bpf_func = __bpf_prog_ret0_warn;
#endif
	return select_interpreter;
}
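
/* Illustrative sketch (not part of this file, never compiled): the
 * interpreters[] table above is indexed by the program's stack depth rounded
 * up to the next multiple of 32 bytes, e.g. depths 1..32 share
 * __bpf_prog_run32 and depth 512 maps to the last slot. The helper name is
 * hypothetical.
 */
#if 0
static u32 example_interp_idx(u32 stack_depth)
{
	stack_depth = max_t(u32, stack_depth, 1);

	/* depth 1   -> idx 0  (__bpf_prog_run32)
	 * depth 200 -> idx 6  (__bpf_prog_run224)
	 * depth 512 -> idx 15 (__bpf_prog_run512)
	 */
	return (round_up(stack_depth, 32) / 32) - 1;
}
#endif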

/**
 * bpf_prog_select_runtime - select exec runtime for BPF program
 * @fp: bpf_prog populated with BPF program
 * @err: pointer to error variable
 *
 * Try to JIT eBPF program, if JIT is not available, use interpreter.
 * The BPF program will be executed via bpf_prog_run() function.
 *
 * Return: the &fp argument along with &err set to 0 for success or
 * a negative errno code on failure
 */
struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
{
	/* In case of BPF to BPF calls, verifier did all the prep
	 * work with regards to JITing, etc.
	 */
	bool jit_needed = false;

	if (fp->bpf_func)
		goto finalize;

	if (IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) ||
	    bpf_prog_has_kfunc_call(fp))
		jit_needed = true;

	if (!bpf_prog_select_interpreter(fp))
		jit_needed = true;

	/* eBPF JITs can rewrite the program in case constant
	 * blinding is active. However, in case of error during
	 * blinding, bpf_int_jit_compile() must always return a
	 * valid program, which in this case would simply not
	 * be JITed, but falls back to the interpreter.
	 */
	if (!bpf_prog_is_offloaded(fp->aux)) {
		*err = bpf_prog_alloc_jited_linfo(fp);
		if (*err)
			return fp;

		fp = bpf_int_jit_compile(fp);
		bpf_prog_jit_attempt_done(fp);
		if (!fp->jited && jit_needed) {
			*err = -ENOTSUPP;
			return fp;
		}
	} else {
		*err = bpf_prog_offload_compile(fp);
		if (*err)
			return fp;
	}

finalize:
	*err = bpf_prog_lock_ro(fp);
	if (*err)
		return fp;

	/* The tail call compatibility check can only be done at
	 * this late stage as we need to determine, if we deal
	 * with JITed or non JITed program concatenations and not
	 * all eBPF JITs might immediately support all features.
	 */
	*err = bpf_check_tail_call(fp);

	return fp;
}
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
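
/* Illustrative sketch (not part of this file, never compiled): callers such
 * as the program load path hand the verified program to
 * bpf_prog_select_runtime() and treat a negative err as a failed load, even
 * though the same (possibly JIT-reallocated) fp pointer is handed back. The
 * wrapper name is hypothetical and error cleanup is elided.
 */
#if 0
static struct bpf_prog *example_finalize_prog(struct bpf_prog *fp)
{
	int err;

	fp = bpf_prog_select_runtime(fp, &err);
	if (err < 0)
		return ERR_PTR(err);	/* real callers release fp here */
	return fp;
}
#endif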

static unsigned int __bpf_prog_ret1(const void *ctx,
				    const struct bpf_insn *insn)
{
	return 1;
}

static struct bpf_prog_dummy {
	struct bpf_prog prog;
} dummy_bpf_prog = {
	.prog = {
		.bpf_func = __bpf_prog_ret1,
	},
};

struct bpf_empty_prog_array bpf_empty_prog_array = {
	.null_prog = NULL,
};
EXPORT_SYMBOL(bpf_empty_prog_array);

struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
{
	struct bpf_prog_array *p;

	if (prog_cnt)
		p = kzalloc(struct_size(p, items, prog_cnt + 1), flags);
	else
		p = &bpf_empty_prog_array.hdr;

	return p;
}

void bpf_prog_array_free(struct bpf_prog_array *progs)
{
	if (!progs || progs == &bpf_empty_prog_array.hdr)
		return;
	kfree_rcu(progs, rcu);
}

static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu)
{
	struct bpf_prog_array *progs;

	/* If RCU Tasks Trace grace period implies RCU grace period, there is
	 * no need to call kfree_rcu(), just call kfree() directly.
	 */
	progs = container_of(rcu, struct bpf_prog_array, rcu);
	if (rcu_trace_implies_rcu_gp())
		kfree(progs);
	else
		kfree_rcu(progs, rcu);
}

void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
{
	if (!progs || progs == &bpf_empty_prog_array.hdr)
		return;
	call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb);
}

int bpf_prog_array_length(struct bpf_prog_array *array)
{
	struct bpf_prog_array_item *item;
	u32 cnt = 0;

	for (item = array->items; item->prog; item++)
		if (item->prog != &dummy_bpf_prog.prog)
			cnt++;
	return cnt;
}

bool bpf_prog_array_is_empty(struct bpf_prog_array *array)
{
	struct bpf_prog_array_item *item;

	for (item = array->items; item->prog; item++)
		if (item->prog != &dummy_bpf_prog.prog)
			return false;
	return true;
}

static bool bpf_prog_array_copy_core(struct bpf_prog_array *array,
				     u32 *prog_ids,
				     u32 request_cnt)
{
	struct bpf_prog_array_item *item;
	int i = 0;

	for (item = array->items; item->prog; item++) {
		if (item->prog == &dummy_bpf_prog.prog)
			continue;
		prog_ids[i] = item->prog->aux->id;
		if (++i == request_cnt) {
			item++;
			break;
		}
	}

	return !!(item->prog);
}

int bpf_prog_array_copy_to_user(struct bpf_prog_array *array,
				__u32 __user *prog_ids, u32 cnt)
{
	unsigned long err = 0;
	bool nospc;
	u32 *ids;

	/* users of this function are doing:
	 * cnt = bpf_prog_array_length();
	 * if (cnt > 0)
	 *     bpf_prog_array_copy_to_user(..., cnt);
	 * so below kcalloc doesn't need extra cnt > 0 check.
	 */
	ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN);
	if (!ids)
		return -ENOMEM;
	nospc = bpf_prog_array_copy_core(array, ids, cnt);
	err = copy_to_user(prog_ids, ids, cnt * sizeof(u32));
	kfree(ids);
	if (err)
		return -EFAULT;
	if (nospc)
		return -ENOSPC;
	return 0;
}
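
/* Illustrative sketch (not part of this file, never compiled): the comment in
 * bpf_prog_array_copy_to_user() above assumes callers size the copy with
 * bpf_prog_array_length() first. A hypothetical query path, reduced to its
 * essentials (locking elided), would be:
 */
#if 0
static int example_query_prog_ids(struct bpf_prog_array *array,
				  u32 __user *uprog_ids, u32 __user *ucnt)
{
	u32 cnt = bpf_prog_array_length(array);

	if (copy_to_user(ucnt, &cnt, sizeof(cnt)))
		return -EFAULT;
	if (!cnt)
		return 0;
	return bpf_prog_array_copy_to_user(array, uprog_ids, cnt);
}
#endif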

void bpf_prog_array_delete_safe(struct bpf_prog_array *array,
				struct bpf_prog *old_prog)
{
	struct bpf_prog_array_item *item;

	for (item = array->items; item->prog; item++)
		if (item->prog == old_prog) {
			WRITE_ONCE(item->prog, &dummy_bpf_prog.prog);
			break;
		}
}

/**
 * bpf_prog_array_delete_safe_at() - Replaces the program at the given
 *                                   index into the program array with
 *                                   a dummy no-op program.
 * @array: a bpf_prog_array
 * @index: the index of the program to replace
 *
 * Skips over dummy programs, by not counting them, when calculating
 * the position of the program to replace.
 *
 * Return:
 * * 0		- Success
 * * -EINVAL	- Invalid index value. Must be a non-negative integer.
 * * -ENOENT	- Index out of range
 */
int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index)
{
	return bpf_prog_array_update_at(array, index, &dummy_bpf_prog.prog);
}

/**
 * bpf_prog_array_update_at() - Updates the program at the given index
 *                              into the program array.
 * @array: a bpf_prog_array
 * @index: the index of the program to update
 * @prog: the program to insert into the array
 *
 * Skips over dummy programs, by not counting them, when calculating
 * the position of the program to update.
 *
 * Return:
 * * 0		- Success
 * * -EINVAL	- Invalid index value. Must be a non-negative integer.
 * * -ENOENT	- Index out of range
 */
int bpf_prog_array_update_at(struct bpf_prog_array *array, int index,
			     struct bpf_prog *prog)
{
	struct bpf_prog_array_item *item;

	if (unlikely(index < 0))
		return -EINVAL;

	for (item = array->items; item->prog; item++) {
		if (item->prog == &dummy_bpf_prog.prog)
			continue;
		if (!index) {
			WRITE_ONCE(item->prog, prog);
			return 0;
		}
		index--;
	}
	return -ENOENT;
}

int bpf_prog_array_copy(struct bpf_prog_array *old_array,
			struct bpf_prog *exclude_prog,
			struct bpf_prog *include_prog,
			u64 bpf_cookie,
			struct bpf_prog_array **new_array)
{
	int new_prog_cnt, carry_prog_cnt = 0;
	struct bpf_prog_array_item *existing, *new;
	struct bpf_prog_array *array;
	bool found_exclude = false;

	/* Figure out how many existing progs we need to carry over to
	 * the new array.
	 */
	if (old_array) {
		existing = old_array->items;
		for (; existing->prog; existing++) {
			if (existing->prog == exclude_prog) {
				found_exclude = true;
				continue;
			}
			if (existing->prog != &dummy_bpf_prog.prog)
				carry_prog_cnt++;
			if (existing->prog == include_prog)
				return -EEXIST;
		}
	}

	if (exclude_prog && !found_exclude)
		return -ENOENT;

	/* How many progs (not NULL) will be in the new array? */
	new_prog_cnt = carry_prog_cnt;
	if (include_prog)
		new_prog_cnt += 1;

	/* Do we have any prog (not NULL) in the new array? */
	if (!new_prog_cnt) {
		*new_array = NULL;
		return 0;
	}

	/* +1 as the end of prog_array is marked with NULL */
	array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL);
	if (!array)
		return -ENOMEM;
	new = array->items;

	/* Fill in the new prog array */
	if (carry_prog_cnt) {
		existing = old_array->items;
		for (; existing->prog; existing++) {
			if (existing->prog == exclude_prog ||
			    existing->prog == &dummy_bpf_prog.prog)
				continue;

			new->prog = existing->prog;
			new->bpf_cookie = existing->bpf_cookie;
			new++;
		}
	}
	if (include_prog) {
		new->prog = include_prog;
		new->bpf_cookie = bpf_cookie;
		new++;
	}
	new->prog = NULL;
	*new_array = array;
	return 0;
}
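
/* Illustrative sketch (not part of this file, never compiled): attach paths
 * typically build a new array with the extra program appended and then swap
 * it in under their own lock. Names are hypothetical; RCU publication and the
 * deferred free of the old array are elided.
 */
#if 0
static int example_attach(struct bpf_prog_array **active,
			  struct bpf_prog *prog, u64 cookie)
{
	struct bpf_prog_array *new_array;
	int ret;

	ret = bpf_prog_array_copy(*active, NULL, prog, cookie, &new_array);
	if (ret < 0)
		return ret;

	/* Publish new_array; the old array is freed only after readers are
	 * done (RCU details elided here).
	 */
	*active = new_array;
	return 0;
}
#endif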

int bpf_prog_array_copy_info(struct bpf_prog_array *array,
			     u32 *prog_ids, u32 request_cnt,
			     u32 *prog_cnt)
{
	u32 cnt = 0;

	if (array)
		cnt = bpf_prog_array_length(array);

	*prog_cnt = cnt;

	/* return early if user requested only program count or nothing to copy */
	if (!request_cnt || !cnt)
		return 0;

	/* this function is called under trace/bpf_trace.c: bpf_event_mutex */
	return bpf_prog_array_copy_core(array, prog_ids, request_cnt) ? -ENOSPC
								     : 0;
}

void __bpf_free_used_maps(struct bpf_prog_aux *aux,
			  struct bpf_map **used_maps, u32 len)
{
	struct bpf_map *map;
	bool sleepable;
	u32 i;

	sleepable = aux->prog->sleepable;
	for (i = 0; i < len; i++) {
		map = used_maps[i];
		if (map->ops->map_poke_untrack)
			map->ops->map_poke_untrack(map, aux);
		if (sleepable)
			atomic64_dec(&map->sleepable_refcnt);
		bpf_map_put(map);
	}
}

static void bpf_free_used_maps(struct bpf_prog_aux *aux)
{
	__bpf_free_used_maps(aux, aux->used_maps, aux->used_map_cnt);
	kfree(aux->used_maps);
}

void __bpf_free_used_btfs(struct btf_mod_pair *used_btfs, u32 len)
{
#ifdef CONFIG_BPF_SYSCALL
	struct btf_mod_pair *btf_mod;
	u32 i;

	for (i = 0; i < len; i++) {
		btf_mod = &used_btfs[i];
		if (btf_mod->module)
			module_put(btf_mod->module);
		btf_put(btf_mod->btf);
	}
#endif
}

static void bpf_free_used_btfs(struct bpf_prog_aux *aux)
{
	__bpf_free_used_btfs(aux->used_btfs, aux->used_btf_cnt);
	kfree(aux->used_btfs);
}

static void bpf_prog_free_deferred(struct work_struct *work)
{
	struct bpf_prog_aux *aux;
	int i;

	aux = container_of(work, struct bpf_prog_aux, work);
#ifdef CONFIG_BPF_SYSCALL
	bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab);
	bpf_prog_stream_free(aux->prog);
#endif
#ifdef CONFIG_CGROUP_BPF
	if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID)
		bpf_cgroup_atype_put(aux->cgroup_atype);
#endif
	bpf_free_used_maps(aux);
	bpf_free_used_btfs(aux);
	if (bpf_prog_is_dev_bound(aux))
		bpf_prog_dev_bound_destroy(aux->prog);
#ifdef CONFIG_PERF_EVENTS
	if (aux->prog->has_callchain_buf)
		put_callchain_buffers();
#endif
	if (aux->dst_trampoline)
		bpf_trampoline_put(aux->dst_trampoline);
	for (i = 0; i < aux->real_func_cnt; i++) {
		/* We can just unlink the subprog poke descriptor table as
		 * it was originally linked to the main program and is also
		 * released along with it.
		 */
		aux->func[i]->aux->poke_tab = NULL;
		bpf_jit_free(aux->func[i]);
	}
	if (aux->real_func_cnt) {
		kfree(aux->func);
		bpf_prog_unlock_free(aux->prog);
	} else {
		bpf_jit_free(aux->prog);
	}
}

void bpf_prog_free(struct bpf_prog *fp)
{
	struct bpf_prog_aux *aux = fp->aux;

	if (aux->dst_prog)
		bpf_prog_put(aux->dst_prog);
	bpf_token_put(aux->token);
	INIT_WORK(&aux->work, bpf_prog_free_deferred);
	schedule_work(&aux->work);
}
EXPORT_SYMBOL_GPL(bpf_prog_free);

/* RNG for unprivileged user space with separated state from prandom_u32(). */
static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state);

void bpf_user_rnd_init_once(void)
{
	prandom_init_once(&bpf_user_rnd_state);
}

BPF_CALL_0(bpf_user_rnd_u32)
{
	/* Should someone ever have the rather unwise idea to use some
	 * of the registers passed into this function, then note that
	 * this function is called from native eBPF and classic-to-eBPF
	 * transformations. Register assignments from both sides are
	 * different, f.e. classic always sets fn(ctx, A, X) here.
	 */
	struct rnd_state *state;
	u32 res;

	state = &get_cpu_var(bpf_user_rnd_state);
	res = prandom_u32_state(state);
	put_cpu_var(bpf_user_rnd_state);

	return res;
}

BPF_CALL_0(bpf_get_raw_cpu_id)
{
	return raw_smp_processor_id();
}

/* Weak definitions of helper functions in case we don't have bpf syscall. */
const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
const struct bpf_func_proto bpf_map_update_elem_proto __weak;
const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
const struct bpf_func_proto bpf_map_push_elem_proto __weak;
const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto __weak;
const struct bpf_func_proto bpf_spin_lock_proto __weak;
const struct bpf_func_proto bpf_spin_unlock_proto __weak;
const struct bpf_func_proto bpf_jiffies64_proto __weak;

const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_tai_ns_proto __weak;

const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
const struct bpf_func_proto bpf_get_current_comm_proto __weak;
const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak;
const struct bpf_func_proto bpf_get_local_storage_proto __weak;
const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_snprintf_btf_proto __weak;
const struct bpf_func_proto bpf_seq_printf_btf_proto __weak;
const struct bpf_func_proto bpf_set_retval_proto __weak;
const struct bpf_func_proto bpf_get_retval_proto __weak;

const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
{
	return NULL;
}

const struct bpf_func_proto * __weak bpf_get_trace_vprintk_proto(void)
{
	return NULL;
}

const struct bpf_func_proto * __weak bpf_get_perf_event_read_value_proto(void)
{
	return NULL;
}

u64 __weak
bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
		 void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
{
	return -ENOTSUPP;
}
EXPORT_SYMBOL_GPL(bpf_event_output);

/* Always built-in helper functions. */
const struct bpf_func_proto bpf_tail_call_proto = {
	/* func is unused for tail_call, we set it to pass the
	 * get_helper_proto check
	 */
	.func = BPF_PTR_POISON,
	.gpl_only = false,
	.ret_type = RET_VOID,
	.arg1_type = ARG_PTR_TO_CTX,
	.arg2_type = ARG_CONST_MAP_PTR,
	.arg3_type = ARG_ANYTHING,
};

/* Stub for JITs that only support cBPF. eBPF programs are interpreted.
 * It is encouraged to implement bpf_int_jit_compile() instead, so that
 * eBPF and implicitly also cBPF can get JITed!
 */
struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog)
{
	return prog;
}

/* Stub for JITs that support eBPF. All cBPF code gets transformed into
 * eBPF by the kernel and is later compiled by bpf_int_jit_compile().
 */
void __weak bpf_jit_compile(struct bpf_prog *prog)
{
}

bool __weak bpf_helper_changes_pkt_data(enum bpf_func_id func_id)
{
	return false;
}

/* Return TRUE if the JIT backend wants verifier to enable sub-register usage
 * analysis code and wants explicit zero extension inserted by verifier.
 * Otherwise, return FALSE.
 *
 * The verifier inserts an explicit zero extension after BPF_CMPXCHGs even if
 * you don't override this. JITs that don't want these extra insns can detect
 * them using insn_is_zext.
 */
bool __weak bpf_jit_needs_zext(void)
{
	return false;
}

/* By default, enable the verifier's mitigations against Spectre v1 and v4 for
 * all archs. The value returned must not change at runtime as there is
 * currently no support for reloading programs that were loaded without
 * mitigations.
 */
bool __weak bpf_jit_bypass_spec_v1(void)
{
	return false;
}

bool __weak bpf_jit_bypass_spec_v4(void)
{
	return false;
}

/* Return true if the JIT inlines the call to the helper corresponding to
 * the imm.
 *
 * The verifier will not patch the insn->imm for the call to the helper if
 * this returns true.
 */
bool __weak bpf_jit_inlines_helper_call(s32 imm)
{
	return false;
}

/* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */
bool __weak bpf_jit_supports_subprog_tailcalls(void)
{
	return false;
}

bool __weak bpf_jit_supports_percpu_insn(void)
{
	return false;
}

bool __weak bpf_jit_supports_kfunc_call(void)
{
	return false;
}

bool __weak bpf_jit_supports_far_kfunc_call(void)
{
	return false;
}

bool __weak bpf_jit_supports_arena(void)
{
	return false;
}

bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena)
{
	return false;
}

u64 __weak bpf_arch_uaddress_limit(void)
{
#if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE)
	return TASK_SIZE;
#else
	return 0;
#endif
}

/* Return TRUE if the JIT backend satisfies the following two conditions:
 * 1) JIT backend supports atomic_xchg() on pointer-sized words.
 * 2) Under the specific arch, the implementation of xchg() is the same
 *    as atomic_xchg() on pointer-sized words.
 */
bool __weak bpf_jit_supports_ptr_xchg(void)
{
	return false;
}

/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
 * skb_copy_bits(), so provide a weak definition of it for NET-less config.
 */
int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
			 int len)
{
	return -EFAULT;
}

int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t,
			      enum bpf_text_poke_type new_t, void *old_addr,
			      void *new_addr)
{
	return -ENOTSUPP;
}

void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len)
{
	return ERR_PTR(-ENOTSUPP);
}

int __weak bpf_arch_text_invalidate(void *dst, size_t len)
{
	return -ENOTSUPP;
}

bool __weak bpf_jit_supports_exceptions(void)
{
	return false;
}

bool __weak bpf_jit_supports_private_stack(void)
{
	return false;
}

void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
{
}

bool __weak bpf_jit_supports_timed_may_goto(void)
{
	return false;
}

u64 __weak arch_bpf_timed_may_goto(void)
{
	return 0;
}

static noinline void bpf_prog_report_may_goto_violation(void)
{
#ifdef CONFIG_BPF_SYSCALL
	struct bpf_stream_stage ss;
	struct bpf_prog *prog;

	prog = bpf_prog_find_from_stack();
	if (!prog)
		return;
	bpf_stream_stage(ss, prog, BPF_STDERR, ({
		bpf_stream_printk(ss, "ERROR: Timeout detected for may_goto instruction\n");
		bpf_stream_dump_stack(ss);
	}));
#endif
}

u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p)
{
	u64 time = ktime_get_mono_fast_ns();

	/* Populate the timestamp for this stack frame, and refresh count. */
	if (!p->timestamp) {
		p->timestamp = time;
		return BPF_MAX_TIMED_LOOPS;
	}
	/* Check if we've exhausted our time slice, and zero count. */
	if (unlikely(time - p->timestamp >= (NSEC_PER_SEC / 4))) {
		bpf_prog_report_may_goto_violation();
		return 0;
	}
	/* Refresh the count for the stack frame. */
	return BPF_MAX_TIMED_LOOPS;
}
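
/* Illustrative sketch (not part of this file, never compiled): the helper
 * above refills the loop budget kept in a per-frame struct bpf_timed_may_goto;
 * a hypothetical wrapper in the arch glue would simply be:
 */
#if 0
static u64 example_refill_may_goto_budget(struct bpf_timed_may_goto *p)
{
	/* Returns BPF_MAX_TIMED_LOOPS until roughly a quarter second has
	 * elapsed for this stack frame, then 0 so the program takes the
	 * may_goto exit path.
	 */
	return bpf_check_timed_may_goto(p);
}
#endif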

/* for configs without MMU or 32-bit */
__weak const struct bpf_map_ops arena_map_ops;
__weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
{
	return 0;
}
__weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
{
	return 0;
}

#ifdef CONFIG_BPF_SYSCALL
static int __init bpf_global_ma_init(void)
{
	int ret;

	ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false);
	bpf_global_ma_set = !ret;
	return ret;
}
late_initcall(bpf_global_ma_init);
#endif

DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
EXPORT_SYMBOL(bpf_stats_enabled_key);

/* All definitions of tracepoints related to BPF. */
#define CREATE_TRACE_POINTS
#include <linux/bpf_trace.h>

EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx);

#ifdef CONFIG_BPF_SYSCALL

int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep,
			   const char **linep, int *nump)
{
	int idx = -1, insn_start, insn_end, len;
	struct bpf_line_info *linfo;
	void **jited_linfo;
	struct btf *btf;
	int nr_linfo;

	btf = prog->aux->btf;
	linfo = prog->aux->linfo;
	jited_linfo = prog->aux->jited_linfo;

	if (!btf || !linfo || !jited_linfo)
		return -EINVAL;
	len = prog->aux->func ? prog->aux->func[prog->aux->func_idx]->len : prog->len;

	linfo = &prog->aux->linfo[prog->aux->linfo_idx];
	jited_linfo = &prog->aux->jited_linfo[prog->aux->linfo_idx];

	insn_start = linfo[0].insn_off;
	insn_end = insn_start + len;
	nr_linfo = prog->aux->nr_linfo - prog->aux->linfo_idx;

	for (int i = 0; i < nr_linfo &&
	     linfo[i].insn_off >= insn_start && linfo[i].insn_off < insn_end; i++) {
		if (jited_linfo[i] >= (void *)ip)
			break;
		idx = i;
	}

	if (idx == -1)
		return -ENOENT;

	/* Get base component of the file path. */
	*filep = btf_name_by_offset(btf, linfo[idx].file_name_off);
	*filep = kbasename(*filep);
	/* Obtain the source line, and strip whitespace in prefix. */
	*linep = btf_name_by_offset(btf, linfo[idx].line_off);
	while (isspace(**linep))
		*linep += 1;
	*nump = BPF_LINE_INFO_LINE_NUM(linfo[idx].line_col);
	return 0;
}

struct walk_stack_ctx {
	struct bpf_prog *prog;
};

static bool find_from_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp)
{
	struct walk_stack_ctx *ctxp = cookie;
	struct bpf_prog *prog;

	/*
	 * The RCU read lock is held to safely traverse the latch tree, but we
	 * don't need its protection when accessing the prog, since it has an
	 * active stack frame on the current stack trace, and won't disappear.
	 */
	rcu_read_lock();
	prog = bpf_prog_ksym_find(ip);
	rcu_read_unlock();
	if (!prog)
		return true;
	/* Make sure we return the main prog if we found a subprog */
	ctxp->prog = prog->aux->main_prog_aux->prog;
	return false;
}

struct bpf_prog *bpf_prog_find_from_stack(void)
{
	struct walk_stack_ctx ctx = {};

	arch_bpf_stack_walk(find_from_stack_cb, &ctx);
	return ctx.prog;
}

#endif