GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/kernel/alternative.c
1
// SPDX-License-Identifier: GPL-2.0-only
2
#define pr_fmt(fmt) "SMP alternatives: " fmt
3
4
#include <linux/mmu_context.h>
5
#include <linux/perf_event.h>
6
#include <linux/vmalloc.h>
7
#include <linux/memory.h>
8
#include <linux/execmem.h>
9
10
#include <asm/text-patching.h>
11
#include <asm/insn.h>
12
#include <asm/ibt.h>
13
#include <asm/set_memory.h>
14
#include <asm/nmi.h>
15
16
int __read_mostly alternatives_patched;
17
18
EXPORT_SYMBOL_GPL(alternatives_patched);
19
20
#define MAX_PATCH_LEN (255-1)
21
22
#define DA_ALL (~0)
23
#define DA_ALT 0x01
24
#define DA_RET 0x02
25
#define DA_RETPOLINE 0x04
26
#define DA_ENDBR 0x08
27
#define DA_SMP 0x10
28
29
static unsigned int debug_alternative;
30
31
static int __init debug_alt(char *str)
32
{
33
if (str && *str == '=')
34
str++;
35
36
if (!str || kstrtouint(str, 0, &debug_alternative))
37
debug_alternative = DA_ALL;
38
39
return 1;
40
}
41
__setup("debug-alternative", debug_alt);
42
43
static int noreplace_smp;
44
45
static int __init setup_noreplace_smp(char *str)
46
{
47
noreplace_smp = 1;
48
return 1;
49
}
50
__setup("noreplace-smp", setup_noreplace_smp);
51
52
#define DPRINTK(type, fmt, args...) \
53
do { \
54
if (debug_alternative & DA_##type) \
55
printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \
56
} while (0)
57
58
#define DUMP_BYTES(type, buf, len, fmt, args...) \
59
do { \
60
if (unlikely(debug_alternative & DA_##type)) { \
61
int j; \
62
\
63
if (!(len)) \
64
break; \
65
\
66
printk(KERN_DEBUG pr_fmt(fmt), ##args); \
67
for (j = 0; j < (len) - 1; j++) \
68
printk(KERN_CONT "%02hhx ", buf[j]); \
69
printk(KERN_CONT "%02hhx\n", buf[j]); \
70
} \
71
} while (0)
72
73
static const unsigned char x86nops[] =
74
{
75
BYTES_NOP1,
76
BYTES_NOP2,
77
BYTES_NOP3,
78
BYTES_NOP4,
79
BYTES_NOP5,
80
BYTES_NOP6,
81
BYTES_NOP7,
82
BYTES_NOP8,
83
#ifdef CONFIG_64BIT
84
BYTES_NOP9,
85
BYTES_NOP10,
86
BYTES_NOP11,
87
#endif
88
};
89
90
const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
91
{
92
NULL,
93
x86nops,
94
x86nops + 1,
95
x86nops + 1 + 2,
96
x86nops + 1 + 2 + 3,
97
x86nops + 1 + 2 + 3 + 4,
98
x86nops + 1 + 2 + 3 + 4 + 5,
99
x86nops + 1 + 2 + 3 + 4 + 5 + 6,
100
x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
101
#ifdef CONFIG_64BIT
102
x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
103
x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9,
104
x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10,
105
#endif
106
};
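/*
 * The table above indexes into the flat x86nops[] byte array: for example,
 * x86_nops[4] == x86nops + 1 + 2 + 3, because the 1-, 2- and 3-byte NOPs
 * occupy the first six bytes, so the 4-byte entry starts right after them
 * at BYTES_NOP4.
 */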
107
108
#ifdef CONFIG_FINEIBT
109
static bool cfi_paranoid __ro_after_init;
110
#endif
111
112
#ifdef CONFIG_MITIGATION_ITS
113
114
#ifdef CONFIG_MODULES
115
static struct module *its_mod;
116
#endif
117
static void *its_page;
118
static unsigned int its_offset;
119
struct its_array its_pages;
120
121
static void *__its_alloc(struct its_array *pages)
122
{
123
void *page __free(execmem) = execmem_alloc_rw(EXECMEM_MODULE_TEXT, PAGE_SIZE);
124
if (!page)
125
return NULL;
126
127
void *tmp = krealloc(pages->pages, (pages->num+1) * sizeof(void *),
128
GFP_KERNEL);
129
if (!tmp)
130
return NULL;
131
132
pages->pages = tmp;
133
pages->pages[pages->num++] = page;
134
135
return no_free_ptr(page);
136
}
137
138
/* Initialize a thunk with the "jmp *reg; int3" instructions. */
139
static void *its_init_thunk(void *thunk, int reg)
140
{
141
u8 *bytes = thunk;
142
int offset = 0;
143
int i = 0;
144
145
#ifdef CONFIG_FINEIBT
146
if (cfi_paranoid) {
147
/*
148
* When ITS uses indirect branch thunk the fineibt_paranoid
149
* caller sequence doesn't fit in the caller site. So put the
150
* remaining part of the sequence (<ea> + JNE) into the ITS
151
* thunk.
152
*/
153
bytes[i++] = 0xea; /* invalid instruction */
154
bytes[i++] = 0x75; /* JNE */
155
bytes[i++] = 0xfd;
156
157
offset = 1;
158
}
159
#endif
160
161
if (reg >= 8) {
162
bytes[i++] = 0x41; /* REX.B prefix */
163
reg -= 8;
164
}
165
bytes[i++] = 0xff;
166
bytes[i++] = 0xe0 + reg; /* jmp *reg */
167
bytes[i++] = 0xcc;
168
169
return thunk + offset;
170
}
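/*
 * For example, for reg == 11 (%r11) the thunk body is "41 ff e3 cc"
 * (jmp *%r11; int3) and the returned pointer is the thunk itself. With
 * cfi_paranoid the body becomes "ea 75 fd 41 ff e3 cc" and the returned
 * pointer skips the leading 0xea byte, so callers land on the JNE.
 */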
171
172
static void its_pages_protect(struct its_array *pages)
173
{
174
for (int i = 0; i < pages->num; i++) {
175
void *page = pages->pages[i];
176
execmem_restore_rox(page, PAGE_SIZE);
177
}
178
}
179
180
static void its_fini_core(void)
181
{
182
if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX))
183
its_pages_protect(&its_pages);
184
kfree(its_pages.pages);
185
}
186
187
#ifdef CONFIG_MODULES
188
void its_init_mod(struct module *mod)
189
{
190
if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
191
return;
192
193
mutex_lock(&text_mutex);
194
its_mod = mod;
195
its_page = NULL;
196
}
197
198
void its_fini_mod(struct module *mod)
199
{
200
if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
201
return;
202
203
WARN_ON_ONCE(its_mod != mod);
204
205
its_mod = NULL;
206
its_page = NULL;
207
mutex_unlock(&text_mutex);
208
209
if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
210
its_pages_protect(&mod->arch.its_pages);
211
}
212
213
void its_free_mod(struct module *mod)
214
{
215
if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
216
return;
217
218
for (int i = 0; i < mod->arch.its_pages.num; i++) {
219
void *page = mod->arch.its_pages.pages[i];
220
execmem_free(page);
221
}
222
kfree(mod->arch.its_pages.pages);
223
}
224
#endif /* CONFIG_MODULES */
225
226
static void *its_alloc(void)
227
{
228
struct its_array *pages = &its_pages;
229
void *page;
230
231
#ifdef CONFIG_MODULES
232
if (its_mod)
233
pages = &its_mod->arch.its_pages;
234
#endif
235
236
page = __its_alloc(pages);
237
if (!page)
238
return NULL;
239
240
if (pages == &its_pages)
241
set_memory_x((unsigned long)page, 1);
242
243
return page;
244
}
245
246
static void *its_allocate_thunk(int reg)
247
{
248
int size = 3 + (reg / 8);
249
void *thunk;
250
251
#ifdef CONFIG_FINEIBT
252
/*
253
* The ITS thunk contains an indirect jump and an int3 instruction so
254
* its size is 3 or 4 bytes depending on the register used. If CFI
255
* paranoid is used then 3 extra bytes are added in the ITS thunk to
256
* complete the fineibt_paranoid caller sequence.
257
*/
258
if (cfi_paranoid)
259
size += 3;
260
#endif
261
262
if (!its_page || (its_offset + size - 1) >= PAGE_SIZE) {
263
its_page = its_alloc();
264
if (!its_page) {
265
pr_err("ITS page allocation failed\n");
266
return NULL;
267
}
268
memset(its_page, INT3_INSN_OPCODE, PAGE_SIZE);
269
its_offset = 32;
270
}
271
272
/*
273
* If the indirect branch instruction will be in the lower half
274
* of a cacheline, then update the offset to reach the upper half.
275
*/
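/*
 * For example, with its_offset == 64 and a 4-byte thunk:
 * (64 + 4 - 1) % 64 == 3 < 32, so its_offset becomes
 * ((64 - 1) | 0x3F) + 33 == 96, i.e. the upper half of the next cacheline.
 */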
276
if ((its_offset + size - 1) % 64 < 32)
277
its_offset = ((its_offset - 1) | 0x3F) + 33;
278
279
thunk = its_page + its_offset;
280
its_offset += size;
281
282
return its_init_thunk(thunk, reg);
283
}
284
285
u8 *its_static_thunk(int reg)
286
{
287
u8 *thunk = __x86_indirect_its_thunk_array[reg];
288
289
#ifdef CONFIG_FINEIBT
290
/* Paranoid thunk starts 2 bytes before */
291
if (cfi_paranoid)
292
return thunk - 2;
293
#endif
294
return thunk;
295
}
296
297
#else
298
static inline void its_fini_core(void) {}
299
#endif /* CONFIG_MITIGATION_ITS */
300
301
/*
302
* Nomenclature for variable names to simplify and clarify this code and ease
303
* any potential staring at it:
304
*
305
* @instr: source address of the original instructions in the kernel text as
306
* generated by the compiler.
307
*
308
* @buf: temporary buffer on which the patching operates. This buffer is
309
* eventually text-poked into the kernel image.
310
*
311
* @replacement/@repl: pointer to the opcodes which are replacing @instr, located
312
* in the .altinstr_replacement section.
313
*/
314
315
/*
316
* Fill the buffer with a single effective instruction of size @len.
317
*
318
* In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info)
319
* for every single-byte NOP, try to generate the maximally available NOP of
320
* size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one for
321
* each single-byte NOP). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and
322
* *jump* over instead of executing long and daft NOPs.
323
*/
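/*
 * For example, a 40-byte fill (larger than ASM_NOP_MAX, smaller than 128)
 * becomes "eb 26" (JMP.d8 +38) followed by 38 INT3 bytes, so execution
 * jumps over the padding instead of stepping through dozens of NOPs.
 */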
324
static void add_nop(u8 *buf, unsigned int len)
325
{
326
u8 *target = buf + len;
327
328
if (!len)
329
return;
330
331
if (len <= ASM_NOP_MAX) {
332
memcpy(buf, x86_nops[len], len);
333
return;
334
}
335
336
if (len < 128) {
337
__text_gen_insn(buf, JMP8_INSN_OPCODE, buf, target, JMP8_INSN_SIZE);
338
buf += JMP8_INSN_SIZE;
339
} else {
340
__text_gen_insn(buf, JMP32_INSN_OPCODE, buf, target, JMP32_INSN_SIZE);
341
buf += JMP32_INSN_SIZE;
342
}
343
344
for (;buf < target; buf++)
345
*buf = INT3_INSN_OPCODE;
346
}
347
348
/*
349
* Matches NOP and NOPL, not any of the other possible NOPs.
350
*/
351
static bool insn_is_nop(struct insn *insn)
352
{
353
/* Anything NOP, but no REP NOP */
354
if (insn->opcode.bytes[0] == 0x90 &&
355
(!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3))
356
return true;
357
358
/* NOPL */
359
if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F)
360
return true;
361
362
/* TODO: more nops */
363
364
return false;
365
}
366
367
/*
368
* Find the offset of the first non-NOP instruction starting at @offset
369
* but no further than @len.
370
*/
371
static int skip_nops(u8 *buf, int offset, int len)
372
{
373
struct insn insn;
374
375
for (; offset < len; offset += insn.length) {
376
if (insn_decode_kernel(&insn, &buf[offset]))
377
break;
378
379
if (!insn_is_nop(&insn))
380
break;
381
}
382
383
return offset;
384
}
385
386
/*
387
* "noinline" to cause control flow change and thus invalidate I$ and
388
* cause refetch after modification.
389
*/
390
static void noinline optimize_nops(const u8 * const instr, u8 *buf, size_t len)
391
{
392
for (int next, i = 0; i < len; i = next) {
393
struct insn insn;
394
395
if (insn_decode_kernel(&insn, &buf[i]))
396
return;
397
398
next = i + insn.length;
399
400
if (insn_is_nop(&insn)) {
401
int nop = i;
402
403
/* Has the NOP already been optimized? */
404
if (i + insn.length == len)
405
return;
406
407
next = skip_nops(buf, next, len);
408
409
add_nop(buf + nop, next - nop);
410
DUMP_BYTES(ALT, buf, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, next);
411
}
412
}
413
}
414
415
/*
416
* In this context, "source" is where the instructions are placed in the
417
* section .altinstr_replacement, for example during kernel build by the
418
* toolchain.
419
* "Destination" is where the instructions are being patched in by this
420
* machinery.
421
*
422
* The source offset is:
423
*
424
* src_imm = target - src_next_ip (1)
425
*
426
* and the target offset is:
427
*
428
* dst_imm = target - dst_next_ip (2)
429
*
430
* so rework (1) as an expression for target like:
431
*
432
* target = src_imm + src_next_ip (1a)
433
*
434
* and substitute in (2) to get:
435
*
436
* dst_imm = (src_imm + src_next_ip) - dst_next_ip (3)
437
*
438
* Now, since the instruction stream is 'identical' at src and dst (it
439
* is being copied after all) it can be stated that:
440
*
441
* src_next_ip = src + ip_offset
442
* dst_next_ip = dst + ip_offset (4)
443
*
444
* Substitute (4) in (3) and observe ip_offset being cancelled out to
445
* obtain:
446
*
447
* dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset)
448
* = src_imm + src - dst + ip_offset - ip_offset
449
* = src_imm + src - dst (5)
450
*
451
* IOW, only the relative displacement of the code block matters.
452
*/
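/*
 * For example, if the replacement text is copied to a destination 0x100
 * bytes above where it was built (src - dst == -0x100), every affected
 * immediate is simply decreased by 0x100.
 */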
453
454
#define apply_reloc_n(n_, p_, d_) \
455
do { \
456
s32 v = *(s##n_ *)(p_); \
457
v += (d_); \
458
BUG_ON((v >> 31) != (v >> (n_-1))); \
459
*(s##n_ *)(p_) = (s##n_)v; \
460
} while (0)
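/*
 * The BUG_ON() above checks that the adjusted displacement still fits the
 * original field width: for n_ == 8, bits 31..7 of v must all equal the
 * sign bit; e.g. v == 200 would trip it because an s8 tops out at 127.
 */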
461
462
463
static __always_inline
464
void apply_reloc(int n, void *ptr, uintptr_t diff)
465
{
466
switch (n) {
467
case 1: apply_reloc_n(8, ptr, diff); break;
468
case 2: apply_reloc_n(16, ptr, diff); break;
469
case 4: apply_reloc_n(32, ptr, diff); break;
470
default: BUG();
471
}
472
}
473
474
static __always_inline
475
bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
476
{
477
u8 *target = src + offset;
478
/*
479
* If the target is inside the patched block, it's relative to the
480
* block itself and does not need relocation.
481
*/
482
return (target < src || target > src + src_len);
483
}
484
485
static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
486
{
487
for (int next, i = 0; i < instrlen; i = next) {
488
struct insn insn;
489
490
if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i])))
491
return;
492
493
next = i + insn.length;
494
495
switch (insn.opcode.bytes[0]) {
496
case 0x0f:
497
if (insn.opcode.bytes[1] < 0x80 ||
498
insn.opcode.bytes[1] > 0x8f)
499
break;
500
501
fallthrough; /* Jcc.d32 */
502
case 0x70 ... 0x7f: /* Jcc.d8 */
503
case JMP8_INSN_OPCODE:
504
case JMP32_INSN_OPCODE:
505
case CALL_INSN_OPCODE:
506
if (need_reloc(next + insn.immediate.value, repl, repl_len)) {
507
apply_reloc(insn.immediate.nbytes,
508
buf + i + insn_offset_immediate(&insn),
509
repl - instr);
510
}
511
512
/*
513
* Where possible, convert JMP.d32 into JMP.d8.
514
*/
515
if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) {
516
s32 imm = insn.immediate.value;
517
imm += repl - instr;
518
imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE;
519
if ((imm >> 31) == (imm >> 7)) {
520
buf[i+0] = JMP8_INSN_OPCODE;
521
buf[i+1] = (s8)imm;
522
523
memset(&buf[i+2], INT3_INSN_OPCODE, insn.length - 2);
524
}
525
}
526
break;
527
}
528
529
if (insn_rip_relative(&insn)) {
530
if (need_reloc(next + insn.displacement.value, repl, repl_len)) {
531
apply_reloc(insn.displacement.nbytes,
532
buf + i + insn_offset_displacement(&insn),
533
repl - instr);
534
}
535
}
536
}
537
}
538
539
void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
540
{
541
__apply_relocation(buf, instr, instrlen, repl, repl_len);
542
optimize_nops(instr, buf, instrlen);
543
}
544
545
/* Low-level backend functions usable from alternative code replacements. */
546
DEFINE_ASM_FUNC(nop_func, "", .entry.text);
547
EXPORT_SYMBOL_GPL(nop_func);
548
549
noinstr void BUG_func(void)
550
{
551
BUG();
552
}
553
EXPORT_SYMBOL(BUG_func);
554
555
#define CALL_RIP_REL_OPCODE 0xff
556
#define CALL_RIP_REL_MODRM 0x15
557
558
/*
559
* Rewrite the "call BUG_func" replacement to point to the target of the
560
* indirect pv_ops call "call *disp(%ip)".
561
*/
562
static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
563
{
564
void *target, *bug = &BUG_func;
565
s32 disp;
566
567
if (a->replacementlen != 5 || insn_buff[0] != CALL_INSN_OPCODE) {
568
pr_err("ALT_FLAG_DIRECT_CALL set for a non-call replacement instruction\n");
569
BUG();
570
}
571
572
if (a->instrlen != 6 ||
573
instr[0] != CALL_RIP_REL_OPCODE ||
574
instr[1] != CALL_RIP_REL_MODRM) {
575
pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n");
576
BUG();
577
}
578
579
/* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */
580
disp = *(s32 *)(instr + 2);
581
#ifdef CONFIG_X86_64
582
/* ff 15 00 00 00 00 call *0x0(%rip) */
583
/* target address is stored at "next instruction + disp". */
584
target = *(void **)(instr + a->instrlen + disp);
585
#else
586
/* ff 15 00 00 00 00 call *0x0 */
587
/* target address is stored at disp. */
588
target = *(void **)disp;
589
#endif
590
if (!target)
591
target = bug;
592
593
/* (BUG_func - .) + (target - BUG_func) := target - . */
594
*(s32 *)(insn_buff + 1) += target - bug;
595
596
if (target == &nop_func)
597
return 0;
598
599
return 5;
600
}
601
602
static inline u8 * instr_va(struct alt_instr *i)
603
{
604
return (u8 *)&i->instr_offset + i->instr_offset;
605
}
606
607
/*
608
* Replace instructions with better alternatives for this CPU type. This runs
609
* before SMP is initialized to avoid SMP problems with self modifying code.
610
* This implies that asymmetric systems where APs have less capabilities than
611
* the boot processor are not handled. Tough. Make sure you disable such
612
* features by hand.
613
*
614
* Marked "noinline" to cause control flow change and thus insn cache
615
* to refetch changed I$ lines.
616
*/
617
void __init_or_module noinline apply_alternatives(struct alt_instr *start,
618
struct alt_instr *end)
619
{
620
u8 insn_buff[MAX_PATCH_LEN];
621
u8 *instr, *replacement;
622
struct alt_instr *a, *b;
623
624
DPRINTK(ALT, "alt table %px, -> %px", start, end);
625
626
/*
627
* KASAN_SHADOW_START is defined using
628
* cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here.
629
* During the process, KASAN becomes confused seeing partial LA57
630
* conversion and triggers a false-positive out-of-bound report.
631
*
632
* Disable KASAN until the patching is complete.
633
*/
634
kasan_disable_current();
635
636
/*
637
* The scan order should be from start to end. A later scanned
638
* alternative code can overwrite previously scanned alternative code.
639
* Some kernel functions (e.g. memcpy, memset, etc) use this order to
640
* patch code.
641
*
642
* So be careful if you want to change the scan order to any other
643
* order.
644
*/
645
for (a = start; a < end; a++) {
646
int insn_buff_sz = 0;
647
648
/*
649
* In case of nested ALTERNATIVE()s the outer alternative might
650
* add more padding. To ensure consistent patching find the max
651
* padding for all alt_instr entries for this site (nested
652
* alternatives result in consecutive entries).
653
*/
654
for (b = a+1; b < end && instr_va(b) == instr_va(a); b++) {
655
u8 len = max(a->instrlen, b->instrlen);
656
a->instrlen = b->instrlen = len;
657
}
658
659
instr = instr_va(a);
660
replacement = (u8 *)&a->repl_offset + a->repl_offset;
661
BUG_ON(a->instrlen > sizeof(insn_buff));
662
BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
663
664
/*
665
* Patch if either:
666
* - feature is present
667
* - feature not present but ALT_FLAG_NOT is set to mean,
668
* patch if feature is *NOT* present.
669
*/
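/*
 * IOW, when the feature state and ALT_FLAG_NOT agree, this branch keeps
 * the original instruction and only optimizes its padding NOPs; the code
 * below applies the replacement otherwise.
 */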
670
if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
671
memcpy(insn_buff, instr, a->instrlen);
672
optimize_nops(instr, insn_buff, a->instrlen);
673
text_poke_early(instr, insn_buff, a->instrlen);
674
continue;
675
}
676
677
DPRINTK(ALT, "feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d) flags: 0x%x",
678
a->cpuid >> 5,
679
a->cpuid & 0x1f,
680
instr, instr, a->instrlen,
681
replacement, a->replacementlen, a->flags);
682
683
memcpy(insn_buff, replacement, a->replacementlen);
684
insn_buff_sz = a->replacementlen;
685
686
if (a->flags & ALT_FLAG_DIRECT_CALL) {
687
insn_buff_sz = alt_replace_call(instr, insn_buff, a);
688
if (insn_buff_sz < 0)
689
continue;
690
}
691
692
for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
693
insn_buff[insn_buff_sz] = 0x90;
694
695
text_poke_apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen);
696
697
DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr);
698
DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
699
DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
700
701
text_poke_early(instr, insn_buff, insn_buff_sz);
702
}
703
704
kasan_enable_current();
705
}
706
707
static inline bool is_jcc32(struct insn *insn)
708
{
709
/* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
710
return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80;
711
}
712
713
#if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_OBJTOOL)
714
715
/*
716
* CALL/JMP *%\reg
717
*/
718
static int emit_indirect(int op, int reg, u8 *bytes)
719
{
720
int i = 0;
721
u8 modrm;
722
723
switch (op) {
724
case CALL_INSN_OPCODE:
725
modrm = 0x10; /* Reg = 2; CALL r/m */
726
break;
727
728
case JMP32_INSN_OPCODE:
729
modrm = 0x20; /* Reg = 4; JMP r/m */
730
break;
731
732
default:
733
WARN_ON_ONCE(1);
734
return -1;
735
}
736
737
if (reg >= 8) {
738
bytes[i++] = 0x41; /* REX.B prefix */
739
reg -= 8;
740
}
741
742
modrm |= 0xc0; /* Mod = 3 */
743
modrm += reg;
744
745
bytes[i++] = 0xff; /* opcode */
746
bytes[i++] = modrm;
747
748
return i;
749
}
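/*
 * For example, emit_indirect(CALL_INSN_OPCODE, 11, bytes) produces
 * "41 ff d3" (call *%r11): REX.B for the high register, opcode 0xff and
 * ModRM 0xc0 | 0x10 | 3. The JMP variant for the same register is
 * "41 ff e3".
 */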
750
751
static int __emit_trampoline(void *addr, struct insn *insn, u8 *bytes,
752
void *call_dest, void *jmp_dest)
753
{
754
u8 op = insn->opcode.bytes[0];
755
int i = 0;
756
757
/*
758
* Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional
759
* tail-calls. Deal with them.
760
*/
761
if (is_jcc32(insn)) {
762
bytes[i++] = op;
763
op = insn->opcode.bytes[1];
764
goto clang_jcc;
765
}
766
767
if (insn->length == 6)
768
bytes[i++] = 0x2e; /* CS-prefix */
769
770
switch (op) {
771
case CALL_INSN_OPCODE:
772
__text_gen_insn(bytes+i, op, addr+i,
773
call_dest,
774
CALL_INSN_SIZE);
775
i += CALL_INSN_SIZE;
776
break;
777
778
case JMP32_INSN_OPCODE:
779
clang_jcc:
780
__text_gen_insn(bytes+i, op, addr+i,
781
jmp_dest,
782
JMP32_INSN_SIZE);
783
i += JMP32_INSN_SIZE;
784
break;
785
786
default:
787
WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr);
788
return -1;
789
}
790
791
WARN_ON_ONCE(i != insn->length);
792
793
return i;
794
}
795
796
static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
797
{
798
return __emit_trampoline(addr, insn, bytes,
799
__x86_indirect_call_thunk_array[reg],
800
__x86_indirect_jump_thunk_array[reg]);
801
}
802
803
#ifdef CONFIG_MITIGATION_ITS
804
static int emit_its_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes)
805
{
806
u8 *thunk = __x86_indirect_its_thunk_array[reg];
807
u8 *tmp = its_allocate_thunk(reg);
808
809
if (tmp)
810
thunk = tmp;
811
812
return __emit_trampoline(addr, insn, bytes, thunk, thunk);
813
}
814
815
/* Check if an indirect branch is at ITS-unsafe address */
816
static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg)
817
{
818
if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
819
return false;
820
821
/* Indirect branch opcode is 2 or 3 bytes depending on reg */
822
addr += 1 + reg / 8;
823
824
/* Lower-half of the cacheline? */
825
return !(addr & 0x20);
826
}
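/*
 * IOW, the test looks at bit 5 of the address of the branch's last byte:
 * offsets 0x00-0x1f within a 64-byte cacheline are the lower half and need
 * the ITS thunk, offsets 0x20-0x3f are the upper half and do not.
 */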
827
#else /* CONFIG_MITIGATION_ITS */
828
829
#ifdef CONFIG_FINEIBT
830
static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg)
831
{
832
return false;
833
}
834
#endif
835
836
#endif /* CONFIG_MITIGATION_ITS */
837
838
/*
839
* Rewrite the compiler generated retpoline thunk calls.
840
*
841
* For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate
842
* indirect instructions, avoiding the extra indirection.
843
*
844
* For example, convert:
845
*
846
* CALL __x86_indirect_thunk_\reg
847
*
848
* into:
849
*
850
* CALL *%\reg
851
*
852
* It also tries to inline spectre_v2=retpoline,lfence when size permits.
853
*/
854
static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
855
{
856
retpoline_thunk_t *target;
857
int reg, ret, i = 0;
858
u8 op, cc;
859
860
target = addr + insn->length + insn->immediate.value;
861
reg = target - __x86_indirect_thunk_array;
862
863
if (WARN_ON_ONCE(reg & ~0xf))
864
return -1;
865
866
/* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */
867
BUG_ON(reg == 4);
868
869
if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) &&
870
!cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
871
if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
872
return emit_call_track_retpoline(addr, insn, reg, bytes);
873
874
return -1;
875
}
876
877
op = insn->opcode.bytes[0];
878
879
/*
880
* Convert:
881
*
882
* Jcc.d32 __x86_indirect_thunk_\reg
883
*
884
* into:
885
*
886
* Jncc.d8 1f
887
* [ LFENCE ]
888
* JMP *%\reg
889
* [ NOP ]
890
* 1:
891
*/
892
if (is_jcc32(insn)) {
893
cc = insn->opcode.bytes[1] & 0xf;
894
cc ^= 1; /* invert condition */
895
896
bytes[i++] = 0x70 + cc; /* Jcc.d8 */
897
bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */
898
899
/* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */
900
op = JMP32_INSN_OPCODE;
901
}
902
903
/*
904
* For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE.
905
*/
906
if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
907
bytes[i++] = 0x0f;
908
bytes[i++] = 0xae;
909
bytes[i++] = 0xe8; /* LFENCE */
910
}
911
912
#ifdef CONFIG_MITIGATION_ITS
913
/*
914
* Check if the address of last byte of emitted-indirect is in
915
* lower-half of the cacheline. Such branches need ITS mitigation.
916
*/
917
if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + i, reg))
918
return emit_its_trampoline(addr, insn, reg, bytes);
919
#endif
920
921
ret = emit_indirect(op, reg, bytes + i);
922
if (ret < 0)
923
return ret;
924
i += ret;
925
926
/*
927
* The compiler is supposed to EMIT an INT3 after every unconditional
928
* JMP instruction due to AMD BTC. However, if the compiler is too old
929
* or MITIGATION_SLS isn't enabled, we still need an INT3 after
930
* indirect JMPs even on Intel.
931
*/
932
if (op == JMP32_INSN_OPCODE && i < insn->length)
933
bytes[i++] = INT3_INSN_OPCODE;
934
935
for (; i < insn->length;)
936
bytes[i++] = BYTES_NOP1;
937
938
return i;
939
}
940
941
/*
942
* Generated by 'objtool --retpoline'.
943
*/
944
void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
945
{
946
s32 *s;
947
948
for (s = start; s < end; s++) {
949
void *addr = (void *)s + *s;
950
struct insn insn;
951
int len, ret;
952
u8 bytes[16];
953
u8 op1, op2;
954
u8 *dest;
955
956
ret = insn_decode_kernel(&insn, addr);
957
if (WARN_ON_ONCE(ret < 0))
958
continue;
959
960
op1 = insn.opcode.bytes[0];
961
op2 = insn.opcode.bytes[1];
962
963
switch (op1) {
964
case 0x70 ... 0x7f: /* Jcc.d8 */
965
/* See cfi_paranoid. */
966
WARN_ON_ONCE(cfi_mode != CFI_FINEIBT);
967
continue;
968
969
case CALL_INSN_OPCODE:
970
case JMP32_INSN_OPCODE:
971
/* Check for cfi_paranoid + ITS */
972
dest = addr + insn.length + insn.immediate.value;
973
if (dest[-1] == 0xea && (dest[0] & 0xf0) == 0x70) {
974
WARN_ON_ONCE(cfi_mode != CFI_FINEIBT);
975
continue;
976
}
977
break;
978
979
case 0x0f: /* escape */
980
if (op2 >= 0x80 && op2 <= 0x8f)
981
break;
982
fallthrough;
983
default:
984
WARN_ON_ONCE(1);
985
continue;
986
}
987
988
DPRINTK(RETPOLINE, "retpoline at: %pS (%px) len: %d to: %pS",
989
addr, addr, insn.length,
990
addr + insn.length + insn.immediate.value);
991
992
len = patch_retpoline(addr, &insn, bytes);
993
if (len == insn.length) {
994
optimize_nops(addr, bytes, len);
995
DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr);
996
DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr);
997
text_poke_early(addr, bytes, len);
998
}
999
}
1000
}
1001
1002
#ifdef CONFIG_MITIGATION_RETHUNK
1003
1004
bool cpu_wants_rethunk(void)
1005
{
1006
return cpu_feature_enabled(X86_FEATURE_RETHUNK);
1007
}
1008
1009
bool cpu_wants_rethunk_at(void *addr)
1010
{
1011
if (!cpu_feature_enabled(X86_FEATURE_RETHUNK))
1012
return false;
1013
if (x86_return_thunk != its_return_thunk)
1014
return true;
1015
1016
return !((unsigned long)addr & 0x20);
1017
}
1018
1019
/*
1020
* Rewrite the compiler generated return thunk tail-calls.
1021
*
1022
* For example, convert:
1023
*
1024
* JMP __x86_return_thunk
1025
*
1026
* into:
1027
*
1028
* RET
1029
*/
1030
static int patch_return(void *addr, struct insn *insn, u8 *bytes)
1031
{
1032
int i = 0;
1033
1034
/* Patch the custom return thunks... */
1035
if (cpu_wants_rethunk_at(addr)) {
1036
i = JMP32_INSN_SIZE;
1037
__text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i);
1038
} else {
1039
/* ... or patch them out if not needed. */
1040
bytes[i++] = RET_INSN_OPCODE;
1041
}
1042
1043
for (; i < insn->length;)
1044
bytes[i++] = INT3_INSN_OPCODE;
1045
return i;
1046
}
1047
1048
void __init_or_module noinline apply_returns(s32 *start, s32 *end)
1049
{
1050
s32 *s;
1051
1052
if (cpu_wants_rethunk())
1053
static_call_force_reinit();
1054
1055
for (s = start; s < end; s++) {
1056
void *dest = NULL, *addr = (void *)s + *s;
1057
struct insn insn;
1058
int len, ret;
1059
u8 bytes[16];
1060
u8 op;
1061
1062
ret = insn_decode_kernel(&insn, addr);
1063
if (WARN_ON_ONCE(ret < 0))
1064
continue;
1065
1066
op = insn.opcode.bytes[0];
1067
if (op == JMP32_INSN_OPCODE)
1068
dest = addr + insn.length + insn.immediate.value;
1069
1070
if (__static_call_fixup(addr, op, dest) ||
1071
WARN_ONCE(dest != &__x86_return_thunk,
1072
"missing return thunk: %pS-%pS: %*ph",
1073
addr, dest, 5, addr))
1074
continue;
1075
1076
DPRINTK(RET, "return thunk at: %pS (%px) len: %d to: %pS",
1077
addr, addr, insn.length,
1078
addr + insn.length + insn.immediate.value);
1079
1080
len = patch_return(addr, &insn, bytes);
1081
if (len == insn.length) {
1082
DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr);
1083
DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr);
1084
text_poke_early(addr, bytes, len);
1085
}
1086
}
1087
}
1088
#else /* !CONFIG_MITIGATION_RETHUNK: */
1089
void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
1090
#endif /* !CONFIG_MITIGATION_RETHUNK */
1091
1092
#else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */
1093
1094
void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
1095
void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
1096
1097
#endif /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */
1098
1099
#ifdef CONFIG_X86_KERNEL_IBT
1100
1101
__noendbr bool is_endbr(u32 *val)
1102
{
1103
u32 endbr;
1104
1105
__get_kernel_nofault(&endbr, val, u32, Efault);
1106
return __is_endbr(endbr);
1107
1108
Efault:
1109
return false;
1110
}
1111
1112
#ifdef CONFIG_FINEIBT
1113
1114
static __noendbr bool exact_endbr(u32 *val)
1115
{
1116
u32 endbr;
1117
1118
__get_kernel_nofault(&endbr, val, u32, Efault);
1119
return endbr == gen_endbr();
1120
1121
Efault:
1122
return false;
1123
}
1124
1125
#endif
1126
1127
static void poison_cfi(void *addr);
1128
1129
static void __init_or_module poison_endbr(void *addr)
1130
{
1131
u32 poison = gen_endbr_poison();
1132
1133
if (WARN_ON_ONCE(!is_endbr(addr)))
1134
return;
1135
1136
DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr);
1137
1138
/*
1139
* When we have IBT, the lack of ENDBR will trigger #CP
1140
*/
1141
DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr);
1142
DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr);
1143
text_poke_early(addr, &poison, 4);
1144
}
1145
1146
/*
1147
* Generated by: objtool --ibt
1148
*
1149
* Seal the functions for indirect calls by clobbering the ENDBR instructions
1150
* and the kCFI hash value.
1151
*/
1152
void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end)
1153
{
1154
s32 *s;
1155
1156
for (s = start; s < end; s++) {
1157
void *addr = (void *)s + *s;
1158
1159
poison_endbr(addr);
1160
if (IS_ENABLED(CONFIG_FINEIBT))
1161
poison_cfi(addr - 16);
1162
}
1163
}
1164
1165
#else /* !CONFIG_X86_KERNEL_IBT: */
1166
1167
void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { }
1168
1169
#endif /* !CONFIG_X86_KERNEL_IBT */
1170
1171
#ifdef CONFIG_CFI_AUTO_DEFAULT
1172
# define __CFI_DEFAULT CFI_AUTO
1173
#elif defined(CONFIG_CFI_CLANG)
1174
# define __CFI_DEFAULT CFI_KCFI
1175
#else
1176
# define __CFI_DEFAULT CFI_OFF
1177
#endif
1178
1179
enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT;
1180
1181
#ifdef CONFIG_FINEIBT_BHI
1182
bool cfi_bhi __ro_after_init = false;
1183
#endif
1184
1185
#ifdef CONFIG_CFI_CLANG
1186
u32 cfi_get_func_hash(void *func)
1187
{
1188
u32 hash;
1189
1190
func -= cfi_get_offset();
1191
switch (cfi_mode) {
1192
case CFI_FINEIBT:
1193
func += 7;
1194
break;
1195
case CFI_KCFI:
1196
func += 1;
1197
break;
1198
default:
1199
return 0;
1200
}
1201
1202
if (get_kernel_nofault(hash, func))
1203
return 0;
1204
1205
return hash;
1206
}
1207
1208
int cfi_get_func_arity(void *func)
1209
{
1210
bhi_thunk *target;
1211
s32 disp;
1212
1213
if (cfi_mode != CFI_FINEIBT && !cfi_bhi)
1214
return 0;
1215
1216
if (get_kernel_nofault(disp, func - 4))
1217
return 0;
1218
1219
target = func + disp;
1220
return target - __bhi_args;
1221
}
1222
#endif
1223
1224
#ifdef CONFIG_FINEIBT
1225
1226
static bool cfi_rand __ro_after_init = true;
1227
static u32 cfi_seed __ro_after_init;
1228
1229
/*
1230
* Re-hash the CFI hash with a boot-time seed while making sure the result is
1231
* not a valid ENDBR instruction.
1232
*/
1233
static u32 cfi_rehash(u32 hash)
1234
{
1235
hash ^= cfi_seed;
1236
while (unlikely(__is_endbr(hash) || __is_endbr(-hash))) {
1237
bool lsb = hash & 1;
1238
hash >>= 1;
1239
if (lsb)
1240
hash ^= 0x80200003;
1241
}
1242
return hash;
1243
}
1244
1245
static __init int cfi_parse_cmdline(char *str)
1246
{
1247
if (!str)
1248
return -EINVAL;
1249
1250
while (str) {
1251
char *next = strchr(str, ',');
1252
if (next) {
1253
*next = 0;
1254
next++;
1255
}
1256
1257
if (!strcmp(str, "auto")) {
1258
cfi_mode = CFI_AUTO;
1259
} else if (!strcmp(str, "off")) {
1260
cfi_mode = CFI_OFF;
1261
cfi_rand = false;
1262
} else if (!strcmp(str, "kcfi")) {
1263
cfi_mode = CFI_KCFI;
1264
} else if (!strcmp(str, "fineibt")) {
1265
cfi_mode = CFI_FINEIBT;
1266
} else if (!strcmp(str, "norand")) {
1267
cfi_rand = false;
1268
} else if (!strcmp(str, "warn")) {
1269
pr_alert("CFI mismatch non-fatal!\n");
1270
cfi_warn = true;
1271
} else if (!strcmp(str, "paranoid")) {
1272
if (cfi_mode == CFI_FINEIBT) {
1273
cfi_paranoid = true;
1274
} else {
1275
pr_err("Ignoring paranoid; depends on fineibt.\n");
1276
}
1277
} else if (!strcmp(str, "bhi")) {
1278
#ifdef CONFIG_FINEIBT_BHI
1279
if (cfi_mode == CFI_FINEIBT) {
1280
cfi_bhi = true;
1281
} else {
1282
pr_err("Ignoring bhi; depends on fineibt.\n");
1283
}
1284
#else
1285
pr_err("Ignoring bhi; depends on FINEIBT_BHI=y.\n");
1286
#endif
1287
} else {
1288
pr_err("Ignoring unknown cfi option (%s).", str);
1289
}
1290
1291
str = next;
1292
}
1293
1294
return 0;
1295
}
1296
early_param("cfi", cfi_parse_cmdline);
1297
1298
/*
1299
* kCFI FineIBT
1300
*
1301
* __cfi_\func: __cfi_\func:
1302
* movl $0x12345678,%eax // 5 endbr64 // 4
1303
* nop subl $0x12345678,%r10d // 7
1304
* nop jne __cfi_\func+6 // 2
1305
* nop nop3 // 3
1306
* nop
1307
* nop
1308
* nop
1309
* nop
1310
* nop
1311
* nop
1312
* nop
1313
* nop
1314
*
1315
*
1316
* caller: caller:
1317
* movl $(-0x12345678),%r10d // 6 movl $0x12345678,%r10d // 6
1318
* addl $-15(%r11),%r10d // 4 lea -0x10(%r11),%r11 // 4
1319
* je 1f // 2 nop4 // 4
1320
* ud2 // 2
1321
* 1: cs call __x86_indirect_thunk_r11 // 6 call *%r11; nop3; // 6
1322
*
1323
*/
1324
1325
/*
1326
* <fineibt_preamble_start>:
1327
* 0: f3 0f 1e fa endbr64
1328
* 4: 41 81 <ea> 78 56 34 12 sub $0x12345678, %r10d
1329
* b: 75 f9 jne 6 <fineibt_preamble_start+0x6>
1330
* d: 0f 1f 00 nopl (%rax)
1331
*
1332
* Note that the JNE target is the 0xEA byte inside the SUB, this decodes as
1333
* (bad) on x86_64 and raises #UD.
1334
*/
1335
asm( ".pushsection .rodata \n"
1336
"fineibt_preamble_start: \n"
1337
" endbr64 \n"
1338
" subl $0x12345678, %r10d \n"
1339
"fineibt_preamble_bhi: \n"
1340
" jne fineibt_preamble_start+6 \n"
1341
ASM_NOP3
1342
"fineibt_preamble_end: \n"
1343
".popsection\n"
1344
);
1345
1346
extern u8 fineibt_preamble_start[];
1347
extern u8 fineibt_preamble_bhi[];
1348
extern u8 fineibt_preamble_end[];
1349
1350
#define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start)
1351
#define fineibt_preamble_bhi (fineibt_preamble_bhi - fineibt_preamble_start)
1352
#define fineibt_preamble_ud 6
1353
#define fineibt_preamble_hash 7
1354
1355
/*
1356
* <fineibt_caller_start>:
1357
* 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d
1358
* 6: 4d 8d 5b f0 lea -0x10(%r11), %r11
1359
* a: 0f 1f 40 00 nopl 0x0(%rax)
1360
*/
1361
asm( ".pushsection .rodata \n"
1362
"fineibt_caller_start: \n"
1363
" movl $0x12345678, %r10d \n"
1364
" lea -0x10(%r11), %r11 \n"
1365
ASM_NOP4
1366
"fineibt_caller_end: \n"
1367
".popsection \n"
1368
);
1369
1370
extern u8 fineibt_caller_start[];
1371
extern u8 fineibt_caller_end[];
1372
1373
#define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start)
1374
#define fineibt_caller_hash 2
1375
1376
#define fineibt_caller_jmp (fineibt_caller_size - 2)
1377
1378
/*
1379
* Since FineIBT does hash validation on the callee side it is prone to
1380
* circumvention attacks where a 'naked' ENDBR instruction exists that
1381
* is not part of the fineibt_preamble sequence.
1382
*
1383
* Notably the x86 entry points must be ENDBR and equally cannot be
1384
* fineibt_preamble.
1385
*
1386
* The fineibt_paranoid caller sequence adds additional caller side
1387
* hash validation. This stops such circumvention attacks dead, but at the cost
1388
* of adding a load.
1389
*
1390
* <fineibt_paranoid_start>:
1391
* 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d
1392
* 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d
1393
* a: 4d 8d 5b <f0> lea -0x10(%r11), %r11
1394
* e: 75 fd jne d <fineibt_paranoid_start+0xd>
1395
* 10: 41 ff d3 call *%r11
1396
* 13: 90 nop
1397
*
1398
* Notably LEA does not modify flags and can be reordered with the CMP,
1399
* avoiding a dependency. Again, using a non-taken (backwards) branch
1400
* for the failure case, abusing LEA's immediate 0xf0 as LOCK prefix for the
1401
* Jcc.d8, causing #UD.
1402
*/
1403
asm( ".pushsection .rodata \n"
1404
"fineibt_paranoid_start: \n"
1405
" movl $0x12345678, %r10d \n"
1406
" cmpl -9(%r11), %r10d \n"
1407
" lea -0x10(%r11), %r11 \n"
1408
" jne fineibt_paranoid_start+0xd \n"
1409
"fineibt_paranoid_ind: \n"
1410
" call *%r11 \n"
1411
" nop \n"
1412
"fineibt_paranoid_end: \n"
1413
".popsection \n"
1414
);
1415
1416
extern u8 fineibt_paranoid_start[];
1417
extern u8 fineibt_paranoid_ind[];
1418
extern u8 fineibt_paranoid_end[];
1419
1420
#define fineibt_paranoid_size (fineibt_paranoid_end - fineibt_paranoid_start)
1421
#define fineibt_paranoid_ind (fineibt_paranoid_ind - fineibt_paranoid_start)
1422
#define fineibt_paranoid_ud 0xd
1423
1424
static u32 decode_preamble_hash(void *addr, int *reg)
1425
{
1426
u8 *p = addr;
1427
1428
/* b8+reg 78 56 34 12 movl $0x12345678,\reg */
1429
if (p[0] >= 0xb8 && p[0] < 0xc0) {
1430
if (reg)
1431
*reg = p[0] - 0xb8;
1432
return *(u32 *)(addr + 1);
1433
}
1434
1435
return 0; /* invalid hash value */
1436
}
1437
1438
static u32 decode_caller_hash(void *addr)
1439
{
1440
u8 *p = addr;
1441
1442
/* 41 ba 88 a9 cb ed mov $(-0x12345678),%r10d */
1443
if (p[0] == 0x41 && p[1] == 0xba)
1444
return -*(u32 *)(addr + 2);
1445
1446
/* eb 0c 88 a9 cb ed jmp.d8 +12 */
1447
if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp)
1448
return -*(u32 *)(addr + 2);
1449
1450
return 0; /* invalid hash value */
1451
}
1452
1453
/* .retpoline_sites */
1454
static int cfi_disable_callers(s32 *start, s32 *end)
1455
{
1456
/*
1457
* Disable kCFI by patching in a JMP.d8, this leaves the hash immediate
1458
* intact for later usage. Also see decode_caller_hash() and
1459
* cfi_rewrite_callers().
1460
*/
1461
const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp };
1462
s32 *s;
1463
1464
for (s = start; s < end; s++) {
1465
void *addr = (void *)s + *s;
1466
u32 hash;
1467
1468
addr -= fineibt_caller_size;
1469
hash = decode_caller_hash(addr);
1470
if (!hash) /* nocfi callers */
1471
continue;
1472
1473
text_poke_early(addr, jmp, 2);
1474
}
1475
1476
return 0;
1477
}
1478
1479
static int cfi_enable_callers(s32 *start, s32 *end)
1480
{
1481
/*
1482
* Re-enable kCFI, undo what cfi_disable_callers() did.
1483
*/
1484
const u8 mov[] = { 0x41, 0xba };
1485
s32 *s;
1486
1487
for (s = start; s < end; s++) {
1488
void *addr = (void *)s + *s;
1489
u32 hash;
1490
1491
addr -= fineibt_caller_size;
1492
hash = decode_caller_hash(addr);
1493
if (!hash) /* nocfi callers */
1494
continue;
1495
1496
text_poke_early(addr, mov, 2);
1497
}
1498
1499
return 0;
1500
}
1501
1502
/* .cfi_sites */
1503
static int cfi_rand_preamble(s32 *start, s32 *end)
1504
{
1505
s32 *s;
1506
1507
for (s = start; s < end; s++) {
1508
void *addr = (void *)s + *s;
1509
u32 hash;
1510
1511
hash = decode_preamble_hash(addr, NULL);
1512
if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
1513
addr, addr, 5, addr))
1514
return -EINVAL;
1515
1516
hash = cfi_rehash(hash);
1517
text_poke_early(addr + 1, &hash, 4);
1518
}
1519
1520
return 0;
1521
}
1522
1523
static void cfi_fineibt_bhi_preamble(void *addr, int arity)
1524
{
1525
if (!arity)
1526
return;
1527
1528
if (!cfi_warn && arity == 1) {
1529
/*
1530
* Crazy scheme to allow arity-1 inline:
1531
*
1532
* __cfi_foo:
1533
* 0: f3 0f 1e fa endbr64
1534
* 4: 41 81 <ea> 78 56 34 12 sub 0x12345678, %r10d
1535
* b: 49 0f 45 fa cmovne %r10, %rdi
1536
* f: 75 f5 jne __cfi_foo+6
1537
* 11: 0f 1f 00 nopl (%rax)
1538
*
1539
* Code that direct calls to foo()+0, decodes the tail end as:
1540
*
1541
* foo:
1542
* 0: f5 cmc
1543
* 1: 0f 1f 00 nopl (%rax)
1544
*
1545
* which clobbers CF, but does not affect anything ABI
1546
* wise.
1547
*
1548
* Notably, this scheme is incompatible with permissive CFI
1549
* because the CMOVcc is unconditional and RDI will have been
1550
* clobbered.
1551
*/
1552
const u8 magic[9] = {
1553
0x49, 0x0f, 0x45, 0xfa,
1554
0x75, 0xf5,
1555
BYTES_NOP3,
1556
};
1557
1558
text_poke_early(addr + fineibt_preamble_bhi, magic, 9);
1559
1560
return;
1561
}
1562
1563
text_poke_early(addr + fineibt_preamble_bhi,
1564
text_gen_insn(CALL_INSN_OPCODE,
1565
addr + fineibt_preamble_bhi,
1566
__bhi_args[arity]),
1567
CALL_INSN_SIZE);
1568
}
1569
1570
static int cfi_rewrite_preamble(s32 *start, s32 *end)
1571
{
1572
s32 *s;
1573
1574
for (s = start; s < end; s++) {
1575
void *addr = (void *)s + *s;
1576
int arity;
1577
u32 hash;
1578
1579
/*
1580
* When the function doesn't start with ENDBR the compiler will
1581
* have determined there are no indirect calls to it and we
1582
* don't need no CFI either.
1583
*/
1584
if (!is_endbr(addr + 16))
1585
continue;
1586
1587
hash = decode_preamble_hash(addr, &arity);
1588
if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
1589
addr, addr, 5, addr))
1590
return -EINVAL;
1591
1592
text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size);
1593
WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678);
1594
text_poke_early(addr + fineibt_preamble_hash, &hash, 4);
1595
1596
WARN_ONCE(!IS_ENABLED(CONFIG_FINEIBT_BHI) && arity,
1597
"kCFI preamble has wrong register at: %pS %*ph\n",
1598
addr, 5, addr);
1599
1600
if (cfi_bhi)
1601
cfi_fineibt_bhi_preamble(addr, arity);
1602
}
1603
1604
return 0;
1605
}
1606
1607
static void cfi_rewrite_endbr(s32 *start, s32 *end)
1608
{
1609
s32 *s;
1610
1611
for (s = start; s < end; s++) {
1612
void *addr = (void *)s + *s;
1613
1614
if (!exact_endbr(addr + 16))
1615
continue;
1616
1617
poison_endbr(addr + 16);
1618
}
1619
}
1620
1621
/* .retpoline_sites */
1622
static int cfi_rand_callers(s32 *start, s32 *end)
1623
{
1624
s32 *s;
1625
1626
for (s = start; s < end; s++) {
1627
void *addr = (void *)s + *s;
1628
u32 hash;
1629
1630
addr -= fineibt_caller_size;
1631
hash = decode_caller_hash(addr);
1632
if (hash) {
1633
hash = -cfi_rehash(hash);
1634
text_poke_early(addr + 2, &hash, 4);
1635
}
1636
}
1637
1638
return 0;
1639
}
1640
1641
static int emit_paranoid_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes)
1642
{
1643
u8 *thunk = (void *)__x86_indirect_its_thunk_array[reg] - 2;
1644
1645
#ifdef CONFIG_MITIGATION_ITS
1646
u8 *tmp = its_allocate_thunk(reg);
1647
if (tmp)
1648
thunk = tmp;
1649
#endif
1650
1651
return __emit_trampoline(addr, insn, bytes, thunk, thunk);
1652
}
1653
1654
static int cfi_rewrite_callers(s32 *start, s32 *end)
1655
{
1656
s32 *s;
1657
1658
BUG_ON(fineibt_paranoid_size != 20);
1659
1660
for (s = start; s < end; s++) {
1661
void *addr = (void *)s + *s;
1662
struct insn insn;
1663
u8 bytes[20];
1664
u32 hash;
1665
int ret;
1666
u8 op;
1667
1668
addr -= fineibt_caller_size;
1669
hash = decode_caller_hash(addr);
1670
if (!hash)
1671
continue;
1672
1673
if (!cfi_paranoid) {
1674
text_poke_early(addr, fineibt_caller_start, fineibt_caller_size);
1675
WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678);
1676
text_poke_early(addr + fineibt_caller_hash, &hash, 4);
1677
/* rely on apply_retpolines() */
1678
continue;
1679
}
1680
1681
/* cfi_paranoid */
1682
ret = insn_decode_kernel(&insn, addr + fineibt_caller_size);
1683
if (WARN_ON_ONCE(ret < 0))
1684
continue;
1685
1686
op = insn.opcode.bytes[0];
1687
if (op != CALL_INSN_OPCODE && op != JMP32_INSN_OPCODE) {
1688
WARN_ON_ONCE(1);
1689
continue;
1690
}
1691
1692
memcpy(bytes, fineibt_paranoid_start, fineibt_paranoid_size);
1693
memcpy(bytes + fineibt_caller_hash, &hash, 4);
1694
1695
if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + fineibt_paranoid_ind, 11)) {
1696
emit_paranoid_trampoline(addr + fineibt_caller_size,
1697
&insn, 11, bytes + fineibt_caller_size);
1698
} else {
1699
ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind);
1700
if (WARN_ON_ONCE(ret != 3))
1701
continue;
1702
}
1703
1704
text_poke_early(addr, bytes, fineibt_paranoid_size);
1705
}
1706
1707
return 0;
1708
}
1709
1710
static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
1711
s32 *start_cfi, s32 *end_cfi, bool builtin)
1712
{
1713
int ret;
1714
1715
if (WARN_ONCE(fineibt_preamble_size != 16,
1716
"FineIBT preamble wrong size: %ld", fineibt_preamble_size))
1717
return;
1718
1719
if (cfi_mode == CFI_AUTO) {
1720
cfi_mode = CFI_KCFI;
1721
if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) {
1722
/*
1723
* FRED has much saner context on exception entry and
1724
* is less easy to take advantage of.
1725
*/
1726
if (!cpu_feature_enabled(X86_FEATURE_FRED))
1727
cfi_paranoid = true;
1728
cfi_mode = CFI_FINEIBT;
1729
}
1730
}
1731
1732
/*
1733
* Rewrite the callers to not use the __cfi_ stubs, such that we might
1734
* rewrite them. This disables all CFI. If this succeeds but any of the
1735
* later stages fails, we're without CFI.
1736
*/
1737
ret = cfi_disable_callers(start_retpoline, end_retpoline);
1738
if (ret)
1739
goto err;
1740
1741
if (cfi_rand) {
1742
if (builtin) {
1743
cfi_seed = get_random_u32();
1744
cfi_bpf_hash = cfi_rehash(cfi_bpf_hash);
1745
cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash);
1746
}
1747
1748
ret = cfi_rand_preamble(start_cfi, end_cfi);
1749
if (ret)
1750
goto err;
1751
1752
ret = cfi_rand_callers(start_retpoline, end_retpoline);
1753
if (ret)
1754
goto err;
1755
}
1756
1757
switch (cfi_mode) {
1758
case CFI_OFF:
1759
if (builtin)
1760
pr_info("Disabling CFI\n");
1761
return;
1762
1763
case CFI_KCFI:
1764
ret = cfi_enable_callers(start_retpoline, end_retpoline);
1765
if (ret)
1766
goto err;
1767
1768
if (builtin)
1769
pr_info("Using kCFI\n");
1770
return;
1771
1772
case CFI_FINEIBT:
1773
/* place the FineIBT preamble at func()-16 */
1774
ret = cfi_rewrite_preamble(start_cfi, end_cfi);
1775
if (ret)
1776
goto err;
1777
1778
/* rewrite the callers to target func()-16 */
1779
ret = cfi_rewrite_callers(start_retpoline, end_retpoline);
1780
if (ret)
1781
goto err;
1782
1783
/* now that nobody targets func()+0, remove ENDBR there */
1784
cfi_rewrite_endbr(start_cfi, end_cfi);
1785
1786
if (builtin) {
1787
pr_info("Using %sFineIBT%s CFI\n",
1788
cfi_paranoid ? "paranoid " : "",
1789
cfi_bhi ? "+BHI" : "");
1790
}
1791
return;
1792
1793
default:
1794
break;
1795
}
1796
1797
err:
1798
pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n");
1799
}
1800
1801
static inline void poison_hash(void *addr)
1802
{
1803
*(u32 *)addr = 0;
1804
}
1805
1806
static void poison_cfi(void *addr)
1807
{
1808
/*
1809
* Compilers manage to be inconsistent with ENDBR vs __cfi prefixes,
1810
* some (static) functions for which they can determine the address
1811
* is never taken do not get a __cfi prefix, but *DO* get an ENDBR.
1812
*
1813
* As such, these functions will get sealed, but we need to be careful
1814
* to not unconditionally scribble the previous function.
1815
*/
1816
switch (cfi_mode) {
1817
case CFI_FINEIBT:
1818
/*
1819
* FineIBT prefix should start with an ENDBR.
1820
*/
1821
if (!is_endbr(addr))
1822
break;
1823
1824
/*
1825
* __cfi_\func:
1826
* osp nopl (%rax)
1827
* subl $0, %r10d
1828
* jz 1f
1829
* ud2
1830
* 1: nop
1831
*/
1832
poison_endbr(addr);
1833
poison_hash(addr + fineibt_preamble_hash);
1834
break;
1835
1836
case CFI_KCFI:
1837
/*
1838
* kCFI prefix should start with a valid hash.
1839
*/
1840
if (!decode_preamble_hash(addr, NULL))
1841
break;
1842
1843
/*
1844
* __cfi_\func:
1845
* movl $0, %eax
1846
* .skip 11, 0x90
1847
*/
1848
poison_hash(addr + 1);
1849
break;
1850
1851
default:
1852
break;
1853
}
1854
}
1855
1856
/*
1857
* When regs->ip points to a 0xEA byte in the FineIBT preamble,
1858
* return true and fill out target and type.
1859
*
1860
* We check the preamble by checking for the ENDBR instruction relative to the
1861
* 0xEA instruction.
1862
*/
1863
static bool decode_fineibt_preamble(struct pt_regs *regs, unsigned long *target, u32 *type)
1864
{
1865
unsigned long addr = regs->ip - fineibt_preamble_ud;
1866
u32 hash;
1867
1868
if (!exact_endbr((void *)addr))
1869
return false;
1870
1871
*target = addr + fineibt_preamble_size;
1872
1873
__get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault);
1874
*type = (u32)regs->r10 + hash;
1875
1876
/*
1877
* Since regs->ip points to the middle of an instruction; it cannot
1878
* continue with the normal fixup.
1879
*/
1880
regs->ip = *target;
1881
1882
return true;
1883
1884
Efault:
1885
return false;
1886
}
1887
1888
/*
1889
* regs->ip points to one of the UD2 in __bhi_args[].
1890
*/
1891
static bool decode_fineibt_bhi(struct pt_regs *regs, unsigned long *target, u32 *type)
1892
{
1893
unsigned long addr;
1894
u32 hash;
1895
1896
if (!cfi_bhi)
1897
return false;
1898
1899
if (regs->ip < (unsigned long)__bhi_args ||
1900
regs->ip >= (unsigned long)__bhi_args_end)
1901
return false;
1902
1903
/*
1904
* Fetch the return address from the stack, this points to the
1905
* FineIBT preamble. Since the CALL instruction is in the 5 last
1906
* bytes of the preamble, the return address is in fact the target
1907
* address.
1908
*/
1909
__get_kernel_nofault(&addr, regs->sp, unsigned long, Efault);
1910
*target = addr;
1911
1912
addr -= fineibt_preamble_size;
1913
if (!exact_endbr((void *)addr))
1914
return false;
1915
1916
__get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault);
1917
*type = (u32)regs->r10 + hash;
1918
1919
/*
1920
* The UD2 sites are constructed with a RET immediately following,
1921
* as such the non-fatal case can use the regular fixup.
1922
*/
1923
return true;
1924
1925
Efault:
1926
return false;
1927
}
1928
1929
static bool is_paranoid_thunk(unsigned long addr)
1930
{
1931
u32 thunk;
1932
1933
__get_kernel_nofault(&thunk, (u32 *)addr, u32, Efault);
1934
return (thunk & 0x00FFFFFF) == 0xfd75ea;
1935
1936
Efault:
1937
return false;
1938
}
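/*
 * The low 24 bits of the (little-endian) load above are the bytes
 * 0xea, 0x75, 0xfd, i.e. the "<ea> (bad); jne -3" prologue that
 * its_init_thunk() writes in front of the paranoid ITS thunk.
 */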
1939
1940
/*
1941
* regs->ip points to a LOCK Jcc.d8 instruction from the fineibt_paranoid_start[]
1942
* sequence, or to an invalid instruction (0xea) + Jcc.d8 for cfi_paranoid + ITS
1943
* thunk.
1944
*/
1945
static bool decode_fineibt_paranoid(struct pt_regs *regs, unsigned long *target, u32 *type)
1946
{
1947
unsigned long addr = regs->ip - fineibt_paranoid_ud;
1948
1949
if (!cfi_paranoid)
1950
return false;
1951
1952
if (is_cfi_trap(addr + fineibt_caller_size - LEN_UD2)) {
1953
*target = regs->r11 + fineibt_preamble_size;
1954
*type = regs->r10;
1955
1956
/*
1957
* Since the trapping instruction is the exact, but LOCK prefixed,
1958
* Jcc.d8 that got us here, the normal fixup will work.
1959
*/
1960
return true;
1961
}
1962
1963
/*
1964
* The cfi_paranoid + ITS thunk combination results in:
1965
*
1966
* 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d
1967
* 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d
1968
* a: 4d 8d 5b f0 lea -0x10(%r11), %r11
1969
* e: 2e e8 XX XX XX XX cs call __x86_indirect_paranoid_thunk_r11
1970
*
1971
* Where the paranoid_thunk looks like:
1972
*
1973
* 1d: <ea> (bad)
1974
* __x86_indirect_paranoid_thunk_r11:
1975
* 1e: 75 fd jne 1d
1976
* __x86_indirect_its_thunk_r11:
1977
* 20: 41 ff e3 jmp *%r11
1978
* 23: cc int3
1979
*
1980
*/
1981
if (is_paranoid_thunk(regs->ip)) {
1982
*target = regs->r11 + fineibt_preamble_size;
1983
*type = regs->r10;
1984
1985
regs->ip = *target;
1986
return true;
1987
}
1988
1989
return false;
1990
}
1991
1992
bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type)
1993
{
1994
if (decode_fineibt_paranoid(regs, target, type))
1995
return true;
1996
1997
if (decode_fineibt_bhi(regs, target, type))
1998
return true;
1999
2000
return decode_fineibt_preamble(regs, target, type);
2001
}
2002
2003
#else /* !CONFIG_FINEIBT: */
2004
2005
static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
2006
s32 *start_cfi, s32 *end_cfi, bool builtin)
2007
{
2008
}
2009
2010
#ifdef CONFIG_X86_KERNEL_IBT
2011
static void poison_cfi(void *addr) { }
2012
#endif
2013
2014
#endif /* !CONFIG_FINEIBT */
2015
2016
void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
2017
s32 *start_cfi, s32 *end_cfi)
2018
{
2019
return __apply_fineibt(start_retpoline, end_retpoline,
2020
start_cfi, end_cfi,
2021
/* .builtin = */ false);
2022
}
2023
2024
#ifdef CONFIG_SMP
2025
static void alternatives_smp_lock(const s32 *start, const s32 *end,
2026
u8 *text, u8 *text_end)
2027
{
2028
const s32 *poff;
2029
2030
for (poff = start; poff < end; poff++) {
2031
u8 *ptr = (u8 *)poff + *poff;
2032
2033
if (!*poff || ptr < text || ptr >= text_end)
2034
continue;
2035
/* turn DS segment override prefix into lock prefix */
2036
if (*ptr == 0x3e)
2037
text_poke(ptr, ((unsigned char []){0xf0}), 1);
2038
}
2039
}
2040
2041
static void alternatives_smp_unlock(const s32 *start, const s32 *end,
2042
u8 *text, u8 *text_end)
2043
{
2044
const s32 *poff;
2045
2046
for (poff = start; poff < end; poff++) {
2047
u8 *ptr = (u8 *)poff + *poff;
2048
2049
if (!*poff || ptr < text || ptr >= text_end)
2050
continue;
2051
/* turn lock prefix into DS segment override prefix */
2052
if (*ptr == 0xf0)
2053
text_poke(ptr, ((unsigned char []){0x3E}), 1);
2054
}
2055
}
2056
2057
struct smp_alt_module {
2058
/* what is this ??? */
2059
struct module *mod;
2060
char *name;
2061
2062
/* ptrs to lock prefixes */
2063
const s32 *locks;
2064
const s32 *locks_end;
2065
2066
/* .text segment, needed to avoid patching init code ;) */
2067
u8 *text;
2068
u8 *text_end;
2069
2070
struct list_head next;
2071
};
2072
static LIST_HEAD(smp_alt_modules);
2073
static bool uniproc_patched = false; /* protected by text_mutex */
2074
2075
void __init_or_module alternatives_smp_module_add(struct module *mod,
2076
char *name,
2077
void *locks, void *locks_end,
2078
void *text, void *text_end)
2079
{
2080
struct smp_alt_module *smp;
2081
2082
mutex_lock(&text_mutex);
2083
if (!uniproc_patched)
2084
goto unlock;
2085
2086
if (num_possible_cpus() == 1)
2087
/* Don't bother remembering, we'll never have to undo it. */
2088
goto smp_unlock;
2089
2090
smp = kzalloc(sizeof(*smp), GFP_KERNEL);
2091
if (NULL == smp)
2092
/* we'll run the (safe but slow) SMP code then ... */
2093
goto unlock;
2094
2095
smp->mod = mod;
2096
smp->name = name;
2097
smp->locks = locks;
2098
smp->locks_end = locks_end;
2099
smp->text = text;
2100
smp->text_end = text_end;
2101
DPRINTK(SMP, "locks %p -> %p, text %p -> %p, name %s\n",
2102
smp->locks, smp->locks_end,
2103
smp->text, smp->text_end, smp->name);
2104
2105
list_add_tail(&smp->next, &smp_alt_modules);
2106
smp_unlock:
2107
alternatives_smp_unlock(locks, locks_end, text, text_end);
2108
unlock:
2109
mutex_unlock(&text_mutex);
2110
}

void __init_or_module alternatives_smp_module_del(struct module *mod)
{
	struct smp_alt_module *item;

	mutex_lock(&text_mutex);
	list_for_each_entry(item, &smp_alt_modules, next) {
		if (mod != item->mod)
			continue;
		list_del(&item->next);
		kfree(item);
		break;
	}
	mutex_unlock(&text_mutex);
}

void alternatives_enable_smp(void)
{
	struct smp_alt_module *mod;

	/* Why bother if there are no other CPUs? */
	BUG_ON(num_possible_cpus() == 1);

	mutex_lock(&text_mutex);

	if (uniproc_patched) {
		pr_info("switching to SMP code\n");
		BUG_ON(num_online_cpus() != 1);
		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
		clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
		list_for_each_entry(mod, &smp_alt_modules, next)
			alternatives_smp_lock(mod->locks, mod->locks_end,
					      mod->text, mod->text_end);
		uniproc_patched = false;
	}
	mutex_unlock(&text_mutex);
}

/*
 * Return 1 if the address range is reserved for SMP-alternatives.
 * Must hold text_mutex.
 */
int alternatives_text_reserved(void *start, void *end)
{
	struct smp_alt_module *mod;
	const s32 *poff;
	u8 *text_start = start;
	u8 *text_end = end;

	lockdep_assert_held(&text_mutex);

	list_for_each_entry(mod, &smp_alt_modules, next) {
		if (mod->text > text_end || mod->text_end < text_start)
			continue;
		for (poff = mod->locks; poff < mod->locks_end; poff++) {
			const u8 *ptr = (const u8 *)poff + *poff;

			if (text_start <= ptr && text_end > ptr)
				return 1;
		}
	}

	return 0;
}
#endif /* CONFIG_SMP */

/*
 * Self-test for the INT3 based CALL emulation code.
 *
 * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
 * properly and that there is a stack gap between the INT3 frame and the
 * previous context. Without this gap doing a virtual PUSH on the interrupted
 * stack would corrupt the INT3 IRET frame.
 *
 * See entry_{32,64}.S for more details.
 */

/*
 * We define the int3_magic() function in assembly to control the calling
 * convention such that we can 'call' it from assembly.
 */

extern void int3_magic(unsigned int *ptr); /* defined in asm */

asm (
" .pushsection .init.text, \"ax\", @progbits\n"
" .type int3_magic, @function\n"
"int3_magic:\n"
	ANNOTATE_NOENDBR
" movl $1, (%" _ASM_ARG1 ")\n"
	ASM_RET
" .size int3_magic, .-int3_magic\n"
" .popsection\n"
);

extern void int3_selftest_ip(void); /* defined in asm below */

static int __init
int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
{
	unsigned long selftest = (unsigned long)&int3_selftest_ip;
	struct die_args *args = data;
	struct pt_regs *regs = args->regs;

	OPTIMIZER_HIDE_VAR(selftest);

	if (!regs || user_mode(regs))
		return NOTIFY_DONE;

	if (val != DIE_INT3)
		return NOTIFY_DONE;

	if (regs->ip - INT3_INSN_SIZE != selftest)
		return NOTIFY_DONE;

	int3_emulate_call(regs, (unsigned long)&int3_magic);
	return NOTIFY_STOP;
}

/* Must be noinline to ensure uniqueness of int3_selftest_ip. */
static noinline void __init int3_selftest(void)
{
	static __initdata struct notifier_block int3_exception_nb = {
		.notifier_call = int3_exception_notify,
		.priority = INT_MAX-1, /* last */
	};
	unsigned int val = 0;

	BUG_ON(register_die_notifier(&int3_exception_nb));

	/*
	 * Basically: int3_magic(&val); but really complicated :-)
	 *
	 * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb
	 * notifier above will emulate CALL for us.
	 */
	asm volatile ("int3_selftest_ip:\n\t"
		      ANNOTATE_NOENDBR
		      " int3; nop; nop; nop; nop\n\t"
		      : ASM_CALL_CONSTRAINT
		      : __ASM_SEL_RAW(a, D) (&val)
		      : "memory");

	BUG_ON(val != 1);

	unregister_die_notifier(&int3_exception_nb);
}

static __initdata int __alt_reloc_selftest_addr;

extern void __init __alt_reloc_selftest(void *arg);
__visible noinline void __init __alt_reloc_selftest(void *arg)
{
	WARN_ON(arg != &__alt_reloc_selftest_addr);
}

static noinline void __init alt_reloc_selftest(void)
{
	/*
	 * Tests text_poke_apply_relocation().
	 *
	 * This has a relative immediate (CALL) in a place other than the first
	 * instruction and additionally on x86_64 we get a RIP-relative LEA:
	 *
	 *   lea 0x0(%rip),%rdi	# 5d0: R_X86_64_PC32  .init.data+0x5566c
	 *   call +0		# 5d5: R_X86_64_PLT32 __alt_reloc_selftest-0x4
	 *
	 * Getting this wrong will either crash and burn or tickle the WARN
	 * above.
	 */
	asm_inline volatile (
		ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS)
		: ASM_CALL_CONSTRAINT
		: [mem] "m" (__alt_reloc_selftest_addr)
		: _ASM_ARG1
	);
}

void __init alternative_instructions(void)
{
	u64 ibt;

	int3_selftest();

	/*
	 * The patching is not fully atomic, so try to avoid local
	 * interruptions that might execute the code being patched.
	 * Other CPUs are not running.
	 */
	stop_nmi();

	/*
	 * Don't stop machine check exceptions while patching.
	 * MCEs only happen when something got corrupted and in this
	 * case we must do something about the corruption.
	 * Ignoring it is worse than an unlikely patching race.
	 * Also machine checks tend to be broadcast and if one CPU
	 * goes into machine check the others follow quickly, so we don't
	 * expect a machine check to cause undue problems during code
	 * patching.
	 */

	/*
	 * Make sure to set (artificial) features depending on used paravirt
	 * functions which can later influence alternative patching.
	 */
	paravirt_set_cap();

	/* Keep CET-IBT disabled until caller/callee are patched */
	ibt = ibt_save(/*disable*/ true);

	__apply_fineibt(__retpoline_sites, __retpoline_sites_end,
			__cfi_sites, __cfi_sites_end, true);

	/*
	 * Rewrite the retpolines, must be done before alternatives since
	 * those can rewrite the retpoline thunks.
	 */
	apply_retpolines(__retpoline_sites, __retpoline_sites_end);
	apply_returns(__return_sites, __return_sites_end);

	its_fini_core();

	/*
	 * Adjust all CALL instructions to point to func()-10, including
	 * those in .altinstr_replacement.
	 */
	callthunks_patch_builtin_calls();

	apply_alternatives(__alt_instructions, __alt_instructions_end);

	/*
	 * Seal all functions that do not have their address taken.
	 */
	apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);

	ibt_restore(ibt);

#ifdef CONFIG_SMP
	/* Patch to UP if other cpus not imminent. */
	if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
		uniproc_patched = true;
		alternatives_smp_module_add(NULL, "core kernel",
					    __smp_locks, __smp_locks_end,
					    _text, _etext);
	}

	if (!uniproc_patched || num_possible_cpus() == 1) {
		free_init_pages("SMP alternatives",
				(unsigned long)__smp_locks,
				(unsigned long)__smp_locks_end);
	}
#endif

	restart_nmi();
	alternatives_patched = 1;

	alt_reloc_selftest();
}
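
/*
 * Editor's summary (not part of the upstream file) of the boot-time
 * patching order implemented above:
 *
 *	1) int3_selftest()			- verify INT3 CALL emulation
 *	2) paravirt_set_cap()			- set synthetic feature bits
 *	3) __apply_fineibt(), apply_retpolines(), apply_returns()
 *	4) callthunks_patch_builtin_calls()
 *	5) apply_alternatives()
 *	6) apply_seal_endbr()			- IBT is saved/disabled around 3-6
 *	7) optional UP patching of SMP lock prefixes (CONFIG_SMP)
 *	8) alternatives_patched = 1, then alt_reloc_selftest()
 *
 * NMIs are stopped via stop_nmi()/restart_nmi() while the text is rewritten.
 */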

/**
 * text_poke_early - Update instructions on a live kernel at boot time
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * When you use this code to patch more than one byte of an instruction
 * you need to make sure that other CPUs cannot execute this code in parallel.
 * Also no thread must be currently preempted in the middle of these
 * instructions. And on the local CPU you need to be protected against NMI or
 * MCE handlers seeing an inconsistent instruction while you patch.
 */
void __init_or_module text_poke_early(void *addr, const void *opcode,
				      size_t len)
{
	unsigned long flags;

	if (boot_cpu_has(X86_FEATURE_NX) &&
	    is_module_text_address((unsigned long)addr)) {
		/*
		 * Modules text is marked initially as non-executable, so the
		 * code cannot be running and speculative code-fetches are
		 * prevented. Just change the code.
		 */
		memcpy(addr, opcode, len);
	} else {
		local_irq_save(flags);
		memcpy(addr, opcode, len);
		sync_core();
		local_irq_restore(flags);

		/*
		 * Could also do a CLFLUSH here to speed up CPU recovery; but
		 * that causes hangs on some VIA CPUs.
		 */
	}
}
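
/*
 * Illustrative sketch (editor's addition, not part of the upstream file):
 * a boot-time or module-load-time caller padding a hypothetical
 * "patch_site" with a 5-byte NOP via text_poke_early(). Real callers are
 * typically the boot-time apply_*() routines, at a point where no other
 * CPU can be executing the bytes being rewritten.
 */
static void __init __maybe_unused text_poke_early_sketch(void *patch_site)
{
	text_poke_early(patch_site, x86_nops[5], 5);
}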

__ro_after_init struct mm_struct *text_poke_mm;
__ro_after_init unsigned long text_poke_mm_addr;

static void text_poke_memcpy(void *dst, const void *src, size_t len)
{
	memcpy(dst, src, len);
}

static void text_poke_memset(void *dst, const void *src, size_t len)
{
	int c = *(const int *)src;

	memset(dst, c, len);
}

typedef void text_poke_f(void *dst, const void *src, size_t len);

static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len)
{
	bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
	struct page *pages[2] = {NULL};
	struct mm_struct *prev_mm;
	unsigned long flags;
	pte_t pte, *ptep;
	spinlock_t *ptl;
	pgprot_t pgprot;

	/*
	 * While boot memory allocator is running we cannot use struct pages as
	 * they are not yet initialized. There is no way to recover.
	 */
	BUG_ON(!after_bootmem);

	if (!core_kernel_text((unsigned long)addr)) {
		pages[0] = vmalloc_to_page(addr);
		if (cross_page_boundary)
			pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
	} else {
		pages[0] = virt_to_page(addr);
		WARN_ON(!PageReserved(pages[0]));
		if (cross_page_boundary)
			pages[1] = virt_to_page(addr + PAGE_SIZE);
	}
	/*
	 * If something went wrong, crash and burn since recovery paths are not
	 * implemented.
	 */
	BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));

	/*
	 * Map the page without the global bit, as TLB flushing is done with
	 * flush_tlb_mm_range(), which is intended for non-global PTEs.
	 */
	pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);

	/*
	 * The lock is not really needed, but this allows us to avoid open-coding.
	 */
	ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl);

	/*
	 * This must not fail; preallocated in poking_init().
	 */
	VM_BUG_ON(!ptep);

	local_irq_save(flags);

	pte = mk_pte(pages[0], pgprot);
	set_pte_at(text_poke_mm, text_poke_mm_addr, ptep, pte);

	if (cross_page_boundary) {
		pte = mk_pte(pages[1], pgprot);
		set_pte_at(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1, pte);
	}

	/*
	 * Loading the temporary mm behaves as a compiler barrier, which
	 * guarantees that the PTE will be set at the time memcpy() is done.
	 */
	prev_mm = use_temporary_mm(text_poke_mm);

	kasan_disable_current();
	func((u8 *)text_poke_mm_addr + offset_in_page(addr), src, len);
	kasan_enable_current();

	/*
	 * Ensure that the PTE is only cleared after the instructions of memcpy
	 * were issued by using a compiler barrier.
	 */
	barrier();

	pte_clear(text_poke_mm, text_poke_mm_addr, ptep);
	if (cross_page_boundary)
		pte_clear(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1);

	/*
	 * Loading the previous page-table hierarchy requires a serializing
	 * instruction that already allows the core to see the updated version.
	 * Xen-PV is assumed to serialize execution in a similar manner.
	 */
	unuse_temporary_mm(prev_mm);

	/*
	 * Flushing the TLB might involve IPIs, which would require enabled
	 * IRQs, but not if the mm is not used, as is the case at this point.
	 */
	flush_tlb_mm_range(text_poke_mm, text_poke_mm_addr, text_poke_mm_addr +
			   (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
			   PAGE_SHIFT, false);

	if (func == text_poke_memcpy) {
		/*
		 * If the text does not match what we just wrote then something is
		 * fundamentally screwy; there's nothing we can really do about that.
		 */
		BUG_ON(memcmp(addr, src, len));
	}

	local_irq_restore(flags);
	pte_unmap_unlock(ptep, ptl);
	return addr;
}
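
/*
 * Editor's summary (not part of the upstream file) of the __text_poke()
 * flow above: the target page(s) are aliased writable at text_poke_mm_addr
 * inside the dedicated text_poke_mm, that temporary mm is switched in with
 * IRQs off, the write goes through the alias, the PTEs are torn down, the
 * previous mm is restored and the temporary mapping is flushed from the
 * TLB. The regular kernel mapping of the text stays read-only throughout.
 */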

/**
 * text_poke - Update instructions on a live kernel
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * It means the size must be writable atomically and the address must be aligned
 * in a way that permits an atomic write. It also makes sure we fit on a single
 * page.
 *
 * Note that the caller must ensure that if the modified code is part of a
 * module, the module would not be removed during poking. This can be achieved
 * by registering a module notifier, and ordering module removal and patching
 * through a mutex.
 */
void *text_poke(void *addr, const void *opcode, size_t len)
{
	lockdep_assert_held(&text_mutex);

	return __text_poke(text_poke_memcpy, addr, opcode, len);
}
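
/*
 * Illustrative sketch (editor's addition, not part of the upstream file):
 * text_poke() must run under text_mutex, as the lockdep assertion above
 * enforces. For example, rewriting a LOCK prefix into a DS segment
 * override the way alternatives_smp_unlock() does; "lock_prefix" is a
 * hypothetical address of such a prefix byte.
 */
static void __maybe_unused text_poke_sketch(u8 *lock_prefix)
{
	const u8 ds_override = 0x3e;

	mutex_lock(&text_mutex);
	text_poke(lock_prefix, &ds_override, 1);
	mutex_unlock(&text_mutex);
}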

/**
 * text_poke_kgdb - Update instructions on a live kernel by kgdb
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * It means the size must be writable atomically and the address must be aligned
 * in a way that permits an atomic write. It also makes sure we fit on a single
 * page.
 *
 * Context: should only be used by kgdb, which ensures no other core is running,
 *          despite the fact it does not hold the text_mutex.
 */
void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
{
	return __text_poke(text_poke_memcpy, addr, opcode, len);
}

void *text_poke_copy_locked(void *addr, const void *opcode, size_t len,
			    bool core_ok)
{
	unsigned long start = (unsigned long)addr;
	size_t patched = 0;

	if (WARN_ON_ONCE(!core_ok && core_kernel_text(start)))
		return NULL;

	while (patched < len) {
		unsigned long ptr = start + patched;
		size_t s;

		s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);

		__text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s);
		patched += s;
	}
	return addr;
}

/**
 * text_poke_copy - Copy instructions into (an unused part of) RX memory
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy, could be more than 2x PAGE_SIZE
 *
 * Not safe against concurrent execution; useful for JITs to dump
 * new code blocks into unused regions of RX memory. Can be used in
 * conjunction with synchronize_rcu_tasks() to wait for existing
 * execution to quiesce after having made sure no existing function
 * pointers are live.
 */
void *text_poke_copy(void *addr, const void *opcode, size_t len)
{
	mutex_lock(&text_mutex);
	addr = text_poke_copy_locked(addr, opcode, len, false);
	mutex_unlock(&text_mutex);
	return addr;
}

/**
 * text_poke_set - memset into (an unused part of) RX memory
 * @addr: address to modify
 * @c: the byte to fill the area with
 * @len: length to copy, could be more than 2x PAGE_SIZE
 *
 * This is useful to overwrite unused regions of RX memory with illegal
 * instructions.
 */
void *text_poke_set(void *addr, int c, size_t len)
{
	unsigned long start = (unsigned long)addr;
	size_t patched = 0;

	if (WARN_ON_ONCE(core_kernel_text(start)))
		return NULL;

	mutex_lock(&text_mutex);
	while (patched < len) {
		unsigned long ptr = start + patched;
		size_t s;

		s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);

		__text_poke(text_poke_memset, (void *)ptr, (void *)&c, s);
		patched += s;
	}
	mutex_unlock(&text_mutex);
	return addr;
}
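
/*
 * Illustrative sketch (editor's addition, not part of the upstream file):
 * a JIT-like user first poisons a freshly allocated, not-yet-executed RX
 * region with INT3 and then copies a generated image into it, using the
 * two helpers above. "rx_buf", "rx_size", "image" and "image_size" are
 * hypothetical.
 */
static __maybe_unused void *text_poke_fill_sketch(void *rx_buf, size_t rx_size,
						  const u8 *image, size_t image_size)
{
	/* Fill the whole region with INT3 so stray jumps trap. */
	if (!text_poke_set(rx_buf, INT3_INSN_OPCODE, rx_size))
		return NULL;

	/* Not safe against concurrent execution of rx_buf. */
	return text_poke_copy(rx_buf, image, image_size);
}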

static void do_sync_core(void *info)
{
	sync_core();
}

void smp_text_poke_sync_each_cpu(void)
{
	on_each_cpu(do_sync_core, NULL, 1);
}

/*
 * NOTE: crazy scheme to allow patching Jcc.d32 without increasing the size
 * of this struct. When len == 6 everything is prefixed with 0x0f and we map
 * opcode to Jcc.d8, using len to distinguish.
 */
struct smp_text_poke_loc {
	/* addr := _stext + rel_addr */
	s32 rel_addr;
	s32 disp;
	u8 len;
	u8 opcode;
	const u8 text[TEXT_POKE_MAX_OPCODE_SIZE];
	/* see smp_text_poke_batch_finish() */
	u8 old;
};
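
/*
 * Editor's worked example (not part of the upstream file) of the len == 6
 * scheme described above, for a Jcc.d32 such as "jnz rel32":
 *
 *	new instruction:  0f 85 <imm32>			(6 bytes)
 *	stored as:        len = 6, text[] = 85 <imm32>	(leading 0x0f dropped),
 *	                  opcode = 0x75 (jnz, Jcc.d8 form), disp = imm32
 *
 * smp_text_poke_batch_finish() re-adds the 0x0f prefix when writing the
 * instruction tail and the first byte, and smp_text_poke_int3_handler()
 * emulates the branch via int3_emulate_jcc() while the INT3 is in place.
 */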

#define TEXT_POKE_ARRAY_MAX (PAGE_SIZE / sizeof(struct smp_text_poke_loc))

static struct smp_text_poke_array {
	struct smp_text_poke_loc vec[TEXT_POKE_ARRAY_MAX];
	int nr_entries;
} text_poke_array;

static DEFINE_PER_CPU(atomic_t, text_poke_array_refs);

/*
 * These four __always_inline annotations imply noinstr, necessary
 * due to smp_text_poke_int3_handler() being noinstr:
 */

static __always_inline bool try_get_text_poke_array(void)
{
	atomic_t *refs = this_cpu_ptr(&text_poke_array_refs);

	if (!raw_atomic_inc_not_zero(refs))
		return false;

	return true;
}

static __always_inline void put_text_poke_array(void)
{
	atomic_t *refs = this_cpu_ptr(&text_poke_array_refs);

	smp_mb__before_atomic();
	raw_atomic_dec(refs);
}

static __always_inline void *text_poke_addr(const struct smp_text_poke_loc *tpl)
{
	return _stext + tpl->rel_addr;
}

static __always_inline int patch_cmp(const void *tpl_a, const void *tpl_b)
{
	if (tpl_a < text_poke_addr(tpl_b))
		return -1;
	if (tpl_a > text_poke_addr(tpl_b))
		return 1;
	return 0;
}

noinstr int smp_text_poke_int3_handler(struct pt_regs *regs)
{
	struct smp_text_poke_loc *tpl;
	int ret = 0;
	void *ip;

	if (user_mode(regs))
		return 0;

	/*
	 * Having observed our INT3 instruction, we now must observe
	 * text_poke_array with non-zero refcount:
	 *
	 *	text_poke_array_refs = 1	INT3
	 *	WMB				RMB
	 *	write INT3			if (text_poke_array_refs != 0)
	 */
	smp_rmb();

	if (!try_get_text_poke_array())
		return 0;

	/*
	 * Discount the INT3. See smp_text_poke_batch_finish().
	 */
	ip = (void *) regs->ip - INT3_INSN_SIZE;

	/*
	 * Skip the binary search if there is a single member in the vector.
	 */
	if (unlikely(text_poke_array.nr_entries > 1)) {
		tpl = __inline_bsearch(ip, text_poke_array.vec, text_poke_array.nr_entries,
				       sizeof(struct smp_text_poke_loc),
				       patch_cmp);
		if (!tpl)
			goto out_put;
	} else {
		tpl = text_poke_array.vec;
		if (text_poke_addr(tpl) != ip)
			goto out_put;
	}

	ip += tpl->len;

	switch (tpl->opcode) {
	case INT3_INSN_OPCODE:
		/*
		 * Someone poked an explicit INT3, they'll want to handle it,
		 * do not consume.
		 */
		goto out_put;

	case RET_INSN_OPCODE:
		int3_emulate_ret(regs);
		break;

	case CALL_INSN_OPCODE:
		int3_emulate_call(regs, (long)ip + tpl->disp);
		break;

	case JMP32_INSN_OPCODE:
	case JMP8_INSN_OPCODE:
		int3_emulate_jmp(regs, (long)ip + tpl->disp);
		break;

	case 0x70 ... 0x7f: /* Jcc */
		int3_emulate_jcc(regs, tpl->opcode & 0xf, (long)ip, tpl->disp);
		break;

	default:
		BUG();
	}

	ret = 1;

out_put:
	put_text_poke_array();
	return ret;
}

/**
 * smp_text_poke_batch_finish() -- update instructions on live kernel on SMP
 *
 * Input state:
 *  text_poke_array.vec: vector of instructions to patch
 *  text_poke_array.nr_entries: number of entries in the vector
 *
 * Modify multi-byte instructions by using INT3 breakpoints on SMP.
 * We completely avoid using stop_machine() here, and achieve the
 * synchronization using INT3 breakpoints and SMP cross-calls.
 *
 * The way it is done:
 *	- For each entry in the vector:
 *		- add an INT3 trap to the address that will be patched
 *	- SMP sync all CPUs
 *	- For each entry in the vector:
 *		- update all but the first byte of the patched range
 *	- SMP sync all CPUs
 *	- For each entry in the vector:
 *		- replace the first byte (INT3) by the first byte of the
 *		  replacing opcode
 *	- SMP sync all CPUs
 */
void smp_text_poke_batch_finish(void)
{
	unsigned char int3 = INT3_INSN_OPCODE;
	unsigned int i;
	int do_sync;

	if (!text_poke_array.nr_entries)
		return;

	lockdep_assert_held(&text_mutex);

	/*
	 * Corresponds to the implicit memory barrier in try_get_text_poke_array() to
	 * ensure reading a non-zero refcount provides up to date text_poke_array data.
	 */
	for_each_possible_cpu(i)
		atomic_set_release(per_cpu_ptr(&text_poke_array_refs, i), 1);

	/*
	 * Function tracing can enable thousands of places that need to be
	 * updated. This can take quite some time, and with full kernel debugging
	 * enabled, this could cause the softlockup watchdog to trigger.
	 * This function gets called every 256 entries added to be patched.
	 * Call cond_resched() here to make sure that other tasks can get scheduled
	 * while processing all the functions being patched.
	 */
	cond_resched();

	/*
	 * Corresponding read barrier in INT3 notifier for making sure the
	 * text_poke_array.nr_entries and handler are correctly ordered wrt. patching.
	 */
	smp_wmb();

	/*
	 * First step: add an INT3 trap to the address that will be patched.
	 */
	for (i = 0; i < text_poke_array.nr_entries; i++) {
		text_poke_array.vec[i].old = *(u8 *)text_poke_addr(&text_poke_array.vec[i]);
		text_poke(text_poke_addr(&text_poke_array.vec[i]), &int3, INT3_INSN_SIZE);
	}

	smp_text_poke_sync_each_cpu();

	/*
	 * Second step: update all but the first byte of the patched range.
	 */
	for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) {
		u8 old[TEXT_POKE_MAX_OPCODE_SIZE+1] = { text_poke_array.vec[i].old, };
		u8 _new[TEXT_POKE_MAX_OPCODE_SIZE+1];
		const u8 *new = text_poke_array.vec[i].text;
		int len = text_poke_array.vec[i].len;

		if (len - INT3_INSN_SIZE > 0) {
			memcpy(old + INT3_INSN_SIZE,
			       text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE,
			       len - INT3_INSN_SIZE);

			if (len == 6) {
				_new[0] = 0x0f;
				memcpy(_new + 1, new, 5);
				new = _new;
			}

			text_poke(text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE,
				  new + INT3_INSN_SIZE,
				  len - INT3_INSN_SIZE);

			do_sync++;
		}

		/*
		 * Emit a perf event to record the text poke, primarily to
		 * support Intel PT decoding which must walk the executable code
		 * to reconstruct the trace. The flow up to here is:
		 *   - write INT3 byte
		 *   - IPI-SYNC
		 *   - write instruction tail
		 * At this point the actual control flow will be through the
		 * INT3 and handler and not hit the old or new instruction.
		 * Intel PT outputs FUP/TIP packets for the INT3, so the flow
		 * can still be decoded. Subsequently:
		 *   - emit RECORD_TEXT_POKE with the new instruction
		 *   - IPI-SYNC
		 *   - write first byte
		 *   - IPI-SYNC
		 * So before the text poke event timestamp, the decoder will see
		 * either the old instruction flow or FUP/TIP of INT3. After the
		 * text poke event timestamp, the decoder will see either the
		 * new instruction flow or FUP/TIP of INT3. Thus decoders can
		 * use the timestamp as the point at which to modify the
		 * executable code.
		 * The old instruction is recorded so that the event can be
		 * processed forwards or backwards.
		 */
		perf_event_text_poke(text_poke_addr(&text_poke_array.vec[i]), old, len, new, len);
	}

	if (do_sync) {
		/*
		 * According to Intel, this core syncing is very likely
		 * not necessary and we'd be safe even without it. But
		 * better safe than sorry (plus there's not only Intel).
		 */
		smp_text_poke_sync_each_cpu();
	}

	/*
	 * Third step: replace the first byte (INT3) by the first byte of the
	 * replacing opcode.
	 */
	for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) {
		u8 byte = text_poke_array.vec[i].text[0];

		if (text_poke_array.vec[i].len == 6)
			byte = 0x0f;

		if (byte == INT3_INSN_OPCODE)
			continue;

		text_poke(text_poke_addr(&text_poke_array.vec[i]), &byte, INT3_INSN_SIZE);
		do_sync++;
	}

	if (do_sync)
		smp_text_poke_sync_each_cpu();

	/*
	 * Remove and wait for refs to be zero.
	 *
	 * Notably, if after step-3 above the INT3 got removed, then the
	 * smp_text_poke_sync_each_cpu() will have serialized against any running INT3
	 * handlers and the below spin-wait will not happen.
	 *
	 * IOW. unless the replacement instruction is INT3, this case goes
	 * unused.
	 */
	for_each_possible_cpu(i) {
		atomic_t *refs = per_cpu_ptr(&text_poke_array_refs, i);

		if (unlikely(!atomic_dec_and_test(refs)))
			atomic_cond_read_acquire(refs, !VAL);
	}

	/* They are all completed: */
	text_poke_array.nr_entries = 0;
}

static void __smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate)
{
	struct smp_text_poke_loc *tpl;
	struct insn insn;
	int ret, i = 0;

	tpl = &text_poke_array.vec[text_poke_array.nr_entries++];

	if (len == 6)
		i = 1;
	memcpy((void *)tpl->text, opcode+i, len-i);
	if (!emulate)
		emulate = opcode;

	ret = insn_decode_kernel(&insn, emulate);
	BUG_ON(ret < 0);

	tpl->rel_addr = addr - (void *)_stext;
	tpl->len = len;
	tpl->opcode = insn.opcode.bytes[0];

	if (is_jcc32(&insn)) {
		/*
		 * Map Jcc.d32 onto Jcc.d8 and use len to distinguish.
		 */
		tpl->opcode = insn.opcode.bytes[1] - 0x10;
	}

	switch (tpl->opcode) {
	case RET_INSN_OPCODE:
	case JMP32_INSN_OPCODE:
	case JMP8_INSN_OPCODE:
		/*
		 * Control flow instructions without implied execution of the
		 * next instruction can be padded with INT3.
		 */
		for (i = insn.length; i < len; i++)
			BUG_ON(tpl->text[i] != INT3_INSN_OPCODE);
		break;

	default:
		BUG_ON(len != insn.length);
	}

	switch (tpl->opcode) {
	case INT3_INSN_OPCODE:
	case RET_INSN_OPCODE:
		break;

	case CALL_INSN_OPCODE:
	case JMP32_INSN_OPCODE:
	case JMP8_INSN_OPCODE:
	case 0x70 ... 0x7f: /* Jcc */
		tpl->disp = insn.immediate.value;
		break;

	default: /* assume NOP */
		switch (len) {
		case 2: /* NOP2 -- emulate as JMP8+0 */
			BUG_ON(memcmp(emulate, x86_nops[len], len));
			tpl->opcode = JMP8_INSN_OPCODE;
			tpl->disp = 0;
			break;

		case 5: /* NOP5 -- emulate as JMP32+0 */
			BUG_ON(memcmp(emulate, x86_nops[len], len));
			tpl->opcode = JMP32_INSN_OPCODE;
			tpl->disp = 0;
			break;

		default: /* unknown instruction */
			BUG();
		}
		break;
	}
}

/*
 * We rely hard on text_poke_array.vec being ordered; ensure this is so by
 * flushing early if needed.
 */
static bool text_poke_addr_ordered(void *addr)
{
	WARN_ON_ONCE(!addr);

	if (!text_poke_array.nr_entries)
		return true;

	/*
	 * If the last current entry's address is higher than the
	 * new entry's address we'd like to add, then ordering
	 * is violated and we must first flush all pending patching
	 * requests:
	 */
	if (text_poke_addr(text_poke_array.vec + text_poke_array.nr_entries-1) > addr)
		return false;

	return true;
}

/**
 * smp_text_poke_batch_add() -- update instruction on live kernel on SMP, batched
 * @addr:	address to patch
 * @opcode:	opcode of new instruction
 * @len:	length to copy
 * @emulate:	instruction to be emulated
 *
 * Add a new instruction to the current queue of to-be-patched instructions
 * the kernel maintains. The patching request will not be executed immediately,
 * but becomes part of an array of patching requests, optimized for batched
 * execution. All pending patching requests will be executed on the next
 * smp_text_poke_batch_finish() call.
 */
void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate)
{
	if (text_poke_array.nr_entries == TEXT_POKE_ARRAY_MAX || !text_poke_addr_ordered(addr))
		smp_text_poke_batch_finish();
	__smp_text_poke_batch_add(addr, opcode, len, emulate);
}
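
/*
 * Illustrative sketch (editor's addition, not part of the upstream file):
 * batched patching as described in the kernel-doc above -- queue a number
 * of sites, then flush them all with one INT3/sync sequence. "sites" and
 * "nr" are hypothetical; text_mutex must be held for the final flush.
 * Out-of-order addresses simply force an early flush, see
 * text_poke_addr_ordered().
 */
static void __maybe_unused smp_text_poke_batch_sketch(void * const *sites, int nr)
{
	int i;

	lockdep_assert_held(&text_mutex);

	for (i = 0; i < nr; i++) {
		/* Replace each 5-byte site with a NOP5; emulated as JMP32+0 while patching. */
		smp_text_poke_batch_add(sites[i], x86_nops[5], 5, NULL);
	}

	smp_text_poke_batch_finish();
}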

/**
 * smp_text_poke_single() -- update instruction on live kernel on SMP immediately
 * @addr:	address to patch
 * @opcode:	opcode of new instruction
 * @len:	length to copy
 * @emulate:	instruction to be emulated
 *
 * Update a single instruction with the vector in the stack, avoiding
 * dynamically allocated memory. This function should be used when it is
 * not possible to allocate memory for a vector. The single instruction
 * is patched in immediately.
 */
void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate)
{
	smp_text_poke_batch_add(addr, opcode, len, emulate);
	smp_text_poke_batch_finish();
}
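
/*
 * Illustrative sketch (editor's addition, not part of the upstream file):
 * patching one site immediately, e.g. turning a 5-byte NOP into a direct
 * CALL whose bytes the caller has already built. "site" and "call_insn"
 * are hypothetical; call_insn points at 5 bytes: 0xe8 followed by a rel32
 * displacement. While the INT3 is in place the CALL is emulated by
 * smp_text_poke_int3_handler().
 */
static void __maybe_unused smp_text_poke_single_sketch(void *site, const u8 *call_insn)
{
	lockdep_assert_held(&text_mutex);

	smp_text_poke_single(site, call_insn, 5, NULL);
}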