GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/mm/tlb.c
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/init.h>

#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/cpu.h>
#include <linux/debugfs.h>
#include <linux/sched/smt.h>
#include <linux/task_work.h>
#include <linux/mmu_notifier.h>
#include <linux/mmu_context.h>
#include <linux/kvm_types.h>

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/nospec-branch.h>
#include <asm/cache.h>
#include <asm/cacheflush.h>
#include <asm/apic.h>
#include <asm/msr.h>
#include <asm/perf_event.h>
#include <asm/tlb.h>

#include "mm_internal.h"

#ifdef CONFIG_PARAVIRT
# define STATIC_NOPV
#else
# define STATIC_NOPV static
# define __flush_tlb_local native_flush_tlb_local
# define __flush_tlb_global native_flush_tlb_global
# define __flush_tlb_one_user(addr) native_flush_tlb_one_user(addr)
# define __flush_tlb_multi(msk, info) native_flush_tlb_multi(msk, info)
#endif

/*
* TLB flushing, formerly SMP-only
* c/o Linus Torvalds.
*
* These mean you can really definitely utterly forget about
* writing to user space from interrupts. (It's not allowed anyway).
*
* Optimizations Manfred Spraul <[email protected]>
*
* More scalable flush, from Andi Kleen
*
* Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
*/

/*
* Bits to mangle the TIF_SPEC_* state into the mm pointer which is
* stored in cpu_tlbstate.last_user_mm_spec.
*/
#define LAST_USER_MM_IBPB 0x1UL
#define LAST_USER_MM_L1D_FLUSH 0x2UL
#define LAST_USER_MM_SPEC_MASK (LAST_USER_MM_IBPB | LAST_USER_MM_L1D_FLUSH)

/* Bits to set when tlbstate and flush are (re)initialized */
#define LAST_USER_MM_INIT LAST_USER_MM_IBPB

/*
* The x86 feature is called PCID (Process Context IDentifier). It is similar
* to what is traditionally called ASID on the RISC processors.
*
* We don't use the traditional ASID implementation, where each process/mm gets
* its own ASID and flush/restart when we run out of ASID space.
*
* Instead we have a small per-cpu array of ASIDs and cache the last few mm's
* that came by on this CPU, allowing cheaper switch_mm between processes on
* this CPU.
*
* We end up with different spaces for different things. To avoid confusion we
* use different names for each of them:
*
* ASID - [0, TLB_NR_DYN_ASIDS-1]
* the canonical identifier for an mm, dynamically allocated on each CPU
* [TLB_NR_DYN_ASIDS, MAX_ASID_AVAILABLE-1]
* the canonical, global identifier for an mm, identical across all CPUs
*
* kPCID - [1, MAX_ASID_AVAILABLE]
* the value we write into the PCID part of CR3; corresponds to the
* ASID+1, because PCID 0 is special.
*
* uPCID - [2048 + 1, 2048 + MAX_ASID_AVAILABLE]
* for KPTI each mm has two address spaces and thus needs two
* PCID values, but we can still do with a single ASID denomination
* for each mm. Corresponds to kPCID + 2048.
*
*/
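/*
* Illustrative example (assuming the usual TLB_NR_DYN_ASIDS of 6):
* ASIDs 0-5 are the per-CPU dynamic slots recycled by choose_new_asid()
* below, while ASIDs 6 and up are the global ASIDs handed out to
* heavily multi-threaded processes for broadcast TLB invalidation.
*/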
93
94
/*
95
* When enabled, MITIGATION_PAGE_TABLE_ISOLATION consumes a single bit for
96
* user/kernel switches
97
*/
98
#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
99
# define PTI_CONSUMED_PCID_BITS 1
100
#else
101
# define PTI_CONSUMED_PCID_BITS 0
102
#endif
103
104
#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
105
106
/*
* ASIDs are zero-based: 0->MAX_ASID_AVAILABLE are valid. -1 below to account
* for them being zero-based. Another -1 is because PCID 0 is reserved for
* use by non-PCID-aware users.
*/
111
#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
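/*
* Assuming the architectural 12 PCID bits, MAX_ASID_AVAILABLE works
* out to (1 << 12) - 2 = 4094 without KPTI and (1 << 11) - 2 = 2046
* with it -- the "4k / 2k ASIDs" figure quoted in the global ASID
* comments further down.
*/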
112
113
/*
114
* Given @asid, compute kPCID
115
*/
116
static inline u16 kern_pcid(u16 asid)
117
{
118
VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
119
120
#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
121
/*
122
* Make sure that the dynamic ASID space does not conflict with the
123
* bit we are using to switch between user and kernel ASIDs.
124
*/
125
BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT));
126
127
/*
128
* The ASID being passed in here should have respected the
129
* MAX_ASID_AVAILABLE and thus never have the switch bit set.
130
*/
131
VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT));
132
#endif
133
/*
134
* The dynamically-assigned ASIDs that get passed in are small
135
* (<TLB_NR_DYN_ASIDS). They never have the high switch bit set,
136
* so do not bother to clear it.
137
*
138
* If PCID is on, ASID-aware code paths put the ASID+1 into the
139
* PCID bits. This serves two purposes. It prevents a nasty
140
* situation in which PCID-unaware code saves CR3, loads some other
141
* value (with PCID == 0), and then restores CR3, thus corrupting
142
* the TLB for ASID 0 if the saved ASID was nonzero. It also means
143
* that any bugs involving loading a PCID-enabled CR3 with
144
* CR4.PCIDE off will trigger deterministically.
145
*/
146
return asid + 1;
147
}
148
149
/*
150
* Given @asid, compute uPCID
151
*/
152
static inline u16 user_pcid(u16 asid)
153
{
154
u16 ret = kern_pcid(asid);
155
#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
156
ret |= 1 << X86_CR3_PTI_PCID_USER_BIT;
157
#endif
158
return ret;
159
}
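/*
* Worked example (assuming the KPTI user bit is CR3 bit 11, matching
* the "+ 2048" convention above): dynamic ASID 3 is written to CR3 as
* kPCID 4, and its user counterpart as uPCID 2048 + 4 = 2052.
*/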
160
161
static inline unsigned long build_cr3(pgd_t *pgd, u16 asid, unsigned long lam)
162
{
163
unsigned long cr3 = __sme_pa(pgd) | lam;
164
165
if (static_cpu_has(X86_FEATURE_PCID)) {
166
cr3 |= kern_pcid(asid);
167
} else {
168
VM_WARN_ON_ONCE(asid != 0);
169
}
170
171
return cr3;
172
}
173
174
static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid,
175
unsigned long lam)
176
{
177
/*
* Use boot_cpu_has() instead of this_cpu_has() as this function
* might be called during early boot. This should work even after
* boot because all CPUs have the same capabilities:
*/
182
VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
183
return build_cr3(pgd, asid, lam) | CR3_NOFLUSH;
184
}
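/*
* CR3_NOFLUSH is the architectural "no flush" hint (bit 63 of the
* value written to CR3 when CR4.PCIDE is set): the page tables are
* switched without discarding TLB entries tagged with the new PCID.
* build_cr3() vs. build_cr3_noflush() is the flush vs. no-flush
* decision that load_new_mm_cr3() makes further down.
*/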
185
186
/*
187
* We get here when we do something requiring a TLB invalidation
188
* but could not go invalidate all of the contexts. We do the
189
* necessary invalidation by clearing out the 'ctx_id' which
190
* forces a TLB flush when the context is loaded.
191
*/
192
static void clear_asid_other(void)
193
{
194
u16 asid;
195
196
/*
197
* This is only expected to be set if we have disabled
198
* kernel _PAGE_GLOBAL pages.
199
*/
200
if (!static_cpu_has(X86_FEATURE_PTI)) {
201
WARN_ON_ONCE(1);
202
return;
203
}
204
205
for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
206
/* Do not need to flush the current asid */
207
if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
208
continue;
209
/*
210
* Make sure the next time we go to switch to
211
* this asid, we do a flush:
212
*/
213
this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
214
}
215
this_cpu_write(cpu_tlbstate.invalidate_other, false);
216
}
217
218
atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
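/*
* Each mm is handed a unique ctx_id from this counter when it is
* created (see init_new_context()). ctx_id 0 is never handed out, so
* writing 0 into cpu_tlbstate.ctxs[].ctx_id -- as clear_asid_other()
* and initialize_tlbstate_and_flush() do -- can never match a live mm
* and therefore forces a flush on the next switch to that ASID slot.
*/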
219
220
struct new_asid {
221
unsigned int asid : 16;
222
unsigned int need_flush : 1;
223
};
224
225
static struct new_asid choose_new_asid(struct mm_struct *next, u64 next_tlb_gen)
226
{
227
struct new_asid ns;
228
u16 asid;
229
230
if (!static_cpu_has(X86_FEATURE_PCID)) {
231
ns.asid = 0;
232
ns.need_flush = 1;
233
return ns;
234
}
235
236
/*
237
* TLB consistency for global ASIDs is maintained with hardware assisted
238
* remote TLB flushing. Global ASIDs are always up to date.
239
*/
240
if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
241
u16 global_asid = mm_global_asid(next);
242
243
if (global_asid) {
244
ns.asid = global_asid;
245
ns.need_flush = 0;
246
return ns;
247
}
248
}
249
250
if (this_cpu_read(cpu_tlbstate.invalidate_other))
251
clear_asid_other();
252
253
for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
254
if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
255
next->context.ctx_id)
256
continue;
257
258
ns.asid = asid;
259
ns.need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) < next_tlb_gen);
260
return ns;
261
}
262
263
/*
264
* We don't currently own an ASID slot on this CPU.
265
* Allocate a slot.
266
*/
267
ns.asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
268
if (ns.asid >= TLB_NR_DYN_ASIDS) {
269
ns.asid = 0;
270
this_cpu_write(cpu_tlbstate.next_asid, 1);
271
}
272
ns.need_flush = true;
273
274
return ns;
275
}
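/*
* To summarize the policy above: without PCID everything shares ASID 0
* and every switch flushes; a process that already owns a global ASID
* keeps it and never needs a flush here; otherwise we reuse a matching
* dynamic slot (flushing only if its tlb_gen is stale) or round-robin
* evict the next dynamic slot and flush.
*/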
276
277
/*
278
* Global ASIDs are allocated for multi-threaded processes that are
279
* active on multiple CPUs simultaneously, giving each of those
280
* processes the same PCID on every CPU, for use with hardware-assisted
281
* TLB shootdown on remote CPUs, like AMD INVLPGB or Intel RAR.
282
*
283
* These global ASIDs are held for the lifetime of the process.
284
*/
285
static DEFINE_RAW_SPINLOCK(global_asid_lock);
286
static u16 last_global_asid = MAX_ASID_AVAILABLE;
287
static DECLARE_BITMAP(global_asid_used, MAX_ASID_AVAILABLE);
288
static DECLARE_BITMAP(global_asid_freed, MAX_ASID_AVAILABLE);
289
static int global_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1;
290
291
/*
292
* When the search for a free ASID in the global ASID space reaches
293
* MAX_ASID_AVAILABLE, a global TLB flush guarantees that previously
294
* freed global ASIDs are safe to re-use.
295
*
296
* This way the global flush only needs to happen at ASID rollover
297
* time, and not at ASID allocation time.
298
*/
299
static void reset_global_asid_space(void)
300
{
301
lockdep_assert_held(&global_asid_lock);
302
303
invlpgb_flush_all_nonglobals();
304
305
/*
306
* The TLB flush above makes it safe to re-use the previously
307
* freed global ASIDs.
308
*/
309
bitmap_andnot(global_asid_used, global_asid_used,
310
global_asid_freed, MAX_ASID_AVAILABLE);
311
bitmap_clear(global_asid_freed, 0, MAX_ASID_AVAILABLE);
312
313
/* Restart the search from the start of global ASID space. */
314
last_global_asid = TLB_NR_DYN_ASIDS;
315
}
316
317
static u16 allocate_global_asid(void)
318
{
319
u16 asid;
320
321
lockdep_assert_held(&global_asid_lock);
322
323
/* The previous allocation hit the edge of available address space */
324
if (last_global_asid >= MAX_ASID_AVAILABLE - 1)
325
reset_global_asid_space();
326
327
asid = find_next_zero_bit(global_asid_used, MAX_ASID_AVAILABLE, last_global_asid);
328
329
if (asid >= MAX_ASID_AVAILABLE && !global_asid_available) {
330
/* This should never happen. */
331
VM_WARN_ONCE(1, "Unable to allocate global ASID despite %d available\n",
332
global_asid_available);
333
return 0;
334
}
335
336
/* Claim this global ASID. */
337
__set_bit(asid, global_asid_used);
338
last_global_asid = asid;
339
global_asid_available--;
340
return asid;
341
}
342
343
/*
344
* Check whether a process is currently active on more than @threshold CPUs.
345
* This is a cheap estimation on whether or not it may make sense to assign
346
* a global ASID to this process, and use broadcast TLB invalidation.
347
*/
348
static bool mm_active_cpus_exceeds(struct mm_struct *mm, int threshold)
349
{
350
int count = 0;
351
int cpu;
352
353
/* This quick check should eliminate most single threaded programs. */
354
if (cpumask_weight(mm_cpumask(mm)) <= threshold)
355
return false;
356
357
/* Slower check to make sure. */
358
for_each_cpu(cpu, mm_cpumask(mm)) {
359
/* Skip the CPUs that aren't really running this process. */
360
if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm)
361
continue;
362
363
if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
364
continue;
365
366
if (++count > threshold)
367
return true;
368
}
369
return false;
370
}
371
372
/*
373
* Assign a global ASID to the current process, protecting against
374
* races between multiple threads in the process.
375
*/
376
static void use_global_asid(struct mm_struct *mm)
377
{
378
u16 asid;
379
380
guard(raw_spinlock_irqsave)(&global_asid_lock);
381
382
/* This process is already using broadcast TLB invalidation. */
383
if (mm_global_asid(mm))
384
return;
385
386
/*
387
* The last global ASID was consumed while waiting for the lock.
388
*
389
* If this fires, a more aggressive ASID reuse scheme might be
390
* needed.
391
*/
392
if (!global_asid_available) {
393
VM_WARN_ONCE(1, "Ran out of global ASIDs\n");
394
return;
395
}
396
397
asid = allocate_global_asid();
398
if (!asid)
399
return;
400
401
mm_assign_global_asid(mm, asid);
402
}
403
404
#ifdef CONFIG_BROADCAST_TLB_FLUSH
405
void mm_free_global_asid(struct mm_struct *mm)
406
{
407
if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
408
return;
409
410
if (!mm_global_asid(mm))
411
return;
412
413
guard(raw_spinlock_irqsave)(&global_asid_lock);
414
415
/* The global ASID can be re-used only after flush at wrap-around. */
416
__set_bit(mm->context.global_asid, global_asid_freed);
417
418
mm->context.global_asid = 0;
419
global_asid_available++;
420
}
421
#endif
422
423
/*
424
* Is the mm transitioning from a CPU-local ASID to a global ASID?
425
*/
426
static bool mm_needs_global_asid(struct mm_struct *mm, u16 asid)
427
{
428
u16 global_asid = mm_global_asid(mm);
429
430
if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
431
return false;
432
433
/* Process is transitioning to a global ASID */
434
if (global_asid && asid != global_asid)
435
return true;
436
437
return false;
438
}
439
440
/*
441
* x86 has 4k ASIDs (2k when compiled with KPTI), but the largest x86
442
* systems have over 8k CPUs. Because of this potential ASID shortage,
443
* global ASIDs are handed out to processes that have frequent TLB
444
* flushes and are active on 4 or more CPUs simultaneously.
445
*/
446
static void consider_global_asid(struct mm_struct *mm)
447
{
448
if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
449
return;
450
451
/* Check every once in a while. */
452
if ((current->pid & 0x1f) != (jiffies & 0x1f))
453
return;
454
455
/*
456
* Assign a global ASID if the process is active on
457
* 4 or more CPUs simultaneously.
458
*/
459
if (mm_active_cpus_exceeds(mm, 3))
460
use_global_asid(mm);
461
}
462
463
static void finish_asid_transition(struct flush_tlb_info *info)
464
{
465
struct mm_struct *mm = info->mm;
466
int bc_asid = mm_global_asid(mm);
467
int cpu;
468
469
if (!mm_in_asid_transition(mm))
470
return;
471
472
for_each_cpu(cpu, mm_cpumask(mm)) {
473
/*
474
* The remote CPU is context switching. Wait for that to
475
* finish, to catch the unlikely case of it switching to
476
* the target mm with an out of date ASID.
477
*/
478
while (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) == LOADED_MM_SWITCHING)
479
cpu_relax();
480
481
if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm)
482
continue;
483
484
/*
485
* If at least one CPU is not using the global ASID yet,
486
* send a TLB flush IPI. The IPI should cause stragglers
487
* to transition soon.
488
*
489
* This can race with the CPU switching to another task;
490
* that results in a (harmless) extra IPI.
491
*/
492
if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm_asid, cpu)) != bc_asid) {
493
flush_tlb_multi(mm_cpumask(info->mm), info);
494
return;
495
}
496
}
497
498
/* All the CPUs running this process are using the global ASID. */
499
mm_clear_asid_transition(mm);
500
}
501
502
static void broadcast_tlb_flush(struct flush_tlb_info *info)
503
{
504
bool pmd = info->stride_shift == PMD_SHIFT;
505
unsigned long asid = mm_global_asid(info->mm);
506
unsigned long addr = info->start;
507
508
/*
509
* TLB flushes with INVLPGB are kicked off asynchronously.
510
* The inc_mm_tlb_gen() guarantees page table updates are done
511
* before these TLB flushes happen.
512
*/
513
if (info->end == TLB_FLUSH_ALL) {
514
invlpgb_flush_single_pcid_nosync(kern_pcid(asid));
515
/* Do any CPUs supporting INVLPGB need PTI? */
516
if (cpu_feature_enabled(X86_FEATURE_PTI))
517
invlpgb_flush_single_pcid_nosync(user_pcid(asid));
518
} else do {
519
unsigned long nr = 1;
520
521
if (info->stride_shift <= PMD_SHIFT) {
522
nr = (info->end - addr) >> info->stride_shift;
523
nr = clamp_val(nr, 1, invlpgb_count_max);
524
}
525
526
invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd);
527
if (cpu_feature_enabled(X86_FEATURE_PTI))
528
invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd);
529
530
addr += nr << info->stride_shift;
531
} while (addr < info->end);
532
533
finish_asid_transition(info);
534
535
/* Wait for the INVLPGBs kicked off above to finish. */
536
__tlbsync();
537
}
538
539
/*
540
* Given an ASID, flush the corresponding user ASID. We can delay this
541
* until the next time we switch to it.
542
*
543
* See SWITCH_TO_USER_CR3.
544
*/
545
static inline void invalidate_user_asid(u16 asid)
546
{
547
/* There is no user ASID if address space separation is off */
548
if (!IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION))
549
return;
550
551
/*
552
* We only have a single ASID if PCID is off and the CR3
553
* write will have flushed it.
554
*/
555
if (!cpu_feature_enabled(X86_FEATURE_PCID))
556
return;
557
558
if (!static_cpu_has(X86_FEATURE_PTI))
559
return;
560
561
__set_bit(kern_pcid(asid),
562
(unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
563
}
564
565
static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, unsigned long lam,
566
bool need_flush)
567
{
568
unsigned long new_mm_cr3;
569
570
if (need_flush) {
571
invalidate_user_asid(new_asid);
572
new_mm_cr3 = build_cr3(pgdir, new_asid, lam);
573
} else {
574
new_mm_cr3 = build_cr3_noflush(pgdir, new_asid, lam);
575
}
576
577
/*
578
* Caution: many callers of this function expect
579
* that load_cr3() is serializing and orders TLB
580
* fills with respect to the mm_cpumask writes.
581
*/
582
write_cr3(new_mm_cr3);
583
}
584
585
void leave_mm(void)
586
{
587
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
588
589
/*
590
* It's plausible that we're in lazy TLB mode while our mm is init_mm.
591
* If so, our callers still expect us to flush the TLB, but there
592
* aren't any user TLB entries in init_mm to worry about.
593
*
594
* This needs to happen before any other sanity checks due to
595
* intel_idle's shenanigans.
596
*/
597
if (loaded_mm == &init_mm)
598
return;
599
600
/* Warn if we're not lazy. */
601
WARN_ON(!this_cpu_read(cpu_tlbstate_shared.is_lazy));
602
603
switch_mm(NULL, &init_mm, NULL);
604
}
605
EXPORT_SYMBOL_GPL(leave_mm);
606
607
void switch_mm(struct mm_struct *prev, struct mm_struct *next,
608
struct task_struct *tsk)
609
{
610
unsigned long flags;
611
612
local_irq_save(flags);
613
switch_mm_irqs_off(NULL, next, tsk);
614
local_irq_restore(flags);
615
}
616
617
/*
618
* Invoked from return to user/guest by a task that opted-in to L1D
619
* flushing but ended up running on an SMT enabled core due to wrong
620
* affinity settings or CPU hotplug. This is part of the paranoid L1D flush
621
* contract which this task requested.
622
*/
623
static void l1d_flush_force_sigbus(struct callback_head *ch)
624
{
625
force_sig(SIGBUS);
626
}
627
628
static void l1d_flush_evaluate(unsigned long prev_mm, unsigned long next_mm,
629
struct task_struct *next)
630
{
631
/* Flush L1D if the outgoing task requests it */
632
if (prev_mm & LAST_USER_MM_L1D_FLUSH)
633
wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
634
635
/* Check whether the incoming task opted in for L1D flush */
636
if (likely(!(next_mm & LAST_USER_MM_L1D_FLUSH)))
637
return;
638
639
/*
* Validate that it is not running on an SMT sibling as this would
* make the exercise pointless because the siblings share L1D. If
* it runs on an SMT sibling, notify it with SIGBUS on return to
* user/guest.
*/
645
if (this_cpu_read(cpu_info.smt_active)) {
646
clear_ti_thread_flag(&next->thread_info, TIF_SPEC_L1D_FLUSH);
647
next->l1d_flush_kill.func = l1d_flush_force_sigbus;
648
task_work_add(next, &next->l1d_flush_kill, TWA_RESUME);
649
}
650
}
651
652
static unsigned long mm_mangle_tif_spec_bits(struct task_struct *next)
653
{
654
unsigned long next_tif = read_task_thread_flags(next);
655
unsigned long spec_bits = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_SPEC_MASK;
656
657
/*
658
* Ensure that the bit shift above works as expected and the two flags
659
* end up in bits 0 and 1.
660
*/
661
BUILD_BUG_ON(TIF_SPEC_L1D_FLUSH != TIF_SPEC_IB + 1);
662
663
return (unsigned long)next->mm | spec_bits;
664
}
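/*
* This packing works because mm_struct pointers are aligned well past
* four bytes, leaving bits 0 and 1 free for LAST_USER_MM_IBPB and
* LAST_USER_MM_L1D_FLUSH. For example, a task with only TIF_SPEC_IB
* set yields (mm | 0x1), which cond_mitigation() below compares with
* the value cached in cpu_tlbstate.last_user_mm_spec.
*/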
665
666
static void cond_mitigation(struct task_struct *next)
667
{
668
unsigned long prev_mm, next_mm;
669
670
if (!next || !next->mm)
671
return;
672
673
next_mm = mm_mangle_tif_spec_bits(next);
674
prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec);
675
676
/*
677
* Avoid user->user BTB/RSB poisoning by flushing them when switching
678
* between processes. This stops one process from doing Spectre-v2
679
* attacks on another.
680
*
681
* Both the conditional and the always-on IBPB modes use the mm
* pointer to avoid the IBPB when switching between tasks of the
* same process. Using the mm pointer instead of mm->context.ctx_id
* opens a hypothetical hole vs. mm_struct reuse, which is more or
* less impossible to control by an attacker. Aside from that, it
* would only affect the first schedule, so the theoretically
* exposed data is not really interesting.
688
*/
689
if (static_branch_likely(&switch_mm_cond_ibpb)) {
690
/*
691
* This is a bit more complex than the always mode because
692
* it has to handle two cases:
693
*
694
* 1) Switch from a user space task (potential attacker)
695
* which has TIF_SPEC_IB set to a user space task
696
* (potential victim) which has TIF_SPEC_IB not set.
697
*
698
* 2) Switch from a user space task (potential attacker)
699
* which has TIF_SPEC_IB not set to a user space task
700
* (potential victim) which has TIF_SPEC_IB set.
701
*
702
* This could be done by unconditionally issuing IBPB when
703
* a task which has TIF_SPEC_IB set is either scheduled in
704
* or out. Though that results in two flushes when:
705
*
706
* - the same user space task is scheduled out and later
707
* scheduled in again and only a kernel thread ran in
708
* between.
709
*
710
* - a user space task belonging to the same process is
711
* scheduled in after a kernel thread ran in between
712
*
713
* - a user space task belonging to the same process is
714
* scheduled in immediately.
715
*
716
* Optimize this with reasonably small overhead for the
717
* above cases. Mangle the TIF_SPEC_IB bit into the mm
718
* pointer of the incoming task which is stored in
719
* cpu_tlbstate.last_user_mm_spec for comparison.
720
*
721
* Issue IBPB only if the mm's are different and one or
722
* both have the IBPB bit set.
723
*/
724
if (next_mm != prev_mm &&
725
(next_mm | prev_mm) & LAST_USER_MM_IBPB)
726
indirect_branch_prediction_barrier();
727
}
728
729
if (static_branch_unlikely(&switch_mm_always_ibpb)) {
730
/*
731
* Only flush when switching to a user space task with a
732
* different context than the user space task which ran
733
* last on this CPU.
734
*/
735
if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) != (unsigned long)next->mm)
736
indirect_branch_prediction_barrier();
737
}
738
739
if (static_branch_unlikely(&switch_mm_cond_l1d_flush)) {
740
/*
741
* Flush L1D when the outgoing task requested it and/or
742
* check whether the incoming task requested L1D flushing
743
* and ended up on an SMT sibling.
744
*/
745
if (unlikely((prev_mm | next_mm) & LAST_USER_MM_L1D_FLUSH))
746
l1d_flush_evaluate(prev_mm, next_mm, next);
747
}
748
749
this_cpu_write(cpu_tlbstate.last_user_mm_spec, next_mm);
750
}
751
752
#ifdef CONFIG_PERF_EVENTS
753
static inline void cr4_update_pce_mm(struct mm_struct *mm)
754
{
755
if (static_branch_unlikely(&rdpmc_always_available_key) ||
756
(!static_branch_unlikely(&rdpmc_never_available_key) &&
757
atomic_read(&mm->context.perf_rdpmc_allowed))) {
758
/*
759
* Clear the existing dirty counters to
760
* prevent the leak for an RDPMC task.
761
*/
762
perf_clear_dirty_counters();
763
cr4_set_bits_irqsoff(X86_CR4_PCE);
764
} else
765
cr4_clear_bits_irqsoff(X86_CR4_PCE);
766
}
767
768
void cr4_update_pce(void *ignored)
769
{
770
cr4_update_pce_mm(this_cpu_read(cpu_tlbstate.loaded_mm));
771
}
772
773
#else
774
static inline void cr4_update_pce_mm(struct mm_struct *mm) { }
775
#endif
776
777
/*
778
* This optimizes when not actually switching mm's. Some architectures use the
779
* 'unused' argument for this optimization, but x86 must use
780
* 'cpu_tlbstate.loaded_mm' instead because it does not always keep
781
* 'current->active_mm' up to date.
782
*/
783
void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
784
struct task_struct *tsk)
785
{
786
struct mm_struct *prev = this_cpu_read(cpu_tlbstate.loaded_mm);
787
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
788
bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
789
unsigned cpu = smp_processor_id();
790
unsigned long new_lam;
791
struct new_asid ns;
792
u64 next_tlb_gen;
793
794
795
/* We don't want flush_tlb_func() to run concurrently with us. */
796
if (IS_ENABLED(CONFIG_PROVE_LOCKING))
797
WARN_ON_ONCE(!irqs_disabled());
798
799
/*
800
* Verify that CR3 is what we think it is. This will catch
801
* hypothetical buggy code that directly switches to swapper_pg_dir
802
* without going through leave_mm() / switch_mm_irqs_off() or that
803
* does something like write_cr3(read_cr3_pa()).
804
*
805
* Only do this check if CONFIG_DEBUG_VM=y because __read_cr3()
806
* isn't free.
807
*/
808
#ifdef CONFIG_DEBUG_VM
809
if (WARN_ON_ONCE(__read_cr3() != build_cr3(prev->pgd, prev_asid,
810
tlbstate_lam_cr3_mask()))) {
811
/*
812
* If we were to BUG here, we'd be very likely to kill
813
* the system so hard that we don't see the call trace.
814
* Try to recover instead by ignoring the error and doing
815
* a global flush to minimize the chance of corruption.
816
*
817
* (This is far from being a fully correct recovery.
818
* Architecturally, the CPU could prefetch something
819
* back into an incorrect ASID slot and leave it there
820
* to cause trouble down the road. It's better than
821
* nothing, though.)
822
*/
823
__flush_tlb_all();
824
}
825
#endif
826
if (was_lazy)
827
this_cpu_write(cpu_tlbstate_shared.is_lazy, false);
828
829
/*
830
* The membarrier system call requires a full memory barrier and
831
* core serialization before returning to user-space, after
832
* storing to rq->curr, when changing mm. This is because
833
* membarrier() sends IPIs to all CPUs that are in the target mm
834
* to make them issue memory barriers. However, if another CPU
835
* switches to/from the target mm concurrently with
836
* membarrier(), it can cause that CPU not to receive an IPI
837
* when it really should issue a memory barrier. Writing to CR3
838
* provides that full memory barrier and core serializing
839
* instruction.
840
*/
841
if (prev == next) {
842
/* Not actually switching mm's */
843
VM_WARN_ON(is_dyn_asid(prev_asid) &&
844
this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
845
next->context.ctx_id);
846
847
/*
848
* If this races with another thread that enables lam, 'new_lam'
849
* might not match tlbstate_lam_cr3_mask().
850
*/
851
852
/*
853
* Even in lazy TLB mode, the CPU should stay set in the
854
* mm_cpumask. The TLB shootdown code can figure out from
855
* cpu_tlbstate_shared.is_lazy whether or not to send an IPI.
856
*/
857
if (IS_ENABLED(CONFIG_DEBUG_VM) &&
858
WARN_ON_ONCE(prev != &init_mm && !is_notrack_mm(prev) &&
859
!cpumask_test_cpu(cpu, mm_cpumask(next))))
860
cpumask_set_cpu(cpu, mm_cpumask(next));
861
862
/* Check if the current mm is transitioning to a global ASID */
863
if (mm_needs_global_asid(next, prev_asid)) {
864
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
865
ns = choose_new_asid(next, next_tlb_gen);
866
goto reload_tlb;
867
}
868
869
/*
870
* Broadcast TLB invalidation keeps this ASID up to date
871
* all the time.
872
*/
873
if (is_global_asid(prev_asid))
874
return;
875
876
/*
877
* If the CPU is not in lazy TLB mode, we are just switching
878
* from one thread in a process to another thread in the same
879
* process. No TLB flush required.
880
*/
881
if (!was_lazy)
882
return;
883
884
/*
885
* Read the tlb_gen to check whether a flush is needed.
886
* If the TLB is up to date, just use it.
887
* The barrier synchronizes with the tlb_gen increment in
888
* the TLB shootdown code.
889
*/
890
smp_mb();
891
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
892
if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
893
next_tlb_gen)
894
return;
895
896
/*
897
* TLB contents went out of date while we were in lazy
898
* mode. Fall through to the TLB switching code below.
899
*/
900
ns.asid = prev_asid;
901
ns.need_flush = true;
902
} else {
903
/*
904
* Apply process to process speculation vulnerability
905
* mitigations if applicable.
906
*/
907
cond_mitigation(tsk);
908
909
/*
910
* Indicate that CR3 is about to change. nmi_uaccess_okay()
911
* and others are sensitive to the window where mm_cpumask(),
912
* CR3 and cpu_tlbstate.loaded_mm are not all in sync.
913
*/
914
this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
915
916
/*
917
* Make sure this CPU is set in mm_cpumask() such that we'll
918
* receive invalidation IPIs.
919
*
920
* Rely on the smp_mb() implied by cpumask_set_cpu()'s atomic
921
* operation, or explicitly provide one. Such that:
922
*
923
* switch_mm_irqs_off() flush_tlb_mm_range()
924
* smp_store_release(loaded_mm, SWITCHING); atomic64_inc_return(tlb_gen)
925
* smp_mb(); // here // smp_mb() implied
926
* atomic64_read(tlb_gen); this_cpu_read(loaded_mm);
927
*
928
* we properly order against flush_tlb_mm_range(), where the
929
* loaded_mm load can happen in native_flush_tlb_multi() ->
930
* should_flush_tlb().
931
*
932
* This way switch_mm() must see the new tlb_gen or
933
* flush_tlb_mm_range() must see the new loaded_mm, or both.
934
*/
935
if (next != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))
936
cpumask_set_cpu(cpu, mm_cpumask(next));
937
else
938
smp_mb();
939
940
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
941
942
ns = choose_new_asid(next, next_tlb_gen);
943
}
944
945
reload_tlb:
946
new_lam = mm_lam_cr3_mask(next);
947
if (ns.need_flush) {
948
VM_WARN_ON_ONCE(is_global_asid(ns.asid));
949
this_cpu_write(cpu_tlbstate.ctxs[ns.asid].ctx_id, next->context.ctx_id);
950
this_cpu_write(cpu_tlbstate.ctxs[ns.asid].tlb_gen, next_tlb_gen);
951
load_new_mm_cr3(next->pgd, ns.asid, new_lam, true);
952
953
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
954
} else {
955
/* The new ASID is already up to date. */
956
load_new_mm_cr3(next->pgd, ns.asid, new_lam, false);
957
958
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
959
}
960
961
/* Make sure we write CR3 before loaded_mm. */
962
barrier();
963
964
this_cpu_write(cpu_tlbstate.loaded_mm, next);
965
this_cpu_write(cpu_tlbstate.loaded_mm_asid, ns.asid);
966
cpu_tlbstate_update_lam(new_lam, mm_untag_mask(next));
967
968
if (next != prev) {
969
cr4_update_pce_mm(next);
970
switch_ldt(prev, next);
971
}
972
}
973
974
/*
975
* Please ignore the name of this function. It should be called
976
* switch_to_kernel_thread().
977
*
978
* enter_lazy_tlb() is a hint from the scheduler that we are entering a
979
* kernel thread or other context without an mm. Acceptable implementations
980
* include doing nothing whatsoever, switching to init_mm, or various clever
981
* lazy tricks to try to minimize TLB flushes.
982
*
983
* The scheduler reserves the right to call enter_lazy_tlb() several times
984
* in a row. It will notify us that we're going back to a real mm by
985
* calling switch_mm_irqs_off().
986
*/
987
void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
988
{
989
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
990
return;
991
992
this_cpu_write(cpu_tlbstate_shared.is_lazy, true);
993
}
994
995
/*
996
* Using a temporary mm allows setting temporary mappings that are not accessible
997
* by other CPUs. Such mappings are needed to perform sensitive memory writes
998
* that override the kernel memory protections (e.g., W^X), without exposing the
999
* temporary page-table mappings that are required for these write operations to
1000
* other CPUs. Using a temporary mm also allows avoiding TLB shootdowns when the
1001
* mapping is torn down. Temporary mms can also be used for EFI runtime service
1002
* calls or similar functionality.
1003
*
1004
* It is illegal to schedule while using a temporary mm -- the context switch
1005
* code is unaware of the temporary mm and does not know how to context switch.
1006
* Use a real (non-temporary) mm in a kernel thread if you need to sleep.
1007
*
1008
* Note: For sensitive memory writes, the temporary mm needs to be used
1009
* exclusively by a single core, and IRQs should be disabled while the
1010
* temporary mm is loaded, thereby preventing interrupt handler bugs from
1011
* overriding the kernel memory protection.
1012
*/
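/*
* A minimal usage sketch (illustrative only; the mm names below are
* placeholders for a real caller such as the kernel text-poking code):
*
* prev_mm = use_temporary_mm(temp_mm);
* ... perform the privileged writes through temp_mm's mappings ...
* unuse_temporary_mm(prev_mm);
*
* with preemption disabled across the whole sequence (and IRQs
* disabled for sensitive writes), as described above.
*/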
1013
struct mm_struct *use_temporary_mm(struct mm_struct *temp_mm)
1014
{
1015
struct mm_struct *prev_mm;
1016
1017
lockdep_assert_preemption_disabled();
1018
guard(irqsave)();
1019
1020
/*
1021
* Make sure not to be in TLB lazy mode, as otherwise we'll end up
1022
* with a stale address space WITHOUT being in lazy mode after
1023
* restoring the previous mm.
1024
*/
1025
if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
1026
leave_mm();
1027
1028
prev_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
1029
switch_mm_irqs_off(NULL, temp_mm, current);
1030
1031
/*
1032
* If breakpoints are enabled, disable them while the temporary mm is
1033
* used. Userspace might set up watchpoints on addresses that are used
1034
* in the temporary mm, which would lead to wrong signals being sent or
1035
* crashes.
1036
*
1037
* Note that breakpoints are not disabled selectively, which also causes
1038
* kernel breakpoints (e.g., perf's) to be disabled. This might be
1039
* undesirable, but still seems reasonable as the code that runs in the
1040
* temporary mm should be short.
1041
*/
1042
if (hw_breakpoint_active())
1043
hw_breakpoint_disable();
1044
1045
return prev_mm;
1046
}
1047
1048
void unuse_temporary_mm(struct mm_struct *prev_mm)
1049
{
1050
lockdep_assert_preemption_disabled();
1051
guard(irqsave)();
1052
1053
/* Clear the cpumask, to indicate no TLB flushing is needed anywhere */
1054
cpumask_clear_cpu(smp_processor_id(), mm_cpumask(this_cpu_read(cpu_tlbstate.loaded_mm)));
1055
1056
switch_mm_irqs_off(NULL, prev_mm, current);
1057
1058
/*
1059
* Restore the breakpoints if they were disabled before the temporary mm
1060
* was loaded.
1061
*/
1062
if (hw_breakpoint_active())
1063
hw_breakpoint_restore();
1064
}
1065
1066
/*
1067
* Call this when reinitializing a CPU. It fixes the following potential
1068
* problems:
1069
*
1070
* - The ASID changed from what cpu_tlbstate thinks it is (most likely
1071
* because the CPU was taken down and came back up with CR3's PCID
1072
* bits clear. CPU hotplug can do this.)
1073
*
1074
* - The TLB contains junk in slots corresponding to inactive ASIDs.
1075
*
1076
* - The CPU went so far out to lunch that it may have missed a TLB
1077
* flush.
1078
*/
1079
void initialize_tlbstate_and_flush(void)
1080
{
1081
int i;
1082
struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
1083
u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
1084
unsigned long lam = mm_lam_cr3_mask(mm);
1085
unsigned long cr3 = __read_cr3();
1086
1087
/* Assert that CR3 already references the right mm. */
1088
WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));
1089
1090
/* LAM expected to be disabled */
1091
WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57));
1092
WARN_ON(lam);
1093
1094
/*
1095
* Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization
1096
* doesn't work like other CR4 bits because it can only be set from
1097
* long mode.)
1098
*/
1099
WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
1100
!(cr4_read_shadow() & X86_CR4_PCIDE));
1101
1102
/* Disable LAM, force ASID 0 and force a TLB flush. */
1103
write_cr3(build_cr3(mm->pgd, 0, 0));
1104
1105
/* Reinitialize tlbstate. */
1106
this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT);
1107
this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
1108
this_cpu_write(cpu_tlbstate.next_asid, 1);
1109
this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
1110
this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
1111
cpu_tlbstate_update_lam(lam, mm_untag_mask(mm));
1112
1113
for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
1114
this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
1115
}
1116
1117
/*
1118
* flush_tlb_func()'s memory ordering requirement is that any
1119
* TLB fills that happen after we flush the TLB are ordered after we
1120
* read active_mm's tlb_gen. We don't need any explicit barriers
1121
* because all x86 flush operations are serializing and the
1122
* atomic64_read operation won't be reordered by the compiler.
1123
*/
1124
static void flush_tlb_func(void *info)
1125
{
1126
/*
1127
* We have three different tlb_gen values in here. They are:
1128
*
1129
* - mm_tlb_gen: the latest generation.
1130
* - local_tlb_gen: the generation that this CPU has already caught
1131
* up to.
1132
* - f->new_tlb_gen: the generation that the requester of the flush
1133
* wants us to catch up to.
1134
*/
1135
const struct flush_tlb_info *f = info;
1136
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
1137
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
1138
u64 local_tlb_gen;
1139
bool local = smp_processor_id() == f->initiating_cpu;
1140
unsigned long nr_invalidate = 0;
1141
u64 mm_tlb_gen;
1142
1143
/* This code cannot presently handle being reentered. */
1144
VM_WARN_ON(!irqs_disabled());
1145
1146
if (!local) {
1147
inc_irq_stat(irq_tlb_count);
1148
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
1149
}
1150
1151
/* The CPU was left in the mm_cpumask of the target mm. Clear it. */
1152
if (f->mm && f->mm != loaded_mm) {
1153
cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(f->mm));
1154
trace_tlb_flush(TLB_REMOTE_WRONG_CPU, 0);
1155
return;
1156
}
1157
1158
if (unlikely(loaded_mm == &init_mm))
1159
return;
1160
1161
/* Reload the ASID if transitioning into or out of a global ASID */
1162
if (mm_needs_global_asid(loaded_mm, loaded_mm_asid)) {
1163
switch_mm_irqs_off(NULL, loaded_mm, NULL);
1164
loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
1165
}
1166
1167
/* Broadcast ASIDs are always kept up to date with INVLPGB. */
1168
if (is_global_asid(loaded_mm_asid))
1169
return;
1170
1171
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
1172
loaded_mm->context.ctx_id);
1173
1174
if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) {
1175
/*
1176
* We're in lazy mode. We need to at least flush our
1177
* paging-structure cache to avoid speculatively reading
1178
* garbage into our TLB. Since switching to init_mm is barely
1179
* slower than a minimal flush, just switch to init_mm.
1180
*
1181
* This should be rare, with native_flush_tlb_multi() skipping
1182
* IPIs to lazy TLB mode CPUs.
1183
*/
1184
switch_mm_irqs_off(NULL, &init_mm, NULL);
1185
return;
1186
}
1187
1188
local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
1189
1190
if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID &&
1191
f->new_tlb_gen <= local_tlb_gen)) {
1192
/*
1193
* The TLB is already up to date with respect to f->new_tlb_gen.
1194
* While the core might be still behind mm_tlb_gen, checking
1195
* mm_tlb_gen unnecessarily would have negative caching effects
1196
* so avoid it.
1197
*/
1198
return;
1199
}
1200
1201
/*
1202
* Defer mm_tlb_gen reading as long as possible to avoid cache
1203
* contention.
1204
*/
1205
mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
1206
1207
if (unlikely(local_tlb_gen == mm_tlb_gen)) {
1208
/*
1209
* There's nothing to do: we're already up to date. This can
1210
* happen if two concurrent flushes happen -- the first flush to
1211
* be handled can catch us all the way up, leaving no work for
1212
* the second flush.
1213
*/
1214
goto done;
1215
}
1216
1217
WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
1218
WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);
1219
1220
/*
1221
* If we get to this point, we know that our TLB is out of date.
1222
* This does not strictly imply that we need to flush (it's
1223
* possible that f->new_tlb_gen <= local_tlb_gen), but we're
1224
* going to need to flush in the very near future, so we might
1225
* as well get it over with.
1226
*
1227
* The only question is whether to do a full or partial flush.
1228
*
1229
* We do a partial flush if requested and two extra conditions
1230
* are met:
1231
*
1232
* 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that
1233
* we've always done all needed flushes to catch up to
1234
* local_tlb_gen. If, for example, local_tlb_gen == 2 and
1235
* f->new_tlb_gen == 3, then we know that the flush needed to bring
1236
* us up to date for tlb_gen 3 is the partial flush we're
1237
* processing.
1238
*
1239
* As an example of why this check is needed, suppose that there
1240
* are two concurrent flushes. The first is a full flush that
1241
* changes context.tlb_gen from 1 to 2. The second is a partial
1242
* flush that changes context.tlb_gen from 2 to 3. If they get
1243
* processed on this CPU in reverse order, we'll see
1244
* local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
1245
* If we were to use __flush_tlb_one_user() and set local_tlb_gen to
1246
* 3, we'd break the invariant: we'd update local_tlb_gen above
1247
* 1 without the full flush that's needed for tlb_gen 2.
1248
*
1249
* 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization.
1250
* Partial TLB flushes are not all that much cheaper than full TLB
1251
* flushes, so it seems unlikely that it would be a performance win
1252
* to do a partial flush if that won't bring our TLB fully up to
1253
* date. By doing a full flush instead, we can increase
1254
* local_tlb_gen all the way to mm_tlb_gen and we can probably
1255
* avoid another flush in the very near future.
1256
*/
1257
if (f->end != TLB_FLUSH_ALL &&
1258
f->new_tlb_gen == local_tlb_gen + 1 &&
1259
f->new_tlb_gen == mm_tlb_gen) {
1260
/* Partial flush */
1261
unsigned long addr = f->start;
1262
1263
/* Partial flush cannot have invalid generations */
1264
VM_WARN_ON(f->new_tlb_gen == TLB_GENERATION_INVALID);
1265
1266
/* Partial flush must have valid mm */
1267
VM_WARN_ON(f->mm == NULL);
1268
1269
nr_invalidate = (f->end - f->start) >> f->stride_shift;
1270
1271
while (addr < f->end) {
1272
flush_tlb_one_user(addr);
1273
addr += 1UL << f->stride_shift;
1274
}
1275
if (local)
1276
count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
1277
} else {
1278
/* Full flush. */
1279
nr_invalidate = TLB_FLUSH_ALL;
1280
1281
flush_tlb_local();
1282
if (local)
1283
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
1284
}
1285
1286
/* Both paths above update our state to mm_tlb_gen. */
1287
this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
1288
1289
/* Tracing is done in a unified manner to reduce the code size */
1290
done:
1291
trace_tlb_flush(!local ? TLB_REMOTE_SHOOTDOWN :
1292
(f->mm == NULL) ? TLB_LOCAL_SHOOTDOWN :
1293
TLB_LOCAL_MM_SHOOTDOWN,
1294
nr_invalidate);
1295
}
1296
1297
static bool should_flush_tlb(int cpu, void *data)
1298
{
1299
struct mm_struct *loaded_mm = per_cpu(cpu_tlbstate.loaded_mm, cpu);
1300
struct flush_tlb_info *info = data;
1301
1302
/*
1303
* Order the 'loaded_mm' and 'is_lazy' against their
1304
* write ordering in switch_mm_irqs_off(). Ensure
1305
* 'is_lazy' is at least as new as 'loaded_mm'.
1306
*/
1307
smp_rmb();
1308
1309
/* Lazy TLB will get flushed at the next context switch. */
1310
if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
1311
return false;
1312
1313
/* No mm means kernel memory flush. */
1314
if (!info->mm)
1315
return true;
1316
1317
/*
1318
* While switching, the remote CPU could have state from
1319
* either the prev or next mm. Assume the worst and flush.
1320
*/
1321
if (loaded_mm == LOADED_MM_SWITCHING)
1322
return true;
1323
1324
/* The target mm is loaded, and the CPU is not lazy. */
1325
if (loaded_mm == info->mm)
1326
return true;
1327
1328
/* In cpumask, but not the loaded mm? Periodically remove by flushing. */
1329
if (info->trim_cpumask)
1330
return true;
1331
1332
return false;
1333
}
1334
1335
static bool should_trim_cpumask(struct mm_struct *mm)
1336
{
1337
if (time_after(jiffies, READ_ONCE(mm->context.next_trim_cpumask))) {
1338
WRITE_ONCE(mm->context.next_trim_cpumask, jiffies + HZ);
1339
return true;
1340
}
1341
return false;
1342
}
1343
1344
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);
1345
EXPORT_PER_CPU_SYMBOL(cpu_tlbstate_shared);
1346
1347
STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
1348
const struct flush_tlb_info *info)
1349
{
1350
/*
1351
* Do accounting and tracing. Note that there are (and have always been)
1352
* cases in which a remote TLB flush will be traced, but eventually
1353
* would not happen.
1354
*/
1355
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
1356
if (info->end == TLB_FLUSH_ALL)
1357
trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
1358
else
1359
trace_tlb_flush(TLB_REMOTE_SEND_IPI,
1360
(info->end - info->start) >> PAGE_SHIFT);
1361
1362
/*
1363
* If no page tables were freed, we can skip sending IPIs to
1364
* CPUs in lazy TLB mode. They will flush the CPU themselves
1365
* at the next context switch.
1366
*
1367
* However, if page tables are getting freed, we need to send the
1368
* IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
1369
* up on the new contents of what used to be page tables, while
1370
* doing a speculative memory access.
1371
*/
1372
if (info->freed_tables || mm_in_asid_transition(info->mm))
1373
on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
1374
else
1375
on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func,
1376
(void *)info, 1, cpumask);
1377
}
1378
1379
void flush_tlb_multi(const struct cpumask *cpumask,
1380
const struct flush_tlb_info *info)
1381
{
1382
__flush_tlb_multi(cpumask, info);
1383
}
1384
1385
/*
1386
* See Documentation/arch/x86/tlb.rst for details. We choose 33
1387
* because it is large enough to cover the vast majority (at
1388
* least 95%) of allocations, and is small enough that we are
1389
* confident it will not cause too much overhead. Each single
1390
* flush is about 100 ns, so this caps the maximum overhead at
1391
* _about_ 3,000 ns.
1392
*
1393
* This is in units of pages.
1394
*/
1395
unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
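/*
* The ceiling is tunable at run time via the debugfs file registered
* below, typically exposed as
* /sys/kernel/debug/x86/tlb_single_page_flush_ceiling (assuming
* debugfs is mounted at /sys/kernel/debug), e.g.:
*
* # echo 64 > /sys/kernel/debug/x86/tlb_single_page_flush_ceiling
*/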
1396
1397
static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
1398
1399
#ifdef CONFIG_DEBUG_VM
1400
static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
1401
#endif
1402
1403
static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
1404
unsigned long start, unsigned long end,
1405
unsigned int stride_shift, bool freed_tables,
1406
u64 new_tlb_gen)
1407
{
1408
struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);
1409
1410
#ifdef CONFIG_DEBUG_VM
1411
/*
1412
* Ensure that the following code is non-reentrant and flush_tlb_info
1413
* is not overwritten. This means no TLB flushing is initiated by
1414
* interrupt handlers and machine-check exception handlers.
1415
*/
1416
BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
1417
#endif
1418
1419
/*
1420
* If the number of flushes is so large that a full flush
1421
* would be faster, do a full flush.
1422
*/
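/*
* For example, with the default ceiling of 33 and 4k pages, a 256 KiB
* range spans 64 pages and is promoted to a full flush, while a 64 KiB
* range (16 pages) is flushed page by page.
*/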
1423
if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) {
1424
start = 0;
1425
end = TLB_FLUSH_ALL;
1426
}
1427
1428
info->start = start;
1429
info->end = end;
1430
info->mm = mm;
1431
info->stride_shift = stride_shift;
1432
info->freed_tables = freed_tables;
1433
info->new_tlb_gen = new_tlb_gen;
1434
info->initiating_cpu = smp_processor_id();
1435
info->trim_cpumask = 0;
1436
1437
return info;
1438
}
1439
1440
static void put_flush_tlb_info(void)
1441
{
1442
#ifdef CONFIG_DEBUG_VM
1443
/* Complete reentrancy prevention checks */
1444
barrier();
1445
this_cpu_dec(flush_tlb_info_idx);
1446
#endif
1447
}
1448
1449
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
1450
unsigned long end, unsigned int stride_shift,
1451
bool freed_tables)
1452
{
1453
struct flush_tlb_info *info;
1454
int cpu = get_cpu();
1455
u64 new_tlb_gen;
1456
1457
/* This is also a barrier that synchronizes with switch_mm(). */
1458
new_tlb_gen = inc_mm_tlb_gen(mm);
1459
1460
info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
1461
new_tlb_gen);
1462
1463
/*
1464
* flush_tlb_multi() is not optimized for the common case in which only
1465
* a local TLB flush is needed. Optimize this use-case by calling
1466
* flush_tlb_func() directly in this case.
1467
*/
1468
if (mm_global_asid(mm)) {
1469
broadcast_tlb_flush(info);
1470
} else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
1471
info->trim_cpumask = should_trim_cpumask(mm);
1472
flush_tlb_multi(mm_cpumask(mm), info);
1473
consider_global_asid(mm);
1474
} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
1475
lockdep_assert_irqs_enabled();
1476
local_irq_disable();
1477
flush_tlb_func(info);
1478
local_irq_enable();
1479
}
1480
1481
put_flush_tlb_info();
1482
put_cpu();
1483
mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
1484
}
1485
1486
static void do_flush_tlb_all(void *info)
1487
{
1488
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
1489
__flush_tlb_all();
1490
}
1491
1492
void flush_tlb_all(void)
1493
{
1494
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
1495
1496
/* First try (faster) hardware-assisted TLB invalidation. */
1497
if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
1498
invlpgb_flush_all();
1499
else
1500
/* Fall back to the IPI-based invalidation. */
1501
on_each_cpu(do_flush_tlb_all, NULL, 1);
1502
}
1503
1504
/* Flush an arbitrarily large range of memory with INVLPGB. */
1505
static void invlpgb_kernel_range_flush(struct flush_tlb_info *info)
1506
{
1507
unsigned long addr, nr;
1508
1509
for (addr = info->start; addr < info->end; addr += nr << PAGE_SHIFT) {
1510
nr = (info->end - addr) >> PAGE_SHIFT;
1511
1512
/*
1513
* INVLPGB has a limit on the size of ranges it can
1514
* flush. Break up large flushes.
1515
*/
1516
nr = clamp_val(nr, 1, invlpgb_count_max);
1517
1518
invlpgb_flush_addr_nosync(addr, nr);
1519
}
1520
__tlbsync();
1521
}
1522
1523
static void do_kernel_range_flush(void *info)
1524
{
1525
struct flush_tlb_info *f = info;
1526
unsigned long addr;
1527
1528
/* Flush the range one page at a time with 'invlpg'. */
1529
for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
1530
flush_tlb_one_kernel(addr);
1531
}
1532
1533
static void kernel_tlb_flush_all(struct flush_tlb_info *info)
1534
{
1535
if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
1536
invlpgb_flush_all();
1537
else
1538
on_each_cpu(do_flush_tlb_all, NULL, 1);
1539
}
1540
1541
static void kernel_tlb_flush_range(struct flush_tlb_info *info)
1542
{
1543
if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
1544
invlpgb_kernel_range_flush(info);
1545
else
1546
on_each_cpu(do_kernel_range_flush, info, 1);
1547
}
1548
1549
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
1550
{
1551
struct flush_tlb_info *info;
1552
1553
guard(preempt)();
1554
1555
info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
1556
TLB_GENERATION_INVALID);
1557
1558
if (info->end == TLB_FLUSH_ALL)
1559
kernel_tlb_flush_all(info);
1560
else
1561
kernel_tlb_flush_range(info);
1562
1563
put_flush_tlb_info();
1564
}
1565
1566
/*
1567
* This can be used from process context to figure out what the value of
1568
* CR3 is without needing to do a (slow) __read_cr3().
1569
*
1570
* It's intended to be used for code like KVM that sneakily changes CR3
1571
* and needs to restore it. It needs to be used very carefully.
1572
*/
1573
unsigned long __get_current_cr3_fast(void)
1574
{
1575
unsigned long cr3 =
1576
build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
1577
this_cpu_read(cpu_tlbstate.loaded_mm_asid),
1578
tlbstate_lam_cr3_mask());
1579
1580
/* For now, be very restrictive about when this can be called. */
1581
VM_WARN_ON(in_nmi() || preemptible());
1582
1583
VM_BUG_ON(cr3 != __read_cr3());
1584
return cr3;
1585
}
1586
EXPORT_SYMBOL_FOR_KVM(__get_current_cr3_fast);
1587
1588
/*
1589
* Flush one page in the kernel mapping
1590
*/
1591
void flush_tlb_one_kernel(unsigned long addr)
1592
{
1593
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
1594
1595
/*
1596
* If PTI is off, then __flush_tlb_one_user() is just INVLPG or its
1597
* paravirt equivalent. Even with PCID, this is sufficient: we only
1598
* use PCID if we also use global PTEs for the kernel mapping, and
1599
* INVLPG flushes global translations across all address spaces.
1600
*
1601
* If PTI is on, then the kernel is mapped with non-global PTEs, and
1602
* __flush_tlb_one_user() will flush the given address for the current
1603
* kernel address space and for its usermode counterpart, but it does
1604
* not flush it for other address spaces.
1605
*/
1606
flush_tlb_one_user(addr);
1607
1608
if (!static_cpu_has(X86_FEATURE_PTI))
1609
return;
1610
1611
/*
1612
* See above. We need to propagate the flush to all other address
1613
* spaces. In principle, we only need to propagate it to kernelmode
1614
* address spaces, but the extra bookkeeping we would need is not
1615
* worth it.
1616
*/
1617
this_cpu_write(cpu_tlbstate.invalidate_other, true);
1618
}
1619
1620
/*
1621
* Flush one page in the user mapping
1622
*/
1623
STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr)
1624
{
1625
u32 loaded_mm_asid;
1626
bool cpu_pcide;
1627
1628
/* Flush 'addr' from the kernel PCID: */
1629
invlpg(addr);
1630
1631
/* If PTI is off there is no user PCID and nothing to flush. */
1632
if (!static_cpu_has(X86_FEATURE_PTI))
1633
return;
1634
1635
loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
1636
cpu_pcide = this_cpu_read(cpu_tlbstate.cr4) & X86_CR4_PCIDE;
1637
1638
/*
1639
* invpcid_flush_one(pcid>0) will #GP if CR4.PCIDE==0. Check
1640
* 'cpu_pcide' to ensure that *this* CPU will not trigger those
1641
* #GP's even if called before CR4.PCIDE has been initialized.
1642
*/
1643
if (boot_cpu_has(X86_FEATURE_INVPCID) && cpu_pcide)
1644
invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
1645
else
1646
invalidate_user_asid(loaded_mm_asid);
1647
}
1648
1649
void flush_tlb_one_user(unsigned long addr)
1650
{
1651
__flush_tlb_one_user(addr);
1652
}
1653
1654
/*
1655
* Flush everything
1656
*/
1657
STATIC_NOPV void native_flush_tlb_global(void)
1658
{
1659
unsigned long flags;
1660
1661
if (static_cpu_has(X86_FEATURE_INVPCID)) {
1662
/*
1663
* Using INVPCID is considerably faster than a pair of writes
1664
* to CR4 sandwiched inside an IRQ flag save/restore.
1665
*
1666
* Note, this works with CR4.PCIDE=0 or 1.
1667
*/
1668
invpcid_flush_all();
1669
return;
1670
}
1671
1672
/*
1673
* Read-modify-write to CR4 - protect it from preemption and
1674
* from interrupts. (Use the raw variant because this code can
1675
* be called from deep inside debugging code.)
1676
*/
1677
raw_local_irq_save(flags);
1678
1679
__native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4));
1680
1681
raw_local_irq_restore(flags);
1682
}
1683
1684
/*
1685
* Flush the entire current user mapping
1686
*/
1687
STATIC_NOPV void native_flush_tlb_local(void)
1688
{
1689
/*
1690
* Preemption or interrupts must be disabled to protect the access
1691
* to the per CPU variable and to prevent being preempted between
1692
* read_cr3() and write_cr3().
1693
*/
1694
WARN_ON_ONCE(preemptible());
1695
1696
invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
1697
1698
/* If current->mm == NULL then the read_cr3() "borrows" an mm */
1699
native_write_cr3(__native_read_cr3());
1700
}
1701
1702
void flush_tlb_local(void)
1703
{
1704
__flush_tlb_local();
1705
}
1706
1707
/*
1708
* Flush everything
1709
*/
1710
void __flush_tlb_all(void)
1711
{
1712
/*
* This is to catch users with preemption enabled and the PGE feature,
* who would not trigger the warning in __native_flush_tlb().
*/
1716
VM_WARN_ON_ONCE(preemptible());
1717
1718
if (cpu_feature_enabled(X86_FEATURE_PGE)) {
1719
__flush_tlb_global();
1720
} else {
1721
/*
1722
* !PGE -> !PCID (setup_pcid()), thus every flush is total.
1723
*/
1724
flush_tlb_local();
1725
}
1726
}
1727
EXPORT_SYMBOL_FOR_KVM(__flush_tlb_all);
1728
1729
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
1730
{
1731
struct flush_tlb_info *info;
1732
1733
int cpu = get_cpu();
1734
1735
info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false,
1736
TLB_GENERATION_INVALID);
1737
/*
1738
* flush_tlb_multi() is not optimized for the common case in which only
1739
* a local TLB flush is needed. Optimize this use-case by calling
1740
* flush_tlb_func() directly in this case.
1741
*/
1742
if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->unmapped_pages) {
1743
invlpgb_flush_all_nonglobals();
1744
batch->unmapped_pages = false;
1745
} else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
1746
flush_tlb_multi(&batch->cpumask, info);
1747
} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
1748
lockdep_assert_irqs_enabled();
1749
local_irq_disable();
1750
flush_tlb_func(info);
1751
local_irq_enable();
1752
}
1753
1754
cpumask_clear(&batch->cpumask);
1755
1756
put_flush_tlb_info();
1757
put_cpu();
1758
}
1759
1760
/*
1761
* Blindly accessing user memory from NMI context can be dangerous
1762
* if we're in the middle of switching the current user task or
1763
* switching the loaded mm. It can also be dangerous if we
1764
* interrupted some kernel code that was temporarily using a
1765
* different mm.
1766
*/
1767
bool nmi_uaccess_okay(void)
1768
{
1769
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
1770
struct mm_struct *current_mm = current->mm;
1771
1772
VM_WARN_ON_ONCE(!loaded_mm);
1773
1774
/*
1775
* The condition we want to check is
1776
* current_mm->pgd == __va(read_cr3_pa()). This may be slow, though,
1777
* if we're running in a VM with shadow paging, and nmi_uaccess_okay()
1778
* is supposed to be reasonably fast.
1779
*
1780
* Instead, we check the almost equivalent but somewhat conservative
1781
* condition below, and we rely on the fact that switch_mm_irqs_off()
1782
* sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
1783
*/
1784
if (loaded_mm != current_mm)
1785
return false;
1786
1787
VM_WARN_ON_ONCE(__pa(current_mm->pgd) != read_cr3_pa());
1788
1789
return true;
1790
}
1791
1792
static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
1793
size_t count, loff_t *ppos)
1794
{
1795
char buf[32];
1796
unsigned int len;
1797
1798
len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling);
1799
return simple_read_from_buffer(user_buf, count, ppos, buf, len);
1800
}
1801
1802
static ssize_t tlbflush_write_file(struct file *file,
1803
const char __user *user_buf, size_t count, loff_t *ppos)
1804
{
1805
char buf[32];
1806
ssize_t len;
1807
int ceiling;
1808
1809
len = min(count, sizeof(buf) - 1);
1810
if (copy_from_user(buf, user_buf, len))
1811
return -EFAULT;
1812
1813
buf[len] = '\0';
1814
if (kstrtoint(buf, 0, &ceiling))
1815
return -EINVAL;
1816
1817
if (ceiling < 0)
1818
return -EINVAL;
1819
1820
tlb_single_page_flush_ceiling = ceiling;
1821
return count;
1822
}
1823
1824
static const struct file_operations fops_tlbflush = {
1825
.read = tlbflush_read_file,
1826
.write = tlbflush_write_file,
1827
.llseek = default_llseek,
1828
};
1829
1830
static int __init create_tlb_single_page_flush_ceiling(void)
1831
{
1832
debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
1833
arch_debugfs_dir, NULL, &fops_tlbflush);
1834
return 0;
1835
}
1836
late_initcall(create_tlb_single_page_flush_ceiling);