CoCalc -- tlb.c

GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/x86/mm/tlb.c
¹⁰⁸¹⁷ views
1
#include <linux/init.h>
2

3
#include <linux/mm.h>
4
#include <linux/spinlock.h>
5
#include <linux/smp.h>
6
#include <linux/interrupt.h>
7
#include <linux/module.h>
8
#include <linux/cpu.h>
9

10
#include <asm/tlbflush.h>
11
#include <asm/mmu_context.h>
12
#include <asm/cache.h>
13
#include <asm/apic.h>
14
#include <asm/uv/uv.h>
15

16
/*
 * Per-CPU TLB bookkeeping.  Each CPU starts out with init_mm as its active
 * mm and a state value of 0.
 */
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
	.active_mm	= &init_mm,
	.state		= 0,
};
18

19
/*
20
 *	Smarter SMP flushing macros.
21
 *		c/o Linus Torvalds.
22
 *
23
 *	These mean you can really definitely utterly forget about
24
 *	writing to user space from interrupts. (It's not allowed anyway).
25
 *
26
 *	Optimizations Manfred Spraul <[email protected]>
27
 *
28
 *	More scalable flush, from Andi Kleen
29
 *
30
 *	To avoid global state use 8 different call vectors.
31
 *	Each CPU uses a specific vector to trigger flushes on other
32
 *	CPUs. Depending on the received vector the target CPUs look into
33
 *	the right array slot for the flush data.
34
 *
35
 *	With more than 8 CPUs they are hashed to the 8 available
36
 *	vectors. The limited global vector space forces us to this right now.
37
 *	In future when interrupts are split into per CPU domains this could be
38
 *	fixed, at the cost of triggering multiple IPIs in some cases.
39
 */
40

41
union smp_flush_state {
42
	struct {
43
		struct mm_struct *flush_mm;
44
		unsigned long flush_va;
45
		raw_spinlock_t tlbstate_lock;
46
		DECLARE_BITMAP(flush_cpumask, NR_CPUS);
47
	};
48
	char pad[INTERNODE_CACHE_BYTES];
49
} ____cacheline_internodealigned_in_smp;
50

51
/* State is kept in a static array with one entry per invalidate vector.
   Each entry is padded to a full cache line because other CPUs can
   access it and we don't want false sharing between entries. */
54
/*
 * Indexed by (vector - INVALIDATE_TLB_VECTOR_START): senders pick their
 * slot via tlb_vector_offset, receivers derive it from the vector they
 * were interrupted on.
 */
static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
55

56
/*
 * Per-CPU index of the flush_state slot (and offset from
 * INVALIDATE_TLB_VECTOR_START) this CPU uses when sending a flush;
 * assigned by calculate_tlb_offset().
 */
static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);
57

58
/*
59
 * We cannot call mmdrop() because we are in interrupt context,
60
 * instead update mm->cpu_vm_mask.
61
 */
62
void leave_mm(int cpu)
63
{
64
	if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
65
		BUG();
66
	cpumask_clear_cpu(cpu,
67
			  mm_cpumask(percpu_read(cpu_tlbstate.active_mm)));
68
	load_cr3(swapper_pg_dir);
69
}
70
EXPORT_SYMBOL_GPL(leave_mm);
71

72
/*
73
 *
74
 * The flush IPI assumes that a thread switch happens in this order:
75
 * [cpu0: the cpu that switches]
76
 * 1) switch_mm() either 1a) or 1b)
77
 * 1a) thread switch to a different mm
78
 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
79
 *	Stop ipi delivery for the old mm. This is not synchronized with
80
 *	the other cpus, but smp_invalidate_interrupt ignores flush ipis
81
 *	for the wrong mm, and in the worst case we perform a superfluous
82
 *	tlb flush.
83
 * 1a2) set cpu mmu_state to TLBSTATE_OK
84
 *	Now the smp_invalidate_interrupt won't call leave_mm if cpu0
85
 *	was in lazy tlb mode.
86
 * 1a3) update cpu active_mm
87
 *	Now cpu0 accepts tlb flushes for the new mm.
88
 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
89
 *	Now the other cpus will send tlb flush ipis.
90
 * 1a5) change cr3.
91
 * 1b) thread switch without mm change
92
 *	cpu active_mm is correct, cpu0 already handles
93
 *	flush ipis.
94
 * 1b1) set cpu mmu_state to TLBSTATE_OK
95
 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
96
 *	Atomically set the bit [other cpus will start sending flush ipis],
97
 *	and test the bit.
98
 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
99
 * 2) switch %%esp, ie current
100
 *
101
 * The interrupt must handle 2 special cases:
102
 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
103
 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
104
 *   runs in kernel space, the cpu could load tlb entries for user space
105
 *   pages.
106
 *
107
 * The good news is that cpu mmu_state is local to each cpu, no
108
 * write/read ordering problems.
109
 */
110

111
/*
112
 * TLB flush IPI:
113
 *
114
 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
115
 * 2) Leave the mm if we are in the lazy tlb mode.
116
 *
117
 * Interrupts are disabled.
118
 */
119

120
/*
121
 * FIXME: use of asmlinkage is not consistent.  On x86_64 it's noop
122
 * but still used for documentation purpose but the usage is slightly
123
 * inconsistent.  On x86_32, asmlinkage is regparm(0) but interrupt
124
 * entry calls in with the first parameter in %eax.  Maybe define
125
 * intrlinkage?
126
 */
127
#ifdef CONFIG_X86_64
128
asmlinkage
129
#endif
130
void smp_invalidate_interrupt(struct pt_regs *regs)
131
{
132
	unsigned int cpu;
133
	unsigned int sender;
134
	union smp_flush_state *f;
135

136
	cpu = smp_processor_id();
137
	/*
138
	 * orig_rax contains the negated interrupt vector.
139
	 * Use that to determine where the sender put the data.
140
	 */
141
	sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
142
	f = &flush_state[sender];
143

144
	if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask)))
145
		goto out;
146
		/*
147
		 * This was a BUG() but until someone can quote me the
148
		 * line from the intel manual that guarantees an IPI to
149
		 * multiple CPUs is retried _only_ on the erroring CPUs
150
		 * its staying as a return
151
		 *
152
		 * BUG();
153
		 */
154

155
	if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
156
		if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
157
			if (f->flush_va == TLB_FLUSH_ALL)
158
				local_flush_tlb();
159
			else
160
				__flush_tlb_one(f->flush_va);
161
		} else
162
			leave_mm(cpu);
163
	}
164
out:
165
	ack_APIC_irq();
166
	smp_mb__before_clear_bit();
167
	cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask));
168
	smp_mb__after_clear_bit();
169
	inc_irq_stat(irq_tlb_count);
170
}
171

172
static void flush_tlb_others_ipi(const struct cpumask *cpumask,
173
				 struct mm_struct *mm, unsigned long va)
174
{
175
	unsigned int sender;
176
	union smp_flush_state *f;
177

178
	/* Caller has disabled preemption */
179
	sender = this_cpu_read(tlb_vector_offset);
180
	f = &flush_state[sender];
181

182
	if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
183
		raw_spin_lock(&f->tlbstate_lock);
184

185
	f->flush_mm = mm;
186
	f->flush_va = va;
187
	if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) {
188
		/*
189
		 * We have to send the IPI only to
190
		 * CPUs affected.
191
		 */
192
		apic->send_IPI_mask(to_cpumask(f->flush_cpumask),
193
			      INVALIDATE_TLB_VECTOR_START + sender);
194

195
		while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
196
			cpu_relax();
197
	}
198

199
	f->flush_mm = NULL;
200
	f->flush_va = 0;
201
	if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
202
		raw_spin_unlock(&f->tlbstate_lock);
203
}
204

205
/*
 * Flush @va (or everything, for TLB_FLUSH_ALL) of @mm on the CPUs in
 * @cpumask.
 *
 * On UV systems uv_flush_tlb_others() gets the first shot; if it returns
 * a non-NULL mask, that mask is then handled through the IPI path.  A
 * NULL return means there is nothing left to do.
 */
void native_flush_tlb_others(const struct cpumask *cpumask,
			     struct mm_struct *mm, unsigned long va)
{
	if (is_uv_system()) {
		cpumask = uv_flush_tlb_others(cpumask, mm, va,
					      smp_processor_id());
		if (!cpumask)
			return;
	}

	flush_tlb_others_ipi(cpumask, mm, va);
}
219

220
/*
 * Assign every CPU the invalidate-vector offset it will use as a sender.
 *
 * The available vectors are divided evenly between the online nodes (one
 * vector per node, reused modulo the vector count, if there are more
 * nodes than vectors).  Within a node, CPUs are handed the node's vectors
 * round-robin.
 */
static void __cpuinit calculate_tlb_offset(void)
{
	int vecs_per_node = 1;
	int node_idx = 0;
	int node, cpu;

	/*
	 * We are changing tlb_vector_offset for each CPU at runtime, but
	 * this will not cause inconsistency, as the write is atomic on x86.
	 * We might see more lock contention for a short time, but once all
	 * CPUs' tlb_vector_offset values have been updated everything
	 * should be back to normal.
	 *
	 * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes != 0, we
	 * might waste some vectors.
	 */
	if (nr_online_nodes <= NUM_INVALIDATE_TLB_VECTORS)
		vecs_per_node = NUM_INVALIDATE_TLB_VECTORS / nr_online_nodes;

	for_each_online_node(node) {
		/* First vector offset belonging to this node. */
		int base = (node_idx % NUM_INVALIDATE_TLB_VECTORS) *
			   vecs_per_node;
		/* Round-robin position within the node's vectors. */
		int rr = 0;

		for_each_cpu(cpu, cpumask_of_node(node)) {
			per_cpu(tlb_vector_offset, cpu) = base + rr;
			rr = (rr + 1) % vecs_per_node;
		}

		node_idx++;
	}
}
250

251
/*
 * CPU hotplug callback: redistribute the invalidate vectors whenever a CPU
 * has come online or has died.  Always returns NOTIFY_OK.
 */
static int __cpuinit tlb_cpuhp_notify(struct notifier_block *n,
		unsigned long action, void *hcpu)
{
	/*
	 * Only the low four bits identify the event here.
	 * NOTE(review): presumably this strips the *_FROZEN modifier --
	 * confirm against the CPU notifier definitions.
	 */
	unsigned long event = action & 0xf;

	if (event == CPU_ONLINE || event == CPU_DEAD)
		calculate_tlb_offset();

	return NOTIFY_OK;
}
261

262
/*
 * Boot-time setup for the flush IPI machinery: initialise the lock of
 * every flush_state slot, hand out the initial vector offsets and register
 * for CPU hotplug events so the offsets get recomputed later on.
 *
 * Always returns 0.
 */
static int __cpuinit init_smp_flush(void)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(flush_state); i++) {
		raw_spin_lock_init(&flush_state[i].tlbstate_lock);
	}

	calculate_tlb_offset();
	hotcpu_notifier(tlb_cpuhp_notify, 0);

	return 0;
}
core_initcall(init_smp_flush);
274

275
void flush_tlb_current_task(void)
276
{
277
	struct mm_struct *mm = current->mm;
278

279
	preempt_disable();
280

281
	local_flush_tlb();
282
	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
283
		flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL);
284
	preempt_enable();
285
}
286

287
void flush_tlb_mm(struct mm_struct *mm)
288
{
289
	preempt_disable();
290

291
	if (current->active_mm == mm) {
292
		if (current->mm)
293
			local_flush_tlb();
294
		else
295
			leave_mm(smp_processor_id());
296
	}
297
	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
298
		flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL);
299

300
	preempt_enable();
301
}
302

303
void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
304
{
305
	struct mm_struct *mm = vma->vm_mm;
306

307
	preempt_disable();
308

309
	if (current->active_mm == mm) {
310
		if (current->mm)
311
			__flush_tlb_one(va);
312
		else
313
			leave_mm(smp_processor_id());
314
	}
315

316
	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
317
		flush_tlb_others(mm_cpumask(mm), mm, va);
318

319
	preempt_enable();
320
}
321

322
static void do_flush_tlb_all(void *info)
323
{
324
	__flush_tlb_all();
325
	if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
326
		leave_mm(smp_processor_id());
327
}
328

329
void flush_tlb_all(void)
330
{
331
	on_each_cpu(do_flush_tlb_all, NULL, 1);
332
}
333

334
Product

Resources

Company