CoCalc -- kmmio.c

GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/x86/mm/kmmio.c
¹⁷⁶⁵² views
1
/* Support for MMIO probes.
2
 * Benefits from a lot of kprobes code
3
 * (C) 2002 Louis Zhuang <[email protected]>.
4
 *     2007 Alexander Eichner
5
 *     2008 Pekka Paalanen <[email protected]>
6
 */
7

8
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9

10
#include <linux/list.h>
11
#include <linux/rculist.h>
12
#include <linux/spinlock.h>
13
#include <linux/hash.h>
14
#include <linux/init.h>
15
#include <linux/module.h>
16
#include <linux/kernel.h>
17
#include <linux/uaccess.h>
18
#include <linux/ptrace.h>
19
#include <linux/preempt.h>
20
#include <linux/percpu.h>
21
#include <linux/kdebug.h>
22
#include <linux/mutex.h>
23
#include <linux/io.h>
24
#include <linux/slab.h>
25
#include <asm/cacheflush.h>
26
#include <asm/tlbflush.h>
27
#include <linux/errno.h>
28
#include <asm/debugreg.h>
29
#include <linux/mmiotrace.h>
30

31
/* Number of hash bits used to pick a bucket in the fault-page hash table. */
#define KMMIO_PAGE_HASH_BITS 4
32
/* Number of buckets in the fault-page hash table: 2^KMMIO_PAGE_HASH_BITS. */
#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)
33

34
struct kmmio_fault_page {
35
	struct list_head list;
36
	struct kmmio_fault_page *release_next;
37
	unsigned long page; /* location of the fault page */
38
	pteval_t old_presence; /* page presence prior to arming */
39
	bool armed;
40

41
	/*
42
	 * Number of times this page has been registered as a part
43
	 * of a probe. If zero, page is disarmed and this may be freed.
44
	 * Used only by writers (RCU) and post_kmmio_handler().
45
	 * Protected by kmmio_lock, when linked into kmmio_page_table.
46
	 */
47
	int count;
48

49
	bool scheduled_for_release;
50
};
51

52
struct kmmio_delayed_release {
53
	struct rcu_head rcu;
54
	struct kmmio_fault_page *release_list;
55
};
56

57
/* State of the probe hit that is currently being single-stepped on a CPU. */
struct kmmio_context {
	struct kmmio_fault_page *fpage;	/* page disarmed for the single step */
	struct kmmio_probe *probe;	/* probe covering addr; may be NULL */
	unsigned long addr;		/* faulting address of the hit */
	unsigned long saved_flags;	/* TF and IF bits of regs->flags at the fault */
	int active;			/* non-zero while a hit is in flight */
};
64

65
/* Serializes writers of the probe list and of the fault-page hash table. */
static DEFINE_SPINLOCK(kmmio_lock);
66

67
/* Protected by kmmio_lock */
68
/* Count of registered probes: bumped on register, dropped on unregister. */
unsigned int kmmio_count;
69

70
/* Read-protected by RCU, write-protected by kmmio_lock. */
71
/* Hash buckets of struct kmmio_fault_page; kmmio_page_list() picks the bucket. */
static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
72
/* All registered struct kmmio_probe, linked through their ->list member. */
static LIST_HEAD(kmmio_probes);
73

74
static struct list_head *kmmio_page_list(unsigned long page)
75
{
76
	return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)];
77
}
78

79
/* Accessed per-cpu */
80
/* Filled in by kmmio_handler() and consumed by post_kmmio_handler(). */
static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);
81

82
/*
83
 * this is basically a dynamic stabbing problem:
84
 * Could use the existing prio tree code or
85
 * Possible better implementations:
86
 * The Interval Skip List: A Data Structure for Finding All Intervals That
87
 * Overlap a Point (might be simple)
88
 * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
89
 */
90
/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */
91
static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
92
{
93
	struct kmmio_probe *p;
94
	list_for_each_entry_rcu(p, &kmmio_probes, list) {
95
		if (addr >= p->addr && addr < (p->addr + p->len))
96
			return p;
97
	}
98
	return NULL;
99
}
100

101
/* You must be holding RCU read lock. */
102
static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
103
{
104
	struct list_head *head;
105
	struct kmmio_fault_page *f;
106

107
	page &= PAGE_MASK;
108
	head = kmmio_page_list(page);
109
	list_for_each_entry_rcu(f, head, list) {
110
		if (f->page == page)
111
			return f;
112
	}
113
	return NULL;
114
}
115

116
/*
 * Toggle the present bit of a PMD entry.
 * clear == true:  store the current present bit in *old, then clear it.
 * clear == false: OR the stored bit back into the entry.
 */
static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
{
	pmdval_t val = pmd_val(*pmd);

	if (!clear) {
		/* presume this has been called with clear==true previously */
		val |= *old;
	} else {
		*old = val & _PAGE_PRESENT;
		val &= ~_PAGE_PRESENT;
	}
	set_pmd(pmd, __pmd(val));
}
126

127
/*
 * Toggle the present bit of a PTE.
 * clear == true:  store the current present bit in *old, then clear it.
 * clear == false: OR the stored bit back into the entry.
 */
static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
{
	pteval_t val = pte_val(*pte);

	if (!clear) {
		/* presume this has been called with clear==true previously */
		val |= *old;
	} else {
		*old = val & _PAGE_PRESENT;
		val &= ~_PAGE_PRESENT;
	}
	set_pte_atomic(pte, __pte(val));
}
137

138
/*
 * Clear (clear == true) or restore (clear == false) the present bit of the
 * mapping for f->page, then flush that page's TLB entry.
 * Returns 0 on success, -1 when there is no page table entry or the mapping
 * is neither a 4K nor a 2M page; in both failure cases nothing is modified.
 */
static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
{
	unsigned int level;
	pte_t *entry = lookup_address(f->page, &level);

	if (!entry) {
		pr_err("no pte for page 0x%08lx\n", f->page);
		return -1;
	}

	if (level == PG_LEVEL_2M) {
		/* At this level the returned pointer is handled as a PMD. */
		clear_pmd_presence((pmd_t *)entry, clear, &f->old_presence);
	} else if (level == PG_LEVEL_4K) {
		clear_pte_presence(entry, clear, &f->old_presence);
	} else {
		pr_err("unexpected page level 0x%x.\n", level);
		return -1;
	}

	__flush_tlb_one(f->page);
	return 0;
}
163

164
/*
165
 * Mark the given page as not present. Access to it will trigger a fault.
166
 *
167
 * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the
168
 * protection is ignored here. RCU read lock is assumed held, so the struct
169
 * will not disappear unexpectedly. Furthermore, the caller must guarantee,
170
 * that double arming the same virtual address (page) cannot occur.
171
 *
172
 * Double disarming on the other hand is allowed, and may occur when a fault
173
 * and mmiotrace shutdown happen simultaneously.
174
 */
175
static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
176
{
177
	int ret;
178
	WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n"));
179
	if (f->armed) {
180
		pr_warning("double-arm: page 0x%08lx, ref %d, old %d\n",
181
			   f->page, f->count, !!f->old_presence);
182
	}
183
	ret = clear_page_presence(f, true);
184
	WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming 0x%08lx failed.\n"),
185
		  f->page);
186
	f->armed = true;
187
	return ret;
188
}
189

190
/** Restore the given page to saved presence state. */
191
static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
192
{
193
	int ret = clear_page_presence(f, false);
194
	WARN_ONCE(ret < 0,
195
			KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page);
196
	f->armed = false;
197
}
198

199
/*
200
 * This is being called from do_page_fault().
201
 *
202
 * We may be in an interrupt or a critical section. Also prefetching may
203
 * trigger a page fault. We may be in the middle of process switch.
204
 * We cannot take any locks, because we could be executing especially
205
 * within a kmmio critical section.
206
 *
207
 * Local interrupts are disabled, so preemption cannot happen.
208
 * Do not enable interrupts, do not sleep, and watch out for other CPUs.
209
 */
210
/*
211
 * Interrupts are disabled on entry as the page fault trap is an interrupt gate
212
 * and they remain disabled throughout this function.
213
 */
214
int kmmio_handler(struct pt_regs *regs, unsigned long addr)
215
{
216
	struct kmmio_context *ctx;
217
	struct kmmio_fault_page *faultpage;
218
	int ret = 0; /* default to fault not handled */
219

220
	/*
221
	 * Preemption is now disabled to prevent process switch during
222
	 * single stepping. We can only handle one active kmmio trace
223
	 * per cpu, so ensure that we finish it before something else
224
	 * gets to run. We also hold the RCU read lock over single
225
	 * stepping to avoid looking up the probe and kmmio_fault_page
226
	 * again.
227
	 */
228
	preempt_disable();
229
	rcu_read_lock();
230

231
	faultpage = get_kmmio_fault_page(addr);
232
	if (!faultpage) {
233
		/*
234
		 * Either this page fault is not caused by kmmio, or
235
		 * another CPU just pulled the kmmio probe from under
236
		 * our feet. The latter case should not be possible.
237
		 */
238
		goto no_kmmio;
239
	}
240

241
	ctx = &get_cpu_var(kmmio_ctx);
242
	if (ctx->active) {
243
		if (addr == ctx->addr) {
244
			/*
245
			 * A second fault on the same page means some other
246
			 * condition needs handling by do_page_fault(), the
247
			 * page really not being present is the most common.
248
			 */
249
			pr_debug("secondary hit for 0x%08lx CPU %d.\n",
250
				 addr, smp_processor_id());
251

252
			if (!faultpage->old_presence)
253
				pr_info("unexpected secondary hit for address 0x%08lx on CPU %d.\n",
254
					addr, smp_processor_id());
255
		} else {
256
			/*
257
			 * Prevent overwriting already in-flight context.
258
			 * This should not happen, let's hope disarming at
259
			 * least prevents a panic.
260
			 */
261
			pr_emerg("recursive probe hit on CPU %d, for address 0x%08lx. Ignoring.\n",
262
				 smp_processor_id(), addr);
263
			pr_emerg("previous hit was at 0x%08lx.\n", ctx->addr);
264
			disarm_kmmio_fault_page(faultpage);
265
		}
266
		goto no_kmmio_ctx;
267
	}
268
	ctx->active++;
269

270
	ctx->fpage = faultpage;
271
	ctx->probe = get_kmmio_probe(addr);
272
	ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
273
	ctx->addr = addr;
274

275
	if (ctx->probe && ctx->probe->pre_handler)
276
		ctx->probe->pre_handler(ctx->probe, regs, addr);
277

278
	/*
279
	 * Enable single-stepping and disable interrupts for the faulting
280
	 * context. Local interrupts must not get enabled during stepping.
281
	 */
282
	regs->flags |= X86_EFLAGS_TF;
283
	regs->flags &= ~X86_EFLAGS_IF;
284

285
	/* Now we set present bit in PTE and single step. */
286
	disarm_kmmio_fault_page(ctx->fpage);
287

288
	/*
289
	 * If another cpu accesses the same page while we are stepping,
290
	 * the access will not be caught. It will simply succeed and the
291
	 * only downside is we lose the event. If this becomes a problem,
292
	 * the user should drop to single cpu before tracing.
293
	 */
294

295
	put_cpu_var(kmmio_ctx);
296
	return 1; /* fault handled */
297

298
no_kmmio_ctx:
299
	put_cpu_var(kmmio_ctx);
300
no_kmmio:
301
	rcu_read_unlock();
302
	preempt_enable_no_resched();
303
	return ret;
304
}
305

306
/*
307
 * Interrupts are disabled on entry as trap1 is an interrupt gate
308
 * and they remain disabled throughout this function.
309
 * This must always get called as the pair to kmmio_handler().
310
 */
311
static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
312
{
313
	int ret = 0;
314
	struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
315

316
	if (!ctx->active) {
317
		/*
318
		 * debug traps without an active context are due to either
319
		 * something external causing them (f.e. using a debugger while
320
		 * mmio tracing enabled), or erroneous behaviour
321
		 */
322
		pr_warning("unexpected debug trap on CPU %d.\n",
323
			   smp_processor_id());
324
		goto out;
325
	}
326

327
	if (ctx->probe && ctx->probe->post_handler)
328
		ctx->probe->post_handler(ctx->probe, condition, regs);
329

330
	/* Prevent racing against release_kmmio_fault_page(). */
331
	spin_lock(&kmmio_lock);
332
	if (ctx->fpage->count)
333
		arm_kmmio_fault_page(ctx->fpage);
334
	spin_unlock(&kmmio_lock);
335

336
	regs->flags &= ~X86_EFLAGS_TF;
337
	regs->flags |= ctx->saved_flags;
338

339
	/* These were acquired in kmmio_handler(). */
340
	ctx->active--;
341
	BUG_ON(ctx->active);
342
	rcu_read_unlock();
343
	preempt_enable_no_resched();
344

345
	/*
346
	 * if somebody else is singlestepping across a probe point, flags
347
	 * will have TF set, in which case, continue the remaining processing
348
	 * of do_debug, as if this is not a probe hit.
349
	 */
350
	if (!(regs->flags & X86_EFLAGS_TF))
351
		ret = 1;
352
out:
353
	put_cpu_var(kmmio_ctx);
354
	return ret;
355
}
356

357
/* You must be holding kmmio_lock. */
358
static int add_kmmio_fault_page(unsigned long page)
359
{
360
	struct kmmio_fault_page *f;
361

362
	page &= PAGE_MASK;
363
	f = get_kmmio_fault_page(page);
364
	if (f) {
365
		if (!f->count)
366
			arm_kmmio_fault_page(f);
367
		f->count++;
368
		return 0;
369
	}
370

371
	f = kzalloc(sizeof(*f), GFP_ATOMIC);
372
	if (!f)
373
		return -1;
374

375
	f->count = 1;
376
	f->page = page;
377

378
	if (arm_kmmio_fault_page(f)) {
379
		kfree(f);
380
		return -1;
381
	}
382

383
	list_add_rcu(&f->list, kmmio_page_list(f->page));
384

385
	return 0;
386
}
387

388
/* You must be holding kmmio_lock. */
389
static void release_kmmio_fault_page(unsigned long page,
390
				struct kmmio_fault_page **release_list)
391
{
392
	struct kmmio_fault_page *f;
393

394
	page &= PAGE_MASK;
395
	f = get_kmmio_fault_page(page);
396
	if (!f)
397
		return;
398

399
	f->count--;
400
	BUG_ON(f->count < 0);
401
	if (!f->count) {
402
		disarm_kmmio_fault_page(f);
403
		if (!f->scheduled_for_release) {
404
			f->release_next = *release_list;
405
			*release_list = f;
406
			f->scheduled_for_release = true;
407
		}
408
	}
409
}
410

411
/*
412
 * With page-unaligned ioremaps, one or two armed pages may contain
413
 * addresses from outside the intended mapping. Events for these addresses
414
 * are currently silently dropped. The events may result only from programming
415
 * mistakes by accessing addresses before the beginning or past the end of a
416
 * mapping.
417
 */
418
int register_kmmio_probe(struct kmmio_probe *p)
419
{
420
	unsigned long flags;
421
	int ret = 0;
422
	unsigned long size = 0;
423
	const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
424

425
	spin_lock_irqsave(&kmmio_lock, flags);
426
	if (get_kmmio_probe(p->addr)) {
427
		ret = -EEXIST;
428
		goto out;
429
	}
430
	kmmio_count++;
431
	list_add_rcu(&p->list, &kmmio_probes);
432
	while (size < size_lim) {
433
		if (add_kmmio_fault_page(p->addr + size))
434
			pr_err("Unable to set page fault.\n");
435
		size += PAGE_SIZE;
436
	}
437
out:
438
	spin_unlock_irqrestore(&kmmio_lock, flags);
439
	/*
440
	 * XXX: What should I do here?
441
	 * Here was a call to global_flush_tlb(), but it does not exist
442
	 * anymore. It seems it's not needed after all.
443
	 */
444
	return ret;
445
}
446
EXPORT_SYMBOL(register_kmmio_probe);
447

448
static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
449
{
450
	struct kmmio_delayed_release *dr = container_of(
451
						head,
452
						struct kmmio_delayed_release,
453
						rcu);
454
	struct kmmio_fault_page *f = dr->release_list;
455
	while (f) {
456
		struct kmmio_fault_page *next = f->release_next;
457
		BUG_ON(f->count);
458
		kfree(f);
459
		f = next;
460
	}
461
	kfree(dr);
462
}
463

464
static void remove_kmmio_fault_pages(struct rcu_head *head)
465
{
466
	struct kmmio_delayed_release *dr =
467
		container_of(head, struct kmmio_delayed_release, rcu);
468
	struct kmmio_fault_page *f = dr->release_list;
469
	struct kmmio_fault_page **prevp = &dr->release_list;
470
	unsigned long flags;
471

472
	spin_lock_irqsave(&kmmio_lock, flags);
473
	while (f) {
474
		if (!f->count) {
475
			list_del_rcu(&f->list);
476
			prevp = &f->release_next;
477
		} else {
478
			*prevp = f->release_next;
479
			f->release_next = NULL;
480
			f->scheduled_for_release = false;
481
		}
482
		f = *prevp;
483
	}
484
	spin_unlock_irqrestore(&kmmio_lock, flags);
485

486
	/* This is the real RCU destroy call. */
487
	call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
488
}
489

490
/*
491
 * Remove a kmmio probe. You have to synchronize_rcu() before you can be
492
 * sure that the callbacks will not be called anymore. Only after that
493
 * you may actually release your struct kmmio_probe.
494
 *
495
 * Unregistering a kmmio fault page has three steps:
496
 * 1. release_kmmio_fault_page()
497
 *    Disarm the page, wait a grace period to let all faults finish.
498
 * 2. remove_kmmio_fault_pages()
499
 *    Remove the pages from kmmio_page_table.
500
 * 3. rcu_free_kmmio_fault_pages()
501
 *    Actually free the kmmio_fault_page structs as with RCU.
502
 */
503
void unregister_kmmio_probe(struct kmmio_probe *p)
504
{
505
	unsigned long flags;
506
	unsigned long size = 0;
507
	const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
508
	struct kmmio_fault_page *release_list = NULL;
509
	struct kmmio_delayed_release *drelease;
510

511
	spin_lock_irqsave(&kmmio_lock, flags);
512
	while (size < size_lim) {
513
		release_kmmio_fault_page(p->addr + size, &release_list);
514
		size += PAGE_SIZE;
515
	}
516
	list_del_rcu(&p->list);
517
	kmmio_count--;
518
	spin_unlock_irqrestore(&kmmio_lock, flags);
519

520
	if (!release_list)
521
		return;
522

523
	drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
524
	if (!drelease) {
525
		pr_crit("leaking kmmio_fault_page objects.\n");
526
		return;
527
	}
528
	drelease->release_list = release_list;
529

530
	/*
531
	 * This is not really RCU here. We have just disarmed a set of
532
	 * pages so that they cannot trigger page faults anymore. However,
533
	 * we cannot remove the pages from kmmio_page_table,
534
	 * because a probe hit might be in flight on another CPU. The
535
	 * pages are collected into a list, and they will be removed from
536
	 * kmmio_page_table when it is certain that no probe hit related to
537
	 * these pages can be in flight. RCU grace period sounds like a
538
	 * good choice.
539
	 *
540
	 * If we removed the pages too early, kmmio page fault handler might
541
	 * not find the respective kmmio_fault_page and determine it's not
542
	 * a kmmio fault, when it actually is. This would lead to madness.
543
	 */
544
	call_rcu(&drelease->rcu, remove_kmmio_fault_pages);
545
}
546
EXPORT_SYMBOL(unregister_kmmio_probe);
547

548
static int
549
kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
550
{
551
	struct die_args *arg = args;
552
	unsigned long* dr6_p = (unsigned long *)ERR_PTR(arg->err);
553

554
	if (val == DIE_DEBUG && (*dr6_p & DR_STEP))
555
		if (post_kmmio_handler(*dr6_p, arg->regs) == 1) {
556
			/*
557
			 * Reset the BS bit in dr6 (pointed by args->err) to
558
			 * denote completion of processing
559
			 */
560
			*dr6_p &= ~DR_STEP;
561
			return NOTIFY_STOP;
562
		}
563

564
	return NOTIFY_DONE;
565
}
566

567
static struct notifier_block nb_die = {
568
	.notifier_call = kmmio_die_notifier
569
};
570

571
int kmmio_init(void)
572
{
573
	int i;
574

575
	for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
576
		INIT_LIST_HEAD(&kmmio_page_table[i]);
577

578
	return register_die_notifier(&nb_die);
579
}
580

581
void kmmio_cleanup(void)
582
{
583
	int i;
584

585
	unregister_die_notifier(&nb_die);
586
	for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) {
587
		WARN_ONCE(!list_empty(&kmmio_page_table[i]),
588
			KERN_ERR "kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n");
589
	}
590
}
591

592
Product

Resources

Company