GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/x86/kernel/cpu/perf_event.c
1
/*
2
* Performance events x86 architecture code
3
*
4
* Copyright (C) 2008 Thomas Gleixner <[email protected]>
5
* Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6
* Copyright (C) 2009 Jaswinder Singh Rajput
7
* Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
8
* Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <[email protected]>
9
* Copyright (C) 2009 Intel Corporation, <[email protected]>
10
* Copyright (C) 2009 Google, Inc., Stephane Eranian
11
*
12
* For licencing details see kernel-base/COPYING
13
*/
14
15
#include <linux/perf_event.h>
16
#include <linux/capability.h>
17
#include <linux/notifier.h>
18
#include <linux/hardirq.h>
19
#include <linux/kprobes.h>
20
#include <linux/module.h>
21
#include <linux/kdebug.h>
22
#include <linux/sched.h>
23
#include <linux/uaccess.h>
24
#include <linux/slab.h>
25
#include <linux/highmem.h>
26
#include <linux/cpu.h>
27
#include <linux/bitops.h>
28
29
#include <asm/apic.h>
30
#include <asm/stacktrace.h>
31
#include <asm/nmi.h>
32
#include <asm/compat.h>
33
#include <asm/smp.h>
34
#include <asm/alternative.h>
35
36
#if 0
37
#undef wrmsrl
38
#define wrmsrl(msr, val) \
39
do { \
40
trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
41
(unsigned long)(val)); \
42
native_write_msr((msr), (u32)((u64)(val)), \
43
(u32)((u64)(val) >> 32)); \
44
} while (0)
45
#endif
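/*
* Debug aid: flipping the "#if 0" above to "#if 1" re-defines wrmsrl() for
* this file so that every MSR write is logged via trace_printk() before
* being issued through native_write_msr().
*/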
46
47
/*
48
* best effort, GUP based copy_from_user() that assumes IRQ or NMI context
49
*/
50
static unsigned long
51
copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
52
{
53
unsigned long offset, addr = (unsigned long)from;
54
unsigned long size, len = 0;
55
struct page *page;
56
void *map;
57
int ret;
58
59
do {
60
ret = __get_user_pages_fast(addr, 1, 0, &page);
61
if (!ret)
62
break;
63
64
offset = addr & (PAGE_SIZE - 1);
65
size = min(PAGE_SIZE - offset, n - len);
66
67
map = kmap_atomic(page);
68
memcpy(to, map+offset, size);
69
kunmap_atomic(map);
70
put_page(page);
71
72
len += size;
73
to += size;
74
addr += size;
75
76
} while (len < n);
77
78
return len;
79
}
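/*
* Note: the return value is the number of bytes actually copied, which may
* be less than @n when __get_user_pages_fast() cannot pin a page (e.g. the
* page is not present); callers check for a short copy. Only non-sleeping
* primitives (fast GUP, kmap_atomic) are used, which is what makes this
* safe from IRQ/NMI context.
*/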
80
81
struct event_constraint {
82
union {
83
unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
84
u64 idxmsk64;
85
};
86
u64 code;
87
u64 cmask;
88
int weight;
89
};
90
91
struct amd_nb {
92
int nb_id; /* NorthBridge id */
93
int refcnt; /* reference count */
94
struct perf_event *owners[X86_PMC_IDX_MAX];
95
struct event_constraint event_constraints[X86_PMC_IDX_MAX];
96
};
97
98
struct intel_percore;
99
100
#define MAX_LBR_ENTRIES 16
101
102
struct cpu_hw_events {
103
/*
104
* Generic x86 PMC bits
105
*/
106
struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
107
unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
108
unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
109
int enabled;
110
111
int n_events;
112
int n_added;
113
int n_txn;
114
int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
115
u64 tags[X86_PMC_IDX_MAX];
116
struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
117
118
unsigned int group_flag;
119
120
/*
121
* Intel DebugStore bits
122
*/
123
struct debug_store *ds;
124
u64 pebs_enabled;
125
126
/*
127
* Intel LBR bits
128
*/
129
int lbr_users;
130
void *lbr_context;
131
struct perf_branch_stack lbr_stack;
132
struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
133
134
/*
135
* Intel percore register state.
136
* Coordinate shared resources between HT threads.
137
*/
138
int percore_used; /* Used by this CPU? */
139
struct intel_percore *per_core;
140
141
/*
142
* AMD specific bits
143
*/
144
struct amd_nb *amd_nb;
145
};
146
147
#define __EVENT_CONSTRAINT(c, n, m, w) {\
148
{ .idxmsk64 = (n) }, \
149
.code = (c), \
150
.cmask = (m), \
151
.weight = (w), \
152
}
153
154
#define EVENT_CONSTRAINT(c, n, m) \
155
__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
156
157
/*
158
* Constraint on the Event code.
159
*/
160
#define INTEL_EVENT_CONSTRAINT(c, n) \
161
EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
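/*
* Example: INTEL_EVENT_CONSTRAINT(0xc0, 0x3) constrains the event with
* event-select code 0xc0 to generic counters 0 and 1: idxmsk64 = 0x3,
* cmask = ARCH_PERFMON_EVENTSEL_EVENT and weight = HWEIGHT(0x3) = 2.
*/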
162
163
/*
164
* Constraint on the Event code + UMask + fixed-mask
165
*
166
* filter mask to validate fixed counter events.
167
* the following filters disqualify for fixed counters:
168
* - inv
169
* - edge
170
* - cnt-mask
171
* The other filters are supported by fixed counters.
172
* The any-thread option is supported starting with v3.
173
*/
174
#define FIXED_EVENT_CONSTRAINT(c, n) \
175
EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK)
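/*
* Fixed counters sit above the generic ones in the index space
* (X86_PMC_IDX_FIXED == 32), so (1ULL << (32+n)) marks fixed counter n as
* the only counter this event may be scheduled on.
*/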
176
177
/*
178
* Constraint on the Event code + UMask
179
*/
180
#define INTEL_UEVENT_CONSTRAINT(c, n) \
181
EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
182
183
#define EVENT_CONSTRAINT_END \
184
EVENT_CONSTRAINT(0, 0, 0)
185
186
#define for_each_event_constraint(e, c) \
187
for ((e) = (c); (e)->weight; (e)++)
188
189
/*
190
* Extra registers for specific events.
191
* Some events need large masks and require external MSRs.
192
* Define a mapping to these extra registers.
193
*/
194
struct extra_reg {
195
unsigned int event;
196
unsigned int msr;
197
u64 config_mask;
198
u64 valid_mask;
199
};
200
201
#define EVENT_EXTRA_REG(e, ms, m, vm) { \
202
.event = (e), \
203
.msr = (ms), \
204
.config_mask = (m), \
205
.valid_mask = (vm), \
206
}
207
#define INTEL_EVENT_EXTRA_REG(event, msr, vm) \
208
EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm)
209
#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0)
210
211
union perf_capabilities {
212
struct {
213
u64 lbr_format : 6;
214
u64 pebs_trap : 1;
215
u64 pebs_arch_reg : 1;
216
u64 pebs_format : 4;
217
u64 smm_freeze : 1;
218
};
219
u64 capabilities;
220
};
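/*
* Bit-field view of the CPU's performance-monitoring capabilities (the
* IA32_PERF_CAPABILITIES MSR layout); the Intel-specific init code in
* perf_event_intel.c, included further down, fills in x86_pmu.intel_cap
* from that MSR.
*/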
221
222
/*
223
* struct x86_pmu - generic x86 pmu
224
*/
225
struct x86_pmu {
226
/*
227
* Generic x86 PMC bits
228
*/
229
const char *name;
230
int version;
231
int (*handle_irq)(struct pt_regs *);
232
void (*disable_all)(void);
233
void (*enable_all)(int added);
234
void (*enable)(struct perf_event *);
235
void (*disable)(struct perf_event *);
236
int (*hw_config)(struct perf_event *event);
237
int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
238
unsigned eventsel;
239
unsigned perfctr;
240
u64 (*event_map)(int);
241
int max_events;
242
int num_counters;
243
int num_counters_fixed;
244
int cntval_bits;
245
u64 cntval_mask;
246
int apic;
247
u64 max_period;
248
struct event_constraint *
249
(*get_event_constraints)(struct cpu_hw_events *cpuc,
250
struct perf_event *event);
251
252
void (*put_event_constraints)(struct cpu_hw_events *cpuc,
253
struct perf_event *event);
254
struct event_constraint *event_constraints;
255
struct event_constraint *percore_constraints;
256
void (*quirks)(void);
257
int perfctr_second_write;
258
259
int (*cpu_prepare)(int cpu);
260
void (*cpu_starting)(int cpu);
261
void (*cpu_dying)(int cpu);
262
void (*cpu_dead)(int cpu);
263
264
/*
265
* Intel Arch Perfmon v2+
266
*/
267
u64 intel_ctrl;
268
union perf_capabilities intel_cap;
269
270
/*
271
* Intel DebugStore bits
272
*/
273
int bts, pebs;
274
int bts_active, pebs_active;
275
int pebs_record_size;
276
void (*drain_pebs)(struct pt_regs *regs);
277
struct event_constraint *pebs_constraints;
278
279
/*
280
* Intel LBR
281
*/
282
unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
283
int lbr_nr; /* hardware stack size */
284
285
/*
286
* Extra registers for events
287
*/
288
struct extra_reg *extra_regs;
289
};
290
291
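/*
* The single, global PMU description. It is filled in once at boot by the
* vendor init code (intel_pmu_init()/amd_pmu_init(), called from
* init_hw_perf_events() below) and is read-mostly afterwards.
*/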
static struct x86_pmu x86_pmu __read_mostly;
292
293
static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
294
.enabled = 1,
295
};
296
297
static int x86_perf_event_set_period(struct perf_event *event);
298
299
/*
300
* Generalized hw caching related hw_event table, filled
301
* in on a per model basis. A value of 0 means
302
* 'not supported', -1 means 'hw_event makes no sense on
303
* this CPU', any other value means the raw hw_event
304
* ID.
305
*/
306
307
#define C(x) PERF_COUNT_HW_CACHE_##x
308
309
static u64 __read_mostly hw_cache_event_ids
310
[PERF_COUNT_HW_CACHE_MAX]
311
[PERF_COUNT_HW_CACHE_OP_MAX]
312
[PERF_COUNT_HW_CACHE_RESULT_MAX];
313
static u64 __read_mostly hw_cache_extra_regs
314
[PERF_COUNT_HW_CACHE_MAX]
315
[PERF_COUNT_HW_CACHE_OP_MAX]
316
[PERF_COUNT_HW_CACHE_RESULT_MAX];
317
318
/*
319
* Propagate event elapsed time into the generic event.
320
* Can only be executed on the CPU where the event is active.
321
* Returns the new raw count; the elapsed delta is folded into the generic event.
322
*/
323
static u64
324
x86_perf_event_update(struct perf_event *event)
325
{
326
struct hw_perf_event *hwc = &event->hw;
327
int shift = 64 - x86_pmu.cntval_bits;
328
u64 prev_raw_count, new_raw_count;
329
int idx = hwc->idx;
330
s64 delta;
331
332
if (idx == X86_PMC_IDX_FIXED_BTS)
333
return 0;
334
335
/*
336
* Careful: an NMI might modify the previous event value.
337
*
338
* Our tactic to handle this is to first atomically read and
339
* exchange a new raw count - then add that new-prev delta
340
* count to the generic event atomically:
341
*/
342
again:
343
prev_raw_count = local64_read(&hwc->prev_count);
344
rdmsrl(hwc->event_base, new_raw_count);
345
346
if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
347
new_raw_count) != prev_raw_count)
348
goto again;
349
350
/*
351
* Now we have the new raw value and have updated the prev
352
* timestamp already. We can now calculate the elapsed delta
353
* (event-)time and add that to the generic event.
354
*
355
* Careful, not all hw sign-extends above the physical width
356
* of the count.
357
*/
358
delta = (new_raw_count << shift) - (prev_raw_count << shift);
359
delta >>= shift;
360
361
local64_add(delta, &event->count);
362
local64_sub(delta, &hwc->period_left);
363
364
return new_raw_count;
365
}
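/*
* Worked example of the shift trick above, assuming 48-bit counters
* (shift = 16): if prev_raw_count = 0xFFFFFFFFFFF0 and the counter wrapped
* to new_raw_count = 0x10, then (new << 16) - (prev << 16) = 0x200000 as a
* 64-bit value and the arithmetic right shift by 16 yields 0x20 = 32,
* exactly the number of events that elapsed across the 48-bit wrap.
*/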
366
367
static inline int x86_pmu_addr_offset(int index)
368
{
369
int offset;
370
371
/* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */
372
alternative_io(ASM_NOP2,
373
"shll $1, %%eax",
374
X86_FEATURE_PERFCTR_CORE,
375
"=a" (offset),
376
"a" (index));
377
378
return offset;
379
}
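/*
* alternative_io() patches this code at boot: on CPUs with
* X86_FEATURE_PERFCTR_CORE the NOP is replaced by "shll $1, %eax", doubling
* the offset (those CPUs space their counter MSRs two apart), without
* paying for a runtime feature check.
*/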
380
381
static inline unsigned int x86_pmu_config_addr(int index)
382
{
383
return x86_pmu.eventsel + x86_pmu_addr_offset(index);
384
}
385
386
static inline unsigned int x86_pmu_event_addr(int index)
387
{
388
return x86_pmu.perfctr + x86_pmu_addr_offset(index);
389
}
390
391
/*
392
* Find and validate any extra registers to set up.
393
*/
394
static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
395
{
396
struct extra_reg *er;
397
398
event->hw.extra_reg = 0;
399
event->hw.extra_config = 0;
400
401
if (!x86_pmu.extra_regs)
402
return 0;
403
404
for (er = x86_pmu.extra_regs; er->msr; er++) {
405
if (er->event != (config & er->config_mask))
406
continue;
407
if (event->attr.config1 & ~er->valid_mask)
408
return -EINVAL;
409
event->hw.extra_reg = er->msr;
410
event->hw.extra_config = event->attr.config1;
411
break;
412
}
413
return 0;
414
}
415
416
static atomic_t active_events;
417
static DEFINE_MUTEX(pmc_reserve_mutex);
418
419
#ifdef CONFIG_X86_LOCAL_APIC
420
421
static bool reserve_pmc_hardware(void)
422
{
423
int i;
424
425
for (i = 0; i < x86_pmu.num_counters; i++) {
426
if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
427
goto perfctr_fail;
428
}
429
430
for (i = 0; i < x86_pmu.num_counters; i++) {
431
if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
432
goto eventsel_fail;
433
}
434
435
return true;
436
437
eventsel_fail:
438
for (i--; i >= 0; i--)
439
release_evntsel_nmi(x86_pmu_config_addr(i));
440
441
i = x86_pmu.num_counters;
442
443
perfctr_fail:
444
for (i--; i >= 0; i--)
445
release_perfctr_nmi(x86_pmu_event_addr(i));
446
447
return false;
448
}
449
450
static void release_pmc_hardware(void)
451
{
452
int i;
453
454
for (i = 0; i < x86_pmu.num_counters; i++) {
455
release_perfctr_nmi(x86_pmu_event_addr(i));
456
release_evntsel_nmi(x86_pmu_config_addr(i));
457
}
458
}
459
460
#else
461
462
static bool reserve_pmc_hardware(void) { return true; }
463
static void release_pmc_hardware(void) {}
464
465
#endif
466
467
static bool check_hw_exists(void)
468
{
469
u64 val, val_new = 0;
470
int i, reg, ret = 0;
471
472
/*
473
* Check to see if the BIOS enabled any of the counters; if so,
474
* complain and bail.
475
*/
476
for (i = 0; i < x86_pmu.num_counters; i++) {
477
reg = x86_pmu_config_addr(i);
478
ret = rdmsrl_safe(reg, &val);
479
if (ret)
480
goto msr_fail;
481
if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
482
goto bios_fail;
483
}
484
485
if (x86_pmu.num_counters_fixed) {
486
reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
487
ret = rdmsrl_safe(reg, &val);
488
if (ret)
489
goto msr_fail;
490
for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
491
if (val & (0x03 << i*4))
492
goto bios_fail;
493
}
494
}
495
496
/*
497
* Now write a value and read it back to see if it matches,
498
* this is needed to detect certain hardware emulators (qemu/kvm)
499
* that don't trap on the MSR access and always return 0s.
500
*/
501
val = 0xabcdUL;
502
ret = checking_wrmsrl(x86_pmu_event_addr(0), val);
503
ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new);
504
if (ret || val != val_new)
505
goto msr_fail;
506
507
return true;
508
509
bios_fail:
510
/*
511
* We still allow the PMU driver to operate:
512
*/
513
printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n");
514
printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val);
515
516
return true;
517
518
msr_fail:
519
printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
520
521
return false;
522
}
523
524
static void reserve_ds_buffers(void);
525
static void release_ds_buffers(void);
526
527
static void hw_perf_event_destroy(struct perf_event *event)
528
{
529
if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
530
release_pmc_hardware();
531
release_ds_buffers();
532
mutex_unlock(&pmc_reserve_mutex);
533
}
534
}
535
536
static inline int x86_pmu_initialized(void)
537
{
538
return x86_pmu.handle_irq != NULL;
539
}
540
541
static inline int
542
set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
543
{
544
struct perf_event_attr *attr = &event->attr;
545
unsigned int cache_type, cache_op, cache_result;
546
u64 config, val;
547
548
config = attr->config;
549
550
cache_type = (config >> 0) & 0xff;
551
if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
552
return -EINVAL;
553
554
cache_op = (config >> 8) & 0xff;
555
if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
556
return -EINVAL;
557
558
cache_result = (config >> 16) & 0xff;
559
if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
560
return -EINVAL;
561
562
val = hw_cache_event_ids[cache_type][cache_op][cache_result];
563
564
if (val == 0)
565
return -ENOENT;
566
567
if (val == -1)
568
return -EINVAL;
569
570
hwc->config |= val;
571
attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
572
return x86_pmu_extra_regs(val, event);
573
}
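/*
* The attr->config layout decoded above is the generic cache-event ABI:
* bits 0-7 select the cache (e.g. L1D), bits 8-15 the operation
* (read/write/prefetch) and bits 16-23 the result (access/miss); the
* per-model tables then translate that triple into a raw hardware event.
*/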
574
575
static int x86_setup_perfctr(struct perf_event *event)
576
{
577
struct perf_event_attr *attr = &event->attr;
578
struct hw_perf_event *hwc = &event->hw;
579
u64 config;
580
581
if (!is_sampling_event(event)) {
582
hwc->sample_period = x86_pmu.max_period;
583
hwc->last_period = hwc->sample_period;
584
local64_set(&hwc->period_left, hwc->sample_period);
585
} else {
586
/*
587
* If we have a PMU initialized but no APIC
588
* interrupts, we cannot sample hardware
589
* events (user-space has to fall back and
590
* sample via a hrtimer based software event):
591
*/
592
if (!x86_pmu.apic)
593
return -EOPNOTSUPP;
594
}
595
596
/*
597
* Do not allow config1 (extended registers) to propagate,
598
* there's no sane user-space generalization yet:
599
*/
600
if (attr->type == PERF_TYPE_RAW)
601
return 0;
602
603
if (attr->type == PERF_TYPE_HW_CACHE)
604
return set_ext_hw_attr(hwc, event);
605
606
if (attr->config >= x86_pmu.max_events)
607
return -EINVAL;
608
609
/*
610
* The generic map:
611
*/
612
config = x86_pmu.event_map(attr->config);
613
614
if (config == 0)
615
return -ENOENT;
616
617
if (config == -1LL)
618
return -EINVAL;
619
620
/*
621
* Branch tracing:
622
*/
623
if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
624
!attr->freq && hwc->sample_period == 1) {
625
/* BTS is not supported by this architecture. */
626
if (!x86_pmu.bts_active)
627
return -EOPNOTSUPP;
628
629
/* BTS is currently only allowed for user-mode. */
630
if (!attr->exclude_kernel)
631
return -EOPNOTSUPP;
632
}
633
634
hwc->config |= config;
635
636
return 0;
637
}
638
639
static int x86_pmu_hw_config(struct perf_event *event)
640
{
641
if (event->attr.precise_ip) {
642
int precise = 0;
643
644
/* Support for constant skid */
645
if (x86_pmu.pebs_active) {
646
precise++;
647
648
/* Support for IP fixup */
649
if (x86_pmu.lbr_nr)
650
precise++;
651
}
652
653
if (event->attr.precise_ip > precise)
654
return -EOPNOTSUPP;
655
}
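/*
* In other words: precise_ip == 1 requires PEBS (constant-skid samples),
* precise_ip == 2 additionally requires the LBR stack so the sampled IP
* can be fixed up; any request more precise than the hardware supports is
* rejected above.
*/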
656
657
/*
658
* Generate PMC IRQs:
659
* (keep 'enabled' bit clear for now)
660
*/
661
event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
662
663
/*
664
* Count user and OS events unless requested not to
665
*/
666
if (!event->attr.exclude_user)
667
event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
668
if (!event->attr.exclude_kernel)
669
event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
670
671
if (event->attr.type == PERF_TYPE_RAW)
672
event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
673
674
return x86_setup_perfctr(event);
675
}
676
677
/*
678
* Setup the hardware configuration for a given attr_type
679
*/
680
static int __x86_pmu_event_init(struct perf_event *event)
681
{
682
int err;
683
684
if (!x86_pmu_initialized())
685
return -ENODEV;
686
687
err = 0;
688
if (!atomic_inc_not_zero(&active_events)) {
689
mutex_lock(&pmc_reserve_mutex);
690
if (atomic_read(&active_events) == 0) {
691
if (!reserve_pmc_hardware())
692
err = -EBUSY;
693
else
694
reserve_ds_buffers();
695
}
696
if (!err)
697
atomic_inc(&active_events);
698
mutex_unlock(&pmc_reserve_mutex);
699
}
700
if (err)
701
return err;
702
703
event->destroy = hw_perf_event_destroy;
704
705
event->hw.idx = -1;
706
event->hw.last_cpu = -1;
707
event->hw.last_tag = ~0ULL;
708
709
return x86_pmu.hw_config(event);
710
}
711
712
static void x86_pmu_disable_all(void)
713
{
714
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
715
int idx;
716
717
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
718
u64 val;
719
720
if (!test_bit(idx, cpuc->active_mask))
721
continue;
722
rdmsrl(x86_pmu_config_addr(idx), val);
723
if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
724
continue;
725
val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
726
wrmsrl(x86_pmu_config_addr(idx), val);
727
}
728
}
729
730
static void x86_pmu_disable(struct pmu *pmu)
731
{
732
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
733
734
if (!x86_pmu_initialized())
735
return;
736
737
if (!cpuc->enabled)
738
return;
739
740
cpuc->n_added = 0;
741
cpuc->enabled = 0;
742
barrier();
743
744
x86_pmu.disable_all();
745
}
746
747
static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
748
u64 enable_mask)
749
{
750
if (hwc->extra_reg)
751
wrmsrl(hwc->extra_reg, hwc->extra_config);
752
wrmsrl(hwc->config_base, hwc->config | enable_mask);
753
}
754
755
static void x86_pmu_enable_all(int added)
756
{
757
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
758
int idx;
759
760
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
761
struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
762
763
if (!test_bit(idx, cpuc->active_mask))
764
continue;
765
766
__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
767
}
768
}
769
770
static struct pmu pmu;
771
772
static inline int is_x86_event(struct perf_event *event)
773
{
774
return event->pmu == &pmu;
775
}
776
777
static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
778
{
779
struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
780
unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
781
int i, j, w, wmax, num = 0;
782
struct hw_perf_event *hwc;
783
784
bitmap_zero(used_mask, X86_PMC_IDX_MAX);
785
786
for (i = 0; i < n; i++) {
787
c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
788
constraints[i] = c;
789
}
790
791
/*
792
* fastpath, try to reuse previous register
793
*/
794
for (i = 0; i < n; i++) {
795
hwc = &cpuc->event_list[i]->hw;
796
c = constraints[i];
797
798
/* never assigned */
799
if (hwc->idx == -1)
800
break;
801
802
/* constraint still honored */
803
if (!test_bit(hwc->idx, c->idxmsk))
804
break;
805
806
/* not already used */
807
if (test_bit(hwc->idx, used_mask))
808
break;
809
810
__set_bit(hwc->idx, used_mask);
811
if (assign)
812
assign[i] = hwc->idx;
813
}
814
if (i == n)
815
goto done;
816
817
/*
818
* begin slow path
819
*/
820
821
bitmap_zero(used_mask, X86_PMC_IDX_MAX);
822
823
/*
824
* weight = number of possible counters
825
*
826
* 1 = most constrained, only works on one counter
827
* wmax = least constrained, works on any counter
828
*
829
* assign events to counters starting with most
830
* constrained events.
831
*/
832
wmax = x86_pmu.num_counters;
833
834
/*
835
* when fixed event counters are present,
836
* wmax is incremented by 1 to account
837
* for one more choice
838
*/
839
if (x86_pmu.num_counters_fixed)
840
wmax++;
841
842
for (w = 1, num = n; num && w <= wmax; w++) {
843
/* for each event */
844
for (i = 0; num && i < n; i++) {
845
c = constraints[i];
846
hwc = &cpuc->event_list[i]->hw;
847
848
if (c->weight != w)
849
continue;
850
851
for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {
852
if (!test_bit(j, used_mask))
853
break;
854
}
855
856
if (j == X86_PMC_IDX_MAX)
857
break;
858
859
__set_bit(j, used_mask);
860
861
if (assign)
862
assign[i] = j;
863
num--;
864
}
865
}
866
done:
867
/*
868
* scheduling failed or is just a simulation,
869
* free resources if necessary
870
*/
871
if (!assign || num) {
872
for (i = 0; i < n; i++) {
873
if (x86_pmu.put_event_constraints)
874
x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
875
}
876
}
877
return num ? -ENOSPC : 0;
878
}
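/*
* Example of the weight ordering above (assuming at least four generic
* counters): with three events whose constraint masks are {0}, {0,1} and
* {0,1,2,3}, the w=1 pass binds the first event to counter 0, the w=2 pass
* binds the second to counter 1 and the w=4 pass gives the third event
* counter 2; scheduling the least flexible events first avoids dead ends
* that a naive first-fit order could run into.
*/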
879
880
/*
881
* dogrp: true if we must collect sibling events (i.e. the whole group)
882
* returns the total number of events, or a negative error code
883
*/
884
static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
885
{
886
struct perf_event *event;
887
int n, max_count;
888
889
max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
890
891
/* current number of events already accepted */
892
n = cpuc->n_events;
893
894
if (is_x86_event(leader)) {
895
if (n >= max_count)
896
return -ENOSPC;
897
cpuc->event_list[n] = leader;
898
n++;
899
}
900
if (!dogrp)
901
return n;
902
903
list_for_each_entry(event, &leader->sibling_list, group_entry) {
904
if (!is_x86_event(event) ||
905
event->state <= PERF_EVENT_STATE_OFF)
906
continue;
907
908
if (n >= max_count)
909
return -ENOSPC;
910
911
cpuc->event_list[n] = event;
912
n++;
913
}
914
return n;
915
}
916
917
static inline void x86_assign_hw_event(struct perf_event *event,
918
struct cpu_hw_events *cpuc, int i)
919
{
920
struct hw_perf_event *hwc = &event->hw;
921
922
hwc->idx = cpuc->assign[i];
923
hwc->last_cpu = smp_processor_id();
924
hwc->last_tag = ++cpuc->tags[i];
925
926
if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
927
hwc->config_base = 0;
928
hwc->event_base = 0;
929
} else if (hwc->idx >= X86_PMC_IDX_FIXED) {
930
hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
931
hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED);
932
} else {
933
hwc->config_base = x86_pmu_config_addr(hwc->idx);
934
hwc->event_base = x86_pmu_event_addr(hwc->idx);
935
}
936
}
937
938
static inline int match_prev_assignment(struct hw_perf_event *hwc,
939
struct cpu_hw_events *cpuc,
940
int i)
941
{
942
return hwc->idx == cpuc->assign[i] &&
943
hwc->last_cpu == smp_processor_id() &&
944
hwc->last_tag == cpuc->tags[i];
945
}
946
947
static void x86_pmu_start(struct perf_event *event, int flags);
948
static void x86_pmu_stop(struct perf_event *event, int flags);
949
950
static void x86_pmu_enable(struct pmu *pmu)
951
{
952
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
953
struct perf_event *event;
954
struct hw_perf_event *hwc;
955
int i, added = cpuc->n_added;
956
957
if (!x86_pmu_initialized())
958
return;
959
960
if (cpuc->enabled)
961
return;
962
963
if (cpuc->n_added) {
964
int n_running = cpuc->n_events - cpuc->n_added;
965
/*
966
* apply assignment obtained either from
967
* hw_perf_group_sched_in() or x86_pmu_enable()
968
*
969
* step1: save events moving to new counters
970
* step2: reprogram moved events into new counters
971
*/
972
for (i = 0; i < n_running; i++) {
973
event = cpuc->event_list[i];
974
hwc = &event->hw;
975
976
/*
977
* we can avoid reprogramming counter if:
978
* - assigned same counter as last time
979
* - running on same CPU as last time
980
* - no other event has used the counter since
981
*/
982
if (hwc->idx == -1 ||
983
match_prev_assignment(hwc, cpuc, i))
984
continue;
985
986
/*
987
* Ensure we don't accidentally enable a stopped
988
* counter simply because we rescheduled.
989
*/
990
if (hwc->state & PERF_HES_STOPPED)
991
hwc->state |= PERF_HES_ARCH;
992
993
x86_pmu_stop(event, PERF_EF_UPDATE);
994
}
995
996
for (i = 0; i < cpuc->n_events; i++) {
997
event = cpuc->event_list[i];
998
hwc = &event->hw;
999
1000
if (!match_prev_assignment(hwc, cpuc, i))
1001
x86_assign_hw_event(event, cpuc, i);
1002
else if (i < n_running)
1003
continue;
1004
1005
if (hwc->state & PERF_HES_ARCH)
1006
continue;
1007
1008
x86_pmu_start(event, PERF_EF_RELOAD);
1009
}
1010
cpuc->n_added = 0;
1011
perf_events_lapic_init();
1012
}
1013
1014
cpuc->enabled = 1;
1015
barrier();
1016
1017
x86_pmu.enable_all(added);
1018
}
1019
1020
static inline void x86_pmu_disable_event(struct perf_event *event)
1021
{
1022
struct hw_perf_event *hwc = &event->hw;
1023
1024
wrmsrl(hwc->config_base, hwc->config);
1025
}
1026
1027
static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
1028
1029
/*
1030
* Set the next IRQ period, based on the hwc->period_left value.
1031
* To be called with the event disabled in hw:
1032
*/
1033
static int
1034
x86_perf_event_set_period(struct perf_event *event)
1035
{
1036
struct hw_perf_event *hwc = &event->hw;
1037
s64 left = local64_read(&hwc->period_left);
1038
s64 period = hwc->sample_period;
1039
int ret = 0, idx = hwc->idx;
1040
1041
if (idx == X86_PMC_IDX_FIXED_BTS)
1042
return 0;
1043
1044
/*
1045
* If we are way outside a reasonable range then just skip forward:
1046
*/
1047
if (unlikely(left <= -period)) {
1048
left = period;
1049
local64_set(&hwc->period_left, left);
1050
hwc->last_period = period;
1051
ret = 1;
1052
}
1053
1054
if (unlikely(left <= 0)) {
1055
left += period;
1056
local64_set(&hwc->period_left, left);
1057
hwc->last_period = period;
1058
ret = 1;
1059
}
1060
/*
1061
* Quirk: certain CPUs don't like it if just 1 hw_event is left:
1062
*/
1063
if (unlikely(left < 2))
1064
left = 2;
1065
1066
if (left > x86_pmu.max_period)
1067
left = x86_pmu.max_period;
1068
1069
per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
1070
1071
/*
1072
* The hw event starts counting from this event offset,
1073
* mark it to be able to extract future deltas:
1074
*/
1075
local64_set(&hwc->prev_count, (u64)-left);
1076
1077
wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
1078
1079
/*
1080
* Due to an erratum on certain CPUs we need
1081
* a second write to be sure the register
1082
* is updated properly
1083
*/
1084
if (x86_pmu.perfctr_second_write) {
1085
wrmsrl(hwc->event_base,
1086
(u64)(-left) & x86_pmu.cntval_mask);
1087
}
1088
1089
perf_event_update_userpage(event);
1090
1091
return ret;
1092
}
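/*
* The counter is programmed with the negated period: loading
* (u64)(-left) & cntval_mask makes the up-counting PMC overflow, and raise
* its interrupt, after exactly "left" further events. For example, with
* left = 100000 and 48-bit counters the MSR is loaded with 2^48 - 100000.
*/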
1093
1094
static void x86_pmu_enable_event(struct perf_event *event)
1095
{
1096
if (__this_cpu_read(cpu_hw_events.enabled))
1097
__x86_pmu_enable_event(&event->hw,
1098
ARCH_PERFMON_EVENTSEL_ENABLE);
1099
}
1100
1101
/*
1102
* Add a single event to the PMU.
1103
*
1104
* The event is added to the group of enabled events
1105
* but only if it can be scheduled with existing events.
1106
*/
1107
static int x86_pmu_add(struct perf_event *event, int flags)
1108
{
1109
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1110
struct hw_perf_event *hwc;
1111
int assign[X86_PMC_IDX_MAX];
1112
int n, n0, ret;
1113
1114
hwc = &event->hw;
1115
1116
perf_pmu_disable(event->pmu);
1117
n0 = cpuc->n_events;
1118
ret = n = collect_events(cpuc, event, false);
1119
if (ret < 0)
1120
goto out;
1121
1122
hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
1123
if (!(flags & PERF_EF_START))
1124
hwc->state |= PERF_HES_ARCH;
1125
1126
/*
1127
* If group events scheduling transaction was started,
1128
* skip the schedulability test here, it will be performed
1129
* at commit time (->commit_txn) as a whole
1130
*/
1131
if (cpuc->group_flag & PERF_EVENT_TXN)
1132
goto done_collect;
1133
1134
ret = x86_pmu.schedule_events(cpuc, n, assign);
1135
if (ret)
1136
goto out;
1137
/*
1138
* copy the new assignment; now that we know it is possible,
1139
* it will be used by hw_perf_enable()
1140
*/
1141
memcpy(cpuc->assign, assign, n*sizeof(int));
1142
1143
done_collect:
1144
cpuc->n_events = n;
1145
cpuc->n_added += n - n0;
1146
cpuc->n_txn += n - n0;
1147
1148
ret = 0;
1149
out:
1150
perf_pmu_enable(event->pmu);
1151
return ret;
1152
}
1153
1154
static void x86_pmu_start(struct perf_event *event, int flags)
1155
{
1156
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1157
int idx = event->hw.idx;
1158
1159
if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
1160
return;
1161
1162
if (WARN_ON_ONCE(idx == -1))
1163
return;
1164
1165
if (flags & PERF_EF_RELOAD) {
1166
WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
1167
x86_perf_event_set_period(event);
1168
}
1169
1170
event->hw.state = 0;
1171
1172
cpuc->events[idx] = event;
1173
__set_bit(idx, cpuc->active_mask);
1174
__set_bit(idx, cpuc->running);
1175
x86_pmu.enable(event);
1176
perf_event_update_userpage(event);
1177
}
1178
1179
void perf_event_print_debug(void)
1180
{
1181
u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1182
u64 pebs;
1183
struct cpu_hw_events *cpuc;
1184
unsigned long flags;
1185
int cpu, idx;
1186
1187
if (!x86_pmu.num_counters)
1188
return;
1189
1190
local_irq_save(flags);
1191
1192
cpu = smp_processor_id();
1193
cpuc = &per_cpu(cpu_hw_events, cpu);
1194
1195
if (x86_pmu.version >= 2) {
1196
rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
1197
rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
1198
rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
1199
rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
1200
rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
1201
1202
pr_info("\n");
1203
pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
1204
pr_info("CPU#%d: status: %016llx\n", cpu, status);
1205
pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
1206
pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
1207
pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
1208
}
1209
pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1210
1211
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1212
rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
1213
rdmsrl(x86_pmu_event_addr(idx), pmc_count);
1214
1215
prev_left = per_cpu(pmc_prev_left[idx], cpu);
1216
1217
pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
1218
cpu, idx, pmc_ctrl);
1219
pr_info("CPU#%d: gen-PMC%d count: %016llx\n",
1220
cpu, idx, pmc_count);
1221
pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
1222
cpu, idx, prev_left);
1223
}
1224
for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
1225
rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
1226
1227
pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
1228
cpu, idx, pmc_count);
1229
}
1230
local_irq_restore(flags);
1231
}
1232
1233
static void x86_pmu_stop(struct perf_event *event, int flags)
1234
{
1235
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1236
struct hw_perf_event *hwc = &event->hw;
1237
1238
if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
1239
x86_pmu.disable(event);
1240
cpuc->events[hwc->idx] = NULL;
1241
WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
1242
hwc->state |= PERF_HES_STOPPED;
1243
}
1244
1245
if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
1246
/*
1247
* Drain the remaining delta count out of an event
1248
* that we are disabling:
1249
*/
1250
x86_perf_event_update(event);
1251
hwc->state |= PERF_HES_UPTODATE;
1252
}
1253
}
1254
1255
static void x86_pmu_del(struct perf_event *event, int flags)
1256
{
1257
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1258
int i;
1259
1260
/*
1261
* If we're called during a txn, we don't need to do anything.
1262
* The events never got scheduled and ->cancel_txn will truncate
1263
* the event_list.
1264
*/
1265
if (cpuc->group_flag & PERF_EVENT_TXN)
1266
return;
1267
1268
x86_pmu_stop(event, PERF_EF_UPDATE);
1269
1270
for (i = 0; i < cpuc->n_events; i++) {
1271
if (event == cpuc->event_list[i]) {
1272
1273
if (x86_pmu.put_event_constraints)
1274
x86_pmu.put_event_constraints(cpuc, event);
1275
1276
while (++i < cpuc->n_events)
1277
cpuc->event_list[i-1] = cpuc->event_list[i];
1278
1279
--cpuc->n_events;
1280
break;
1281
}
1282
}
1283
perf_event_update_userpage(event);
1284
}
1285
1286
static int x86_pmu_handle_irq(struct pt_regs *regs)
1287
{
1288
struct perf_sample_data data;
1289
struct cpu_hw_events *cpuc;
1290
struct perf_event *event;
1291
int idx, handled = 0;
1292
u64 val;
1293
1294
perf_sample_data_init(&data, 0);
1295
1296
cpuc = &__get_cpu_var(cpu_hw_events);
1297
1298
/*
1299
* Some chipsets need to unmask the LVTPC in a particular spot
1300
* inside the nmi handler. As a result, the unmasking was pushed
1301
* into all the nmi handlers.
1302
*
1303
* This generic handler doesn't seem to have any issues where the
1304
* unmasking occurs so it was left at the top.
1305
*/
1306
apic_write(APIC_LVTPC, APIC_DM_NMI);
1307
1308
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1309
if (!test_bit(idx, cpuc->active_mask)) {
1310
/*
1311
* Though we deactivated the counter, some CPUs
1312
* might still deliver spurious interrupts that are
1313
* in flight. Catch them:
1314
*/
1315
if (__test_and_clear_bit(idx, cpuc->running))
1316
handled++;
1317
continue;
1318
}
1319
1320
event = cpuc->events[idx];
1321
1322
val = x86_perf_event_update(event);
1323
if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
1324
continue;
1325
1326
/*
1327
* event overflow
1328
*/
1329
handled++;
1330
data.period = event->hw.last_period;
1331
1332
if (!x86_perf_event_set_period(event))
1333
continue;
1334
1335
if (perf_event_overflow(event, 1, &data, regs))
1336
x86_pmu_stop(event, 0);
1337
}
1338
1339
if (handled)
1340
inc_irq_stat(apic_perf_irqs);
1341
1342
return handled;
1343
}
1344
1345
void perf_events_lapic_init(void)
1346
{
1347
if (!x86_pmu.apic || !x86_pmu_initialized())
1348
return;
1349
1350
/*
1351
* Always use NMI for PMU
1352
*/
1353
apic_write(APIC_LVTPC, APIC_DM_NMI);
1354
}
1355
1356
struct pmu_nmi_state {
1357
unsigned int marked;
1358
int handled;
1359
};
1360
1361
static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi);
1362
1363
static int __kprobes
1364
perf_event_nmi_handler(struct notifier_block *self,
1365
unsigned long cmd, void *__args)
1366
{
1367
struct die_args *args = __args;
1368
unsigned int this_nmi;
1369
int handled;
1370
1371
if (!atomic_read(&active_events))
1372
return NOTIFY_DONE;
1373
1374
switch (cmd) {
1375
case DIE_NMI:
1376
break;
1377
case DIE_NMIUNKNOWN:
1378
this_nmi = percpu_read(irq_stat.__nmi_count);
1379
if (this_nmi != __this_cpu_read(pmu_nmi.marked))
1380
/* let the kernel handle the unknown nmi */
1381
return NOTIFY_DONE;
1382
/*
1383
* This one is a PMU back-to-back nmi. Two events
1384
* trigger 'simultaneously' raising two back-to-back
1385
* NMIs. If the first NMI handles both, the latter
1386
* will be empty and daze the CPU. So, we drop it to
1387
* avoid false-positive 'unknown nmi' messages.
1388
*/
1389
return NOTIFY_STOP;
1390
default:
1391
return NOTIFY_DONE;
1392
}
1393
1394
handled = x86_pmu.handle_irq(args->regs);
1395
if (!handled)
1396
return NOTIFY_DONE;
1397
1398
this_nmi = percpu_read(irq_stat.__nmi_count);
1399
if ((handled > 1) ||
1400
/* the next nmi could be a back-to-back nmi */
1401
((__this_cpu_read(pmu_nmi.marked) == this_nmi) &&
1402
(__this_cpu_read(pmu_nmi.handled) > 1))) {
1403
/*
1404
* We could have two subsequent back-to-back nmis: The
1405
* first handles more than one counter, the 2nd
1406
* handles only one counter and the 3rd handles no
1407
* counter.
1408
*
1409
* This is the 2nd nmi because the previous was
1410
* handling more than one counter. We will mark the
1411
* next (3rd) and then drop it if unhandled.
1412
*/
1413
__this_cpu_write(pmu_nmi.marked, this_nmi + 1);
1414
__this_cpu_write(pmu_nmi.handled, handled);
1415
}
1416
1417
return NOTIFY_STOP;
1418
}
1419
1420
static __read_mostly struct notifier_block perf_event_nmi_notifier = {
1421
.notifier_call = perf_event_nmi_handler,
1422
.next = NULL,
1423
.priority = NMI_LOCAL_LOW_PRIOR,
1424
};
1425
1426
static struct event_constraint unconstrained;
1427
static struct event_constraint emptyconstraint;
1428
1429
static struct event_constraint *
1430
x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
1431
{
1432
struct event_constraint *c;
1433
1434
if (x86_pmu.event_constraints) {
1435
for_each_event_constraint(c, x86_pmu.event_constraints) {
1436
if ((event->hw.config & c->cmask) == c->code)
1437
return c;
1438
}
1439
}
1440
1441
return &unconstrained;
1442
}
1443
1444
#include "perf_event_amd.c"
1445
#include "perf_event_p6.c"
1446
#include "perf_event_p4.c"
1447
#include "perf_event_intel_lbr.c"
1448
#include "perf_event_intel_ds.c"
1449
#include "perf_event_intel.c"
1450
1451
static int __cpuinit
1452
x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
1453
{
1454
unsigned int cpu = (long)hcpu;
1455
int ret = NOTIFY_OK;
1456
1457
switch (action & ~CPU_TASKS_FROZEN) {
1458
case CPU_UP_PREPARE:
1459
if (x86_pmu.cpu_prepare)
1460
ret = x86_pmu.cpu_prepare(cpu);
1461
break;
1462
1463
case CPU_STARTING:
1464
if (x86_pmu.cpu_starting)
1465
x86_pmu.cpu_starting(cpu);
1466
break;
1467
1468
case CPU_DYING:
1469
if (x86_pmu.cpu_dying)
1470
x86_pmu.cpu_dying(cpu);
1471
break;
1472
1473
case CPU_UP_CANCELED:
1474
case CPU_DEAD:
1475
if (x86_pmu.cpu_dead)
1476
x86_pmu.cpu_dead(cpu);
1477
break;
1478
1479
default:
1480
break;
1481
}
1482
1483
return ret;
1484
}
1485
1486
static void __init pmu_check_apic(void)
1487
{
1488
if (cpu_has_apic)
1489
return;
1490
1491
x86_pmu.apic = 0;
1492
pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
1493
pr_info("no hardware sampling interrupt available.\n");
1494
}
1495
1496
static int __init init_hw_perf_events(void)
1497
{
1498
struct event_constraint *c;
1499
int err;
1500
1501
pr_info("Performance Events: ");
1502
1503
switch (boot_cpu_data.x86_vendor) {
1504
case X86_VENDOR_INTEL:
1505
err = intel_pmu_init();
1506
break;
1507
case X86_VENDOR_AMD:
1508
err = amd_pmu_init();
1509
break;
1510
default:
1511
return 0;
1512
}
1513
if (err != 0) {
1514
pr_cont("no PMU driver, software events only.\n");
1515
return 0;
1516
}
1517
1518
pmu_check_apic();
1519
1520
/* sanity check that the hardware exists or is emulated */
1521
if (!check_hw_exists())
1522
return 0;
1523
1524
pr_cont("%s PMU driver.\n", x86_pmu.name);
1525
1526
if (x86_pmu.quirks)
1527
x86_pmu.quirks();
1528
1529
if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
1530
WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
1531
x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
1532
x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1533
}
1534
x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
1535
1536
if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1537
WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
1538
x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
1539
x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1540
}
1541
1542
x86_pmu.intel_ctrl |=
1543
((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
1544
1545
perf_events_lapic_init();
1546
register_die_notifier(&perf_event_nmi_notifier);
1547
1548
unconstrained = (struct event_constraint)
1549
__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
1550
0, x86_pmu.num_counters);
1551
1552
if (x86_pmu.event_constraints) {
1553
for_each_event_constraint(c, x86_pmu.event_constraints) {
1554
if (c->cmask != X86_RAW_EVENT_MASK)
1555
continue;
1556
1557
c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
1558
c->weight += x86_pmu.num_counters;
1559
}
1560
}
1561
1562
pr_info("... version: %d\n", x86_pmu.version);
1563
pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
1564
pr_info("... generic registers: %d\n", x86_pmu.num_counters);
1565
pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
1566
pr_info("... max period: %016Lx\n", x86_pmu.max_period);
1567
pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
1568
pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
1569
1570
perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
1571
perf_cpu_notifier(x86_pmu_notifier);
1572
1573
return 0;
1574
}
1575
early_initcall(init_hw_perf_events);
1576
1577
static inline void x86_pmu_read(struct perf_event *event)
1578
{
1579
x86_perf_event_update(event);
1580
}
1581
1582
/*
1583
* Start group events scheduling transaction
1584
* Set the flag to make pmu::enable() not perform the
1585
* schedulability test; it will be performed at commit time
1586
*/
1587
static void x86_pmu_start_txn(struct pmu *pmu)
1588
{
1589
perf_pmu_disable(pmu);
1590
__this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN);
1591
__this_cpu_write(cpu_hw_events.n_txn, 0);
1592
}
1593
1594
/*
1595
* Stop group events scheduling transaction
1596
* Clear the flag and pmu::enable() will perform the
1597
* schedulability test.
1598
*/
1599
static void x86_pmu_cancel_txn(struct pmu *pmu)
1600
{
1601
__this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);
1602
/*
1603
* Truncate the collected events.
1604
*/
1605
__this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
1606
__this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
1607
perf_pmu_enable(pmu);
1608
}
1609
1610
/*
1611
* Commit group events scheduling transaction
1612
* Perform the group schedulability test as a whole
1613
* Return 0 if success
1614
*/
1615
static int x86_pmu_commit_txn(struct pmu *pmu)
1616
{
1617
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1618
int assign[X86_PMC_IDX_MAX];
1619
int n, ret;
1620
1621
n = cpuc->n_events;
1622
1623
if (!x86_pmu_initialized())
1624
return -EAGAIN;
1625
1626
ret = x86_pmu.schedule_events(cpuc, n, assign);
1627
if (ret)
1628
return ret;
1629
1630
/*
1631
* copy the new assignment; now that we know it is possible,
1632
* it will be used by hw_perf_enable()
1633
*/
1634
memcpy(cpuc->assign, assign, n*sizeof(int));
1635
1636
cpuc->group_flag &= ~PERF_EVENT_TXN;
1637
perf_pmu_enable(pmu);
1638
return 0;
1639
}
1640
1641
/*
1642
* validate that we can schedule this event
1643
*/
1644
static int validate_event(struct perf_event *event)
1645
{
1646
struct cpu_hw_events *fake_cpuc;
1647
struct event_constraint *c;
1648
int ret = 0;
1649
1650
fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
1651
if (!fake_cpuc)
1652
return -ENOMEM;
1653
1654
c = x86_pmu.get_event_constraints(fake_cpuc, event);
1655
1656
if (!c || !c->weight)
1657
ret = -ENOSPC;
1658
1659
if (x86_pmu.put_event_constraints)
1660
x86_pmu.put_event_constraints(fake_cpuc, event);
1661
1662
kfree(fake_cpuc);
1663
1664
return ret;
1665
}
1666
1667
/*
1668
* validate a single event group
1669
*
1670
* validation includes:
1671
* - check events are compatible with each other
1672
* - events do not compete for the same counter
1673
* - number of events <= number of counters
1674
*
1675
* validation ensures the group can be loaded onto the
1676
* PMU if it was the only group available.
1677
*/
1678
static int validate_group(struct perf_event *event)
1679
{
1680
struct perf_event *leader = event->group_leader;
1681
struct cpu_hw_events *fake_cpuc;
1682
int ret, n;
1683
1684
ret = -ENOMEM;
1685
fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
1686
if (!fake_cpuc)
1687
goto out;
1688
1689
/*
1690
* the event is not yet connected with its
1691
* siblings; therefore we must first collect
1692
* existing siblings, then add the new event
1693
* before we can simulate the scheduling
1694
*/
1695
ret = -ENOSPC;
1696
n = collect_events(fake_cpuc, leader, true);
1697
if (n < 0)
1698
goto out_free;
1699
1700
fake_cpuc->n_events = n;
1701
n = collect_events(fake_cpuc, event, false);
1702
if (n < 0)
1703
goto out_free;
1704
1705
fake_cpuc->n_events = n;
1706
1707
ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
1708
1709
out_free:
1710
kfree(fake_cpuc);
1711
out:
1712
return ret;
1713
}
1714
1715
static int x86_pmu_event_init(struct perf_event *event)
1716
{
1717
struct pmu *tmp;
1718
int err;
1719
1720
switch (event->attr.type) {
1721
case PERF_TYPE_RAW:
1722
case PERF_TYPE_HARDWARE:
1723
case PERF_TYPE_HW_CACHE:
1724
break;
1725
1726
default:
1727
return -ENOENT;
1728
}
1729
1730
err = __x86_pmu_event_init(event);
1731
if (!err) {
1732
/*
1733
* we temporarily connect event to its pmu
1734
* such that validate_group() can classify
1735
* it as an x86 event using is_x86_event()
1736
*/
1737
tmp = event->pmu;
1738
event->pmu = &pmu;
1739
1740
if (event->group_leader != event)
1741
err = validate_group(event);
1742
else
1743
err = validate_event(event);
1744
1745
event->pmu = tmp;
1746
}
1747
if (err) {
1748
if (event->destroy)
1749
event->destroy(event);
1750
}
1751
1752
return err;
1753
}
1754
1755
static struct pmu pmu = {
1756
.pmu_enable = x86_pmu_enable,
1757
.pmu_disable = x86_pmu_disable,
1758
1759
.event_init = x86_pmu_event_init,
1760
1761
.add = x86_pmu_add,
1762
.del = x86_pmu_del,
1763
.start = x86_pmu_start,
1764
.stop = x86_pmu_stop,
1765
.read = x86_pmu_read,
1766
1767
.start_txn = x86_pmu_start_txn,
1768
.cancel_txn = x86_pmu_cancel_txn,
1769
.commit_txn = x86_pmu_commit_txn,
1770
};
1771
1772
/*
1773
* callchain support
1774
*/
1775
1776
static int backtrace_stack(void *data, char *name)
1777
{
1778
return 0;
1779
}
1780
1781
static void backtrace_address(void *data, unsigned long addr, int reliable)
1782
{
1783
struct perf_callchain_entry *entry = data;
1784
1785
perf_callchain_store(entry, addr);
1786
}
1787
1788
static const struct stacktrace_ops backtrace_ops = {
1789
.stack = backtrace_stack,
1790
.address = backtrace_address,
1791
.walk_stack = print_context_stack_bp,
1792
};
1793
1794
void
1795
perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
1796
{
1797
if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1798
/* TODO: We don't support guest os callchain now */
1799
return;
1800
}
1801
1802
perf_callchain_store(entry, regs->ip);
1803
1804
dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
1805
}
1806
1807
#ifdef CONFIG_COMPAT
1808
static inline int
1809
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
1810
{
1811
/* 32-bit process in 64-bit kernel. */
1812
struct stack_frame_ia32 frame;
1813
const void __user *fp;
1814
1815
if (!test_thread_flag(TIF_IA32))
1816
return 0;
1817
1818
fp = compat_ptr(regs->bp);
1819
while (entry->nr < PERF_MAX_STACK_DEPTH) {
1820
unsigned long bytes;
1821
frame.next_frame = 0;
1822
frame.return_address = 0;
1823
1824
bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
1825
if (bytes != sizeof(frame))
1826
break;
1827
1828
if (fp < compat_ptr(regs->sp))
1829
break;
1830
1831
perf_callchain_store(entry, frame.return_address);
1832
fp = compat_ptr(frame.next_frame);
1833
}
1834
return 1;
1835
}
1836
#else
1837
static inline int
1838
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
1839
{
1840
return 0;
1841
}
1842
#endif
1843
1844
void
1845
perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
1846
{
1847
struct stack_frame frame;
1848
const void __user *fp;
1849
1850
if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1851
/* TODO: We don't support guest os callchain now */
1852
return;
1853
}
1854
1855
fp = (void __user *)regs->bp;
1856
1857
perf_callchain_store(entry, regs->ip);
1858
1859
if (perf_callchain_user32(regs, entry))
1860
return;
1861
1862
while (entry->nr < PERF_MAX_STACK_DEPTH) {
1863
unsigned long bytes;
1864
frame.next_frame = NULL;
1865
frame.return_address = 0;
1866
1867
bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
1868
if (bytes != sizeof(frame))
1869
break;
1870
1871
if ((unsigned long)fp < regs->sp)
1872
break;
1873
1874
perf_callchain_store(entry, frame.return_address);
1875
fp = frame.next_frame;
1876
}
1877
}
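/*
* Both user-space unwinders above simply follow the saved frame-pointer
* chain: each struct stack_frame fetched with copy_from_user_nmi() holds
* the caller's frame pointer and return address, and the walk stops on a
* short copy, when the frame pointer drops below the current stack
* pointer, or once PERF_MAX_STACK_DEPTH entries have been recorded.
*/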
1878
1879
unsigned long perf_instruction_pointer(struct pt_regs *regs)
1880
{
1881
unsigned long ip;
1882
1883
if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
1884
ip = perf_guest_cbs->get_guest_ip();
1885
else
1886
ip = instruction_pointer(regs);
1887
1888
return ip;
1889
}
1890
1891
unsigned long perf_misc_flags(struct pt_regs *regs)
1892
{
1893
int misc = 0;
1894
1895
if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1896
if (perf_guest_cbs->is_user_mode())
1897
misc |= PERF_RECORD_MISC_GUEST_USER;
1898
else
1899
misc |= PERF_RECORD_MISC_GUEST_KERNEL;
1900
} else {
1901
if (user_mode(regs))
1902
misc |= PERF_RECORD_MISC_USER;
1903
else
1904
misc |= PERF_RECORD_MISC_KERNEL;
1905
}
1906
1907
if (regs->flags & PERF_EFLAGS_EXACT)
1908
misc |= PERF_RECORD_MISC_EXACT_IP;
1909
1910
return misc;
1911
}
1912
1913