GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/x86/kernel/cpu/perf_event.c
1
/*
2
* Performance events x86 architecture code
3
*
4
* Copyright (C) 2008 Thomas Gleixner <[email protected]>
5
* Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6
* Copyright (C) 2009 Jaswinder Singh Rajput
7
* Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
8
* Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <[email protected]>
9
* Copyright (C) 2009 Intel Corporation, <[email protected]>
10
* Copyright (C) 2009 Google, Inc., Stephane Eranian
11
*
12
* For licencing details see kernel-base/COPYING
13
*/
14
15
#include <linux/perf_event.h>
16
#include <linux/capability.h>
17
#include <linux/notifier.h>
18
#include <linux/hardirq.h>
19
#include <linux/kprobes.h>
20
#include <linux/module.h>
21
#include <linux/kdebug.h>
22
#include <linux/sched.h>
23
#include <linux/uaccess.h>
24
#include <linux/slab.h>
25
#include <linux/highmem.h>
26
#include <linux/cpu.h>
27
#include <linux/bitops.h>
28
29
#include <asm/apic.h>
30
#include <asm/stacktrace.h>
31
#include <asm/nmi.h>
32
#include <asm/compat.h>
33
#include <asm/smp.h>
34
#include <asm/alternative.h>
35
36
#if 0
37
#undef wrmsrl
38
#define wrmsrl(msr, val) \
39
do { \
40
trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
41
(unsigned long)(val)); \
42
native_write_msr((msr), (u32)((u64)(val)), \
43
(u32)((u64)(val) >> 32)); \
44
} while (0)
45
#endif
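/*
* Debug aid: flipping the "#if 0" above to "#if 1" re-defines wrmsrl() for
* this file so that every MSR write is logged via trace_printk() before
* being issued through native_write_msr().
*/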
46
47
/*
48
* best effort, GUP based copy_from_user() that assumes IRQ or NMI context
49
*/
50
static unsigned long
51
copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
52
{
53
unsigned long offset, addr = (unsigned long)from;
54
unsigned long size, len = 0;
55
struct page *page;
56
void *map;
57
int ret;
58
59
do {
60
ret = __get_user_pages_fast(addr, 1, 0, &page);
61
if (!ret)
62
break;
63
64
offset = addr & (PAGE_SIZE - 1);
65
size = min(PAGE_SIZE - offset, n - len);
66
67
map = kmap_atomic(page);
68
memcpy(to, map+offset, size);
69
kunmap_atomic(map);
70
put_page(page);
71
72
len += size;
73
to += size;
74
addr += size;
75
76
} while (len < n);
77
78
return len;
79
}
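/*
* Note: the return value is the number of bytes actually copied, which may
* be less than @n when __get_user_pages_fast() cannot pin a page (e.g. the
* page is not present); callers check for a short copy. Only non-sleeping
* primitives (fast GUP, kmap_atomic) are used, which is what makes this
* safe from IRQ/NMI context.
*/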
80
81
struct event_constraint {
82
union {
83
unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
84
u64 idxmsk64;
85
};
86
u64 code;
87
u64 cmask;
88
int weight;
89
};
90
91
struct amd_nb {
92
int nb_id; /* NorthBridge id */
93
int refcnt; /* reference count */
94
struct perf_event *owners[X86_PMC_IDX_MAX];
95
struct event_constraint event_constraints[X86_PMC_IDX_MAX];
96
};
97
98
struct intel_percore;
99
100
#define MAX_LBR_ENTRIES 16
101
102
struct cpu_hw_events {
103
/*
104
* Generic x86 PMC bits
105
*/
106
struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
107
unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
108
unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
109
int enabled;
110
111
int n_events;
112
int n_added;
113
int n_txn;
114
int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
115
u64 tags[X86_PMC_IDX_MAX];
116
struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
117
118
unsigned int group_flag;
119
120
/*
121
* Intel DebugStore bits
122
*/
123
struct debug_store *ds;
124
u64 pebs_enabled;
125
126
/*
127
* Intel LBR bits
128
*/
129
int lbr_users;
130
void *lbr_context;
131
struct perf_branch_stack lbr_stack;
132
struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
133
134
/*
135
* Intel percore register state.
136
* Coordinate shared resources between HT threads.
137
*/
138
int percore_used; /* Used by this CPU? */
139
struct intel_percore *per_core;
140
141
/*
142
* AMD specific bits
143
*/
144
struct amd_nb *amd_nb;
145
};
146
147
#define __EVENT_CONSTRAINT(c, n, m, w) {\
148
{ .idxmsk64 = (n) }, \
149
.code = (c), \
150
.cmask = (m), \
151
.weight = (w), \
152
}
153
154
#define EVENT_CONSTRAINT(c, n, m) \
155
__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
156
157
/*
158
* Constraint on the Event code.
159
*/
160
#define INTEL_EVENT_CONSTRAINT(c, n) \
161
EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
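/*
* Example: INTEL_EVENT_CONSTRAINT(0xc0, 0x3) constrains the event with
* event-select code 0xc0 to generic counters 0 and 1: idxmsk64 = 0x3,
* cmask = ARCH_PERFMON_EVENTSEL_EVENT and weight = HWEIGHT(0x3) = 2.
*/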
162
163
/*
164
* Constraint on the Event code + UMask + fixed-mask
165
*
166
* filter mask to validate fixed counter events.
167
* the following filters disqualify for fixed counters:
168
* - inv
169
* - edge
170
* - cnt-mask
171
* The other filters are supported by fixed counters.
172
* The any-thread option is supported starting with v3.
173
*/
174
#define FIXED_EVENT_CONSTRAINT(c, n) \
175
EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK)
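/*
* Fixed counters sit above the generic ones in the index space
* (X86_PMC_IDX_FIXED == 32), so (1ULL << (32+n)) marks fixed counter n as
* the only counter this event may be scheduled on.
*/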
176
177
/*
178
* Constraint on the Event code + UMask
179
*/
180
#define INTEL_UEVENT_CONSTRAINT(c, n) \
181
EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
182
183
#define EVENT_CONSTRAINT_END \
184
EVENT_CONSTRAINT(0, 0, 0)
185
186
#define for_each_event_constraint(e, c) \
187
for ((e) = (c); (e)->weight; (e)++)
188
189
/*
190
* Extra registers for specific events.
191
* Some events need large masks and require external MSRs.
192
* Define a mapping to these extra registers.
193
*/
194
struct extra_reg {
195
unsigned int event;
196
unsigned int msr;
197
u64 config_mask;
198
u64 valid_mask;
199
};
200
201
#define EVENT_EXTRA_REG(e, ms, m, vm) { \
202
.event = (e), \
203
.msr = (ms), \
204
.config_mask = (m), \
205
.valid_mask = (vm), \
206
}
207
#define INTEL_EVENT_EXTRA_REG(event, msr, vm) \
208
EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm)
209
#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0)
210
211
union perf_capabilities {
212
struct {
213
u64 lbr_format : 6;
214
u64 pebs_trap : 1;
215
u64 pebs_arch_reg : 1;
216
u64 pebs_format : 4;
217
u64 smm_freeze : 1;
218
};
219
u64 capabilities;
220
};
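/*
* Bit-field view of the CPU's performance-monitoring capabilities (the
* IA32_PERF_CAPABILITIES MSR layout); the Intel-specific init code in
* perf_event_intel.c, included further down, fills in x86_pmu.intel_cap
* from that MSR.
*/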
221
222
/*
223
* struct x86_pmu - generic x86 pmu
224
*/
225
struct x86_pmu {
226
/*
227
* Generic x86 PMC bits
228
*/
229
const char *name;
230
int version;
231
int (*handle_irq)(struct pt_regs *);
232
void (*disable_all)(void);
233
void (*enable_all)(int added);
234
void (*enable)(struct perf_event *);
235
void (*disable)(struct perf_event *);
236
int (*hw_config)(struct perf_event *event);
237
int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
238
unsigned eventsel;
239
unsigned perfctr;
240
u64 (*event_map)(int);
241
int max_events;
242
int num_counters;
243
int num_counters_fixed;
244
int cntval_bits;
245
u64 cntval_mask;
246
int apic;
247
u64 max_period;
248
struct event_constraint *
249
(*get_event_constraints)(struct cpu_hw_events *cpuc,
250
struct perf_event *event);
251
252
void (*put_event_constraints)(struct cpu_hw_events *cpuc,
253
struct perf_event *event);
254
struct event_constraint *event_constraints;
255
struct event_constraint *percore_constraints;
256
void (*quirks)(void);
257
int perfctr_second_write;
258
259
int (*cpu_prepare)(int cpu);
260
void (*cpu_starting)(int cpu);
261
void (*cpu_dying)(int cpu);
262
void (*cpu_dead)(int cpu);
263
264
/*
265
* Intel Arch Perfmon v2+
266
*/
267
u64 intel_ctrl;
268
union perf_capabilities intel_cap;
269
270
/*
271
* Intel DebugStore bits
272
*/
273
int bts, pebs;
274
int bts_active, pebs_active;
275
int pebs_record_size;
276
void (*drain_pebs)(struct pt_regs *regs);
277
struct event_constraint *pebs_constraints;
278
279
/*
280
* Intel LBR
281
*/
282
unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
283
int lbr_nr; /* hardware stack size */
284
285
/*
286
* Extra registers for events
287
*/
288
struct extra_reg *extra_regs;
289
};
290
291
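/*
* The single, global PMU description. It is filled in once at boot by the
* vendor init code (intel_pmu_init()/amd_pmu_init(), called from
* init_hw_perf_events() below) and is read-mostly afterwards.
*/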
static struct x86_pmu x86_pmu __read_mostly;
292
293
static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
294
.enabled = 1,
295
};
296
297
static int x86_perf_event_set_period(struct perf_event *event);
298
299
/*
300
* Generalized hw caching related hw_event table, filled
301
* in on a per model basis. A value of 0 means
302
* 'not supported', -1 means 'hw_event makes no sense on
303
* this CPU', any other value means the raw hw_event
304
* ID.
305
*/
306
307
#define C(x) PERF_COUNT_HW_CACHE_##x
308
309
static u64 __read_mostly hw_cache_event_ids
310
[PERF_COUNT_HW_CACHE_MAX]
311
[PERF_COUNT_HW_CACHE_OP_MAX]
312
[PERF_COUNT_HW_CACHE_RESULT_MAX];
313
static u64 __read_mostly hw_cache_extra_regs
314
[PERF_COUNT_HW_CACHE_MAX]
315
[PERF_COUNT_HW_CACHE_OP_MAX]
316
[PERF_COUNT_HW_CACHE_RESULT_MAX];
317
318
/*
319
* Propagate event elapsed time into the generic event.
320
* Can only be executed on the CPU where the event is active.
321
* Returns the new raw count; the elapsed delta is folded into the generic event.
322
*/
323
static u64
324
x86_perf_event_update(struct perf_event *event)
325
{
326
struct hw_perf_event *hwc = &event->hw;
327
int shift = 64 - x86_pmu.cntval_bits;
328
u64 prev_raw_count, new_raw_count;
329
int idx = hwc->idx;
330
s64 delta;
331
332
if (idx == X86_PMC_IDX_FIXED_BTS)
333
return 0;
334
335
/*
336
* Careful: an NMI might modify the previous event value.
337
*
338
* Our tactic to handle this is to first atomically read and
339
* exchange a new raw count - then add that new-prev delta
340
* count to the generic event atomically:
341
*/
342
again:
343
prev_raw_count = local64_read(&hwc->prev_count);
344
rdmsrl(hwc->event_base, new_raw_count);
345
346
if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
347
new_raw_count) != prev_raw_count)
348
goto again;
349
350
/*
351
* Now we have the new raw value and have updated the prev
352
* timestamp already. We can now calculate the elapsed delta
353
* (event-)time and add that to the generic event.
354
*
355
* Careful, not all hw sign-extends above the physical width
356
* of the count.
357
*/
358
delta = (new_raw_count << shift) - (prev_raw_count << shift);
359
delta >>= shift;
360
361
local64_add(delta, &event->count);
362
local64_sub(delta, &hwc->period_left);
363
364
return new_raw_count;
365
}
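/*
* Worked example of the shift trick above, assuming 48-bit counters
* (shift = 16): if prev_raw_count = 0xFFFFFFFFFFF0 and the counter wrapped
* to new_raw_count = 0x10, then (new << 16) - (prev << 16) = 0x200000 as a
* 64-bit value and the arithmetic right shift by 16 yields 0x20 = 32,
* exactly the number of events that elapsed across the 48-bit wrap.
*/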
366
367
static inline int x86_pmu_addr_offset(int index)
368
{
369
int offset;
370
371
/* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */
372
alternative_io(ASM_NOP2,
373
"shll $1, %%eax",
374
X86_FEATURE_PERFCTR_CORE,
375
"=a" (offset),
376
"a" (index));
377
378
return offset;
379
}
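/*
* alternative_io() patches this code at boot: on CPUs with
* X86_FEATURE_PERFCTR_CORE the NOP is replaced by "shll $1, %eax", doubling
* the offset (those CPUs space their counter MSRs two apart), without
* paying for a runtime feature check.
*/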
380
381
static inline unsigned int x86_pmu_config_addr(int index)
382
{
383
return x86_pmu.eventsel + x86_pmu_addr_offset(index);
384
}
385
386
static inline unsigned int x86_pmu_event_addr(int index)
387
{
388
return x86_pmu.perfctr + x86_pmu_addr_offset(index);
389
}
390
391
/*
392
* Find and validate any extra registers to set up.
393
*/
394
static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
395
{
396
struct extra_reg *er;
397
398
event->hw.extra_reg = 0;
399
event->hw.extra_config = 0;
400
401
if (!x86_pmu.extra_regs)
402
return 0;
403
404
for (er = x86_pmu.extra_regs; er->msr; er++) {
405
if (er->event != (config & er->config_mask))
406
continue;
407
if (event->attr.config1 & ~er->valid_mask)
408
return -EINVAL;
409
event->hw.extra_reg = er->msr;
410
event->hw.extra_config = event->attr.config1;
411
break;
412
}
413
return 0;
414
}
415
416
static atomic_t active_events;
417
static DEFINE_MUTEX(pmc_reserve_mutex);
418
419
#ifdef CONFIG_X86_LOCAL_APIC
420
421
static bool reserve_pmc_hardware(void)
422
{
423
int i;
424
425
for (i = 0; i < x86_pmu.num_counters; i++) {
426
if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
427
goto perfctr_fail;
428
}
429
430
for (i = 0; i < x86_pmu.num_counters; i++) {
431
if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
432
goto eventsel_fail;
433
}
434
435
return true;
436
437
eventsel_fail:
438
for (i--; i >= 0; i--)
439
release_evntsel_nmi(x86_pmu_config_addr(i));
440
441
i = x86_pmu.num_counters;
442
443
perfctr_fail:
444
for (i--; i >= 0; i--)
445
release_perfctr_nmi(x86_pmu_event_addr(i));
446
447
return false;
448
}
449
450
static void release_pmc_hardware(void)
451
{
452
int i;
453
454
for (i = 0; i < x86_pmu.num_counters; i++) {
455
release_perfctr_nmi(x86_pmu_event_addr(i));
456
release_evntsel_nmi(x86_pmu_config_addr(i));
457
}
458
}
459
460
#else
461
462
static bool reserve_pmc_hardware(void) { return true; }
463
static void release_pmc_hardware(void) {}
464
465
#endif
466
467
static bool check_hw_exists(void)
468
{
469
u64 val, val_new = 0;
470
int i, reg, ret = 0;
471
472
/*
473
* Check to see if the BIOS enabled any of the counters; if so,
474
* complain and bail.
475
*/
476
for (i = 0; i < x86_pmu.num_counters; i++) {
477
reg = x86_pmu_config_addr(i);
478
ret = rdmsrl_safe(reg, &val);
479
if (ret)
480
goto msr_fail;
481
if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
482
goto bios_fail;
483
}
484
485
if (x86_pmu.num_counters_fixed) {
486
reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
487
ret = rdmsrl_safe(reg, &val);
488
if (ret)
489
goto msr_fail;
490
for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
491
if (val & (0x03 << i*4))
492
goto bios_fail;
493
}
494
}
495
496
/*
497
* Now write a value and read it back to see if it matches,
498
* this is needed to detect certain hardware emulators (qemu/kvm)
499
* that don't trap on the MSR access and always return 0s.
500
*/
501
val = 0xabcdUL;
502
ret = checking_wrmsrl(x86_pmu_event_addr(0), val);
503
ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new);
504
if (ret || val != val_new)
505
goto msr_fail;
506
507
return true;
508
509
bios_fail:
510
/*
511
* We still allow the PMU driver to operate:
512
*/
513
printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n");
514
printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val);
515
516
return true;
517
518
msr_fail:
519
printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
520
521
return false;
522
}
523
524
static void reserve_ds_buffers(void);
525
static void release_ds_buffers(void);
526
527
static void hw_perf_event_destroy(struct perf_event *event)
528
{
529
if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
530
release_pmc_hardware();
531
release_ds_buffers();
532
mutex_unlock(&pmc_reserve_mutex);
533
}
534
}
535
536
static inline int x86_pmu_initialized(void)
537
{
538
return x86_pmu.handle_irq != NULL;
539
}
540
541
static inline int
542
set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
543
{
544
struct perf_event_attr *attr = &event->attr;
545
unsigned int cache_type, cache_op, cache_result;
546
u64 config, val;
547
548
config = attr->config;
549
550
cache_type = (config >> 0) & 0xff;
551
if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
552
return -EINVAL;
553
554
cache_op = (config >> 8) & 0xff;
555
if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
556
return -EINVAL;
557
558
cache_result = (config >> 16) & 0xff;
559
if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
560
return -EINVAL;
561
562
val = hw_cache_event_ids[cache_type][cache_op][cache_result];
563
564
if (val == 0)
565
return -ENOENT;
566
567
if (val == -1)
568
return -EINVAL;
569
570
hwc->config |= val;
571
attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
572
return x86_pmu_extra_regs(val, event);
573
}
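/*
* The attr->config layout decoded above is the generic cache-event ABI:
* bits 0-7 select the cache (e.g. L1D), bits 8-15 the operation
* (read/write/prefetch) and bits 16-23 the result (access/miss); the
* per-model tables then translate that triple into a raw hardware event.
*/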
574
575
static int x86_setup_perfctr(struct perf_event *event)
576
{
577
struct perf_event_attr *attr = &event->attr;
578
struct hw_perf_event *hwc = &event->hw;
579
u64 config;
580
581
if (!is_sampling_event(event)) {
582
hwc->sample_period = x86_pmu.max_period;
583
hwc->last_period = hwc->sample_period;
584
local64_set(&hwc->period_left, hwc->sample_period);
585
} else {
586
/*
587
* If we have a PMU initialized but no APIC
588
* interrupts, we cannot sample hardware
589
* events (user-space has to fall back and
590
* sample via a hrtimer based software event):
591
*/
592
if (!x86_pmu.apic)
593
return -EOPNOTSUPP;
594
}
595
596
/*
597
* Do not allow config1 (extended registers) to propagate,
598
* there's no sane user-space generalization yet:
599
*/
600
if (attr->type == PERF_TYPE_RAW)
601
return 0;
602
603
if (attr->type == PERF_TYPE_HW_CACHE)
604
return set_ext_hw_attr(hwc, event);
605
606
if (attr->config >= x86_pmu.max_events)
607
return -EINVAL;
608
609
/*
610
* The generic map:
611
*/
612
config = x86_pmu.event_map(attr->config);
613
614
if (config == 0)
615
return -ENOENT;
616
617
if (config == -1LL)
618
return -EINVAL;
619
620
/*
621
* Branch tracing:
622
*/
623
if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
624
!attr->freq && hwc->sample_period == 1) {
625
/* BTS is not supported by this architecture. */
626
if (!x86_pmu.bts_active)
627
return -EOPNOTSUPP;
628
629
/* BTS is currently only allowed for user-mode. */
630
if (!attr->exclude_kernel)
631
return -EOPNOTSUPP;
632
}
633
634
hwc->config |= config;
635
636
return 0;
637
}
638
639
static int x86_pmu_hw_config(struct perf_event *event)
640
{
641
if (event->attr.precise_ip) {
642
int precise = 0;
643
644
/* Support for constant skid */
645
if (x86_pmu.pebs_active) {
646
precise++;
647
648
/* Support for IP fixup */
649
if (x86_pmu.lbr_nr)
650
precise++;
651
}
652
653
if (event->attr.precise_ip > precise)
654
return -EOPNOTSUPP;
655
}
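/*
* In other words: precise_ip == 1 requires PEBS (constant-skid samples),
* precise_ip == 2 additionally requires the LBR stack so the sampled IP
* can be fixed up; any request more precise than the hardware supports is
* rejected above.
*/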
656
657
/*
658
* Generate PMC IRQs:
659
* (keep 'enabled' bit clear for now)
660
*/
661
event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
662
663
/*
664
* Count user and OS events unless requested not to
665
*/
666
if (!event->attr.exclude_user)
667
event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
668
if (!event->attr.exclude_kernel)
669
event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
670
671
if (event->attr.type == PERF_TYPE_RAW)
672
event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
673
674
return x86_setup_perfctr(event);
675
}
676
677
/*
678
* Setup the hardware configuration for a given attr_type
679
*/
680
static int __x86_pmu_event_init(struct perf_event *event)
681
{
682
int err;
683
684
if (!x86_pmu_initialized())
685
return -ENODEV;
686
687
err = 0;
688
if (!atomic_inc_not_zero(&active_events)) {
689
mutex_lock(&pmc_reserve_mutex);
690
if (atomic_read(&active_events) == 0) {
691
if (!reserve_pmc_hardware())
692
err = -EBUSY;
693
else
694
reserve_ds_buffers();
695
}
696
if (!err)
697
atomic_inc(&active_events);
698
mutex_unlock(&pmc_reserve_mutex);
699
}
700
if (err)
701
return err;
702
703
event->destroy = hw_perf_event_destroy;
704
705
event->hw.idx = -1;
706
event->hw.last_cpu = -1;
707
event->hw.last_tag = ~0ULL;
708
709
return x86_pmu.hw_config(event);
710
}
711
712
static void x86_pmu_disable_all(void)
713
{
714
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
715
int idx;
716
717
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
718
u64 val;
719
720
if (!test_bit(idx, cpuc->active_mask))
721
continue;
722
rdmsrl(x86_pmu_config_addr(idx), val);
723
if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
724
continue;
725
val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
726
wrmsrl(x86_pmu_config_addr(idx), val);
727
}
728
}
729
730
static void x86_pmu_disable(struct pmu *pmu)
731
{
732
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
733
734
if (!x86_pmu_initialized())
735
return;
736
737
if (!cpuc->enabled)
738
return;
739
740
cpuc->n_added = 0;
741
cpuc->enabled = 0;
742
barrier();
743
744
x86_pmu.disable_all();
745
}
746
747
static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
748
u64 enable_mask)
749
{
750
if (hwc->extra_reg)
751
wrmsrl(hwc->extra_reg, hwc->extra_config);
752
wrmsrl(hwc->config_base, hwc->config | enable_mask);
753
}
754
755
static void x86_pmu_enable_all(int added)
756
{
757
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
758
int idx;
759
760
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
761
struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
762
763
if (!test_bit(idx, cpuc->active_mask))
764
continue;
765
766
__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
767
}
768
}
769
770
static struct pmu pmu;
771
772
static inline int is_x86_event(struct perf_event *event)
773
{
774
return event->pmu == &pmu;
775
}
776
777
static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
778
{
779
struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
780
unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
781
int i, j, w, wmax, num = 0;
782
struct hw_perf_event *hwc;
783
784
bitmap_zero(used_mask, X86_PMC_IDX_MAX);
785
786
for (i = 0; i < n; i++) {
787
c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
788
constraints[i] = c;
789
}
790
791
/*
792
* fastpath, try to reuse previous register
793
*/
794
for (i = 0; i < n; i++) {
795
hwc = &cpuc->event_list[i]->hw;
796
c = constraints[i];
797
798
/* never assigned */
799
if (hwc->idx == -1)
800
break;
801
802
/* constraint still honored */
803
if (!test_bit(hwc->idx, c->idxmsk))
804
break;
805
806
/* not already used */
807
if (test_bit(hwc->idx, used_mask))
808
break;
809
810
__set_bit(hwc->idx, used_mask);
811
if (assign)
812
assign[i] = hwc->idx;
813
}
814
if (i == n)
815
goto done;
816
817
/*
818
* begin slow path
819
*/
820
821
bitmap_zero(used_mask, X86_PMC_IDX_MAX);
822
823
/*
824
* weight = number of possible counters
825
*
826
* 1 = most constrained, only works on one counter
827
* wmax = least constrained, works on any counter
828
*
829
* assign events to counters starting with most
830
* constrained events.
831
*/
832
wmax = x86_pmu.num_counters;
833
834
/*
835
* when fixed event counters are present,
836
* wmax is incremented by 1 to account
837
* for one more choice
838
*/
839
if (x86_pmu.num_counters_fixed)
840
wmax++;
841
842
for (w = 1, num = n; num && w <= wmax; w++) {
843
/* for each event */
844
for (i = 0; num && i < n; i++) {
845
c = constraints[i];
846
hwc = &cpuc->event_list[i]->hw;
847
848
if (c->weight != w)
849
continue;
850
851
for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {
852
if (!test_bit(j, used_mask))
853
break;
854
}
855
856
if (j == X86_PMC_IDX_MAX)
857
break;
858
859
__set_bit(j, used_mask);
860
861
if (assign)
862
assign[i] = j;
863
num--;
864
}
865
}
866
done:
867
/*
868
* scheduling failed or is just a simulation,
869
* free resources if necessary
870
*/
871
if (!assign || num) {
872
for (i = 0; i < n; i++) {
873
if (x86_pmu.put_event_constraints)
874
x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
875
}
876
}
877
return num ? -ENOSPC : 0;
878
}
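/*
* Example of the weight ordering above (assuming at least four generic
* counters): with three events whose constraint masks are {0}, {0,1} and
* {0,1,2,3}, the w=1 pass binds the first event to counter 0, the w=2 pass
* binds the second to counter 1 and the w=4 pass gives the third event
* counter 2; scheduling the least flexible events first avoids dead ends
* that a naive first-fit order could run into.
*/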
879
880
/*
881
* dogrp: true if we must collect sibling events (i.e. the whole group)
882
* returns the total number of events, or a negative error code
883
*/
884
static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
885
{
886
struct perf_event *event;
887
int n, max_count;
888
889
max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
890
891
/* current number of events already accepted */
892
n = cpuc->n_events;
893
894
if (is_x86_event(leader)) {
895
if (n >= max_count)
896
return -ENOSPC;
897
cpuc->event_list[n] = leader;
898
n++;
899
}
900
if (!dogrp)
901
return n;
902
903
list_for_each_entry(event, &leader->sibling_list, group_entry) {
904
if (!is_x86_event(event) ||
905
event->state <= PERF_EVENT_STATE_OFF)
906
continue;
907
908
if (n >= max_count)
909
return -ENOSPC;
910
911
cpuc->event_list[n] = event;
912
n++;
913
}
914
return n;
915
}
916
917
static inline void x86_assign_hw_event(struct perf_event *event,
918
struct cpu_hw_events *cpuc, int i)
919
{
920
struct hw_perf_event *hwc = &event->hw;
921
922
hwc->idx = cpuc->assign[i];
923
hwc->last_cpu = smp_processor_id();
924
hwc->last_tag = ++cpuc->tags[i];
925
926
if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
927
hwc->config_base = 0;
928
hwc->event_base = 0;
929
} else if (hwc->idx >= X86_PMC_IDX_FIXED) {
930
hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
931
hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED);
932
} else {
933
hwc->config_base = x86_pmu_config_addr(hwc->idx);
934
hwc->event_base = x86_pmu_event_addr(hwc->idx);
935
}
936
}
937
938
static inline int match_prev_assignment(struct hw_perf_event *hwc,
939
struct cpu_hw_events *cpuc,
940
int i)
941
{
942
return hwc->idx == cpuc->assign[i] &&
943
hwc->last_cpu == smp_processor_id() &&
944
hwc->last_tag == cpuc->tags[i];
945
}
946
947
static void x86_pmu_start(struct perf_event *event, int flags);
948
static void x86_pmu_stop(struct perf_event *event, int flags);
949
950
static void x86_pmu_enable(struct pmu *pmu)
951
{
952
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
953
struct perf_event *event;
954
struct hw_perf_event *hwc;
955
int i, added = cpuc->n_added;
956
957
if (!x86_pmu_initialized())
958
return;
959
960
if (cpuc->enabled)
961
return;
962
963
if (cpuc->n_added) {
964
int n_running = cpuc->n_events - cpuc->n_added;
965
/*
966
* apply assignment obtained either from
967
* hw_perf_group_sched_in() or x86_pmu_enable()
968
*
969
* step1: save events moving to new counters
970
* step2: reprogram moved events into new counters
971
*/
972
for (i = 0; i < n_running; i++) {
973
event = cpuc->event_list[i];
974
hwc = &event->hw;
975
976
/*
977
* we can avoid reprogramming counter if:
978
* - assigned same counter as last time
979
* - running on same CPU as last time
980
* - no other event has used the counter since
981
*/
982
if (hwc->idx == -1 ||
983
match_prev_assignment(hwc, cpuc, i))
984
continue;
985
986
/*
987
* Ensure we don't accidentally enable a stopped
988
* counter simply because we rescheduled.
989
*/
990
if (hwc->state & PERF_HES_STOPPED)
991
hwc->state |= PERF_HES_ARCH;
992
993
x86_pmu_stop(event, PERF_EF_UPDATE);
994
}
995
996
for (i = 0; i < cpuc->n_events; i++) {
997
event = cpuc->event_list[i];
998
hwc = &event->hw;
999
1000
if (!match_prev_assignment(hwc, cpuc, i))
1001
x86_assign_hw_event(event, cpuc, i);
1002
else if (i < n_running)
1003
continue;
1004
1005
if (hwc->state & PERF_HES_ARCH)
1006
continue;
1007
1008
x86_pmu_start(event, PERF_EF_RELOAD);
1009
}
1010
cpuc->n_added = 0;
1011
perf_events_lapic_init();
1012
}
1013
1014
cpuc->enabled = 1;
1015
barrier();
1016
1017
x86_pmu.enable_all(added);
1018
}
1019
1020
static inline void x86_pmu_disable_event(struct perf_event *event)
1021
{
1022
struct hw_perf_event *hwc = &event->hw;
1023
1024
wrmsrl(hwc->config_base, hwc->config);
1025
}
1026
1027
static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
1028
1029
/*
1030
* Set the next IRQ period, based on the hwc->period_left value.
1031
* To be called with the event disabled in hw:
1032
*/
1033
static int
1034
x86_perf_event_set_period(struct perf_event *event)
1035
{
1036
struct hw_perf_event *hwc = &event->hw;
1037
s64 left = local64_read(&hwc->period_left);
1038
s64 period = hwc->sample_period;
1039
int ret = 0, idx = hwc->idx;
1040
1041
if (idx == X86_PMC_IDX_FIXED_BTS)
1042
return 0;
1043
1044
/*
1045
* If we are way outside a reasonable range then just skip forward:
1046
*/
1047
if (unlikely(left <= -period)) {
1048
left = period;
1049
local64_set(&hwc->period_left, left);
1050
hwc->last_period = period;
1051
ret = 1;
1052
}
1053
1054
if (unlikely(left <= 0)) {
1055
left += period;
1056
local64_set(&hwc->period_left, left);
1057
hwc->last_period = period;
1058
ret = 1;
1059
}
1060
/*
1061
* Quirk: certain CPUs don't like it if just 1 hw_event is left:
1062
*/
1063
if (unlikely(left < 2))
1064
left = 2;
1065
1066
if (left > x86_pmu.max_period)
1067
left = x86_pmu.max_period;
1068
1069
per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
1070
1071
/*
1072
* The hw event starts counting from this event offset,
1073
* mark it to be able to extract future deltas:
1074
*/
1075
local64_set(&hwc->prev_count, (u64)-left);
1076
1077
wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
1078
1079
/*
1080
* Due to an erratum on certain CPUs we need
1081
* a second write to be sure the register
1082
* is updated properly
1083
*/
1084
if (x86_pmu.perfctr_second_write) {
1085
wrmsrl(hwc->event_base,
1086
(u64)(-left) & x86_pmu.cntval_mask);
1087
}
1088
1089
perf_event_update_userpage(event);
1090
1091
return ret;
1092
}
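/*
* The counter is programmed with the negated period: loading
* (u64)(-left) & cntval_mask makes the up-counting PMC overflow, and raise
* its interrupt, after exactly "left" further events. For example, with
* left = 100000 and 48-bit counters the MSR is loaded with 2^48 - 100000.
*/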
1093
1094
static void x86_pmu_enable_event(struct perf_event *event)
1095
{
1096
if (__this_cpu_read(cpu_hw_events.enabled))
1097
__x86_pmu_enable_event(&event->hw,
1098
ARCH_PERFMON_EVENTSEL_ENABLE);
1099
}
1100
1101
/*
1102
* Add a single event to the PMU.
1103
*
1104
* The event is added to the group of enabled events
1105
* but only if it can be scheduled with existing events.
1106
*/
1107
static int x86_pmu_add(struct perf_event *event, int flags)
1108
{
1109
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1110
struct hw_perf_event *hwc;
1111
int assign[X86_PMC_IDX_MAX];
1112
int n, n0, ret;
1113
1114
hwc = &event->hw;
1115
1116
perf_pmu_disable(event->pmu);
1117
n0 = cpuc->n_events;
1118
ret = n = collect_events(cpuc, event, false);
1119
if (ret < 0)
1120
goto out;
1121
1122
hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
1123
if (!(flags & PERF_EF_START))
1124
hwc->state |= PERF_HES_ARCH;
1125
1126
/*
1127
* If group events scheduling transaction was started,
1128
* skip the schedulability test here, it will be performed
1129
* at commit time (->commit_txn) as a whole
1130
*/
1131
if (cpuc->group_flag & PERF_EVENT_TXN)
1132
goto done_collect;
1133
1134
ret = x86_pmu.schedule_events(cpuc, n, assign);
1135
if (ret)
1136
goto out;
1137
/*
1138
* copy the new assignment; now that we know it is possible,
1139
* it will be used by hw_perf_enable()
1140
*/
1141
memcpy(cpuc->assign, assign, n*sizeof(int));
1142
1143
done_collect:
1144
cpuc->n_events = n;
1145
cpuc->n_added += n - n0;
1146
cpuc->n_txn += n - n0;
1147
1148
ret = 0;
1149
out:
1150
perf_pmu_enable(event->pmu);
1151
return ret;
1152
}
1153
1154
static void x86_pmu_start(struct perf_event *event, int flags)
1155
{
1156
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1157
int idx = event->hw.idx;
1158
1159
if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
1160
return;
1161
1162
if (WARN_ON_ONCE(idx == -1))
1163
return;
1164
1165
if (flags & PERF_EF_RELOAD) {
1166
WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
1167
x86_perf_event_set_period(event);
1168
}
1169
1170
event->hw.state = 0;
1171
1172
cpuc->events[idx] = event;
1173
__set_bit(idx, cpuc->active_mask);
1174
__set_bit(idx, cpuc->running);
1175
x86_pmu.enable(event);
1176
perf_event_update_userpage(event);
1177
}
1178
1179
void perf_event_print_debug(void)
1180
{
1181
u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1182
u64 pebs;
1183
struct cpu_hw_events *cpuc;
1184
unsigned long flags;
1185
int cpu, idx;
1186
1187
if (!x86_pmu.num_counters)
1188
return;
1189
1190
local_irq_save(flags);
1191
1192
cpu = smp_processor_id();
1193
cpuc = &per_cpu(cpu_hw_events, cpu);
1194
1195
if (x86_pmu.version >= 2) {
1196
rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
1197
rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
1198
rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
1199
rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
1200
rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
1201
1202
pr_info("\n");
1203
pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
1204
pr_info("CPU#%d: status: %016llx\n", cpu, status);
1205
pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
1206
pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
1207
pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
1208
}
1209
pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1210
1211
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1212
rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
1213
rdmsrl(x86_pmu_event_addr(idx), pmc_count);
1214
1215
prev_left = per_cpu(pmc_prev_left[idx], cpu);
1216
1217
pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
1218
cpu, idx, pmc_ctrl);
1219
pr_info("CPU#%d: gen-PMC%d count: %016llx\n",
1220
cpu, idx, pmc_count);
1221
pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
1222
cpu, idx, prev_left);
1223
}
1224
for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
1225
rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
1226
1227
pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
1228
cpu, idx, pmc_count);
1229
}
1230
local_irq_restore(flags);
1231
}
1232
1233
static void x86_pmu_stop(struct perf_event *event, int flags)
1234
{
1235
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1236
struct hw_perf_event *hwc = &event->hw;
1237
1238
if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
1239
x86_pmu.disable(event);
1240
cpuc->events[hwc->idx] = NULL;
1241
WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
1242
hwc->state |= PERF_HES_STOPPED;
1243
}
1244
1245
if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
1246
/*
1247
* Drain the remaining delta count out of an event
1248
* that we are disabling:
1249
*/
1250
x86_perf_event_update(event);
1251
hwc->state |= PERF_HES_UPTODATE;
1252
}
1253
}
1254
1255
static void x86_pmu_del(struct perf_event *event, int flags)
1256
{
1257
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1258
int i;
1259
1260
/*
1261
* If we're called during a txn, we don't need to do anything.
1262
* The events never got scheduled and ->cancel_txn will truncate
1263
* the event_list.
1264
*/
1265
if (cpuc->group_flag & PERF_EVENT_TXN)
1266
return;
1267
1268
x86_pmu_stop(event, PERF_EF_UPDATE);
1269
1270
for (i = 0; i < cpuc->n_events; i++) {
1271
if (event == cpuc->event_list[i]) {
1272
1273
if (x86_pmu.put_event_constraints)
1274
x86_pmu.put_event_constraints(cpuc, event);
1275
1276
while (++i < cpuc->n_events)
1277
cpuc->event_list[i-1] = cpuc->event_list[i];
1278
1279
--cpuc->n_events;
1280
break;
1281
}
1282
}
1283
perf_event_update_userpage(event);
1284
}
1285
1286
static int x86_pmu_handle_irq(struct pt_regs *regs)
1287
{
1288
struct perf_sample_data data;
1289
struct cpu_hw_events *cpuc;
1290
struct perf_event *event;
1291
int idx, handled = 0;
1292
u64 val;
1293
1294
perf_sample_data_init(&data, 0);
1295
1296
cpuc = &__get_cpu_var(cpu_hw_events);
1297
1298
/*
1299
* Some chipsets need to unmask the LVTPC in a particular spot
1300
* inside the nmi handler. As a result, the unmasking was pushed
1301
* into all the nmi handlers.
1302
*
1303
* This generic handler doesn't seem to have any issues where the
1304
* unmasking occurs so it was left at the top.
1305
*/
1306
apic_write(APIC_LVTPC, APIC_DM_NMI);
1307
1308
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1309
if (!test_bit(idx, cpuc->active_mask)) {
1310
/*
1311
* Though we deactivated the counter, some CPUs
1312
* might still deliver spurious interrupts that are
1313
* in flight. Catch them:
1314
*/
1315
if (__test_and_clear_bit(idx, cpuc->running))
1316
handled++;
1317
continue;
1318
}
1319
1320
event = cpuc->events[idx];
1321
1322
val = x86_perf_event_update(event);
1323
if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
1324
continue;
1325
1326
/*
1327
* event overflow
1328
*/
1329
handled++;
1330
data.period = event->hw.last_period;
1331
1332
if (!x86_perf_event_set_period(event))
1333
continue;
1334
1335
if (perf_event_overflow(event, 1, &data, regs))
1336
x86_pmu_stop(event, 0);
1337
}
1338
1339
if (handled)
1340
inc_irq_stat(apic_perf_irqs);
1341
1342
return handled;
1343
}
1344
1345
void perf_events_lapic_init(void)
1346
{
1347
if (!x86_pmu.apic || !x86_pmu_initialized())
1348
return;
1349
1350
/*
1351
* Always use NMI for PMU
1352
*/
1353
apic_write(APIC_LVTPC, APIC_DM_NMI);
1354
}
1355
1356
struct pmu_nmi_state {
1357
unsigned int marked;
1358
int handled;
1359
};
1360
1361
static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi);
1362
1363
static int __kprobes
1364
perf_event_nmi_handler(struct notifier_block *self,
1365
unsigned long cmd, void *__args)
1366
{
1367
struct die_args *args = __args;
1368
unsigned int this_nmi;
1369
int handled;
1370
1371
if (!atomic_read(&active_events))
1372
return NOTIFY_DONE;
1373
1374
switch (cmd) {
1375
case DIE_NMI:
1376
break;
1377
case DIE_NMIUNKNOWN:
1378
this_nmi = percpu_read(irq_stat.__nmi_count);
1379
if (this_nmi != __this_cpu_read(pmu_nmi.marked))
1380
/* let the kernel handle the unknown nmi */
1381
return NOTIFY_DONE;
1382
/*
1383
* This one is a PMU back-to-back nmi. Two events
1384
* trigger 'simultaneously' raising two back-to-back
1385
* NMIs. If the first NMI handles both, the latter
1386
* will be empty and daze the CPU. So, we drop it to
1387
* avoid false-positive 'unknown nmi' messages.
1388
*/
1389
return NOTIFY_STOP;
1390
default:
1391
return NOTIFY_DONE;
1392
}
1393
1394
handled = x86_pmu.handle_irq(args->regs);
1395
if (!handled)
1396
return NOTIFY_DONE;
1397
1398
this_nmi = percpu_read(irq_stat.__nmi_count);
1399
if ((handled > 1) ||
1400
/* the next nmi could be a back-to-back nmi */
1401
((__this_cpu_read(pmu_nmi.marked) == this_nmi) &&
1402
(__this_cpu_read(pmu_nmi.handled) > 1))) {
1403
/*
1404
* We could have two subsequent back-to-back nmis: The
1405
* first handles more than one counter, the 2nd
1406
* handles only one counter and the 3rd handles no
1407
* counter.
1408
*
1409
* This is the 2nd nmi because the previous was
1410
* handling more than one counter. We will mark the
1411
* next (3rd) and then drop it if unhandled.
1412
*/
1413
__this_cpu_write(pmu_nmi.marked, this_nmi + 1);
1414
__this_cpu_write(pmu_nmi.handled, handled);
1415
}
1416
1417
return NOTIFY_STOP;
1418
}
1419
1420
static __read_mostly struct notifier_block perf_event_nmi_notifier = {
1421
.notifier_call = perf_event_nmi_handler,
1422
.next = NULL,
1423
.priority = NMI_LOCAL_LOW_PRIOR,
1424
};
1425
1426
static struct event_constraint unconstrained;
1427
static struct event_constraint emptyconstraint;
1428
1429
static struct event_constraint *
1430
x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
1431
{
1432
struct event_constraint *c;
1433
1434
if (x86_pmu.event_constraints) {
1435
for_each_event_constraint(c, x86_pmu.event_constraints) {
1436
if ((event->hw.config & c->cmask) == c->code)
1437
return c;
1438
}
1439
}
1440
1441
return &unconstrained;
1442
}
1443
1444
#include "perf_event_amd.c"
1445
#include "perf_event_p6.c"
1446
#include "perf_event_p4.c"
1447
#include "perf_event_intel_lbr.c"
1448
#include "perf_event_intel_ds.c"
1449
#include "perf_event_intel.c"
1450
1451
static int __cpuinit
1452
x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
1453
{
1454
unsigned int cpu = (long)hcpu;
1455
int ret = NOTIFY_OK;
1456
1457
switch (action & ~CPU_TASKS_FROZEN) {
1458
case CPU_UP_PREPARE:
1459
if (x86_pmu.cpu_prepare)
1460
ret = x86_pmu.cpu_prepare(cpu);
1461
break;
1462
1463
case CPU_STARTING:
1464
if (x86_pmu.cpu_starting)
1465
x86_pmu.cpu_starting(cpu);
1466
break;
1467
1468
case CPU_DYING:
1469
if (x86_pmu.cpu_dying)
1470
x86_pmu.cpu_dying(cpu);
1471
break;
1472
1473
case CPU_UP_CANCELED:
1474
case CPU_DEAD:
1475
if (x86_pmu.cpu_dead)
1476
x86_pmu.cpu_dead(cpu);
1477
break;
1478
1479
default:
1480
break;
1481
}
1482
1483
return ret;
1484
}
1485
1486
static void __init pmu_check_apic(void)
1487
{
1488
if (cpu_has_apic)
1489
return;
1490
1491
x86_pmu.apic = 0;
1492
pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
1493
pr_info("no hardware sampling interrupt available.\n");
1494
}
1495
1496
static int __init init_hw_perf_events(void)
1497
{
1498
struct event_constraint *c;
1499
int err;
1500
1501
pr_info("Performance Events: ");
1502
1503
switch (boot_cpu_data.x86_vendor) {
1504
case X86_VENDOR_INTEL:
1505
err = intel_pmu_init();
1506
break;
1507
case X86_VENDOR_AMD:
1508
err = amd_pmu_init();
1509
break;
1510
default:
1511
return 0;
1512
}
1513
if (err != 0) {
1514
pr_cont("no PMU driver, software events only.\n");
1515
return 0;
1516
}
1517
1518
pmu_check_apic();
1519
1520
/* sanity check that the hardware exists or is emulated */
1521
if (!check_hw_exists())
1522
return 0;
1523
1524
pr_cont("%s PMU driver.\n", x86_pmu.name);
1525
1526
if (x86_pmu.quirks)
1527
x86_pmu.quirks();
1528
1529
if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
1530
WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
1531
x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
1532
x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1533
}
1534
x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
1535
1536
if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1537
WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
1538
x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
1539
x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1540
}
1541
1542
x86_pmu.intel_ctrl |=
1543
((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
1544
1545
perf_events_lapic_init();
1546
register_die_notifier(&perf_event_nmi_notifier);
1547
1548
unconstrained = (struct event_constraint)
1549
__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
1550
0, x86_pmu.num_counters);
1551
1552
if (x86_pmu.event_constraints) {
1553
for_each_event_constraint(c, x86_pmu.event_constraints) {
1554
if (c->cmask != X86_RAW_EVENT_MASK)
1555
continue;
1556
1557
c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
1558
c->weight += x86_pmu.num_counters;
1559
}
1560
}
1561
1562
pr_info("... version: %d\n", x86_pmu.version);
1563
pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
1564
pr_info("... generic registers: %d\n", x86_pmu.num_counters);
1565
pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
1566
pr_info("... max period: %016Lx\n", x86_pmu.max_period);
1567
pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
1568
pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
1569
1570
perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
1571
perf_cpu_notifier(x86_pmu_notifier);
1572
1573
return 0;
1574
}
1575
early_initcall(init_hw_perf_events);
1576
1577
static inline void x86_pmu_read(struct perf_event *event)
1578
{
1579
x86_perf_event_update(event);
1580
}
1581
1582
/*
1583
* Start group events scheduling transaction
1584
* Set the flag to make pmu::enable() not perform the
1585
* schedulability test; it will be performed at commit time
1586
*/
1587
static void x86_pmu_start_txn(struct pmu *pmu)
1588
{
1589
perf_pmu_disable(pmu);
1590
__this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN);
1591
__this_cpu_write(cpu_hw_events.n_txn, 0);
1592
}
1593
1594
/*
1595
* Stop group events scheduling transaction
1596
* Clear the flag and pmu::enable() will perform the
1597
* schedulability test.
1598
*/
1599
static void x86_pmu_cancel_txn(struct pmu *pmu)
1600
{
1601
__this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);
1602
/*
1603
* Truncate the collected events.
1604
*/
1605
__this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
1606
__this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
1607
perf_pmu_enable(pmu);
1608
}
1609
1610
/*
1611
* Commit group events scheduling transaction
1612
* Perform the group schedulability test as a whole
1613
* Return 0 if success
1614
*/
1615
static int x86_pmu_commit_txn(struct pmu *pmu)
1616
{
1617
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1618
int assign[X86_PMC_IDX_MAX];
1619
int n, ret;
1620
1621
n = cpuc->n_events;
1622
1623
if (!x86_pmu_initialized())
1624
return -EAGAIN;
1625
1626
ret = x86_pmu.schedule_events(cpuc, n, assign);
1627
if (ret)
1628
return ret;
1629
1630
/*
1631
* copy the new assignment; now that we know it is possible,
1632
* it will be used by hw_perf_enable()
1633
*/
1634
memcpy(cpuc->assign, assign, n*sizeof(int));
1635
1636
cpuc->group_flag &= ~PERF_EVENT_TXN;
1637
perf_pmu_enable(pmu);
1638
return 0;
1639
}
1640
1641
/*
1642
* validate that we can schedule this event
1643
*/
1644
static int validate_event(struct perf_event *event)
1645
{
1646
struct cpu_hw_events *fake_cpuc;
1647
struct event_constraint *c;
1648
int ret = 0;
1649
1650
fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
1651
if (!fake_cpuc)
1652
return -ENOMEM;
1653
1654
c = x86_pmu.get_event_constraints(fake_cpuc, event);
1655
1656
if (!c || !c->weight)
1657
ret = -ENOSPC;
1658
1659
if (x86_pmu.put_event_constraints)
1660
x86_pmu.put_event_constraints(fake_cpuc, event);
1661
1662
kfree(fake_cpuc);
1663
1664
return ret;
1665
}
1666
1667
/*
1668
* validate a single event group
1669
*
1670
* validation includes:
1671
* - check events are compatible with each other
1672
* - events do not compete for the same counter
1673
* - number of events <= number of counters
1674
*
1675
* validation ensures the group can be loaded onto the
1676
* PMU if it was the only group available.
1677
*/
1678
static int validate_group(struct perf_event *event)
1679
{
1680
struct perf_event *leader = event->group_leader;
1681
struct cpu_hw_events *fake_cpuc;
1682
int ret, n;
1683
1684
ret = -ENOMEM;
1685
fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
1686
if (!fake_cpuc)
1687
goto out;
1688
1689
/*
1690
* the event is not yet connected with its
1691
* siblings; therefore we must first collect
1692
* existing siblings, then add the new event
1693
* before we can simulate the scheduling
1694
*/
1695
ret = -ENOSPC;
1696
n = collect_events(fake_cpuc, leader, true);
1697
if (n < 0)
1698
goto out_free;
1699
1700
fake_cpuc->n_events = n;
1701
n = collect_events(fake_cpuc, event, false);
1702
if (n < 0)
1703
goto out_free;
1704
1705
fake_cpuc->n_events = n;
1706
1707
ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
1708
1709
out_free:
1710
kfree(fake_cpuc);
1711
out:
1712
return ret;
1713
}
1714
1715
static int x86_pmu_event_init(struct perf_event *event)
1716
{
1717
struct pmu *tmp;
1718
int err;
1719
1720
switch (event->attr.type) {
1721
case PERF_TYPE_RAW:
1722
case PERF_TYPE_HARDWARE:
1723
case PERF_TYPE_HW_CACHE:
1724
break;
1725
1726
default:
1727
return -ENOENT;
1728
}
1729
1730
err = __x86_pmu_event_init(event);
1731
if (!err) {
1732
/*
1733
* we temporarily connect event to its pmu
1734
* such that validate_group() can classify
1735
* it as an x86 event using is_x86_event()
1736
*/
1737
tmp = event->pmu;
1738
event->pmu = &pmu;
1739
1740
if (event->group_leader != event)
1741
err = validate_group(event);
1742
else
1743
err = validate_event(event);
1744
1745
event->pmu = tmp;
1746
}
1747
if (err) {
1748
if (event->destroy)
1749
event->destroy(event);
1750
}
1751
1752
return err;
1753
}
1754
1755
static struct pmu pmu = {
1756
.pmu_enable = x86_pmu_enable,
1757
.pmu_disable = x86_pmu_disable,
1758
1759
.event_init = x86_pmu_event_init,
1760
1761
.add = x86_pmu_add,
1762
.del = x86_pmu_del,
1763
.start = x86_pmu_start,
1764
.stop = x86_pmu_stop,
1765
.read = x86_pmu_read,
1766
1767
.start_txn = x86_pmu_start_txn,
1768
.cancel_txn = x86_pmu_cancel_txn,
1769
.commit_txn = x86_pmu_commit_txn,
1770
};
1771
1772
/*
1773
* callchain support
1774
*/
1775
1776
static int backtrace_stack(void *data, char *name)
1777
{
1778
return 0;
1779
}
1780
1781
static void backtrace_address(void *data, unsigned long addr, int reliable)
1782
{
1783
struct perf_callchain_entry *entry = data;
1784
1785
perf_callchain_store(entry, addr);
1786
}
1787
1788
static const struct stacktrace_ops backtrace_ops = {
1789
.stack = backtrace_stack,
1790
.address = backtrace_address,
1791
.walk_stack = print_context_stack_bp,
1792
};
1793
1794
void
1795
perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
1796
{
1797
if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1798
/* TODO: We don't support guest os callchain now */
1799
return;
1800
}
1801
1802
perf_callchain_store(entry, regs->ip);
1803
1804
dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
1805
}
1806
1807
#ifdef CONFIG_COMPAT
1808
static inline int
1809
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
1810
{
1811
/* 32-bit process in 64-bit kernel. */
1812
struct stack_frame_ia32 frame;
1813
const void __user *fp;
1814
1815
if (!test_thread_flag(TIF_IA32))
1816
return 0;
1817
1818
fp = compat_ptr(regs->bp);
1819
while (entry->nr < PERF_MAX_STACK_DEPTH) {
1820
unsigned long bytes;
1821
frame.next_frame = 0;
1822
frame.return_address = 0;
1823
1824
bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
1825
if (bytes != sizeof(frame))
1826
break;
1827
1828
if (fp < compat_ptr(regs->sp))
1829
break;
1830
1831
perf_callchain_store(entry, frame.return_address);
1832
fp = compat_ptr(frame.next_frame);
1833
}
1834
return 1;
1835
}
1836
#else
1837
static inline int
1838
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
1839
{
1840
return 0;
1841
}
1842
#endif
1843
1844
void
1845
perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
1846
{
1847
struct stack_frame frame;
1848
const void __user *fp;
1849
1850
if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1851
/* TODO: We don't support guest os callchain now */
1852
return;
1853
}
1854
1855
fp = (void __user *)regs->bp;
1856
1857
perf_callchain_store(entry, regs->ip);
1858
1859
if (perf_callchain_user32(regs, entry))
1860
return;
1861
1862
while (entry->nr < PERF_MAX_STACK_DEPTH) {
1863
unsigned long bytes;
1864
frame.next_frame = NULL;
1865
frame.return_address = 0;
1866
1867
bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
1868
if (bytes != sizeof(frame))
1869
break;
1870
1871
if ((unsigned long)fp < regs->sp)
1872
break;
1873
1874
perf_callchain_store(entry, frame.return_address);
1875
fp = frame.next_frame;
1876
}
1877
}
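/*
* Both user-space unwinders above simply follow the saved frame-pointer
* chain: each struct stack_frame fetched with copy_from_user_nmi() holds
* the caller's frame pointer and return address, and the walk stops on a
* short copy, when the frame pointer drops below the current stack
* pointer, or once PERF_MAX_STACK_DEPTH entries have been recorded.
*/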
1878
1879
unsigned long perf_instruction_pointer(struct pt_regs *regs)
1880
{
1881
unsigned long ip;
1882
1883
if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
1884
ip = perf_guest_cbs->get_guest_ip();
1885
else
1886
ip = instruction_pointer(regs);
1887
1888
return ip;
1889
}
1890
1891
unsigned long perf_misc_flags(struct pt_regs *regs)
1892
{
1893
int misc = 0;
1894
1895
if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1896
if (perf_guest_cbs->is_user_mode())
1897
misc |= PERF_RECORD_MISC_GUEST_USER;
1898
else
1899
misc |= PERF_RECORD_MISC_GUEST_KERNEL;
1900
} else {
1901
if (user_mode(regs))
1902
misc |= PERF_RECORD_MISC_USER;
1903
else
1904
misc |= PERF_RECORD_MISC_KERNEL;
1905
}
1906
1907
if (regs->flags & PERF_EFLAGS_EXACT)
1908
misc |= PERF_RECORD_MISC_EXACT_IP;
1909
1910
return misc;
1911
}
1912
1913