Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/kernel/events/core.c
48988 views
1
// SPDX-License-Identifier: GPL-2.0
2
/*
3
* Performance events core code:
4
*
5
* Copyright (C) 2008 Linutronix GmbH, Thomas Gleixner <[email protected]>
6
* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
7
* Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
8
* Copyright © 2009 Paul Mackerras, IBM Corp. <[email protected]>
9
*/
10
11
#include <linux/fs.h>
12
#include <linux/mm.h>
13
#include <linux/cpu.h>
14
#include <linux/smp.h>
15
#include <linux/idr.h>
16
#include <linux/file.h>
17
#include <linux/poll.h>
18
#include <linux/slab.h>
19
#include <linux/hash.h>
20
#include <linux/tick.h>
21
#include <linux/sysfs.h>
22
#include <linux/dcache.h>
23
#include <linux/percpu.h>
24
#include <linux/ptrace.h>
25
#include <linux/reboot.h>
26
#include <linux/vmstat.h>
27
#include <linux/device.h>
28
#include <linux/export.h>
29
#include <linux/vmalloc.h>
30
#include <linux/hardirq.h>
31
#include <linux/hugetlb.h>
32
#include <linux/rculist.h>
33
#include <linux/uaccess.h>
34
#include <linux/syscalls.h>
35
#include <linux/anon_inodes.h>
36
#include <linux/kernel_stat.h>
37
#include <linux/cgroup.h>
38
#include <linux/perf_event.h>
39
#include <linux/trace_events.h>
40
#include <linux/hw_breakpoint.h>
41
#include <linux/mm_types.h>
42
#include <linux/module.h>
43
#include <linux/mman.h>
44
#include <linux/compat.h>
45
#include <linux/bpf.h>
46
#include <linux/filter.h>
47
#include <linux/namei.h>
48
#include <linux/parser.h>
49
#include <linux/sched/clock.h>
50
#include <linux/sched/mm.h>
51
#include <linux/proc_ns.h>
52
#include <linux/mount.h>
53
#include <linux/min_heap.h>
54
#include <linux/highmem.h>
55
#include <linux/pgtable.h>
56
#include <linux/buildid.h>
57
#include <linux/task_work.h>
58
#include <linux/percpu-rwsem.h>
59
#include <linux/unwind_deferred.h>
60
61
#include "internal.h"
62
63
#include <asm/irq_regs.h>
64
65
typedef int (*remote_function_f)(void *);
66
67
struct remote_function_call {
68
struct task_struct *p;
69
remote_function_f func;
70
void *info;
71
int ret;
72
};
73
74
static void remote_function(void *data)
75
{
76
struct remote_function_call *tfc = data;
77
struct task_struct *p = tfc->p;
78
79
if (p) {
80
/* -EAGAIN */
81
if (task_cpu(p) != smp_processor_id())
82
return;
83
84
/*
85
* Now that we're on right CPU with IRQs disabled, we can test
86
* if we hit the right task without races.
87
*/
88
89
tfc->ret = -ESRCH; /* No such (running) process */
90
if (p != current)
91
return;
92
}
93
94
tfc->ret = tfc->func(tfc->info);
95
}
96
97
/**
98
* task_function_call - call a function on the cpu on which a task runs
99
* @p: the task to evaluate
100
* @func: the function to be called
101
* @info: the function call argument
102
*
103
* Calls the function @func when the task is currently running. This might
104
* be on the current CPU, which just calls the function directly. This will
105
* retry due to any failures in smp_call_function_single(), such as if the
106
* task_cpu() goes offline concurrently.
107
*
108
* returns @func return value or -ESRCH or -ENXIO when the process isn't running
109
*/
110
static int
111
task_function_call(struct task_struct *p, remote_function_f func, void *info)
112
{
113
struct remote_function_call data = {
114
.p = p,
115
.func = func,
116
.info = info,
117
.ret = -EAGAIN,
118
};
119
int ret;
120
121
for (;;) {
122
ret = smp_call_function_single(task_cpu(p), remote_function,
123
&data, 1);
124
if (!ret)
125
ret = data.ret;
126
127
if (ret != -EAGAIN)
128
break;
129
130
cond_resched();
131
}
132
133
return ret;
134
}
135
136
/**
137
* cpu_function_call - call a function on the cpu
138
* @cpu: target cpu to queue this function
139
* @func: the function to be called
140
* @info: the function call argument
141
*
142
* Calls the function @func on the remote cpu.
143
*
144
* returns: @func return value or -ENXIO when the cpu is offline
145
*/
146
static int cpu_function_call(int cpu, remote_function_f func, void *info)
147
{
148
struct remote_function_call data = {
149
.p = NULL,
150
.func = func,
151
.info = info,
152
.ret = -ENXIO, /* No such CPU */
153
};
154
155
smp_call_function_single(cpu, remote_function, &data, 1);
156
157
return data.ret;
158
}
159
160
enum event_type_t {
161
EVENT_FLEXIBLE = 0x01,
162
EVENT_PINNED = 0x02,
163
EVENT_TIME = 0x04,
164
EVENT_FROZEN = 0x08,
165
/* see ctx_resched() for details */
166
EVENT_CPU = 0x10,
167
EVENT_CGROUP = 0x20,
168
169
/* compound helpers */
170
EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
171
EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN,
172
};
173
174
static inline void __perf_ctx_lock(struct perf_event_context *ctx)
175
{
176
raw_spin_lock(&ctx->lock);
177
WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN);
178
}
179
180
static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
181
struct perf_event_context *ctx)
182
{
183
__perf_ctx_lock(&cpuctx->ctx);
184
if (ctx)
185
__perf_ctx_lock(ctx);
186
}
187
188
static inline void __perf_ctx_unlock(struct perf_event_context *ctx)
189
{
190
/*
191
* If ctx_sched_in() didn't again set any ALL flags, clean up
192
* after ctx_sched_out() by clearing is_active.
193
*/
194
if (ctx->is_active & EVENT_FROZEN) {
195
if (!(ctx->is_active & EVENT_ALL))
196
ctx->is_active = 0;
197
else
198
ctx->is_active &= ~EVENT_FROZEN;
199
}
200
raw_spin_unlock(&ctx->lock);
201
}
202
203
static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
204
struct perf_event_context *ctx)
205
{
206
if (ctx)
207
__perf_ctx_unlock(ctx);
208
__perf_ctx_unlock(&cpuctx->ctx);
209
}
210
211
typedef struct {
212
struct perf_cpu_context *cpuctx;
213
struct perf_event_context *ctx;
214
} class_perf_ctx_lock_t;
215
216
static inline void class_perf_ctx_lock_destructor(class_perf_ctx_lock_t *_T)
217
{ perf_ctx_unlock(_T->cpuctx, _T->ctx); }
218
219
static inline class_perf_ctx_lock_t
220
class_perf_ctx_lock_constructor(struct perf_cpu_context *cpuctx,
221
struct perf_event_context *ctx)
222
{ perf_ctx_lock(cpuctx, ctx); return (class_perf_ctx_lock_t){ cpuctx, ctx }; }
223
224
#define TASK_TOMBSTONE ((void *)-1L)
225
226
static bool is_kernel_event(struct perf_event *event)
227
{
228
return READ_ONCE(event->owner) == TASK_TOMBSTONE;
229
}
230
231
static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
232
233
struct perf_event_context *perf_cpu_task_ctx(void)
234
{
235
lockdep_assert_irqs_disabled();
236
return this_cpu_ptr(&perf_cpu_context)->task_ctx;
237
}
238
239
/*
240
* On task ctx scheduling...
241
*
242
* When !ctx->nr_events a task context will not be scheduled. This means
243
* we can disable the scheduler hooks (for performance) without leaving
244
* pending task ctx state.
245
*
246
* This however results in two special cases:
247
*
248
* - removing the last event from a task ctx; this is relatively straight
249
* forward and is done in __perf_remove_from_context.
250
*
251
* - adding the first event to a task ctx; this is tricky because we cannot
252
* rely on ctx->is_active and therefore cannot use event_function_call().
253
* See perf_install_in_context().
254
*
255
* If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
256
*/
257
258
typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
259
struct perf_event_context *, void *);
260
261
struct event_function_struct {
262
struct perf_event *event;
263
event_f func;
264
void *data;
265
};
266
267
static int event_function(void *info)
268
{
269
struct event_function_struct *efs = info;
270
struct perf_event *event = efs->event;
271
struct perf_event_context *ctx = event->ctx;
272
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
273
struct perf_event_context *task_ctx = cpuctx->task_ctx;
274
int ret = 0;
275
276
lockdep_assert_irqs_disabled();
277
278
perf_ctx_lock(cpuctx, task_ctx);
279
/*
280
* Since we do the IPI call without holding ctx->lock things can have
281
* changed, double check we hit the task we set out to hit.
282
*/
283
if (ctx->task) {
284
if (ctx->task != current) {
285
ret = -ESRCH;
286
goto unlock;
287
}
288
289
/*
290
* We only use event_function_call() on established contexts,
291
* and event_function() is only ever called when active (or
292
* rather, we'll have bailed in task_function_call() or the
293
* above ctx->task != current test), therefore we must have
294
* ctx->is_active here.
295
*/
296
WARN_ON_ONCE(!ctx->is_active);
297
/*
298
* And since we have ctx->is_active, cpuctx->task_ctx must
299
* match.
300
*/
301
WARN_ON_ONCE(task_ctx != ctx);
302
} else {
303
WARN_ON_ONCE(&cpuctx->ctx != ctx);
304
}
305
306
efs->func(event, cpuctx, ctx, efs->data);
307
unlock:
308
perf_ctx_unlock(cpuctx, task_ctx);
309
310
return ret;
311
}
312
313
static void event_function_call(struct perf_event *event, event_f func, void *data)
314
{
315
struct perf_event_context *ctx = event->ctx;
316
struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
317
struct perf_cpu_context *cpuctx;
318
struct event_function_struct efs = {
319
.event = event,
320
.func = func,
321
.data = data,
322
};
323
324
if (!event->parent) {
325
/*
326
* If this is a !child event, we must hold ctx::mutex to
327
* stabilize the event->ctx relation. See
328
* perf_event_ctx_lock().
329
*/
330
lockdep_assert_held(&ctx->mutex);
331
}
332
333
if (!task) {
334
cpu_function_call(event->cpu, event_function, &efs);
335
return;
336
}
337
338
if (task == TASK_TOMBSTONE)
339
return;
340
341
again:
342
if (!task_function_call(task, event_function, &efs))
343
return;
344
345
local_irq_disable();
346
cpuctx = this_cpu_ptr(&perf_cpu_context);
347
perf_ctx_lock(cpuctx, ctx);
348
/*
349
* Reload the task pointer, it might have been changed by
350
* a concurrent perf_event_context_sched_out().
351
*/
352
task = ctx->task;
353
if (task == TASK_TOMBSTONE)
354
goto unlock;
355
if (ctx->is_active) {
356
perf_ctx_unlock(cpuctx, ctx);
357
local_irq_enable();
358
goto again;
359
}
360
func(event, NULL, ctx, data);
361
unlock:
362
perf_ctx_unlock(cpuctx, ctx);
363
local_irq_enable();
364
}
365
366
/*
367
* Similar to event_function_call() + event_function(), but hard assumes IRQs
368
* are already disabled and we're on the right CPU.
369
*/
370
static void event_function_local(struct perf_event *event, event_f func, void *data)
371
{
372
struct perf_event_context *ctx = event->ctx;
373
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
374
struct task_struct *task = READ_ONCE(ctx->task);
375
struct perf_event_context *task_ctx = NULL;
376
377
lockdep_assert_irqs_disabled();
378
379
if (task) {
380
if (task == TASK_TOMBSTONE)
381
return;
382
383
task_ctx = ctx;
384
}
385
386
perf_ctx_lock(cpuctx, task_ctx);
387
388
task = ctx->task;
389
if (task == TASK_TOMBSTONE)
390
goto unlock;
391
392
if (task) {
393
/*
394
* We must be either inactive or active and the right task,
395
* otherwise we're screwed, since we cannot IPI to somewhere
396
* else.
397
*/
398
if (ctx->is_active) {
399
if (WARN_ON_ONCE(task != current))
400
goto unlock;
401
402
if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
403
goto unlock;
404
}
405
} else {
406
WARN_ON_ONCE(&cpuctx->ctx != ctx);
407
}
408
409
func(event, cpuctx, ctx, data);
410
unlock:
411
perf_ctx_unlock(cpuctx, task_ctx);
412
}
413
414
#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
415
PERF_FLAG_FD_OUTPUT |\
416
PERF_FLAG_PID_CGROUP |\
417
PERF_FLAG_FD_CLOEXEC)
418
419
/*
420
* branch priv levels that need permission checks
421
*/
422
#define PERF_SAMPLE_BRANCH_PERM_PLM \
423
(PERF_SAMPLE_BRANCH_KERNEL |\
424
PERF_SAMPLE_BRANCH_HV)
425
426
/*
427
* perf_sched_events : >0 events exist
428
*/
429
430
static void perf_sched_delayed(struct work_struct *work);
431
DEFINE_STATIC_KEY_FALSE(perf_sched_events);
432
static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
433
static DEFINE_MUTEX(perf_sched_mutex);
434
static atomic_t perf_sched_count;
435
436
static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
437
438
static atomic_t nr_mmap_events __read_mostly;
439
static atomic_t nr_comm_events __read_mostly;
440
static atomic_t nr_namespaces_events __read_mostly;
441
static atomic_t nr_task_events __read_mostly;
442
static atomic_t nr_freq_events __read_mostly;
443
static atomic_t nr_switch_events __read_mostly;
444
static atomic_t nr_ksymbol_events __read_mostly;
445
static atomic_t nr_bpf_events __read_mostly;
446
static atomic_t nr_cgroup_events __read_mostly;
447
static atomic_t nr_text_poke_events __read_mostly;
448
static atomic_t nr_build_id_events __read_mostly;
449
450
static LIST_HEAD(pmus);
451
static DEFINE_MUTEX(pmus_lock);
452
static struct srcu_struct pmus_srcu;
453
static cpumask_var_t perf_online_mask;
454
static cpumask_var_t perf_online_core_mask;
455
static cpumask_var_t perf_online_die_mask;
456
static cpumask_var_t perf_online_cluster_mask;
457
static cpumask_var_t perf_online_pkg_mask;
458
static cpumask_var_t perf_online_sys_mask;
459
static struct kmem_cache *perf_event_cache;
460
461
/*
462
* perf event paranoia level:
463
* -1 - not paranoid at all
464
* 0 - disallow raw tracepoint access for unpriv
465
* 1 - disallow cpu events for unpriv
466
* 2 - disallow kernel profiling for unpriv
467
*/
468
int sysctl_perf_event_paranoid __read_mostly = 2;
469
470
/* Minimum for 512 kiB + 1 user control page. 'free' kiB per user. */
471
static int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024);
472
473
/*
474
* max perf event sample rate
475
*/
476
#define DEFAULT_MAX_SAMPLE_RATE 100000
477
#define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
478
#define DEFAULT_CPU_TIME_MAX_PERCENT 25
479
480
int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
481
static int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
482
483
static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
484
static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
485
486
static int perf_sample_allowed_ns __read_mostly =
487
DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
488
489
static void update_perf_cpu_limits(void)
490
{
491
u64 tmp = perf_sample_period_ns;
492
493
tmp *= sysctl_perf_cpu_time_max_percent;
494
tmp = div_u64(tmp, 100);
495
if (!tmp)
496
tmp = 1;
497
498
WRITE_ONCE(perf_sample_allowed_ns, tmp);
499
}
500
501
static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);
502
503
static int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write,
504
void *buffer, size_t *lenp, loff_t *ppos)
505
{
506
int ret;
507
int perf_cpu = sysctl_perf_cpu_time_max_percent;
508
/*
509
* If throttling is disabled don't allow the write:
510
*/
511
if (write && (perf_cpu == 100 || perf_cpu == 0))
512
return -EINVAL;
513
514
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
515
if (ret || !write)
516
return ret;
517
518
max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
519
perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
520
update_perf_cpu_limits();
521
522
return 0;
523
}
524
525
static int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write,
526
void *buffer, size_t *lenp, loff_t *ppos)
527
{
528
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
529
530
if (ret || !write)
531
return ret;
532
533
if (sysctl_perf_cpu_time_max_percent == 100 ||
534
sysctl_perf_cpu_time_max_percent == 0) {
535
printk(KERN_WARNING
536
"perf: Dynamic interrupt throttling disabled, can hang your system!\n");
537
WRITE_ONCE(perf_sample_allowed_ns, 0);
538
} else {
539
update_perf_cpu_limits();
540
}
541
542
return 0;
543
}
544
545
static const struct ctl_table events_core_sysctl_table[] = {
546
/*
547
* User-space relies on this file as a feature check for
548
* perf_events being enabled. It's an ABI, do not remove!
549
*/
550
{
551
.procname = "perf_event_paranoid",
552
.data = &sysctl_perf_event_paranoid,
553
.maxlen = sizeof(sysctl_perf_event_paranoid),
554
.mode = 0644,
555
.proc_handler = proc_dointvec,
556
},
557
{
558
.procname = "perf_event_mlock_kb",
559
.data = &sysctl_perf_event_mlock,
560
.maxlen = sizeof(sysctl_perf_event_mlock),
561
.mode = 0644,
562
.proc_handler = proc_dointvec,
563
},
564
{
565
.procname = "perf_event_max_sample_rate",
566
.data = &sysctl_perf_event_sample_rate,
567
.maxlen = sizeof(sysctl_perf_event_sample_rate),
568
.mode = 0644,
569
.proc_handler = perf_event_max_sample_rate_handler,
570
.extra1 = SYSCTL_ONE,
571
},
572
{
573
.procname = "perf_cpu_time_max_percent",
574
.data = &sysctl_perf_cpu_time_max_percent,
575
.maxlen = sizeof(sysctl_perf_cpu_time_max_percent),
576
.mode = 0644,
577
.proc_handler = perf_cpu_time_max_percent_handler,
578
.extra1 = SYSCTL_ZERO,
579
.extra2 = SYSCTL_ONE_HUNDRED,
580
},
581
};
582
583
static int __init init_events_core_sysctls(void)
584
{
585
register_sysctl_init("kernel", events_core_sysctl_table);
586
return 0;
587
}
588
core_initcall(init_events_core_sysctls);
589
590
591
/*
592
* perf samples are done in some very critical code paths (NMIs).
593
* If they take too much CPU time, the system can lock up and not
594
* get any real work done. This will drop the sample rate when
595
* we detect that events are taking too long.
596
*/
597
#define NR_ACCUMULATED_SAMPLES 128
598
static DEFINE_PER_CPU(u64, running_sample_length);
599
600
static u64 __report_avg;
601
static u64 __report_allowed;
602
603
static void perf_duration_warn(struct irq_work *w)
604
{
605
printk_ratelimited(KERN_INFO
606
"perf: interrupt took too long (%lld > %lld), lowering "
607
"kernel.perf_event_max_sample_rate to %d\n",
608
__report_avg, __report_allowed,
609
sysctl_perf_event_sample_rate);
610
}
611
612
static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
613
614
void perf_sample_event_took(u64 sample_len_ns)
615
{
616
u64 max_len = READ_ONCE(perf_sample_allowed_ns);
617
u64 running_len;
618
u64 avg_len;
619
u32 max;
620
621
if (max_len == 0)
622
return;
623
624
/* Decay the counter by 1 average sample. */
625
running_len = __this_cpu_read(running_sample_length);
626
running_len -= running_len/NR_ACCUMULATED_SAMPLES;
627
running_len += sample_len_ns;
628
__this_cpu_write(running_sample_length, running_len);
629
630
/*
631
* Note: this will be biased artificially low until we have
632
* seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
633
* from having to maintain a count.
634
*/
635
avg_len = running_len/NR_ACCUMULATED_SAMPLES;
636
if (avg_len <= max_len)
637
return;
638
639
__report_avg = avg_len;
640
__report_allowed = max_len;
641
642
/*
643
* Compute a throttle threshold 25% below the current duration.
644
*/
645
avg_len += avg_len / 4;
646
max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
647
if (avg_len < max)
648
max /= (u32)avg_len;
649
else
650
max = 1;
651
652
WRITE_ONCE(perf_sample_allowed_ns, avg_len);
653
WRITE_ONCE(max_samples_per_tick, max);
654
655
sysctl_perf_event_sample_rate = max * HZ;
656
perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
657
658
if (!irq_work_queue(&perf_duration_work)) {
659
early_printk("perf: interrupt took too long (%lld > %lld), lowering "
660
"kernel.perf_event_max_sample_rate to %d\n",
661
__report_avg, __report_allowed,
662
sysctl_perf_event_sample_rate);
663
}
664
}
665
666
static atomic64_t perf_event_id;
667
668
static void update_context_time(struct perf_event_context *ctx);
669
static u64 perf_event_time(struct perf_event *event);
670
671
void __weak perf_event_print_debug(void) { }
672
673
static inline u64 perf_clock(void)
674
{
675
return local_clock();
676
}
677
678
static inline u64 perf_event_clock(struct perf_event *event)
679
{
680
return event->clock();
681
}
682
683
/*
684
* State based event timekeeping...
685
*
686
* The basic idea is to use event->state to determine which (if any) time
687
* fields to increment with the current delta. This means we only need to
688
* update timestamps when we change state or when they are explicitly requested
689
* (read).
690
*
691
* Event groups make things a little more complicated, but not terribly so. The
692
* rules for a group are that if the group leader is OFF the entire group is
693
* OFF, irrespective of what the group member states are. This results in
694
* __perf_effective_state().
695
*
696
* A further ramification is that when a group leader flips between OFF and
697
* !OFF, we need to update all group member times.
698
*
699
*
700
* NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
701
* need to make sure the relevant context time is updated before we try and
702
* update our timestamps.
703
*/
704
705
static __always_inline enum perf_event_state
706
__perf_effective_state(struct perf_event *event)
707
{
708
struct perf_event *leader = event->group_leader;
709
710
if (leader->state <= PERF_EVENT_STATE_OFF)
711
return leader->state;
712
713
return event->state;
714
}
715
716
static __always_inline void
717
__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
718
{
719
enum perf_event_state state = __perf_effective_state(event);
720
u64 delta = now - event->tstamp;
721
722
*enabled = event->total_time_enabled;
723
if (state >= PERF_EVENT_STATE_INACTIVE)
724
*enabled += delta;
725
726
*running = event->total_time_running;
727
if (state >= PERF_EVENT_STATE_ACTIVE)
728
*running += delta;
729
}
730
731
static void perf_event_update_time(struct perf_event *event)
732
{
733
u64 now = perf_event_time(event);
734
735
__perf_update_times(event, now, &event->total_time_enabled,
736
&event->total_time_running);
737
event->tstamp = now;
738
}
739
740
static void perf_event_update_sibling_time(struct perf_event *leader)
741
{
742
struct perf_event *sibling;
743
744
for_each_sibling_event(sibling, leader)
745
perf_event_update_time(sibling);
746
}
747
748
static void
749
perf_event_set_state(struct perf_event *event, enum perf_event_state state)
750
{
751
if (event->state == state)
752
return;
753
754
perf_event_update_time(event);
755
/*
756
* If a group leader gets enabled/disabled all its siblings
757
* are affected too.
758
*/
759
if ((event->state < 0) ^ (state < 0))
760
perf_event_update_sibling_time(event);
761
762
WRITE_ONCE(event->state, state);
763
}
764
765
/*
766
* UP store-release, load-acquire
767
*/
768
769
#define __store_release(ptr, val) \
770
do { \
771
barrier(); \
772
WRITE_ONCE(*(ptr), (val)); \
773
} while (0)
774
775
#define __load_acquire(ptr) \
776
({ \
777
__unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr)); \
778
barrier(); \
779
___p; \
780
})
781
782
#define for_each_epc(_epc, _ctx, _pmu, _cgroup) \
783
list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \
784
if (_cgroup && !_epc->nr_cgroups) \
785
continue; \
786
else if (_pmu && _epc->pmu != _pmu) \
787
continue; \
788
else
789
790
static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
791
{
792
struct perf_event_pmu_context *pmu_ctx;
793
794
for_each_epc(pmu_ctx, ctx, NULL, cgroup)
795
perf_pmu_disable(pmu_ctx->pmu);
796
}
797
798
static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
799
{
800
struct perf_event_pmu_context *pmu_ctx;
801
802
for_each_epc(pmu_ctx, ctx, NULL, cgroup)
803
perf_pmu_enable(pmu_ctx->pmu);
804
}
805
806
static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
807
static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
808
809
#ifdef CONFIG_CGROUP_PERF
810
811
static inline bool
812
perf_cgroup_match(struct perf_event *event)
813
{
814
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
815
816
/* @event doesn't care about cgroup */
817
if (!event->cgrp)
818
return true;
819
820
/* wants specific cgroup scope but @cpuctx isn't associated with any */
821
if (!cpuctx->cgrp)
822
return false;
823
824
/*
825
* Cgroup scoping is recursive. An event enabled for a cgroup is
826
* also enabled for all its descendant cgroups. If @cpuctx's
827
* cgroup is a descendant of @event's (the test covers identity
828
* case), it's a match.
829
*/
830
return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
831
event->cgrp->css.cgroup);
832
}
833
834
static inline void perf_detach_cgroup(struct perf_event *event)
835
{
836
css_put(&event->cgrp->css);
837
event->cgrp = NULL;
838
}
839
840
static inline int is_cgroup_event(struct perf_event *event)
841
{
842
return event->cgrp != NULL;
843
}
844
845
static inline u64 perf_cgroup_event_time(struct perf_event *event)
846
{
847
struct perf_cgroup_info *t;
848
849
t = per_cpu_ptr(event->cgrp->info, event->cpu);
850
return t->time;
851
}
852
853
static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
854
{
855
struct perf_cgroup_info *t;
856
857
t = per_cpu_ptr(event->cgrp->info, event->cpu);
858
if (!__load_acquire(&t->active))
859
return t->time;
860
now += READ_ONCE(t->timeoffset);
861
return now;
862
}
863
864
static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv)
865
{
866
if (adv)
867
info->time += now - info->timestamp;
868
info->timestamp = now;
869
/*
870
* see update_context_time()
871
*/
872
WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
873
}
874
875
static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
876
{
877
struct perf_cgroup *cgrp = cpuctx->cgrp;
878
struct cgroup_subsys_state *css;
879
struct perf_cgroup_info *info;
880
881
if (cgrp) {
882
u64 now = perf_clock();
883
884
for (css = &cgrp->css; css; css = css->parent) {
885
cgrp = container_of(css, struct perf_cgroup, css);
886
info = this_cpu_ptr(cgrp->info);
887
888
__update_cgrp_time(info, now, true);
889
if (final)
890
__store_release(&info->active, 0);
891
}
892
}
893
}
894
895
static inline void update_cgrp_time_from_event(struct perf_event *event)
896
{
897
struct perf_cgroup_info *info;
898
899
/*
900
* ensure we access cgroup data only when needed and
901
* when we know the cgroup is pinned (css_get)
902
*/
903
if (!is_cgroup_event(event))
904
return;
905
906
info = this_cpu_ptr(event->cgrp->info);
907
/*
908
* Do not update time when cgroup is not active
909
*/
910
if (info->active)
911
__update_cgrp_time(info, perf_clock(), true);
912
}
913
914
static inline void
915
perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
916
{
917
struct perf_event_context *ctx = &cpuctx->ctx;
918
struct perf_cgroup *cgrp = cpuctx->cgrp;
919
struct perf_cgroup_info *info;
920
struct cgroup_subsys_state *css;
921
922
/*
923
* ctx->lock held by caller
924
* ensure we do not access cgroup data
925
* unless we have the cgroup pinned (css_get)
926
*/
927
if (!cgrp)
928
return;
929
930
WARN_ON_ONCE(!ctx->nr_cgroups);
931
932
for (css = &cgrp->css; css; css = css->parent) {
933
cgrp = container_of(css, struct perf_cgroup, css);
934
info = this_cpu_ptr(cgrp->info);
935
__update_cgrp_time(info, ctx->timestamp, false);
936
__store_release(&info->active, 1);
937
}
938
}
939
940
/*
941
* reschedule events based on the cgroup constraint of task.
942
*/
943
static void perf_cgroup_switch(struct task_struct *task)
944
{
945
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
946
struct perf_cgroup *cgrp;
947
948
/*
949
* cpuctx->cgrp is set when the first cgroup event enabled,
950
* and is cleared when the last cgroup event disabled.
951
*/
952
if (READ_ONCE(cpuctx->cgrp) == NULL)
953
return;
954
955
cgrp = perf_cgroup_from_task(task, NULL);
956
if (READ_ONCE(cpuctx->cgrp) == cgrp)
957
return;
958
959
guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx);
960
/*
961
* Re-check, could've raced vs perf_remove_from_context().
962
*/
963
if (READ_ONCE(cpuctx->cgrp) == NULL)
964
return;
965
966
WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
967
968
perf_ctx_disable(&cpuctx->ctx, true);
969
970
ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
971
/*
972
* must not be done before ctxswout due
973
* to update_cgrp_time_from_cpuctx() in
974
* ctx_sched_out()
975
*/
976
cpuctx->cgrp = cgrp;
977
/*
978
* set cgrp before ctxsw in to allow
979
* perf_cgroup_set_timestamp() in ctx_sched_in()
980
* to not have to pass task around
981
*/
982
ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
983
984
perf_ctx_enable(&cpuctx->ctx, true);
985
}
986
987
static int perf_cgroup_ensure_storage(struct perf_event *event,
988
struct cgroup_subsys_state *css)
989
{
990
struct perf_cpu_context *cpuctx;
991
struct perf_event **storage;
992
int cpu, heap_size, ret = 0;
993
994
/*
995
* Allow storage to have sufficient space for an iterator for each
996
* possibly nested cgroup plus an iterator for events with no cgroup.
997
*/
998
for (heap_size = 1; css; css = css->parent)
999
heap_size++;
1000
1001
for_each_possible_cpu(cpu) {
1002
cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
1003
if (heap_size <= cpuctx->heap_size)
1004
continue;
1005
1006
storage = kmalloc_node(heap_size * sizeof(struct perf_event *),
1007
GFP_KERNEL, cpu_to_node(cpu));
1008
if (!storage) {
1009
ret = -ENOMEM;
1010
break;
1011
}
1012
1013
raw_spin_lock_irq(&cpuctx->ctx.lock);
1014
if (cpuctx->heap_size < heap_size) {
1015
swap(cpuctx->heap, storage);
1016
if (storage == cpuctx->heap_default)
1017
storage = NULL;
1018
cpuctx->heap_size = heap_size;
1019
}
1020
raw_spin_unlock_irq(&cpuctx->ctx.lock);
1021
1022
kfree(storage);
1023
}
1024
1025
return ret;
1026
}
1027
1028
static inline int perf_cgroup_connect(int fd, struct perf_event *event,
1029
struct perf_event_attr *attr,
1030
struct perf_event *group_leader)
1031
{
1032
struct perf_cgroup *cgrp;
1033
struct cgroup_subsys_state *css;
1034
CLASS(fd, f)(fd);
1035
int ret = 0;
1036
1037
if (fd_empty(f))
1038
return -EBADF;
1039
1040
css = css_tryget_online_from_dir(fd_file(f)->f_path.dentry,
1041
&perf_event_cgrp_subsys);
1042
if (IS_ERR(css))
1043
return PTR_ERR(css);
1044
1045
ret = perf_cgroup_ensure_storage(event, css);
1046
if (ret)
1047
return ret;
1048
1049
cgrp = container_of(css, struct perf_cgroup, css);
1050
event->cgrp = cgrp;
1051
1052
/*
1053
* all events in a group must monitor
1054
* the same cgroup because a task belongs
1055
* to only one perf cgroup at a time
1056
*/
1057
if (group_leader && group_leader->cgrp != cgrp) {
1058
perf_detach_cgroup(event);
1059
ret = -EINVAL;
1060
}
1061
return ret;
1062
}
1063
1064
static inline void
1065
perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
1066
{
1067
struct perf_cpu_context *cpuctx;
1068
1069
if (!is_cgroup_event(event))
1070
return;
1071
1072
event->pmu_ctx->nr_cgroups++;
1073
1074
/*
1075
* Because cgroup events are always per-cpu events,
1076
* @ctx == &cpuctx->ctx.
1077
*/
1078
cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
1079
1080
if (ctx->nr_cgroups++)
1081
return;
1082
1083
cpuctx->cgrp = perf_cgroup_from_task(current, ctx);
1084
}
1085
1086
static inline void
1087
perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
1088
{
1089
struct perf_cpu_context *cpuctx;
1090
1091
if (!is_cgroup_event(event))
1092
return;
1093
1094
event->pmu_ctx->nr_cgroups--;
1095
1096
/*
1097
* Because cgroup events are always per-cpu events,
1098
* @ctx == &cpuctx->ctx.
1099
*/
1100
cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
1101
1102
if (--ctx->nr_cgroups)
1103
return;
1104
1105
cpuctx->cgrp = NULL;
1106
}
1107
1108
#else /* !CONFIG_CGROUP_PERF */
1109
1110
static inline bool
1111
perf_cgroup_match(struct perf_event *event)
1112
{
1113
return true;
1114
}
1115
1116
static inline void perf_detach_cgroup(struct perf_event *event)
1117
{}
1118
1119
static inline int is_cgroup_event(struct perf_event *event)
1120
{
1121
return 0;
1122
}
1123
1124
static inline void update_cgrp_time_from_event(struct perf_event *event)
1125
{
1126
}
1127
1128
static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
1129
bool final)
1130
{
1131
}
1132
1133
static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
1134
struct perf_event_attr *attr,
1135
struct perf_event *group_leader)
1136
{
1137
return -EINVAL;
1138
}
1139
1140
static inline void
1141
perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
1142
{
1143
}
1144
1145
static inline u64 perf_cgroup_event_time(struct perf_event *event)
1146
{
1147
return 0;
1148
}
1149
1150
static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
1151
{
1152
return 0;
1153
}
1154
1155
static inline void
1156
perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
1157
{
1158
}
1159
1160
static inline void
1161
perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
1162
{
1163
}
1164
1165
static void perf_cgroup_switch(struct task_struct *task)
1166
{
1167
}
1168
#endif
1169
1170
/*
1171
* set default to be dependent on timer tick just
1172
* like original code
1173
*/
1174
#define PERF_CPU_HRTIMER (1000 / HZ)
1175
/*
1176
* function must be called with interrupts disabled
1177
*/
1178
static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
1179
{
1180
struct perf_cpu_pmu_context *cpc;
1181
bool rotations;
1182
1183
lockdep_assert_irqs_disabled();
1184
1185
cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer);
1186
rotations = perf_rotate_context(cpc);
1187
1188
raw_spin_lock(&cpc->hrtimer_lock);
1189
if (rotations)
1190
hrtimer_forward_now(hr, cpc->hrtimer_interval);
1191
else
1192
cpc->hrtimer_active = 0;
1193
raw_spin_unlock(&cpc->hrtimer_lock);
1194
1195
return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
1196
}
1197
1198
static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu)
1199
{
1200
struct hrtimer *timer = &cpc->hrtimer;
1201
struct pmu *pmu = cpc->epc.pmu;
1202
u64 interval;
1203
1204
/*
1205
* check default is sane, if not set then force to
1206
* default interval (1/tick)
1207
*/
1208
interval = pmu->hrtimer_interval_ms;
1209
if (interval < 1)
1210
interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
1211
1212
cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
1213
1214
raw_spin_lock_init(&cpc->hrtimer_lock);
1215
hrtimer_setup(timer, perf_mux_hrtimer_handler, CLOCK_MONOTONIC,
1216
HRTIMER_MODE_ABS_PINNED_HARD);
1217
}
1218
1219
static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc)
1220
{
1221
struct hrtimer *timer = &cpc->hrtimer;
1222
unsigned long flags;
1223
1224
raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags);
1225
if (!cpc->hrtimer_active) {
1226
cpc->hrtimer_active = 1;
1227
hrtimer_forward_now(timer, cpc->hrtimer_interval);
1228
hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
1229
}
1230
raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags);
1231
1232
return 0;
1233
}
1234
1235
static int perf_mux_hrtimer_restart_ipi(void *arg)
1236
{
1237
return perf_mux_hrtimer_restart(arg);
1238
}
1239
1240
static __always_inline struct perf_cpu_pmu_context *this_cpc(struct pmu *pmu)
1241
{
1242
return *this_cpu_ptr(pmu->cpu_pmu_context);
1243
}
1244
1245
void perf_pmu_disable(struct pmu *pmu)
1246
{
1247
int *count = &this_cpc(pmu)->pmu_disable_count;
1248
if (!(*count)++)
1249
pmu->pmu_disable(pmu);
1250
}
1251
1252
void perf_pmu_enable(struct pmu *pmu)
1253
{
1254
int *count = &this_cpc(pmu)->pmu_disable_count;
1255
if (!--(*count))
1256
pmu->pmu_enable(pmu);
1257
}
1258
1259
static void perf_assert_pmu_disabled(struct pmu *pmu)
1260
{
1261
int *count = &this_cpc(pmu)->pmu_disable_count;
1262
WARN_ON_ONCE(*count == 0);
1263
}
1264
1265
static inline void perf_pmu_read(struct perf_event *event)
1266
{
1267
if (event->state == PERF_EVENT_STATE_ACTIVE)
1268
event->pmu->read(event);
1269
}
1270
1271
static void get_ctx(struct perf_event_context *ctx)
1272
{
1273
refcount_inc(&ctx->refcount);
1274
}
1275
1276
static void free_ctx(struct rcu_head *head)
1277
{
1278
struct perf_event_context *ctx;
1279
1280
ctx = container_of(head, struct perf_event_context, rcu_head);
1281
kfree(ctx);
1282
}
1283
1284
static void put_ctx(struct perf_event_context *ctx)
1285
{
1286
if (refcount_dec_and_test(&ctx->refcount)) {
1287
if (ctx->parent_ctx)
1288
put_ctx(ctx->parent_ctx);
1289
if (ctx->task && ctx->task != TASK_TOMBSTONE)
1290
put_task_struct(ctx->task);
1291
call_rcu(&ctx->rcu_head, free_ctx);
1292
} else {
1293
smp_mb__after_atomic(); /* pairs with wait_var_event() */
1294
if (ctx->task == TASK_TOMBSTONE)
1295
wake_up_var(&ctx->refcount);
1296
}
1297
}
1298
1299
/*
1300
* Because of perf_event::ctx migration in sys_perf_event_open::move_group and
1301
* perf_pmu_migrate_context() we need some magic.
1302
*
1303
* Those places that change perf_event::ctx will hold both
1304
* perf_event_ctx::mutex of the 'old' and 'new' ctx value.
1305
*
1306
* Lock ordering is by mutex address. There are two other sites where
1307
* perf_event_context::mutex nests and those are:
1308
*
1309
* - perf_event_exit_task_context() [ child , 0 ]
1310
* perf_event_exit_event()
1311
* put_event() [ parent, 1 ]
1312
*
1313
* - perf_event_init_context() [ parent, 0 ]
1314
* inherit_task_group()
1315
* inherit_group()
1316
* inherit_event()
1317
* perf_event_alloc()
1318
* perf_init_event()
1319
* perf_try_init_event() [ child , 1 ]
1320
*
1321
* While it appears there is an obvious deadlock here -- the parent and child
1322
* nesting levels are inverted between the two. This is in fact safe because
1323
* life-time rules separate them. That is an exiting task cannot fork, and a
1324
* spawning task cannot (yet) exit.
1325
*
1326
* But remember that these are parent<->child context relations, and
1327
* migration does not affect children, therefore these two orderings should not
1328
* interact.
1329
*
1330
* The change in perf_event::ctx does not affect children (as claimed above)
1331
* because the sys_perf_event_open() case will install a new event and break
1332
* the ctx parent<->child relation, and perf_pmu_migrate_context() is only
1333
* concerned with cpuctx and that doesn't have children.
1334
*
1335
* The places that change perf_event::ctx will issue:
1336
*
1337
* perf_remove_from_context();
1338
* synchronize_rcu();
1339
* perf_install_in_context();
1340
*
1341
* to affect the change. The remove_from_context() + synchronize_rcu() should
1342
* quiesce the event, after which we can install it in the new location. This
1343
* means that only external vectors (perf_fops, prctl) can perturb the event
1344
* while in transit. Therefore all such accessors should also acquire
1345
* perf_event_context::mutex to serialize against this.
1346
*
1347
* However; because event->ctx can change while we're waiting to acquire
1348
* ctx->mutex we must be careful and use the below perf_event_ctx_lock()
1349
* function.
1350
*
1351
* Lock order:
1352
* exec_update_lock
1353
* task_struct::perf_event_mutex
1354
* perf_event_context::mutex
1355
* perf_event::child_mutex;
1356
* perf_event_context::lock
1357
* mmap_lock
1358
* perf_event::mmap_mutex
1359
* perf_buffer::aux_mutex
1360
* perf_addr_filters_head::lock
1361
*
1362
* cpu_hotplug_lock
1363
* pmus_lock
1364
* cpuctx->mutex / perf_event_context::mutex
1365
*/
1366
static struct perf_event_context *
1367
perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
1368
{
1369
struct perf_event_context *ctx;
1370
1371
again:
1372
rcu_read_lock();
1373
ctx = READ_ONCE(event->ctx);
1374
if (!refcount_inc_not_zero(&ctx->refcount)) {
1375
rcu_read_unlock();
1376
goto again;
1377
}
1378
rcu_read_unlock();
1379
1380
mutex_lock_nested(&ctx->mutex, nesting);
1381
if (event->ctx != ctx) {
1382
mutex_unlock(&ctx->mutex);
1383
put_ctx(ctx);
1384
goto again;
1385
}
1386
1387
return ctx;
1388
}
1389
1390
static inline struct perf_event_context *
1391
perf_event_ctx_lock(struct perf_event *event)
1392
{
1393
return perf_event_ctx_lock_nested(event, 0);
1394
}
1395
1396
static void perf_event_ctx_unlock(struct perf_event *event,
1397
struct perf_event_context *ctx)
1398
{
1399
mutex_unlock(&ctx->mutex);
1400
put_ctx(ctx);
1401
}
1402
1403
/*
1404
* This must be done under the ctx->lock, such as to serialize against
1405
* context_equiv(), therefore we cannot call put_ctx() since that might end up
1406
* calling scheduler related locks and ctx->lock nests inside those.
1407
*/
1408
static __must_check struct perf_event_context *
1409
unclone_ctx(struct perf_event_context *ctx)
1410
{
1411
struct perf_event_context *parent_ctx = ctx->parent_ctx;
1412
1413
lockdep_assert_held(&ctx->lock);
1414
1415
if (parent_ctx)
1416
ctx->parent_ctx = NULL;
1417
ctx->generation++;
1418
1419
return parent_ctx;
1420
}
1421
1422
static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
1423
enum pid_type type)
1424
{
1425
u32 nr;
1426
/*
1427
* only top level events have the pid namespace they were created in
1428
*/
1429
if (event->parent)
1430
event = event->parent;
1431
1432
nr = __task_pid_nr_ns(p, type, event->ns);
1433
/* avoid -1 if it is idle thread or runs in another ns */
1434
if (!nr && !pid_alive(p))
1435
nr = -1;
1436
return nr;
1437
}
1438
1439
static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1440
{
1441
return perf_event_pid_type(event, p, PIDTYPE_TGID);
1442
}
1443
1444
static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1445
{
1446
return perf_event_pid_type(event, p, PIDTYPE_PID);
1447
}
1448
1449
/*
1450
* If we inherit events we want to return the parent event id
1451
* to userspace.
1452
*/
1453
static u64 primary_event_id(struct perf_event *event)
1454
{
1455
u64 id = event->id;
1456
1457
if (event->parent)
1458
id = event->parent->id;
1459
1460
return id;
1461
}
1462
1463
/*
1464
* Get the perf_event_context for a task and lock it.
1465
*
1466
* This has to cope with the fact that until it is locked,
1467
* the context could get moved to another task.
1468
*/
1469
static struct perf_event_context *
1470
perf_lock_task_context(struct task_struct *task, unsigned long *flags)
1471
{
1472
struct perf_event_context *ctx;
1473
1474
retry:
1475
/*
1476
* One of the few rules of preemptible RCU is that one cannot do
1477
* rcu_read_unlock() while holding a scheduler (or nested) lock when
1478
* part of the read side critical section was irqs-enabled -- see
1479
* rcu_read_unlock_special().
1480
*
1481
* Since ctx->lock nests under rq->lock we must ensure the entire read
1482
* side critical section has interrupts disabled.
1483
*/
1484
local_irq_save(*flags);
1485
rcu_read_lock();
1486
ctx = rcu_dereference(task->perf_event_ctxp);
1487
if (ctx) {
1488
/*
1489
* If this context is a clone of another, it might
1490
* get swapped for another underneath us by
1491
* perf_event_task_sched_out, though the
1492
* rcu_read_lock() protects us from any context
1493
* getting freed. Lock the context and check if it
1494
* got swapped before we could get the lock, and retry
1495
* if so. If we locked the right context, then it
1496
* can't get swapped on us any more.
1497
*/
1498
raw_spin_lock(&ctx->lock);
1499
if (ctx != rcu_dereference(task->perf_event_ctxp)) {
1500
raw_spin_unlock(&ctx->lock);
1501
rcu_read_unlock();
1502
local_irq_restore(*flags);
1503
goto retry;
1504
}
1505
1506
if (ctx->task == TASK_TOMBSTONE ||
1507
!refcount_inc_not_zero(&ctx->refcount)) {
1508
raw_spin_unlock(&ctx->lock);
1509
ctx = NULL;
1510
} else {
1511
WARN_ON_ONCE(ctx->task != task);
1512
}
1513
}
1514
rcu_read_unlock();
1515
if (!ctx)
1516
local_irq_restore(*flags);
1517
return ctx;
1518
}
1519
1520
/*
1521
* Get the context for a task and increment its pin_count so it
1522
* can't get swapped to another task. This also increments its
1523
* reference count so that the context can't get freed.
1524
*/
1525
static struct perf_event_context *
1526
perf_pin_task_context(struct task_struct *task)
1527
{
1528
struct perf_event_context *ctx;
1529
unsigned long flags;
1530
1531
ctx = perf_lock_task_context(task, &flags);
1532
if (ctx) {
1533
++ctx->pin_count;
1534
raw_spin_unlock_irqrestore(&ctx->lock, flags);
1535
}
1536
return ctx;
1537
}
1538
1539
static void perf_unpin_context(struct perf_event_context *ctx)
1540
{
1541
unsigned long flags;
1542
1543
raw_spin_lock_irqsave(&ctx->lock, flags);
1544
--ctx->pin_count;
1545
raw_spin_unlock_irqrestore(&ctx->lock, flags);
1546
}
1547
1548
/*
1549
* Update the record of the current time in a context.
1550
*/
1551
static void __update_context_time(struct perf_event_context *ctx, bool adv)
1552
{
1553
u64 now = perf_clock();
1554
1555
lockdep_assert_held(&ctx->lock);
1556
1557
if (adv)
1558
ctx->time += now - ctx->timestamp;
1559
ctx->timestamp = now;
1560
1561
/*
1562
* The above: time' = time + (now - timestamp), can be re-arranged
1563
* into: time` = now + (time - timestamp), which gives a single value
1564
* offset to compute future time without locks on.
1565
*
1566
* See perf_event_time_now(), which can be used from NMI context where
1567
* it's (obviously) not possible to acquire ctx->lock in order to read
1568
* both the above values in a consistent manner.
1569
*/
1570
WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
1571
}
1572
1573
static void update_context_time(struct perf_event_context *ctx)
1574
{
1575
__update_context_time(ctx, true);
1576
}
1577
1578
static u64 perf_event_time(struct perf_event *event)
1579
{
1580
struct perf_event_context *ctx = event->ctx;
1581
1582
if (unlikely(!ctx))
1583
return 0;
1584
1585
if (is_cgroup_event(event))
1586
return perf_cgroup_event_time(event);
1587
1588
return ctx->time;
1589
}
1590
1591
static u64 perf_event_time_now(struct perf_event *event, u64 now)
1592
{
1593
struct perf_event_context *ctx = event->ctx;
1594
1595
if (unlikely(!ctx))
1596
return 0;
1597
1598
if (is_cgroup_event(event))
1599
return perf_cgroup_event_time_now(event, now);
1600
1601
if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
1602
return ctx->time;
1603
1604
now += READ_ONCE(ctx->timeoffset);
1605
return now;
1606
}
1607
1608
static enum event_type_t get_event_type(struct perf_event *event)
1609
{
1610
struct perf_event_context *ctx = event->ctx;
1611
enum event_type_t event_type;
1612
1613
lockdep_assert_held(&ctx->lock);
1614
1615
/*
1616
* It's 'group type', really, because if our group leader is
1617
* pinned, so are we.
1618
*/
1619
if (event->group_leader != event)
1620
event = event->group_leader;
1621
1622
event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
1623
if (!ctx->task)
1624
event_type |= EVENT_CPU;
1625
1626
return event_type;
1627
}
1628
1629
/*
1630
* Helper function to initialize event group nodes.
1631
*/
1632
static void init_event_group(struct perf_event *event)
1633
{
1634
RB_CLEAR_NODE(&event->group_node);
1635
event->group_index = 0;
1636
}
1637
1638
/*
1639
* Extract pinned or flexible groups from the context
1640
* based on event attrs bits.
1641
*/
1642
static struct perf_event_groups *
1643
get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
1644
{
1645
if (event->attr.pinned)
1646
return &ctx->pinned_groups;
1647
else
1648
return &ctx->flexible_groups;
1649
}
1650
1651
/*
1652
* Helper function to initializes perf_event_group trees.
1653
*/
1654
static void perf_event_groups_init(struct perf_event_groups *groups)
1655
{
1656
groups->tree = RB_ROOT;
1657
groups->index = 0;
1658
}
1659
1660
static inline struct cgroup *event_cgroup(const struct perf_event *event)
1661
{
1662
struct cgroup *cgroup = NULL;
1663
1664
#ifdef CONFIG_CGROUP_PERF
1665
if (event->cgrp)
1666
cgroup = event->cgrp->css.cgroup;
1667
#endif
1668
1669
return cgroup;
1670
}
1671
1672
/*
1673
* Compare function for event groups;
1674
*
1675
* Implements complex key that first sorts by CPU and then by virtual index
1676
* which provides ordering when rotating groups for the same CPU.
1677
*/
1678
static __always_inline int
1679
perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu,
1680
const struct cgroup *left_cgroup, const u64 left_group_index,
1681
const struct perf_event *right)
1682
{
1683
if (left_cpu < right->cpu)
1684
return -1;
1685
if (left_cpu > right->cpu)
1686
return 1;
1687
1688
if (left_pmu) {
1689
if (left_pmu < right->pmu_ctx->pmu)
1690
return -1;
1691
if (left_pmu > right->pmu_ctx->pmu)
1692
return 1;
1693
}
1694
1695
#ifdef CONFIG_CGROUP_PERF
1696
{
1697
const struct cgroup *right_cgroup = event_cgroup(right);
1698
1699
if (left_cgroup != right_cgroup) {
1700
if (!left_cgroup) {
1701
/*
1702
* Left has no cgroup but right does, no
1703
* cgroups come first.
1704
*/
1705
return -1;
1706
}
1707
if (!right_cgroup) {
1708
/*
1709
* Right has no cgroup but left does, no
1710
* cgroups come first.
1711
*/
1712
return 1;
1713
}
1714
/* Two dissimilar cgroups, order by id. */
1715
if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup))
1716
return -1;
1717
1718
return 1;
1719
}
1720
}
1721
#endif
1722
1723
if (left_group_index < right->group_index)
1724
return -1;
1725
if (left_group_index > right->group_index)
1726
return 1;
1727
1728
return 0;
1729
}
1730
1731
#define __node_2_pe(node) \
1732
rb_entry((node), struct perf_event, group_node)
1733
1734
static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
1735
{
1736
struct perf_event *e = __node_2_pe(a);
1737
return perf_event_groups_cmp(e->cpu, e->pmu_ctx->pmu, event_cgroup(e),
1738
e->group_index, __node_2_pe(b)) < 0;
1739
}
1740
1741
struct __group_key {
1742
int cpu;
1743
struct pmu *pmu;
1744
struct cgroup *cgroup;
1745
};
1746
1747
static inline int __group_cmp(const void *key, const struct rb_node *node)
1748
{
1749
const struct __group_key *a = key;
1750
const struct perf_event *b = __node_2_pe(node);
1751
1752
/* partial/subtree match: @cpu, @pmu, @cgroup; ignore: @group_index */
1753
return perf_event_groups_cmp(a->cpu, a->pmu, a->cgroup, b->group_index, b);
1754
}
1755
1756
static inline int
1757
__group_cmp_ignore_cgroup(const void *key, const struct rb_node *node)
1758
{
1759
const struct __group_key *a = key;
1760
const struct perf_event *b = __node_2_pe(node);
1761
1762
/* partial/subtree match: @cpu, @pmu, ignore: @cgroup, @group_index */
1763
return perf_event_groups_cmp(a->cpu, a->pmu, event_cgroup(b),
1764
b->group_index, b);
1765
}
1766
1767
/*
1768
* Insert @event into @groups' tree; using
1769
* {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index}
1770
* as key. This places it last inside the {cpu,pmu,cgroup} subtree.
1771
*/
1772
static void
1773
perf_event_groups_insert(struct perf_event_groups *groups,
1774
struct perf_event *event)
1775
{
1776
event->group_index = ++groups->index;
1777
1778
rb_add(&event->group_node, &groups->tree, __group_less);
1779
}
1780
1781
/*
1782
* Helper function to insert event into the pinned or flexible groups.
1783
*/
1784
static void
1785
add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
1786
{
1787
struct perf_event_groups *groups;
1788
1789
groups = get_event_groups(event, ctx);
1790
perf_event_groups_insert(groups, event);
1791
}
1792
1793
/*
1794
* Delete a group from a tree.
1795
*/
1796
static void
1797
perf_event_groups_delete(struct perf_event_groups *groups,
1798
struct perf_event *event)
1799
{
1800
WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
1801
RB_EMPTY_ROOT(&groups->tree));
1802
1803
rb_erase(&event->group_node, &groups->tree);
1804
init_event_group(event);
1805
}
1806
1807
/*
1808
* Helper function to delete event from its groups.
1809
*/
1810
static void
1811
del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
1812
{
1813
struct perf_event_groups *groups;
1814
1815
groups = get_event_groups(event, ctx);
1816
perf_event_groups_delete(groups, event);
1817
}
1818
1819
/*
1820
* Get the leftmost event in the {cpu,pmu,cgroup} subtree.
1821
*/
1822
static struct perf_event *
1823
perf_event_groups_first(struct perf_event_groups *groups, int cpu,
1824
struct pmu *pmu, struct cgroup *cgrp)
1825
{
1826
struct __group_key key = {
1827
.cpu = cpu,
1828
.pmu = pmu,
1829
.cgroup = cgrp,
1830
};
1831
struct rb_node *node;
1832
1833
node = rb_find_first(&key, &groups->tree, __group_cmp);
1834
if (node)
1835
return __node_2_pe(node);
1836
1837
return NULL;
1838
}
1839
1840
static struct perf_event *
1841
perf_event_groups_next(struct perf_event *event, struct pmu *pmu)
1842
{
1843
struct __group_key key = {
1844
.cpu = event->cpu,
1845
.pmu = pmu,
1846
.cgroup = event_cgroup(event),
1847
};
1848
struct rb_node *next;
1849
1850
next = rb_next_match(&key, &event->group_node, __group_cmp);
1851
if (next)
1852
return __node_2_pe(next);
1853
1854
return NULL;
1855
}
1856
1857
#define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) \
1858
for (event = perf_event_groups_first(groups, cpu, pmu, NULL); \
1859
event; event = perf_event_groups_next(event, pmu))
1860
1861
/*
1862
* Iterate through the whole groups tree.
1863
*/
1864
#define perf_event_groups_for_each(event, groups) \
1865
for (event = rb_entry_safe(rb_first(&((groups)->tree)), \
1866
typeof(*event), group_node); event; \
1867
event = rb_entry_safe(rb_next(&event->group_node), \
1868
typeof(*event), group_node))
1869
1870
/*
1871
* Does the event attribute request inherit with PERF_SAMPLE_READ
1872
*/
1873
static inline bool has_inherit_and_sample_read(struct perf_event_attr *attr)
1874
{
1875
return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ);
1876
}
1877
1878
/*
1879
* Add an event from the lists for its context.
1880
* Must be called with ctx->mutex and ctx->lock held.
1881
*/
1882
static void
1883
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1884
{
1885
lockdep_assert_held(&ctx->lock);
1886
1887
WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1888
event->attach_state |= PERF_ATTACH_CONTEXT;
1889
1890
event->tstamp = perf_event_time(event);
1891
1892
/*
1893
* If we're a stand alone event or group leader, we go to the context
1894
* list, group events are kept attached to the group so that
1895
* perf_group_detach can, at all times, locate all siblings.
1896
*/
1897
if (event->group_leader == event) {
1898
event->group_caps = event->event_caps;
1899
add_event_to_groups(event, ctx);
1900
}
1901
1902
list_add_rcu(&event->event_entry, &ctx->event_list);
1903
ctx->nr_events++;
1904
if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
1905
ctx->nr_user++;
1906
if (event->attr.inherit_stat)
1907
ctx->nr_stat++;
1908
if (has_inherit_and_sample_read(&event->attr))
1909
local_inc(&ctx->nr_no_switch_fast);
1910
1911
if (event->state > PERF_EVENT_STATE_OFF)
1912
perf_cgroup_event_enable(event, ctx);
1913
1914
ctx->generation++;
1915
event->pmu_ctx->nr_events++;
1916
}
1917
1918
/*
1919
* Initialize event state based on the perf_event_attr::disabled.
1920
*/
1921
static inline void perf_event__state_init(struct perf_event *event)
1922
{
1923
event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1924
PERF_EVENT_STATE_INACTIVE;
1925
}
1926
1927
static int __perf_event_read_size(u64 read_format, int nr_siblings)
1928
{
1929
int entry = sizeof(u64); /* value */
1930
int size = 0;
1931
int nr = 1;
1932
1933
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1934
size += sizeof(u64);
1935
1936
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1937
size += sizeof(u64);
1938
1939
if (read_format & PERF_FORMAT_ID)
1940
entry += sizeof(u64);
1941
1942
if (read_format & PERF_FORMAT_LOST)
1943
entry += sizeof(u64);
1944
1945
if (read_format & PERF_FORMAT_GROUP) {
1946
nr += nr_siblings;
1947
size += sizeof(u64);
1948
}
1949
1950
/*
1951
* Since perf_event_validate_size() limits this to 16k and inhibits
1952
* adding more siblings, this will never overflow.
1953
*/
1954
return size + nr * entry;
1955
}
1956
1957
static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1958
{
1959
struct perf_sample_data *data;
1960
u16 size = 0;
1961
1962
if (sample_type & PERF_SAMPLE_IP)
1963
size += sizeof(data->ip);
1964
1965
if (sample_type & PERF_SAMPLE_ADDR)
1966
size += sizeof(data->addr);
1967
1968
if (sample_type & PERF_SAMPLE_PERIOD)
1969
size += sizeof(data->period);
1970
1971
if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
1972
size += sizeof(data->weight.full);
1973
1974
if (sample_type & PERF_SAMPLE_READ)
1975
size += event->read_size;
1976
1977
if (sample_type & PERF_SAMPLE_DATA_SRC)
1978
size += sizeof(data->data_src.val);
1979
1980
if (sample_type & PERF_SAMPLE_TRANSACTION)
1981
size += sizeof(data->txn);
1982
1983
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
1984
size += sizeof(data->phys_addr);
1985
1986
if (sample_type & PERF_SAMPLE_CGROUP)
1987
size += sizeof(data->cgroup);
1988
1989
if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
1990
size += sizeof(data->data_page_size);
1991
1992
if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
1993
size += sizeof(data->code_page_size);
1994
1995
event->header_size = size;
1996
}
1997
1998
/*
1999
* Called at perf_event creation and when events are attached/detached from a
2000
* group.
2001
*/
2002
static void perf_event__header_size(struct perf_event *event)
2003
{
2004
event->read_size =
2005
__perf_event_read_size(event->attr.read_format,
2006
event->group_leader->nr_siblings);
2007
__perf_event_header_size(event, event->attr.sample_type);
2008
}
2009
2010
static void perf_event__id_header_size(struct perf_event *event)
2011
{
2012
struct perf_sample_data *data;
2013
u64 sample_type = event->attr.sample_type;
2014
u16 size = 0;
2015
2016
if (sample_type & PERF_SAMPLE_TID)
2017
size += sizeof(data->tid_entry);
2018
2019
if (sample_type & PERF_SAMPLE_TIME)
2020
size += sizeof(data->time);
2021
2022
if (sample_type & PERF_SAMPLE_IDENTIFIER)
2023
size += sizeof(data->id);
2024
2025
if (sample_type & PERF_SAMPLE_ID)
2026
size += sizeof(data->id);
2027
2028
if (sample_type & PERF_SAMPLE_STREAM_ID)
2029
size += sizeof(data->stream_id);
2030
2031
if (sample_type & PERF_SAMPLE_CPU)
2032
size += sizeof(data->cpu_entry);
2033
2034
event->id_header_size = size;
2035
}
2036
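/*
 * For reference, id_header_size corresponds to the struct sample_id trailer
 * described in perf_event_open(2), appended to records when sample_id_all is
 * set; a sketch of the fields counted above:
 *
 *	struct sample_id {
 *		{ u32 pid, tid;  }	// PERF_SAMPLE_TID
 *		{ u64 time;      }	// PERF_SAMPLE_TIME
 *		{ u64 id;        }	// PERF_SAMPLE_ID
 *		{ u64 stream_id; }	// PERF_SAMPLE_STREAM_ID
 *		{ u32 cpu, res;  }	// PERF_SAMPLE_CPU
 *		{ u64 id;        }	// PERF_SAMPLE_IDENTIFIER
 *	};
 */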
2037
/*
2038
* Check that adding an event to the group does not result in anybody
2039
* overflowing the 64k event limit imposed by the output buffer.
2040
*
2041
* Specifically, check that the read_size for the event does not exceed 16k,
2042
* read_size being the one term that grows with groups size. Since read_size
2043
* depends on per-event read_format, also (re)check the existing events.
2044
*
2045
* This leaves 48k for the constant size fields and things like callchains,
2046
* branch stacks and register sets.
2047
*/
2048
static bool perf_event_validate_size(struct perf_event *event)
2049
{
2050
struct perf_event *sibling, *group_leader = event->group_leader;
2051
2052
if (__perf_event_read_size(event->attr.read_format,
2053
group_leader->nr_siblings + 1) > 16*1024)
2054
return false;
2055
2056
if (__perf_event_read_size(group_leader->attr.read_format,
2057
group_leader->nr_siblings + 1) > 16*1024)
2058
return false;
2059
2060
/*
2061
* When creating a new group leader, group_leader->ctx is initialized
2062
* after the size has been validated, but we cannot safely use
2063
* for_each_sibling_event() until group_leader->ctx is set. A new group
2064
* leader cannot have any siblings yet, so we can safely skip checking
2065
* the non-existent siblings.
2066
*/
2067
if (event == group_leader)
2068
return true;
2069
2070
for_each_sibling_event(sibling, group_leader) {
2071
if (__perf_event_read_size(sibling->attr.read_format,
2072
group_leader->nr_siblings + 1) > 16*1024)
2073
return false;
2074
}
2075
2076
return true;
2077
}
2078
2079
static void perf_group_attach(struct perf_event *event)
2080
{
2081
struct perf_event *group_leader = event->group_leader, *pos;
2082
2083
lockdep_assert_held(&event->ctx->lock);
2084
2085
/*
2086
* We can have double attach due to group movement (move_group) in
2087
* perf_event_open().
2088
*/
2089
if (event->attach_state & PERF_ATTACH_GROUP)
2090
return;
2091
2092
event->attach_state |= PERF_ATTACH_GROUP;
2093
2094
if (group_leader == event)
2095
return;
2096
2097
WARN_ON_ONCE(group_leader->ctx != event->ctx);
2098
2099
group_leader->group_caps &= event->event_caps;
2100
2101
list_add_tail(&event->sibling_list, &group_leader->sibling_list);
2102
group_leader->nr_siblings++;
2103
group_leader->group_generation++;
2104
2105
perf_event__header_size(group_leader);
2106
2107
for_each_sibling_event(pos, group_leader)
2108
perf_event__header_size(pos);
2109
}
2110
2111
/*
2112
* Remove an event from the lists for its context.
2113
* Must be called with ctx->mutex and ctx->lock held.
2114
*/
2115
static void
2116
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
2117
{
2118
WARN_ON_ONCE(event->ctx != ctx);
2119
lockdep_assert_held(&ctx->lock);
2120
2121
/*
2122
* We can have double detach due to exit/hot-unplug + close.
2123
*/
2124
if (!(event->attach_state & PERF_ATTACH_CONTEXT))
2125
return;
2126
2127
event->attach_state &= ~PERF_ATTACH_CONTEXT;
2128
2129
ctx->nr_events--;
2130
if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
2131
ctx->nr_user--;
2132
if (event->attr.inherit_stat)
2133
ctx->nr_stat--;
2134
if (has_inherit_and_sample_read(&event->attr))
2135
local_dec(&ctx->nr_no_switch_fast);
2136
2137
list_del_rcu(&event->event_entry);
2138
2139
if (event->group_leader == event)
2140
del_event_from_groups(event, ctx);
2141
2142
ctx->generation++;
2143
event->pmu_ctx->nr_events--;
2144
}
2145
2146
static int
2147
perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
2148
{
2149
if (!has_aux(aux_event))
2150
return 0;
2151
2152
if (!event->pmu->aux_output_match)
2153
return 0;
2154
2155
return event->pmu->aux_output_match(aux_event);
2156
}
2157
2158
static void put_event(struct perf_event *event);
2159
static void __event_disable(struct perf_event *event,
2160
struct perf_event_context *ctx,
2161
enum perf_event_state state);
2162
2163
static void perf_put_aux_event(struct perf_event *event)
2164
{
2165
struct perf_event_context *ctx = event->ctx;
2166
struct perf_event *iter;
2167
2168
/*
2169
* If event uses aux_event tear down the link
2170
*/
2171
if (event->aux_event) {
2172
iter = event->aux_event;
2173
event->aux_event = NULL;
2174
put_event(iter);
2175
return;
2176
}
2177
2178
/*
2179
* If the event is an aux_event, tear down all links to
2180
* it from other events.
2181
*/
2182
for_each_sibling_event(iter, event) {
2183
if (iter->aux_event != event)
2184
continue;
2185
2186
iter->aux_event = NULL;
2187
put_event(event);
2188
2189
/*
2190
* If it's ACTIVE, schedule it out and put it into ERROR
2191
* state so that we don't try to schedule it again. Note
2192
* that perf_event_enable() will clear the ERROR status.
2193
*/
2194
__event_disable(iter, ctx, PERF_EVENT_STATE_ERROR);
2195
}
2196
}
2197
2198
static bool perf_need_aux_event(struct perf_event *event)
2199
{
2200
return event->attr.aux_output || has_aux_action(event);
2201
}
2202
2203
static int perf_get_aux_event(struct perf_event *event,
2204
struct perf_event *group_leader)
2205
{
2206
/*
2207
* Our group leader must be an aux event if we want to be
2208
* an aux_output. This way, the aux event will precede its
2209
* aux_output events in the group, and therefore will always
2210
* schedule first.
2211
*/
2212
if (!group_leader)
2213
return 0;
2214
2215
/*
2216
* aux_output and aux_sample_size are mutually exclusive.
2217
*/
2218
if (event->attr.aux_output && event->attr.aux_sample_size)
2219
return 0;
2220
2221
if (event->attr.aux_output &&
2222
!perf_aux_output_match(event, group_leader))
2223
return 0;
2224
2225
if ((event->attr.aux_pause || event->attr.aux_resume) &&
2226
!(group_leader->pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE))
2227
return 0;
2228
2229
if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
2230
return 0;
2231
2232
if (!atomic_long_inc_not_zero(&group_leader->refcount))
2233
return 0;
2234
2235
/*
2236
* Link aux_outputs to their aux event; this is undone in
2237
* perf_group_detach() by perf_put_aux_event(). When the
2238
* group is torn down, the aux_output events lose their
2239
* link to the aux_event and can't schedule any more.
2240
*/
2241
event->aux_event = group_leader;
2242
2243
return 1;
2244
}
2245
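/*
 * Illustrative sketch (not kernel code, assumptions noted inline): how
 * userspace typically requests the aux_output link set up above - an
 * AUX-producing group leader (for example an intel_pt event, whose PMU type
 * is read from sysfs; shown here as a placeholder) with an aux_output
 * sibling:
 *
 *	struct perf_event_attr pt = {
 *		.type = <type from /sys/bus/event_source/devices/intel_pt/type>,
 *		.size = sizeof(pt),
 *	};
 *	struct perf_event_attr out = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.config		= PERF_COUNT_HW_INSTRUCTIONS,
 *		.size		= sizeof(out),
 *		.sample_period	= 100000,
 *		.aux_output	= 1,
 *	};
 *	int leader = syscall(__NR_perf_event_open, &pt, 0, -1, -1, 0);
 *	int output = syscall(__NR_perf_event_open, &out, 0, -1, leader, 0);
 */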
2246
static inline struct list_head *get_event_list(struct perf_event *event)
2247
{
2248
return event->attr.pinned ? &event->pmu_ctx->pinned_active :
2249
&event->pmu_ctx->flexible_active;
2250
}
2251
2252
static void perf_group_detach(struct perf_event *event)
2253
{
2254
struct perf_event *leader = event->group_leader;
2255
struct perf_event *sibling, *tmp;
2256
struct perf_event_context *ctx = event->ctx;
2257
2258
lockdep_assert_held(&ctx->lock);
2259
2260
/*
2261
* We can have double detach due to exit/hot-unplug + close.
2262
*/
2263
if (!(event->attach_state & PERF_ATTACH_GROUP))
2264
return;
2265
2266
event->attach_state &= ~PERF_ATTACH_GROUP;
2267
2268
perf_put_aux_event(event);
2269
2270
/*
2271
* If this is a sibling, remove it from its group.
2272
*/
2273
if (leader != event) {
2274
list_del_init(&event->sibling_list);
2275
event->group_leader->nr_siblings--;
2276
event->group_leader->group_generation++;
2277
goto out;
2278
}
2279
2280
/*
2281
* If this was a group event with sibling events then
2282
* upgrade the siblings to singleton events by adding them
2283
* to whatever list we are on.
2284
*/
2285
list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
2286
2287
/*
2288
* Events that have PERF_EV_CAP_SIBLING require being part of
2289
* a group and cannot exist on their own; schedule them out
2290
* and move them into the ERROR state. Also see
2291
* _perf_event_enable(), it will not be able to recover this
2292
* ERROR state.
2293
*/
2294
if (sibling->event_caps & PERF_EV_CAP_SIBLING)
2295
__event_disable(sibling, ctx, PERF_EVENT_STATE_ERROR);
2296
2297
sibling->group_leader = sibling;
2298
list_del_init(&sibling->sibling_list);
2299
2300
/* Inherit group flags from the previous leader */
2301
sibling->group_caps = event->group_caps;
2302
2303
if (sibling->attach_state & PERF_ATTACH_CONTEXT) {
2304
add_event_to_groups(sibling, event->ctx);
2305
2306
if (sibling->state == PERF_EVENT_STATE_ACTIVE)
2307
list_add_tail(&sibling->active_list, get_event_list(sibling));
2308
}
2309
2310
WARN_ON_ONCE(sibling->ctx != event->ctx);
2311
}
2312
2313
out:
2314
for_each_sibling_event(tmp, leader)
2315
perf_event__header_size(tmp);
2316
2317
perf_event__header_size(leader);
2318
}
2319
2320
static void perf_child_detach(struct perf_event *event)
2321
{
2322
struct perf_event *parent_event = event->parent;
2323
2324
if (!(event->attach_state & PERF_ATTACH_CHILD))
2325
return;
2326
2327
event->attach_state &= ~PERF_ATTACH_CHILD;
2328
2329
if (WARN_ON_ONCE(!parent_event))
2330
return;
2331
2332
/*
2333
* Can't check this from an IPI, the holder is likely another CPU.
2334
*
2335
lockdep_assert_held(&parent_event->child_mutex);
2336
*/
2337
2338
list_del_init(&event->child_list);
2339
}
2340
2341
static bool is_orphaned_event(struct perf_event *event)
2342
{
2343
return event->state == PERF_EVENT_STATE_DEAD;
2344
}
2345
2346
static inline int
2347
event_filter_match(struct perf_event *event)
2348
{
2349
return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
2350
perf_cgroup_match(event);
2351
}
2352
2353
static inline bool is_event_in_freq_mode(struct perf_event *event)
2354
{
2355
return event->attr.freq && event->attr.sample_freq;
2356
}
2357
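/*
 * A minimal userspace sketch of "frequency mode" as tested above (values are
 * examples only): instead of a fixed period, the kernel adjusts the period to
 * approximate attr.sample_freq samples per second, subject to
 * /proc/sys/kernel/perf_event_max_sample_rate:
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.config		= PERF_COUNT_HW_CPU_CYCLES,
 *		.size		= sizeof(attr),
 *		.freq		= 1,
 *		.sample_freq	= 4000,		// ~4000 samples/sec
 *		.sample_type	= PERF_SAMPLE_IP,
 *	};
 */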
2358
static void
2359
event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
2360
{
2361
struct perf_event_pmu_context *epc = event->pmu_ctx;
2362
struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu);
2363
enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
2364
2365
// XXX cpc serialization, probably per-cpu IRQ disabled
2366
2367
WARN_ON_ONCE(event->ctx != ctx);
2368
lockdep_assert_held(&ctx->lock);
2369
2370
if (event->state != PERF_EVENT_STATE_ACTIVE)
2371
return;
2372
2373
/*
2374
* Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
2375
* we can schedule events _OUT_ individually through things like
2376
* __perf_remove_from_context().
2377
*/
2378
list_del_init(&event->active_list);
2379
2380
perf_pmu_disable(event->pmu);
2381
2382
event->pmu->del(event, 0);
2383
event->oncpu = -1;
2384
2385
if (event->pending_disable) {
2386
event->pending_disable = 0;
2387
perf_cgroup_event_disable(event, ctx);
2388
state = PERF_EVENT_STATE_OFF;
2389
}
2390
2391
perf_event_set_state(event, state);
2392
2393
if (!is_software_event(event))
2394
cpc->active_oncpu--;
2395
if (is_event_in_freq_mode(event)) {
2396
ctx->nr_freq--;
2397
epc->nr_freq--;
2398
}
2399
if (event->attr.exclusive || !cpc->active_oncpu)
2400
cpc->exclusive = 0;
2401
2402
perf_pmu_enable(event->pmu);
2403
}
2404
2405
static void
2406
group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
2407
{
2408
struct perf_event *event;
2409
2410
if (group_event->state != PERF_EVENT_STATE_ACTIVE)
2411
return;
2412
2413
perf_assert_pmu_disabled(group_event->pmu_ctx->pmu);
2414
2415
event_sched_out(group_event, ctx);
2416
2417
/*
2418
* Schedule out siblings (if any):
2419
*/
2420
for_each_sibling_event(event, group_event)
2421
event_sched_out(event, ctx);
2422
}
2423
2424
static inline void
2425
__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final)
2426
{
2427
if (ctx->is_active & EVENT_TIME) {
2428
if (ctx->is_active & EVENT_FROZEN)
2429
return;
2430
update_context_time(ctx);
2431
update_cgrp_time_from_cpuctx(cpuctx, final);
2432
}
2433
}
2434
2435
static inline void
2436
ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
2437
{
2438
__ctx_time_update(cpuctx, ctx, false);
2439
}
2440
2441
/*
2442
* To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock().
2443
*/
2444
static inline void
2445
ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
2446
{
2447
ctx_time_update(cpuctx, ctx);
2448
if (ctx->is_active & EVENT_TIME)
2449
ctx->is_active |= EVENT_FROZEN;
2450
}
2451
2452
static inline void
2453
ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
2454
{
2455
if (ctx->is_active & EVENT_TIME) {
2456
if (ctx->is_active & EVENT_FROZEN)
2457
return;
2458
update_context_time(ctx);
2459
update_cgrp_time_from_event(event);
2460
}
2461
}
2462
2463
#define DETACH_GROUP 0x01UL
2464
#define DETACH_CHILD 0x02UL
2465
#define DETACH_EXIT 0x04UL
2466
#define DETACH_REVOKE 0x08UL
2467
#define DETACH_DEAD 0x10UL
2468
2469
/*
2470
* Cross CPU call to remove a performance event
2471
*
2472
* We disable the event on the hardware level first. After that we
2473
* remove it from the context list.
2474
*/
2475
static void
2476
__perf_remove_from_context(struct perf_event *event,
2477
struct perf_cpu_context *cpuctx,
2478
struct perf_event_context *ctx,
2479
void *info)
2480
{
2481
struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
2482
enum perf_event_state state = PERF_EVENT_STATE_OFF;
2483
unsigned long flags = (unsigned long)info;
2484
2485
ctx_time_update(cpuctx, ctx);
2486
2487
/*
2488
* Ensure event_sched_out() switches to OFF, at the very least
2489
* this avoids raising perf_pending_task() at this time.
2490
*/
2491
if (flags & DETACH_EXIT)
2492
state = PERF_EVENT_STATE_EXIT;
2493
if (flags & DETACH_REVOKE)
2494
state = PERF_EVENT_STATE_REVOKED;
2495
if (flags & DETACH_DEAD)
2496
state = PERF_EVENT_STATE_DEAD;
2497
2498
event_sched_out(event, ctx);
2499
2500
if (event->state > PERF_EVENT_STATE_OFF)
2501
perf_cgroup_event_disable(event, ctx);
2502
2503
perf_event_set_state(event, min(event->state, state));
2504
2505
if (flags & DETACH_GROUP)
2506
perf_group_detach(event);
2507
if (flags & DETACH_CHILD)
2508
perf_child_detach(event);
2509
list_del_event(event, ctx);
2510
2511
if (!pmu_ctx->nr_events) {
2512
pmu_ctx->rotate_necessary = 0;
2513
2514
if (ctx->task && ctx->is_active) {
2515
struct perf_cpu_pmu_context *cpc = this_cpc(pmu_ctx->pmu);
2516
2517
WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
2518
cpc->task_epc = NULL;
2519
}
2520
}
2521
2522
if (!ctx->nr_events && ctx->is_active) {
2523
if (ctx == &cpuctx->ctx)
2524
update_cgrp_time_from_cpuctx(cpuctx, true);
2525
2526
ctx->is_active = 0;
2527
if (ctx->task) {
2528
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2529
cpuctx->task_ctx = NULL;
2530
}
2531
}
2532
}
2533
2534
/*
2535
* Remove the event from a task's (or a CPU's) list of events.
2536
*
2537
* If event->ctx is a cloned context, callers must make sure that
2538
* every task struct that event->ctx->task could possibly point to
2539
* remains valid. This is OK when called from perf_release since
2540
* that only calls us on the top-level context, which can't be a clone.
2541
* When called from perf_event_exit_task, it's OK because the
2542
* context has been detached from its task.
2543
*/
2544
static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
2545
{
2546
struct perf_event_context *ctx = event->ctx;
2547
2548
lockdep_assert_held(&ctx->mutex);
2549
2550
/*
2551
* Because of perf_event_exit_task(), perf_remove_from_context() ought
2552
* to work in the face of TASK_TOMBSTONE, unlike every other
2553
* event_function_call() user.
2554
*/
2555
raw_spin_lock_irq(&ctx->lock);
2556
if (!ctx->is_active) {
2557
__perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context),
2558
ctx, (void *)flags);
2559
raw_spin_unlock_irq(&ctx->lock);
2560
return;
2561
}
2562
raw_spin_unlock_irq(&ctx->lock);
2563
2564
event_function_call(event, __perf_remove_from_context, (void *)flags);
2565
}
2566
2567
static void __event_disable(struct perf_event *event,
2568
struct perf_event_context *ctx,
2569
enum perf_event_state state)
2570
{
2571
event_sched_out(event, ctx);
2572
perf_cgroup_event_disable(event, ctx);
2573
perf_event_set_state(event, state);
2574
}
2575
2576
/*
2577
* Cross CPU call to disable a performance event
2578
*/
2579
static void __perf_event_disable(struct perf_event *event,
2580
struct perf_cpu_context *cpuctx,
2581
struct perf_event_context *ctx,
2582
void *info)
2583
{
2584
if (event->state < PERF_EVENT_STATE_INACTIVE)
2585
return;
2586
2587
perf_pmu_disable(event->pmu_ctx->pmu);
2588
ctx_time_update_event(ctx, event);
2589
2590
/*
2591
* When disabling a group leader, the whole group becomes ineligible
2592
* to run, so schedule out the full group.
2593
*/
2594
if (event == event->group_leader)
2595
group_sched_out(event, ctx);
2596
2597
/*
2598
* But only mark the leader OFF; the siblings will remain
2599
* INACTIVE.
2600
*/
2601
__event_disable(event, ctx, PERF_EVENT_STATE_OFF);
2602
2603
perf_pmu_enable(event->pmu_ctx->pmu);
2604
}
2605
2606
/*
2607
* Disable an event.
2608
*
2609
* If event->ctx is a cloned context, callers must make sure that
2610
* every task struct that event->ctx->task could possibly point to
2611
* remains valid. This condition is satisfied when called through
2612
* perf_event_for_each_child or perf_event_for_each because they
2613
* hold the top-level event's child_mutex, so any descendant that
2614
* goes to exit will block in perf_event_exit_event().
2615
*
2616
* When called from perf_pending_disable it's OK because event->ctx
2617
* is the current context on this CPU and preemption is disabled,
2618
* hence we can't get into perf_event_task_sched_out for this context.
2619
*/
2620
static void _perf_event_disable(struct perf_event *event)
2621
{
2622
struct perf_event_context *ctx = event->ctx;
2623
2624
raw_spin_lock_irq(&ctx->lock);
2625
if (event->state <= PERF_EVENT_STATE_OFF) {
2626
raw_spin_unlock_irq(&ctx->lock);
2627
return;
2628
}
2629
raw_spin_unlock_irq(&ctx->lock);
2630
2631
event_function_call(event, __perf_event_disable, NULL);
2632
}
2633
2634
void perf_event_disable_local(struct perf_event *event)
2635
{
2636
event_function_local(event, __perf_event_disable, NULL);
2637
}
2638
2639
/*
2640
* Strictly speaking kernel users cannot create groups and therefore this
2641
* interface does not need the perf_event_ctx_lock() magic.
2642
*/
2643
void perf_event_disable(struct perf_event *event)
2644
{
2645
struct perf_event_context *ctx;
2646
2647
ctx = perf_event_ctx_lock(event);
2648
_perf_event_disable(event);
2649
perf_event_ctx_unlock(event, ctx);
2650
}
2651
EXPORT_SYMBOL_GPL(perf_event_disable);
2652
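/*
 * Illustrative sketch (not kernel code): the userspace side of the
 * enable/disable paths above, via the perf_event_open(2) ioctls. A disabled
 * event is created, the workload region is bracketed, then the count read;
 * run_workload() is a hypothetical helper:
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.config		= PERF_COUNT_HW_INSTRUCTIONS,
 *		.size		= sizeof(attr),
 *		.disabled	= 1,
 *	};
 *	int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 *
 *	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 *	run_workload();
 *	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
 *
 *	__u64 count;
 *	read(fd, &count, sizeof(count));
 */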
2653
void perf_event_disable_inatomic(struct perf_event *event)
2654
{
2655
event->pending_disable = 1;
2656
irq_work_queue(&event->pending_disable_irq);
2657
}
2658
2659
#define MAX_INTERRUPTS (~0ULL)
2660
2661
static void perf_log_throttle(struct perf_event *event, int enable);
2662
static void perf_log_itrace_start(struct perf_event *event);
2663
2664
static void perf_event_unthrottle(struct perf_event *event, bool start)
2665
{
2666
if (event->state != PERF_EVENT_STATE_ACTIVE)
2667
return;
2668
2669
event->hw.interrupts = 0;
2670
if (start)
2671
event->pmu->start(event, 0);
2672
if (event == event->group_leader)
2673
perf_log_throttle(event, 1);
2674
}
2675
2676
static void perf_event_throttle(struct perf_event *event)
2677
{
2678
if (event->state != PERF_EVENT_STATE_ACTIVE)
2679
return;
2680
2681
event->hw.interrupts = MAX_INTERRUPTS;
2682
event->pmu->stop(event, 0);
2683
if (event == event->group_leader)
2684
perf_log_throttle(event, 0);
2685
}
2686
2687
static void perf_event_unthrottle_group(struct perf_event *event, bool skip_start_event)
2688
{
2689
struct perf_event *sibling, *leader = event->group_leader;
2690
2691
perf_event_unthrottle(leader, skip_start_event ? leader != event : true);
2692
for_each_sibling_event(sibling, leader)
2693
perf_event_unthrottle(sibling, skip_start_event ? sibling != event : true);
2694
}
2695
2696
static void perf_event_throttle_group(struct perf_event *event)
2697
{
2698
struct perf_event *sibling, *leader = event->group_leader;
2699
2700
perf_event_throttle(leader);
2701
for_each_sibling_event(sibling, leader)
2702
perf_event_throttle(sibling);
2703
}
2704
2705
static int
2706
event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
2707
{
2708
struct perf_event_pmu_context *epc = event->pmu_ctx;
2709
struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu);
2710
int ret = 0;
2711
2712
WARN_ON_ONCE(event->ctx != ctx);
2713
2714
lockdep_assert_held(&ctx->lock);
2715
2716
if (event->state <= PERF_EVENT_STATE_OFF)
2717
return 0;
2718
2719
WRITE_ONCE(event->oncpu, smp_processor_id());
2720
/*
2721
* Order event::oncpu write to happen before the ACTIVE state is
2722
* visible. This allows perf_event_{stop,read}() to observe the correct
2723
* ->oncpu if it sees ACTIVE.
2724
*/
2725
smp_wmb();
2726
perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
2727
2728
/*
2729
* Unthrottle events: since we were just scheduled, we might have missed several
2730
* ticks already, and for a heavily scheduling task there is little
2731
* guarantee it'll get a tick in a timely manner.
2732
*/
2733
if (unlikely(event->hw.interrupts == MAX_INTERRUPTS))
2734
perf_event_unthrottle(event, false);
2735
2736
perf_pmu_disable(event->pmu);
2737
2738
perf_log_itrace_start(event);
2739
2740
if (event->pmu->add(event, PERF_EF_START)) {
2741
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2742
event->oncpu = -1;
2743
ret = -EAGAIN;
2744
goto out;
2745
}
2746
2747
if (!is_software_event(event))
2748
cpc->active_oncpu++;
2749
if (is_event_in_freq_mode(event)) {
2750
ctx->nr_freq++;
2751
epc->nr_freq++;
2752
}
2753
if (event->attr.exclusive)
2754
cpc->exclusive = 1;
2755
2756
out:
2757
perf_pmu_enable(event->pmu);
2758
2759
return ret;
2760
}
2761
2762
static int
2763
group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx)
2764
{
2765
struct perf_event *event, *partial_group = NULL;
2766
struct pmu *pmu = group_event->pmu_ctx->pmu;
2767
2768
if (group_event->state == PERF_EVENT_STATE_OFF)
2769
return 0;
2770
2771
pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
2772
2773
if (event_sched_in(group_event, ctx))
2774
goto error;
2775
2776
/*
2777
* Schedule in siblings as one group (if any):
2778
*/
2779
for_each_sibling_event(event, group_event) {
2780
if (event_sched_in(event, ctx)) {
2781
partial_group = event;
2782
goto group_error;
2783
}
2784
}
2785
2786
if (!pmu->commit_txn(pmu))
2787
return 0;
2788
2789
group_error:
2790
/*
2791
* Groups can be scheduled in as one unit only, so undo any
2792
* partial group before returning:
2793
* The events up to the failed event are scheduled out normally.
2794
*/
2795
for_each_sibling_event(event, group_event) {
2796
if (event == partial_group)
2797
break;
2798
2799
event_sched_out(event, ctx);
2800
}
2801
event_sched_out(group_event, ctx);
2802
2803
error:
2804
pmu->cancel_txn(pmu);
2805
return -EAGAIN;
2806
}
2807
2808
/*
2809
* Work out whether we can put this event group on the CPU now.
2810
*/
2811
static int group_can_go_on(struct perf_event *event, int can_add_hw)
2812
{
2813
struct perf_event_pmu_context *epc = event->pmu_ctx;
2814
struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu);
2815
2816
/*
2817
* Groups consisting entirely of software events can always go on.
2818
*/
2819
if (event->group_caps & PERF_EV_CAP_SOFTWARE)
2820
return 1;
2821
/*
2822
* If an exclusive group is already on, no other hardware
2823
* events can go on.
2824
*/
2825
if (cpc->exclusive)
2826
return 0;
2827
/*
2828
* If this group is exclusive and there are already
2829
* events on the CPU, it can't go on.
2830
*/
2831
if (event->attr.exclusive && !list_empty(get_event_list(event)))
2832
return 0;
2833
/*
2834
* Otherwise, try to add it if all previous groups were able
2835
* to go on.
2836
*/
2837
return can_add_hw;
2838
}
2839
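/*
 * For reference, the attr bits feeding the decisions above (pinned list
 * selection in get_event_list(), exclusive scheduling here) are set by
 * userspace; a small sketch, values are examples only:
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.config		= PERF_COUNT_HW_CACHE_MISSES,
 *		.size		= sizeof(attr),
 *		.pinned		= 1,	// always scheduled, or goes to ERROR
 *		.exclusive	= 1,	// group must be alone on the PMU
 *	};
 */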
2840
static void add_event_to_ctx(struct perf_event *event,
2841
struct perf_event_context *ctx)
2842
{
2843
list_add_event(event, ctx);
2844
perf_group_attach(event);
2845
}
2846
2847
static void task_ctx_sched_out(struct perf_event_context *ctx,
2848
struct pmu *pmu,
2849
enum event_type_t event_type)
2850
{
2851
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
2852
2853
if (!cpuctx->task_ctx)
2854
return;
2855
2856
if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2857
return;
2858
2859
ctx_sched_out(ctx, pmu, event_type);
2860
}
2861
2862
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2863
struct perf_event_context *ctx,
2864
struct pmu *pmu)
2865
{
2866
ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED);
2867
if (ctx)
2868
ctx_sched_in(ctx, pmu, EVENT_PINNED);
2869
ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
2870
if (ctx)
2871
ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE);
2872
}
2873
2874
/*
2875
* We want to maintain the following priority of scheduling:
2876
* - CPU pinned (EVENT_CPU | EVENT_PINNED)
2877
* - task pinned (EVENT_PINNED)
2878
* - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
2879
* - task flexible (EVENT_FLEXIBLE).
2880
*
2881
* In order to avoid unscheduling and scheduling back in everything every
2882
* time an event is added, only do it for the groups of equal priority and
2883
* below.
2884
*
2885
* This can be called after a batch operation on task events, in which case
2886
* event_type is a bit mask of the types of events involved. For CPU events,
2887
* event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
2888
*/
2889
static void ctx_resched(struct perf_cpu_context *cpuctx,
2890
struct perf_event_context *task_ctx,
2891
struct pmu *pmu, enum event_type_t event_type)
2892
{
2893
bool cpu_event = !!(event_type & EVENT_CPU);
2894
struct perf_event_pmu_context *epc;
2895
2896
/*
2897
* If pinned groups are involved, flexible groups also need to be
2898
* scheduled out.
2899
*/
2900
if (event_type & EVENT_PINNED)
2901
event_type |= EVENT_FLEXIBLE;
2902
2903
event_type &= EVENT_ALL;
2904
2905
for_each_epc(epc, &cpuctx->ctx, pmu, false)
2906
perf_pmu_disable(epc->pmu);
2907
2908
if (task_ctx) {
2909
for_each_epc(epc, task_ctx, pmu, false)
2910
perf_pmu_disable(epc->pmu);
2911
2912
task_ctx_sched_out(task_ctx, pmu, event_type);
2913
}
2914
2915
/*
2916
* Decide which cpu ctx groups to schedule out based on the types
2917
* of events that caused rescheduling:
2918
* - EVENT_CPU: schedule out corresponding groups;
2919
* - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
2920
* - otherwise, do nothing more.
2921
*/
2922
if (cpu_event)
2923
ctx_sched_out(&cpuctx->ctx, pmu, event_type);
2924
else if (event_type & EVENT_PINNED)
2925
ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
2926
2927
perf_event_sched_in(cpuctx, task_ctx, pmu);
2928
2929
for_each_epc(epc, &cpuctx->ctx, pmu, false)
2930
perf_pmu_enable(epc->pmu);
2931
2932
if (task_ctx) {
2933
for_each_epc(epc, task_ctx, pmu, false)
2934
perf_pmu_enable(epc->pmu);
2935
}
2936
}
2937
2938
void perf_pmu_resched(struct pmu *pmu)
2939
{
2940
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
2941
struct perf_event_context *task_ctx = cpuctx->task_ctx;
2942
2943
perf_ctx_lock(cpuctx, task_ctx);
2944
ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU);
2945
perf_ctx_unlock(cpuctx, task_ctx);
2946
}
2947
2948
/*
2949
* Cross CPU call to install and enable a performance event
2950
*
2951
* Very similar to remote_function() + event_function() but cannot assume that
2952
* things like ctx->is_active and cpuctx->task_ctx are set.
2953
*/
2954
static int __perf_install_in_context(void *info)
2955
{
2956
struct perf_event *event = info;
2957
struct perf_event_context *ctx = event->ctx;
2958
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
2959
struct perf_event_context *task_ctx = cpuctx->task_ctx;
2960
bool reprogram = true;
2961
int ret = 0;
2962
2963
raw_spin_lock(&cpuctx->ctx.lock);
2964
if (ctx->task) {
2965
raw_spin_lock(&ctx->lock);
2966
task_ctx = ctx;
2967
2968
reprogram = (ctx->task == current);
2969
2970
/*
2971
* If the task is running, it must be running on this CPU,
2972
* otherwise we cannot reprogram things.
2973
*
2974
* If it's not running, we don't care; ctx->lock will
2975
* serialize against it becoming runnable.
2976
*/
2977
if (task_curr(ctx->task) && !reprogram) {
2978
ret = -ESRCH;
2979
goto unlock;
2980
}
2981
2982
WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
2983
} else if (task_ctx) {
2984
raw_spin_lock(&task_ctx->lock);
2985
}
2986
2987
#ifdef CONFIG_CGROUP_PERF
2988
if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
2989
/*
2990
* If the current cgroup doesn't match the event's
2991
* cgroup, we should not try to schedule it.
2992
*/
2993
struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
2994
reprogram = cgroup_is_descendant(cgrp->css.cgroup,
2995
event->cgrp->css.cgroup);
2996
}
2997
#endif
2998
2999
if (reprogram) {
3000
ctx_time_freeze(cpuctx, ctx);
3001
add_event_to_ctx(event, ctx);
3002
ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
3003
get_event_type(event));
3004
} else {
3005
add_event_to_ctx(event, ctx);
3006
}
3007
3008
unlock:
3009
perf_ctx_unlock(cpuctx, task_ctx);
3010
3011
return ret;
3012
}
3013
3014
static bool exclusive_event_installable(struct perf_event *event,
3015
struct perf_event_context *ctx);
3016
3017
/*
3018
* Attach a performance event to a context.
3019
*
3020
* Very similar to event_function_call, see comment there.
3021
*/
3022
static void
3023
perf_install_in_context(struct perf_event_context *ctx,
3024
struct perf_event *event,
3025
int cpu)
3026
{
3027
struct task_struct *task = READ_ONCE(ctx->task);
3028
3029
lockdep_assert_held(&ctx->mutex);
3030
3031
WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
3032
3033
if (event->cpu != -1)
3034
WARN_ON_ONCE(event->cpu != cpu);
3035
3036
/*
3037
* Ensures that if we can observe event->ctx, both the event and ctx
3038
* will be 'complete'. See perf_iterate_sb_cpu().
3039
*/
3040
smp_store_release(&event->ctx, ctx);
3041
3042
/*
3043
* perf_event_attr::disabled events will not run and can be initialized
3044
* without IPI. Except when this is the first event for the context, in
3045
* that case we need the magic of the IPI to set ctx->is_active.
3046
*
3047
* The IOC_ENABLE that is sure to follow the creation of a disabled
3048
* event will issue the IPI and reprogram the hardware.
3049
*/
3050
if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF &&
3051
ctx->nr_events && !is_cgroup_event(event)) {
3052
raw_spin_lock_irq(&ctx->lock);
3053
if (ctx->task == TASK_TOMBSTONE) {
3054
raw_spin_unlock_irq(&ctx->lock);
3055
return;
3056
}
3057
add_event_to_ctx(event, ctx);
3058
raw_spin_unlock_irq(&ctx->lock);
3059
return;
3060
}
3061
3062
if (!task) {
3063
cpu_function_call(cpu, __perf_install_in_context, event);
3064
return;
3065
}
3066
3067
/*
3068
* Should not happen, we validate the ctx is still alive before calling.
3069
*/
3070
if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
3071
return;
3072
3073
/*
3074
* Installing events is tricky because we cannot rely on ctx->is_active
3075
* to be set in case this is the nr_events 0 -> 1 transition.
3076
*
3077
* Instead we use task_curr(), which tells us if the task is running.
3078
* However, since we use task_curr() outside of rq::lock, we can race
3079
* against the actual state. This means the result can be wrong.
3080
*
3081
* If we get a false positive, we retry, this is harmless.
3082
*
3083
* If we get a false negative, things are complicated. If we are after
3084
* perf_event_context_sched_in() ctx::lock will serialize us, and the
3085
* value must be correct. If we're before, it doesn't matter since
3086
* perf_event_context_sched_in() will program the counter.
3087
*
3088
* However, this hinges on the remote context switch having observed
3089
* our task->perf_event_ctxp[] store, such that it will in fact take
3090
* ctx::lock in perf_event_context_sched_in().
3091
*
3092
* We do this by task_function_call(), if the IPI fails to hit the task
3093
* we know any future context switch of task must see the
3094
* perf_event_ctxp[] store.
3095
*/
3096
3097
/*
3098
* This smp_mb() orders the task->perf_event_ctxp[] store with the
3099
* task_cpu() load, such that if the IPI then does not find the task
3100
* running, a future context switch of that task must observe the
3101
* store.
3102
*/
3103
smp_mb();
3104
again:
3105
if (!task_function_call(task, __perf_install_in_context, event))
3106
return;
3107
3108
raw_spin_lock_irq(&ctx->lock);
3109
task = ctx->task;
3110
if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
3111
/*
3112
* Cannot happen because we already checked above (which also
3113
* cannot happen), and we hold ctx->mutex, which serializes us
3114
* against perf_event_exit_task_context().
3115
*/
3116
raw_spin_unlock_irq(&ctx->lock);
3117
return;
3118
}
3119
/*
3120
* If the task is not running, ctx->lock will avoid it becoming so,
3121
* thus we can safely install the event.
3122
*/
3123
if (task_curr(task)) {
3124
raw_spin_unlock_irq(&ctx->lock);
3125
goto again;
3126
}
3127
add_event_to_ctx(event, ctx);
3128
raw_spin_unlock_irq(&ctx->lock);
3129
}
3130
3131
/*
3132
* Cross CPU call to enable a performance event
3133
*/
3134
static void __perf_event_enable(struct perf_event *event,
3135
struct perf_cpu_context *cpuctx,
3136
struct perf_event_context *ctx,
3137
void *info)
3138
{
3139
struct perf_event *leader = event->group_leader;
3140
struct perf_event_context *task_ctx;
3141
3142
if (event->state >= PERF_EVENT_STATE_INACTIVE ||
3143
event->state <= PERF_EVENT_STATE_ERROR)
3144
return;
3145
3146
ctx_time_freeze(cpuctx, ctx);
3147
3148
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
3149
perf_cgroup_event_enable(event, ctx);
3150
3151
if (!ctx->is_active)
3152
return;
3153
3154
if (!event_filter_match(event))
3155
return;
3156
3157
/*
3158
* If the event is in a group and isn't the group leader,
3159
* then don't put it on unless the group is on.
3160
*/
3161
if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
3162
return;
3163
3164
task_ctx = cpuctx->task_ctx;
3165
if (ctx->task)
3166
WARN_ON_ONCE(task_ctx != ctx);
3167
3168
ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event));
3169
}
3170
3171
/*
3172
* Enable an event.
3173
*
3174
* If event->ctx is a cloned context, callers must make sure that
3175
* every task struct that event->ctx->task could possibly point to
3176
* remains valid. This condition is satisfied when called through
3177
* perf_event_for_each_child or perf_event_for_each as described
3178
* for perf_event_disable.
3179
*/
3180
static void _perf_event_enable(struct perf_event *event)
3181
{
3182
struct perf_event_context *ctx = event->ctx;
3183
3184
raw_spin_lock_irq(&ctx->lock);
3185
if (event->state >= PERF_EVENT_STATE_INACTIVE ||
3186
event->state < PERF_EVENT_STATE_ERROR) {
3187
out:
3188
raw_spin_unlock_irq(&ctx->lock);
3189
return;
3190
}
3191
3192
/*
3193
* If the event is in error state, clear that first.
3194
*
3195
* That way, if we see the event in error state below, we know that it
3196
* has gone back into error state, as distinct from the task having
3197
* been scheduled away before the cross-call arrived.
3198
*/
3199
if (event->state == PERF_EVENT_STATE_ERROR) {
3200
/*
3201
* Detached SIBLING events cannot leave ERROR state.
3202
*/
3203
if (event->event_caps & PERF_EV_CAP_SIBLING &&
3204
event->group_leader == event)
3205
goto out;
3206
3207
event->state = PERF_EVENT_STATE_OFF;
3208
}
3209
raw_spin_unlock_irq(&ctx->lock);
3210
3211
event_function_call(event, __perf_event_enable, NULL);
3212
}
3213
3214
/*
3215
* See perf_event_disable();
3216
*/
3217
void perf_event_enable(struct perf_event *event)
3218
{
3219
struct perf_event_context *ctx;
3220
3221
ctx = perf_event_ctx_lock(event);
3222
_perf_event_enable(event);
3223
perf_event_ctx_unlock(event, ctx);
3224
}
3225
EXPORT_SYMBOL_GPL(perf_event_enable);
3226
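/*
 * In-kernel users reach _perf_event_enable()/_perf_event_disable() through
 * the exported wrappers. A hedged sketch of that usage, assuming the
 * perf_event_create_kernel_counter() API declared in <linux/perf_event.h>
 * (cpu 0 is a placeholder):
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.config		= PERF_COUNT_HW_CPU_CYCLES,
 *		.size		= sizeof(attr),
 *		.disabled	= 1,
 *	};
 *	struct perf_event *ev;
 *
 *	ev = perf_event_create_kernel_counter(&attr, 0, NULL, NULL, NULL);
 *	if (!IS_ERR(ev)) {
 *		perf_event_enable(ev);
 *		// ... measure ...
 *		perf_event_disable(ev);
 *		perf_event_release_kernel(ev);
 *	}
 */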
3227
struct stop_event_data {
3228
struct perf_event *event;
3229
unsigned int restart;
3230
};
3231
3232
static int __perf_event_stop(void *info)
3233
{
3234
struct stop_event_data *sd = info;
3235
struct perf_event *event = sd->event;
3236
3237
/* if it's already INACTIVE, do nothing */
3238
if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
3239
return 0;
3240
3241
/* matches smp_wmb() in event_sched_in() */
3242
smp_rmb();
3243
3244
/*
3245
* There is a window with interrupts enabled before we get here,
3246
* so we need to check again lest we try to stop another CPU's event.
3247
*/
3248
if (READ_ONCE(event->oncpu) != smp_processor_id())
3249
return -EAGAIN;
3250
3251
event->pmu->stop(event, PERF_EF_UPDATE);
3252
3253
/*
3254
* May race with the actual stop (through perf_pmu_output_stop()),
3255
* but it is only used for events with AUX ring buffer, and such
3256
* events will refuse to restart because of rb::aux_mmap_count==0,
3257
* see comments in perf_aux_output_begin().
3258
*
3259
* Since this is happening on an event-local CPU, no trace is lost
3260
* while restarting.
3261
*/
3262
if (sd->restart)
3263
event->pmu->start(event, 0);
3264
3265
return 0;
3266
}
3267
3268
static int perf_event_stop(struct perf_event *event, int restart)
3269
{
3270
struct stop_event_data sd = {
3271
.event = event,
3272
.restart = restart,
3273
};
3274
int ret = 0;
3275
3276
do {
3277
if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
3278
return 0;
3279
3280
/* matches smp_wmb() in event_sched_in() */
3281
smp_rmb();
3282
3283
/*
3284
* We only want to restart ACTIVE events, so if the event goes
3285
* inactive here (event->oncpu==-1), there's nothing more to do;
3286
* fall through with ret==-ENXIO.
3287
*/
3288
ret = cpu_function_call(READ_ONCE(event->oncpu),
3289
__perf_event_stop, &sd);
3290
} while (ret == -EAGAIN);
3291
3292
return ret;
3293
}
3294
3295
/*
3296
* In order to contain the amount of raciness and trickiness in the address filter
3297
* configuration management, it is a two-part process:
3298
*
3299
* (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
3300
* we update the addresses of corresponding vmas in
3301
* event::addr_filter_ranges array and bump the event::addr_filters_gen;
3302
* (p2) when an event is scheduled in (pmu::add), it calls
3303
* perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
3304
* if the generation has changed since the previous call.
3305
*
3306
* If (p1) happens while the event is active, we restart it to force (p2).
3307
*
3308
* (1) perf_addr_filters_apply(): adjusting filters' offsets based on
3309
* pre-existing mappings, called once when new filters arrive via SET_FILTER
3310
* ioctl;
3311
* (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
3312
* registered mapping, called for every new mmap(), with mm::mmap_lock down
3313
* for reading;
3314
* (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
3315
* of exec.
3316
*/
3317
void perf_event_addr_filters_sync(struct perf_event *event)
3318
{
3319
struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
3320
3321
if (!has_addr_filter(event))
3322
return;
3323
3324
raw_spin_lock(&ifh->lock);
3325
if (event->addr_filters_gen != event->hw.addr_filters_gen) {
3326
event->pmu->addr_filters_sync(event);
3327
event->hw.addr_filters_gen = event->addr_filters_gen;
3328
}
3329
raw_spin_unlock(&ifh->lock);
3330
}
3331
EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
3332
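/*
 * Illustrative sketch (not kernel code): address filters enter via the
 * SET_FILTER ioctl, case (1) in the comment above. The filter string format
 * is roughly "filter|start|stop <start>[/<size>][@</object file>]"; here 'fd'
 * is assumed to be an event fd on an AUX-capable PMU such as intel_pt, and
 * the addresses and path are placeholders:
 *
 *	ioctl(fd, PERF_EVENT_IOC_SET_FILTER,
 *	      "filter 0x1000/0x2000@/path/to/binary");
 */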
3333
static int _perf_event_refresh(struct perf_event *event, int refresh)
3334
{
3335
/*
3336
* not supported on inherited events
3337
*/
3338
if (event->attr.inherit || !is_sampling_event(event))
3339
return -EINVAL;
3340
3341
atomic_add(refresh, &event->event_limit);
3342
_perf_event_enable(event);
3343
3344
return 0;
3345
}
3346
3347
/*
3348
* See perf_event_disable()
3349
*/
3350
int perf_event_refresh(struct perf_event *event, int refresh)
3351
{
3352
struct perf_event_context *ctx;
3353
int ret;
3354
3355
ctx = perf_event_ctx_lock(event);
3356
ret = _perf_event_refresh(event, refresh);
3357
perf_event_ctx_unlock(event, ctx);
3358
3359
return ret;
3360
}
3361
EXPORT_SYMBOL_GPL(perf_event_refresh);
3362
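/*
 * A hedged userspace sketch of the refresh mechanism wrapped above:
 * PERF_EVENT_IOC_REFRESH adds to event_limit and enables the event, so it
 * self-disables after that many overflows (commonly paired with
 * signal-driven I/O on the event fd; signal setup omitted here, values are
 * examples only):
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.config		= PERF_COUNT_HW_CPU_CYCLES,
 *		.size		= sizeof(attr),
 *		.sample_period	= 1000000,
 *		.disabled	= 1,
 *		// .inherit must stay 0, see the -EINVAL check above
 *	};
 *	int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 *	ioctl(fd, PERF_EVENT_IOC_REFRESH, 1);	// enable for one overflow
 */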
3363
static int perf_event_modify_breakpoint(struct perf_event *bp,
3364
struct perf_event_attr *attr)
3365
{
3366
int err;
3367
3368
_perf_event_disable(bp);
3369
3370
err = modify_user_hw_breakpoint_check(bp, attr, true);
3371
3372
if (!bp->attr.disabled)
3373
_perf_event_enable(bp);
3374
3375
return err;
3376
}
3377
3378
/*
3379
* Copy event-type-independent attributes that may be modified.
3380
*/
3381
static void perf_event_modify_copy_attr(struct perf_event_attr *to,
3382
const struct perf_event_attr *from)
3383
{
3384
to->sig_data = from->sig_data;
3385
}
3386
3387
static int perf_event_modify_attr(struct perf_event *event,
3388
struct perf_event_attr *attr)
3389
{
3390
int (*func)(struct perf_event *, struct perf_event_attr *);
3391
struct perf_event *child;
3392
int err;
3393
3394
if (event->attr.type != attr->type)
3395
return -EINVAL;
3396
3397
switch (event->attr.type) {
3398
case PERF_TYPE_BREAKPOINT:
3399
func = perf_event_modify_breakpoint;
3400
break;
3401
default:
3402
/* Place holder for future additions. */
3403
return -EOPNOTSUPP;
3404
}
3405
3406
WARN_ON_ONCE(event->ctx->parent_ctx);
3407
3408
mutex_lock(&event->child_mutex);
3409
/*
3410
* Event-type-independent attributes must be copied before event-type
3411
* modification, which will validate that final attributes match the
3412
* source attributes after all relevant attributes have been copied.
3413
*/
3414
perf_event_modify_copy_attr(&event->attr, attr);
3415
err = func(event, attr);
3416
if (err)
3417
goto out;
3418
list_for_each_entry(child, &event->child_list, child_list) {
3419
perf_event_modify_copy_attr(&child->attr, attr);
3420
err = func(child, attr);
3421
if (err)
3422
goto out;
3423
}
3424
out:
3425
mutex_unlock(&event->child_mutex);
3426
return err;
3427
}
3428
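/*
 * Illustrative sketch (not kernel code): perf_event_modify_attr() is reached
 * via the PERF_EVENT_IOC_MODIFY_ATTRIBUTES ioctl, and per the switch above
 * only breakpoint events are currently accepted. Moving an existing
 * watchpoint to a new address might look like this; 'bp_fd' and
 * 'watched_variable' are hypothetical placeholders:
 *
 *	struct perf_event_attr new_attr = {
 *		.type		= PERF_TYPE_BREAKPOINT,
 *		.size		= sizeof(new_attr),
 *		.bp_type	= HW_BREAKPOINT_W,
 *		.bp_addr	= (__u64)&watched_variable,
 *		.bp_len		= HW_BREAKPOINT_LEN_8,
 *	};
 *	ioctl(bp_fd, PERF_EVENT_IOC_MODIFY_ATTRIBUTES, &new_attr);
 */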
3429
static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
3430
enum event_type_t event_type)
3431
{
3432
struct perf_event_context *ctx = pmu_ctx->ctx;
3433
struct perf_event *event, *tmp;
3434
struct pmu *pmu = pmu_ctx->pmu;
3435
3436
if (ctx->task && !(ctx->is_active & EVENT_ALL)) {
3437
struct perf_cpu_pmu_context *cpc = this_cpc(pmu);
3438
3439
WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
3440
cpc->task_epc = NULL;
3441
}
3442
3443
if (!(event_type & EVENT_ALL))
3444
return;
3445
3446
perf_pmu_disable(pmu);
3447
if (event_type & EVENT_PINNED) {
3448
list_for_each_entry_safe(event, tmp,
3449
&pmu_ctx->pinned_active,
3450
active_list)
3451
group_sched_out(event, ctx);
3452
}
3453
3454
if (event_type & EVENT_FLEXIBLE) {
3455
list_for_each_entry_safe(event, tmp,
3456
&pmu_ctx->flexible_active,
3457
active_list)
3458
group_sched_out(event, ctx);
3459
/*
3460
* Since we cleared EVENT_FLEXIBLE, also clear
3461
* rotate_necessary, it will be reset by
3462
* ctx_flexible_sched_in() when needed.
3463
*/
3464
pmu_ctx->rotate_necessary = 0;
3465
}
3466
perf_pmu_enable(pmu);
3467
}
3468
3469
/*
3470
* Be very careful with the @pmu argument since this will change ctx state.
3471
* The @pmu argument works for ctx_resched(), because that is symmetric in
3472
* ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant.
3473
*
3474
* However, if you were to be asymmetrical, you could end up with messed up
3475
* state, eg. ctx->is_active cleared even though most EPCs would still actually
3476
* be active.
3477
*/
3478
static void
3479
ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
3480
{
3481
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
3482
struct perf_event_pmu_context *pmu_ctx;
3483
int is_active = ctx->is_active;
3484
bool cgroup = event_type & EVENT_CGROUP;
3485
3486
event_type &= ~EVENT_CGROUP;
3487
3488
lockdep_assert_held(&ctx->lock);
3489
3490
if (likely(!ctx->nr_events)) {
3491
/*
3492
* See __perf_remove_from_context().
3493
*/
3494
WARN_ON_ONCE(ctx->is_active);
3495
if (ctx->task)
3496
WARN_ON_ONCE(cpuctx->task_ctx);
3497
return;
3498
}
3499
3500
/*
3501
* Always update time if it was set; not only when it changes.
3502
* Otherwise we can 'forget' to update time for any but the last
3503
* context we sched out. For example:
3504
*
3505
* ctx_sched_out(.event_type = EVENT_FLEXIBLE)
3506
* ctx_sched_out(.event_type = EVENT_PINNED)
3507
*
3508
* would only update time for the pinned events.
3509
*/
3510
__ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx);
3511
3512
/*
3513
* CPU-release for the below ->is_active store,
3514
* see __load_acquire() in perf_event_time_now()
3515
*/
3516
barrier();
3517
ctx->is_active &= ~event_type;
3518
3519
if (!(ctx->is_active & EVENT_ALL)) {
3520
/*
3521
* For FROZEN, preserve TIME|FROZEN such that perf_event_time_now()
3522
* does not observe a hole. perf_ctx_unlock() will clean up.
3523
*/
3524
if (ctx->is_active & EVENT_FROZEN)
3525
ctx->is_active &= EVENT_TIME_FROZEN;
3526
else
3527
ctx->is_active = 0;
3528
}
3529
3530
if (ctx->task) {
3531
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3532
if (!(ctx->is_active & EVENT_ALL))
3533
cpuctx->task_ctx = NULL;
3534
}
3535
3536
is_active ^= ctx->is_active; /* changed bits */
3537
3538
for_each_epc(pmu_ctx, ctx, pmu, cgroup)
3539
__pmu_ctx_sched_out(pmu_ctx, is_active);
3540
}
3541
3542
/*
3543
* Test whether two contexts are equivalent, i.e. whether they have both been
3544
* cloned from the same version of the same context.
3545
*
3546
* Equivalence is measured using a generation number in the context that is
3547
* incremented on each modification to it; see unclone_ctx(), list_add_event()
3548
* and list_del_event().
3549
*/
3550
static int context_equiv(struct perf_event_context *ctx1,
3551
struct perf_event_context *ctx2)
3552
{
3553
lockdep_assert_held(&ctx1->lock);
3554
lockdep_assert_held(&ctx2->lock);
3555
3556
/* Pinning disables the swap optimization */
3557
if (ctx1->pin_count || ctx2->pin_count)
3558
return 0;
3559
3560
/* If ctx1 is the parent of ctx2 */
3561
if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
3562
return 1;
3563
3564
/* If ctx2 is the parent of ctx1 */
3565
if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
3566
return 1;
3567
3568
/*
3569
* If ctx1 and ctx2 have the same parent; we flatten the parent
3570
* hierarchy, see perf_event_init_context().
3571
*/
3572
if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
3573
ctx1->parent_gen == ctx2->parent_gen)
3574
return 1;
3575
3576
/* Unmatched */
3577
return 0;
3578
}
3579
3580
static void __perf_event_sync_stat(struct perf_event *event,
3581
struct perf_event *next_event)
3582
{
3583
u64 value;
3584
3585
if (!event->attr.inherit_stat)
3586
return;
3587
3588
/*
3589
* Update the event value, we cannot use perf_event_read()
3590
* because we're in the middle of a context switch and have IRQs
3591
* disabled, which upsets smp_call_function_single(), however
3592
* we know the event must be on the current CPU, therefore we
3593
* don't need to use it.
3594
*/
3595
perf_pmu_read(event);
3596
3597
perf_event_update_time(event);
3598
3599
/*
3600
* In order to keep per-task stats reliable we need to flip the event
3601
* values when we flip the contexts.
3602
*/
3603
value = local64_read(&next_event->count);
3604
value = local64_xchg(&event->count, value);
3605
local64_set(&next_event->count, value);
3606
3607
swap(event->total_time_enabled, next_event->total_time_enabled);
3608
swap(event->total_time_running, next_event->total_time_running);
3609
3610
/*
3611
* Since we swizzled the values, update the user visible data too.
3612
*/
3613
perf_event_update_userpage(event);
3614
perf_event_update_userpage(next_event);
3615
}
3616
3617
static void perf_event_sync_stat(struct perf_event_context *ctx,
3618
struct perf_event_context *next_ctx)
3619
{
3620
struct perf_event *event, *next_event;
3621
3622
if (!ctx->nr_stat)
3623
return;
3624
3625
update_context_time(ctx);
3626
3627
event = list_first_entry(&ctx->event_list,
3628
struct perf_event, event_entry);
3629
3630
next_event = list_first_entry(&next_ctx->event_list,
3631
struct perf_event, event_entry);
3632
3633
while (&event->event_entry != &ctx->event_list &&
3634
&next_event->event_entry != &next_ctx->event_list) {
3635
3636
__perf_event_sync_stat(event, next_event);
3637
3638
event = list_next_entry(event, event_entry);
3639
next_event = list_next_entry(next_event, event_entry);
3640
}
3641
}
3642
3643
static void perf_ctx_sched_task_cb(struct perf_event_context *ctx,
3644
struct task_struct *task, bool sched_in)
3645
{
3646
struct perf_event_pmu_context *pmu_ctx;
3647
struct perf_cpu_pmu_context *cpc;
3648
3649
list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
3650
cpc = this_cpc(pmu_ctx->pmu);
3651
3652
if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task)
3653
pmu_ctx->pmu->sched_task(pmu_ctx, task, sched_in);
3654
}
3655
}
3656
3657
static void
3658
perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
3659
{
3660
struct perf_event_context *ctx = task->perf_event_ctxp;
3661
struct perf_event_context *next_ctx;
3662
struct perf_event_context *parent, *next_parent;
3663
int do_switch = 1;
3664
3665
if (likely(!ctx))
3666
return;
3667
3668
rcu_read_lock();
3669
next_ctx = rcu_dereference(next->perf_event_ctxp);
3670
if (!next_ctx)
3671
goto unlock;
3672
3673
parent = rcu_dereference(ctx->parent_ctx);
3674
next_parent = rcu_dereference(next_ctx->parent_ctx);
3675
3676
/* If neither context have a parent context; they cannot be clones. */
3677
if (!parent && !next_parent)
3678
goto unlock;
3679
3680
if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
3681
/*
3682
* Looks like the two contexts are clones, so we might be
3683
* able to optimize the context switch. We lock both
3684
* contexts and check that they are clones under the
3685
* lock (including re-checking that neither has been
3686
* uncloned in the meantime). It doesn't matter which
3687
* order we take the locks because no other cpu could
3688
* be trying to lock both of these tasks.
3689
*/
3690
raw_spin_lock(&ctx->lock);
3691
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
3692
if (context_equiv(ctx, next_ctx)) {
3693
3694
perf_ctx_disable(ctx, false);
3695
3696
/* PMIs are disabled; ctx->nr_no_switch_fast is stable. */
3697
if (local_read(&ctx->nr_no_switch_fast) ||
3698
local_read(&next_ctx->nr_no_switch_fast)) {
3699
/*
3700
* Must not swap out ctx when there's pending
3701
* events that rely on the ctx->task relation.
3702
*
3703
* Likewise, when a context contains inherit +
3704
* SAMPLE_READ events they should be switched
3705
* out using the slow path so that they are
3706
* treated as if they were distinct contexts.
3707
*/
3708
raw_spin_unlock(&next_ctx->lock);
3709
rcu_read_unlock();
3710
goto inside_switch;
3711
}
3712
3713
WRITE_ONCE(ctx->task, next);
3714
WRITE_ONCE(next_ctx->task, task);
3715
3716
perf_ctx_sched_task_cb(ctx, task, false);
3717
3718
perf_ctx_enable(ctx, false);
3719
3720
/*
3721
* RCU_INIT_POINTER here is safe because we've not
3722
* modified the ctx and the above modification of
3723
* ctx->task is immaterial since this value is
3724
* always verified under ctx->lock which we're now
3725
* holding.
3726
*/
3727
RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx);
3728
RCU_INIT_POINTER(next->perf_event_ctxp, ctx);
3729
3730
do_switch = 0;
3731
3732
perf_event_sync_stat(ctx, next_ctx);
3733
}
3734
raw_spin_unlock(&next_ctx->lock);
3735
raw_spin_unlock(&ctx->lock);
3736
}
3737
unlock:
3738
rcu_read_unlock();
3739
3740
if (do_switch) {
3741
raw_spin_lock(&ctx->lock);
3742
perf_ctx_disable(ctx, false);
3743
3744
inside_switch:
3745
perf_ctx_sched_task_cb(ctx, task, false);
3746
task_ctx_sched_out(ctx, NULL, EVENT_ALL);
3747
3748
perf_ctx_enable(ctx, false);
3749
raw_spin_unlock(&ctx->lock);
3750
}
3751
}
3752
3753
static DEFINE_PER_CPU(struct list_head, sched_cb_list);
3754
static DEFINE_PER_CPU(int, perf_sched_cb_usages);
3755
3756
void perf_sched_cb_dec(struct pmu *pmu)
3757
{
3758
struct perf_cpu_pmu_context *cpc = this_cpc(pmu);
3759
3760
this_cpu_dec(perf_sched_cb_usages);
3761
barrier();
3762
3763
if (!--cpc->sched_cb_usage)
3764
list_del(&cpc->sched_cb_entry);
3765
}
3766
3767
3768
void perf_sched_cb_inc(struct pmu *pmu)
3769
{
3770
struct perf_cpu_pmu_context *cpc = this_cpc(pmu);
3771
3772
if (!cpc->sched_cb_usage++)
3773
list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
3774
3775
barrier();
3776
this_cpu_inc(perf_sched_cb_usages);
3777
}
3778
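/*
 * A hedged sketch (not an actual driver) of how a PMU driver would use the
 * pair above: bump the count while at least one of its events needs
 * context-switch callbacks, so perf_pmu_sched_task() below invokes
 * pmu::sched_task for it. needs_sched_callback() is a hypothetical
 * driver-side test:
 *
 *	static void example_pmu_add_event(struct perf_event *event)
 *	{
 *		if (needs_sched_callback(event))
 *			perf_sched_cb_inc(event->pmu);
 *	}
 *
 *	static void example_pmu_del_event(struct perf_event *event)
 *	{
 *		if (needs_sched_callback(event))
 *			perf_sched_cb_dec(event->pmu);
 *	}
 */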
3779
/*
3780
* This function provides the context switch callback to the lower code
3781
* layer. It is invoked ONLY when the context switch callback is enabled.
3782
*
3783
* This callback is relevant even to per-cpu events; for example multi event
3784
* PEBS requires this to provide PID/TID information. This requires we flush
3785
* all queued PEBS records before we context switch to a new task.
3786
*/
3787
static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc,
3788
struct task_struct *task, bool sched_in)
3789
{
3790
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
3791
struct pmu *pmu;
3792
3793
pmu = cpc->epc.pmu;
3794
3795
/* software PMUs will not have sched_task */
3796
if (WARN_ON_ONCE(!pmu->sched_task))
3797
return;
3798
3799
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3800
perf_pmu_disable(pmu);
3801
3802
pmu->sched_task(cpc->task_epc, task, sched_in);
3803
3804
perf_pmu_enable(pmu);
3805
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3806
}
3807
3808
static void perf_pmu_sched_task(struct task_struct *prev,
3809
struct task_struct *next,
3810
bool sched_in)
3811
{
3812
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
3813
struct perf_cpu_pmu_context *cpc;
3814
3815
/* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */
3816
if (prev == next || cpuctx->task_ctx)
3817
return;
3818
3819
list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry)
3820
__perf_pmu_sched_task(cpc, sched_in ? next : prev, sched_in);
3821
}
3822
3823
static void perf_event_switch(struct task_struct *task,
3824
struct task_struct *next_prev, bool sched_in);
3825
3826
/*
3827
* Called from scheduler to remove the events of the current task,
3828
* with interrupts disabled.
3829
*
3830
* We stop each event and update the event value in event->count.
3831
*
3832
* This does not protect us against NMI, but disable()
3833
* sets the disabled bit in the control field of event _before_
3834
* accessing the event control register. If a NMI hits, then it will
3835
* not restart the event.
3836
*/
3837
void __perf_event_task_sched_out(struct task_struct *task,
3838
struct task_struct *next)
3839
{
3840
if (__this_cpu_read(perf_sched_cb_usages))
3841
perf_pmu_sched_task(task, next, false);
3842
3843
if (atomic_read(&nr_switch_events))
3844
perf_event_switch(task, next, false);
3845
3846
perf_event_context_sched_out(task, next);
3847
3848
/*
3849
* if cgroup events exist on this CPU, then we need
3850
* to check if we have to switch out PMU state.
3851
* cgroup events are in system-wide mode only
3852
*/
3853
perf_cgroup_switch(next);
3854
}
3855
3856
static bool perf_less_group_idx(const void *l, const void *r, void __always_unused *args)
3857
{
3858
const struct perf_event *le = *(const struct perf_event **)l;
3859
const struct perf_event *re = *(const struct perf_event **)r;
3860
3861
return le->group_index < re->group_index;
3862
}
3863
3864
DEFINE_MIN_HEAP(struct perf_event *, perf_event_min_heap);
3865
3866
static const struct min_heap_callbacks perf_min_heap = {
3867
.less = perf_less_group_idx,
3868
.swp = NULL,
3869
};
3870
3871
static void __heap_add(struct perf_event_min_heap *heap, struct perf_event *event)
3872
{
3873
struct perf_event **itrs = heap->data;
3874
3875
if (event) {
3876
itrs[heap->nr] = event;
3877
heap->nr++;
3878
}
3879
}
3880
3881
static void __link_epc(struct perf_event_pmu_context *pmu_ctx)
3882
{
3883
struct perf_cpu_pmu_context *cpc;
3884
3885
if (!pmu_ctx->ctx->task)
3886
return;
3887
3888
cpc = this_cpc(pmu_ctx->pmu);
3889
WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
3890
cpc->task_epc = pmu_ctx;
3891
}
3892
3893
static noinline int visit_groups_merge(struct perf_event_context *ctx,
3894
struct perf_event_groups *groups, int cpu,
3895
struct pmu *pmu,
3896
int (*func)(struct perf_event *, void *),
3897
void *data)
3898
{
3899
#ifdef CONFIG_CGROUP_PERF
3900
struct cgroup_subsys_state *css = NULL;
3901
#endif
3902
struct perf_cpu_context *cpuctx = NULL;
3903
/* Space for per CPU and/or any CPU event iterators. */
3904
struct perf_event *itrs[2];
3905
struct perf_event_min_heap event_heap;
3906
struct perf_event **evt;
3907
int ret;
3908
3909
if (pmu->filter && pmu->filter(pmu, cpu))
3910
return 0;
3911
3912
if (!ctx->task) {
3913
cpuctx = this_cpu_ptr(&perf_cpu_context);
3914
event_heap = (struct perf_event_min_heap){
3915
.data = cpuctx->heap,
3916
.nr = 0,
3917
.size = cpuctx->heap_size,
3918
};
3919
3920
lockdep_assert_held(&cpuctx->ctx.lock);
3921
3922
#ifdef CONFIG_CGROUP_PERF
3923
if (cpuctx->cgrp)
3924
css = &cpuctx->cgrp->css;
3925
#endif
3926
} else {
3927
event_heap = (struct perf_event_min_heap){
3928
.data = itrs,
3929
.nr = 0,
3930
.size = ARRAY_SIZE(itrs),
3931
};
3932
/* Events not within a CPU context may be on any CPU. */
3933
__heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL));
3934
}
3935
evt = event_heap.data;
3936
3937
__heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL));
3938
3939
#ifdef CONFIG_CGROUP_PERF
3940
for (; css; css = css->parent)
3941
__heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup));
3942
#endif
3943
3944
if (event_heap.nr) {
3945
__link_epc((*evt)->pmu_ctx);
3946
perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu);
3947
}
3948
3949
min_heapify_all_inline(&event_heap, &perf_min_heap, NULL);
3950
3951
while (event_heap.nr) {
3952
ret = func(*evt, data);
3953
if (ret)
3954
return ret;
3955
3956
*evt = perf_event_groups_next(*evt, pmu);
3957
if (*evt)
3958
min_heap_sift_down_inline(&event_heap, 0, &perf_min_heap, NULL);
3959
else
3960
min_heap_pop_inline(&event_heap, &perf_min_heap, NULL);
3961
}
3962
3963
return 0;
3964
}
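/*
 * Worked example (illustrative): for a CPU context on CPU 1 whose cgroup A
 * has parent B, the heap above is seeded with up to one iterator per event
 * source: {cpu=1, no cgroup}, {cpu=1, A} and {cpu=1, B}; task contexts
 * instead get the {cpu=-1} and {cpu=this} pair. If the iterators' head
 * events have group_index 7, 3 and 9, the heap hands @func the event with
 * index 3 first, then that iterator's successor, and so on, so events are
 * always visited in global group_index (i.e. creation) order across all of
 * the merged lists.
 */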
3965
3966
/*
3967
* Because the userpage is strictly per-event (there is no concept of context,
3968
* so there cannot be a context indirection), every userpage must be updated
3969
* when context time starts :-(
3970
*
3971
* IOW, we must not miss EVENT_TIME edges.
3972
*/
3973
static inline bool event_update_userpage(struct perf_event *event)
3974
{
3975
if (likely(!refcount_read(&event->mmap_count)))
3976
return false;
3977
3978
perf_event_update_time(event);
3979
perf_event_update_userpage(event);
3980
3981
return true;
3982
}
3983
3984
static inline void group_update_userpage(struct perf_event *group_event)
3985
{
3986
struct perf_event *event;
3987
3988
if (!event_update_userpage(group_event))
3989
return;
3990
3991
for_each_sibling_event(event, group_event)
3992
event_update_userpage(event);
3993
}
3994
3995
static int merge_sched_in(struct perf_event *event, void *data)
3996
{
3997
struct perf_event_context *ctx = event->ctx;
3998
int *can_add_hw = data;
3999
4000
if (event->state <= PERF_EVENT_STATE_OFF)
4001
return 0;
4002
4003
if (!event_filter_match(event))
4004
return 0;
4005
4006
if (group_can_go_on(event, *can_add_hw)) {
4007
if (!group_sched_in(event, ctx))
4008
list_add_tail(&event->active_list, get_event_list(event));
4009
}
4010
4011
if (event->state == PERF_EVENT_STATE_INACTIVE) {
4012
*can_add_hw = 0;
4013
if (event->attr.pinned) {
4014
perf_cgroup_event_disable(event, ctx);
4015
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
4016
4017
if (*perf_event_fasync(event))
4018
event->pending_kill = POLL_ERR;
4019
4020
perf_event_wakeup(event);
4021
} else {
4022
struct perf_cpu_pmu_context *cpc = this_cpc(event->pmu_ctx->pmu);
4023
4024
event->pmu_ctx->rotate_necessary = 1;
4025
perf_mux_hrtimer_restart(cpc);
4026
group_update_userpage(event);
4027
}
4028
}
4029
4030
return 0;
4031
}
4032
4033
static void pmu_groups_sched_in(struct perf_event_context *ctx,
4034
struct perf_event_groups *groups,
4035
struct pmu *pmu)
4036
{
4037
int can_add_hw = 1;
4038
visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
4039
merge_sched_in, &can_add_hw);
4040
}
4041
4042
static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
4043
enum event_type_t event_type)
4044
{
4045
struct perf_event_context *ctx = pmu_ctx->ctx;
4046
4047
if (event_type & EVENT_PINNED)
4048
pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu);
4049
if (event_type & EVENT_FLEXIBLE)
4050
pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu);
4051
}
4052
4053
static void
4054
ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
4055
{
4056
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
4057
struct perf_event_pmu_context *pmu_ctx;
4058
int is_active = ctx->is_active;
4059
bool cgroup = event_type & EVENT_CGROUP;
4060
4061
event_type &= ~EVENT_CGROUP;
4062
4063
lockdep_assert_held(&ctx->lock);
4064
4065
if (likely(!ctx->nr_events))
4066
return;
4067
4068
if (!(is_active & EVENT_TIME)) {
4069
/* start ctx time */
4070
__update_context_time(ctx, false);
4071
perf_cgroup_set_timestamp(cpuctx);
4072
/*
4073
* CPU-release for the below ->is_active store,
4074
* see __load_acquire() in perf_event_time_now()
4075
*/
4076
barrier();
4077
}
4078
4079
ctx->is_active |= (event_type | EVENT_TIME);
4080
if (ctx->task) {
4081
if (!(is_active & EVENT_ALL))
4082
cpuctx->task_ctx = ctx;
4083
else
4084
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
4085
}
4086
4087
is_active ^= ctx->is_active; /* changed bits */
4088
4089
/*
4090
* First go through the list and put on any pinned groups
4091
* in order to give them the best chance of going on.
4092
*/
4093
if (is_active & EVENT_PINNED) {
4094
for_each_epc(pmu_ctx, ctx, pmu, cgroup)
4095
__pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED);
4096
}
4097
4098
/* Then walk through the lower prio flexible groups */
4099
if (is_active & EVENT_FLEXIBLE) {
4100
for_each_epc(pmu_ctx, ctx, pmu, cgroup)
4101
__pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE);
4102
}
4103
}
4104
4105
static void perf_event_context_sched_in(struct task_struct *task)
4106
{
4107
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
4108
struct perf_event_context *ctx;
4109
4110
rcu_read_lock();
4111
ctx = rcu_dereference(task->perf_event_ctxp);
4112
if (!ctx)
4113
goto rcu_unlock;
4114
4115
if (cpuctx->task_ctx == ctx) {
4116
perf_ctx_lock(cpuctx, ctx);
4117
perf_ctx_disable(ctx, false);
4118
4119
perf_ctx_sched_task_cb(ctx, task, true);
4120
4121
perf_ctx_enable(ctx, false);
4122
perf_ctx_unlock(cpuctx, ctx);
4123
goto rcu_unlock;
4124
}
4125
4126
perf_ctx_lock(cpuctx, ctx);
4127
/*
4128
* We must check ctx->nr_events while holding ctx->lock, such
4129
* that we serialize against perf_install_in_context().
4130
*/
4131
if (!ctx->nr_events)
4132
goto unlock;
4133
4134
perf_ctx_disable(ctx, false);
4135
/*
4136
* We want to keep the following priority order:
4137
* cpu pinned (that don't need to move), task pinned,
4138
* cpu flexible, task flexible.
4139
*
4140
* However, if the task's ctx is not carrying any pinned
4141
* events, there is no need to flip the cpuctx's events around.
4142
*/
4143
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
4144
perf_ctx_disable(&cpuctx->ctx, false);
4145
ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE);
4146
}
4147
4148
perf_event_sched_in(cpuctx, ctx, NULL);
4149
4150
perf_ctx_sched_task_cb(cpuctx->task_ctx, task, true);
4151
4152
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
4153
perf_ctx_enable(&cpuctx->ctx, false);
4154
4155
perf_ctx_enable(ctx, false);
4156
4157
unlock:
4158
perf_ctx_unlock(cpuctx, ctx);
4159
rcu_unlock:
4160
rcu_read_unlock();
4161
}
4162
4163
/*
4164
* Called from scheduler to add the events of the current task
4165
* with interrupts disabled.
4166
*
4167
* We restore the event value and then enable it.
4168
*
4169
* This does not protect us against NMI, but enable()
4170
* sets the enabled bit in the control field of event _before_
4171
* accessing the event control register. If an NMI hits, then it will
4172
* keep the event running.
4173
*/
4174
void __perf_event_task_sched_in(struct task_struct *prev,
4175
struct task_struct *task)
4176
{
4177
perf_event_context_sched_in(task);
4178
4179
if (atomic_read(&nr_switch_events))
4180
perf_event_switch(task, prev, true);
4181
4182
if (__this_cpu_read(perf_sched_cb_usages))
4183
perf_pmu_sched_task(prev, task, true);
4184
}
4185
4186
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
4187
{
4188
u64 frequency = event->attr.sample_freq;
4189
u64 sec = NSEC_PER_SEC;
4190
u64 divisor, dividend;
4191
4192
int count_fls, nsec_fls, frequency_fls, sec_fls;
4193
4194
count_fls = fls64(count);
4195
nsec_fls = fls64(nsec);
4196
frequency_fls = fls64(frequency);
4197
sec_fls = 30;
4198
4199
/*
4200
* We got @count in @nsec, with a target of sample_freq HZ;
4201
* the target period becomes:
4202
*
4203
*	period = (@count * 10^9) / (@nsec * sample_freq)
4206
*
4207
*/
4208
4209
/*
4210
* Reduce accuracy by one bit such that @a and @b converge
4211
* to a similar magnitude.
4212
*/
4213
#define REDUCE_FLS(a, b) \
4214
do { \
4215
if (a##_fls > b##_fls) { \
4216
a >>= 1; \
4217
a##_fls--; \
4218
} else { \
4219
b >>= 1; \
4220
b##_fls--; \
4221
} \
4222
} while (0)
4223
4224
/*
4225
* Reduce accuracy until either term fits in a u64, then proceed with
4226
* the other, so that finally we can do a u64/u64 division.
4227
*/
4228
while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
4229
REDUCE_FLS(nsec, frequency);
4230
REDUCE_FLS(sec, count);
4231
}
4232
4233
if (count_fls + sec_fls > 64) {
4234
divisor = nsec * frequency;
4235
4236
while (count_fls + sec_fls > 64) {
4237
REDUCE_FLS(count, sec);
4238
divisor >>= 1;
4239
}
4240
4241
dividend = count * sec;
4242
} else {
4243
dividend = count * sec;
4244
4245
while (nsec_fls + frequency_fls > 64) {
4246
REDUCE_FLS(nsec, frequency);
4247
dividend >>= 1;
4248
}
4249
4250
divisor = nsec * frequency;
4251
}
4252
4253
if (!divisor)
4254
return dividend;
4255
4256
return div64_u64(dividend, divisor);
4257
}
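/*
 * Worked example (illustrative numbers): with @count = 2,000,000 events
 * observed over @nsec = 4,000,000 ns and sample_freq = 1000, the event is
 * firing at 5 * 10^8 events/sec, so:
 *
 *	period = (2,000,000 * 10^9) / (4,000,000 * 1000) = 500,000
 *
 * i.e. one sample every 500,000 events gives roughly 1000 samples/sec.
 * The REDUCE_FLS() dance above only costs precision when the intermediate
 * products would not fit in 64 bits; it is what lets this stay a u64/u64
 * division instead of a 128-bit one.
 */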
4258
4259
static DEFINE_PER_CPU(int, perf_throttled_count);
4260
static DEFINE_PER_CPU(u64, perf_throttled_seq);
4261
4262
static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
4263
{
4264
struct hw_perf_event *hwc = &event->hw;
4265
s64 period, sample_period;
4266
s64 delta;
4267
4268
period = perf_calculate_period(event, nsec, count);
4269
4270
delta = (s64)(period - hwc->sample_period);
4271
if (delta >= 0)
4272
delta += 7;
4273
else
4274
delta -= 7;
4275
delta /= 8; /* low pass filter */
4276
4277
sample_period = hwc->sample_period + delta;
4278
4279
if (!sample_period)
4280
sample_period = 1;
4281
4282
hwc->sample_period = sample_period;
4283
4284
if (local64_read(&hwc->period_left) > 8*sample_period) {
4285
if (disable)
4286
event->pmu->stop(event, PERF_EF_UPDATE);
4287
4288
local64_set(&hwc->period_left, 0);
4289
4290
if (disable)
4291
event->pmu->start(event, PERF_EF_RELOAD);
4292
}
4293
}
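/*
 * Worked example (illustrative numbers): if hwc->sample_period is 100,000
 * and perf_calculate_period() now suggests 108,000, then delta = 8,000,
 * which the filter above turns into (8,000 + 7) / 8 = 1,000, so the period
 * only moves to 101,000. Dividing by 8 damps the reaction to short bursts,
 * while the +/-7 rounds away from zero so that small corrections are not
 * lost to integer truncation.
 */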
4294
4295
static void perf_adjust_freq_unthr_events(struct list_head *event_list)
4296
{
4297
struct perf_event *event;
4298
struct hw_perf_event *hwc;
4299
u64 now, period = TICK_NSEC;
4300
s64 delta;
4301
4302
list_for_each_entry(event, event_list, active_list) {
4303
if (event->state != PERF_EVENT_STATE_ACTIVE)
4304
continue;
4305
4306
// XXX use visit thingy to avoid the -1,cpu match
4307
if (!event_filter_match(event))
4308
continue;
4309
4310
hwc = &event->hw;
4311
4312
if (hwc->interrupts == MAX_INTERRUPTS)
4313
perf_event_unthrottle_group(event, is_event_in_freq_mode(event));
4314
4315
if (!is_event_in_freq_mode(event))
4316
continue;
4317
4318
/*
4319
* stop the event and update event->count
4320
*/
4321
event->pmu->stop(event, PERF_EF_UPDATE);
4322
4323
now = local64_read(&event->count);
4324
delta = now - hwc->freq_count_stamp;
4325
hwc->freq_count_stamp = now;
4326
4327
/*
4328
* restart the event;
4329
* reload only if the value has changed.
4330
* We have stopped the event, so tell that
4331
* to perf_adjust_period() to avoid stopping it
4332
* twice.
4333
*/
4334
if (delta > 0)
4335
perf_adjust_period(event, period, delta, false);
4336
4337
event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
4338
}
4339
}
4340
4341
/*
4342
* combine freq adjustment with unthrottling to avoid two passes over the
4343
* events. At the same time, make sure that having freq events does not change
4344
* the rate of unthrottling as that would introduce bias.
4345
*/
4346
static void
4347
perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
4348
{
4349
struct perf_event_pmu_context *pmu_ctx;
4350
4351
/*
4352
* only need to iterate over all events iff:
4353
* - the context has events in frequency mode (needs freq adjust)
4354
* - there are events to unthrottle on this cpu
4355
*/
4356
if (!(ctx->nr_freq || unthrottle))
4357
return;
4358
4359
raw_spin_lock(&ctx->lock);
4360
4361
list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
4362
if (!(pmu_ctx->nr_freq || unthrottle))
4363
continue;
4364
if (!perf_pmu_ctx_is_active(pmu_ctx))
4365
continue;
4366
if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
4367
continue;
4368
4369
perf_pmu_disable(pmu_ctx->pmu);
4370
perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
4371
perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
4372
perf_pmu_enable(pmu_ctx->pmu);
4373
}
4374
4375
raw_spin_unlock(&ctx->lock);
4376
}
4377
4378
/*
4379
* Move @event to the tail of @ctx's eligible events.
4380
*/
4381
static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
4382
{
4383
/*
4384
* Rotate the first entry to the end of the non-pinned groups. Rotation might be
4385
* disabled by the inheritance code.
4386
*/
4387
if (ctx->rotate_disable)
4388
return;
4389
4390
perf_event_groups_delete(&ctx->flexible_groups, event);
4391
perf_event_groups_insert(&ctx->flexible_groups, event);
4392
}
4393
4394
/* pick an event from the flexible_groups to rotate */
4395
static inline struct perf_event *
4396
ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx)
4397
{
4398
struct perf_event *event;
4399
struct rb_node *node;
4400
struct rb_root *tree;
4401
struct __group_key key = {
4402
.pmu = pmu_ctx->pmu,
4403
};
4404
4405
/* pick the first active flexible event */
4406
event = list_first_entry_or_null(&pmu_ctx->flexible_active,
4407
struct perf_event, active_list);
4408
if (event)
4409
goto out;
4410
4411
/* if no active flexible event, pick the first event */
4412
tree = &pmu_ctx->ctx->flexible_groups.tree;
4413
4414
if (!pmu_ctx->ctx->task) {
4415
key.cpu = smp_processor_id();
4416
4417
node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
4418
if (node)
4419
event = __node_2_pe(node);
4420
goto out;
4421
}
4422
4423
key.cpu = -1;
4424
node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
4425
if (node) {
4426
event = __node_2_pe(node);
4427
goto out;
4428
}
4429
4430
key.cpu = smp_processor_id();
4431
node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
4432
if (node)
4433
event = __node_2_pe(node);
4434
4435
out:
4436
/*
4437
* Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
4438
* finds there are unschedulable events, it will set it again.
4439
*/
4440
pmu_ctx->rotate_necessary = 0;
4441
4442
return event;
4443
}
4444
4445
static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
4446
{
4447
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
4448
struct perf_event_pmu_context *cpu_epc, *task_epc = NULL;
4449
struct perf_event *cpu_event = NULL, *task_event = NULL;
4450
int cpu_rotate, task_rotate;
4451
struct pmu *pmu;
4452
4453
/*
4454
* Since we run this from IRQ context, nobody can install new
4455
* events, thus the event count values are stable.
4456
*/
4457
4458
cpu_epc = &cpc->epc;
4459
pmu = cpu_epc->pmu;
4460
task_epc = cpc->task_epc;
4461
4462
cpu_rotate = cpu_epc->rotate_necessary;
4463
task_rotate = task_epc ? task_epc->rotate_necessary : 0;
4464
4465
if (!(cpu_rotate || task_rotate))
4466
return false;
4467
4468
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
4469
perf_pmu_disable(pmu);
4470
4471
if (task_rotate)
4472
task_event = ctx_event_to_rotate(task_epc);
4473
if (cpu_rotate)
4474
cpu_event = ctx_event_to_rotate(cpu_epc);
4475
4476
/*
4477
* As per the order given at ctx_resched(), first 'pop' task flexible
4479
* and then, if needed, CPU flexible.
4479
*/
4480
if (task_event || (task_epc && cpu_event)) {
4481
update_context_time(task_epc->ctx);
4482
__pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE);
4483
}
4484
4485
if (cpu_event) {
4486
update_context_time(&cpuctx->ctx);
4487
__pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
4488
rotate_ctx(&cpuctx->ctx, cpu_event);
4489
__pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE);
4490
}
4491
4492
if (task_event)
4493
rotate_ctx(task_epc->ctx, task_event);
4494
4495
if (task_event || (task_epc && cpu_event))
4496
__pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
4497
4498
perf_pmu_enable(pmu);
4499
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
4500
4501
return true;
4502
}
4503
4504
void perf_event_task_tick(void)
4505
{
4506
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
4507
struct perf_event_context *ctx;
4508
int throttled;
4509
4510
lockdep_assert_irqs_disabled();
4511
4512
__this_cpu_inc(perf_throttled_seq);
4513
throttled = __this_cpu_xchg(perf_throttled_count, 0);
4514
tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
4515
4516
perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled);
4517
4518
rcu_read_lock();
4519
ctx = rcu_dereference(current->perf_event_ctxp);
4520
if (ctx)
4521
perf_adjust_freq_unthr_context(ctx, !!throttled);
4522
rcu_read_unlock();
4523
}
4524
4525
static int event_enable_on_exec(struct perf_event *event,
4526
struct perf_event_context *ctx)
4527
{
4528
if (!event->attr.enable_on_exec)
4529
return 0;
4530
4531
event->attr.enable_on_exec = 0;
4532
if (event->state >= PERF_EVENT_STATE_INACTIVE)
4533
return 0;
4534
4535
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
4536
4537
return 1;
4538
}
4539
4540
/*
4541
* Enable all of a task's events that have been marked enable-on-exec.
4542
* This expects task == current.
4543
*/
4544
static void perf_event_enable_on_exec(struct perf_event_context *ctx)
4545
{
4546
struct perf_event_context *clone_ctx = NULL;
4547
enum event_type_t event_type = 0;
4548
struct perf_cpu_context *cpuctx;
4549
struct perf_event *event;
4550
unsigned long flags;
4551
int enabled = 0;
4552
4553
local_irq_save(flags);
4554
if (WARN_ON_ONCE(current->perf_event_ctxp != ctx))
4555
goto out;
4556
4557
if (!ctx->nr_events)
4558
goto out;
4559
4560
cpuctx = this_cpu_ptr(&perf_cpu_context);
4561
perf_ctx_lock(cpuctx, ctx);
4562
ctx_time_freeze(cpuctx, ctx);
4563
4564
list_for_each_entry(event, &ctx->event_list, event_entry) {
4565
enabled |= event_enable_on_exec(event, ctx);
4566
event_type |= get_event_type(event);
4567
}
4568
4569
/*
4570
* Unclone and reschedule this context if we enabled any event.
4571
*/
4572
if (enabled) {
4573
clone_ctx = unclone_ctx(ctx);
4574
ctx_resched(cpuctx, ctx, NULL, event_type);
4575
}
4576
perf_ctx_unlock(cpuctx, ctx);
4577
4578
out:
4579
local_irq_restore(flags);
4580
4581
if (clone_ctx)
4582
put_ctx(clone_ctx);
4583
}
4584
4585
static void perf_remove_from_owner(struct perf_event *event);
4586
static void perf_event_exit_event(struct perf_event *event,
4587
struct perf_event_context *ctx,
4588
struct task_struct *task,
4589
bool revoke);
4590
4591
/*
4592
* Removes all events from the current task that have been marked
4593
* remove-on-exec, and feeds their values back to parent events.
4594
*/
4595
static void perf_event_remove_on_exec(struct perf_event_context *ctx)
4596
{
4597
struct perf_event_context *clone_ctx = NULL;
4598
struct perf_event *event, *next;
4599
unsigned long flags;
4600
bool modified = false;
4601
4602
mutex_lock(&ctx->mutex);
4603
4604
if (WARN_ON_ONCE(ctx->task != current))
4605
goto unlock;
4606
4607
list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) {
4608
if (!event->attr.remove_on_exec)
4609
continue;
4610
4611
if (!is_kernel_event(event))
4612
perf_remove_from_owner(event);
4613
4614
modified = true;
4615
4616
perf_event_exit_event(event, ctx, ctx->task, false);
4617
}
4618
4619
raw_spin_lock_irqsave(&ctx->lock, flags);
4620
if (modified)
4621
clone_ctx = unclone_ctx(ctx);
4622
raw_spin_unlock_irqrestore(&ctx->lock, flags);
4623
4624
unlock:
4625
mutex_unlock(&ctx->mutex);
4626
4627
if (clone_ctx)
4628
put_ctx(clone_ctx);
4629
}
4630
4631
struct perf_read_data {
4632
struct perf_event *event;
4633
bool group;
4634
int ret;
4635
};
4636
4637
static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu);
4638
4639
static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
4640
{
4641
int local_cpu = smp_processor_id();
4642
u16 local_pkg, event_pkg;
4643
4644
if ((unsigned)event_cpu >= nr_cpu_ids)
4645
return event_cpu;
4646
4647
if (event->group_caps & PERF_EV_CAP_READ_SCOPE) {
4648
const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(event->pmu->scope, event_cpu);
4649
4650
if (cpumask && cpumask_test_cpu(local_cpu, cpumask))
4651
return local_cpu;
4652
}
4653
4654
if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
4655
event_pkg = topology_physical_package_id(event_cpu);
4656
local_pkg = topology_physical_package_id(local_cpu);
4657
4658
if (event_pkg == local_pkg)
4659
return local_cpu;
4660
}
4661
4662
return event_cpu;
4663
}
4664
4665
/*
4666
* Cross CPU call to read the hardware event
4667
*/
4668
static void __perf_event_read(void *info)
4669
{
4670
struct perf_read_data *data = info;
4671
struct perf_event *sub, *event = data->event;
4672
struct perf_event_context *ctx = event->ctx;
4673
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
4674
struct pmu *pmu = event->pmu;
4675
4676
/*
4677
* If this is a task context, we need to check whether it is
4678
* the current task context of this CPU. If not, it has been
4679
* scheduled out before the smp call arrived. In that case
4680
* event->count would have been updated to a recent sample
4681
* when the event was scheduled out.
4682
*/
4683
if (ctx->task && cpuctx->task_ctx != ctx)
4684
return;
4685
4686
raw_spin_lock(&ctx->lock);
4687
ctx_time_update_event(ctx, event);
4688
4689
perf_event_update_time(event);
4690
if (data->group)
4691
perf_event_update_sibling_time(event);
4692
4693
if (event->state != PERF_EVENT_STATE_ACTIVE)
4694
goto unlock;
4695
4696
if (!data->group) {
4697
pmu->read(event);
4698
data->ret = 0;
4699
goto unlock;
4700
}
4701
4702
pmu->start_txn(pmu, PERF_PMU_TXN_READ);
4703
4704
pmu->read(event);
4705
4706
for_each_sibling_event(sub, event)
4707
perf_pmu_read(sub);
4708
4709
data->ret = pmu->commit_txn(pmu);
4710
4711
unlock:
4712
raw_spin_unlock(&ctx->lock);
4713
}
4714
4715
static inline u64 perf_event_count(struct perf_event *event, bool self)
4716
{
4717
if (self)
4718
return local64_read(&event->count);
4719
4720
return local64_read(&event->count) + atomic64_read(&event->child_count);
4721
}
4722
4723
static void calc_timer_values(struct perf_event *event,
4724
u64 *now,
4725
u64 *enabled,
4726
u64 *running)
4727
{
4728
u64 ctx_time;
4729
4730
*now = perf_clock();
4731
ctx_time = perf_event_time_now(event, *now);
4732
__perf_update_times(event, ctx_time, enabled, running);
4733
}
4734
4735
/*
4736
* NMI-safe method to read a local event, that is an event that
4737
* is:
4738
* - either for the current task, or for this CPU
4739
* - does not have inherit set, for inherited task events
4740
* will not be local and we cannot read them atomically
4741
* - must not have a pmu::count method
4742
*/
4743
int perf_event_read_local(struct perf_event *event, u64 *value,
4744
u64 *enabled, u64 *running)
4745
{
4746
unsigned long flags;
4747
int event_oncpu;
4748
int event_cpu;
4749
int ret = 0;
4750
4751
/*
4752
* Disabling interrupts avoids all counter scheduling (context
4753
* switches, timer based rotation and IPIs).
4754
*/
4755
local_irq_save(flags);
4756
4757
/*
4758
* It must not be an event with inherit set; we cannot read
4759
* all child counters from atomic context.
4760
*/
4761
if (event->attr.inherit) {
4762
ret = -EOPNOTSUPP;
4763
goto out;
4764
}
4765
4766
/* If this is a per-task event, it must be for current */
4767
if ((event->attach_state & PERF_ATTACH_TASK) &&
4768
event->hw.target != current) {
4769
ret = -EINVAL;
4770
goto out;
4771
}
4772
4773
/*
4774
* Get the event CPU numbers, and adjust them to local if the event is
4775
* a per-package event that can be read locally
4776
*/
4777
event_oncpu = __perf_event_read_cpu(event, event->oncpu);
4778
event_cpu = __perf_event_read_cpu(event, event->cpu);
4779
4780
/* If this is a per-CPU event, it must be for this CPU */
4781
if (!(event->attach_state & PERF_ATTACH_TASK) &&
4782
event_cpu != smp_processor_id()) {
4783
ret = -EINVAL;
4784
goto out;
4785
}
4786
4787
/* If this is a pinned event it must be running on this CPU */
4788
if (event->attr.pinned && event_oncpu != smp_processor_id()) {
4789
ret = -EBUSY;
4790
goto out;
4791
}
4792
4793
/*
4794
* If the event is currently on this CPU, it's either a per-task event,
4795
* or local to this CPU. Furthermore, it means it's ACTIVE (otherwise
4796
* oncpu == -1).
4797
*/
4798
if (event_oncpu == smp_processor_id())
4799
event->pmu->read(event);
4800
4801
*value = local64_read(&event->count);
4802
if (enabled || running) {
4803
u64 __enabled, __running, __now;
4804
4805
calc_timer_values(event, &__now, &__enabled, &__running);
4806
if (enabled)
4807
*enabled = __enabled;
4808
if (running)
4809
*running = __running;
4810
}
4811
out:
4812
local_irq_restore(flags);
4813
4814
return ret;
4815
}
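/*
 * Usage sketch (illustrative, not part of this file): because this helper
 * is NMI-safe, in-kernel users such as BPF helpers can read an event they
 * own straight from IRQ or NMI context. "ev" stands for a hypothetical
 * event created for the current task or for this CPU:
 *
 *	u64 value, enabled, running;
 *
 *	if (!perf_event_read_local(ev, &value, &enabled, &running))
 *		... use value, scaling by enabled/running if the event
 *		    was multiplexed ...
 */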
4816
4817
static int perf_event_read(struct perf_event *event, bool group)
4818
{
4819
enum perf_event_state state = READ_ONCE(event->state);
4820
int event_cpu, ret = 0;
4821
4822
/*
4823
* If event is enabled and currently active on a CPU, update the
4824
* value in the event structure:
4825
*/
4826
again:
4827
if (state == PERF_EVENT_STATE_ACTIVE) {
4828
struct perf_read_data data;
4829
4830
/*
4831
* Orders the ->state and ->oncpu loads such that if we see
4832
* ACTIVE we must also see the right ->oncpu.
4833
*
4834
* Matches the smp_wmb() from event_sched_in().
4835
*/
4836
smp_rmb();
4837
4838
event_cpu = READ_ONCE(event->oncpu);
4839
if ((unsigned)event_cpu >= nr_cpu_ids)
4840
return 0;
4841
4842
data = (struct perf_read_data){
4843
.event = event,
4844
.group = group,
4845
.ret = 0,
4846
};
4847
4848
preempt_disable();
4849
event_cpu = __perf_event_read_cpu(event, event_cpu);
4850
4851
/*
4852
* Purposely ignore the smp_call_function_single() return
4853
* value.
4854
*
4855
* If event_cpu isn't a valid CPU it means the event got
4856
* scheduled out and that will have updated the event count.
4857
*
4858
* Therefore, either way, we'll have an up-to-date event count
4859
* after this.
4860
*/
4861
(void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
4862
preempt_enable();
4863
ret = data.ret;
4864
4865
} else if (state == PERF_EVENT_STATE_INACTIVE) {
4866
struct perf_event_context *ctx = event->ctx;
4867
unsigned long flags;
4868
4869
raw_spin_lock_irqsave(&ctx->lock, flags);
4870
state = event->state;
4871
if (state != PERF_EVENT_STATE_INACTIVE) {
4872
raw_spin_unlock_irqrestore(&ctx->lock, flags);
4873
goto again;
4874
}
4875
4876
/*
4877
* May read while context is not active (e.g., thread is
4878
* blocked); in that case we cannot update the context time
4879
*/
4880
ctx_time_update_event(ctx, event);
4881
4882
perf_event_update_time(event);
4883
if (group)
4884
perf_event_update_sibling_time(event);
4885
raw_spin_unlock_irqrestore(&ctx->lock, flags);
4886
}
4887
4888
return ret;
4889
}
4890
4891
/*
4892
* Initialize the perf_event context in a task_struct:
4893
*/
4894
static void __perf_event_init_context(struct perf_event_context *ctx)
4895
{
4896
raw_spin_lock_init(&ctx->lock);
4897
mutex_init(&ctx->mutex);
4898
INIT_LIST_HEAD(&ctx->pmu_ctx_list);
4899
perf_event_groups_init(&ctx->pinned_groups);
4900
perf_event_groups_init(&ctx->flexible_groups);
4901
INIT_LIST_HEAD(&ctx->event_list);
4902
refcount_set(&ctx->refcount, 1);
4903
}
4904
4905
static void
4906
__perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu)
4907
{
4908
epc->pmu = pmu;
4909
INIT_LIST_HEAD(&epc->pmu_ctx_entry);
4910
INIT_LIST_HEAD(&epc->pinned_active);
4911
INIT_LIST_HEAD(&epc->flexible_active);
4912
atomic_set(&epc->refcount, 1);
4913
}
4914
4915
static struct perf_event_context *
4916
alloc_perf_context(struct task_struct *task)
4917
{
4918
struct perf_event_context *ctx;
4919
4920
ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4921
if (!ctx)
4922
return NULL;
4923
4924
__perf_event_init_context(ctx);
4925
if (task)
4926
ctx->task = get_task_struct(task);
4927
4928
return ctx;
4929
}
4930
4931
static struct task_struct *
4932
find_lively_task_by_vpid(pid_t vpid)
4933
{
4934
struct task_struct *task;
4935
4936
rcu_read_lock();
4937
if (!vpid)
4938
task = current;
4939
else
4940
task = find_task_by_vpid(vpid);
4941
if (task)
4942
get_task_struct(task);
4943
rcu_read_unlock();
4944
4945
if (!task)
4946
return ERR_PTR(-ESRCH);
4947
4948
return task;
4949
}
4950
4951
/*
4952
* Returns a matching context with refcount and pincount.
4953
*/
4954
static struct perf_event_context *
4955
find_get_context(struct task_struct *task, struct perf_event *event)
4956
{
4957
struct perf_event_context *ctx, *clone_ctx = NULL;
4958
struct perf_cpu_context *cpuctx;
4959
unsigned long flags;
4960
int err;
4961
4962
if (!task) {
4963
/* Must be root to operate on a CPU event: */
4964
err = perf_allow_cpu();
4965
if (err)
4966
return ERR_PTR(err);
4967
4968
cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
4969
ctx = &cpuctx->ctx;
4970
get_ctx(ctx);
4971
raw_spin_lock_irqsave(&ctx->lock, flags);
4972
++ctx->pin_count;
4973
raw_spin_unlock_irqrestore(&ctx->lock, flags);
4974
4975
return ctx;
4976
}
4977
4978
err = -EINVAL;
4979
retry:
4980
ctx = perf_lock_task_context(task, &flags);
4981
if (ctx) {
4982
clone_ctx = unclone_ctx(ctx);
4983
++ctx->pin_count;
4984
4985
raw_spin_unlock_irqrestore(&ctx->lock, flags);
4986
4987
if (clone_ctx)
4988
put_ctx(clone_ctx);
4989
} else {
4990
ctx = alloc_perf_context(task);
4991
err = -ENOMEM;
4992
if (!ctx)
4993
goto errout;
4994
4995
err = 0;
4996
mutex_lock(&task->perf_event_mutex);
4997
/*
4998
* If it has already passed perf_event_exit_task(),
4999
* we must see PF_EXITING; it takes this mutex too.
5000
*/
5001
if (task->flags & PF_EXITING)
5002
err = -ESRCH;
5003
else if (task->perf_event_ctxp)
5004
err = -EAGAIN;
5005
else {
5006
get_ctx(ctx);
5007
++ctx->pin_count;
5008
rcu_assign_pointer(task->perf_event_ctxp, ctx);
5009
}
5010
mutex_unlock(&task->perf_event_mutex);
5011
5012
if (unlikely(err)) {
5013
put_ctx(ctx);
5014
5015
if (err == -EAGAIN)
5016
goto retry;
5017
goto errout;
5018
}
5019
}
5020
5021
return ctx;
5022
5023
errout:
5024
return ERR_PTR(err);
5025
}
5026
5027
static struct perf_event_pmu_context *
5028
find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
5029
struct perf_event *event)
5030
{
5031
struct perf_event_pmu_context *new = NULL, *pos = NULL, *epc;
5032
5033
if (!ctx->task) {
5034
/*
5035
* perf_pmu_migrate_context() / __perf_pmu_install_event()
5036
* rely on the fact that find_get_pmu_context() cannot fail
5037
* for CPU contexts.
5038
*/
5039
struct perf_cpu_pmu_context *cpc;
5040
5041
cpc = *per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
5042
epc = &cpc->epc;
5043
raw_spin_lock_irq(&ctx->lock);
5044
if (!epc->ctx) {
5045
/*
5046
* One extra reference for the pmu; see perf_pmu_free().
5047
*/
5048
atomic_set(&epc->refcount, 2);
5049
epc->embedded = 1;
5050
list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
5051
epc->ctx = ctx;
5052
} else {
5053
WARN_ON_ONCE(epc->ctx != ctx);
5054
atomic_inc(&epc->refcount);
5055
}
5056
raw_spin_unlock_irq(&ctx->lock);
5057
return epc;
5058
}
5059
5060
new = kzalloc(sizeof(*epc), GFP_KERNEL);
5061
if (!new)
5062
return ERR_PTR(-ENOMEM);
5063
5064
__perf_init_event_pmu_context(new, pmu);
5065
5066
/*
5067
* XXX
5068
*
5069
* lockdep_assert_held(&ctx->mutex);
5070
*
5071
* can't because perf_event_init_task() doesn't actually hold the
5072
* child_ctx->mutex.
5073
*/
5074
5075
raw_spin_lock_irq(&ctx->lock);
5076
list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
5077
if (epc->pmu == pmu) {
5078
WARN_ON_ONCE(epc->ctx != ctx);
5079
atomic_inc(&epc->refcount);
5080
goto found_epc;
5081
}
5082
/* Make sure the pmu_ctx_list is sorted by PMU type: */
5083
if (!pos && epc->pmu->type > pmu->type)
5084
pos = epc;
5085
}
5086
5087
epc = new;
5088
new = NULL;
5089
5090
if (!pos)
5091
list_add_tail(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
5092
else
5093
list_add(&epc->pmu_ctx_entry, pos->pmu_ctx_entry.prev);
5094
5095
epc->ctx = ctx;
5096
5097
found_epc:
5098
raw_spin_unlock_irq(&ctx->lock);
5099
kfree(new);
5100
5101
return epc;
5102
}
5103
5104
static void get_pmu_ctx(struct perf_event_pmu_context *epc)
5105
{
5106
WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount));
5107
}
5108
5109
static void free_cpc_rcu(struct rcu_head *head)
5110
{
5111
struct perf_cpu_pmu_context *cpc =
5112
container_of(head, typeof(*cpc), epc.rcu_head);
5113
5114
kfree(cpc);
5115
}
5116
5117
static void free_epc_rcu(struct rcu_head *head)
5118
{
5119
struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head);
5120
5121
kfree(epc);
5122
}
5123
5124
static void put_pmu_ctx(struct perf_event_pmu_context *epc)
5125
{
5126
struct perf_event_context *ctx = epc->ctx;
5127
unsigned long flags;
5128
5129
/*
5130
* XXX
5131
*
5132
* lockdep_assert_held(&ctx->mutex);
5133
*
5134
* can't because of the call-site in _free_event()/put_event()
5135
* which isn't always called under ctx->mutex.
5136
*/
5137
if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags))
5138
return;
5139
5140
WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry));
5141
5142
list_del_init(&epc->pmu_ctx_entry);
5143
epc->ctx = NULL;
5144
5145
WARN_ON_ONCE(!list_empty(&epc->pinned_active));
5146
WARN_ON_ONCE(!list_empty(&epc->flexible_active));
5147
5148
raw_spin_unlock_irqrestore(&ctx->lock, flags);
5149
5150
if (epc->embedded) {
5151
call_rcu(&epc->rcu_head, free_cpc_rcu);
5152
return;
5153
}
5154
5155
call_rcu(&epc->rcu_head, free_epc_rcu);
5156
}
5157
5158
static void perf_event_free_filter(struct perf_event *event);
5159
5160
static void free_event_rcu(struct rcu_head *head)
5161
{
5162
struct perf_event *event = container_of(head, typeof(*event), rcu_head);
5163
5164
if (event->ns)
5165
put_pid_ns(event->ns);
5166
perf_event_free_filter(event);
5167
kmem_cache_free(perf_event_cache, event);
5168
}
5169
5170
static void ring_buffer_attach(struct perf_event *event,
5171
struct perf_buffer *rb);
5172
5173
static void detach_sb_event(struct perf_event *event)
5174
{
5175
struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
5176
5177
raw_spin_lock(&pel->lock);
5178
list_del_rcu(&event->sb_list);
5179
raw_spin_unlock(&pel->lock);
5180
}
5181
5182
static bool is_sb_event(struct perf_event *event)
5183
{
5184
struct perf_event_attr *attr = &event->attr;
5185
5186
if (event->parent)
5187
return false;
5188
5189
if (event->attach_state & PERF_ATTACH_TASK)
5190
return false;
5191
5192
if (attr->mmap || attr->mmap_data || attr->mmap2 ||
5193
attr->comm || attr->comm_exec ||
5194
attr->task || attr->ksymbol ||
5195
attr->context_switch || attr->text_poke ||
5196
attr->bpf_event)
5197
return true;
5198
5199
return false;
5200
}
5201
5202
static void unaccount_pmu_sb_event(struct perf_event *event)
5203
{
5204
if (is_sb_event(event))
5205
detach_sb_event(event);
5206
}
5207
5208
#ifdef CONFIG_NO_HZ_FULL
5209
static DEFINE_SPINLOCK(nr_freq_lock);
5210
#endif
5211
5212
static void unaccount_freq_event_nohz(void)
5213
{
5214
#ifdef CONFIG_NO_HZ_FULL
5215
spin_lock(&nr_freq_lock);
5216
if (atomic_dec_and_test(&nr_freq_events))
5217
tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
5218
spin_unlock(&nr_freq_lock);
5219
#endif
5220
}
5221
5222
static void unaccount_freq_event(void)
5223
{
5224
if (tick_nohz_full_enabled())
5225
unaccount_freq_event_nohz();
5226
else
5227
atomic_dec(&nr_freq_events);
5228
}
5229
5230
5231
static struct perf_ctx_data *
5232
alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global)
5233
{
5234
struct perf_ctx_data *cd;
5235
5236
cd = kzalloc(sizeof(*cd), GFP_KERNEL);
5237
if (!cd)
5238
return NULL;
5239
5240
cd->data = kmem_cache_zalloc(ctx_cache, GFP_KERNEL);
5241
if (!cd->data) {
5242
kfree(cd);
5243
return NULL;
5244
}
5245
5246
cd->global = global;
5247
cd->ctx_cache = ctx_cache;
5248
refcount_set(&cd->refcount, 1);
5249
5250
return cd;
5251
}
5252
5253
static void free_perf_ctx_data(struct perf_ctx_data *cd)
5254
{
5255
kmem_cache_free(cd->ctx_cache, cd->data);
5256
kfree(cd);
5257
}
5258
5259
static void __free_perf_ctx_data_rcu(struct rcu_head *rcu_head)
5260
{
5261
struct perf_ctx_data *cd;
5262
5263
cd = container_of(rcu_head, struct perf_ctx_data, rcu_head);
5264
free_perf_ctx_data(cd);
5265
}
5266
5267
static inline void perf_free_ctx_data_rcu(struct perf_ctx_data *cd)
5268
{
5269
call_rcu(&cd->rcu_head, __free_perf_ctx_data_rcu);
5270
}
5271
5272
static int
5273
attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache,
5274
bool global)
5275
{
5276
struct perf_ctx_data *cd, *old = NULL;
5277
5278
cd = alloc_perf_ctx_data(ctx_cache, global);
5279
if (!cd)
5280
return -ENOMEM;
5281
5282
for (;;) {
5283
if (try_cmpxchg((struct perf_ctx_data **)&task->perf_ctx_data, &old, cd)) {
5284
if (old)
5285
perf_free_ctx_data_rcu(old);
5286
return 0;
5287
}
5288
5289
if (!old) {
5290
/*
5291
* After seeing a dead @old, we raced with
5292
* removal and lost; try again to install @cd.
5293
*/
5294
continue;
5295
}
5296
5297
if (refcount_inc_not_zero(&old->refcount)) {
5298
free_perf_ctx_data(cd); /* unused */
5299
return 0;
5300
}
5301
5302
/*
5303
* @old is a dead object, refcount==0 is stable, try and
5304
* replace it with @cd.
5305
*/
5306
}
5307
return 0;
5308
}
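/*
 * Concurrency walk-through (illustrative): if two CPUs race to attach data
 * to the same task, both allocate a @cd and try_cmpxchg() against a NULL
 * perf_ctx_data pointer; exactly one wins. The loser re-reads the winner's
 * object through @old, takes a reference with refcount_inc_not_zero() and
 * frees its own, now unused, allocation. Only when @old turns out to be
 * dead (a concurrent detach dropped the last reference) does the loop go
 * around again and try to install @cd in its place.
 */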
5309
5310
static void __detach_global_ctx_data(void);
5311
DEFINE_STATIC_PERCPU_RWSEM(global_ctx_data_rwsem);
5312
static refcount_t global_ctx_data_ref;
5313
5314
static int
5315
attach_global_ctx_data(struct kmem_cache *ctx_cache)
5316
{
5317
struct task_struct *g, *p;
5318
struct perf_ctx_data *cd;
5319
int ret;
5320
5321
if (refcount_inc_not_zero(&global_ctx_data_ref))
5322
return 0;
5323
5324
guard(percpu_write)(&global_ctx_data_rwsem);
5325
if (refcount_inc_not_zero(&global_ctx_data_ref))
5326
return 0;
5327
again:
5328
/* Allocate everything */
5329
scoped_guard (rcu) {
5330
for_each_process_thread(g, p) {
5331
cd = rcu_dereference(p->perf_ctx_data);
5332
if (cd && !cd->global) {
5333
cd->global = 1;
5334
if (!refcount_inc_not_zero(&cd->refcount))
5335
cd = NULL;
5336
}
5337
if (!cd) {
5338
get_task_struct(p);
5339
goto alloc;
5340
}
5341
}
5342
}
5343
5344
refcount_set(&global_ctx_data_ref, 1);
5345
5346
return 0;
5347
alloc:
5348
ret = attach_task_ctx_data(p, ctx_cache, true);
5349
put_task_struct(p);
5350
if (ret) {
5351
__detach_global_ctx_data();
5352
return ret;
5353
}
5354
goto again;
5355
}
5356
5357
static int
5358
attach_perf_ctx_data(struct perf_event *event)
5359
{
5360
struct task_struct *task = event->hw.target;
5361
struct kmem_cache *ctx_cache = event->pmu->task_ctx_cache;
5362
int ret;
5363
5364
if (!ctx_cache)
5365
return -ENOMEM;
5366
5367
if (task)
5368
return attach_task_ctx_data(task, ctx_cache, false);
5369
5370
ret = attach_global_ctx_data(ctx_cache);
5371
if (ret)
5372
return ret;
5373
5374
event->attach_state |= PERF_ATTACH_GLOBAL_DATA;
5375
return 0;
5376
}
5377
5378
static void
5379
detach_task_ctx_data(struct task_struct *p)
5380
{
5381
struct perf_ctx_data *cd;
5382
5383
scoped_guard (rcu) {
5384
cd = rcu_dereference(p->perf_ctx_data);
5385
if (!cd || !refcount_dec_and_test(&cd->refcount))
5386
return;
5387
}
5388
5389
/*
5390
* The old ctx_data may be lost because of the race.
5391
* Nothing needs to be done in that case.
5392
* See attach_task_ctx_data().
5393
*/
5394
if (try_cmpxchg((struct perf_ctx_data **)&p->perf_ctx_data, &cd, NULL))
5395
perf_free_ctx_data_rcu(cd);
5396
}
5397
5398
static void __detach_global_ctx_data(void)
5399
{
5400
struct task_struct *g, *p;
5401
struct perf_ctx_data *cd;
5402
5403
again:
5404
scoped_guard (rcu) {
5405
for_each_process_thread(g, p) {
5406
cd = rcu_dereference(p->perf_ctx_data);
5407
if (!cd || !cd->global)
5408
continue;
5409
cd->global = 0;
5410
get_task_struct(p);
5411
goto detach;
5412
}
5413
}
5414
return;
5415
detach:
5416
detach_task_ctx_data(p);
5417
put_task_struct(p);
5418
goto again;
5419
}
5420
5421
static void detach_global_ctx_data(void)
5422
{
5423
if (refcount_dec_not_one(&global_ctx_data_ref))
5424
return;
5425
5426
guard(percpu_write)(&global_ctx_data_rwsem);
5427
if (!refcount_dec_and_test(&global_ctx_data_ref))
5428
return;
5429
5430
/* remove everything */
5431
__detach_global_ctx_data();
5432
}
5433
5434
static void detach_perf_ctx_data(struct perf_event *event)
5435
{
5436
struct task_struct *task = event->hw.target;
5437
5438
event->attach_state &= ~PERF_ATTACH_TASK_DATA;
5439
5440
if (task)
5441
return detach_task_ctx_data(task);
5442
5443
if (event->attach_state & PERF_ATTACH_GLOBAL_DATA) {
5444
detach_global_ctx_data();
5445
event->attach_state &= ~PERF_ATTACH_GLOBAL_DATA;
5446
}
5447
}
5448
5449
static void unaccount_event(struct perf_event *event)
5450
{
5451
bool dec = false;
5452
5453
if (event->parent)
5454
return;
5455
5456
if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
5457
dec = true;
5458
if (event->attr.mmap || event->attr.mmap_data)
5459
atomic_dec(&nr_mmap_events);
5460
if (event->attr.build_id)
5461
atomic_dec(&nr_build_id_events);
5462
if (event->attr.comm)
5463
atomic_dec(&nr_comm_events);
5464
if (event->attr.namespaces)
5465
atomic_dec(&nr_namespaces_events);
5466
if (event->attr.cgroup)
5467
atomic_dec(&nr_cgroup_events);
5468
if (event->attr.task)
5469
atomic_dec(&nr_task_events);
5470
if (event->attr.freq)
5471
unaccount_freq_event();
5472
if (event->attr.context_switch) {
5473
dec = true;
5474
atomic_dec(&nr_switch_events);
5475
}
5476
if (is_cgroup_event(event))
5477
dec = true;
5478
if (has_branch_stack(event))
5479
dec = true;
5480
if (event->attr.ksymbol)
5481
atomic_dec(&nr_ksymbol_events);
5482
if (event->attr.bpf_event)
5483
atomic_dec(&nr_bpf_events);
5484
if (event->attr.text_poke)
5485
atomic_dec(&nr_text_poke_events);
5486
5487
if (dec) {
5488
if (!atomic_add_unless(&perf_sched_count, -1, 1))
5489
schedule_delayed_work(&perf_sched_work, HZ);
5490
}
5491
5492
unaccount_pmu_sb_event(event);
5493
}
5494
5495
static void perf_sched_delayed(struct work_struct *work)
5496
{
5497
mutex_lock(&perf_sched_mutex);
5498
if (atomic_dec_and_test(&perf_sched_count))
5499
static_branch_disable(&perf_sched_events);
5500
mutex_unlock(&perf_sched_mutex);
5501
}
5502
5503
/*
5504
* The following implement mutual exclusion of events on "exclusive" pmus
5505
* (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
5506
* at a time, so we disallow creating events that might conflict, namely:
5507
*
5508
* 1) cpu-wide events in the presence of per-task events,
5509
* 2) per-task events in the presence of cpu-wide events,
5510
* 3) two matching events on the same perf_event_context.
5511
*
5512
* The former two cases are handled in the allocation path (perf_event_alloc(),
5513
* _free_event()), the latter -- before the first perf_install_in_context().
5514
*/
5515
static int exclusive_event_init(struct perf_event *event)
5516
{
5517
struct pmu *pmu = event->pmu;
5518
5519
if (!is_exclusive_pmu(pmu))
5520
return 0;
5521
5522
/*
5523
* Prevent co-existence of per-task and cpu-wide events on the
5524
* same exclusive pmu.
5525
*
5526
* Negative pmu::exclusive_cnt means there are cpu-wide
5527
* events on this "exclusive" pmu, positive means there are
5528
* per-task events.
5529
*
5530
* Since this is called in perf_event_alloc() path, event::ctx
5531
* doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
5532
* to mean "per-task event", because unlike other attach states it
5533
* never gets cleared.
5534
*/
5535
if (event->attach_state & PERF_ATTACH_TASK) {
5536
if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
5537
return -EBUSY;
5538
} else {
5539
if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
5540
return -EBUSY;
5541
}
5542
5543
event->attach_state |= PERF_ATTACH_EXCLUSIVE;
5544
5545
return 0;
5546
}
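/*
 * Illustrative scenario: on an exclusive PMU, opening a per-task event
 * first drives pmu::exclusive_cnt from 0 to +1; a later attempt to open a
 * cpu-wide event on the same PMU then fails atomic_dec_unless_positive()
 * above and the caller sees -EBUSY (and vice versa for the opposite
 * ordering). Any number of events of the same kind may pass this check;
 * case 3) of the comment above is only enforced later, against a concrete
 * context, by exclusive_event_installable().
 */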
5547
5548
static void exclusive_event_destroy(struct perf_event *event)
5549
{
5550
struct pmu *pmu = event->pmu;
5551
5552
/* see comment in exclusive_event_init() */
5553
if (event->attach_state & PERF_ATTACH_TASK)
5554
atomic_dec(&pmu->exclusive_cnt);
5555
else
5556
atomic_inc(&pmu->exclusive_cnt);
5557
5558
event->attach_state &= ~PERF_ATTACH_EXCLUSIVE;
5559
}
5560
5561
static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
5562
{
5563
if ((e1->pmu == e2->pmu) &&
5564
(e1->cpu == e2->cpu ||
5565
e1->cpu == -1 ||
5566
e2->cpu == -1))
5567
return true;
5568
return false;
5569
}
5570
5571
static bool exclusive_event_installable(struct perf_event *event,
5572
struct perf_event_context *ctx)
5573
{
5574
struct perf_event *iter_event;
5575
struct pmu *pmu = event->pmu;
5576
5577
lockdep_assert_held(&ctx->mutex);
5578
5579
if (!is_exclusive_pmu(pmu))
5580
return true;
5581
5582
list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
5583
if (exclusive_event_match(iter_event, event))
5584
return false;
5585
}
5586
5587
return true;
5588
}
5589
5590
static void perf_free_addr_filters(struct perf_event *event);
5591
5592
/* vs perf_event_alloc() error */
5593
static void __free_event(struct perf_event *event)
5594
{
5595
struct pmu *pmu = event->pmu;
5596
5597
if (event->attach_state & PERF_ATTACH_CALLCHAIN)
5598
put_callchain_buffers();
5599
5600
kfree(event->addr_filter_ranges);
5601
5602
if (event->attach_state & PERF_ATTACH_EXCLUSIVE)
5603
exclusive_event_destroy(event);
5604
5605
if (is_cgroup_event(event))
5606
perf_detach_cgroup(event);
5607
5608
if (event->attach_state & PERF_ATTACH_TASK_DATA)
5609
detach_perf_ctx_data(event);
5610
5611
if (event->destroy)
5612
event->destroy(event);
5613
5614
/*
5615
* Must be after ->destroy(), due to uprobe_perf_close() using
5616
* hw.target.
5617
*/
5618
if (event->hw.target)
5619
put_task_struct(event->hw.target);
5620
5621
if (event->pmu_ctx) {
5622
/*
5623
* put_pmu_ctx() needs an event->ctx reference, because of
5624
* epc->ctx.
5625
*/
5626
WARN_ON_ONCE(!pmu);
5627
WARN_ON_ONCE(!event->ctx);
5628
WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx);
5629
put_pmu_ctx(event->pmu_ctx);
5630
}
5631
5632
/*
5633
* perf_event_free_task() relies on put_ctx() being 'last'; in
5634
* particular, all task references must be cleaned up.
5635
*/
5636
if (event->ctx)
5637
put_ctx(event->ctx);
5638
5639
if (pmu) {
5640
module_put(pmu->module);
5641
scoped_guard (spinlock, &pmu->events_lock) {
5642
list_del(&event->pmu_list);
5643
wake_up_var(pmu);
5644
}
5645
}
5646
5647
call_rcu(&event->rcu_head, free_event_rcu);
5648
}
5649
5650
DEFINE_FREE(__free_event, struct perf_event *, if (_T) __free_event(_T))
5651
5652
/* vs perf_event_alloc() success */
5653
static void _free_event(struct perf_event *event)
5654
{
5655
irq_work_sync(&event->pending_irq);
5656
irq_work_sync(&event->pending_disable_irq);
5657
5658
unaccount_event(event);
5659
5660
security_perf_event_free(event);
5661
5662
if (event->rb) {
5663
/*
5664
* Can happen when we close an event with re-directed output.
5665
*
5666
* Since we have a 0 refcount, perf_mmap_close() will skip
5667
* over us; possibly making our ring_buffer_put() the last.
5668
*/
5669
mutex_lock(&event->mmap_mutex);
5670
ring_buffer_attach(event, NULL);
5671
mutex_unlock(&event->mmap_mutex);
5672
}
5673
5674
perf_event_free_bpf_prog(event);
5675
perf_free_addr_filters(event);
5676
5677
__free_event(event);
5678
}
5679
5680
/*
5681
* Used to free events which have a known refcount of 1, such as in error paths
5682
* of inherited events.
5683
*/
5684
static void free_event(struct perf_event *event)
5685
{
5686
if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
5687
"unexpected event refcount: %ld; ptr=%p\n",
5688
atomic_long_read(&event->refcount), event)) {
5689
/* leak to avoid use-after-free */
5690
return;
5691
}
5692
5693
_free_event(event);
5694
}
5695
5696
/*
5697
* Remove user event from the owner task.
5698
*/
5699
static void perf_remove_from_owner(struct perf_event *event)
5700
{
5701
struct task_struct *owner;
5702
5703
rcu_read_lock();
5704
/*
5705
* Matches the smp_store_release() in perf_event_exit_task(). If we
5706
* observe !owner, it means the list deletion is complete and we can
5707
* indeed free this event, otherwise we need to serialize on
5708
* owner->perf_event_mutex.
5709
*/
5710
owner = READ_ONCE(event->owner);
5711
if (owner) {
5712
/*
5713
* Since delayed_put_task_struct() also drops the last
5714
* task reference we can safely take a new reference
5715
* while holding the rcu_read_lock().
5716
*/
5717
get_task_struct(owner);
5718
}
5719
rcu_read_unlock();
5720
5721
if (owner) {
5722
/*
5723
* If we're here through perf_event_exit_task() we're already
5724
* holding ctx->mutex, which would be an inversion wrt. the
5725
* normal lock order.
5726
*
5727
* However, we can safely take this lock because it's the child
5728
* ctx->mutex.
5729
*/
5730
mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
5731
5732
/*
5733
* We have to re-check the event->owner field; if it is cleared
5734
* we raced with perf_event_exit_task(). Acquiring the mutex
5735
* ensured they're done, and we can proceed with freeing the
5736
* event.
5737
*/
5738
if (event->owner) {
5739
list_del_init(&event->owner_entry);
5740
smp_store_release(&event->owner, NULL);
5741
}
5742
mutex_unlock(&owner->perf_event_mutex);
5743
put_task_struct(owner);
5744
}
5745
}
5746
5747
static void put_event(struct perf_event *event)
5748
{
5749
struct perf_event *parent;
5750
5751
if (!atomic_long_dec_and_test(&event->refcount))
5752
return;
5753
5754
parent = event->parent;
5755
_free_event(event);
5756
5757
/* Matches the refcount bump in inherit_event() */
5758
if (parent)
5759
put_event(parent);
5760
}
5761
5762
/*
5763
* Kill an event dead; while event::refcount will preserve the event
5764
* object, it will not preserve its functionality. Once the last 'user'
5765
* gives up the object, we'll destroy the thing.
5766
*/
5767
int perf_event_release_kernel(struct perf_event *event)
5768
{
5769
struct perf_event_context *ctx = event->ctx;
5770
struct perf_event *child, *tmp;
5771
5772
/*
5773
* If we got here through err_alloc: free_event(event); we will not
5774
* have attached to a context yet.
5775
*/
5776
if (!ctx) {
5777
WARN_ON_ONCE(event->attach_state &
5778
(PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
5779
goto no_ctx;
5780
}
5781
5782
if (!is_kernel_event(event))
5783
perf_remove_from_owner(event);
5784
5785
ctx = perf_event_ctx_lock(event);
5786
WARN_ON_ONCE(ctx->parent_ctx);
5787
5788
/*
5789
* Mark this event as STATE_DEAD, there is no external reference to it
5790
* anymore.
5791
*
5792
* Anybody acquiring event->child_mutex after the below loop _must_
5793
* also see this, most importantly inherit_event() which will avoid
5794
* placing more children on the list.
5795
*
5796
* Thus this guarantees that we will in fact observe and kill _ALL_
5797
* child events.
5798
*/
5799
if (event->state > PERF_EVENT_STATE_REVOKED) {
5800
perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
5801
} else {
5802
event->state = PERF_EVENT_STATE_DEAD;
5803
}
5804
5805
perf_event_ctx_unlock(event, ctx);
5806
5807
again:
5808
mutex_lock(&event->child_mutex);
5809
list_for_each_entry(child, &event->child_list, child_list) {
5810
/*
5811
* Cannot change, child events are not migrated, see the
5812
* comment with perf_event_ctx_lock_nested().
5813
*/
5814
ctx = READ_ONCE(child->ctx);
5815
/*
5816
* Since child_mutex nests inside ctx::mutex, we must jump
5817
* through hoops. We start by grabbing a reference on the ctx.
5818
*
5819
* Since the event cannot get freed while we hold the
5820
* child_mutex, the context must also exist and have a !0
5821
* reference count.
5822
*/
5823
get_ctx(ctx);
5824
5825
/*
5826
* Now that we have a ctx ref, we can drop child_mutex, and
5827
* acquire ctx::mutex without fear of it going away. Then we
5828
* can re-acquire child_mutex.
5829
*/
5830
mutex_unlock(&event->child_mutex);
5831
mutex_lock(&ctx->mutex);
5832
mutex_lock(&event->child_mutex);
5833
5834
/*
5835
* Now that we hold ctx::mutex and child_mutex, revalidate our
5836
* state, if child is still the first entry, it didn't get freed
5837
* and we can continue doing so.
5838
*/
5839
tmp = list_first_entry_or_null(&event->child_list,
5840
struct perf_event, child_list);
5841
if (tmp == child) {
5842
perf_remove_from_context(child, DETACH_GROUP | DETACH_CHILD);
5843
} else {
5844
child = NULL;
5845
}
5846
5847
mutex_unlock(&event->child_mutex);
5848
mutex_unlock(&ctx->mutex);
5849
5850
if (child) {
5851
/* Last reference unless ->pending_task work is pending */
5852
put_event(child);
5853
}
5854
put_ctx(ctx);
5855
5856
goto again;
5857
}
5858
mutex_unlock(&event->child_mutex);
5859
5860
no_ctx:
5861
/*
5862
* Last reference unless ->pending_task work is pending on this event
5863
* or any of its children.
5864
*/
5865
put_event(event);
5866
return 0;
5867
}
5868
EXPORT_SYMBOL_GPL(perf_event_release_kernel);
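/*
 * Usage sketch (illustrative, not from this file): in-kernel users pair
 * this with perf_event_create_kernel_counter(). A driver counting cycles
 * on CPU 0, for example, might do something like:
 *
 *	struct perf_event_attr attr = {
 *		.type	= PERF_TYPE_HARDWARE,
 *		.config	= PERF_COUNT_HW_CPU_CYCLES,
 *		.size	= sizeof(attr),
 *	};
 *	struct perf_event *ev;
 *
 *	ev = perf_event_create_kernel_counter(&attr, 0, NULL, NULL, NULL);
 *	if (!IS_ERR(ev)) {
 *		... use the event ...
 *		perf_event_release_kernel(ev);
 *	}
 */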
5869
5870
/*
5871
* Called when the last reference to the file is gone.
5872
*/
5873
static int perf_release(struct inode *inode, struct file *file)
5874
{
5875
perf_event_release_kernel(file->private_data);
5876
return 0;
5877
}
5878
5879
static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
5880
{
5881
struct perf_event *child;
5882
u64 total = 0;
5883
5884
*enabled = 0;
5885
*running = 0;
5886
5887
mutex_lock(&event->child_mutex);
5888
5889
(void)perf_event_read(event, false);
5890
total += perf_event_count(event, false);
5891
5892
*enabled += event->total_time_enabled +
5893
atomic64_read(&event->child_total_time_enabled);
5894
*running += event->total_time_running +
5895
atomic64_read(&event->child_total_time_running);
5896
5897
list_for_each_entry(child, &event->child_list, child_list) {
5898
(void)perf_event_read(child, false);
5899
total += perf_event_count(child, false);
5900
*enabled += child->total_time_enabled;
5901
*running += child->total_time_running;
5902
}
5903
mutex_unlock(&event->child_mutex);
5904
5905
return total;
5906
}
5907
5908
u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
5909
{
5910
struct perf_event_context *ctx;
5911
u64 count;
5912
5913
ctx = perf_event_ctx_lock(event);
5914
count = __perf_event_read_value(event, enabled, running);
5915
perf_event_ctx_unlock(event, ctx);
5916
5917
return count;
5918
}
5919
EXPORT_SYMBOL_GPL(perf_event_read_value);
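/*
 * Usage sketch (illustrative): the enabled/running times returned here let
 * in-kernel callers compensate for multiplexing, much like userspace does
 * with PERF_FORMAT_TOTAL_TIME_ENABLED/RUNNING. "ev" is a hypothetical
 * event obtained elsewhere (e.g. from perf_event_create_kernel_counter()):
 *
 *	u64 enabled, running, count;
 *
 *	count = perf_event_read_value(ev, &enabled, &running);
 *	if (running && running < enabled)
 *		... count under-reports; scale by enabled/running ...
 */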
5920
5921
static int __perf_read_group_add(struct perf_event *leader,
5922
u64 read_format, u64 *values)
5923
{
5924
struct perf_event_context *ctx = leader->ctx;
5925
struct perf_event *sub, *parent;
5926
unsigned long flags;
5927
int n = 1; /* skip @nr */
5928
int ret;
5929
5930
ret = perf_event_read(leader, true);
5931
if (ret)
5932
return ret;
5933
5934
raw_spin_lock_irqsave(&ctx->lock, flags);
5935
/*
5936
* Verify the grouping between the parent and child (inherited)
5937
* events is still intact.
5938
*
5939
* Specifically:
5940
* - leader->ctx->lock pins leader->sibling_list
5941
* - parent->child_mutex pins parent->child_list
5942
* - parent->ctx->mutex pins parent->sibling_list
5943
*
5944
* Because parent->ctx != leader->ctx (and child_list nests inside
5945
* ctx->mutex), group destruction is not atomic between children; also
5946
* see perf_event_release_kernel(). Additionally, parent can grow the
5947
* group.
5948
*
5949
* Therefore, it is possible to have parent and child groups in a
5950
* different configuration, and summing over such a beast makes no sense
5951
* whatsoever.
5952
*
5953
* Reject this.
5954
*/
5955
parent = leader->parent;
5956
if (parent &&
5957
(parent->group_generation != leader->group_generation ||
5958
parent->nr_siblings != leader->nr_siblings)) {
5959
ret = -ECHILD;
5960
goto unlock;
5961
}
5962
5963
/*
5964
* Since we co-schedule groups, {enabled,running} times of siblings
5965
* will be identical to those of the leader, so we only publish one
5966
* set.
5967
*/
5968
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5969
values[n++] += leader->total_time_enabled +
5970
atomic64_read(&leader->child_total_time_enabled);
5971
}
5972
5973
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5974
values[n++] += leader->total_time_running +
5975
atomic64_read(&leader->child_total_time_running);
5976
}
5977
5978
/*
5979
* Write {count,id} tuples for every sibling.
5980
*/
5981
values[n++] += perf_event_count(leader, false);
5982
if (read_format & PERF_FORMAT_ID)
5983
values[n++] = primary_event_id(leader);
5984
if (read_format & PERF_FORMAT_LOST)
5985
values[n++] = atomic64_read(&leader->lost_samples);
5986
5987
for_each_sibling_event(sub, leader) {
5988
values[n++] += perf_event_count(sub, false);
5989
if (read_format & PERF_FORMAT_ID)
5990
values[n++] = primary_event_id(sub);
5991
if (read_format & PERF_FORMAT_LOST)
5992
values[n++] = atomic64_read(&sub->lost_samples);
5993
}
5994
5995
unlock:
5996
raw_spin_unlock_irqrestore(&ctx->lock, flags);
5997
return ret;
5998
}
5999
6000
static int perf_read_group(struct perf_event *event,
6001
u64 read_format, char __user *buf)
6002
{
6003
struct perf_event *leader = event->group_leader, *child;
6004
struct perf_event_context *ctx = leader->ctx;
6005
int ret;
6006
u64 *values;
6007
6008
lockdep_assert_held(&ctx->mutex);
6009
6010
values = kzalloc(event->read_size, GFP_KERNEL);
6011
if (!values)
6012
return -ENOMEM;
6013
6014
values[0] = 1 + leader->nr_siblings;
6015
6016
mutex_lock(&leader->child_mutex);
6017
6018
ret = __perf_read_group_add(leader, read_format, values);
6019
if (ret)
6020
goto unlock;
6021
6022
list_for_each_entry(child, &leader->child_list, child_list) {
6023
ret = __perf_read_group_add(child, read_format, values);
6024
if (ret)
6025
goto unlock;
6026
}
6027
6028
mutex_unlock(&leader->child_mutex);
6029
6030
ret = event->read_size;
6031
if (copy_to_user(buf, values, event->read_size))
6032
ret = -EFAULT;
6033
goto out;
6034
6035
unlock:
6036
mutex_unlock(&leader->child_mutex);
6037
out:
6038
kfree(values);
6039
return ret;
6040
}
6041
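/*
 * Editor-added illustrative sketch, not part of this file: parsing the
 * buffer that perf_read_group() above copies to user space.  This layout
 * assumes attr.read_format set to PERF_FORMAT_GROUP |
 * PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING |
 * PERF_FORMAT_ID | PERF_FORMAT_LOST; "group_fd" is assumed to be the
 * group leader's fd from perf_event_open().
 */
#if 0	/* userspace example */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

struct group_entry { uint64_t value, id, lost; };

static void dump_group(int group_fd, size_t max_events)
{
	size_t bufsz = 3 * sizeof(uint64_t) +
		       max_events * sizeof(struct group_entry);
	uint64_t *buf = malloc(bufsz);

	if (buf && read(group_fd, buf, bufsz) > 0) {
		/* buf[0] = nr, buf[1] = time_enabled, buf[2] = time_running */
		struct group_entry *e = (struct group_entry *)&buf[3];

		for (uint64_t i = 0; i < buf[0]; i++)
			printf("id %llu: %llu (lost %llu)\n",
			       (unsigned long long)e[i].id,
			       (unsigned long long)e[i].value,
			       (unsigned long long)e[i].lost);
	}
	free(buf);
}
#endif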
6042
static int perf_read_one(struct perf_event *event,
6043
u64 read_format, char __user *buf)
6044
{
6045
u64 enabled, running;
6046
u64 values[5];
6047
int n = 0;
6048
6049
values[n++] = __perf_event_read_value(event, &enabled, &running);
6050
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
6051
values[n++] = enabled;
6052
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
6053
values[n++] = running;
6054
if (read_format & PERF_FORMAT_ID)
6055
values[n++] = primary_event_id(event);
6056
if (read_format & PERF_FORMAT_LOST)
6057
values[n++] = atomic64_read(&event->lost_samples);
6058
6059
if (copy_to_user(buf, values, n * sizeof(u64)))
6060
return -EFAULT;
6061
6062
return n * sizeof(u64);
6063
}
6064
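/*
 * Editor-added illustrative sketch, not part of this file: the layout that
 * perf_read_one() above produces for a single (non-group) event, assuming
 * attr.read_format enables all four optional fields.  "fd" is assumed to
 * come from perf_event_open().
 */
#if 0	/* userspace example */
#include <stdint.h>
#include <unistd.h>

struct single_read {
	uint64_t value;		/* always present */
	uint64_t time_enabled;	/* PERF_FORMAT_TOTAL_TIME_ENABLED */
	uint64_t time_running;	/* PERF_FORMAT_TOTAL_TIME_RUNNING */
	uint64_t id;		/* PERF_FORMAT_ID */
	uint64_t lost;		/* PERF_FORMAT_LOST */
};

static int read_single_counter(int fd, struct single_read *out)
{
	return read(fd, out, sizeof(*out)) == sizeof(*out) ? 0 : -1;
}
#endif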
6065
static bool is_event_hup(struct perf_event *event)
6066
{
6067
bool no_children;
6068
6069
if (event->state > PERF_EVENT_STATE_EXIT)
6070
return false;
6071
6072
mutex_lock(&event->child_mutex);
6073
no_children = list_empty(&event->child_list);
6074
mutex_unlock(&event->child_mutex);
6075
return no_children;
6076
}
6077
6078
/*
6079
* Read the performance event - simple non-blocking version for now
6080
*/
6081
static ssize_t
6082
__perf_read(struct perf_event *event, char __user *buf, size_t count)
6083
{
6084
u64 read_format = event->attr.read_format;
6085
int ret;
6086
6087
/*
6088
* Return end-of-file for a read on an event that is in
6089
* error state (i.e. because it was pinned but it couldn't be
6090
* scheduled on to the CPU at some point).
6091
*/
6092
if (event->state == PERF_EVENT_STATE_ERROR)
6093
return 0;
6094
6095
if (count < event->read_size)
6096
return -ENOSPC;
6097
6098
WARN_ON_ONCE(event->ctx->parent_ctx);
6099
if (read_format & PERF_FORMAT_GROUP)
6100
ret = perf_read_group(event, read_format, buf);
6101
else
6102
ret = perf_read_one(event, read_format, buf);
6103
6104
return ret;
6105
}
6106
6107
static ssize_t
6108
perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
6109
{
6110
struct perf_event *event = file->private_data;
6111
struct perf_event_context *ctx;
6112
int ret;
6113
6114
ret = security_perf_event_read(event);
6115
if (ret)
6116
return ret;
6117
6118
ctx = perf_event_ctx_lock(event);
6119
ret = __perf_read(event, buf, count);
6120
perf_event_ctx_unlock(event, ctx);
6121
6122
return ret;
6123
}
6124
6125
static __poll_t perf_poll(struct file *file, poll_table *wait)
6126
{
6127
struct perf_event *event = file->private_data;
6128
struct perf_buffer *rb;
6129
__poll_t events = EPOLLHUP;
6130
6131
if (event->state <= PERF_EVENT_STATE_REVOKED)
6132
return EPOLLERR;
6133
6134
poll_wait(file, &event->waitq, wait);
6135
6136
if (event->state <= PERF_EVENT_STATE_REVOKED)
6137
return EPOLLERR;
6138
6139
if (is_event_hup(event))
6140
return events;
6141
6142
if (unlikely(READ_ONCE(event->state) == PERF_EVENT_STATE_ERROR &&
6143
event->attr.pinned))
6144
return EPOLLERR;
6145
6146
/*
6147
* Pin the event->rb by taking event->mmap_mutex; otherwise
6148
* perf_event_set_output() can swizzle our rb and make us miss wakeups.
6149
*/
6150
mutex_lock(&event->mmap_mutex);
6151
rb = event->rb;
6152
if (rb)
6153
events = atomic_xchg(&rb->poll, 0);
6154
mutex_unlock(&event->mmap_mutex);
6155
return events;
6156
}
6157
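/*
 * Editor-added illustrative sketch, not part of this file: waiting for
 * ring-buffer wakeups via poll(), which is served by perf_poll() above.
 * POLLHUP is reported once the event has exited and has no children left.
 * "fd" is assumed to be a perf event fd with an mmap()ed buffer.
 */
#if 0	/* userspace example */
#include <poll.h>

static int wait_for_samples(int fd, int timeout_ms)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	int ret = poll(&pfd, 1, timeout_ms);

	if (ret > 0 && (pfd.revents & POLLHUP))
		return 0;	/* event is done; drain what's left and stop */

	return ret;		/* >0: data ready, 0: timeout, <0: error */
}
#endif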
6158
static void _perf_event_reset(struct perf_event *event)
6159
{
6160
(void)perf_event_read(event, false);
6161
local64_set(&event->count, 0);
6162
perf_event_update_userpage(event);
6163
}
6164
6165
/* Assume it's not an event with inherit set. */
6166
u64 perf_event_pause(struct perf_event *event, bool reset)
6167
{
6168
struct perf_event_context *ctx;
6169
u64 count;
6170
6171
ctx = perf_event_ctx_lock(event);
6172
WARN_ON_ONCE(event->attr.inherit);
6173
_perf_event_disable(event);
6174
count = local64_read(&event->count);
6175
if (reset)
6176
local64_set(&event->count, 0);
6177
perf_event_ctx_unlock(event, ctx);
6178
6179
return count;
6180
}
6181
EXPORT_SYMBOL_GPL(perf_event_pause);
6182
6183
/*
6184
* Holding the top-level event's child_mutex means that any
6185
* descendant process that has inherited this event will block
6186
* in perf_event_exit_event() if it goes to exit, thus satisfying the
6187
* task existence requirements of perf_event_enable/disable.
6188
*/
6189
static void perf_event_for_each_child(struct perf_event *event,
6190
void (*func)(struct perf_event *))
6191
{
6192
struct perf_event *child;
6193
6194
WARN_ON_ONCE(event->ctx->parent_ctx);
6195
6196
mutex_lock(&event->child_mutex);
6197
func(event);
6198
list_for_each_entry(child, &event->child_list, child_list)
6199
func(child);
6200
mutex_unlock(&event->child_mutex);
6201
}
6202
6203
static void perf_event_for_each(struct perf_event *event,
6204
void (*func)(struct perf_event *))
6205
{
6206
struct perf_event_context *ctx = event->ctx;
6207
struct perf_event *sibling;
6208
6209
lockdep_assert_held(&ctx->mutex);
6210
6211
event = event->group_leader;
6212
6213
perf_event_for_each_child(event, func);
6214
for_each_sibling_event(sibling, event)
6215
perf_event_for_each_child(sibling, func);
6216
}
6217
6218
static void __perf_event_period(struct perf_event *event,
6219
struct perf_cpu_context *cpuctx,
6220
struct perf_event_context *ctx,
6221
void *info)
6222
{
6223
u64 value = *((u64 *)info);
6224
bool active;
6225
6226
if (event->attr.freq) {
6227
event->attr.sample_freq = value;
6228
} else {
6229
event->attr.sample_period = value;
6230
event->hw.sample_period = value;
6231
}
6232
6233
active = (event->state == PERF_EVENT_STATE_ACTIVE);
6234
if (active) {
6235
perf_pmu_disable(event->pmu);
6236
event->pmu->stop(event, PERF_EF_UPDATE);
6237
}
6238
6239
local64_set(&event->hw.period_left, 0);
6240
6241
if (active) {
6242
event->pmu->start(event, PERF_EF_RELOAD);
6243
/*
6244
* Once the period is force-reset, the event starts immediately.
6245
* But the event/group could be throttled. Unthrottle the
6246
* event/group now to avoid the next tick trying to unthrottle
6247
* while we already re-started the event/group.
6248
*/
6249
if (event->hw.interrupts == MAX_INTERRUPTS)
6250
perf_event_unthrottle_group(event, true);
6251
perf_pmu_enable(event->pmu);
6252
}
6253
}
6254
6255
static int perf_event_check_period(struct perf_event *event, u64 value)
6256
{
6257
return event->pmu->check_period(event, value);
6258
}
6259
6260
static int _perf_event_period(struct perf_event *event, u64 value)
6261
{
6262
if (!is_sampling_event(event))
6263
return -EINVAL;
6264
6265
if (!value)
6266
return -EINVAL;
6267
6268
if (event->attr.freq) {
6269
if (value > sysctl_perf_event_sample_rate)
6270
return -EINVAL;
6271
} else {
6272
if (perf_event_check_period(event, value))
6273
return -EINVAL;
6274
if (value & (1ULL << 63))
6275
return -EINVAL;
6276
}
6277
6278
event_function_call(event, __perf_event_period, &value);
6279
6280
return 0;
6281
}
6282
6283
int perf_event_period(struct perf_event *event, u64 value)
6284
{
6285
struct perf_event_context *ctx;
6286
int ret;
6287
6288
ctx = perf_event_ctx_lock(event);
6289
ret = _perf_event_period(event, value);
6290
perf_event_ctx_unlock(event, ctx);
6291
6292
return ret;
6293
}
6294
EXPORT_SYMBOL_GPL(perf_event_period);
6295
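/*
 * Editor-added illustrative sketch, not part of this file: the user-space
 * path into _perf_event_period() above is PERF_EVENT_IOC_PERIOD, which takes
 * a pointer to the new u64 period.  "fd" is assumed to be a sampling event
 * fd from perf_event_open().
 */
#if 0	/* userspace example */
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>

static int set_sample_period(int fd, uint64_t period)
{
	/* Fails with EINVAL for non-sampling events or a zero period. */
	return ioctl(fd, PERF_EVENT_IOC_PERIOD, &period);
}
#endif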
6296
static const struct file_operations perf_fops;
6297
6298
static inline bool is_perf_file(struct fd f)
6299
{
6300
return !fd_empty(f) && fd_file(f)->f_op == &perf_fops;
6301
}
6302
6303
static int perf_event_set_output(struct perf_event *event,
6304
struct perf_event *output_event);
6305
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
6306
static int perf_copy_attr(struct perf_event_attr __user *uattr,
6307
struct perf_event_attr *attr);
6308
static int __perf_event_set_bpf_prog(struct perf_event *event,
6309
struct bpf_prog *prog,
6310
u64 bpf_cookie);
6311
6312
static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
6313
{
6314
void (*func)(struct perf_event *);
6315
u32 flags = arg;
6316
6317
if (event->state <= PERF_EVENT_STATE_REVOKED)
6318
return -ENODEV;
6319
6320
switch (cmd) {
6321
case PERF_EVENT_IOC_ENABLE:
6322
func = _perf_event_enable;
6323
break;
6324
case PERF_EVENT_IOC_DISABLE:
6325
func = _perf_event_disable;
6326
break;
6327
case PERF_EVENT_IOC_RESET:
6328
func = _perf_event_reset;
6329
break;
6330
6331
case PERF_EVENT_IOC_REFRESH:
6332
return _perf_event_refresh(event, arg);
6333
6334
case PERF_EVENT_IOC_PERIOD:
6335
{
6336
u64 value;
6337
6338
if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
6339
return -EFAULT;
6340
6341
return _perf_event_period(event, value);
6342
}
6343
case PERF_EVENT_IOC_ID:
6344
{
6345
u64 id = primary_event_id(event);
6346
6347
if (copy_to_user((void __user *)arg, &id, sizeof(id)))
6348
return -EFAULT;
6349
return 0;
6350
}
6351
6352
case PERF_EVENT_IOC_SET_OUTPUT:
6353
{
6354
CLASS(fd, output)(arg); // arg == -1 => empty
6355
struct perf_event *output_event = NULL;
6356
if (arg != -1) {
6357
if (!is_perf_file(output))
6358
return -EBADF;
6359
output_event = fd_file(output)->private_data;
6360
}
6361
return perf_event_set_output(event, output_event);
6362
}
6363
6364
case PERF_EVENT_IOC_SET_FILTER:
6365
return perf_event_set_filter(event, (void __user *)arg);
6366
6367
case PERF_EVENT_IOC_SET_BPF:
6368
{
6369
struct bpf_prog *prog;
6370
int err;
6371
6372
prog = bpf_prog_get(arg);
6373
if (IS_ERR(prog))
6374
return PTR_ERR(prog);
6375
6376
err = __perf_event_set_bpf_prog(event, prog, 0);
6377
if (err) {
6378
bpf_prog_put(prog);
6379
return err;
6380
}
6381
6382
return 0;
6383
}
6384
6385
case PERF_EVENT_IOC_PAUSE_OUTPUT: {
6386
struct perf_buffer *rb;
6387
6388
rcu_read_lock();
6389
rb = rcu_dereference(event->rb);
6390
if (!rb || !rb->nr_pages) {
6391
rcu_read_unlock();
6392
return -EINVAL;
6393
}
6394
rb_toggle_paused(rb, !!arg);
6395
rcu_read_unlock();
6396
return 0;
6397
}
6398
6399
case PERF_EVENT_IOC_QUERY_BPF:
6400
return perf_event_query_prog_array(event, (void __user *)arg);
6401
6402
case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
6403
struct perf_event_attr new_attr;
6404
int err = perf_copy_attr((struct perf_event_attr __user *)arg,
6405
&new_attr);
6406
6407
if (err)
6408
return err;
6409
6410
return perf_event_modify_attr(event, &new_attr);
6411
}
6412
default:
6413
return -ENOTTY;
6414
}
6415
6416
if (flags & PERF_IOC_FLAG_GROUP)
6417
perf_event_for_each(event, func);
6418
else
6419
perf_event_for_each_child(event, func);
6420
6421
return 0;
6422
}
6423
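/*
 * Editor-added illustrative sketch, not part of this file: redirecting one
 * event's output into another event's ring buffer through
 * PERF_EVENT_IOC_SET_OUTPUT, handled in _perf_ioctl() above.  Both fds are
 * assumed to come from perf_event_open() on compatible contexts.
 */
#if 0	/* userspace example */
#include <sys/ioctl.h>
#include <linux/perf_event.h>

static int redirect_output(int event_fd, int target_fd)
{
	/* target_fd == -1 clears a previous redirection. */
	return ioctl(event_fd, PERF_EVENT_IOC_SET_OUTPUT, target_fd);
}
#endif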
6424
static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
6425
{
6426
struct perf_event *event = file->private_data;
6427
struct perf_event_context *ctx;
6428
long ret;
6429
6430
/* Treat ioctl like writes as it is likely a mutating operation. */
6431
ret = security_perf_event_write(event);
6432
if (ret)
6433
return ret;
6434
6435
ctx = perf_event_ctx_lock(event);
6436
ret = _perf_ioctl(event, cmd, arg);
6437
perf_event_ctx_unlock(event, ctx);
6438
6439
return ret;
6440
}
6441
6442
#ifdef CONFIG_COMPAT
6443
static long perf_compat_ioctl(struct file *file, unsigned int cmd,
6444
unsigned long arg)
6445
{
6446
switch (_IOC_NR(cmd)) {
6447
case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
6448
case _IOC_NR(PERF_EVENT_IOC_ID):
6449
case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
6450
case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
6451
/* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case) */
6452
if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
6453
cmd &= ~IOCSIZE_MASK;
6454
cmd |= sizeof(void *) << IOCSIZE_SHIFT;
6455
}
6456
break;
6457
}
6458
return perf_ioctl(file, cmd, arg);
6459
}
6460
#else
6461
# define perf_compat_ioctl NULL
6462
#endif
6463
6464
int perf_event_task_enable(void)
6465
{
6466
struct perf_event_context *ctx;
6467
struct perf_event *event;
6468
6469
mutex_lock(&current->perf_event_mutex);
6470
list_for_each_entry(event, &current->perf_event_list, owner_entry) {
6471
ctx = perf_event_ctx_lock(event);
6472
perf_event_for_each_child(event, _perf_event_enable);
6473
perf_event_ctx_unlock(event, ctx);
6474
}
6475
mutex_unlock(&current->perf_event_mutex);
6476
6477
return 0;
6478
}
6479
6480
int perf_event_task_disable(void)
6481
{
6482
struct perf_event_context *ctx;
6483
struct perf_event *event;
6484
6485
mutex_lock(&current->perf_event_mutex);
6486
list_for_each_entry(event, &current->perf_event_list, owner_entry) {
6487
ctx = perf_event_ctx_lock(event);
6488
perf_event_for_each_child(event, _perf_event_disable);
6489
perf_event_ctx_unlock(event, ctx);
6490
}
6491
mutex_unlock(&current->perf_event_mutex);
6492
6493
return 0;
6494
}
6495
6496
static int perf_event_index(struct perf_event *event)
6497
{
6498
if (event->hw.state & PERF_HES_STOPPED)
6499
return 0;
6500
6501
if (event->state != PERF_EVENT_STATE_ACTIVE)
6502
return 0;
6503
6504
return event->pmu->event_idx(event);
6505
}
6506
6507
static void perf_event_init_userpage(struct perf_event *event)
6508
{
6509
struct perf_event_mmap_page *userpg;
6510
struct perf_buffer *rb;
6511
6512
rcu_read_lock();
6513
rb = rcu_dereference(event->rb);
6514
if (!rb)
6515
goto unlock;
6516
6517
userpg = rb->user_page;
6518
6519
/* Allow new userspace to detect that bit 0 is deprecated */
6520
userpg->cap_bit0_is_deprecated = 1;
6521
userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
6522
userpg->data_offset = PAGE_SIZE;
6523
userpg->data_size = perf_data_size(rb);
6524
6525
unlock:
6526
rcu_read_unlock();
6527
}
6528
6529
void __weak arch_perf_update_userpage(
6530
struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
6531
{
6532
}
6533
6534
/*
6535
* Callers need to ensure there can be no nesting of this function, otherwise
6536
* the seqlock logic goes bad. We cannot serialize this because the arch
6537
* code calls this from NMI context.
6538
*/
6539
void perf_event_update_userpage(struct perf_event *event)
6540
{
6541
struct perf_event_mmap_page *userpg;
6542
struct perf_buffer *rb;
6543
u64 enabled, running, now;
6544
6545
rcu_read_lock();
6546
rb = rcu_dereference(event->rb);
6547
if (!rb)
6548
goto unlock;
6549
6550
/*
6551
* compute total_time_enabled, total_time_running
6552
* based on snapshot values taken when the event
6553
* was last scheduled in.
6554
*
6555
* we cannot simply call update_context_time()
6556
* because of a locking issue, as we can be called in
6557
* NMI context
6558
*/
6559
calc_timer_values(event, &now, &enabled, &running);
6560
6561
userpg = rb->user_page;
6562
/*
6563
* Disable preemption to guarantee consistent time stamps are stored to
6564
* the user page.
6565
*/
6566
preempt_disable();
6567
++userpg->lock;
6568
barrier();
6569
userpg->index = perf_event_index(event);
6570
userpg->offset = perf_event_count(event, false);
6571
if (userpg->index)
6572
userpg->offset -= local64_read(&event->hw.prev_count);
6573
6574
userpg->time_enabled = enabled +
6575
atomic64_read(&event->child_total_time_enabled);
6576
6577
userpg->time_running = running +
6578
atomic64_read(&event->child_total_time_running);
6579
6580
arch_perf_update_userpage(event, userpg, now);
6581
6582
barrier();
6583
++userpg->lock;
6584
preempt_enable();
6585
unlock:
6586
rcu_read_unlock();
6587
}
6588
EXPORT_SYMBOL_GPL(perf_event_update_userpage);
6589
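/*
 * Editor-added illustrative sketch, not part of this file: the user-space
 * side of the ++userpg->lock / barrier() pairs above.  A reader of the
 * mmap()ed control page must retry until it sees the same sequence count
 * before and after reading; "pc" is assumed to be the first page of a perf
 * mmap() region.
 */
#if 0	/* userspace example */
#include <stdint.h>
#include <linux/perf_event.h>

static void read_times(volatile struct perf_event_mmap_page *pc,
		       uint64_t *enabled, uint64_t *running)
{
	uint32_t seq;

	do {
		seq = pc->lock;
		__sync_synchronize();	/* pairs with the kernel's barrier() */

		*enabled = pc->time_enabled;
		*running = pc->time_running;

		__sync_synchronize();
	} while (pc->lock != seq);
}
#endif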
6590
static void ring_buffer_attach(struct perf_event *event,
6591
struct perf_buffer *rb)
6592
{
6593
struct perf_buffer *old_rb = NULL;
6594
unsigned long flags;
6595
6596
WARN_ON_ONCE(event->parent);
6597
6598
if (event->rb) {
6599
/*
6600
* Should be impossible, we set this when removing
6601
* event->rb_entry and wait/clear when adding event->rb_entry.
6602
*/
6603
WARN_ON_ONCE(event->rcu_pending);
6604
6605
old_rb = event->rb;
6606
spin_lock_irqsave(&old_rb->event_lock, flags);
6607
list_del_rcu(&event->rb_entry);
6608
spin_unlock_irqrestore(&old_rb->event_lock, flags);
6609
6610
event->rcu_batches = get_state_synchronize_rcu();
6611
event->rcu_pending = 1;
6612
}
6613
6614
if (rb) {
6615
if (event->rcu_pending) {
6616
cond_synchronize_rcu(event->rcu_batches);
6617
event->rcu_pending = 0;
6618
}
6619
6620
spin_lock_irqsave(&rb->event_lock, flags);
6621
list_add_rcu(&event->rb_entry, &rb->event_list);
6622
spin_unlock_irqrestore(&rb->event_lock, flags);
6623
}
6624
6625
/*
6626
* Avoid racing with perf_mmap_close(AUX): stop the event
6627
* before swizzling the event::rb pointer; if it's getting
6628
* unmapped, its aux_mmap_count will be 0 and it won't
6629
* restart. See the comment in __perf_pmu_output_stop().
6630
*
6631
* Data will inevitably be lost when set_output is done in
6632
* mid-air, but then again, whoever does it like this is
6633
* not in for the data anyway.
6634
*/
6635
if (has_aux(event))
6636
perf_event_stop(event, 0);
6637
6638
rcu_assign_pointer(event->rb, rb);
6639
6640
if (old_rb) {
6641
ring_buffer_put(old_rb);
6642
/*
6643
* Because we detached before setting the new rb (so that we
6644
* could attach the new rb), we could have missed a wakeup.
6645
* Provide it now.
6646
*/
6647
wake_up_all(&event->waitq);
6648
}
6649
}
6650
6651
static void ring_buffer_wakeup(struct perf_event *event)
6652
{
6653
struct perf_buffer *rb;
6654
6655
if (event->parent)
6656
event = event->parent;
6657
6658
rcu_read_lock();
6659
rb = rcu_dereference(event->rb);
6660
if (rb) {
6661
list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
6662
wake_up_all(&event->waitq);
6663
}
6664
rcu_read_unlock();
6665
}
6666
6667
struct perf_buffer *ring_buffer_get(struct perf_event *event)
6668
{
6669
struct perf_buffer *rb;
6670
6671
if (event->parent)
6672
event = event->parent;
6673
6674
rcu_read_lock();
6675
rb = rcu_dereference(event->rb);
6676
if (rb) {
6677
if (!refcount_inc_not_zero(&rb->refcount))
6678
rb = NULL;
6679
}
6680
rcu_read_unlock();
6681
6682
return rb;
6683
}
6684
6685
void ring_buffer_put(struct perf_buffer *rb)
6686
{
6687
if (!refcount_dec_and_test(&rb->refcount))
6688
return;
6689
6690
WARN_ON_ONCE(!list_empty(&rb->event_list));
6691
6692
call_rcu(&rb->rcu_head, rb_free_rcu);
6693
}
6694
6695
typedef void (*mapped_f)(struct perf_event *event, struct mm_struct *mm);
6696
6697
#define get_mapped(event, func) \
6698
({ struct pmu *pmu; \
6699
mapped_f f = NULL; \
6700
guard(rcu)(); \
6701
pmu = READ_ONCE(event->pmu); \
6702
if (pmu) \
6703
f = pmu->func; \
6704
f; \
6705
})
6706
6707
static void perf_mmap_open(struct vm_area_struct *vma)
6708
{
6709
struct perf_event *event = vma->vm_file->private_data;
6710
mapped_f mapped = get_mapped(event, event_mapped);
6711
6712
refcount_inc(&event->mmap_count);
6713
refcount_inc(&event->rb->mmap_count);
6714
6715
if (vma->vm_pgoff)
6716
refcount_inc(&event->rb->aux_mmap_count);
6717
6718
if (mapped)
6719
mapped(event, vma->vm_mm);
6720
}
6721
6722
static void perf_pmu_output_stop(struct perf_event *event);
6723
6724
/*
6725
* A buffer can be mmap()ed multiple times; either directly through the same
6726
* event, or through other events by use of perf_event_set_output().
6727
*
6728
* In order to undo the VM accounting done by perf_mmap() we need to destroy
6729
* the buffer here, where we still have a VM context. This means we need
6730
* to detach all events redirecting to us.
6731
*/
6732
static void perf_mmap_close(struct vm_area_struct *vma)
6733
{
6734
struct perf_event *event = vma->vm_file->private_data;
6735
mapped_f unmapped = get_mapped(event, event_unmapped);
6736
struct perf_buffer *rb = ring_buffer_get(event);
6737
struct user_struct *mmap_user = rb->mmap_user;
6738
int mmap_locked = rb->mmap_locked;
6739
unsigned long size = perf_data_size(rb);
6740
bool detach_rest = false;
6741
6742
/* FIXIES vs perf_pmu_unregister() */
6743
if (unmapped)
6744
unmapped(event, vma->vm_mm);
6745
6746
/*
6747
* The AUX buffer is strictly a sub-buffer, serialize using aux_mutex
6748
* to avoid complications.
6749
*/
6750
if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
6751
refcount_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) {
6752
/*
6753
* Stop all AUX events that are writing to this buffer,
6754
* so that we can free its AUX pages and corresponding PMU
6755
* data. Note that after rb::aux_mmap_count dropped to zero,
6756
* they won't start any more (see perf_aux_output_begin()).
6757
*/
6758
perf_pmu_output_stop(event);
6759
6760
/* now it's safe to free the pages */
6761
atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
6762
atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
6763
6764
/* this has to be the last one */
6765
rb_free_aux(rb);
6766
WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
6767
6768
mutex_unlock(&rb->aux_mutex);
6769
}
6770
6771
if (refcount_dec_and_test(&rb->mmap_count))
6772
detach_rest = true;
6773
6774
if (!refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
6775
goto out_put;
6776
6777
ring_buffer_attach(event, NULL);
6778
mutex_unlock(&event->mmap_mutex);
6779
6780
/* If there's still other mmap()s of this buffer, we're done. */
6781
if (!detach_rest)
6782
goto out_put;
6783
6784
/*
6785
* No other mmap()s, detach from all other events that might redirect
6786
* into the now unreachable buffer. Somewhat complicated by the
6787
* fact that rb::event_lock otherwise nests inside mmap_mutex.
6788
*/
6789
again:
6790
rcu_read_lock();
6791
list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
6792
if (!atomic_long_inc_not_zero(&event->refcount)) {
6793
/*
6794
* This event is en-route to free_event() which will
6795
* detach it and remove it from the list.
6796
*/
6797
continue;
6798
}
6799
rcu_read_unlock();
6800
6801
mutex_lock(&event->mmap_mutex);
6802
/*
6803
* Check we didn't race with perf_event_set_output() which can
6804
* swizzle the rb from under us while we were waiting to
6805
* acquire mmap_mutex.
6806
*
6807
* If we find a different rb, ignore this event; the next
6808
* iteration will no longer find it on the list. We have to
6809
* still restart the iteration to make sure we're not now
6810
* iterating the wrong list.
6811
*/
6812
if (event->rb == rb)
6813
ring_buffer_attach(event, NULL);
6814
6815
mutex_unlock(&event->mmap_mutex);
6816
put_event(event);
6817
6818
/*
6819
* Restart the iteration; either we're on the wrong list or
6820
* we destroyed its integrity by doing a deletion.
6821
*/
6822
goto again;
6823
}
6824
rcu_read_unlock();
6825
6826
/*
6827
* It could be that there are still a few 0-ref events on the list; they'll
6828
* get cleaned up by free_event() -- they'll also still have their
6829
* ref on the rb and will free it whenever they are done with it.
6830
*
6831
* Aside from that, this buffer is 'fully' detached and unmapped,
6832
* undo the VM accounting.
6833
*/
6834
6835
atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
6836
&mmap_user->locked_vm);
6837
atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
6838
free_uid(mmap_user);
6839
6840
out_put:
6841
ring_buffer_put(rb); /* could be last */
6842
}
6843
6844
static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf)
6845
{
6846
/* The first page is the user control page, others are read-only. */
6847
return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS;
6848
}
6849
6850
static int perf_mmap_may_split(struct vm_area_struct *vma, unsigned long addr)
6851
{
6852
/*
6853
* Forbid splitting perf mappings to prevent refcount leaks due to
6854
* the resulting non-matching offsets and sizes. See open()/close().
6855
*/
6856
return -EINVAL;
6857
}
6858
6859
static const struct vm_operations_struct perf_mmap_vmops = {
6860
.open = perf_mmap_open,
6861
.close = perf_mmap_close, /* non mergeable */
6862
.pfn_mkwrite = perf_mmap_pfn_mkwrite,
6863
.may_split = perf_mmap_may_split,
6864
};
6865
6866
static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma)
6867
{
6868
unsigned long nr_pages = vma_pages(vma);
6869
int err = 0;
6870
unsigned long pagenum;
6871
6872
/*
6873
* We map this as a VM_PFNMAP VMA.
6874
*
6875
* This is not ideal as this is designed broadly for mappings of PFNs
6876
* referencing memory-mapped I/O ranges or non-system RAM i.e. for which
6877
* !pfn_valid(pfn).
6878
*
6879
* We are mapping kernel-allocated memory (memory we manage ourselves)
6880
* which would more ideally be mapped using vm_insert_page() or a
6881
* similar mechanism, that is as a VM_MIXEDMAP mapping.
6882
*
6883
* However this won't work here, because:
6884
*
6885
* 1. It uses vma->vm_page_prot, but this field has not been completely
6886
* set up at the point of the f_op->mmap() hook, so we are unable to
6887
* indicate that this should be mapped CoW in order that the
6888
* mkwrite() hook can be invoked to make the first page R/W and the
6889
* rest R/O as desired.
6890
*
6891
* 2. Anything other than a VM_PFNMAP of valid PFNs will result in
6892
* vm_normal_page() returning a struct page * pointer, which means
6893
* vm_ops->page_mkwrite() will be invoked rather than
6894
* vm_ops->pfn_mkwrite(), and this means we have to set page->mapping
6895
* to work around retry logic in the fault handler; however, this
6896
* field is no longer allowed to be used within struct page.
6897
*
6898
* 3. Having a struct page * made available in the fault logic also
6899
* means that the page gets put on the rmap and becomes
6900
* inappropriately accessible and subject to map and ref counting.
6901
*
6902
* Ideally we would have a mechanism that could explicitly express our
6903
* desires, but this is not currently the case, so we instead use
6904
* VM_PFNMAP.
6905
*
6906
* We manage the lifetime of these mappings with internal refcounts (see
6907
* perf_mmap_open() and perf_mmap_close()) so we ensure the lifetime of
6908
* this mapping is maintained correctly.
6909
*/
6910
for (pagenum = 0; pagenum < nr_pages; pagenum++) {
6911
unsigned long va = vma->vm_start + PAGE_SIZE * pagenum;
6912
struct page *page = perf_mmap_to_page(rb, vma->vm_pgoff + pagenum);
6913
6914
if (page == NULL) {
6915
err = -EINVAL;
6916
break;
6917
}
6918
6919
/* Map readonly, perf_mmap_pfn_mkwrite() called on write fault. */
6920
err = remap_pfn_range(vma, va, page_to_pfn(page), PAGE_SIZE,
6921
vm_get_page_prot(vma->vm_flags & ~VM_SHARED));
6922
if (err)
6923
break;
6924
}
6925
6926
#ifdef CONFIG_MMU
6927
/* Clear any partial mappings on error. */
6928
if (err)
6929
zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE, NULL);
6930
#endif
6931
6932
return err;
6933
}
6934
6935
static bool perf_mmap_calc_limits(struct vm_area_struct *vma, long *user_extra, long *extra)
6936
{
6937
unsigned long user_locked, user_lock_limit, locked, lock_limit;
6938
struct user_struct *user = current_user();
6939
6940
user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
6941
/* Increase the limit linearly with more CPUs */
6942
user_lock_limit *= num_online_cpus();
6943
6944
user_locked = atomic_long_read(&user->locked_vm);
6945
6946
/*
6947
* sysctl_perf_event_mlock may have changed, so that
6948
* user->locked_vm > user_lock_limit
6949
*/
6950
if (user_locked > user_lock_limit)
6951
user_locked = user_lock_limit;
6952
user_locked += *user_extra;
6953
6954
if (user_locked > user_lock_limit) {
6955
/*
6956
* charge locked_vm until it hits user_lock_limit;
6957
* charge the rest from pinned_vm
6958
*/
6959
*extra = user_locked - user_lock_limit;
6960
*user_extra -= *extra;
6961
}
6962
6963
lock_limit = rlimit(RLIMIT_MEMLOCK);
6964
lock_limit >>= PAGE_SHIFT;
6965
locked = atomic64_read(&vma->vm_mm->pinned_vm) + *extra;
6966
6967
return locked <= lock_limit || !perf_is_paranoid() || capable(CAP_IPC_LOCK);
6968
}
6969
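/*
 * Editor-added illustrative sketch, not part of this file: the allowance
 * that perf_mmap_calc_limits() above charges against the user's locked_vm,
 * recomputed from user space.  Pages beyond this are charged to the mm's
 * pinned_vm and must fit within RLIMIT_MEMLOCK unless the caller is
 * privileged.  The sysctl value would be read from
 * /proc/sys/kernel/perf_event_mlock_kb.
 */
#if 0	/* userspace example */
#include <unistd.h>

static long mlock_allowance_pages(long perf_event_mlock_kb)
{
	long page_kb = sysconf(_SC_PAGESIZE) >> 10;
	long ncpus   = sysconf(_SC_NPROCESSORS_ONLN);

	/* mirrors: user_lock_limit = sysctl >> (PAGE_SHIFT - 10), then *= ncpus */
	return (perf_event_mlock_kb / page_kb) * ncpus;
}
#endif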
6970
static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long extra)
6971
{
6972
struct user_struct *user = current_user();
6973
6974
atomic_long_add(user_extra, &user->locked_vm);
6975
atomic64_add(extra, &vma->vm_mm->pinned_vm);
6976
}
6977
6978
static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event,
6979
unsigned long nr_pages)
6980
{
6981
long extra = 0, user_extra = nr_pages;
6982
struct perf_buffer *rb;
6983
int rb_flags = 0;
6984
6985
nr_pages -= 1;
6986
6987
/*
6988
* If we have rb pages ensure they're a power-of-two number, so we
6989
* can do bitmasks instead of modulo.
6990
*/
6991
if (nr_pages != 0 && !is_power_of_2(nr_pages))
6992
return -EINVAL;
6993
6994
WARN_ON_ONCE(event->ctx->parent_ctx);
6995
6996
if (event->rb) {
6997
if (data_page_nr(event->rb) != nr_pages)
6998
return -EINVAL;
6999
7000
/*
7001
* If this event doesn't have mmap_count, we're attempting to
7002
* create an alias of another event's mmap(); this would mean
7003
* both events will end up scribbling over the same user_page,
7004
* which makes no sense.
7005
*/
7006
if (!refcount_read(&event->mmap_count))
7007
return -EBUSY;
7008
7009
if (refcount_inc_not_zero(&event->rb->mmap_count)) {
7010
/*
7011
* Success -- managed to mmap() the same buffer
7012
* multiple times.
7013
*/
7014
perf_mmap_account(vma, user_extra, extra);
7015
refcount_inc(&event->mmap_count);
7016
return 0;
7017
}
7018
7019
/*
7020
* Raced against perf_mmap_close()'s
7021
* refcount_dec_and_mutex_lock(): detach the rb from this
7022
* event and continue as if !event->rb.
7023
*/
7024
ring_buffer_attach(event, NULL);
7025
}
7026
7027
if (!perf_mmap_calc_limits(vma, &user_extra, &extra))
7028
return -EPERM;
7029
7030
if (vma->vm_flags & VM_WRITE)
7031
rb_flags |= RING_BUFFER_WRITABLE;
7032
7033
rb = rb_alloc(nr_pages,
7034
event->attr.watermark ? event->attr.wakeup_watermark : 0,
7035
event->cpu, rb_flags);
7036
7037
if (!rb)
7038
return -ENOMEM;
7039
7040
refcount_set(&rb->mmap_count, 1);
7041
rb->mmap_user = get_current_user();
7042
rb->mmap_locked = extra;
7043
7044
ring_buffer_attach(event, rb);
7045
7046
perf_event_update_time(event);
7047
perf_event_init_userpage(event);
7048
perf_event_update_userpage(event);
7049
7050
perf_mmap_account(vma, user_extra, extra);
7051
refcount_set(&event->mmap_count, 1);
7052
7053
return 0;
7054
}
7055
7056
static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event,
7057
unsigned long nr_pages)
7058
{
7059
long extra = 0, user_extra = nr_pages;
7060
u64 aux_offset, aux_size;
7061
struct perf_buffer *rb;
7062
int ret, rb_flags = 0;
7063
7064
rb = event->rb;
7065
if (!rb)
7066
return -EINVAL;
7067
7068
guard(mutex)(&rb->aux_mutex);
7069
7070
/*
7071
* AUX area mapping: if rb->aux_nr_pages != 0, it's already
7072
* mapped; all subsequent mappings should have the same size
7073
* and offset. Must be above the normal perf buffer.
7074
*/
7075
aux_offset = READ_ONCE(rb->user_page->aux_offset);
7076
aux_size = READ_ONCE(rb->user_page->aux_size);
7077
7078
if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
7079
return -EINVAL;
7080
7081
if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
7082
return -EINVAL;
7083
7084
/* already mapped with a different offset */
7085
if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
7086
return -EINVAL;
7087
7088
if (aux_size != nr_pages * PAGE_SIZE)
7089
return -EINVAL;
7090
7091
/* already mapped with a different size */
7092
if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
7093
return -EINVAL;
7094
7095
if (!is_power_of_2(nr_pages))
7096
return -EINVAL;
7097
7098
if (!refcount_inc_not_zero(&rb->mmap_count))
7099
return -EINVAL;
7100
7101
if (rb_has_aux(rb)) {
7102
refcount_inc(&rb->aux_mmap_count);
7103
7104
} else {
7105
if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) {
7106
refcount_dec(&rb->mmap_count);
7107
return -EPERM;
7108
}
7109
7110
WARN_ON(!rb && event->rb);
7111
7112
if (vma->vm_flags & VM_WRITE)
7113
rb_flags |= RING_BUFFER_WRITABLE;
7114
7115
ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
7116
event->attr.aux_watermark, rb_flags);
7117
if (ret) {
7118
refcount_dec(&rb->mmap_count);
7119
return ret;
7120
}
7121
7122
refcount_set(&rb->aux_mmap_count, 1);
7123
rb->aux_mmap_locked = extra;
7124
}
7125
7126
perf_mmap_account(vma, user_extra, extra);
7127
refcount_inc(&event->mmap_count);
7128
7129
return 0;
7130
}
7131
7132
static int perf_mmap(struct file *file, struct vm_area_struct *vma)
7133
{
7134
struct perf_event *event = file->private_data;
7135
unsigned long vma_size, nr_pages;
7136
mapped_f mapped;
7137
int ret;
7138
7139
/*
7140
* Don't allow mmap() of inherited per-task counters. This would
7141
* create a performance issue due to all children writing to the
7142
* same rb.
7143
*/
7144
if (event->cpu == -1 && event->attr.inherit)
7145
return -EINVAL;
7146
7147
if (!(vma->vm_flags & VM_SHARED))
7148
return -EINVAL;
7149
7150
ret = security_perf_event_read(event);
7151
if (ret)
7152
return ret;
7153
7154
vma_size = vma->vm_end - vma->vm_start;
7155
nr_pages = vma_size / PAGE_SIZE;
7156
7157
if (nr_pages > INT_MAX)
7158
return -ENOMEM;
7159
7160
if (vma_size != PAGE_SIZE * nr_pages)
7161
return -EINVAL;
7162
7163
scoped_guard (mutex, &event->mmap_mutex) {
7164
/*
7165
* This relies on __pmu_detach_event() taking mmap_mutex after marking
7166
* the event REVOKED. Either we observe the state, or __pmu_detach_event()
7167
* will detach the rb created here.
7168
*/
7169
if (event->state <= PERF_EVENT_STATE_REVOKED)
7170
return -ENODEV;
7171
7172
if (vma->vm_pgoff == 0)
7173
ret = perf_mmap_rb(vma, event, nr_pages);
7174
else
7175
ret = perf_mmap_aux(vma, event, nr_pages);
7176
if (ret)
7177
return ret;
7178
}
7179
7180
/*
7181
* Since pinned accounting is per vm we cannot allow fork() to copy our
7182
* vma.
7183
*/
7184
vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
7185
vma->vm_ops = &perf_mmap_vmops;
7186
7187
mapped = get_mapped(event, event_mapped);
7188
if (mapped)
7189
mapped(event, vma->vm_mm);
7190
7191
/*
7192
* Try to map it into the page table. On fail, invoke
7193
* perf_mmap_close() to undo the above, as the callsite expects
7194
* full cleanup in this case and therefore does not invoke
7195
* vmops::close().
7196
*/
7197
ret = map_range(event->rb, vma);
7198
if (ret)
7199
perf_mmap_close(vma);
7200
7201
return ret;
7202
}
7203
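/*
 * Editor-added illustrative sketch, not part of this file: the mmap()
 * geometry expected by perf_mmap_rb()/perf_mmap_aux() above.  The data area
 * is mapped at file offset 0 with 1 + 2^n pages (user page + data); the AUX
 * area is a second mapping whose offset and size must first be published in
 * the user page.  Sizes are examples only, and the event's PMU must support
 * an AUX area for the second mmap() to succeed.
 */
#if 0	/* userspace example */
#include <sys/mman.h>
#include <unistd.h>
#include <linux/perf_event.h>

static void *map_ring_and_aux(int fd, void **aux)
{
	long psz = sysconf(_SC_PAGESIZE);
	size_t data_pages = 8, aux_pages = 16;	/* must be powers of two */
	struct perf_event_mmap_page *pc;
	void *base;

	/* user page + data pages, at pgoff 0 */
	base = mmap(NULL, (1 + data_pages) * psz, PROT_READ | PROT_WRITE,
		    MAP_SHARED, fd, 0);
	if (base == MAP_FAILED)
		return NULL;

	pc = base;
	pc->aux_offset = (1 + data_pages) * psz;	/* above the data area */
	pc->aux_size   = aux_pages * psz;

	*aux = mmap(NULL, pc->aux_size, PROT_READ | PROT_WRITE,
		    MAP_SHARED, fd, pc->aux_offset);
	if (*aux == MAP_FAILED)
		*aux = NULL;

	return base;
}
#endif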
7204
static int perf_fasync(int fd, struct file *filp, int on)
7205
{
7206
struct inode *inode = file_inode(filp);
7207
struct perf_event *event = filp->private_data;
7208
int retval;
7209
7210
if (event->state <= PERF_EVENT_STATE_REVOKED)
7211
return -ENODEV;
7212
7213
inode_lock(inode);
7214
retval = fasync_helper(fd, filp, on, &event->fasync);
7215
inode_unlock(inode);
7216
7217
if (retval < 0)
7218
return retval;
7219
7220
return 0;
7221
}
7222
7223
static const struct file_operations perf_fops = {
7224
.release = perf_release,
7225
.read = perf_read,
7226
.poll = perf_poll,
7227
.unlocked_ioctl = perf_ioctl,
7228
.compat_ioctl = perf_compat_ioctl,
7229
.mmap = perf_mmap,
7230
.fasync = perf_fasync,
7231
};
7232
7233
/*
7234
* Perf event wakeup
7235
*
7236
* If there's data, ensure we set the poll() state and publish everything
7237
* to user-space before waking everybody up.
7238
*/
7239
7240
void perf_event_wakeup(struct perf_event *event)
7241
{
7242
ring_buffer_wakeup(event);
7243
7244
if (event->pending_kill) {
7245
kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
7246
event->pending_kill = 0;
7247
}
7248
}
7249
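/*
 * Editor-added illustrative sketch, not part of this file: opting in to the
 * SIGIO that kill_fasync() above delivers on wakeups.  The fasync state is
 * set up via perf_fasync(); user space enables it with F_SETOWN + O_ASYNC.
 * "fd" is assumed to be a perf event fd.
 */
#if 0	/* userspace example */
#include <fcntl.h>
#include <unistd.h>

static int enable_sigio(int fd)
{
	if (fcntl(fd, F_SETOWN, getpid()) < 0)
		return -1;

	return fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC);
}
#endif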
7250
static void perf_sigtrap(struct perf_event *event)
7251
{
7252
/*
7253
* Both perf_pending_task() and perf_pending_irq() can race with the
7254
* task exiting.
7255
*/
7256
if (current->flags & PF_EXITING)
7257
return;
7258
7259
/*
7260
* We'd expect this to only occur if the irq_work is delayed and either
7261
* ctx->task or current has changed in the meantime. This can be the
7262
* case on architectures that do not implement arch_irq_work_raise().
7263
*/
7264
if (WARN_ON_ONCE(event->ctx->task != current))
7265
return;
7266
7267
send_sig_perf((void __user *)event->pending_addr,
7268
event->orig_type, event->attr.sig_data);
7269
}
7270
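/*
 * Editor-added illustrative sketch, not part of this file: catching the
 * synchronous SIGTRAP that perf_sigtrap() above sends for events created
 * with attr.sigtrap = 1.  The si_perf_data field carries attr.sig_data and
 * si_addr carries pending_addr; this needs a libc recent enough to expose
 * the si_perf_* siginfo fields.
 */
#if 0	/* userspace example */
#include <signal.h>

static volatile unsigned long long perf_sig_data;
static volatile const void *perf_sig_addr;

static void perf_sigtrap_handler(int sig, siginfo_t *info, void *ucontext)
{
	if (info->si_code == TRAP_PERF) {
		perf_sig_addr = info->si_addr;
		perf_sig_data = info->si_perf_data;
	}
}

static int install_handler(void)
{
	struct sigaction sa = {
		.sa_sigaction = perf_sigtrap_handler,
		.sa_flags = SA_SIGINFO,
	};

	return sigaction(SIGTRAP, &sa, NULL);
}
#endif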
7271
/*
7272
* Deliver the pending work in-event-context or follow the context.
7273
*/
7274
static void __perf_pending_disable(struct perf_event *event)
7275
{
7276
int cpu = READ_ONCE(event->oncpu);
7277
7278
/*
7279
* If the event isn't running, we're done. event_sched_out() will have
7280
* taken care of things.
7281
*/
7282
if (cpu < 0)
7283
return;
7284
7285
/*
7286
* Yay, we hit home and are in the context of the event.
7287
*/
7288
if (cpu == smp_processor_id()) {
7289
if (event->pending_disable) {
7290
event->pending_disable = 0;
7291
perf_event_disable_local(event);
7292
}
7293
return;
7294
}
7295
7296
/*
7297
* CPU-A CPU-B
7298
*
7299
* perf_event_disable_inatomic()
7300
* @pending_disable = 1;
7301
* irq_work_queue();
7302
*
7303
* sched-out
7304
* @pending_disable = 0;
7305
*
7306
* sched-in
7307
* perf_event_disable_inatomic()
7308
* @pending_disable = 1;
7309
* irq_work_queue(); // FAILS
7310
*
7311
* irq_work_run()
7312
* perf_pending_disable()
7313
*
7314
* But the event runs on CPU-B and wants disabling there.
7315
*/
7316
irq_work_queue_on(&event->pending_disable_irq, cpu);
7317
}
7318
7319
static void perf_pending_disable(struct irq_work *entry)
7320
{
7321
struct perf_event *event = container_of(entry, struct perf_event, pending_disable_irq);
7322
int rctx;
7323
7324
/*
7325
* If we 'fail' here, that's OK, it means recursion is already disabled
7326
* and we won't recurse 'further'.
7327
*/
7328
rctx = perf_swevent_get_recursion_context();
7329
__perf_pending_disable(event);
7330
if (rctx >= 0)
7331
perf_swevent_put_recursion_context(rctx);
7332
}
7333
7334
static void perf_pending_irq(struct irq_work *entry)
7335
{
7336
struct perf_event *event = container_of(entry, struct perf_event, pending_irq);
7337
int rctx;
7338
7339
/*
7340
* If we 'fail' here, that's OK, it means recursion is already disabled
7341
* and we won't recurse 'further'.
7342
*/
7343
rctx = perf_swevent_get_recursion_context();
7344
7345
/*
7346
* The wakeup isn't bound to the context of the event -- it can happen
7347
* irrespective of where the event is.
7348
*/
7349
if (event->pending_wakeup) {
7350
event->pending_wakeup = 0;
7351
perf_event_wakeup(event);
7352
}
7353
7354
if (rctx >= 0)
7355
perf_swevent_put_recursion_context(rctx);
7356
}
7357
7358
static void perf_pending_task(struct callback_head *head)
7359
{
7360
struct perf_event *event = container_of(head, struct perf_event, pending_task);
7361
int rctx;
7362
7363
/*
7364
* If we 'fail' here, that's OK, it means recursion is already disabled
7365
* and we won't recurse 'further'.
7366
*/
7367
rctx = perf_swevent_get_recursion_context();
7368
7369
if (event->pending_work) {
7370
event->pending_work = 0;
7371
perf_sigtrap(event);
7372
local_dec(&event->ctx->nr_no_switch_fast);
7373
}
7374
put_event(event);
7375
7376
if (rctx >= 0)
7377
perf_swevent_put_recursion_context(rctx);
7378
}
7379
7380
#ifdef CONFIG_GUEST_PERF_EVENTS
7381
struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
7382
7383
DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state);
7384
DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip);
7385
DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr);
7386
7387
void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
7388
{
7389
if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs)))
7390
return;
7391
7392
rcu_assign_pointer(perf_guest_cbs, cbs);
7393
static_call_update(__perf_guest_state, cbs->state);
7394
static_call_update(__perf_guest_get_ip, cbs->get_ip);
7395
7396
/* Implementing ->handle_intel_pt_intr is optional. */
7397
if (cbs->handle_intel_pt_intr)
7398
static_call_update(__perf_guest_handle_intel_pt_intr,
7399
cbs->handle_intel_pt_intr);
7400
}
7401
EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
7402
7403
void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
7404
{
7405
if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs))
7406
return;
7407
7408
rcu_assign_pointer(perf_guest_cbs, NULL);
7409
static_call_update(__perf_guest_state, (void *)&__static_call_return0);
7410
static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0);
7411
static_call_update(__perf_guest_handle_intel_pt_intr,
7412
(void *)&__static_call_return0);
7413
synchronize_rcu();
7414
}
7415
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
7416
#endif
7417
7418
static bool should_sample_guest(struct perf_event *event)
7419
{
7420
return !event->attr.exclude_guest && perf_guest_state();
7421
}
7422
7423
unsigned long perf_misc_flags(struct perf_event *event,
7424
struct pt_regs *regs)
7425
{
7426
if (should_sample_guest(event))
7427
return perf_arch_guest_misc_flags(regs);
7428
7429
return perf_arch_misc_flags(regs);
7430
}
7431
7432
unsigned long perf_instruction_pointer(struct perf_event *event,
7433
struct pt_regs *regs)
7434
{
7435
if (should_sample_guest(event))
7436
return perf_guest_get_ip();
7437
7438
return perf_arch_instruction_pointer(regs);
7439
}
7440
7441
static void
7442
perf_output_sample_regs(struct perf_output_handle *handle,
7443
struct pt_regs *regs, u64 mask)
7444
{
7445
int bit;
7446
DECLARE_BITMAP(_mask, 64);
7447
7448
bitmap_from_u64(_mask, mask);
7449
for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
7450
u64 val;
7451
7452
val = perf_reg_value(regs, bit);
7453
perf_output_put(handle, val);
7454
}
7455
}
7456
7457
static void perf_sample_regs_user(struct perf_regs *regs_user,
7458
struct pt_regs *regs)
7459
{
7460
if (user_mode(regs)) {
7461
regs_user->abi = perf_reg_abi(current);
7462
regs_user->regs = regs;
7463
} else if (is_user_task(current)) {
7464
perf_get_regs_user(regs_user, regs);
7465
} else {
7466
regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
7467
regs_user->regs = NULL;
7468
}
7469
}
7470
7471
static void perf_sample_regs_intr(struct perf_regs *regs_intr,
7472
struct pt_regs *regs)
7473
{
7474
regs_intr->regs = regs;
7475
regs_intr->abi = perf_reg_abi(current);
7476
}
7477
7478
7479
/*
7480
* Get remaining task size from user stack pointer.
7481
*
7482
* It'd be better to look up the stack VMA and limit this more
7484
* precisely, but there's no way to do that safely under interrupt,
7485
* so we use TASK_SIZE as the limit.
7485
*/
7486
static u64 perf_ustack_task_size(struct pt_regs *regs)
7487
{
7488
unsigned long addr = perf_user_stack_pointer(regs);
7489
7490
if (!addr || addr >= TASK_SIZE)
7491
return 0;
7492
7493
return TASK_SIZE - addr;
7494
}
7495
7496
static u16
7497
perf_sample_ustack_size(u16 stack_size, u16 header_size,
7498
struct pt_regs *regs)
7499
{
7500
u64 task_size;
7501
7502
/* No regs, no stack pointer, no dump. */
7503
if (!regs)
7504
return 0;
7505
7506
/* No mm, no stack, no dump. */
7507
if (!current->mm)
7508
return 0;
7509
7510
/*
7511
* Check if we fit in with the requested stack size into the:
7512
* - TASK_SIZE
7513
* If we don't, we limit the size to the TASK_SIZE.
7514
*
7515
* - remaining sample size
7516
* If we don't, we customize the stack size to
7517
* fit in to the remaining sample size.
7518
*/
7519
7520
task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
7521
stack_size = min(stack_size, (u16) task_size);
7522
7523
/* Current header size plus static size and dynamic size. */
7524
header_size += 2 * sizeof(u64);
7525
7526
/* Do we fit in with the current stack dump size? */
7527
if ((u16) (header_size + stack_size) < header_size) {
7528
/*
7529
* If we overflow the maximum size for the sample,
7530
* we customize the stack dump size to fit in.
7531
*/
7532
stack_size = USHRT_MAX - header_size - sizeof(u64);
7533
stack_size = round_up(stack_size, sizeof(u64));
7534
}
7535
7536
return stack_size;
7537
}
7538
7539
static void
7540
perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
7541
struct pt_regs *regs)
7542
{
7543
/* Case of a kernel thread, nothing to dump */
7544
if (!regs) {
7545
u64 size = 0;
7546
perf_output_put(handle, size);
7547
} else {
7548
unsigned long sp;
7549
unsigned int rem;
7550
u64 dyn_size;
7551
7552
/*
7553
* We dump:
7554
* static size
7555
* - the size requested by user or the best one we can fit
7556
* in to the sample max size
7557
* data
7558
* - user stack dump data
7559
* dynamic size
7560
* - the actual dumped size
7561
*/
7562
7563
/* Static size. */
7564
perf_output_put(handle, dump_size);
7565
7566
/* Data. */
7567
sp = perf_user_stack_pointer(regs);
7568
rem = __output_copy_user(handle, (void *) sp, dump_size);
7569
dyn_size = dump_size - rem;
7570
7571
perf_output_skip(handle, rem);
7572
7573
/* Dynamic size. */
7574
perf_output_put(handle, dyn_size);
7575
}
7576
}
7577
7578
static unsigned long perf_prepare_sample_aux(struct perf_event *event,
7579
struct perf_sample_data *data,
7580
size_t size)
7581
{
7582
struct perf_event *sampler = event->aux_event;
7583
struct perf_buffer *rb;
7584
7585
data->aux_size = 0;
7586
7587
if (!sampler)
7588
goto out;
7589
7590
if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
7591
goto out;
7592
7593
if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
7594
goto out;
7595
7596
rb = ring_buffer_get(sampler);
7597
if (!rb)
7598
goto out;
7599
7600
/*
7601
* If this is an NMI hit inside sampling code, don't take
7602
* the sample. See also perf_aux_sample_output().
7603
*/
7604
if (READ_ONCE(rb->aux_in_sampling)) {
7605
data->aux_size = 0;
7606
} else {
7607
size = min_t(size_t, size, perf_aux_size(rb));
7608
data->aux_size = ALIGN(size, sizeof(u64));
7609
}
7610
ring_buffer_put(rb);
7611
7612
out:
7613
return data->aux_size;
7614
}
7615
7616
static long perf_pmu_snapshot_aux(struct perf_buffer *rb,
7617
struct perf_event *event,
7618
struct perf_output_handle *handle,
7619
unsigned long size)
7620
{
7621
unsigned long flags;
7622
long ret;
7623
7624
/*
7625
* Normal ->start()/->stop() callbacks run in IRQ mode in scheduler
7626
* paths. If we start calling them in NMI context, they may race with
7627
* the IRQ ones, that is, for example, re-starting an event that's just
7628
* been stopped, which is why we're using a separate callback that
7629
* doesn't change the event state.
7630
*
7631
* IRQs need to be disabled to prevent IPIs from racing with us.
7632
*/
7633
local_irq_save(flags);
7634
/*
7635
* Guard against NMI hits inside the critical section;
7636
* see also perf_prepare_sample_aux().
7637
*/
7638
WRITE_ONCE(rb->aux_in_sampling, 1);
7639
barrier();
7640
7641
ret = event->pmu->snapshot_aux(event, handle, size);
7642
7643
barrier();
7644
WRITE_ONCE(rb->aux_in_sampling, 0);
7645
local_irq_restore(flags);
7646
7647
return ret;
7648
}
7649
7650
static void perf_aux_sample_output(struct perf_event *event,
7651
struct perf_output_handle *handle,
7652
struct perf_sample_data *data)
7653
{
7654
struct perf_event *sampler = event->aux_event;
7655
struct perf_buffer *rb;
7656
unsigned long pad;
7657
long size;
7658
7659
if (WARN_ON_ONCE(!sampler || !data->aux_size))
7660
return;
7661
7662
rb = ring_buffer_get(sampler);
7663
if (!rb)
7664
return;
7665
7666
size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);
7667
7668
/*
7669
* An error here means that perf_output_copy() failed (returned a
7670
* non-zero surplus that it didn't copy), which in its current
7671
* enlightened implementation is not possible. If that changes, we'd
7672
* like to know.
7673
*/
7674
if (WARN_ON_ONCE(size < 0))
7675
goto out_put;
7676
7677
/*
7678
* The pad comes from ALIGN()ing data->aux_size up to u64 in
7679
* perf_prepare_sample_aux(), so it should not be more than that.
7680
*/
7681
pad = data->aux_size - size;
7682
if (WARN_ON_ONCE(pad >= sizeof(u64)))
7683
pad = 8;
7684
7685
if (pad) {
7686
u64 zero = 0;
7687
perf_output_copy(handle, &zero, pad);
7688
}
7689
7690
out_put:
7691
ring_buffer_put(rb);
7692
}
7693
7694
/*
7695
* A set of common sample data types saved even for non-sample records
7696
* when event->attr.sample_id_all is set.
7697
*/
7698
#define PERF_SAMPLE_ID_ALL (PERF_SAMPLE_TID | PERF_SAMPLE_TIME | \
7699
PERF_SAMPLE_ID | PERF_SAMPLE_STREAM_ID | \
7700
PERF_SAMPLE_CPU | PERF_SAMPLE_IDENTIFIER)
7701
7702
static void __perf_event_header__init_id(struct perf_sample_data *data,
7703
struct perf_event *event,
7704
u64 sample_type)
7705
{
7706
data->type = event->attr.sample_type;
7707
data->sample_flags |= data->type & PERF_SAMPLE_ID_ALL;
7708
7709
if (sample_type & PERF_SAMPLE_TID) {
7710
/* namespace issues */
7711
data->tid_entry.pid = perf_event_pid(event, current);
7712
data->tid_entry.tid = perf_event_tid(event, current);
7713
}
7714
7715
if (sample_type & PERF_SAMPLE_TIME)
7716
data->time = perf_event_clock(event);
7717
7718
if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
7719
data->id = primary_event_id(event);
7720
7721
if (sample_type & PERF_SAMPLE_STREAM_ID)
7722
data->stream_id = event->id;
7723
7724
if (sample_type & PERF_SAMPLE_CPU) {
7725
data->cpu_entry.cpu = raw_smp_processor_id();
7726
data->cpu_entry.reserved = 0;
7727
}
7728
}
7729
7730
void perf_event_header__init_id(struct perf_event_header *header,
7731
struct perf_sample_data *data,
7732
struct perf_event *event)
7733
{
7734
if (event->attr.sample_id_all) {
7735
header->size += event->id_header_size;
7736
__perf_event_header__init_id(data, event, event->attr.sample_type);
7737
}
7738
}
7739
7740
static void __perf_event__output_id_sample(struct perf_output_handle *handle,
7741
struct perf_sample_data *data)
7742
{
7743
u64 sample_type = data->type;
7744
7745
if (sample_type & PERF_SAMPLE_TID)
7746
perf_output_put(handle, data->tid_entry);
7747
7748
if (sample_type & PERF_SAMPLE_TIME)
7749
perf_output_put(handle, data->time);
7750
7751
if (sample_type & PERF_SAMPLE_ID)
7752
perf_output_put(handle, data->id);
7753
7754
if (sample_type & PERF_SAMPLE_STREAM_ID)
7755
perf_output_put(handle, data->stream_id);
7756
7757
if (sample_type & PERF_SAMPLE_CPU)
7758
perf_output_put(handle, data->cpu_entry);
7759
7760
if (sample_type & PERF_SAMPLE_IDENTIFIER)
7761
perf_output_put(handle, data->id);
7762
}
7763
7764
void perf_event__output_id_sample(struct perf_event *event,
7765
struct perf_output_handle *handle,
7766
struct perf_sample_data *sample)
7767
{
7768
if (event->attr.sample_id_all)
7769
__perf_event__output_id_sample(handle, sample);
7770
}
7771
7772
static void perf_output_read_one(struct perf_output_handle *handle,
7773
struct perf_event *event,
7774
u64 enabled, u64 running)
7775
{
7776
u64 read_format = event->attr.read_format;
7777
u64 values[5];
7778
int n = 0;
7779
7780
values[n++] = perf_event_count(event, has_inherit_and_sample_read(&event->attr));
7781
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
7782
values[n++] = enabled +
7783
atomic64_read(&event->child_total_time_enabled);
7784
}
7785
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
7786
values[n++] = running +
7787
atomic64_read(&event->child_total_time_running);
7788
}
7789
if (read_format & PERF_FORMAT_ID)
7790
values[n++] = primary_event_id(event);
7791
if (read_format & PERF_FORMAT_LOST)
7792
values[n++] = atomic64_read(&event->lost_samples);
7793
7794
__output_copy(handle, values, n * sizeof(u64));
7795
}
7796
7797
static void perf_output_read_group(struct perf_output_handle *handle,
7798
struct perf_event *event,
7799
u64 enabled, u64 running)
7800
{
7801
struct perf_event *leader = event->group_leader, *sub;
7802
u64 read_format = event->attr.read_format;
7803
unsigned long flags;
7804
u64 values[6];
7805
int n = 0;
7806
bool self = has_inherit_and_sample_read(&event->attr);
7807
7808
/*
7809
* Disabling interrupts avoids all counter scheduling
7810
* (context switches, timer based rotation and IPIs).
7811
*/
7812
local_irq_save(flags);
7813
7814
values[n++] = 1 + leader->nr_siblings;
7815
7816
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
7817
values[n++] = enabled;
7818
7819
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
7820
values[n++] = running;
7821
7822
if ((leader != event) && !handle->skip_read)
7823
perf_pmu_read(leader);
7824
7825
values[n++] = perf_event_count(leader, self);
7826
if (read_format & PERF_FORMAT_ID)
7827
values[n++] = primary_event_id(leader);
7828
if (read_format & PERF_FORMAT_LOST)
7829
values[n++] = atomic64_read(&leader->lost_samples);
7830
7831
__output_copy(handle, values, n * sizeof(u64));
7832
7833
for_each_sibling_event(sub, leader) {
7834
n = 0;
7835
7836
if ((sub != event) && !handle->skip_read)
7837
perf_pmu_read(sub);
7838
7839
values[n++] = perf_event_count(sub, self);
7840
if (read_format & PERF_FORMAT_ID)
7841
values[n++] = primary_event_id(sub);
7842
if (read_format & PERF_FORMAT_LOST)
7843
values[n++] = atomic64_read(&sub->lost_samples);
7844
7845
__output_copy(handle, values, n * sizeof(u64));
7846
}
7847
7848
local_irq_restore(flags);
7849
}
7850
7851
#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
7852
PERF_FORMAT_TOTAL_TIME_RUNNING)
7853
7854
/*
7855
* XXX PERF_SAMPLE_READ vs inherited events seems difficult.
7856
*
7857
* The problem is that it's both hard and excessively expensive to iterate the
7858
* child list, not to mention that it's impossible to IPI the children running
7859
* on another CPU, from interrupt/NMI context.
7860
*
7861
* Instead the combination of PERF_SAMPLE_READ and inherit will track per-thread
7862
* counts rather than attempting to accumulate some value across all children on
7863
* all cores.
7864
*/
7865
static void perf_output_read(struct perf_output_handle *handle,
7866
struct perf_event *event)
7867
{
7868
u64 enabled = 0, running = 0, now;
7869
u64 read_format = event->attr.read_format;
7870
7871
/*
7872
* compute total_time_enabled, total_time_running
7873
* based on snapshot values taken when the event
7874
* was last scheduled in.
7875
*
7876
* we cannot simply called update_context_time()
7877
* because of locking issue as we are called in
7878
* NMI context
7879
*/
7880
if (read_format & PERF_FORMAT_TOTAL_TIMES)
7881
calc_timer_values(event, &now, &enabled, &running);
7882
7883
if (event->attr.read_format & PERF_FORMAT_GROUP)
7884
perf_output_read_group(handle, event, enabled, running);
7885
else
7886
perf_output_read_one(handle, event, enabled, running);
7887
}
7888
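/*
 * Editor-added illustrative sketch, not part of this file: attribute setup
 * that requests the READ field emitted by perf_output_read() above.  As the
 * comment before it explains, combining PERF_SAMPLE_READ with inherit yields
 * per-thread counts; older kernels reject this combination outright.  The
 * period and event choice are examples only.
 */
#if 0	/* userspace example */
#include <string.h>
#include <linux/perf_event.h>

static void init_attr_sample_read(struct perf_event_attr *attr)
{
	memset(attr, 0, sizeof(*attr));
	attr->size          = sizeof(*attr);
	attr->type          = PERF_TYPE_HARDWARE;
	attr->config        = PERF_COUNT_HW_CPU_CYCLES;
	attr->sample_period = 100000;
	attr->sample_type   = PERF_SAMPLE_TID | PERF_SAMPLE_READ;
	attr->read_format   = PERF_FORMAT_TOTAL_TIME_ENABLED |
			      PERF_FORMAT_TOTAL_TIME_RUNNING |
			      PERF_FORMAT_ID;
	attr->inherit       = 1;	/* per-thread values with PERF_SAMPLE_READ */
}
#endif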
7889
void perf_output_sample(struct perf_output_handle *handle,
7890
struct perf_event_header *header,
7891
struct perf_sample_data *data,
7892
struct perf_event *event)
7893
{
7894
u64 sample_type = data->type;
7895
7896
if (data->sample_flags & PERF_SAMPLE_READ)
7897
handle->skip_read = 1;
7898
7899
perf_output_put(handle, *header);
7900
7901
if (sample_type & PERF_SAMPLE_IDENTIFIER)
7902
perf_output_put(handle, data->id);
7903
7904
if (sample_type & PERF_SAMPLE_IP)
7905
perf_output_put(handle, data->ip);
7906
7907
if (sample_type & PERF_SAMPLE_TID)
7908
perf_output_put(handle, data->tid_entry);
7909
7910
if (sample_type & PERF_SAMPLE_TIME)
7911
perf_output_put(handle, data->time);
7912
7913
if (sample_type & PERF_SAMPLE_ADDR)
7914
perf_output_put(handle, data->addr);
7915
7916
if (sample_type & PERF_SAMPLE_ID)
7917
perf_output_put(handle, data->id);
7918
7919
if (sample_type & PERF_SAMPLE_STREAM_ID)
7920
perf_output_put(handle, data->stream_id);
7921
7922
if (sample_type & PERF_SAMPLE_CPU)
7923
perf_output_put(handle, data->cpu_entry);
7924
7925
if (sample_type & PERF_SAMPLE_PERIOD)
7926
perf_output_put(handle, data->period);
7927
7928
if (sample_type & PERF_SAMPLE_READ)
7929
perf_output_read(handle, event);
7930
7931
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
7932
int size = 1;
7933
7934
size += data->callchain->nr;
7935
size *= sizeof(u64);
7936
__output_copy(handle, data->callchain, size);
7937
}
7938
7939
if (sample_type & PERF_SAMPLE_RAW) {
7940
struct perf_raw_record *raw = data->raw;
7941
7942
if (raw) {
7943
struct perf_raw_frag *frag = &raw->frag;
7944
7945
perf_output_put(handle, raw->size);
7946
do {
7947
if (frag->copy) {
7948
__output_custom(handle, frag->copy,
7949
frag->data, frag->size);
7950
} else {
7951
__output_copy(handle, frag->data,
7952
frag->size);
7953
}
7954
if (perf_raw_frag_last(frag))
7955
break;
7956
frag = frag->next;
7957
} while (1);
7958
if (frag->pad)
7959
__output_skip(handle, NULL, frag->pad);
7960
} else {
7961
struct {
7962
u32 size;
7963
u32 data;
7964
} raw = {
7965
.size = sizeof(u32),
7966
.data = 0,
7967
};
7968
perf_output_put(handle, raw);
7969
}
7970
}
7971
7972
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
7973
if (data->br_stack) {
7974
size_t size;
7975
7976
size = data->br_stack->nr
7977
* sizeof(struct perf_branch_entry);
7978
7979
perf_output_put(handle, data->br_stack->nr);
7980
if (branch_sample_hw_index(event))
7981
perf_output_put(handle, data->br_stack->hw_idx);
7982
perf_output_copy(handle, data->br_stack->entries, size);
7983
/*
7984
* Add the extension space which is appended
7985
* right after the struct perf_branch_stack.
7986
*/
7987
if (data->br_stack_cntr) {
7988
size = data->br_stack->nr * sizeof(u64);
7989
perf_output_copy(handle, data->br_stack_cntr, size);
7990
}
7991
} else {
7992
/*
7993
* we always store at least the value of nr
7994
*/
7995
u64 nr = 0;
7996
perf_output_put(handle, nr);
7997
}
7998
}
7999
8000
if (sample_type & PERF_SAMPLE_REGS_USER) {
8001
u64 abi = data->regs_user.abi;
8002
8003
/*
8004
* If there are no regs to dump, notice it through the
8005
* first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
8006
*/
8007
perf_output_put(handle, abi);
8008
8009
if (abi) {
8010
u64 mask = event->attr.sample_regs_user;
8011
perf_output_sample_regs(handle,
8012
data->regs_user.regs,
8013
mask);
8014
}
8015
}
8016
8017
if (sample_type & PERF_SAMPLE_STACK_USER) {
8018
perf_output_sample_ustack(handle,
8019
data->stack_user_size,
8020
data->regs_user.regs);
8021
}
8022
8023
if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
8024
perf_output_put(handle, data->weight.full);
8025
8026
if (sample_type & PERF_SAMPLE_DATA_SRC)
8027
perf_output_put(handle, data->data_src.val);
8028
8029
if (sample_type & PERF_SAMPLE_TRANSACTION)
8030
perf_output_put(handle, data->txn);
8031
8032
if (sample_type & PERF_SAMPLE_REGS_INTR) {
8033
u64 abi = data->regs_intr.abi;
8034
/*
8035
* If there are no regs to dump, notice it through the
8036
* first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
8037
*/
8038
perf_output_put(handle, abi);
8039
8040
if (abi) {
8041
u64 mask = event->attr.sample_regs_intr;
8042
8043
perf_output_sample_regs(handle,
8044
data->regs_intr.regs,
8045
mask);
8046
}
8047
}
8048
8049
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
8050
perf_output_put(handle, data->phys_addr);
8051
8052
if (sample_type & PERF_SAMPLE_CGROUP)
8053
perf_output_put(handle, data->cgroup);
8054
8055
if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
8056
perf_output_put(handle, data->data_page_size);
8057
8058
if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
8059
perf_output_put(handle, data->code_page_size);
8060
8061
if (sample_type & PERF_SAMPLE_AUX) {
8062
perf_output_put(handle, data->aux_size);
8063
8064
if (data->aux_size)
8065
perf_aux_sample_output(event, handle, data);
8066
}
8067
8068
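/*
 * Wakeup accounting: when the event counts wakeup_events rather than
 * using a byte watermark, each emitted sample bumps rb->events and a
 * ring-buffer wakeup is requested once attr.wakeup_events samples have
 * accumulated.
 */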
if (!event->attr.watermark) {
8069
int wakeup_events = event->attr.wakeup_events;
8070
8071
if (wakeup_events) {
8072
struct perf_buffer *rb = handle->rb;
8073
int events = local_inc_return(&rb->events);
8074
8075
if (events >= wakeup_events) {
8076
local_sub(wakeup_events, &rb->events);
8077
local_inc(&rb->wakeup);
8078
}
8079
}
8080
}
8081
}
8082
8083
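/*
 * Best-effort translation of a sampled virtual address to a physical
 * address: kernel linear-map addresses are translated directly (vmalloc
 * addresses are left at 0), user addresses are resolved with an IRQ-safe
 * GUP-fast lookup, and 0 is returned whenever no safe translation exists.
 */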
static u64 perf_virt_to_phys(u64 virt)
8084
{
8085
u64 phys_addr = 0;
8086
8087
if (!virt)
8088
return 0;
8089
8090
if (virt >= TASK_SIZE) {
8091
/* If it's vmalloc()d memory, leave phys_addr as 0 */
8092
if (virt_addr_valid((void *)(uintptr_t)virt) &&
8093
!(virt >= VMALLOC_START && virt < VMALLOC_END))
8094
phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
8095
} else {
8096
/*
8097
* Walking the page tables for a user address.
8098
* Interrupts are disabled, which prevents any teardown
8099
* of the page tables.
8100
* Try IRQ-safe get_user_page_fast_only first.
8101
* If that fails, leave phys_addr as 0.
8102
*/
8103
if (is_user_task(current)) {
8104
struct page *p;
8105
8106
pagefault_disable();
8107
if (get_user_page_fast_only(virt, 0, &p)) {
8108
phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
8109
put_page(p);
8110
}
8111
pagefault_enable();
8112
}
8113
}
8114
8115
return phys_addr;
8116
}
8117
8118
/*
8119
* Return the pagetable size of a given virtual address.
8120
*/
8121
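/*
 * This is a lockless walk (pgd -> p4d -> pud -> pmd -> pte) that returns
 * the leaf size at whichever level the mapping terminates, or 0 if the
 * address is not mapped. The caller is expected to have IRQs disabled so
 * the page tables cannot be freed underneath us (see perf_get_page_size()).
 */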
static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
8122
{
8123
u64 size = 0;
8124
8125
#ifdef CONFIG_HAVE_GUP_FAST
8126
pgd_t *pgdp, pgd;
8127
p4d_t *p4dp, p4d;
8128
pud_t *pudp, pud;
8129
pmd_t *pmdp, pmd;
8130
pte_t *ptep, pte;
8131
8132
pgdp = pgd_offset(mm, addr);
8133
pgd = READ_ONCE(*pgdp);
8134
if (pgd_none(pgd))
8135
return 0;
8136
8137
if (pgd_leaf(pgd))
8138
return pgd_leaf_size(pgd);
8139
8140
p4dp = p4d_offset_lockless(pgdp, pgd, addr);
8141
p4d = READ_ONCE(*p4dp);
8142
if (!p4d_present(p4d))
8143
return 0;
8144
8145
if (p4d_leaf(p4d))
8146
return p4d_leaf_size(p4d);
8147
8148
pudp = pud_offset_lockless(p4dp, p4d, addr);
8149
pud = READ_ONCE(*pudp);
8150
if (!pud_present(pud))
8151
return 0;
8152
8153
if (pud_leaf(pud))
8154
return pud_leaf_size(pud);
8155
8156
pmdp = pmd_offset_lockless(pudp, pud, addr);
8157
again:
8158
pmd = pmdp_get_lockless(pmdp);
8159
if (!pmd_present(pmd))
8160
return 0;
8161
8162
if (pmd_leaf(pmd))
8163
return pmd_leaf_size(pmd);
8164
8165
ptep = pte_offset_map(&pmd, addr);
8166
if (!ptep)
8167
goto again;
8168
8169
pte = ptep_get_lockless(ptep);
8170
if (pte_present(pte))
8171
size = __pte_leaf_size(pmd, pte);
8172
pte_unmap(ptep);
8173
#endif /* CONFIG_HAVE_GUP_FAST */
8174
8175
return size;
8176
}
8177
8178
static u64 perf_get_page_size(unsigned long addr)
8179
{
8180
struct mm_struct *mm;
8181
unsigned long flags;
8182
u64 size;
8183
8184
if (!addr)
8185
return 0;
8186
8187
/*
8188
* Software page-table walkers must disable IRQs,
8189
* which prevents any tear down of the page tables.
8190
*/
8191
local_irq_save(flags);
8192
8193
mm = current->mm;
8194
if (!mm) {
8195
/*
8196
* For kernel threads and the like, use init_mm so that
8197
* we can find kernel memory.
8198
*/
8199
mm = &init_mm;
8200
}
8201
8202
size = perf_get_pgtable_size(mm, addr);
8203
8204
local_irq_restore(flags);
8205
8206
return size;
8207
}
8208
8209
static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
8210
8211
static struct unwind_work perf_unwind_work;
8212
8213
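/*
 * Note on deferred user unwinding: when the event asked for it
 * (attr.defer_callchain) and a deferred request can be queued, only a
 * cookie is recorded with the sample here; the user part of the callchain
 * is emitted later as a PERF_RECORD_CALLCHAIN_DEFERRED record from
 * perf_unwind_deferred_callback().
 */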
struct perf_callchain_entry *
8214
perf_callchain(struct perf_event *event, struct pt_regs *regs)
8215
{
8216
bool kernel = !event->attr.exclude_callchain_kernel;
8217
bool user = !event->attr.exclude_callchain_user &&
8218
is_user_task(current);
8219
/* Disallow cross-task user callchains. */
8220
bool crosstask = event->ctx->task && event->ctx->task != current;
8221
bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user &&
8222
event->attr.defer_callchain;
8223
const u32 max_stack = event->attr.sample_max_stack;
8224
struct perf_callchain_entry *callchain;
8225
u64 defer_cookie;
8226
8227
if (!current->mm)
8228
user = false;
8229
8230
if (!kernel && !user)
8231
return &__empty_callchain;
8232
8233
if (!(user && defer_user && !crosstask &&
8234
unwind_deferred_request(&perf_unwind_work, &defer_cookie) >= 0))
8235
defer_cookie = 0;
8236
8237
callchain = get_perf_callchain(regs, kernel, user, max_stack,
8238
crosstask, true, defer_cookie);
8239
8240
return callchain ?: &__empty_callchain;
8241
}
8242
8243
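/*
 * Branchless helper: returns @d when any of the bits in @s are set in
 * @flags, and 0 otherwise. Used below to add sample bits that are implied
 * by other requested sample bits.
 */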
static __always_inline u64 __cond_set(u64 flags, u64 s, u64 d)
8244
{
8245
return d * !!(flags & s);
8246
}
8247
8248
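/*
 * Fill in whatever parts of the sample the PMU driver did not already
 * provide (tracked via data->sample_flags), and grow data->dyn_size for
 * the variable-sized pieces so that perf_prepare_header() can compute the
 * final header size.
 */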
void perf_prepare_sample(struct perf_sample_data *data,
8249
struct perf_event *event,
8250
struct pt_regs *regs)
8251
{
8252
u64 sample_type = event->attr.sample_type;
8253
u64 filtered_sample_type;
8254
8255
/*
8256
* Add the sample flags that are dependent on others, and clear the
8257
* sample flags that have already been filled in by the PMU driver.
8258
*/
8259
filtered_sample_type = sample_type;
8260
filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_CODE_PAGE_SIZE,
8261
PERF_SAMPLE_IP);
8262
filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_DATA_PAGE_SIZE |
8263
PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR);
8264
filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_STACK_USER,
8265
PERF_SAMPLE_REGS_USER);
8266
filtered_sample_type &= ~data->sample_flags;
8267
8268
if (filtered_sample_type == 0) {
8269
/* Make sure it has the correct data->type for output */
8270
data->type = event->attr.sample_type;
8271
return;
8272
}
8273
8274
__perf_event_header__init_id(data, event, filtered_sample_type);
8275
8276
if (filtered_sample_type & PERF_SAMPLE_IP) {
8277
data->ip = perf_instruction_pointer(event, regs);
8278
data->sample_flags |= PERF_SAMPLE_IP;
8279
}
8280
8281
if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN)
8282
perf_sample_save_callchain(data, event, regs);
8283
8284
if (filtered_sample_type & PERF_SAMPLE_RAW) {
8285
data->raw = NULL;
8286
data->dyn_size += sizeof(u64);
8287
data->sample_flags |= PERF_SAMPLE_RAW;
8288
}
8289
8290
if (filtered_sample_type & PERF_SAMPLE_BRANCH_STACK) {
8291
data->br_stack = NULL;
8292
data->dyn_size += sizeof(u64);
8293
data->sample_flags |= PERF_SAMPLE_BRANCH_STACK;
8294
}
8295
8296
if (filtered_sample_type & PERF_SAMPLE_REGS_USER)
8297
perf_sample_regs_user(&data->regs_user, regs);
8298
8299
/*
8300
* We cannot use filtered_sample_type here, as REGS_USER can be set
8301
* by STACK_USER (using __cond_set() above) and we don't want to update
8302
* the dyn_size if it's not requested by users.
8303
*/
8304
if ((sample_type & ~data->sample_flags) & PERF_SAMPLE_REGS_USER) {
8305
/* regs dump ABI info */
8306
int size = sizeof(u64);
8307
8308
if (data->regs_user.regs) {
8309
u64 mask = event->attr.sample_regs_user;
8310
size += hweight64(mask) * sizeof(u64);
8311
}
8312
8313
data->dyn_size += size;
8314
data->sample_flags |= PERF_SAMPLE_REGS_USER;
8315
}
8316
8317
if (filtered_sample_type & PERF_SAMPLE_STACK_USER) {
8318
/*
8319
* Either we need the PERF_SAMPLE_STACK_USER bit to always be
8320
* processed as the last one, or an additional check added
8321
* in case a new sample type is added, because we could eat
8322
* up the rest of the sample size.
8323
*/
8324
u16 stack_size = event->attr.sample_stack_user;
8325
u16 header_size = perf_sample_data_size(data, event);
8326
u16 size = sizeof(u64);
8327
8328
stack_size = perf_sample_ustack_size(stack_size, header_size,
8329
data->regs_user.regs);
8330
8331
/*
8332
* If there is something to dump, add space for the dump
8333
* itself and for the field that tells the dynamic size,
8334
* which is how many have been actually dumped.
8335
*/
8336
if (stack_size)
8337
size += sizeof(u64) + stack_size;
8338
8339
data->stack_user_size = stack_size;
8340
data->dyn_size += size;
8341
data->sample_flags |= PERF_SAMPLE_STACK_USER;
8342
}
8343
8344
if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
8345
data->weight.full = 0;
8346
data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
8347
}
8348
8349
if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) {
8350
data->data_src.val = PERF_MEM_NA;
8351
data->sample_flags |= PERF_SAMPLE_DATA_SRC;
8352
}
8353
8354
if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) {
8355
data->txn = 0;
8356
data->sample_flags |= PERF_SAMPLE_TRANSACTION;
8357
}
8358
8359
if (filtered_sample_type & PERF_SAMPLE_ADDR) {
8360
data->addr = 0;
8361
data->sample_flags |= PERF_SAMPLE_ADDR;
8362
}
8363
8364
if (filtered_sample_type & PERF_SAMPLE_REGS_INTR) {
8365
/* regs dump ABI info */
8366
int size = sizeof(u64);
8367
8368
perf_sample_regs_intr(&data->regs_intr, regs);
8369
8370
if (data->regs_intr.regs) {
8371
u64 mask = event->attr.sample_regs_intr;
8372
8373
size += hweight64(mask) * sizeof(u64);
8374
}
8375
8376
data->dyn_size += size;
8377
data->sample_flags |= PERF_SAMPLE_REGS_INTR;
8378
}
8379
8380
if (filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) {
8381
data->phys_addr = perf_virt_to_phys(data->addr);
8382
data->sample_flags |= PERF_SAMPLE_PHYS_ADDR;
8383
}
8384
8385
#ifdef CONFIG_CGROUP_PERF
8386
if (filtered_sample_type & PERF_SAMPLE_CGROUP) {
8387
struct cgroup *cgrp;
8388
8389
/* protected by RCU */
8390
cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
8391
data->cgroup = cgroup_id(cgrp);
8392
data->sample_flags |= PERF_SAMPLE_CGROUP;
8393
}
8394
#endif
8395
8396
/*
8397
* PERF_SAMPLE_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user doesn't
8398
* require PERF_SAMPLE_ADDR, the kernel implicitly retrieves data->addr,
8399
* but the value will not be dumped to userspace.
8400
*/
8401
if (filtered_sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) {
8402
data->data_page_size = perf_get_page_size(data->addr);
8403
data->sample_flags |= PERF_SAMPLE_DATA_PAGE_SIZE;
8404
}
8405
8406
if (filtered_sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) {
8407
data->code_page_size = perf_get_page_size(data->ip);
8408
data->sample_flags |= PERF_SAMPLE_CODE_PAGE_SIZE;
8409
}
8410
8411
if (filtered_sample_type & PERF_SAMPLE_AUX) {
8412
u64 size;
8413
u16 header_size = perf_sample_data_size(data, event);
8414
8415
header_size += sizeof(u64); /* size */
8416
8417
/*
8418
* Given the 16bit nature of header::size, an AUX sample can
8419
* easily overflow it, what with all the preceding sample bits.
8420
* Make sure this doesn't happen by using up to U16_MAX bytes
8421
* per sample in total (rounded down to 8 byte boundary).
8422
*/
8423
size = min_t(size_t, U16_MAX - header_size,
8424
event->attr.aux_sample_size);
8425
size = rounddown(size, 8);
8426
size = perf_prepare_sample_aux(event, data, size);
8427
8428
WARN_ON_ONCE(size + header_size > U16_MAX);
8429
data->dyn_size += size + sizeof(u64); /* size above */
8430
data->sample_flags |= PERF_SAMPLE_AUX;
8431
}
8432
}
8433
8434
void perf_prepare_header(struct perf_event_header *header,
8435
struct perf_sample_data *data,
8436
struct perf_event *event,
8437
struct pt_regs *regs)
8438
{
8439
header->type = PERF_RECORD_SAMPLE;
8440
header->size = perf_sample_data_size(data, event);
8441
header->misc = perf_misc_flags(event, regs);
8442
8443
/*
8444
* If you're adding more sample types here, you likely need to do
8445
* something about the overflowing header::size, like repurpose the
8446
* lowest 3 bits of size, which should always be zero at the moment.
8447
* This raises a more important question: do we really need 512k sized
8448
* samples, and why? Good argumentation is in order for whatever you
8449
* do here next.
8450
*/
8451
WARN_ON_ONCE(header->size & 7);
8452
}
8453
8454
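/*
 * Pause/resume the AUX area of an event by stopping/starting it with
 * PERF_EF_PAUSE/PERF_EF_RESUME; hw.aux_paused makes the operation
 * idempotent.
 */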
static void __perf_event_aux_pause(struct perf_event *event, bool pause)
8455
{
8456
if (pause) {
8457
if (!event->hw.aux_paused) {
8458
event->hw.aux_paused = 1;
8459
event->pmu->stop(event, PERF_EF_PAUSE);
8460
}
8461
} else {
8462
if (event->hw.aux_paused) {
8463
event->hw.aux_paused = 0;
8464
event->pmu->start(event, PERF_EF_RESUME);
8465
}
8466
}
8467
}
8468
8469
static void perf_event_aux_pause(struct perf_event *event, bool pause)
8470
{
8471
struct perf_buffer *rb;
8472
8473
if (WARN_ON_ONCE(!event))
8474
return;
8475
8476
rb = ring_buffer_get(event);
8477
if (!rb)
8478
return;
8479
8480
scoped_guard (irqsave) {
8481
/*
8482
* Guard against self-recursion here. Another event could trip
8483
* this same path from NMI context.
8484
*/
8485
if (READ_ONCE(rb->aux_in_pause_resume))
8486
break;
8487
8488
WRITE_ONCE(rb->aux_in_pause_resume, 1);
8489
barrier();
8490
__perf_event_aux_pause(event, pause);
8491
barrier();
8492
WRITE_ONCE(rb->aux_in_pause_resume, 0);
8493
}
8494
ring_buffer_put(rb);
8495
}
8496
8497
static __always_inline int
8498
__perf_event_output(struct perf_event *event,
8499
struct perf_sample_data *data,
8500
struct pt_regs *regs,
8501
int (*output_begin)(struct perf_output_handle *,
8502
struct perf_sample_data *,
8503
struct perf_event *,
8504
unsigned int))
8505
{
8506
struct perf_output_handle handle;
8507
struct perf_event_header header;
8508
int err;
8509
8510
/* protect the callchain buffers */
8511
rcu_read_lock();
8512
8513
perf_prepare_sample(data, event, regs);
8514
perf_prepare_header(&header, data, event, regs);
8515
8516
err = output_begin(&handle, data, event, header.size);
8517
if (err)
8518
goto exit;
8519
8520
perf_output_sample(&handle, &header, data, event);
8521
8522
perf_output_end(&handle);
8523
8524
exit:
8525
rcu_read_unlock();
8526
return err;
8527
}
8528
8529
void
8530
perf_event_output_forward(struct perf_event *event,
8531
struct perf_sample_data *data,
8532
struct pt_regs *regs)
8533
{
8534
__perf_event_output(event, data, regs, perf_output_begin_forward);
8535
}
8536
8537
void
8538
perf_event_output_backward(struct perf_event *event,
8539
struct perf_sample_data *data,
8540
struct pt_regs *regs)
8541
{
8542
__perf_event_output(event, data, regs, perf_output_begin_backward);
8543
}
8544
8545
int
8546
perf_event_output(struct perf_event *event,
8547
struct perf_sample_data *data,
8548
struct pt_regs *regs)
8549
{
8550
return __perf_event_output(event, data, regs, perf_output_begin);
8551
}
8552
8553
/*
8554
* read event_id
8555
*/
8556
8557
struct perf_read_event {
8558
struct perf_event_header header;
8559
8560
u32 pid;
8561
u32 tid;
8562
};
8563
8564
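/*
 * Emit a PERF_RECORD_READ record for @event as observed by @task:
 * a header, pid/tid, the read_format payload and the optional sample_id
 * trailer.
 */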
static void
8565
perf_event_read_event(struct perf_event *event,
8566
struct task_struct *task)
8567
{
8568
struct perf_output_handle handle;
8569
struct perf_sample_data sample;
8570
struct perf_read_event read_event = {
8571
.header = {
8572
.type = PERF_RECORD_READ,
8573
.misc = 0,
8574
.size = sizeof(read_event) + event->read_size,
8575
},
8576
.pid = perf_event_pid(event, task),
8577
.tid = perf_event_tid(event, task),
8578
};
8579
int ret;
8580
8581
perf_event_header__init_id(&read_event.header, &sample, event);
8582
ret = perf_output_begin(&handle, &sample, event, read_event.header.size);
8583
if (ret)
8584
return;
8585
8586
perf_output_put(&handle, read_event);
8587
perf_output_read(&handle, event);
8588
perf_event__output_id_sample(event, &handle, &sample);
8589
8590
perf_output_end(&handle);
8591
}
8592
8593
typedef void (perf_iterate_f)(struct perf_event *event, void *data);
8594
8595
static void
8596
perf_iterate_ctx(struct perf_event_context *ctx,
8597
perf_iterate_f output,
8598
void *data, bool all)
8599
{
8600
struct perf_event *event;
8601
8602
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
8603
if (!all) {
8604
if (event->state < PERF_EVENT_STATE_INACTIVE)
8605
continue;
8606
if (!event_filter_match(event))
8607
continue;
8608
}
8609
8610
output(event, data);
8611
}
8612
}
8613
8614
static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
8615
{
8616
struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
8617
struct perf_event *event;
8618
8619
list_for_each_entry_rcu(event, &pel->list, sb_list) {
8620
/*
8621
* Skip events that are not fully formed yet; ensure that
8622
* if we observe event->ctx, both event and ctx will be
8623
* complete enough. See perf_install_in_context().
8624
*/
8625
if (!smp_load_acquire(&event->ctx))
8626
continue;
8627
8628
if (event->state < PERF_EVENT_STATE_INACTIVE)
8629
continue;
8630
if (!event_filter_match(event))
8631
continue;
8632
output(event, data);
8633
}
8634
}
8635
8636
/*
8637
* Iterate all events that need to receive side-band events.
8638
*
8639
* For new callers: ensure that account_pmu_sb_event() includes
8640
* your event, otherwise it might not get delivered.
8641
*/
8642
static void
8643
perf_iterate_sb(perf_iterate_f output, void *data,
8644
struct perf_event_context *task_ctx)
8645
{
8646
struct perf_event_context *ctx;
8647
8648
rcu_read_lock();
8649
preempt_disable();
8650
8651
/*
8652
* If we have task_ctx != NULL we only notify the task context itself.
8653
* The task_ctx is set only for EXIT events before releasing task
8654
* context.
8655
*/
8656
if (task_ctx) {
8657
perf_iterate_ctx(task_ctx, output, data, false);
8658
goto done;
8659
}
8660
8661
perf_iterate_sb_cpu(output, data);
8662
8663
ctx = rcu_dereference(current->perf_event_ctxp);
8664
if (ctx)
8665
perf_iterate_ctx(ctx, output, data, false);
8666
done:
8667
preempt_enable();
8668
rcu_read_unlock();
8669
}
8670
8671
/*
8672
* Clear all file-based filters at exec, they'll have to be
8673
* re-instated when/if these objects are mmapped again.
8674
*/
8675
static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
8676
{
8677
struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
8678
struct perf_addr_filter *filter;
8679
unsigned int restart = 0, count = 0;
8680
unsigned long flags;
8681
8682
if (!has_addr_filter(event))
8683
return;
8684
8685
raw_spin_lock_irqsave(&ifh->lock, flags);
8686
list_for_each_entry(filter, &ifh->list, entry) {
8687
if (filter->path.dentry) {
8688
event->addr_filter_ranges[count].start = 0;
8689
event->addr_filter_ranges[count].size = 0;
8690
restart++;
8691
}
8692
8693
count++;
8694
}
8695
8696
if (restart)
8697
event->addr_filters_gen++;
8698
raw_spin_unlock_irqrestore(&ifh->lock, flags);
8699
8700
if (restart)
8701
perf_event_stop(event, 1);
8702
}
8703
8704
void perf_event_exec(void)
8705
{
8706
struct perf_event_context *ctx;
8707
8708
ctx = perf_pin_task_context(current);
8709
if (!ctx)
8710
return;
8711
8712
perf_event_enable_on_exec(ctx);
8713
perf_event_remove_on_exec(ctx);
8714
scoped_guard(rcu)
8715
perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true);
8716
8717
perf_unpin_context(ctx);
8718
put_ctx(ctx);
8719
}
8720
8721
struct remote_output {
8722
struct perf_buffer *rb;
8723
int err;
8724
};
8725
8726
static void __perf_event_output_stop(struct perf_event *event, void *data)
8727
{
8728
struct perf_event *parent = event->parent;
8729
struct remote_output *ro = data;
8730
struct perf_buffer *rb = ro->rb;
8731
struct stop_event_data sd = {
8732
.event = event,
8733
};
8734
8735
if (!has_aux(event))
8736
return;
8737
8738
if (!parent)
8739
parent = event;
8740
8741
/*
8742
* In case of inheritance, it will be the parent that links to the
8743
* ring-buffer, but it will be the child that's actually using it.
8744
*
8745
* We are using event::rb to determine if the event should be stopped,
8746
* however, this may race with ring_buffer_attach() (through set_output),
8747
* which will make us skip the event that actually needs to be stopped.
8748
* So ring_buffer_attach() has to stop an aux event before re-assigning
8749
* its rb pointer.
8750
*/
8751
if (rcu_dereference(parent->rb) == rb)
8752
ro->err = __perf_event_stop(&sd);
8753
}
8754
8755
static int __perf_pmu_output_stop(void *info)
8756
{
8757
struct perf_event *event = info;
8758
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
8759
struct remote_output ro = {
8760
.rb = event->rb,
8761
};
8762
8763
rcu_read_lock();
8764
perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
8765
if (cpuctx->task_ctx)
8766
perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
8767
&ro, false);
8768
rcu_read_unlock();
8769
8770
return ro.err;
8771
}
8772
8773
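/*
 * Stop all events writing to this ring-buffer, on every CPU that may be
 * running one. A racing change reported as -EAGAIN causes rb->event_list
 * to be re-walked from the start.
 */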
static void perf_pmu_output_stop(struct perf_event *event)
8774
{
8775
struct perf_event *iter;
8776
int err, cpu;
8777
8778
restart:
8779
rcu_read_lock();
8780
list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
8781
/*
8782
* For per-CPU events, we need to make sure that neither they
8783
* nor their children are running; for cpu==-1 events it's
8784
* sufficient to stop the event itself if it's active, since
8785
* it can't have children.
8786
*/
8787
cpu = iter->cpu;
8788
if (cpu == -1)
8789
cpu = READ_ONCE(iter->oncpu);
8790
8791
if (cpu == -1)
8792
continue;
8793
8794
err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
8795
if (err == -EAGAIN) {
8796
rcu_read_unlock();
8797
goto restart;
8798
}
8799
}
8800
rcu_read_unlock();
8801
}
8802
8803
/*
8804
* task tracking -- fork/exit
8805
*
8806
* enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
8807
*/
8808
8809
struct perf_task_event {
8810
struct task_struct *task;
8811
struct perf_event_context *task_ctx;
8812
8813
struct {
8814
struct perf_event_header header;
8815
8816
u32 pid;
8817
u32 ppid;
8818
u32 tid;
8819
u32 ptid;
8820
u64 time;
8821
} event_id;
8822
};
8823
8824
static int perf_event_task_match(struct perf_event *event)
8825
{
8826
return event->attr.comm || event->attr.mmap ||
8827
event->attr.mmap2 || event->attr.mmap_data ||
8828
event->attr.task;
8829
}
8830
8831
static void perf_event_task_output(struct perf_event *event,
8832
void *data)
8833
{
8834
struct perf_task_event *task_event = data;
8835
struct perf_output_handle handle;
8836
struct perf_sample_data sample;
8837
struct task_struct *task = task_event->task;
8838
int ret, size = task_event->event_id.header.size;
8839
8840
if (!perf_event_task_match(event))
8841
return;
8842
8843
perf_event_header__init_id(&task_event->event_id.header, &sample, event);
8844
8845
ret = perf_output_begin(&handle, &sample, event,
8846
task_event->event_id.header.size);
8847
if (ret)
8848
goto out;
8849
8850
task_event->event_id.pid = perf_event_pid(event, task);
8851
task_event->event_id.tid = perf_event_tid(event, task);
8852
8853
if (task_event->event_id.header.type == PERF_RECORD_EXIT) {
8854
task_event->event_id.ppid = perf_event_pid(event,
8855
task->real_parent);
8856
task_event->event_id.ptid = perf_event_pid(event,
8857
task->real_parent);
8858
} else { /* PERF_RECORD_FORK */
8859
task_event->event_id.ppid = perf_event_pid(event, current);
8860
task_event->event_id.ptid = perf_event_tid(event, current);
8861
}
8862
8863
task_event->event_id.time = perf_event_clock(event);
8864
8865
perf_output_put(&handle, task_event->event_id);
8866
8867
perf_event__output_id_sample(event, &handle, &sample);
8868
8869
perf_output_end(&handle);
8870
out:
8871
task_event->event_id.header.size = size;
8872
}
8873
8874
static void perf_event_task(struct task_struct *task,
8875
struct perf_event_context *task_ctx,
8876
int new)
8877
{
8878
struct perf_task_event task_event;
8879
8880
if (!atomic_read(&nr_comm_events) &&
8881
!atomic_read(&nr_mmap_events) &&
8882
!atomic_read(&nr_task_events))
8883
return;
8884
8885
task_event = (struct perf_task_event){
8886
.task = task,
8887
.task_ctx = task_ctx,
8888
.event_id = {
8889
.header = {
8890
.type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
8891
.misc = 0,
8892
.size = sizeof(task_event.event_id),
8893
},
8894
/* .pid */
8895
/* .ppid */
8896
/* .tid */
8897
/* .ptid */
8898
/* .time */
8899
},
8900
};
8901
8902
perf_iterate_sb(perf_event_task_output,
8903
&task_event,
8904
task_ctx);
8905
}
8906
8907
/*
8908
* Allocate data for a new task when profiling system-wide
8909
* events which require PMU specific data
8910
*/
8911
static void
8912
perf_event_alloc_task_data(struct task_struct *child,
8913
struct task_struct *parent)
8914
{
8915
struct kmem_cache *ctx_cache = NULL;
8916
struct perf_ctx_data *cd;
8917
8918
if (!refcount_read(&global_ctx_data_ref))
8919
return;
8920
8921
scoped_guard (rcu) {
8922
cd = rcu_dereference(parent->perf_ctx_data);
8923
if (cd)
8924
ctx_cache = cd->ctx_cache;
8925
}
8926
8927
if (!ctx_cache)
8928
return;
8929
8930
guard(percpu_read)(&global_ctx_data_rwsem);
8931
scoped_guard (rcu) {
8932
cd = rcu_dereference(child->perf_ctx_data);
8933
if (!cd) {
8934
/*
8935
* A system-wide event may have been unaccounted
8936
* while attaching the perf_ctx_data.
8937
*/
8938
if (!refcount_read(&global_ctx_data_ref))
8939
return;
8940
goto attach;
8941
}
8942
8943
if (!cd->global) {
8944
cd->global = 1;
8945
refcount_inc(&cd->refcount);
8946
}
8947
}
8948
8949
return;
8950
attach:
8951
attach_task_ctx_data(child, ctx_cache, true);
8952
}
8953
8954
void perf_event_fork(struct task_struct *task)
8955
{
8956
perf_event_task(task, NULL, 1);
8957
perf_event_namespaces(task);
8958
perf_event_alloc_task_data(task, current);
8959
}
8960
8961
/*
8962
* comm tracking
8963
*/
8964
8965
struct perf_comm_event {
8966
struct task_struct *task;
8967
char *comm;
8968
int comm_size;
8969
8970
struct {
8971
struct perf_event_header header;
8972
8973
u32 pid;
8974
u32 tid;
8975
} event_id;
8976
};
8977
8978
static int perf_event_comm_match(struct perf_event *event)
8979
{
8980
return event->attr.comm;
8981
}
8982
8983
static void perf_event_comm_output(struct perf_event *event,
8984
void *data)
8985
{
8986
struct perf_comm_event *comm_event = data;
8987
struct perf_output_handle handle;
8988
struct perf_sample_data sample;
8989
int size = comm_event->event_id.header.size;
8990
int ret;
8991
8992
if (!perf_event_comm_match(event))
8993
return;
8994
8995
perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
8996
ret = perf_output_begin(&handle, &sample, event,
8997
comm_event->event_id.header.size);
8998
8999
if (ret)
9000
goto out;
9001
9002
comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
9003
comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
9004
9005
perf_output_put(&handle, comm_event->event_id);
9006
__output_copy(&handle, comm_event->comm,
9007
comm_event->comm_size);
9008
9009
perf_event__output_id_sample(event, &handle, &sample);
9010
9011
perf_output_end(&handle);
9012
out:
9013
comm_event->event_id.header.size = size;
9014
}
9015
9016
static void perf_event_comm_event(struct perf_comm_event *comm_event)
9017
{
9018
char comm[TASK_COMM_LEN];
9019
unsigned int size;
9020
9021
memset(comm, 0, sizeof(comm));
9022
strscpy(comm, comm_event->task->comm);
9023
size = ALIGN(strlen(comm)+1, sizeof(u64));
9024
9025
comm_event->comm = comm;
9026
comm_event->comm_size = size;
9027
9028
comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
9029
9030
perf_iterate_sb(perf_event_comm_output,
9031
comm_event,
9032
NULL);
9033
}
9034
9035
void perf_event_comm(struct task_struct *task, bool exec)
9036
{
9037
struct perf_comm_event comm_event;
9038
9039
if (!atomic_read(&nr_comm_events))
9040
return;
9041
9042
comm_event = (struct perf_comm_event){
9043
.task = task,
9044
/* .comm */
9045
/* .comm_size */
9046
.event_id = {
9047
.header = {
9048
.type = PERF_RECORD_COMM,
9049
.misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
9050
/* .size */
9051
},
9052
/* .pid */
9053
/* .tid */
9054
},
9055
};
9056
9057
perf_event_comm_event(&comm_event);
9058
}
9059
9060
/*
9061
* namespaces tracking
9062
*/
9063
9064
struct perf_namespaces_event {
9065
struct task_struct *task;
9066
9067
struct {
9068
struct perf_event_header header;
9069
9070
u32 pid;
9071
u32 tid;
9072
u64 nr_namespaces;
9073
struct perf_ns_link_info link_info[NR_NAMESPACES];
9074
} event_id;
9075
};
9076
9077
static int perf_event_namespaces_match(struct perf_event *event)
9078
{
9079
return event->attr.namespaces;
9080
}
9081
9082
static void perf_event_namespaces_output(struct perf_event *event,
9083
void *data)
9084
{
9085
struct perf_namespaces_event *namespaces_event = data;
9086
struct perf_output_handle handle;
9087
struct perf_sample_data sample;
9088
u16 header_size = namespaces_event->event_id.header.size;
9089
int ret;
9090
9091
if (!perf_event_namespaces_match(event))
9092
return;
9093
9094
perf_event_header__init_id(&namespaces_event->event_id.header,
9095
&sample, event);
9096
ret = perf_output_begin(&handle, &sample, event,
9097
namespaces_event->event_id.header.size);
9098
if (ret)
9099
goto out;
9100
9101
namespaces_event->event_id.pid = perf_event_pid(event,
9102
namespaces_event->task);
9103
namespaces_event->event_id.tid = perf_event_tid(event,
9104
namespaces_event->task);
9105
9106
perf_output_put(&handle, namespaces_event->event_id);
9107
9108
perf_event__output_id_sample(event, &handle, &sample);
9109
9110
perf_output_end(&handle);
9111
out:
9112
namespaces_event->event_id.header.size = header_size;
9113
}
9114
9115
static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
9116
struct task_struct *task,
9117
const struct proc_ns_operations *ns_ops)
9118
{
9119
struct path ns_path;
9120
struct inode *ns_inode;
9121
int error;
9122
9123
error = ns_get_path(&ns_path, task, ns_ops);
9124
if (!error) {
9125
ns_inode = ns_path.dentry->d_inode;
9126
ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
9127
ns_link_info->ino = ns_inode->i_ino;
9128
path_put(&ns_path);
9129
}
9130
}
9131
9132
void perf_event_namespaces(struct task_struct *task)
9133
{
9134
struct perf_namespaces_event namespaces_event;
9135
struct perf_ns_link_info *ns_link_info;
9136
9137
if (!atomic_read(&nr_namespaces_events))
9138
return;
9139
9140
namespaces_event = (struct perf_namespaces_event){
9141
.task = task,
9142
.event_id = {
9143
.header = {
9144
.type = PERF_RECORD_NAMESPACES,
9145
.misc = 0,
9146
.size = sizeof(namespaces_event.event_id),
9147
},
9148
/* .pid */
9149
/* .tid */
9150
.nr_namespaces = NR_NAMESPACES,
9151
/* .link_info[NR_NAMESPACES] */
9152
},
9153
};
9154
9155
ns_link_info = namespaces_event.event_id.link_info;
9156
9157
perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
9158
task, &mntns_operations);
9159
9160
#ifdef CONFIG_USER_NS
9161
perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
9162
task, &userns_operations);
9163
#endif
9164
#ifdef CONFIG_NET_NS
9165
perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
9166
task, &netns_operations);
9167
#endif
9168
#ifdef CONFIG_UTS_NS
9169
perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
9170
task, &utsns_operations);
9171
#endif
9172
#ifdef CONFIG_IPC_NS
9173
perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
9174
task, &ipcns_operations);
9175
#endif
9176
#ifdef CONFIG_PID_NS
9177
perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
9178
task, &pidns_operations);
9179
#endif
9180
#ifdef CONFIG_CGROUPS
9181
perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
9182
task, &cgroupns_operations);
9183
#endif
9184
9185
perf_iterate_sb(perf_event_namespaces_output,
9186
&namespaces_event,
9187
NULL);
9188
}
9189
9190
/*
9191
* cgroup tracking
9192
*/
9193
#ifdef CONFIG_CGROUP_PERF
9194
9195
struct perf_cgroup_event {
9196
char *path;
9197
int path_size;
9198
struct {
9199
struct perf_event_header header;
9200
u64 id;
9201
char path[];
9202
} event_id;
9203
};
9204
9205
static int perf_event_cgroup_match(struct perf_event *event)
9206
{
9207
return event->attr.cgroup;
9208
}
9209
9210
static void perf_event_cgroup_output(struct perf_event *event, void *data)
9211
{
9212
struct perf_cgroup_event *cgroup_event = data;
9213
struct perf_output_handle handle;
9214
struct perf_sample_data sample;
9215
u16 header_size = cgroup_event->event_id.header.size;
9216
int ret;
9217
9218
if (!perf_event_cgroup_match(event))
9219
return;
9220
9221
perf_event_header__init_id(&cgroup_event->event_id.header,
9222
&sample, event);
9223
ret = perf_output_begin(&handle, &sample, event,
9224
cgroup_event->event_id.header.size);
9225
if (ret)
9226
goto out;
9227
9228
perf_output_put(&handle, cgroup_event->event_id);
9229
__output_copy(&handle, cgroup_event->path, cgroup_event->path_size);
9230
9231
perf_event__output_id_sample(event, &handle, &sample);
9232
9233
perf_output_end(&handle);
9234
out:
9235
cgroup_event->event_id.header.size = header_size;
9236
}
9237
9238
static void perf_event_cgroup(struct cgroup *cgrp)
9239
{
9240
struct perf_cgroup_event cgroup_event;
9241
char path_enomem[16] = "//enomem";
9242
char *pathname;
9243
size_t size;
9244
9245
if (!atomic_read(&nr_cgroup_events))
9246
return;
9247
9248
cgroup_event = (struct perf_cgroup_event){
9249
.event_id = {
9250
.header = {
9251
.type = PERF_RECORD_CGROUP,
9252
.misc = 0,
9253
.size = sizeof(cgroup_event.event_id),
9254
},
9255
.id = cgroup_id(cgrp),
9256
},
9257
};
9258
9259
pathname = kmalloc(PATH_MAX, GFP_KERNEL);
9260
if (pathname == NULL) {
9261
cgroup_event.path = path_enomem;
9262
} else {
9263
/* just to be sure we have enough space for the alignment padding */
9264
cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64));
9265
cgroup_event.path = pathname;
9266
}
9267
9268
/*
9269
* Since our buffer works in 8 byte units we need to align our string
9270
* size to a multiple of 8. However, we must guarantee the tail end is
9271
* zero'd out to avoid leaking random bits to userspace.
9272
*/
9273
size = strlen(cgroup_event.path) + 1;
9274
while (!IS_ALIGNED(size, sizeof(u64)))
9275
cgroup_event.path[size++] = '\0';
9276
9277
cgroup_event.event_id.header.size += size;
9278
cgroup_event.path_size = size;
9279
9280
perf_iterate_sb(perf_event_cgroup_output,
9281
&cgroup_event,
9282
NULL);
9283
9284
kfree(pathname);
9285
}
9286
9287
#endif
9288
9289
/*
9290
* mmap tracking
9291
*/
9292
9293
struct perf_mmap_event {
9294
struct vm_area_struct *vma;
9295
9296
const char *file_name;
9297
int file_size;
9298
int maj, min;
9299
u64 ino;
9300
u64 ino_generation;
9301
u32 prot, flags;
9302
u8 build_id[BUILD_ID_SIZE_MAX];
9303
u32 build_id_size;
9304
9305
struct {
9306
struct perf_event_header header;
9307
9308
u32 pid;
9309
u32 tid;
9310
u64 start;
9311
u64 len;
9312
u64 pgoff;
9313
} event_id;
9314
};
9315
9316
static int perf_event_mmap_match(struct perf_event *event,
9317
void *data)
9318
{
9319
struct perf_mmap_event *mmap_event = data;
9320
struct vm_area_struct *vma = mmap_event->vma;
9321
int executable = vma->vm_flags & VM_EXEC;
9322
9323
return (!executable && event->attr.mmap_data) ||
9324
(executable && (event->attr.mmap || event->attr.mmap2));
9325
}
9326
9327
static void perf_event_mmap_output(struct perf_event *event,
9328
void *data)
9329
{
9330
struct perf_mmap_event *mmap_event = data;
9331
struct perf_output_handle handle;
9332
struct perf_sample_data sample;
9333
int size = mmap_event->event_id.header.size;
9334
u32 type = mmap_event->event_id.header.type;
9335
bool use_build_id;
9336
int ret;
9337
9338
if (!perf_event_mmap_match(event, data))
9339
return;
9340
9341
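/*
 * Note: when a build ID is emitted instead of maj/min/ino/ino_generation,
 * it occupies the same 24 bytes (4-byte length + BUILD_ID_SIZE_MAX), so
 * the size accounting below is valid for both variants.
 */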
if (event->attr.mmap2) {
9342
mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
9343
mmap_event->event_id.header.size += sizeof(mmap_event->maj);
9344
mmap_event->event_id.header.size += sizeof(mmap_event->min);
9345
mmap_event->event_id.header.size += sizeof(mmap_event->ino);
9346
mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
9347
mmap_event->event_id.header.size += sizeof(mmap_event->prot);
9348
mmap_event->event_id.header.size += sizeof(mmap_event->flags);
9349
}
9350
9351
perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
9352
ret = perf_output_begin(&handle, &sample, event,
9353
mmap_event->event_id.header.size);
9354
if (ret)
9355
goto out;
9356
9357
mmap_event->event_id.pid = perf_event_pid(event, current);
9358
mmap_event->event_id.tid = perf_event_tid(event, current);
9359
9360
use_build_id = event->attr.build_id && mmap_event->build_id_size;
9361
9362
if (event->attr.mmap2 && use_build_id)
9363
mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID;
9364
9365
perf_output_put(&handle, mmap_event->event_id);
9366
9367
if (event->attr.mmap2) {
9368
if (use_build_id) {
9369
u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 };
9370
9371
__output_copy(&handle, size, 4);
9372
__output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX);
9373
} else {
9374
perf_output_put(&handle, mmap_event->maj);
9375
perf_output_put(&handle, mmap_event->min);
9376
perf_output_put(&handle, mmap_event->ino);
9377
perf_output_put(&handle, mmap_event->ino_generation);
9378
}
9379
perf_output_put(&handle, mmap_event->prot);
9380
perf_output_put(&handle, mmap_event->flags);
9381
}
9382
9383
__output_copy(&handle, mmap_event->file_name,
9384
mmap_event->file_size);
9385
9386
perf_event__output_id_sample(event, &handle, &sample);
9387
9388
perf_output_end(&handle);
9389
out:
9390
mmap_event->event_id.header.size = size;
9391
mmap_event->event_id.header.type = type;
9392
}
9393
9394
static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
9395
{
9396
struct vm_area_struct *vma = mmap_event->vma;
9397
struct file *file = vma->vm_file;
9398
int maj = 0, min = 0;
9399
u64 ino = 0, gen = 0;
9400
u32 prot = 0, flags = 0;
9401
unsigned int size;
9402
char tmp[16];
9403
char *buf = NULL;
9404
char *name = NULL;
9405
9406
if (vma->vm_flags & VM_READ)
9407
prot |= PROT_READ;
9408
if (vma->vm_flags & VM_WRITE)
9409
prot |= PROT_WRITE;
9410
if (vma->vm_flags & VM_EXEC)
9411
prot |= PROT_EXEC;
9412
9413
if (vma->vm_flags & VM_MAYSHARE)
9414
flags = MAP_SHARED;
9415
else
9416
flags = MAP_PRIVATE;
9417
9418
if (vma->vm_flags & VM_LOCKED)
9419
flags |= MAP_LOCKED;
9420
if (is_vm_hugetlb_page(vma))
9421
flags |= MAP_HUGETLB;
9422
9423
if (file) {
9424
const struct inode *inode;
9425
dev_t dev;
9426
9427
buf = kmalloc(PATH_MAX, GFP_KERNEL);
9428
if (!buf) {
9429
name = "//enomem";
9430
goto cpy_name;
9431
}
9432
/*
9433
* d_path() works from the end of the buffer backwards, so we
9434
* need to add enough zero bytes after the string to handle
9435
* the 64bit alignment we do later.
9436
*/
9437
name = d_path(file_user_path(file), buf, PATH_MAX - sizeof(u64));
9438
if (IS_ERR(name)) {
9439
name = "//toolong";
9440
goto cpy_name;
9441
}
9442
inode = file_user_inode(vma->vm_file);
9443
dev = inode->i_sb->s_dev;
9444
ino = inode->i_ino;
9445
gen = inode->i_generation;
9446
maj = MAJOR(dev);
9447
min = MINOR(dev);
9448
9449
goto got_name;
9450
} else {
9451
if (vma->vm_ops && vma->vm_ops->name)
9452
name = (char *) vma->vm_ops->name(vma);
9453
if (!name)
9454
name = (char *)arch_vma_name(vma);
9455
if (!name) {
9456
if (vma_is_initial_heap(vma))
9457
name = "[heap]";
9458
else if (vma_is_initial_stack(vma))
9459
name = "[stack]";
9460
else
9461
name = "//anon";
9462
}
9463
}
9464
9465
cpy_name:
9466
strscpy(tmp, name);
9467
name = tmp;
9468
got_name:
9469
/*
9470
* Since our buffer works in 8 byte units we need to align our string
9471
* size to a multiple of 8. However, we must guarantee the tail end is
9472
* zero'd out to avoid leaking random bits to userspace.
9473
*/
9474
size = strlen(name)+1;
9475
while (!IS_ALIGNED(size, sizeof(u64)))
9476
name[size++] = '\0';
9477
9478
mmap_event->file_name = name;
9479
mmap_event->file_size = size;
9480
mmap_event->maj = maj;
9481
mmap_event->min = min;
9482
mmap_event->ino = ino;
9483
mmap_event->ino_generation = gen;
9484
mmap_event->prot = prot;
9485
mmap_event->flags = flags;
9486
9487
if (!(vma->vm_flags & VM_EXEC))
9488
mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
9489
9490
mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
9491
9492
if (atomic_read(&nr_build_id_events))
9493
build_id_parse_nofault(vma, mmap_event->build_id, &mmap_event->build_id_size);
9494
9495
perf_iterate_sb(perf_event_mmap_output,
9496
mmap_event,
9497
NULL);
9498
9499
kfree(buf);
9500
}
9501
9502
/*
9503
* Check whether inode and address range match filter criteria.
9504
*/
9505
static bool perf_addr_filter_match(struct perf_addr_filter *filter,
9506
struct file *file, unsigned long offset,
9507
unsigned long size)
9508
{
9509
/* d_inode(NULL) won't be equal to any mapped user-space file */
9510
if (!filter->path.dentry)
9511
return false;
9512
9513
if (d_inode(filter->path.dentry) != file_user_inode(file))
9514
return false;
9515
9516
if (filter->offset > offset + size)
9517
return false;
9518
9519
if (filter->offset + filter->size < offset)
9520
return false;
9521
9522
return true;
9523
}
9524
9525
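/*
 * Translate a file-offset based filter into a virtual address range
 * within @vma: if the filter starts before the mapped file range, the
 * range starts at vma->vm_start, otherwise at the corresponding offset
 * inside the vma; the size is clamped to both the filter and the vma.
 */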
static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter,
9526
struct vm_area_struct *vma,
9527
struct perf_addr_filter_range *fr)
9528
{
9529
unsigned long vma_size = vma->vm_end - vma->vm_start;
9530
unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
9531
struct file *file = vma->vm_file;
9532
9533
if (!perf_addr_filter_match(filter, file, off, vma_size))
9534
return false;
9535
9536
if (filter->offset < off) {
9537
fr->start = vma->vm_start;
9538
fr->size = min(vma_size, filter->size - (off - filter->offset));
9539
} else {
9540
fr->start = vma->vm_start + filter->offset - off;
9541
fr->size = min(vma->vm_end - fr->start, filter->size);
9542
}
9543
9544
return true;
9545
}
9546
9547
static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
9548
{
9549
struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
9550
struct vm_area_struct *vma = data;
9551
struct perf_addr_filter *filter;
9552
unsigned int restart = 0, count = 0;
9553
unsigned long flags;
9554
9555
if (!has_addr_filter(event))
9556
return;
9557
9558
if (!vma->vm_file)
9559
return;
9560
9561
raw_spin_lock_irqsave(&ifh->lock, flags);
9562
list_for_each_entry(filter, &ifh->list, entry) {
9563
if (perf_addr_filter_vma_adjust(filter, vma,
9564
&event->addr_filter_ranges[count]))
9565
restart++;
9566
9567
count++;
9568
}
9569
9570
if (restart)
9571
event->addr_filters_gen++;
9572
raw_spin_unlock_irqrestore(&ifh->lock, flags);
9573
9574
if (restart)
9575
perf_event_stop(event, 1);
9576
}
9577
9578
/*
9579
* Adjust all task's events' filters to the new vma
9580
*/
9581
static void perf_addr_filters_adjust(struct vm_area_struct *vma)
9582
{
9583
struct perf_event_context *ctx;
9584
9585
/*
9586
* Data tracing isn't supported yet and as such there is no need
9587
* to keep track of anything that isn't related to executable code:
9588
*/
9589
if (!(vma->vm_flags & VM_EXEC))
9590
return;
9591
9592
rcu_read_lock();
9593
ctx = rcu_dereference(current->perf_event_ctxp);
9594
if (ctx)
9595
perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
9596
rcu_read_unlock();
9597
}
9598
9599
void perf_event_mmap(struct vm_area_struct *vma)
9600
{
9601
struct perf_mmap_event mmap_event;
9602
9603
if (!atomic_read(&nr_mmap_events))
9604
return;
9605
9606
mmap_event = (struct perf_mmap_event){
9607
.vma = vma,
9608
/* .file_name */
9609
/* .file_size */
9610
.event_id = {
9611
.header = {
9612
.type = PERF_RECORD_MMAP,
9613
.misc = PERF_RECORD_MISC_USER,
9614
/* .size */
9615
},
9616
/* .pid */
9617
/* .tid */
9618
.start = vma->vm_start,
9619
.len = vma->vm_end - vma->vm_start,
9620
.pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
9621
},
9622
/* .maj (attr_mmap2 only) */
9623
/* .min (attr_mmap2 only) */
9624
/* .ino (attr_mmap2 only) */
9625
/* .ino_generation (attr_mmap2 only) */
9626
/* .prot (attr_mmap2 only) */
9627
/* .flags (attr_mmap2 only) */
9628
};
9629
9630
perf_addr_filters_adjust(vma);
9631
perf_event_mmap_event(&mmap_event);
9632
}
9633
9634
void perf_event_aux_event(struct perf_event *event, unsigned long head,
9635
unsigned long size, u64 flags)
9636
{
9637
struct perf_output_handle handle;
9638
struct perf_sample_data sample;
9639
struct perf_aux_event {
9640
struct perf_event_header header;
9641
u64 offset;
9642
u64 size;
9643
u64 flags;
9644
} rec = {
9645
.header = {
9646
.type = PERF_RECORD_AUX,
9647
.misc = 0,
9648
.size = sizeof(rec),
9649
},
9650
.offset = head,
9651
.size = size,
9652
.flags = flags,
9653
};
9654
int ret;
9655
9656
perf_event_header__init_id(&rec.header, &sample, event);
9657
ret = perf_output_begin(&handle, &sample, event, rec.header.size);
9658
9659
if (ret)
9660
return;
9661
9662
perf_output_put(&handle, rec);
9663
perf_event__output_id_sample(event, &handle, &sample);
9664
9665
perf_output_end(&handle);
9666
}
9667
9668
/*
9669
* Lost/dropped samples logging
9670
*/
9671
void perf_log_lost_samples(struct perf_event *event, u64 lost)
9672
{
9673
struct perf_output_handle handle;
9674
struct perf_sample_data sample;
9675
int ret;
9676
9677
struct {
9678
struct perf_event_header header;
9679
u64 lost;
9680
} lost_samples_event = {
9681
.header = {
9682
.type = PERF_RECORD_LOST_SAMPLES,
9683
.misc = 0,
9684
.size = sizeof(lost_samples_event),
9685
},
9686
.lost = lost,
9687
};
9688
9689
perf_event_header__init_id(&lost_samples_event.header, &sample, event);
9690
9691
ret = perf_output_begin(&handle, &sample, event,
9692
lost_samples_event.header.size);
9693
if (ret)
9694
return;
9695
9696
perf_output_put(&handle, lost_samples_event);
9697
perf_event__output_id_sample(event, &handle, &sample);
9698
perf_output_end(&handle);
9699
}
9700
9701
/*
9702
* context_switch tracking
9703
*/
9704
9705
struct perf_switch_event {
9706
struct task_struct *task;
9707
struct task_struct *next_prev;
9708
9709
struct {
9710
struct perf_event_header header;
9711
u32 next_prev_pid;
9712
u32 next_prev_tid;
9713
} event_id;
9714
};
9715
9716
static int perf_event_switch_match(struct perf_event *event)
9717
{
9718
return event->attr.context_switch;
9719
}
9720
9721
static void perf_event_switch_output(struct perf_event *event, void *data)
9722
{
9723
struct perf_switch_event *se = data;
9724
struct perf_output_handle handle;
9725
struct perf_sample_data sample;
9726
int ret;
9727
9728
if (!perf_event_switch_match(event))
9729
return;
9730
9731
/* Only CPU-wide events are allowed to see next/prev pid/tid */
9732
if (event->ctx->task) {
9733
se->event_id.header.type = PERF_RECORD_SWITCH;
9734
se->event_id.header.size = sizeof(se->event_id.header);
9735
} else {
9736
se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
9737
se->event_id.header.size = sizeof(se->event_id);
9738
se->event_id.next_prev_pid =
9739
perf_event_pid(event, se->next_prev);
9740
se->event_id.next_prev_tid =
9741
perf_event_tid(event, se->next_prev);
9742
}
9743
9744
perf_event_header__init_id(&se->event_id.header, &sample, event);
9745
9746
ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size);
9747
if (ret)
9748
return;
9749
9750
if (event->ctx->task)
9751
perf_output_put(&handle, se->event_id.header);
9752
else
9753
perf_output_put(&handle, se->event_id);
9754
9755
perf_event__output_id_sample(event, &handle, &sample);
9756
9757
perf_output_end(&handle);
9758
}
9759
9760
static void perf_event_switch(struct task_struct *task,
9761
struct task_struct *next_prev, bool sched_in)
9762
{
9763
struct perf_switch_event switch_event;
9764
9765
/* N.B. caller checks nr_switch_events != 0 */
9766
9767
switch_event = (struct perf_switch_event){
9768
.task = task,
9769
.next_prev = next_prev,
9770
.event_id = {
9771
.header = {
9772
/* .type */
9773
.misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
9774
/* .size */
9775
},
9776
/* .next_prev_pid */
9777
/* .next_prev_tid */
9778
},
9779
};
9780
9781
if (!sched_in && task_is_runnable(task)) {
9782
switch_event.event_id.header.misc |=
9783
PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
9784
}
9785
9786
perf_iterate_sb(perf_event_switch_output, &switch_event, NULL);
9787
}
9788
9789
/*
9790
* IRQ throttle logging
9791
*/
9792
9793
static void perf_log_throttle(struct perf_event *event, int enable)
9794
{
9795
struct perf_output_handle handle;
9796
struct perf_sample_data sample;
9797
int ret;
9798
9799
struct {
9800
struct perf_event_header header;
9801
u64 time;
9802
u64 id;
9803
u64 stream_id;
9804
} throttle_event = {
9805
.header = {
9806
.type = PERF_RECORD_THROTTLE,
9807
.misc = 0,
9808
.size = sizeof(throttle_event),
9809
},
9810
.time = perf_event_clock(event),
9811
.id = primary_event_id(event),
9812
.stream_id = event->id,
9813
};
9814
9815
if (enable)
9816
throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
9817
9818
perf_event_header__init_id(&throttle_event.header, &sample, event);
9819
9820
ret = perf_output_begin(&handle, &sample, event,
9821
throttle_event.header.size);
9822
if (ret)
9823
return;
9824
9825
perf_output_put(&handle, throttle_event);
9826
perf_event__output_id_sample(event, &handle, &sample);
9827
perf_output_end(&handle);
9828
}
9829
9830
/*
9831
* ksymbol register/unregister tracking
9832
*/
9833
9834
struct perf_ksymbol_event {
9835
const char *name;
9836
int name_len;
9837
struct {
9838
struct perf_event_header header;
9839
u64 addr;
9840
u32 len;
9841
u16 ksym_type;
9842
u16 flags;
9843
} event_id;
9844
};
9845
9846
static int perf_event_ksymbol_match(struct perf_event *event)
9847
{
9848
return event->attr.ksymbol;
9849
}
9850
9851
static void perf_event_ksymbol_output(struct perf_event *event, void *data)
9852
{
9853
struct perf_ksymbol_event *ksymbol_event = data;
9854
struct perf_output_handle handle;
9855
struct perf_sample_data sample;
9856
int ret;
9857
9858
if (!perf_event_ksymbol_match(event))
9859
return;
9860
9861
perf_event_header__init_id(&ksymbol_event->event_id.header,
9862
&sample, event);
9863
ret = perf_output_begin(&handle, &sample, event,
9864
ksymbol_event->event_id.header.size);
9865
if (ret)
9866
return;
9867
9868
perf_output_put(&handle, ksymbol_event->event_id);
9869
__output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);
9870
perf_event__output_id_sample(event, &handle, &sample);
9871
9872
perf_output_end(&handle);
9873
}
9874
9875
void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
9876
const char *sym)
9877
{
9878
struct perf_ksymbol_event ksymbol_event;
9879
char name[KSYM_NAME_LEN];
9880
u16 flags = 0;
9881
int name_len;
9882
9883
if (!atomic_read(&nr_ksymbol_events))
9884
return;
9885
9886
if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
9887
ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
9888
goto err;
9889
9890
strscpy(name, sym);
9891
name_len = strlen(name) + 1;
9892
while (!IS_ALIGNED(name_len, sizeof(u64)))
9893
name[name_len++] = '\0';
9894
BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));
9895
9896
if (unregister)
9897
flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;
9898
9899
ksymbol_event = (struct perf_ksymbol_event){
9900
.name = name,
9901
.name_len = name_len,
9902
.event_id = {
9903
.header = {
9904
.type = PERF_RECORD_KSYMBOL,
9905
.size = sizeof(ksymbol_event.event_id) +
9906
name_len,
9907
},
9908
.addr = addr,
9909
.len = len,
9910
.ksym_type = ksym_type,
9911
.flags = flags,
9912
},
9913
};
9914
9915
perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
9916
return;
9917
err:
9918
WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
9919
}
9920
9921
/*
9922
* bpf program load/unload tracking
9923
*/
9924
9925
struct perf_bpf_event {
9926
struct bpf_prog *prog;
9927
struct {
9928
struct perf_event_header header;
9929
u16 type;
9930
u16 flags;
9931
u32 id;
9932
u8 tag[BPF_TAG_SIZE];
9933
} event_id;
9934
};
9935
9936
static int perf_event_bpf_match(struct perf_event *event)
9937
{
9938
return event->attr.bpf_event;
9939
}
9940
9941
static void perf_event_bpf_output(struct perf_event *event, void *data)
9942
{
9943
struct perf_bpf_event *bpf_event = data;
9944
struct perf_output_handle handle;
9945
struct perf_sample_data sample;
9946
int ret;
9947
9948
if (!perf_event_bpf_match(event))
9949
return;
9950
9951
perf_event_header__init_id(&bpf_event->event_id.header,
9952
&sample, event);
9953
ret = perf_output_begin(&handle, &sample, event,
9954
bpf_event->event_id.header.size);
9955
if (ret)
9956
return;
9957
9958
perf_output_put(&handle, bpf_event->event_id);
9959
perf_event__output_id_sample(event, &handle, &sample);
9960
9961
perf_output_end(&handle);
9962
}
9963
9964
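/*
 * Emit KSYMBOL records for a BPF program image. The loop over the
 * sub-programs starts at index 1 because, as far as this code is
 * concerned, func[0] corresponds to the main image already reported via
 * prog->bpf_func above.
 */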
static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
9965
enum perf_bpf_event_type type)
9966
{
9967
bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
9968
int i;
9969
9970
perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
9971
(u64)(unsigned long)prog->bpf_func,
9972
prog->jited_len, unregister,
9973
prog->aux->ksym.name);
9974
9975
for (i = 1; i < prog->aux->func_cnt; i++) {
9976
struct bpf_prog *subprog = prog->aux->func[i];
9977
9978
perf_event_ksymbol(
9979
PERF_RECORD_KSYMBOL_TYPE_BPF,
9980
(u64)(unsigned long)subprog->bpf_func,
9981
subprog->jited_len, unregister,
9982
subprog->aux->ksym.name);
9983
}
9984
}
9985
9986
void perf_event_bpf_event(struct bpf_prog *prog,
9987
enum perf_bpf_event_type type,
9988
u16 flags)
9989
{
9990
struct perf_bpf_event bpf_event;
9991
9992
switch (type) {
9993
case PERF_BPF_EVENT_PROG_LOAD:
9994
case PERF_BPF_EVENT_PROG_UNLOAD:
9995
if (atomic_read(&nr_ksymbol_events))
9996
perf_event_bpf_emit_ksymbols(prog, type);
9997
break;
9998
default:
9999
return;
10000
}
10001
10002
if (!atomic_read(&nr_bpf_events))
10003
return;
10004
10005
bpf_event = (struct perf_bpf_event){
10006
.prog = prog,
10007
.event_id = {
10008
.header = {
10009
.type = PERF_RECORD_BPF_EVENT,
10010
.size = sizeof(bpf_event.event_id),
10011
},
10012
.type = type,
10013
.flags = flags,
10014
.id = prog->aux->id,
10015
},
10016
};
10017
10018
BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));
10019
10020
memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);
10021
perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
10022
}
10023
10024
struct perf_callchain_deferred_event {
10025
struct unwind_stacktrace *trace;
10026
struct {
10027
struct perf_event_header header;
10028
u64 cookie;
10029
u64 nr;
10030
u64 ips[];
10031
} event;
10032
};
10033
10034
static void perf_callchain_deferred_output(struct perf_event *event, void *data)
10035
{
10036
struct perf_callchain_deferred_event *deferred_event = data;
10037
struct perf_output_handle handle;
10038
struct perf_sample_data sample;
10039
int ret, size = deferred_event->event.header.size;
10040
10041
if (!event->attr.defer_output)
10042
return;
10043
10044
/* XXX do we really need sample_id_all for this ??? */
10045
perf_event_header__init_id(&deferred_event->event.header, &sample, event);
10046
10047
ret = perf_output_begin(&handle, &sample, event,
10048
deferred_event->event.header.size);
10049
if (ret)
10050
goto out;
10051
10052
perf_output_put(&handle, deferred_event->event);
10053
for (int i = 0; i < deferred_event->trace->nr; i++) {
10054
u64 entry = deferred_event->trace->entries[i];
10055
perf_output_put(&handle, entry);
10056
}
10057
perf_event__output_id_sample(event, &handle, &sample);
10058
10059
perf_output_end(&handle);
10060
out:
10061
deferred_event->event.header.size = size;
10062
}
10063
10064
static void perf_unwind_deferred_callback(struct unwind_work *work,
10065
struct unwind_stacktrace *trace, u64 cookie)
10066
{
10067
struct perf_callchain_deferred_event deferred_event = {
10068
.trace = trace,
10069
.event = {
10070
.header = {
10071
.type = PERF_RECORD_CALLCHAIN_DEFERRED,
10072
.misc = PERF_RECORD_MISC_USER,
10073
.size = sizeof(deferred_event.event) +
10074
(trace->nr * sizeof(u64)),
10075
},
10076
.cookie = cookie,
10077
.nr = trace->nr,
10078
},
10079
};
10080
10081
perf_iterate_sb(perf_callchain_deferred_output, &deferred_event, NULL);
10082
}
10083
10084
struct perf_text_poke_event {
10085
const void *old_bytes;
10086
const void *new_bytes;
10087
size_t pad;
10088
u16 old_len;
10089
u16 new_len;
10090
10091
struct {
10092
struct perf_event_header header;
10093
10094
u64 addr;
10095
} event_id;
10096
};
10097
10098
static int perf_event_text_poke_match(struct perf_event *event)
10099
{
10100
return event->attr.text_poke;
10101
}
10102
10103
static void perf_event_text_poke_output(struct perf_event *event, void *data)
10104
{
10105
struct perf_text_poke_event *text_poke_event = data;
10106
struct perf_output_handle handle;
10107
struct perf_sample_data sample;
10108
u64 padding = 0;
10109
int ret;
10110
10111
if (!perf_event_text_poke_match(event))
10112
return;
10113
10114
perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event);
10115
10116
ret = perf_output_begin(&handle, &sample, event,
10117
text_poke_event->event_id.header.size);
10118
if (ret)
10119
return;
10120
10121
perf_output_put(&handle, text_poke_event->event_id);
10122
perf_output_put(&handle, text_poke_event->old_len);
10123
perf_output_put(&handle, text_poke_event->new_len);
10124
10125
__output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len);
10126
__output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len);
10127
10128
if (text_poke_event->pad)
10129
__output_copy(&handle, &padding, text_poke_event->pad);
10130
10131
perf_event__output_id_sample(event, &handle, &sample);
10132
10133
perf_output_end(&handle);
10134
}
10135
10136
void perf_event_text_poke(const void *addr, const void *old_bytes,
10137
size_t old_len, const void *new_bytes, size_t new_len)
10138
{
10139
struct perf_text_poke_event text_poke_event;
10140
size_t tot, pad;
10141
10142
if (!atomic_read(&nr_text_poke_events))
10143
return;
10144
10145
tot = sizeof(text_poke_event.old_len) + old_len;
10146
tot += sizeof(text_poke_event.new_len) + new_len;
10147
pad = ALIGN(tot, sizeof(u64)) - tot;
10148
10149
text_poke_event = (struct perf_text_poke_event){
10150
.old_bytes = old_bytes,
10151
.new_bytes = new_bytes,
10152
.pad = pad,
10153
.old_len = old_len,
10154
.new_len = new_len,
10155
.event_id = {
10156
.header = {
10157
.type = PERF_RECORD_TEXT_POKE,
10158
.misc = PERF_RECORD_MISC_KERNEL,
10159
.size = sizeof(text_poke_event.event_id) + tot + pad,
10160
},
10161
.addr = (unsigned long)addr,
10162
},
10163
};
10164
10165
perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL);
10166
}
10167
10168
void perf_event_itrace_started(struct perf_event *event)
10169
{
10170
WRITE_ONCE(event->attach_state, event->attach_state | PERF_ATTACH_ITRACE);
10171
}
10172
10173
static void perf_log_itrace_start(struct perf_event *event)
10174
{
10175
struct perf_output_handle handle;
10176
struct perf_sample_data sample;
10177
struct perf_aux_event {
10178
struct perf_event_header header;
10179
u32 pid;
10180
u32 tid;
10181
} rec;
10182
int ret;
10183
10184
if (event->parent)
10185
event = event->parent;
10186
10187
if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
10188
event->attach_state & PERF_ATTACH_ITRACE)
10189
return;
10190
10191
rec.header.type = PERF_RECORD_ITRACE_START;
10192
rec.header.misc = 0;
10193
rec.header.size = sizeof(rec);
10194
rec.pid = perf_event_pid(event, current);
10195
rec.tid = perf_event_tid(event, current);
10196
10197
perf_event_header__init_id(&rec.header, &sample, event);
10198
ret = perf_output_begin(&handle, &sample, event, rec.header.size);
10199
10200
if (ret)
10201
return;
10202
10203
perf_output_put(&handle, rec);
10204
perf_event__output_id_sample(event, &handle, &sample);
10205
10206
perf_output_end(&handle);
10207
}
10208
10209
void perf_report_aux_output_id(struct perf_event *event, u64 hw_id)
10210
{
10211
struct perf_output_handle handle;
10212
struct perf_sample_data sample;
10213
struct perf_aux_event {
10214
struct perf_event_header header;
10215
u64 hw_id;
10216
} rec;
10217
int ret;
10218
10219
if (event->parent)
10220
event = event->parent;
10221
10222
rec.header.type = PERF_RECORD_AUX_OUTPUT_HW_ID;
10223
rec.header.misc = 0;
10224
rec.header.size = sizeof(rec);
10225
rec.hw_id = hw_id;
10226
10227
perf_event_header__init_id(&rec.header, &sample, event);
10228
ret = perf_output_begin(&handle, &sample, event, rec.header.size);
10229
10230
if (ret)
10231
return;
10232
10233
perf_output_put(&handle, rec);
10234
perf_event__output_id_sample(event, &handle, &sample);
10235
10236
perf_output_end(&handle);
10237
}
10238
EXPORT_SYMBOL_GPL(perf_report_aux_output_id);
10239
10240
static int
10241
__perf_event_account_interrupt(struct perf_event *event, int throttle)
10242
{
10243
struct hw_perf_event *hwc = &event->hw;
10244
int ret = 0;
10245
u64 seq;
10246
10247
seq = __this_cpu_read(perf_throttled_seq);
10248
if (seq != hwc->interrupts_seq) {
10249
hwc->interrupts_seq = seq;
10250
hwc->interrupts = 1;
10251
} else {
10252
hwc->interrupts++;
10253
}
10254
10255
if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) {
10256
__this_cpu_inc(perf_throttled_count);
10257
tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
10258
perf_event_throttle_group(event);
10259
ret = 1;
10260
}
10261
10262
if (event->attr.freq) {
10263
u64 now = perf_clock();
10264
s64 delta = now - hwc->freq_time_stamp;
10265
10266
hwc->freq_time_stamp = now;
10267
10268
if (delta > 0 && delta < 2*TICK_NSEC)
10269
perf_adjust_period(event, delta, hwc->last_period, true);
10270
}
10271
10272
return ret;
10273
}
10274
10275
int perf_event_account_interrupt(struct perf_event *event)
10276
{
10277
return __perf_event_account_interrupt(event, 1);
10278
}
10279
10280
static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
10281
{
10282
/*
10283
* Due to interrupt latency (AKA "skid"), we may enter the
10284
* kernel before taking an overflow, even if the PMU is only
10285
* counting user events.
10286
*/
10287
if (event->attr.exclude_kernel && !user_mode(regs))
10288
return false;
10289
10290
return true;
10291
}
10292
10293
#ifdef CONFIG_BPF_SYSCALL
10294
static int bpf_overflow_handler(struct perf_event *event,
10295
struct perf_sample_data *data,
10296
struct pt_regs *regs)
10297
{
10298
struct bpf_perf_event_data_kern ctx = {
10299
.data = data,
10300
.event = event,
10301
};
10302
struct bpf_prog *prog;
10303
int ret = 0;
10304
10305
ctx.regs = perf_arch_bpf_user_pt_regs(regs);
10306
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
10307
goto out;
10308
rcu_read_lock();
10309
prog = READ_ONCE(event->prog);
10310
if (prog) {
10311
perf_prepare_sample(data, event, regs);
10312
ret = bpf_prog_run(prog, &ctx);
10313
}
10314
rcu_read_unlock();
10315
out:
10316
__this_cpu_dec(bpf_prog_active);
10317
10318
return ret;
10319
}
10320
10321
static inline int perf_event_set_bpf_handler(struct perf_event *event,
10322
struct bpf_prog *prog,
10323
u64 bpf_cookie)
10324
{
10325
if (event->overflow_handler_context)
10326
/* hw breakpoint or kernel counter */
10327
return -EINVAL;
10328
10329
if (event->prog)
10330
return -EEXIST;
10331
10332
if (prog->type != BPF_PROG_TYPE_PERF_EVENT)
10333
return -EINVAL;
10334
10335
if (event->attr.precise_ip &&
10336
prog->call_get_stack &&
10337
(!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) ||
10338
event->attr.exclude_callchain_kernel ||
10339
event->attr.exclude_callchain_user)) {
10340
/*
10341
* On perf_event with precise_ip, calling bpf_get_stack()
10342
* may trigger unwinder warnings and occasional crashes.
10343
* bpf_get_[stack|stackid] works around this issue by using
10344
* callchain attached to perf_sample_data. If the
10345
* perf_event does not full (kernel and user) callchain
10346
* attached to perf_sample_data, do not allow attaching BPF
10347
* program that calls bpf_get_[stack|stackid].
10348
*/
10349
return -EPROTO;
10350
}
10351
10352
event->prog = prog;
10353
event->bpf_cookie = bpf_cookie;
10354
return 0;
10355
}
10356
10357
static inline void perf_event_free_bpf_handler(struct perf_event *event)
10358
{
10359
struct bpf_prog *prog = event->prog;
10360
10361
if (!prog)
10362
return;
10363
10364
event->prog = NULL;
10365
bpf_prog_put(prog);
10366
}
10367
#else
10368
static inline int bpf_overflow_handler(struct perf_event *event,
10369
struct perf_sample_data *data,
10370
struct pt_regs *regs)
10371
{
10372
return 1;
10373
}
10374
10375
static inline int perf_event_set_bpf_handler(struct perf_event *event,
10376
struct bpf_prog *prog,
10377
u64 bpf_cookie)
10378
{
10379
return -EOPNOTSUPP;
10380
}
10381
10382
static inline void perf_event_free_bpf_handler(struct perf_event *event)
10383
{
10384
}
10385
#endif
10386
10387
/*
10388
* Generic event overflow handling, sampling.
10389
*/
10390
10391
static int __perf_event_overflow(struct perf_event *event,
10392
int throttle, struct perf_sample_data *data,
10393
struct pt_regs *regs)
10394
{
10395
int events = atomic_read(&event->event_limit);
10396
int ret = 0;
10397
10398
/*
10399
* Non-sampling counters might still use the PMI to fold short
10400
* hardware counters, ignore those.
10401
*/
10402
if (unlikely(!is_sampling_event(event)))
10403
return 0;
10404
10405
ret = __perf_event_account_interrupt(event, throttle);
10406
10407
if (event->attr.aux_pause)
10408
perf_event_aux_pause(event->aux_event, true);
10409
10410
if (event->prog && event->prog->type == BPF_PROG_TYPE_PERF_EVENT &&
10411
!bpf_overflow_handler(event, data, regs))
10412
goto out;
10413
10414
/*
10415
* XXX event_limit might not quite work as expected on inherited
10416
* events
10417
*/
10418
10419
event->pending_kill = POLL_IN;
10420
if (events && atomic_dec_and_test(&event->event_limit)) {
10421
ret = 1;
10422
event->pending_kill = POLL_HUP;
10423
perf_event_disable_inatomic(event);
10424
event->pmu->stop(event, 0);
10425
}
10426
10427
if (event->attr.sigtrap) {
10428
/*
10429
* The desired behaviour of sigtrap vs invalid samples is a bit
10430
* tricky; on the one hand, one should not loose the SIGTRAP if
10431
* it is the first event, on the other hand, we should also not
10432
* trigger the WARN or override the data address.
10433
*/
10434
bool valid_sample = sample_is_allowed(event, regs);
10435
unsigned int pending_id = 1;
10436
enum task_work_notify_mode notify_mode;
10437
10438
if (regs)
10439
pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1;
10440
10441
notify_mode = in_nmi() ? TWA_NMI_CURRENT : TWA_RESUME;
10442
10443
if (!event->pending_work &&
10444
!task_work_add(current, &event->pending_task, notify_mode)) {
10445
event->pending_work = pending_id;
10446
local_inc(&event->ctx->nr_no_switch_fast);
10447
WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
10448
10449
event->pending_addr = 0;
10450
if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR))
10451
event->pending_addr = data->addr;
10452
10453
} else if (event->attr.exclude_kernel && valid_sample) {
10454
/*
10455
* Should not be able to return to user space without
10456
* consuming pending_work; with exceptions:
10457
*
10458
* 1. Where !exclude_kernel, events can overflow again
10459
* in the kernel without returning to user space.
10460
*
10461
* 2. Events that can overflow again before the IRQ-
10462
* work without user space progress (e.g. hrtimer).
10463
* To approximate progress (with false negatives),
10464
* check 32-bit hash of the current IP.
10465
*/
10466
WARN_ON_ONCE(event->pending_work != pending_id);
10467
}
10468
}
10469
10470
READ_ONCE(event->overflow_handler)(event, data, regs);
10471
10472
if (*perf_event_fasync(event) && event->pending_kill) {
10473
event->pending_wakeup = 1;
10474
irq_work_queue(&event->pending_irq);
10475
}
10476
out:
10477
if (event->attr.aux_resume)
10478
perf_event_aux_pause(event->aux_event, false);
10479
10480
return ret;
10481
}
10482
10483
int perf_event_overflow(struct perf_event *event,
10484
struct perf_sample_data *data,
10485
struct pt_regs *regs)
10486
{
10487
return __perf_event_overflow(event, 1, data, regs);
10488
}
10489
10490
/*
10491
* Generic software event infrastructure
10492
*/
10493
10494
struct swevent_htable {
10495
struct swevent_hlist *swevent_hlist;
10496
struct mutex hlist_mutex;
10497
int hlist_refcount;
10498
};
10499
static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
10500
10501
/*
10502
* We directly increment event->count and keep a second value in
10503
* event->hw.period_left to count intervals. This period event
10504
* is kept in the range [-sample_period, 0] so that we can use the
10505
* sign as trigger.
10506
*/
10507
10508
u64 perf_swevent_set_period(struct perf_event *event)
10509
{
10510
struct hw_perf_event *hwc = &event->hw;
10511
u64 period = hwc->last_period;
10512
u64 nr, offset;
10513
s64 old, val;
10514
10515
hwc->last_period = hwc->sample_period;
10516
10517
old = local64_read(&hwc->period_left);
10518
do {
10519
val = old;
10520
if (val < 0)
10521
return 0;
10522
10523
nr = div64_u64(period + val, period);
10524
offset = nr * period;
10525
val -= offset;
10526
} while (!local64_try_cmpxchg(&hwc->period_left, &old, val));
10527
10528
return nr;
10529
}
10530
10531
static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
10532
struct perf_sample_data *data,
10533
struct pt_regs *regs)
10534
{
10535
struct hw_perf_event *hwc = &event->hw;
10536
int throttle = 0;
10537
10538
if (!overflow)
10539
overflow = perf_swevent_set_period(event);
10540
10541
if (hwc->interrupts == MAX_INTERRUPTS)
10542
return;
10543
10544
for (; overflow; overflow--) {
10545
if (__perf_event_overflow(event, throttle,
10546
data, regs)) {
10547
/*
10548
* We inhibit the overflow from happening when
10549
* hwc->interrupts == MAX_INTERRUPTS.
10550
*/
10551
break;
10552
}
10553
throttle = 1;
10554
}
10555
}
10556
10557
static void perf_swevent_event(struct perf_event *event, u64 nr,
10558
struct perf_sample_data *data,
10559
struct pt_regs *regs)
10560
{
10561
struct hw_perf_event *hwc = &event->hw;
10562
10563
local64_add(nr, &event->count);
10564
10565
if (!regs)
10566
return;
10567
10568
if (!is_sampling_event(event))
10569
return;
10570
10571
if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
10572
data->period = nr;
10573
return perf_swevent_overflow(event, 1, data, regs);
10574
} else
10575
data->period = event->hw.last_period;
10576
10577
if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
10578
return perf_swevent_overflow(event, 1, data, regs);
10579
10580
if (local64_add_negative(nr, &hwc->period_left))
10581
return;
10582
10583
perf_swevent_overflow(event, 0, data, regs);
10584
}
10585
10586
int perf_exclude_event(struct perf_event *event, struct pt_regs *regs)
10587
{
10588
if (event->hw.state & PERF_HES_STOPPED)
10589
return 1;
10590
10591
if (regs) {
10592
if (event->attr.exclude_user && user_mode(regs))
10593
return 1;
10594
10595
if (event->attr.exclude_kernel && !user_mode(regs))
10596
return 1;
10597
}
10598
10599
return 0;
10600
}
10601
10602
static int perf_swevent_match(struct perf_event *event,
10603
enum perf_type_id type,
10604
u32 event_id,
10605
struct perf_sample_data *data,
10606
struct pt_regs *regs)
10607
{
10608
if (event->attr.type != type)
10609
return 0;
10610
10611
if (event->attr.config != event_id)
10612
return 0;
10613
10614
if (perf_exclude_event(event, regs))
10615
return 0;
10616
10617
return 1;
10618
}
10619
10620
static inline u64 swevent_hash(u64 type, u32 event_id)
10621
{
10622
u64 val = event_id | (type << 32);
10623
10624
return hash_64(val, SWEVENT_HLIST_BITS);
10625
}
10626
10627
static inline struct hlist_head *
10628
__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
10629
{
10630
u64 hash = swevent_hash(type, event_id);
10631
10632
return &hlist->heads[hash];
10633
}
10634
10635
/* For the read side: events when they trigger */
10636
static inline struct hlist_head *
10637
find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
10638
{
10639
struct swevent_hlist *hlist;
10640
10641
hlist = rcu_dereference(swhash->swevent_hlist);
10642
if (!hlist)
10643
return NULL;
10644
10645
return __find_swevent_head(hlist, type, event_id);
10646
}
10647
10648
/* For the event head insertion and removal in the hlist */
10649
static inline struct hlist_head *
10650
find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
10651
{
10652
struct swevent_hlist *hlist;
10653
u32 event_id = event->attr.config;
10654
u64 type = event->attr.type;
10655
10656
/*
10657
* Event scheduling is always serialized against hlist allocation
10658
* and release. Which makes the protected version suitable here.
10659
* The context lock guarantees that.
10660
*/
10661
hlist = rcu_dereference_protected(swhash->swevent_hlist,
10662
lockdep_is_held(&event->ctx->lock));
10663
if (!hlist)
10664
return NULL;
10665
10666
return __find_swevent_head(hlist, type, event_id);
10667
}
10668
10669
static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
10670
u64 nr,
10671
struct perf_sample_data *data,
10672
struct pt_regs *regs)
10673
{
10674
struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
10675
struct perf_event *event;
10676
struct hlist_head *head;
10677
10678
rcu_read_lock();
10679
head = find_swevent_head_rcu(swhash, type, event_id);
10680
if (!head)
10681
goto end;
10682
10683
hlist_for_each_entry_rcu(event, head, hlist_entry) {
10684
if (perf_swevent_match(event, type, event_id, data, regs))
10685
perf_swevent_event(event, nr, data, regs);
10686
}
10687
end:
10688
rcu_read_unlock();
10689
}
10690
10691
DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
10692
10693
int perf_swevent_get_recursion_context(void)
10694
{
10695
return get_recursion_context(current->perf_recursion);
10696
}
10697
EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
10698
10699
void perf_swevent_put_recursion_context(int rctx)
10700
{
10701
put_recursion_context(current->perf_recursion, rctx);
10702
}
10703
10704
void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
10705
{
10706
struct perf_sample_data data;
10707
10708
if (WARN_ON_ONCE(!regs))
10709
return;
10710
10711
perf_sample_data_init(&data, addr, 0);
10712
do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
10713
}
10714
10715
void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
10716
{
10717
int rctx;
10718
10719
preempt_disable_notrace();
10720
rctx = perf_swevent_get_recursion_context();
10721
if (unlikely(rctx < 0))
10722
goto fail;
10723
10724
___perf_sw_event(event_id, nr, regs, addr);
10725
10726
perf_swevent_put_recursion_context(rctx);
10727
fail:
10728
preempt_enable_notrace();
10729
}
10730
10731
static void perf_swevent_read(struct perf_event *event)
10732
{
10733
}
10734
10735
static int perf_swevent_add(struct perf_event *event, int flags)
10736
{
10737
struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
10738
struct hw_perf_event *hwc = &event->hw;
10739
struct hlist_head *head;
10740
10741
if (is_sampling_event(event)) {
10742
hwc->last_period = hwc->sample_period;
10743
perf_swevent_set_period(event);
10744
}
10745
10746
hwc->state = !(flags & PERF_EF_START);
10747
10748
head = find_swevent_head(swhash, event);
10749
if (WARN_ON_ONCE(!head))
10750
return -EINVAL;
10751
10752
hlist_add_head_rcu(&event->hlist_entry, head);
10753
perf_event_update_userpage(event);
10754
10755
return 0;
10756
}
10757
10758
static void perf_swevent_del(struct perf_event *event, int flags)
10759
{
10760
hlist_del_rcu(&event->hlist_entry);
10761
}
10762
10763
static void perf_swevent_start(struct perf_event *event, int flags)
10764
{
10765
event->hw.state = 0;
10766
}
10767
10768
static void perf_swevent_stop(struct perf_event *event, int flags)
10769
{
10770
event->hw.state = PERF_HES_STOPPED;
10771
}
10772
10773
/* Deref the hlist from the update side */
10774
static inline struct swevent_hlist *
10775
swevent_hlist_deref(struct swevent_htable *swhash)
10776
{
10777
return rcu_dereference_protected(swhash->swevent_hlist,
10778
lockdep_is_held(&swhash->hlist_mutex));
10779
}
10780
10781
static void swevent_hlist_release(struct swevent_htable *swhash)
10782
{
10783
struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
10784
10785
if (!hlist)
10786
return;
10787
10788
RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
10789
kfree_rcu(hlist, rcu_head);
10790
}
10791
10792
static void swevent_hlist_put_cpu(int cpu)
10793
{
10794
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
10795
10796
mutex_lock(&swhash->hlist_mutex);
10797
10798
if (!--swhash->hlist_refcount)
10799
swevent_hlist_release(swhash);
10800
10801
mutex_unlock(&swhash->hlist_mutex);
10802
}
10803
10804
static void swevent_hlist_put(void)
10805
{
10806
int cpu;
10807
10808
for_each_possible_cpu(cpu)
10809
swevent_hlist_put_cpu(cpu);
10810
}
10811
10812
static int swevent_hlist_get_cpu(int cpu)
10813
{
10814
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
10815
int err = 0;
10816
10817
mutex_lock(&swhash->hlist_mutex);
10818
if (!swevent_hlist_deref(swhash) &&
10819
cpumask_test_cpu(cpu, perf_online_mask)) {
10820
struct swevent_hlist *hlist;
10821
10822
hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
10823
if (!hlist) {
10824
err = -ENOMEM;
10825
goto exit;
10826
}
10827
rcu_assign_pointer(swhash->swevent_hlist, hlist);
10828
}
10829
swhash->hlist_refcount++;
10830
exit:
10831
mutex_unlock(&swhash->hlist_mutex);
10832
10833
return err;
10834
}
10835
10836
static int swevent_hlist_get(void)
10837
{
10838
int err, cpu, failed_cpu;
10839
10840
mutex_lock(&pmus_lock);
10841
for_each_possible_cpu(cpu) {
10842
err = swevent_hlist_get_cpu(cpu);
10843
if (err) {
10844
failed_cpu = cpu;
10845
goto fail;
10846
}
10847
}
10848
mutex_unlock(&pmus_lock);
10849
return 0;
10850
fail:
10851
for_each_possible_cpu(cpu) {
10852
if (cpu == failed_cpu)
10853
break;
10854
swevent_hlist_put_cpu(cpu);
10855
}
10856
mutex_unlock(&pmus_lock);
10857
return err;
10858
}
10859
10860
struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
10861
10862
static void sw_perf_event_destroy(struct perf_event *event)
10863
{
10864
u64 event_id = event->attr.config;
10865
10866
WARN_ON(event->parent);
10867
10868
static_key_slow_dec(&perf_swevent_enabled[event_id]);
10869
swevent_hlist_put();
10870
}
10871
10872
static struct pmu perf_cpu_clock; /* fwd declaration */
10873
static struct pmu perf_task_clock;
10874
10875
static int perf_swevent_init(struct perf_event *event)
10876
{
10877
u64 event_id = event->attr.config;
10878
10879
if (event->attr.type != PERF_TYPE_SOFTWARE)
10880
return -ENOENT;
10881
10882
/*
10883
* no branch sampling for software events
10884
*/
10885
if (has_branch_stack(event))
10886
return -EOPNOTSUPP;
10887
10888
switch (event_id) {
10889
case PERF_COUNT_SW_CPU_CLOCK:
10890
event->attr.type = perf_cpu_clock.type;
10891
return -ENOENT;
10892
case PERF_COUNT_SW_TASK_CLOCK:
10893
event->attr.type = perf_task_clock.type;
10894
return -ENOENT;
10895
10896
default:
10897
break;
10898
}
10899
10900
if (event_id >= PERF_COUNT_SW_MAX)
10901
return -ENOENT;
10902
10903
if (!event->parent) {
10904
int err;
10905
10906
err = swevent_hlist_get();
10907
if (err)
10908
return err;
10909
10910
static_key_slow_inc(&perf_swevent_enabled[event_id]);
10911
event->destroy = sw_perf_event_destroy;
10912
}
10913
10914
return 0;
10915
}
10916
10917
static struct pmu perf_swevent = {
10918
.task_ctx_nr = perf_sw_context,
10919
10920
.capabilities = PERF_PMU_CAP_NO_NMI,
10921
10922
.event_init = perf_swevent_init,
10923
.add = perf_swevent_add,
10924
.del = perf_swevent_del,
10925
.start = perf_swevent_start,
10926
.stop = perf_swevent_stop,
10927
.read = perf_swevent_read,
10928
};
10929
10930
#ifdef CONFIG_EVENT_TRACING
10931
10932
static void tp_perf_event_destroy(struct perf_event *event)
10933
{
10934
perf_trace_destroy(event);
10935
}
10936
10937
static int perf_tp_event_init(struct perf_event *event)
10938
{
10939
int err;
10940
10941
if (event->attr.type != PERF_TYPE_TRACEPOINT)
10942
return -ENOENT;
10943
10944
/*
10945
* no branch sampling for tracepoint events
10946
*/
10947
if (has_branch_stack(event))
10948
return -EOPNOTSUPP;
10949
10950
err = perf_trace_init(event);
10951
if (err)
10952
return err;
10953
10954
event->destroy = tp_perf_event_destroy;
10955
10956
return 0;
10957
}
10958
10959
static struct pmu perf_tracepoint = {
10960
.task_ctx_nr = perf_sw_context,
10961
10962
.event_init = perf_tp_event_init,
10963
.add = perf_trace_add,
10964
.del = perf_trace_del,
10965
.start = perf_swevent_start,
10966
.stop = perf_swevent_stop,
10967
.read = perf_swevent_read,
10968
};
10969
10970
static int perf_tp_filter_match(struct perf_event *event,
10971
struct perf_raw_record *raw)
10972
{
10973
void *record = raw->frag.data;
10974
10975
/* only top level events have filters set */
10976
if (event->parent)
10977
event = event->parent;
10978
10979
if (likely(!event->filter) || filter_match_preds(event->filter, record))
10980
return 1;
10981
return 0;
10982
}
10983
10984
static int perf_tp_event_match(struct perf_event *event,
10985
struct perf_raw_record *raw,
10986
struct pt_regs *regs)
10987
{
10988
if (event->hw.state & PERF_HES_STOPPED)
10989
return 0;
10990
/*
10991
* If exclude_kernel, only trace user-space tracepoints (uprobes)
10992
*/
10993
if (event->attr.exclude_kernel && !user_mode(regs))
10994
return 0;
10995
10996
if (!perf_tp_filter_match(event, raw))
10997
return 0;
10998
10999
return 1;
11000
}
11001
11002
void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
11003
struct trace_event_call *call, u64 count,
11004
struct pt_regs *regs, struct hlist_head *head,
11005
struct task_struct *task)
11006
{
11007
if (bpf_prog_array_valid(call)) {
11008
*(struct pt_regs **)raw_data = regs;
11009
if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
11010
perf_swevent_put_recursion_context(rctx);
11011
return;
11012
}
11013
}
11014
perf_tp_event(call->event.type, count, raw_data, size, regs, head,
11015
rctx, task);
11016
}
11017
EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
11018
11019
static void __perf_tp_event_target_task(u64 count, void *record,
11020
struct pt_regs *regs,
11021
struct perf_sample_data *data,
11022
struct perf_raw_record *raw,
11023
struct perf_event *event)
11024
{
11025
struct trace_entry *entry = record;
11026
11027
if (event->attr.config != entry->type)
11028
return;
11029
/* Cannot deliver synchronous signal to other task. */
11030
if (event->attr.sigtrap)
11031
return;
11032
if (perf_tp_event_match(event, raw, regs)) {
11033
perf_sample_data_init(data, 0, 0);
11034
perf_sample_save_raw_data(data, event, raw);
11035
perf_swevent_event(event, count, data, regs);
11036
}
11037
}
11038
11039
static void perf_tp_event_target_task(u64 count, void *record,
11040
struct pt_regs *regs,
11041
struct perf_sample_data *data,
11042
struct perf_raw_record *raw,
11043
struct perf_event_context *ctx)
11044
{
11045
unsigned int cpu = smp_processor_id();
11046
struct pmu *pmu = &perf_tracepoint;
11047
struct perf_event *event, *sibling;
11048
11049
perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) {
11050
__perf_tp_event_target_task(count, record, regs, data, raw, event);
11051
for_each_sibling_event(sibling, event)
11052
__perf_tp_event_target_task(count, record, regs, data, raw, sibling);
11053
}
11054
11055
perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) {
11056
__perf_tp_event_target_task(count, record, regs, data, raw, event);
11057
for_each_sibling_event(sibling, event)
11058
__perf_tp_event_target_task(count, record, regs, data, raw, sibling);
11059
}
11060
}
11061
11062
void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
11063
struct pt_regs *regs, struct hlist_head *head, int rctx,
11064
struct task_struct *task)
11065
{
11066
struct perf_sample_data data;
11067
struct perf_event *event;
11068
11069
struct perf_raw_record raw = {
11070
.frag = {
11071
.size = entry_size,
11072
.data = record,
11073
},
11074
};
11075
11076
perf_trace_buf_update(record, event_type);
11077
11078
hlist_for_each_entry_rcu(event, head, hlist_entry) {
11079
if (perf_tp_event_match(event, &raw, regs)) {
11080
/*
11081
* Here use the same on-stack perf_sample_data,
11082
* some members in data are event-specific and
11083
* need to be re-computed for different sweveents.
11084
* Re-initialize data->sample_flags safely to avoid
11085
* the problem that next event skips preparing data
11086
* because data->sample_flags is set.
11087
*/
11088
perf_sample_data_init(&data, 0, 0);
11089
perf_sample_save_raw_data(&data, event, &raw);
11090
perf_swevent_event(event, count, &data, regs);
11091
}
11092
}
11093
11094
/*
11095
* If we got specified a target task, also iterate its context and
11096
* deliver this event there too.
11097
*/
11098
if (task && task != current) {
11099
struct perf_event_context *ctx;
11100
11101
rcu_read_lock();
11102
ctx = rcu_dereference(task->perf_event_ctxp);
11103
if (!ctx)
11104
goto unlock;
11105
11106
raw_spin_lock(&ctx->lock);
11107
perf_tp_event_target_task(count, record, regs, &data, &raw, ctx);
11108
raw_spin_unlock(&ctx->lock);
11109
unlock:
11110
rcu_read_unlock();
11111
}
11112
11113
perf_swevent_put_recursion_context(rctx);
11114
}
11115
EXPORT_SYMBOL_GPL(perf_tp_event);
11116
11117
#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
11118
/*
11119
* Flags in config, used by dynamic PMU kprobe and uprobe
11120
* The flags should match following PMU_FORMAT_ATTR().
11121
*
11122
* PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
11123
* if not set, create kprobe/uprobe
11124
*
11125
* The following values specify a reference counter (or semaphore in the
11126
* terminology of tools like dtrace, systemtap, etc.) Userspace Statically
11127
* Defined Tracepoints (USDT). Currently, we use 40 bit for the offset.
11128
*
11129
* PERF_UPROBE_REF_CTR_OFFSET_BITS # of bits in config as th offset
11130
* PERF_UPROBE_REF_CTR_OFFSET_SHIFT # of bits to shift left
11131
*/
11132
enum perf_probe_config {
11133
PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */
11134
PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,
11135
PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,
11136
};
11137
11138
PMU_FORMAT_ATTR(retprobe, "config:0");
11139
#endif
11140
11141
#ifdef CONFIG_KPROBE_EVENTS
11142
static struct attribute *kprobe_attrs[] = {
11143
&format_attr_retprobe.attr,
11144
NULL,
11145
};
11146
11147
static struct attribute_group kprobe_format_group = {
11148
.name = "format",
11149
.attrs = kprobe_attrs,
11150
};
11151
11152
static const struct attribute_group *kprobe_attr_groups[] = {
11153
&kprobe_format_group,
11154
NULL,
11155
};
11156
11157
static int perf_kprobe_event_init(struct perf_event *event);
11158
static struct pmu perf_kprobe = {
11159
.task_ctx_nr = perf_sw_context,
11160
.event_init = perf_kprobe_event_init,
11161
.add = perf_trace_add,
11162
.del = perf_trace_del,
11163
.start = perf_swevent_start,
11164
.stop = perf_swevent_stop,
11165
.read = perf_swevent_read,
11166
.attr_groups = kprobe_attr_groups,
11167
};
11168
11169
static int perf_kprobe_event_init(struct perf_event *event)
11170
{
11171
int err;
11172
bool is_retprobe;
11173
11174
if (event->attr.type != perf_kprobe.type)
11175
return -ENOENT;
11176
11177
if (!perfmon_capable())
11178
return -EACCES;
11179
11180
/*
11181
* no branch sampling for probe events
11182
*/
11183
if (has_branch_stack(event))
11184
return -EOPNOTSUPP;
11185
11186
is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
11187
err = perf_kprobe_init(event, is_retprobe);
11188
if (err)
11189
return err;
11190
11191
event->destroy = perf_kprobe_destroy;
11192
11193
return 0;
11194
}
11195
#endif /* CONFIG_KPROBE_EVENTS */
11196
11197
#ifdef CONFIG_UPROBE_EVENTS
11198
PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63");
11199
11200
static struct attribute *uprobe_attrs[] = {
11201
&format_attr_retprobe.attr,
11202
&format_attr_ref_ctr_offset.attr,
11203
NULL,
11204
};
11205
11206
static struct attribute_group uprobe_format_group = {
11207
.name = "format",
11208
.attrs = uprobe_attrs,
11209
};
11210
11211
static const struct attribute_group *uprobe_attr_groups[] = {
11212
&uprobe_format_group,
11213
NULL,
11214
};
11215
11216
static int perf_uprobe_event_init(struct perf_event *event);
11217
static struct pmu perf_uprobe = {
11218
.task_ctx_nr = perf_sw_context,
11219
.event_init = perf_uprobe_event_init,
11220
.add = perf_trace_add,
11221
.del = perf_trace_del,
11222
.start = perf_swevent_start,
11223
.stop = perf_swevent_stop,
11224
.read = perf_swevent_read,
11225
.attr_groups = uprobe_attr_groups,
11226
};
11227
11228
static int perf_uprobe_event_init(struct perf_event *event)
11229
{
11230
int err;
11231
unsigned long ref_ctr_offset;
11232
bool is_retprobe;
11233
11234
if (event->attr.type != perf_uprobe.type)
11235
return -ENOENT;
11236
11237
if (!capable(CAP_SYS_ADMIN))
11238
return -EACCES;
11239
11240
/*
11241
* no branch sampling for probe events
11242
*/
11243
if (has_branch_stack(event))
11244
return -EOPNOTSUPP;
11245
11246
is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
11247
ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT;
11248
err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe);
11249
if (err)
11250
return err;
11251
11252
event->destroy = perf_uprobe_destroy;
11253
11254
return 0;
11255
}
11256
#endif /* CONFIG_UPROBE_EVENTS */
11257
11258
static inline void perf_tp_register(void)
11259
{
11260
perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
11261
#ifdef CONFIG_KPROBE_EVENTS
11262
perf_pmu_register(&perf_kprobe, "kprobe", -1);
11263
#endif
11264
#ifdef CONFIG_UPROBE_EVENTS
11265
perf_pmu_register(&perf_uprobe, "uprobe", -1);
11266
#endif
11267
}
11268
11269
static void perf_event_free_filter(struct perf_event *event)
11270
{
11271
ftrace_profile_free_filter(event);
11272
}
11273
11274
/*
11275
* returns true if the event is a tracepoint, or a kprobe/upprobe created
11276
* with perf_event_open()
11277
*/
11278
static inline bool perf_event_is_tracing(struct perf_event *event)
11279
{
11280
if (event->pmu == &perf_tracepoint)
11281
return true;
11282
#ifdef CONFIG_KPROBE_EVENTS
11283
if (event->pmu == &perf_kprobe)
11284
return true;
11285
#endif
11286
#ifdef CONFIG_UPROBE_EVENTS
11287
if (event->pmu == &perf_uprobe)
11288
return true;
11289
#endif
11290
return false;
11291
}
11292
11293
static int __perf_event_set_bpf_prog(struct perf_event *event,
11294
struct bpf_prog *prog,
11295
u64 bpf_cookie)
11296
{
11297
bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp;
11298
11299
if (event->state <= PERF_EVENT_STATE_REVOKED)
11300
return -ENODEV;
11301
11302
if (!perf_event_is_tracing(event))
11303
return perf_event_set_bpf_handler(event, prog, bpf_cookie);
11304
11305
is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE;
11306
is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE;
11307
is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
11308
is_syscall_tp = is_syscall_trace_event(event->tp_event);
11309
if (!is_kprobe && !is_uprobe && !is_tracepoint && !is_syscall_tp)
11310
/* bpf programs can only be attached to u/kprobe or tracepoint */
11311
return -EINVAL;
11312
11313
if (((is_kprobe || is_uprobe) && prog->type != BPF_PROG_TYPE_KPROBE) ||
11314
(is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
11315
(is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT))
11316
return -EINVAL;
11317
11318
if (prog->type == BPF_PROG_TYPE_KPROBE && prog->sleepable && !is_uprobe)
11319
/* only uprobe programs are allowed to be sleepable */
11320
return -EINVAL;
11321
11322
/* Kprobe override only works for kprobes, not uprobes. */
11323
if (prog->kprobe_override && !is_kprobe)
11324
return -EINVAL;
11325
11326
/* Writing to context allowed only for uprobes. */
11327
if (prog->aux->kprobe_write_ctx && !is_uprobe)
11328
return -EINVAL;
11329
11330
if (is_tracepoint || is_syscall_tp) {
11331
int off = trace_event_get_offsets(event->tp_event);
11332
11333
if (prog->aux->max_ctx_offset > off)
11334
return -EACCES;
11335
}
11336
11337
return perf_event_attach_bpf_prog(event, prog, bpf_cookie);
11338
}
11339
11340
int perf_event_set_bpf_prog(struct perf_event *event,
11341
struct bpf_prog *prog,
11342
u64 bpf_cookie)
11343
{
11344
struct perf_event_context *ctx;
11345
int ret;
11346
11347
ctx = perf_event_ctx_lock(event);
11348
ret = __perf_event_set_bpf_prog(event, prog, bpf_cookie);
11349
perf_event_ctx_unlock(event, ctx);
11350
11351
return ret;
11352
}
11353
11354
void perf_event_free_bpf_prog(struct perf_event *event)
11355
{
11356
if (!event->prog)
11357
return;
11358
11359
if (!perf_event_is_tracing(event)) {
11360
perf_event_free_bpf_handler(event);
11361
return;
11362
}
11363
perf_event_detach_bpf_prog(event);
11364
}
11365
11366
#else
11367
11368
static inline void perf_tp_register(void)
11369
{
11370
}
11371
11372
static void perf_event_free_filter(struct perf_event *event)
11373
{
11374
}
11375
11376
static int __perf_event_set_bpf_prog(struct perf_event *event,
11377
struct bpf_prog *prog,
11378
u64 bpf_cookie)
11379
{
11380
return -ENOENT;
11381
}
11382
11383
int perf_event_set_bpf_prog(struct perf_event *event,
11384
struct bpf_prog *prog,
11385
u64 bpf_cookie)
11386
{
11387
return -ENOENT;
11388
}
11389
11390
void perf_event_free_bpf_prog(struct perf_event *event)
11391
{
11392
}
11393
#endif /* CONFIG_EVENT_TRACING */
11394
11395
#ifdef CONFIG_HAVE_HW_BREAKPOINT
11396
void perf_bp_event(struct perf_event *bp, void *data)
11397
{
11398
struct perf_sample_data sample;
11399
struct pt_regs *regs = data;
11400
11401
perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
11402
11403
if (!bp->hw.state && !perf_exclude_event(bp, regs))
11404
perf_swevent_event(bp, 1, &sample, regs);
11405
}
11406
#endif
11407
11408
/*
11409
* Allocate a new address filter
11410
*/
11411
static struct perf_addr_filter *
11412
perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
11413
{
11414
int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
11415
struct perf_addr_filter *filter;
11416
11417
filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
11418
if (!filter)
11419
return NULL;
11420
11421
INIT_LIST_HEAD(&filter->entry);
11422
list_add_tail(&filter->entry, filters);
11423
11424
return filter;
11425
}
11426
11427
static void free_filters_list(struct list_head *filters)
11428
{
11429
struct perf_addr_filter *filter, *iter;
11430
11431
list_for_each_entry_safe(filter, iter, filters, entry) {
11432
path_put(&filter->path);
11433
list_del(&filter->entry);
11434
kfree(filter);
11435
}
11436
}
11437
11438
/*
11439
* Free existing address filters and optionally install new ones
11440
*/
11441
static void perf_addr_filters_splice(struct perf_event *event,
11442
struct list_head *head)
11443
{
11444
unsigned long flags;
11445
LIST_HEAD(list);
11446
11447
if (!has_addr_filter(event))
11448
return;
11449
11450
/* don't bother with children, they don't have their own filters */
11451
if (event->parent)
11452
return;
11453
11454
raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
11455
11456
list_splice_init(&event->addr_filters.list, &list);
11457
if (head)
11458
list_splice(head, &event->addr_filters.list);
11459
11460
raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
11461
11462
free_filters_list(&list);
11463
}
11464
11465
static void perf_free_addr_filters(struct perf_event *event)
11466
{
11467
/*
11468
* Used during free paths, there is no concurrency.
11469
*/
11470
if (list_empty(&event->addr_filters.list))
11471
return;
11472
11473
perf_addr_filters_splice(event, NULL);
11474
}
11475
11476
/*
11477
* Scan through mm's vmas and see if one of them matches the
11478
* @filter; if so, adjust filter's address range.
11479
* Called with mm::mmap_lock down for reading.
11480
*/
11481
static void perf_addr_filter_apply(struct perf_addr_filter *filter,
11482
struct mm_struct *mm,
11483
struct perf_addr_filter_range *fr)
11484
{
11485
struct vm_area_struct *vma;
11486
VMA_ITERATOR(vmi, mm, 0);
11487
11488
for_each_vma(vmi, vma) {
11489
if (!vma->vm_file)
11490
continue;
11491
11492
if (perf_addr_filter_vma_adjust(filter, vma, fr))
11493
return;
11494
}
11495
}
11496
11497
/*
11498
* Update event's address range filters based on the
11499
* task's existing mappings, if any.
11500
*/
11501
static void perf_event_addr_filters_apply(struct perf_event *event)
11502
{
11503
struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
11504
struct task_struct *task = READ_ONCE(event->ctx->task);
11505
struct perf_addr_filter *filter;
11506
struct mm_struct *mm = NULL;
11507
unsigned int count = 0;
11508
unsigned long flags;
11509
11510
/*
11511
* We may observe TASK_TOMBSTONE, which means that the event tear-down
11512
* will stop on the parent's child_mutex that our caller is also holding
11513
*/
11514
if (task == TASK_TOMBSTONE)
11515
return;
11516
11517
if (ifh->nr_file_filters) {
11518
mm = get_task_mm(task);
11519
if (!mm)
11520
goto restart;
11521
11522
mmap_read_lock(mm);
11523
}
11524
11525
raw_spin_lock_irqsave(&ifh->lock, flags);
11526
list_for_each_entry(filter, &ifh->list, entry) {
11527
if (filter->path.dentry) {
11528
/*
11529
* Adjust base offset if the filter is associated to a
11530
* binary that needs to be mapped:
11531
*/
11532
event->addr_filter_ranges[count].start = 0;
11533
event->addr_filter_ranges[count].size = 0;
11534
11535
perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);
11536
} else {
11537
event->addr_filter_ranges[count].start = filter->offset;
11538
event->addr_filter_ranges[count].size = filter->size;
11539
}
11540
11541
count++;
11542
}
11543
11544
event->addr_filters_gen++;
11545
raw_spin_unlock_irqrestore(&ifh->lock, flags);
11546
11547
if (ifh->nr_file_filters) {
11548
mmap_read_unlock(mm);
11549
11550
mmput(mm);
11551
}
11552
11553
restart:
11554
perf_event_stop(event, 1);
11555
}
11556
11557
/*
11558
* Address range filtering: limiting the data to certain
11559
* instruction address ranges. Filters are ioctl()ed to us from
11560
* userspace as ascii strings.
11561
*
11562
* Filter string format:
11563
*
11564
* ACTION RANGE_SPEC
11565
* where ACTION is one of the
11566
* * "filter": limit the trace to this region
11567
* * "start": start tracing from this address
11568
* * "stop": stop tracing at this address/region;
11569
* RANGE_SPEC is
11570
* * for kernel addresses: <start address>[/<size>]
11571
* * for object files: <start address>[/<size>]@</path/to/object/file>
11572
*
11573
* if <size> is not specified or is zero, the range is treated as a single
11574
* address; not valid for ACTION=="filter".
11575
*/
11576
enum {
11577
IF_ACT_NONE = -1,
11578
IF_ACT_FILTER,
11579
IF_ACT_START,
11580
IF_ACT_STOP,
11581
IF_SRC_FILE,
11582
IF_SRC_KERNEL,
11583
IF_SRC_FILEADDR,
11584
IF_SRC_KERNELADDR,
11585
};
11586
11587
enum {
11588
IF_STATE_ACTION = 0,
11589
IF_STATE_SOURCE,
11590
IF_STATE_END,
11591
};
11592
11593
static const match_table_t if_tokens = {
11594
{ IF_ACT_FILTER, "filter" },
11595
{ IF_ACT_START, "start" },
11596
{ IF_ACT_STOP, "stop" },
11597
{ IF_SRC_FILE, "%u/%u@%s" },
11598
{ IF_SRC_KERNEL, "%u/%u" },
11599
{ IF_SRC_FILEADDR, "%u@%s" },
11600
{ IF_SRC_KERNELADDR, "%u" },
11601
{ IF_ACT_NONE, NULL },
11602
};
11603
11604
/*
11605
* Address filter string parser
11606
*/
11607
static int
11608
perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
11609
struct list_head *filters)
11610
{
11611
struct perf_addr_filter *filter = NULL;
11612
char *start, *orig, *filename = NULL;
11613
substring_t args[MAX_OPT_ARGS];
11614
int state = IF_STATE_ACTION, token;
11615
unsigned int kernel = 0;
11616
int ret = -EINVAL;
11617
11618
orig = fstr = kstrdup(fstr, GFP_KERNEL);
11619
if (!fstr)
11620
return -ENOMEM;
11621
11622
while ((start = strsep(&fstr, " ,\n")) != NULL) {
11623
static const enum perf_addr_filter_action_t actions[] = {
11624
[IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER,
11625
[IF_ACT_START] = PERF_ADDR_FILTER_ACTION_START,
11626
[IF_ACT_STOP] = PERF_ADDR_FILTER_ACTION_STOP,
11627
};
11628
ret = -EINVAL;
11629
11630
if (!*start)
11631
continue;
11632
11633
/* filter definition begins */
11634
if (state == IF_STATE_ACTION) {
11635
filter = perf_addr_filter_new(event, filters);
11636
if (!filter)
11637
goto fail;
11638
}
11639
11640
token = match_token(start, if_tokens, args);
11641
switch (token) {
11642
case IF_ACT_FILTER:
11643
case IF_ACT_START:
11644
case IF_ACT_STOP:
11645
if (state != IF_STATE_ACTION)
11646
goto fail;
11647
11648
filter->action = actions[token];
11649
state = IF_STATE_SOURCE;
11650
break;
11651
11652
case IF_SRC_KERNELADDR:
11653
case IF_SRC_KERNEL:
11654
kernel = 1;
11655
fallthrough;
11656
11657
case IF_SRC_FILEADDR:
11658
case IF_SRC_FILE:
11659
if (state != IF_STATE_SOURCE)
11660
goto fail;
11661
11662
*args[0].to = 0;
11663
ret = kstrtoul(args[0].from, 0, &filter->offset);
11664
if (ret)
11665
goto fail;
11666
11667
if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) {
11668
*args[1].to = 0;
11669
ret = kstrtoul(args[1].from, 0, &filter->size);
11670
if (ret)
11671
goto fail;
11672
}
11673
11674
if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
11675
int fpos = token == IF_SRC_FILE ? 2 : 1;
11676
11677
kfree(filename);
11678
filename = match_strdup(&args[fpos]);
11679
if (!filename) {
11680
ret = -ENOMEM;
11681
goto fail;
11682
}
11683
}
11684
11685
state = IF_STATE_END;
11686
break;
11687
11688
default:
11689
goto fail;
11690
}
11691
11692
/*
11693
* Filter definition is fully parsed, validate and install it.
11694
* Make sure that it doesn't contradict itself or the event's
11695
* attribute.
11696
*/
11697
if (state == IF_STATE_END) {
11698
ret = -EINVAL;
11699
11700
/*
11701
* ACTION "filter" must have a non-zero length region
11702
* specified.
11703
*/
11704
if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER &&
11705
!filter->size)
11706
goto fail;
11707
11708
if (!kernel) {
11709
if (!filename)
11710
goto fail;
11711
11712
/*
11713
* For now, we only support file-based filters
11714
* in per-task events; doing so for CPU-wide
11715
* events requires additional context switching
11716
* trickery, since same object code will be
11717
* mapped at different virtual addresses in
11718
* different processes.
11719
*/
11720
ret = -EOPNOTSUPP;
11721
if (!event->ctx->task)
11722
goto fail;
11723
11724
/* look up the path and grab its inode */
11725
ret = kern_path(filename, LOOKUP_FOLLOW,
11726
&filter->path);
11727
if (ret)
11728
goto fail;
11729
11730
ret = -EINVAL;
11731
if (!filter->path.dentry ||
11732
!S_ISREG(d_inode(filter->path.dentry)
11733
->i_mode))
11734
goto fail;
11735
11736
event->addr_filters.nr_file_filters++;
11737
}
11738
11739
/* ready to consume more filters */
11740
kfree(filename);
11741
filename = NULL;
11742
state = IF_STATE_ACTION;
11743
filter = NULL;
11744
kernel = 0;
11745
}
11746
}
11747
11748
if (state != IF_STATE_ACTION)
11749
goto fail;
11750
11751
kfree(filename);
11752
kfree(orig);
11753
11754
return 0;
11755
11756
fail:
11757
kfree(filename);
11758
free_filters_list(filters);
11759
kfree(orig);
11760
11761
return ret;
11762
}
11763
11764
static int
11765
perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
11766
{
11767
LIST_HEAD(filters);
11768
int ret;
11769
11770
/*
11771
* Since this is called in perf_ioctl() path, we're already holding
11772
* ctx::mutex.
11773
*/
11774
lockdep_assert_held(&event->ctx->mutex);
11775
11776
if (WARN_ON_ONCE(event->parent))
11777
return -EINVAL;
11778
11779
ret = perf_event_parse_addr_filter(event, filter_str, &filters);
11780
if (ret)
11781
goto fail_clear_files;
11782
11783
ret = event->pmu->addr_filters_validate(&filters);
11784
if (ret)
11785
goto fail_free_filters;
11786
11787
/* remove existing filters, if any */
11788
perf_addr_filters_splice(event, &filters);
11789
11790
/* install new filters */
11791
perf_event_for_each_child(event, perf_event_addr_filters_apply);
11792
11793
return ret;
11794
11795
fail_free_filters:
11796
free_filters_list(&filters);
11797
11798
fail_clear_files:
11799
event->addr_filters.nr_file_filters = 0;
11800
11801
return ret;
11802
}
11803
11804
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
11805
{
11806
int ret = -EINVAL;
11807
char *filter_str;
11808
11809
filter_str = strndup_user(arg, PAGE_SIZE);
11810
if (IS_ERR(filter_str))
11811
return PTR_ERR(filter_str);
11812
11813
#ifdef CONFIG_EVENT_TRACING
11814
if (perf_event_is_tracing(event)) {
11815
struct perf_event_context *ctx = event->ctx;
11816
11817
/*
11818
* Beware, here be dragons!!
11819
*
11820
* the tracepoint muck will deadlock against ctx->mutex, but
11821
* the tracepoint stuff does not actually need it. So
11822
* temporarily drop ctx->mutex. As per perf_event_ctx_lock() we
11823
* already have a reference on ctx.
11824
*
11825
* This can result in event getting moved to a different ctx,
11826
* but that does not affect the tracepoint state.
11827
*/
11828
mutex_unlock(&ctx->mutex);
11829
ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
11830
mutex_lock(&ctx->mutex);
11831
} else
11832
#endif
11833
if (has_addr_filter(event))
11834
ret = perf_event_set_addr_filter(event, filter_str);
11835
11836
kfree(filter_str);
11837
return ret;
11838
}
11839
11840
/*
11841
* hrtimer based swevent callback
11842
*/
11843
11844
static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
11845
{
11846
enum hrtimer_restart ret = HRTIMER_RESTART;
11847
struct perf_sample_data data;
11848
struct pt_regs *regs;
11849
struct perf_event *event;
11850
u64 period;
11851
11852
event = container_of(hrtimer, struct perf_event, hw.hrtimer);
11853
11854
if (event->state != PERF_EVENT_STATE_ACTIVE ||
11855
event->hw.state & PERF_HES_STOPPED)
11856
return HRTIMER_NORESTART;
11857
11858
event->pmu->read(event);
11859
11860
perf_sample_data_init(&data, 0, event->hw.last_period);
11861
regs = get_irq_regs();
11862
11863
if (regs && !perf_exclude_event(event, regs)) {
11864
if (!(event->attr.exclude_idle && is_idle_task(current)))
11865
if (__perf_event_overflow(event, 1, &data, regs))
11866
ret = HRTIMER_NORESTART;
11867
}
11868
11869
period = max_t(u64, 10000, event->hw.sample_period);
11870
hrtimer_forward_now(hrtimer, ns_to_ktime(period));
11871
11872
return ret;
11873
}
11874
11875
static void perf_swevent_start_hrtimer(struct perf_event *event)
11876
{
11877
struct hw_perf_event *hwc = &event->hw;
11878
s64 period;
11879
11880
if (!is_sampling_event(event))
11881
return;
11882
11883
period = local64_read(&hwc->period_left);
11884
if (period) {
11885
if (period < 0)
11886
period = 10000;
11887
11888
local64_set(&hwc->period_left, 0);
11889
} else {
11890
period = max_t(u64, 10000, hwc->sample_period);
11891
}
11892
hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
11893
HRTIMER_MODE_REL_PINNED_HARD);
11894
}
11895
11896
static void perf_swevent_cancel_hrtimer(struct perf_event *event)
11897
{
11898
struct hw_perf_event *hwc = &event->hw;
11899
11900
/*
11901
* Careful: this function can be triggered in the hrtimer handler,
11902
* for cpu-clock events, so hrtimer_cancel() would cause a
11903
* deadlock.
11904
*
11905
* So use hrtimer_try_to_cancel() to try to stop the hrtimer,
11906
* and the cpu-clock handler also sets the PERF_HES_STOPPED flag,
11907
* which guarantees that perf_swevent_hrtimer() will stop the
11908
* hrtimer once it sees the PERF_HES_STOPPED flag.
11909
*/
11910
if (is_sampling_event(event) && (hwc->interrupts != MAX_INTERRUPTS)) {
11911
ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
11912
local64_set(&hwc->period_left, ktime_to_ns(remaining));
11913
11914
hrtimer_try_to_cancel(&hwc->hrtimer);
11915
}
11916
}
11917
11918
static void perf_swevent_destroy_hrtimer(struct perf_event *event)
11919
{
11920
hrtimer_cancel(&event->hw.hrtimer);
11921
}
11922
11923
static void perf_swevent_init_hrtimer(struct perf_event *event)
11924
{
11925
struct hw_perf_event *hwc = &event->hw;
11926
11927
if (!is_sampling_event(event))
11928
return;
11929
11930
hrtimer_setup(&hwc->hrtimer, perf_swevent_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
11931
event->destroy = perf_swevent_destroy_hrtimer;
11932
11933
/*
11934
* Since hrtimers have a fixed rate, we can do a static freq->period
11935
* mapping and avoid the whole period adjust feedback stuff.
11936
*/
11937
if (event->attr.freq) {
11938
long freq = event->attr.sample_freq;
11939
11940
event->attr.sample_period = NSEC_PER_SEC / freq;
11941
hwc->sample_period = event->attr.sample_period;
11942
local64_set(&hwc->period_left, hwc->sample_period);
11943
hwc->last_period = hwc->sample_period;
11944
event->attr.freq = 0;
11945
}
11946
}
11947
11948
/*
11949
* Software event: cpu wall time clock
11950
*/
11951
11952
static void cpu_clock_event_update(struct perf_event *event)
11953
{
11954
s64 prev;
11955
u64 now;
11956
11957
now = local_clock();
11958
prev = local64_xchg(&event->hw.prev_count, now);
11959
local64_add(now - prev, &event->count);
11960
}
11961
11962
static void cpu_clock_event_start(struct perf_event *event, int flags)
11963
{
11964
event->hw.state = 0;
11965
local64_set(&event->hw.prev_count, local_clock());
11966
perf_swevent_start_hrtimer(event);
11967
}
11968
11969
static void cpu_clock_event_stop(struct perf_event *event, int flags)
11970
{
11971
event->hw.state = PERF_HES_STOPPED;
11972
perf_swevent_cancel_hrtimer(event);
11973
if (flags & PERF_EF_UPDATE)
11974
cpu_clock_event_update(event);
11975
}
11976
11977
static int cpu_clock_event_add(struct perf_event *event, int flags)
11978
{
11979
if (flags & PERF_EF_START)
11980
cpu_clock_event_start(event, flags);
11981
perf_event_update_userpage(event);
11982
11983
return 0;
11984
}
11985
11986
static void cpu_clock_event_del(struct perf_event *event, int flags)
11987
{
11988
cpu_clock_event_stop(event, PERF_EF_UPDATE);
11989
}
11990
11991
static void cpu_clock_event_read(struct perf_event *event)
11992
{
11993
cpu_clock_event_update(event);
11994
}
11995
11996
static int cpu_clock_event_init(struct perf_event *event)
11997
{
11998
if (event->attr.type != perf_cpu_clock.type)
11999
return -ENOENT;
12000
12001
if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
12002
return -ENOENT;
12003
12004
/*
12005
* no branch sampling for software events
12006
*/
12007
if (has_branch_stack(event))
12008
return -EOPNOTSUPP;
12009
12010
perf_swevent_init_hrtimer(event);
12011
12012
return 0;
12013
}
12014
12015
static struct pmu perf_cpu_clock = {
12016
.task_ctx_nr = perf_sw_context,
12017
12018
.capabilities = PERF_PMU_CAP_NO_NMI,
12019
.dev = PMU_NULL_DEV,
12020
12021
.event_init = cpu_clock_event_init,
12022
.add = cpu_clock_event_add,
12023
.del = cpu_clock_event_del,
12024
.start = cpu_clock_event_start,
12025
.stop = cpu_clock_event_stop,
12026
.read = cpu_clock_event_read,
12027
};
12028
12029
/*
12030
* Software event: task time clock
12031
*/
12032
12033
static void task_clock_event_update(struct perf_event *event, u64 now)
12034
{
12035
u64 prev;
12036
s64 delta;
12037
12038
prev = local64_xchg(&event->hw.prev_count, now);
12039
delta = now - prev;
12040
local64_add(delta, &event->count);
12041
}
12042
12043
static void task_clock_event_start(struct perf_event *event, int flags)
12044
{
12045
event->hw.state = 0;
12046
local64_set(&event->hw.prev_count, event->ctx->time);
12047
perf_swevent_start_hrtimer(event);
12048
}
12049
12050
static void task_clock_event_stop(struct perf_event *event, int flags)
12051
{
12052
event->hw.state = PERF_HES_STOPPED;
12053
perf_swevent_cancel_hrtimer(event);
12054
if (flags & PERF_EF_UPDATE)
12055
task_clock_event_update(event, event->ctx->time);
12056
}
12057
12058
static int task_clock_event_add(struct perf_event *event, int flags)
12059
{
12060
if (flags & PERF_EF_START)
12061
task_clock_event_start(event, flags);
12062
perf_event_update_userpage(event);
12063
12064
return 0;
12065
}
12066
12067
static void task_clock_event_del(struct perf_event *event, int flags)
12068
{
12069
task_clock_event_stop(event, PERF_EF_UPDATE);
12070
}
12071
12072
static void task_clock_event_read(struct perf_event *event)
12073
{
12074
u64 now = perf_clock();
12075
u64 delta = now - event->ctx->timestamp;
12076
u64 time = event->ctx->time + delta;
12077
12078
task_clock_event_update(event, time);
12079
}
12080
12081
static int task_clock_event_init(struct perf_event *event)
12082
{
12083
if (event->attr.type != perf_task_clock.type)
12084
return -ENOENT;
12085
12086
if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
12087
return -ENOENT;
12088
12089
/*
12090
* no branch sampling for software events
12091
*/
12092
if (has_branch_stack(event))
12093
return -EOPNOTSUPP;
12094
12095
perf_swevent_init_hrtimer(event);
12096
12097
return 0;
12098
}
12099
12100
static struct pmu perf_task_clock = {
12101
.task_ctx_nr = perf_sw_context,
12102
12103
.capabilities = PERF_PMU_CAP_NO_NMI,
12104
.dev = PMU_NULL_DEV,
12105
12106
.event_init = task_clock_event_init,
12107
.add = task_clock_event_add,
12108
.del = task_clock_event_del,
12109
.start = task_clock_event_start,
12110
.stop = task_clock_event_stop,
12111
.read = task_clock_event_read,
12112
};
12113
12114
static void perf_pmu_nop_void(struct pmu *pmu)
12115
{
12116
}
12117
12118
static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
12119
{
12120
}
12121
12122
static int perf_pmu_nop_int(struct pmu *pmu)
12123
{
12124
return 0;
12125
}
12126
12127
static int perf_event_nop_int(struct perf_event *event, u64 value)
12128
{
12129
return 0;
12130
}
12131
12132
static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
12133
12134
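/*
 * Note (added): these are the default transaction helpers, installed by
 * perf_pmu_register() when a PMU provides pmu_enable()/pmu_disable() but
 * no transaction callbacks of its own. A TXN_ADD transaction simply
 * brackets the add()s with pmu_disable()/pmu_enable() so the hardware is
 * reprogrammed once; other transaction types (e.g. TXN_READ) pass
 * straight through.
 */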
static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
12135
{
12136
__this_cpu_write(nop_txn_flags, flags);
12137
12138
if (flags & ~PERF_PMU_TXN_ADD)
12139
return;
12140
12141
perf_pmu_disable(pmu);
12142
}
12143
12144
static int perf_pmu_commit_txn(struct pmu *pmu)
12145
{
12146
unsigned int flags = __this_cpu_read(nop_txn_flags);
12147
12148
__this_cpu_write(nop_txn_flags, 0);
12149
12150
if (flags & ~PERF_PMU_TXN_ADD)
12151
return 0;
12152
12153
perf_pmu_enable(pmu);
12154
return 0;
12155
}
12156
12157
static void perf_pmu_cancel_txn(struct pmu *pmu)
12158
{
12159
unsigned int flags = __this_cpu_read(nop_txn_flags);
12160
12161
__this_cpu_write(nop_txn_flags, 0);
12162
12163
if (flags & ~PERF_PMU_TXN_ADD)
12164
return;
12165
12166
perf_pmu_enable(pmu);
12167
}
12168
12169
static int perf_event_idx_default(struct perf_event *event)
12170
{
12171
return 0;
12172
}
12173
12174
/*
12175
* Let userspace know that this PMU supports address range filtering:
12176
*/
12177
static ssize_t nr_addr_filters_show(struct device *dev,
12178
struct device_attribute *attr,
12179
char *page)
12180
{
12181
struct pmu *pmu = dev_get_drvdata(dev);
12182
12183
return sysfs_emit(page, "%d\n", pmu->nr_addr_filters);
12184
}
12185
DEVICE_ATTR_RO(nr_addr_filters);
12186
12187
static struct idr pmu_idr;
12188
12189
static ssize_t
12190
type_show(struct device *dev, struct device_attribute *attr, char *page)
12191
{
12192
struct pmu *pmu = dev_get_drvdata(dev);
12193
12194
return sysfs_emit(page, "%d\n", pmu->type);
12195
}
12196
static DEVICE_ATTR_RO(type);
12197
12198
static ssize_t
12199
perf_event_mux_interval_ms_show(struct device *dev,
12200
struct device_attribute *attr,
12201
char *page)
12202
{
12203
struct pmu *pmu = dev_get_drvdata(dev);
12204
12205
return sysfs_emit(page, "%d\n", pmu->hrtimer_interval_ms);
12206
}
12207
12208
static DEFINE_MUTEX(mux_interval_mutex);
12209
12210
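/*
 * Illustrative usage (assuming an x86 "cpu" core PMU):
 *   echo 2 > /sys/bus/event_source/devices/cpu/perf_event_mux_interval_ms
 * shortens the multiplexing interval of that PMU to 2ms on all online
 * CPUs via the store handler below.
 */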
static ssize_t
12211
perf_event_mux_interval_ms_store(struct device *dev,
12212
struct device_attribute *attr,
12213
const char *buf, size_t count)
12214
{
12215
struct pmu *pmu = dev_get_drvdata(dev);
12216
int timer, cpu, ret;
12217
12218
ret = kstrtoint(buf, 0, &timer);
12219
if (ret)
12220
return ret;
12221
12222
if (timer < 1)
12223
return -EINVAL;
12224
12225
/* same value, nothing to do */
12226
if (timer == pmu->hrtimer_interval_ms)
12227
return count;
12228
12229
mutex_lock(&mux_interval_mutex);
12230
pmu->hrtimer_interval_ms = timer;
12231
12232
/* update all cpuctx for this PMU */
12233
cpus_read_lock();
12234
for_each_online_cpu(cpu) {
12235
struct perf_cpu_pmu_context *cpc;
12236
cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu);
12237
cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
12238
12239
cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc);
12240
}
12241
cpus_read_unlock();
12242
mutex_unlock(&mux_interval_mutex);
12243
12244
return count;
12245
}
12246
static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
12247
12248
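/*
 * Note (added): map a PMU scope to the topology cpumask that @cpu
 * belongs to; e.g. PERF_PMU_SCOPE_CORE yields the SMT sibling mask of
 * @cpu, while PERF_PMU_SCOPE_SYS_WIDE covers all online CPUs.
 */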
static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu)
12249
{
12250
switch (scope) {
12251
case PERF_PMU_SCOPE_CORE:
12252
return topology_sibling_cpumask(cpu);
12253
case PERF_PMU_SCOPE_DIE:
12254
return topology_die_cpumask(cpu);
12255
case PERF_PMU_SCOPE_CLUSTER:
12256
return topology_cluster_cpumask(cpu);
12257
case PERF_PMU_SCOPE_PKG:
12258
return topology_core_cpumask(cpu);
12259
case PERF_PMU_SCOPE_SYS_WIDE:
12260
return cpu_online_mask;
12261
}
12262
12263
return NULL;
12264
}
12265
12266
static inline struct cpumask *perf_scope_cpumask(unsigned int scope)
12267
{
12268
switch (scope) {
12269
case PERF_PMU_SCOPE_CORE:
12270
return perf_online_core_mask;
12271
case PERF_PMU_SCOPE_DIE:
12272
return perf_online_die_mask;
12273
case PERF_PMU_SCOPE_CLUSTER:
12274
return perf_online_cluster_mask;
12275
case PERF_PMU_SCOPE_PKG:
12276
return perf_online_pkg_mask;
12277
case PERF_PMU_SCOPE_SYS_WIDE:
12278
return perf_online_sys_mask;
12279
}
12280
12281
return NULL;
12282
}
12283
12284
static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
12285
char *buf)
12286
{
12287
struct pmu *pmu = dev_get_drvdata(dev);
12288
struct cpumask *mask = perf_scope_cpumask(pmu->scope);
12289
12290
if (mask)
12291
return cpumap_print_to_pagebuf(true, buf, mask);
12292
return 0;
12293
}
12294
12295
static DEVICE_ATTR_RO(cpumask);
12296
12297
static struct attribute *pmu_dev_attrs[] = {
	&dev_attr_type.attr,
	&dev_attr_perf_event_mux_interval_ms.attr,
	&dev_attr_nr_addr_filters.attr,
	&dev_attr_cpumask.attr,
	NULL,
};
12304
12305
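/*
 * Note (added): attribute visibility follows the pmu_dev_attrs[] order
 * above: index 2 (nr_addr_filters) is shown only when the PMU has
 * address filters, index 3 (cpumask) only when the PMU declares a scope.
 */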
static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int n)
12306
{
12307
struct device *dev = kobj_to_dev(kobj);
12308
struct pmu *pmu = dev_get_drvdata(dev);
12309
12310
if (n == 2 && !pmu->nr_addr_filters)
12311
return 0;
12312
12313
/* cpumask */
12314
if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE)
12315
return 0;
12316
12317
return a->mode;
12318
}
12319
12320
static struct attribute_group pmu_dev_attr_group = {
12321
.is_visible = pmu_dev_is_visible,
12322
.attrs = pmu_dev_attrs,
12323
};
12324
12325
static const struct attribute_group *pmu_dev_groups[] = {
12326
&pmu_dev_attr_group,
12327
NULL,
12328
};
12329
12330
static int pmu_bus_running;
static const struct bus_type pmu_bus = {
	.name		= "event_source",
	.dev_groups	= pmu_dev_groups,
};
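/*
 * Note (added): PMUs registered while this bus is up show up under
 * /sys/bus/event_source/devices/<name>/, which is where the attributes
 * above (type, perf_event_mux_interval_ms, nr_addr_filters, cpumask)
 * are exposed.
 */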
12335
12336
static void pmu_dev_release(struct device *dev)
12337
{
12338
kfree(dev);
12339
}
12340
12341
static int pmu_dev_alloc(struct pmu *pmu)
12342
{
12343
int ret = -ENOMEM;
12344
12345
pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
12346
if (!pmu->dev)
12347
goto out;
12348
12349
pmu->dev->groups = pmu->attr_groups;
12350
device_initialize(pmu->dev);
12351
12352
dev_set_drvdata(pmu->dev, pmu);
12353
pmu->dev->bus = &pmu_bus;
12354
pmu->dev->parent = pmu->parent;
12355
pmu->dev->release = pmu_dev_release;
12356
12357
ret = dev_set_name(pmu->dev, "%s", pmu->name);
12358
if (ret)
12359
goto free_dev;
12360
12361
ret = device_add(pmu->dev);
12362
if (ret)
12363
goto free_dev;
12364
12365
if (pmu->attr_update) {
12366
ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
12367
if (ret)
12368
goto del_dev;
12369
}
12370
12371
out:
12372
return ret;
12373
12374
del_dev:
12375
device_del(pmu->dev);
12376
12377
free_dev:
12378
put_device(pmu->dev);
12379
pmu->dev = NULL;
12380
goto out;
12381
}
12382
12383
static struct lock_class_key cpuctx_mutex;
12384
static struct lock_class_key cpuctx_lock;
12385
12386
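/*
 * Note (added): replace @old with @new at @id only if the slot still
 * holds @old, mirroring a cmpxchg() on the pmu_idr entry. Both callers
 * hold pmus_lock, which serializes the lookup and the replace; this is
 * how a PMU is published to / unpublished from perf_init_event().
 */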
static bool idr_cmpxchg(struct idr *idr, unsigned long id, void *old, void *new)
12387
{
12388
void *tmp, *val = idr_find(idr, id);
12389
12390
if (val != old)
12391
return false;
12392
12393
tmp = idr_replace(idr, new, id);
12394
if (IS_ERR(tmp))
12395
return false;
12396
12397
WARN_ON_ONCE(tmp != val);
12398
return true;
12399
}
12400
12401
static void perf_pmu_free(struct pmu *pmu)
12402
{
12403
if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) {
12404
if (pmu->nr_addr_filters)
12405
device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
12406
device_del(pmu->dev);
12407
put_device(pmu->dev);
12408
}
12409
12410
if (pmu->cpu_pmu_context) {
12411
int cpu;
12412
12413
for_each_possible_cpu(cpu) {
12414
struct perf_cpu_pmu_context *cpc;
12415
12416
cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu);
12417
if (!cpc)
12418
continue;
12419
if (cpc->epc.embedded) {
12420
/* refcount managed */
12421
put_pmu_ctx(&cpc->epc);
12422
continue;
12423
}
12424
kfree(cpc);
12425
}
12426
free_percpu(pmu->cpu_pmu_context);
12427
}
12428
}
12429
12430
DEFINE_FREE(pmu_unregister, struct pmu *, if (_T) perf_pmu_free(_T))
12431
12432
int perf_pmu_register(struct pmu *_pmu, const char *name, int type)
12433
{
12434
int cpu, max = PERF_TYPE_MAX;
12435
12436
struct pmu *pmu __free(pmu_unregister) = _pmu;
12437
guard(mutex)(&pmus_lock);
12438
12439
if (WARN_ONCE(!name, "Can not register anonymous pmu.\n"))
12440
return -EINVAL;
12441
12442
if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE,
12443
"Can not register a pmu with an invalid scope.\n"))
12444
return -EINVAL;
12445
12446
pmu->name = name;
12447
12448
if (type >= 0)
12449
max = type;
12450
12451
CLASS(idr_alloc, pmu_type)(&pmu_idr, NULL, max, 0, GFP_KERNEL);
12452
if (pmu_type.id < 0)
12453
return pmu_type.id;
12454
12455
WARN_ON(type >= 0 && pmu_type.id != type);
12456
12457
pmu->type = pmu_type.id;
12458
atomic_set(&pmu->exclusive_cnt, 0);
12459
12460
if (pmu_bus_running && !pmu->dev) {
12461
int ret = pmu_dev_alloc(pmu);
12462
if (ret)
12463
return ret;
12464
}
12465
12466
pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context *);
12467
if (!pmu->cpu_pmu_context)
12468
return -ENOMEM;
12469
12470
for_each_possible_cpu(cpu) {
12471
struct perf_cpu_pmu_context *cpc =
12472
kmalloc_node(sizeof(struct perf_cpu_pmu_context),
12473
GFP_KERNEL | __GFP_ZERO,
12474
cpu_to_node(cpu));
12475
12476
if (!cpc)
12477
return -ENOMEM;
12478
12479
*per_cpu_ptr(pmu->cpu_pmu_context, cpu) = cpc;
12480
__perf_init_event_pmu_context(&cpc->epc, pmu);
12481
__perf_mux_hrtimer_init(cpc, cpu);
12482
}
12483
12484
if (!pmu->start_txn) {
12485
if (pmu->pmu_enable) {
12486
/*
12487
* If we have pmu_enable/pmu_disable calls, install
12488
* transaction stubs that use them to try to batch
12489
* hardware accesses.
12490
*/
12491
pmu->start_txn = perf_pmu_start_txn;
12492
pmu->commit_txn = perf_pmu_commit_txn;
12493
pmu->cancel_txn = perf_pmu_cancel_txn;
12494
} else {
12495
pmu->start_txn = perf_pmu_nop_txn;
12496
pmu->commit_txn = perf_pmu_nop_int;
12497
pmu->cancel_txn = perf_pmu_nop_void;
12498
}
12499
}
12500
12501
if (!pmu->pmu_enable) {
12502
pmu->pmu_enable = perf_pmu_nop_void;
12503
pmu->pmu_disable = perf_pmu_nop_void;
12504
}
12505
12506
if (!pmu->check_period)
12507
pmu->check_period = perf_event_nop_int;
12508
12509
if (!pmu->event_idx)
12510
pmu->event_idx = perf_event_idx_default;
12511
12512
INIT_LIST_HEAD(&pmu->events);
12513
spin_lock_init(&pmu->events_lock);
12514
12515
/*
12516
* Now that the PMU is complete, make it visible to perf_try_init_event().
12517
*/
12518
if (!idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu))
12519
return -EINVAL;
12520
list_add_rcu(&pmu->entry, &pmus);
12521
12522
take_idr_id(pmu_type);
12523
_pmu = no_free_ptr(pmu); // let it rip
12524
return 0;
12525
}
12526
EXPORT_SYMBOL_GPL(perf_pmu_register);
12527
12528
static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event,
12529
struct perf_event_context *ctx)
12530
{
12531
/*
12532
* De-schedule the event and mark it REVOKED.
12533
*/
12534
perf_event_exit_event(event, ctx, ctx->task, true);
12535
12536
/*
12537
* All _free_event() bits that rely on event->pmu:
12538
*
12539
* Notably, perf_mmap() relies on the ordering here.
12540
*/
12541
scoped_guard (mutex, &event->mmap_mutex) {
12542
WARN_ON_ONCE(pmu->event_unmapped);
12543
/*
12544
* Mostly an empty lock sequence, such that perf_mmap(), which
12545
* relies on mmap_mutex, is sure to observe the state change.
12546
*/
12547
}
12548
12549
perf_event_free_bpf_prog(event);
12550
perf_free_addr_filters(event);
12551
12552
if (event->destroy) {
12553
event->destroy(event);
12554
event->destroy = NULL;
12555
}
12556
12557
if (event->pmu_ctx) {
12558
put_pmu_ctx(event->pmu_ctx);
12559
event->pmu_ctx = NULL;
12560
}
12561
12562
exclusive_event_destroy(event);
12563
module_put(pmu->module);
12564
12565
event->pmu = NULL; /* force fault instead of UAF */
12566
}
12567
12568
static void pmu_detach_event(struct pmu *pmu, struct perf_event *event)
12569
{
12570
struct perf_event_context *ctx;
12571
12572
ctx = perf_event_ctx_lock(event);
12573
__pmu_detach_event(pmu, event, ctx);
12574
perf_event_ctx_unlock(event, ctx);
12575
12576
scoped_guard (spinlock, &pmu->events_lock)
12577
list_del(&event->pmu_list);
12578
}
12579
12580
static struct perf_event *pmu_get_event(struct pmu *pmu)
12581
{
12582
struct perf_event *event;
12583
12584
guard(spinlock)(&pmu->events_lock);
12585
list_for_each_entry(event, &pmu->events, pmu_list) {
12586
if (atomic_long_inc_not_zero(&event->refcount))
12587
return event;
12588
}
12589
12590
return NULL;
12591
}
12592
12593
static bool pmu_empty(struct pmu *pmu)
12594
{
12595
guard(spinlock)(&pmu->events_lock);
12596
return list_empty(&pmu->events);
12597
}
12598
12599
static void pmu_detach_events(struct pmu *pmu)
12600
{
12601
struct perf_event *event;
12602
12603
for (;;) {
12604
event = pmu_get_event(pmu);
12605
if (!event)
12606
break;
12607
12608
pmu_detach_event(pmu, event);
12609
put_event(event);
12610
}
12611
12612
/*
12613
* wait for pending _free_event()s
12614
*/
12615
wait_var_event(pmu, pmu_empty(pmu));
12616
}
12617
12618
int perf_pmu_unregister(struct pmu *pmu)
12619
{
12620
scoped_guard (mutex, &pmus_lock) {
12621
if (!idr_cmpxchg(&pmu_idr, pmu->type, pmu, NULL))
12622
return -EINVAL;
12623
12624
list_del_rcu(&pmu->entry);
12625
}
12626
12627
/*
12628
* We dereference the pmu list under both SRCU and regular RCU, so
12629
* synchronize against both of those.
12630
*
12631
* Notably, the entirety of event creation, from perf_init_event()
12632
* (which will now fail, because of the above) until
12633
* perf_install_in_context() should be under SRCU such that
12634
* this synchronizes against event creation. This avoids trying to
12635
* detach events that are not fully formed.
12636
*/
12637
synchronize_srcu(&pmus_srcu);
12638
synchronize_rcu();
12639
12640
if (pmu->event_unmapped && !pmu_empty(pmu)) {
12641
/*
12642
* Can't force remove events when pmu::event_unmapped()
12643
* is used in perf_mmap_close().
12644
*/
12645
guard(mutex)(&pmus_lock);
12646
idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu);
12647
list_add_rcu(&pmu->entry, &pmus);
12648
return -EBUSY;
12649
}
12650
12651
scoped_guard (mutex, &pmus_lock)
12652
idr_remove(&pmu_idr, pmu->type);
12653
12654
/*
12655
* PMU is removed from the pmus list, so no new events will
12656
* be created, now take care of the existing ones.
12657
*/
12658
pmu_detach_events(pmu);
12659
12660
/*
12661
* PMU is unused, make it go away.
12662
*/
12663
perf_pmu_free(pmu);
12664
return 0;
12665
}
12666
EXPORT_SYMBOL_GPL(perf_pmu_unregister);
12667
12668
static inline bool has_extended_regs(struct perf_event *event)
12669
{
12670
return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
12671
(event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
12672
}
12673
12674
static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
12675
{
12676
struct perf_event_context *ctx = NULL;
12677
int ret;
12678
12679
if (!try_module_get(pmu->module))
12680
return -ENODEV;
12681
12682
/*
12683
* A number of pmu->event_init() methods iterate the sibling_list to,
12684
* for example, validate if the group fits on the PMU. Therefore,
12685
* if this is a sibling event, acquire the ctx->mutex to protect
12686
* the sibling_list.
12687
*/
12688
if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
12689
/*
12690
* This ctx->mutex can nest when we're called through
12691
* inheritance. See the perf_event_ctx_lock_nested() comment.
12692
*/
12693
ctx = perf_event_ctx_lock_nested(event->group_leader,
12694
SINGLE_DEPTH_NESTING);
12695
BUG_ON(!ctx);
12696
}
12697
12698
event->pmu = pmu;
12699
ret = pmu->event_init(event);
12700
12701
if (ctx)
12702
perf_event_ctx_unlock(event->group_leader, ctx);
12703
12704
if (ret)
12705
goto err_pmu;
12706
12707
if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
12708
has_extended_regs(event)) {
12709
ret = -EOPNOTSUPP;
12710
goto err_destroy;
12711
}
12712
12713
if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
12714
event_has_any_exclude_flag(event)) {
12715
ret = -EINVAL;
12716
goto err_destroy;
12717
}
12718
12719
if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) {
12720
const struct cpumask *cpumask;
12721
struct cpumask *pmu_cpumask;
12722
int cpu;
12723
12724
cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu);
12725
pmu_cpumask = perf_scope_cpumask(pmu->scope);
12726
12727
ret = -ENODEV;
12728
if (!pmu_cpumask || !cpumask)
12729
goto err_destroy;
12730
12731
cpu = cpumask_any_and(pmu_cpumask, cpumask);
12732
if (cpu >= nr_cpu_ids)
12733
goto err_destroy;
12734
12735
event->event_caps |= PERF_EV_CAP_READ_SCOPE;
12736
}
12737
12738
return 0;
12739
12740
err_destroy:
12741
if (event->destroy) {
12742
event->destroy(event);
12743
event->destroy = NULL;
12744
}
12745
12746
err_pmu:
12747
event->pmu = NULL;
12748
module_put(pmu->module);
12749
return ret;
12750
}
12751
12752
static struct pmu *perf_init_event(struct perf_event *event)
12753
{
12754
bool extended_type = false;
12755
struct pmu *pmu;
12756
int type, ret;
12757
12758
guard(srcu)(&pmus_srcu); /* pmu idr/list access */
12759
12760
/*
12761
* Save the original type before calling pmu->event_init(), since certain
* pmus overwrite event->attr.type to forward the event to another pmu.
12763
*/
12764
event->orig_type = event->attr.type;
12765
12766
/* Try parent's PMU first: */
12767
if (event->parent && event->parent->pmu) {
12768
pmu = event->parent->pmu;
12769
ret = perf_try_init_event(pmu, event);
12770
if (!ret)
12771
return pmu;
12772
}
12773
12774
/*
12775
* PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
12776
* are often aliases for PERF_TYPE_RAW.
12777
*/
12778
type = event->attr.type;
12779
if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) {
12780
type = event->attr.config >> PERF_PMU_TYPE_SHIFT;
12781
if (!type) {
12782
type = PERF_TYPE_RAW;
12783
} else {
12784
extended_type = true;
12785
event->attr.config &= PERF_HW_EVENT_MASK;
12786
}
12787
}
12788
12789
again:
12790
scoped_guard (rcu)
12791
pmu = idr_find(&pmu_idr, type);
12792
if (pmu) {
12793
if (event->attr.type != type && type != PERF_TYPE_RAW &&
12794
!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE))
12795
return ERR_PTR(-ENOENT);
12796
12797
ret = perf_try_init_event(pmu, event);
12798
if (ret == -ENOENT && event->attr.type != type && !extended_type) {
12799
type = event->attr.type;
12800
goto again;
12801
}
12802
12803
if (ret)
12804
return ERR_PTR(ret);
12805
12806
return pmu;
12807
}
12808
12809
list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
12810
ret = perf_try_init_event(pmu, event);
12811
if (!ret)
12812
return pmu;
12813
12814
if (ret != -ENOENT)
12815
return ERR_PTR(ret);
12816
}
12817
12818
return ERR_PTR(-ENOENT);
12819
}
12820
12821
static void attach_sb_event(struct perf_event *event)
12822
{
12823
struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
12824
12825
raw_spin_lock(&pel->lock);
12826
list_add_rcu(&event->sb_list, &pel->list);
12827
raw_spin_unlock(&pel->lock);
12828
}
12829
12830
/*
12831
* We keep a list of all !task (and therefore per-cpu) events
12832
* that need to receive side-band records.
12833
*
12834
* This avoids having to scan all the various PMU per-cpu contexts
12835
* looking for them.
12836
*/
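/*
 * Note (added): "side-band" here means the non-sample record types
 * (PERF_RECORD_MMAP, PERF_RECORD_COMM, PERF_RECORD_FORK, ...) that such
 * per-cpu events want to receive even though they are not attached to
 * the task that generates them.
 */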
12837
static void account_pmu_sb_event(struct perf_event *event)
12838
{
12839
if (is_sb_event(event))
12840
attach_sb_event(event);
12841
}
12842
12843
/* Freq events need the tick to stay alive (see perf_event_task_tick). */
12844
static void account_freq_event_nohz(void)
12845
{
12846
#ifdef CONFIG_NO_HZ_FULL
12847
/* Lock so we don't race with concurrent unaccount */
12848
spin_lock(&nr_freq_lock);
12849
if (atomic_inc_return(&nr_freq_events) == 1)
12850
tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
12851
spin_unlock(&nr_freq_lock);
12852
#endif
12853
}
12854
12855
static void account_freq_event(void)
12856
{
12857
if (tick_nohz_full_enabled())
12858
account_freq_event_nohz();
12859
else
12860
atomic_inc(&nr_freq_events);
12861
}
12862
12863
12864
static void account_event(struct perf_event *event)
12865
{
12866
bool inc = false;
12867
12868
if (event->parent)
12869
return;
12870
12871
if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
12872
inc = true;
12873
if (event->attr.mmap || event->attr.mmap_data)
12874
atomic_inc(&nr_mmap_events);
12875
if (event->attr.build_id)
12876
atomic_inc(&nr_build_id_events);
12877
if (event->attr.comm)
12878
atomic_inc(&nr_comm_events);
12879
if (event->attr.namespaces)
12880
atomic_inc(&nr_namespaces_events);
12881
if (event->attr.cgroup)
12882
atomic_inc(&nr_cgroup_events);
12883
if (event->attr.task)
12884
atomic_inc(&nr_task_events);
12885
if (event->attr.freq)
12886
account_freq_event();
12887
if (event->attr.context_switch) {
12888
atomic_inc(&nr_switch_events);
12889
inc = true;
12890
}
12891
if (has_branch_stack(event))
12892
inc = true;
12893
if (is_cgroup_event(event))
12894
inc = true;
12895
if (event->attr.ksymbol)
12896
atomic_inc(&nr_ksymbol_events);
12897
if (event->attr.bpf_event)
12898
atomic_inc(&nr_bpf_events);
12899
if (event->attr.text_poke)
12900
atomic_inc(&nr_text_poke_events);
12901
12902
if (inc) {
12903
/*
12904
* We need the mutex here because static_branch_enable()
12905
* must complete *before* the perf_sched_count increment
12906
* becomes visible.
12907
*/
12908
if (atomic_inc_not_zero(&perf_sched_count))
12909
goto enabled;
12910
12911
mutex_lock(&perf_sched_mutex);
12912
if (!atomic_read(&perf_sched_count)) {
12913
static_branch_enable(&perf_sched_events);
12914
/*
12915
* Guarantee that all CPUs observe the key change and
12916
* call the perf scheduling hooks before proceeding to
12917
* install events that need them.
12918
*/
12919
synchronize_rcu();
12920
}
12921
/*
12922
* Now that we have waited for the sync_sched(), allow further
12923
* increments to by-pass the mutex.
12924
*/
12925
atomic_inc(&perf_sched_count);
12926
mutex_unlock(&perf_sched_mutex);
12927
}
12928
enabled:
12929
12930
account_pmu_sb_event(event);
12931
}
12932
12933
/*
12934
* Allocate and initialize an event structure
12935
*/
12936
static struct perf_event *
12937
perf_event_alloc(struct perf_event_attr *attr, int cpu,
12938
struct task_struct *task,
12939
struct perf_event *group_leader,
12940
struct perf_event *parent_event,
12941
perf_overflow_handler_t overflow_handler,
12942
void *context, int cgroup_fd)
12943
{
12944
struct pmu *pmu;
12945
struct hw_perf_event *hwc;
12946
long err = -EINVAL;
12947
int node;
12948
12949
if ((unsigned)cpu >= nr_cpu_ids) {
12950
if (!task || cpu != -1)
12951
return ERR_PTR(-EINVAL);
12952
}
12953
if (attr->sigtrap && !task) {
12954
/* Requires a task: avoid signalling random tasks. */
12955
return ERR_PTR(-EINVAL);
12956
}
12957
12958
node = (cpu >= 0) ? cpu_to_node(cpu) : -1;
12959
struct perf_event *event __free(__free_event) =
12960
kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, node);
12961
if (!event)
12962
return ERR_PTR(-ENOMEM);
12963
12964
/*
12965
* Single events are their own group leaders, with an
12966
* empty sibling list:
12967
*/
12968
if (!group_leader)
12969
group_leader = event;
12970
12971
mutex_init(&event->child_mutex);
12972
INIT_LIST_HEAD(&event->child_list);
12973
12974
INIT_LIST_HEAD(&event->event_entry);
12975
INIT_LIST_HEAD(&event->sibling_list);
12976
INIT_LIST_HEAD(&event->active_list);
12977
init_event_group(event);
12978
INIT_LIST_HEAD(&event->rb_entry);
12979
INIT_LIST_HEAD(&event->active_entry);
12980
INIT_LIST_HEAD(&event->addr_filters.list);
12981
INIT_HLIST_NODE(&event->hlist_entry);
12982
INIT_LIST_HEAD(&event->pmu_list);
12983
12984
12985
init_waitqueue_head(&event->waitq);
12986
init_irq_work(&event->pending_irq, perf_pending_irq);
12987
event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable);
12988
init_task_work(&event->pending_task, perf_pending_task);
12989
12990
mutex_init(&event->mmap_mutex);
12991
raw_spin_lock_init(&event->addr_filters.lock);
12992
12993
atomic_long_set(&event->refcount, 1);
12994
event->cpu = cpu;
12995
event->attr = *attr;
12996
event->group_leader = group_leader;
12997
event->pmu = NULL;
12998
event->oncpu = -1;
12999
13000
event->parent = parent_event;
13001
13002
event->ns = get_pid_ns(task_active_pid_ns(current));
13003
event->id = atomic64_inc_return(&perf_event_id);
13004
13005
event->state = PERF_EVENT_STATE_INACTIVE;
13006
13007
if (parent_event)
13008
event->event_caps = parent_event->event_caps;
13009
13010
if (task) {
13011
event->attach_state = PERF_ATTACH_TASK;
13012
/*
13013
* XXX pmu::event_init needs to know what task to account to
13014
* and we cannot use the ctx information because we need the
13015
* pmu before we get a ctx.
13016
*/
13017
event->hw.target = get_task_struct(task);
13018
}
13019
13020
event->clock = &local_clock;
13021
if (parent_event)
13022
event->clock = parent_event->clock;
13023
13024
if (!overflow_handler && parent_event) {
13025
overflow_handler = parent_event->overflow_handler;
13026
context = parent_event->overflow_handler_context;
13027
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
13028
if (parent_event->prog) {
13029
struct bpf_prog *prog = parent_event->prog;
13030
13031
bpf_prog_inc(prog);
13032
event->prog = prog;
13033
}
13034
#endif
13035
}
13036
13037
if (overflow_handler) {
13038
event->overflow_handler = overflow_handler;
13039
event->overflow_handler_context = context;
13040
} else if (is_write_backward(event)) {
13041
event->overflow_handler = perf_event_output_backward;
13042
event->overflow_handler_context = NULL;
13043
} else {
13044
event->overflow_handler = perf_event_output_forward;
13045
event->overflow_handler_context = NULL;
13046
}
13047
13048
perf_event__state_init(event);
13049
13050
pmu = NULL;
13051
13052
hwc = &event->hw;
13053
hwc->sample_period = attr->sample_period;
13054
if (is_event_in_freq_mode(event))
13055
hwc->sample_period = 1;
13056
hwc->last_period = hwc->sample_period;
13057
13058
local64_set(&hwc->period_left, hwc->sample_period);
13059
13060
/*
13061
* We do not support PERF_SAMPLE_READ on inherited events unless
13062
* PERF_SAMPLE_TID is also selected, which allows inherited events to
13063
* collect per-thread samples.
13064
* See perf_output_read().
13065
*/
13066
if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID))
13067
return ERR_PTR(-EINVAL);
13068
13069
if (!has_branch_stack(event))
13070
event->attr.branch_sample_type = 0;
13071
13072
pmu = perf_init_event(event);
13073
if (IS_ERR(pmu))
13074
return (void*)pmu;
13075
13076
/*
13077
* PERF_ATTACH_TASK_DATA is set in event_init()->hw_config(), so the
* attach must happen right after perf_init_event(). Otherwise,
* __free_event() would mistakenly detach a nonexistent perf_ctx_data
* if one of the steps in between fails.
13081
*/
13082
if (event->attach_state & PERF_ATTACH_TASK_DATA) {
13083
err = attach_perf_ctx_data(event);
13084
if (err)
13085
return ERR_PTR(err);
13086
}
13087
13088
/*
13089
* Disallow uncore-task events. Similarly, disallow uncore-cgroup
13090
* events (they don't make sense as the cgroup will be different
13091
* on other CPUs in the uncore mask).
13092
*/
13093
if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1))
13094
return ERR_PTR(-EINVAL);
13095
13096
if (event->attr.aux_output &&
13097
(!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) ||
13098
event->attr.aux_pause || event->attr.aux_resume))
13099
return ERR_PTR(-EOPNOTSUPP);
13100
13101
if (event->attr.aux_pause && event->attr.aux_resume)
13102
return ERR_PTR(-EINVAL);
13103
13104
if (event->attr.aux_start_paused) {
13105
if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE))
13106
return ERR_PTR(-EOPNOTSUPP);
13107
event->hw.aux_paused = 1;
13108
}
13109
13110
if (cgroup_fd != -1) {
13111
err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
13112
if (err)
13113
return ERR_PTR(err);
13114
}
13115
13116
err = exclusive_event_init(event);
13117
if (err)
13118
return ERR_PTR(err);
13119
13120
if (has_addr_filter(event)) {
13121
event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
13122
sizeof(struct perf_addr_filter_range),
13123
GFP_KERNEL);
13124
if (!event->addr_filter_ranges)
13125
return ERR_PTR(-ENOMEM);
13126
13127
/*
13128
* Clone the parent's vma offsets: they are valid until exec()
13129
* even if the mm is not shared with the parent.
13130
*/
13131
if (event->parent) {
13132
struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
13133
13134
raw_spin_lock_irq(&ifh->lock);
13135
memcpy(event->addr_filter_ranges,
13136
event->parent->addr_filter_ranges,
13137
pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range));
13138
raw_spin_unlock_irq(&ifh->lock);
13139
}
13140
13141
/* force hw sync on the address filters */
13142
event->addr_filters_gen = 1;
13143
}
13144
13145
if (!event->parent) {
13146
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
13147
err = get_callchain_buffers(attr->sample_max_stack);
13148
if (err)
13149
return ERR_PTR(err);
13150
event->attach_state |= PERF_ATTACH_CALLCHAIN;
13151
}
13152
}
13153
13154
err = security_perf_event_alloc(event);
13155
if (err)
13156
return ERR_PTR(err);
13157
13158
/* symmetric to unaccount_event() in _free_event() */
13159
account_event(event);
13160
13161
/*
13162
* Event creation should be under SRCU, see perf_pmu_unregister().
13163
*/
13164
lockdep_assert_held(&pmus_srcu);
13165
scoped_guard (spinlock, &pmu->events_lock)
13166
list_add(&event->pmu_list, &pmu->events);
13167
13168
return_ptr(event);
13169
}
13170
13171
static int perf_copy_attr(struct perf_event_attr __user *uattr,
13172
struct perf_event_attr *attr)
13173
{
13174
u32 size;
13175
int ret;
13176
13177
/* Zero the full structure, so that a short copy leaves the rest zeroed. */
13178
memset(attr, 0, sizeof(*attr));
13179
13180
ret = get_user(size, &uattr->size);
13181
if (ret)
13182
return ret;
13183
13184
/* ABI compatibility quirk: */
13185
if (!size)
13186
size = PERF_ATTR_SIZE_VER0;
13187
if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE)
13188
goto err_size;
13189
13190
ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
13191
if (ret) {
13192
if (ret == -E2BIG)
13193
goto err_size;
13194
return ret;
13195
}
13196
13197
attr->size = size;
13198
13199
if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
13200
return -EINVAL;
13201
13202
if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
13203
return -EINVAL;
13204
13205
if (attr->read_format & ~(PERF_FORMAT_MAX-1))
13206
return -EINVAL;
13207
13208
if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
13209
u64 mask = attr->branch_sample_type;
13210
13211
/* only using defined bits */
13212
if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
13213
return -EINVAL;
13214
13215
/* at least one branch bit must be set */
13216
if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
13217
return -EINVAL;
13218
13219
/* propagate priv level, when not set for branch */
13220
if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
13221
13222
/* exclude_kernel checked on syscall entry */
13223
if (!attr->exclude_kernel)
13224
mask |= PERF_SAMPLE_BRANCH_KERNEL;
13225
13226
if (!attr->exclude_user)
13227
mask |= PERF_SAMPLE_BRANCH_USER;
13228
13229
if (!attr->exclude_hv)
13230
mask |= PERF_SAMPLE_BRANCH_HV;
13231
/*
13232
* adjust user setting (for HW filter setup)
13233
*/
13234
attr->branch_sample_type = mask;
13235
}
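		/*
		 * Example (added, illustrative): with only
		 * PERF_SAMPLE_BRANCH_ANY requested and exclude_kernel set,
		 * mask ends up as ANY | BRANCH_USER | BRANCH_HV here, so the
		 * HW branch filter mirrors the event's own privilege settings.
		 */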
13236
/* privileged levels capture (kernel, hv): check permissions */
13237
if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
13238
ret = perf_allow_kernel();
13239
if (ret)
13240
return ret;
13241
}
13242
}
13243
13244
if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
13245
ret = perf_reg_validate(attr->sample_regs_user);
13246
if (ret)
13247
return ret;
13248
}
13249
13250
if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
13251
if (!arch_perf_have_user_stack_dump())
13252
return -ENOSYS;
13253
13254
/*
13255
* We have __u32 type for the size, but so far
13256
* we can only use __u16 as maximum due to the
13257
* __u16 sample size limit.
13258
*/
13259
if (attr->sample_stack_user >= USHRT_MAX)
13260
return -EINVAL;
13261
else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
13262
return -EINVAL;
13263
}
13264
13265
if (!attr->sample_max_stack)
13266
attr->sample_max_stack = sysctl_perf_event_max_stack;
13267
13268
if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
13269
ret = perf_reg_validate(attr->sample_regs_intr);
13270
13271
#ifndef CONFIG_CGROUP_PERF
13272
if (attr->sample_type & PERF_SAMPLE_CGROUP)
13273
return -EINVAL;
13274
#endif
13275
if ((attr->sample_type & PERF_SAMPLE_WEIGHT) &&
13276
(attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT))
13277
return -EINVAL;
13278
13279
if (!attr->inherit && attr->inherit_thread)
13280
return -EINVAL;
13281
13282
if (attr->remove_on_exec && attr->enable_on_exec)
13283
return -EINVAL;
13284
13285
if (attr->sigtrap && !attr->remove_on_exec)
13286
return -EINVAL;
13287
13288
out:
13289
return ret;
13290
13291
err_size:
13292
put_user(sizeof(*attr), &uattr->size);
13293
ret = -E2BIG;
13294
goto out;
13295
}
13296
13297
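/*
 * Note (added): lock two mutexes in a globally consistent order (by
 * ascending address) so that concurrent callers locking the same pair
 * in the opposite order cannot deadlock.
 */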
static void mutex_lock_double(struct mutex *a, struct mutex *b)
{
	if (b < a)
		swap(a, b);

	mutex_lock(a);
	mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
}
13305
13306
static int
13307
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
13308
{
13309
struct perf_buffer *rb = NULL;
13310
int ret = -EINVAL;
13311
13312
if (!output_event) {
13313
mutex_lock(&event->mmap_mutex);
13314
goto set;
13315
}
13316
13317
/* don't allow circular references */
13318
if (event == output_event)
13319
goto out;
13320
13321
/*
13322
* Don't allow cross-cpu buffers
13323
*/
13324
if (output_event->cpu != event->cpu)
13325
goto out;
13326
13327
/*
13328
* If it's not a per-cpu rb, it must be the same task.
13329
*/
13330
if (output_event->cpu == -1 && output_event->hw.target != event->hw.target)
13331
goto out;
13332
13333
/*
13334
* Mixing clocks in the same buffer is trouble you don't need.
13335
*/
13336
if (output_event->clock != event->clock)
13337
goto out;
13338
13339
/*
13340
* The ring buffer is written either from the beginning or from the end.
* Mixing the two is not allowed.
13342
*/
13343
if (is_write_backward(output_event) != is_write_backward(event))
13344
goto out;
13345
13346
/*
13347
* If both events generate aux data, they must be on the same PMU
13348
*/
13349
if (has_aux(event) && has_aux(output_event) &&
13350
event->pmu != output_event->pmu)
13351
goto out;
13352
13353
/*
13354
* Hold both mmap_mutex to serialize against perf_mmap_close(). Since
13355
* output_event is already on rb->event_list, and the list iteration
13356
* restarts after every removal, it is guaranteed this new event is
13357
* observed *OR* if output_event is already removed, it's guaranteed we
13358
* observe !rb->mmap_count.
13359
*/
13360
mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
13361
set:
13362
/* Can't redirect output if we've got an active mmap() */
13363
if (refcount_read(&event->mmap_count))
13364
goto unlock;
13365
13366
if (output_event) {
13367
if (output_event->state <= PERF_EVENT_STATE_REVOKED)
13368
goto unlock;
13369
13370
/* get the rb we want to redirect to */
13371
rb = ring_buffer_get(output_event);
13372
if (!rb)
13373
goto unlock;
13374
13375
/* did we race against perf_mmap_close() */
13376
if (!refcount_read(&rb->mmap_count)) {
13377
ring_buffer_put(rb);
13378
goto unlock;
13379
}
13380
}
13381
13382
ring_buffer_attach(event, rb);
13383
13384
ret = 0;
13385
unlock:
13386
mutex_unlock(&event->mmap_mutex);
13387
if (output_event)
13388
mutex_unlock(&output_event->mmap_mutex);
13389
13390
out:
13391
return ret;
13392
}
13393
13394
static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
13395
{
13396
bool nmi_safe = false;
13397
13398
switch (clk_id) {
13399
case CLOCK_MONOTONIC:
13400
event->clock = &ktime_get_mono_fast_ns;
13401
nmi_safe = true;
13402
break;
13403
13404
case CLOCK_MONOTONIC_RAW:
13405
event->clock = &ktime_get_raw_fast_ns;
13406
nmi_safe = true;
13407
break;
13408
13409
case CLOCK_REALTIME:
13410
event->clock = &ktime_get_real_ns;
13411
break;
13412
13413
case CLOCK_BOOTTIME:
13414
event->clock = &ktime_get_boottime_ns;
13415
break;
13416
13417
case CLOCK_TAI:
13418
event->clock = &ktime_get_clocktai_ns;
13419
break;
13420
13421
default:
13422
return -EINVAL;
13423
}
13424
13425
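	/*
	 * Note (added): a PMU that may deliver events from NMI context
	 * (i.e. one without PERF_PMU_CAP_NO_NMI) must read the clock from
	 * NMI as well, so only the NMI-safe fast clocks are allowed there.
	 */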
if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
13426
return -EINVAL;
13427
13428
return 0;
13429
}
13430
13431
static bool
13432
perf_check_permission(struct perf_event_attr *attr, struct task_struct *task)
13433
{
13434
unsigned int ptrace_mode = PTRACE_MODE_READ_REALCREDS;
13435
bool is_capable = perfmon_capable();
13436
13437
if (attr->sigtrap) {
13438
/*
13439
* perf_event_attr::sigtrap sends signals to the other task.
13440
* Require the current task to also have CAP_KILL.
13441
*/
13442
rcu_read_lock();
13443
is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL);
13444
rcu_read_unlock();
13445
13446
/*
13447
* If the required capabilities aren't available, fall back to the
* ptrace permission check: upgrade to ATTACH, since sending signals
13449
* can effectively change the target task.
13450
*/
13451
ptrace_mode = PTRACE_MODE_ATTACH_REALCREDS;
13452
}
13453
13454
/*
13455
* Preserve ptrace permission check for backwards compatibility. The
13456
* ptrace check also includes checks that the current task and other
13457
* task have matching uids, and is therefore not done here explicitly.
13458
*/
13459
return is_capable || ptrace_may_access(task, ptrace_mode);
13460
}
13461
13462
/**
13463
* sys_perf_event_open - open a performance event, associate it to a task/cpu
13464
*
13465
* @attr_uptr: event_id type attributes for monitoring/sampling
13466
* @pid: target pid
13467
* @cpu: target cpu
13468
* @group_fd: group leader event fd
13469
* @flags: perf event open flags
13470
*/
13471
SYSCALL_DEFINE5(perf_event_open,
13472
struct perf_event_attr __user *, attr_uptr,
13473
pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
13474
{
13475
struct perf_event *group_leader = NULL, *output_event = NULL;
13476
struct perf_event_pmu_context *pmu_ctx;
13477
struct perf_event *event, *sibling;
13478
struct perf_event_attr attr;
13479
struct perf_event_context *ctx;
13480
struct file *event_file = NULL;
13481
struct task_struct *task = NULL;
13482
struct pmu *pmu;
13483
int event_fd;
13484
int move_group = 0;
13485
int err;
13486
int f_flags = O_RDWR;
13487
int cgroup_fd = -1;
13488
13489
/* for future expandability... */
13490
if (flags & ~PERF_FLAG_ALL)
13491
return -EINVAL;
13492
13493
err = perf_copy_attr(attr_uptr, &attr);
13494
if (err)
13495
return err;
13496
13497
/* Do we allow access to perf_event_open(2) ? */
13498
err = security_perf_event_open(PERF_SECURITY_OPEN);
13499
if (err)
13500
return err;
13501
13502
if (!attr.exclude_kernel) {
13503
err = perf_allow_kernel();
13504
if (err)
13505
return err;
13506
}
13507
13508
if (attr.namespaces) {
13509
if (!perfmon_capable())
13510
return -EACCES;
13511
}
13512
13513
if (attr.freq) {
13514
if (attr.sample_freq > sysctl_perf_event_sample_rate)
13515
return -EINVAL;
13516
} else {
13517
if (attr.sample_period & (1ULL << 63))
13518
return -EINVAL;
13519
}
13520
13521
/* Only privileged users can get physical addresses */
13522
if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
13523
err = perf_allow_kernel();
13524
if (err)
13525
return err;
13526
}
13527
13528
/* REGS_INTR can leak data, lockdown must prevent this */
13529
if (attr.sample_type & PERF_SAMPLE_REGS_INTR) {
13530
err = security_locked_down(LOCKDOWN_PERF);
13531
if (err)
13532
return err;
13533
}
13534
13535
/*
13536
* In cgroup mode, the pid argument is used to pass the fd
13537
* opened to the cgroup directory in cgroupfs. The cpu argument
13538
* designates the cpu on which to monitor threads from that
13539
* cgroup.
13540
*/
13541
if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
13542
return -EINVAL;
13543
13544
if (flags & PERF_FLAG_FD_CLOEXEC)
13545
f_flags |= O_CLOEXEC;
13546
13547
event_fd = get_unused_fd_flags(f_flags);
13548
if (event_fd < 0)
13549
return event_fd;
13550
13551
/*
13552
* Event creation should be under SRCU, see perf_pmu_unregister().
13553
*/
13554
guard(srcu)(&pmus_srcu);
13555
13556
CLASS(fd, group)(group_fd); // group_fd == -1 => empty
13557
if (group_fd != -1) {
13558
if (!is_perf_file(group)) {
13559
err = -EBADF;
13560
goto err_fd;
13561
}
13562
group_leader = fd_file(group)->private_data;
13563
if (group_leader->state <= PERF_EVENT_STATE_REVOKED) {
13564
err = -ENODEV;
13565
goto err_fd;
13566
}
13567
if (flags & PERF_FLAG_FD_OUTPUT)
13568
output_event = group_leader;
13569
if (flags & PERF_FLAG_FD_NO_GROUP)
13570
group_leader = NULL;
13571
}
13572
13573
if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
13574
task = find_lively_task_by_vpid(pid);
13575
if (IS_ERR(task)) {
13576
err = PTR_ERR(task);
13577
goto err_fd;
13578
}
13579
}
13580
13581
if (task && group_leader &&
13582
group_leader->attr.inherit != attr.inherit) {
13583
err = -EINVAL;
13584
goto err_task;
13585
}
13586
13587
if (flags & PERF_FLAG_PID_CGROUP)
13588
cgroup_fd = pid;
13589
13590
event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
13591
NULL, NULL, cgroup_fd);
13592
if (IS_ERR(event)) {
13593
err = PTR_ERR(event);
13594
goto err_task;
13595
}
13596
13597
if (is_sampling_event(event)) {
13598
if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
13599
err = -EOPNOTSUPP;
13600
goto err_alloc;
13601
}
13602
}
13603
13604
/*
13605
* Special case software events and allow them to be part of
13606
* any hardware group.
13607
*/
13608
pmu = event->pmu;
13609
13610
if (attr.use_clockid) {
13611
err = perf_event_set_clock(event, attr.clockid);
13612
if (err)
13613
goto err_alloc;
13614
}
13615
13616
if (pmu->task_ctx_nr == perf_sw_context)
13617
event->event_caps |= PERF_EV_CAP_SOFTWARE;
13618
13619
if (task) {
13620
err = down_read_interruptible(&task->signal->exec_update_lock);
13621
if (err)
13622
goto err_alloc;
13623
13624
/*
13625
* We must hold exec_update_lock across this and any potential
13626
* perf_install_in_context() call for this new event to
13627
* serialize against exec() altering our credentials (and the
13628
* perf_event_exit_task() that could imply).
13629
*/
13630
err = -EACCES;
13631
if (!perf_check_permission(&attr, task))
13632
goto err_cred;
13633
}
13634
13635
/*
13636
* Get the target context (task or percpu):
13637
*/
13638
ctx = find_get_context(task, event);
13639
if (IS_ERR(ctx)) {
13640
err = PTR_ERR(ctx);
13641
goto err_cred;
13642
}
13643
13644
mutex_lock(&ctx->mutex);
13645
13646
if (ctx->task == TASK_TOMBSTONE) {
13647
err = -ESRCH;
13648
goto err_locked;
13649
}
13650
13651
if (!task) {
13652
/*
13653
* Check if the @cpu we're creating an event for is online.
13654
*
13655
* We use the perf_cpu_context::ctx::mutex to serialize against
13656
* the hotplug notifiers. See perf_event_{init,exit}_cpu().
13657
*/
13658
struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
13659
13660
if (!cpuctx->online) {
13661
err = -ENODEV;
13662
goto err_locked;
13663
}
13664
}
13665
13666
if (group_leader) {
13667
err = -EINVAL;
13668
13669
/*
13670
* Do not allow a recursive hierarchy (this new sibling
13671
* becoming part of another group-sibling):
13672
*/
13673
if (group_leader->group_leader != group_leader)
13674
goto err_locked;
13675
13676
/* All events in a group should have the same clock */
13677
if (group_leader->clock != event->clock)
13678
goto err_locked;
13679
13680
/*
13681
* Make sure both events are for the same CPU;
* grouping events for different CPUs is broken, since
* you can never schedule them concurrently anyhow.
13684
*/
13685
if (group_leader->cpu != event->cpu)
13686
goto err_locked;
13687
13688
/*
13689
* Make sure we're both on the same context; either task or cpu.
13690
*/
13691
if (group_leader->ctx != ctx)
13692
goto err_locked;
13693
13694
/*
13695
* Only a group leader can be exclusive or pinned
13696
*/
13697
if (attr.exclusive || attr.pinned)
13698
goto err_locked;
13699
13700
if (is_software_event(event) &&
13701
!in_software_context(group_leader)) {
13702
/*
13703
* If the event is a sw event, but the group_leader
13704
* is on hw context.
13705
*
13706
* Allow the addition of software events to hw
13707
* groups, this is safe because software events
13708
* never fail to schedule.
13709
*
13710
* Note the comment that goes with struct
13711
* perf_event_pmu_context.
13712
*/
13713
pmu = group_leader->pmu_ctx->pmu;
13714
} else if (!is_software_event(event)) {
13715
if (is_software_event(group_leader) &&
13716
(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
13717
/*
13718
* In case the group is a pure software group, and we
13719
* try to add a hardware event, move the whole group to
13720
* the hardware context.
13721
*/
13722
move_group = 1;
13723
}
13724
13725
/* Don't allow group of multiple hw events from different pmus */
13726
if (!in_software_context(group_leader) &&
13727
group_leader->pmu_ctx->pmu != pmu)
13728
goto err_locked;
13729
}
13730
}
13731
13732
/*
13733
* Now that we're certain of the pmu, find the pmu_ctx.
13734
*/
13735
pmu_ctx = find_get_pmu_context(pmu, ctx, event);
13736
if (IS_ERR(pmu_ctx)) {
13737
err = PTR_ERR(pmu_ctx);
13738
goto err_locked;
13739
}
13740
event->pmu_ctx = pmu_ctx;
13741
13742
if (output_event) {
13743
err = perf_event_set_output(event, output_event);
13744
if (err)
13745
goto err_context;
13746
}
13747
13748
if (!perf_event_validate_size(event)) {
13749
err = -E2BIG;
13750
goto err_context;
13751
}
13752
13753
if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
13754
err = -EINVAL;
13755
goto err_context;
13756
}
13757
13758
/*
13759
* Must be under the same ctx::mutex as perf_install_in_context(),
13760
* because we need to serialize with concurrent event creation.
13761
*/
13762
if (!exclusive_event_installable(event, ctx)) {
13763
err = -EBUSY;
13764
goto err_context;
13765
}
13766
13767
WARN_ON_ONCE(ctx->parent_ctx);
13768
13769
event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags);
13770
if (IS_ERR(event_file)) {
13771
err = PTR_ERR(event_file);
13772
event_file = NULL;
13773
goto err_context;
13774
}
13775
13776
/*
13777
* This is the point of no return; we cannot fail hereafter. This is
13778
* where we start modifying current state.
13779
*/
13780
13781
if (move_group) {
13782
perf_remove_from_context(group_leader, 0);
13783
put_pmu_ctx(group_leader->pmu_ctx);
13784
13785
for_each_sibling_event(sibling, group_leader) {
13786
perf_remove_from_context(sibling, 0);
13787
put_pmu_ctx(sibling->pmu_ctx);
13788
}
13789
13790
/*
13791
* Install the group siblings before the group leader.
13792
*
13793
* Because a group leader will try to install the entire group
* (through the sibling list, which is still intact), we can
13795
* end up with siblings installed in the wrong context.
13796
*
13797
* By installing siblings first we NO-OP because they're not
13798
* reachable through the group lists.
13799
*/
13800
for_each_sibling_event(sibling, group_leader) {
13801
sibling->pmu_ctx = pmu_ctx;
13802
get_pmu_ctx(pmu_ctx);
13803
perf_event__state_init(sibling);
13804
perf_install_in_context(ctx, sibling, sibling->cpu);
13805
}
13806
13807
/*
13808
* Removing from the context ends up with a disabled
* event. What we want here is an event in the initial
* startup state, ready to be added into the new context.
13811
*/
13812
group_leader->pmu_ctx = pmu_ctx;
13813
get_pmu_ctx(pmu_ctx);
13814
perf_event__state_init(group_leader);
13815
perf_install_in_context(ctx, group_leader, group_leader->cpu);
13816
}
13817
13818
/*
13819
* Precalculate sample_data sizes; do while holding ctx::mutex such
13820
* that we're serialized against further additions and before
13821
* perf_install_in_context() which is the point the event is active and
13822
* can use these values.
13823
*/
13824
perf_event__header_size(event);
13825
perf_event__id_header_size(event);
13826
13827
event->owner = current;
13828
13829
perf_install_in_context(ctx, event, event->cpu);
13830
perf_unpin_context(ctx);
13831
13832
mutex_unlock(&ctx->mutex);
13833
13834
if (task) {
13835
up_read(&task->signal->exec_update_lock);
13836
put_task_struct(task);
13837
}
13838
13839
mutex_lock(&current->perf_event_mutex);
13840
list_add_tail(&event->owner_entry, &current->perf_event_list);
13841
mutex_unlock(&current->perf_event_mutex);
13842
13843
/*
13844
* File reference in group guarantees that group_leader has been
13845
* kept alive until we place the new event on the sibling_list.
13846
* This ensures destruction of the group leader will find
13847
* the pointer to itself in perf_group_detach().
13848
*/
13849
fd_install(event_fd, event_file);
13850
return event_fd;
13851
13852
err_context:
13853
put_pmu_ctx(event->pmu_ctx);
13854
event->pmu_ctx = NULL; /* _free_event() */
13855
err_locked:
13856
mutex_unlock(&ctx->mutex);
13857
perf_unpin_context(ctx);
13858
put_ctx(ctx);
13859
err_cred:
13860
if (task)
13861
up_read(&task->signal->exec_update_lock);
13862
err_alloc:
13863
put_event(event);
13864
err_task:
13865
if (task)
13866
put_task_struct(task);
13867
err_fd:
13868
put_unused_fd(event_fd);
13869
return err;
13870
}
13871
13872
/**
13873
* perf_event_create_kernel_counter
13874
*
13875
* @attr: attributes of the counter to create
13876
* @cpu: cpu in which the counter is bound
13877
* @task: task to profile (NULL for percpu)
13878
* @overflow_handler: callback to trigger when we hit the event
13879
* @context: context data could be used in overflow_handler callback
13880
*/
13881
struct perf_event *
13882
perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
13883
struct task_struct *task,
13884
perf_overflow_handler_t overflow_handler,
13885
void *context)
13886
{
13887
struct perf_event_pmu_context *pmu_ctx;
13888
struct perf_event_context *ctx;
13889
struct perf_event *event;
13890
struct pmu *pmu;
13891
int err;
13892
13893
/*
13894
* Grouping is not supported for kernel events, and neither is 'AUX';
* make sure the caller's intentions are adjusted.
13896
*/
13897
if (attr->aux_output || attr->aux_action)
13898
return ERR_PTR(-EINVAL);
13899
13900
/*
13901
* Event creation should be under SRCU, see perf_pmu_unregister().
13902
*/
13903
guard(srcu)(&pmus_srcu);
13904
13905
event = perf_event_alloc(attr, cpu, task, NULL, NULL,
13906
overflow_handler, context, -1);
13907
if (IS_ERR(event)) {
13908
err = PTR_ERR(event);
13909
goto err;
13910
}
13911
13912
/* Mark owner so we could distinguish it from user events. */
13913
event->owner = TASK_TOMBSTONE;
13914
pmu = event->pmu;
13915
13916
if (pmu->task_ctx_nr == perf_sw_context)
13917
event->event_caps |= PERF_EV_CAP_SOFTWARE;
13918
13919
/*
13920
* Get the target context (task or percpu):
13921
*/
13922
ctx = find_get_context(task, event);
13923
if (IS_ERR(ctx)) {
13924
err = PTR_ERR(ctx);
13925
goto err_alloc;
13926
}
13927
13928
WARN_ON_ONCE(ctx->parent_ctx);
13929
mutex_lock(&ctx->mutex);
13930
if (ctx->task == TASK_TOMBSTONE) {
13931
err = -ESRCH;
13932
goto err_unlock;
13933
}
13934
13935
pmu_ctx = find_get_pmu_context(pmu, ctx, event);
13936
if (IS_ERR(pmu_ctx)) {
13937
err = PTR_ERR(pmu_ctx);
13938
goto err_unlock;
13939
}
13940
event->pmu_ctx = pmu_ctx;
13941
13942
if (!task) {
13943
/*
13944
* Check if the @cpu we're creating an event for is online.
13945
*
13946
* We use the perf_cpu_context::ctx::mutex to serialize against
13947
* the hotplug notifiers. See perf_event_{init,exit}_cpu().
13948
*/
13949
struct perf_cpu_context *cpuctx =
13950
container_of(ctx, struct perf_cpu_context, ctx);
13951
if (!cpuctx->online) {
13952
err = -ENODEV;
13953
goto err_pmu_ctx;
13954
}
13955
}
13956
13957
if (!exclusive_event_installable(event, ctx)) {
13958
err = -EBUSY;
13959
goto err_pmu_ctx;
13960
}
13961
13962
perf_install_in_context(ctx, event, event->cpu);
13963
perf_unpin_context(ctx);
13964
mutex_unlock(&ctx->mutex);
13965
13966
return event;
13967
13968
err_pmu_ctx:
13969
put_pmu_ctx(pmu_ctx);
13970
event->pmu_ctx = NULL; /* _free_event() */
13971
err_unlock:
13972
mutex_unlock(&ctx->mutex);
13973
perf_unpin_context(ctx);
13974
put_ctx(ctx);
13975
err_alloc:
13976
put_event(event);
13977
err:
13978
return ERR_PTR(err);
13979
}
13980
EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
13981
13982
static void __perf_pmu_remove(struct perf_event_context *ctx,
13983
int cpu, struct pmu *pmu,
13984
struct perf_event_groups *groups,
13985
struct list_head *events)
13986
{
13987
struct perf_event *event, *sibling;
13988
13989
perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) {
13990
perf_remove_from_context(event, 0);
13991
put_pmu_ctx(event->pmu_ctx);
13992
list_add(&event->migrate_entry, events);
13993
13994
for_each_sibling_event(sibling, event) {
13995
perf_remove_from_context(sibling, 0);
13996
put_pmu_ctx(sibling->pmu_ctx);
13997
list_add(&sibling->migrate_entry, events);
13998
}
13999
}
14000
}
14001
14002
static void __perf_pmu_install_event(struct pmu *pmu,
14003
struct perf_event_context *ctx,
14004
int cpu, struct perf_event *event)
14005
{
14006
struct perf_event_pmu_context *epc;
14007
struct perf_event_context *old_ctx = event->ctx;
14008
14009
get_ctx(ctx); /* normally find_get_context() */
14010
14011
event->cpu = cpu;
14012
epc = find_get_pmu_context(pmu, ctx, event);
14013
event->pmu_ctx = epc;
14014
14015
if (event->state >= PERF_EVENT_STATE_OFF)
14016
event->state = PERF_EVENT_STATE_INACTIVE;
14017
perf_install_in_context(ctx, event, cpu);
14018
14019
/*
14020
* Now that event->ctx is updated and visible, put the old ctx.
14021
*/
14022
put_ctx(old_ctx);
14023
}
14024
14025
static void __perf_pmu_install(struct perf_event_context *ctx,
14026
int cpu, struct pmu *pmu, struct list_head *events)
14027
{
14028
struct perf_event *event, *tmp;
14029
14030
/*
14031
* Re-instate events in 2 passes.
14032
*
14033
* Skip over group leaders and only install siblings on this first
14034
* pass; siblings will not get enabled without a leader. However, a
14035
* leader will enable its siblings, even if those are still on the old
14036
* context.
14037
*/
14038
list_for_each_entry_safe(event, tmp, events, migrate_entry) {
14039
if (event->group_leader == event)
14040
continue;
14041
14042
list_del(&event->migrate_entry);
14043
__perf_pmu_install_event(pmu, ctx, cpu, event);
14044
}
14045
14046
/*
14047
* Once all the siblings are setup properly, install the group leaders
14048
* to make it go.
14049
*/
14050
list_for_each_entry_safe(event, tmp, events, migrate_entry) {
14051
list_del(&event->migrate_entry);
14052
__perf_pmu_install_event(pmu, ctx, cpu, event);
14053
}
14054
}
14055
14056
void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
14057
{
14058
struct perf_event_context *src_ctx, *dst_ctx;
14059
LIST_HEAD(events);
14060
14061
/*
14062
* Since per-cpu context is persistent, no need to grab an extra
14063
* reference.
14064
*/
14065
src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx;
14066
dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx;
14067
14068
/*
14069
* See perf_event_ctx_lock() for comments on the details
14070
* of swizzling perf_event::ctx.
14071
*/
14072
mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
14073
14074
__perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events);
14075
__perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events);
14076
14077
if (!list_empty(&events)) {
14078
/*
14079
* Wait for the events to quiesce before re-instating them.
14080
*/
14081
synchronize_rcu();
14082
14083
__perf_pmu_install(dst_ctx, dst_cpu, pmu, &events);
14084
}
14085
14086
mutex_unlock(&dst_ctx->mutex);
14087
mutex_unlock(&src_ctx->mutex);
14088
}
14089
EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
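
/*
 * Illustrative sketch (not part of this file): an uncore-style PMU driver
 * commonly calls perf_pmu_migrate_context() from its CPU-offline callback to
 * move the events of a dying CPU to a surviving CPU in the same domain. The
 * example_pmu and example_find_new_cpu() names below are assumptions made up
 * for this sketch.
 */
#if 0
static int example_pmu_offline_cpu(unsigned int cpu)
{
	int target = example_find_new_cpu(cpu);	/* hypothetical helper */

	if (target < nr_cpu_ids)
		perf_pmu_migrate_context(&example_pmu, cpu, target);
	return 0;
}
#endif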

static void sync_child_event(struct perf_event *child_event,
			     struct task_struct *task)
{
	struct perf_event *parent_event = child_event->parent;
	u64 child_val;

	if (child_event->attr.inherit_stat) {
		if (task && task != TASK_TOMBSTONE)
			perf_event_read_event(child_event, task);
	}

	child_val = perf_event_count(child_event, false);

	/*
	 * Add back the child's count to the parent's count:
	 */
	atomic64_add(child_val, &parent_event->child_count);
	atomic64_add(child_event->total_time_enabled,
		     &parent_event->child_total_time_enabled);
	atomic64_add(child_event->total_time_running,
		     &parent_event->child_total_time_running);
}

static void
perf_event_exit_event(struct perf_event *event,
		      struct perf_event_context *ctx,
		      struct task_struct *task,
		      bool revoke)
{
	struct perf_event *parent_event = event->parent;
	unsigned long detach_flags = DETACH_EXIT;
	unsigned int attach_state;

	if (parent_event) {
		/*
		 * Do not destroy the 'original' grouping; because of the
		 * context switch optimization the original events could've
		 * ended up in a random child task.
		 *
		 * If we were to destroy the original group, all group related
		 * operations would cease to function properly after this
		 * random child dies.
		 *
		 * Do destroy all inherited groups, we don't care about those
		 * and being thorough is better.
		 */
		detach_flags |= DETACH_GROUP | DETACH_CHILD;
		mutex_lock(&parent_event->child_mutex);
		/* PERF_ATTACH_ITRACE might be set concurrently */
		attach_state = READ_ONCE(event->attach_state);

		if (attach_state & PERF_ATTACH_CHILD)
			sync_child_event(event, task);
	}

	if (revoke)
		detach_flags |= DETACH_GROUP | DETACH_REVOKE;

	perf_remove_from_context(event, detach_flags);
	/*
	 * Child events can be freed.
	 */
	if (parent_event) {
		mutex_unlock(&parent_event->child_mutex);

		/*
		 * Match the refcount initialization. Make sure it doesn't happen
		 * twice if pmu_detach_event() calls it on an already exited task.
		 */
		if (attach_state & PERF_ATTACH_CHILD) {
			/*
			 * Kick perf_poll() for is_event_hup();
			 */
			perf_event_wakeup(parent_event);
			/*
			 * pmu_detach_event() will have an extra refcount.
			 * perf_pending_task() might have one too.
			 */
			put_event(event);
		}

		return;
	}

	/*
	 * Parent events are governed by their filedesc, retain them.
	 */
	perf_event_wakeup(event);
}

static void perf_event_exit_task_context(struct task_struct *task, bool exit)
{
	struct perf_event_context *ctx, *clone_ctx = NULL;
	struct perf_event *child_event, *next;

	ctx = perf_pin_task_context(task);
	if (!ctx)
		return;

	/*
	 * In order to reduce the amount of trickery in ctx tear-down, we hold
	 * ctx::mutex over the entire thing. This serializes against almost
	 * everything that wants to access the ctx.
	 *
	 * The exception is sys_perf_event_open() /
	 * perf_event_create_kernel_counter() which does find_get_context()
	 * without ctx::mutex (it cannot because of the move_group double mutex
	 * lock thing). See the comments in perf_install_in_context().
	 */
	mutex_lock(&ctx->mutex);

	/*
	 * In a single ctx::lock section, de-schedule the events and detach the
	 * context from the task such that we cannot ever get it scheduled back
	 * in.
	 */
	raw_spin_lock_irq(&ctx->lock);
	if (exit)
		task_ctx_sched_out(ctx, NULL, EVENT_ALL);

	/*
	 * Now that the context is inactive, destroy the task <-> ctx relation
	 * and mark the context dead.
	 */
	RCU_INIT_POINTER(task->perf_event_ctxp, NULL);
	put_ctx(ctx); /* cannot be last */
	WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
	put_task_struct(task); /* cannot be last */

	clone_ctx = unclone_ctx(ctx);
	raw_spin_unlock_irq(&ctx->lock);

	if (clone_ctx)
		put_ctx(clone_ctx);

	/*
	 * Report the task dead after unscheduling the events so that we
	 * won't get any samples after PERF_RECORD_EXIT. We can however still
	 * get a few PERF_RECORD_READ events.
	 */
	if (exit)
		perf_event_task(task, ctx, 0);

	list_for_each_entry_safe(child_event, next, &ctx->event_list, event_entry)
		perf_event_exit_event(child_event, ctx, exit ? task : NULL, false);

	mutex_unlock(&ctx->mutex);

	if (!exit) {
		/*
		 * perf_event_release_kernel() could still have a reference on
		 * this context. In that case we must wait for these events to
		 * have been freed (in particular all their references to this
		 * task must've been dropped).
		 *
		 * Without this, copy_process() will unconditionally free this
		 * task (irrespective of its reference count) and
		 * _free_event()'s put_task_struct(event->hw.target) will be a
		 * use-after-free.
		 *
		 * Wait for all events to drop their context reference.
		 */
		wait_var_event(&ctx->refcount,
			       refcount_read(&ctx->refcount) == 1);
	}
	put_ctx(ctx);
}

/*
 * When a task exits, feed back event values to parent events.
 *
 * Can be called with exec_update_lock held when called from
 * setup_new_exec().
 */
void perf_event_exit_task(struct task_struct *task)
{
	struct perf_event *event, *tmp;

	WARN_ON_ONCE(task != current);

	mutex_lock(&task->perf_event_mutex);
	list_for_each_entry_safe(event, tmp, &task->perf_event_list,
				 owner_entry) {
		list_del_init(&event->owner_entry);

		/*
		 * Ensure the list deletion is visible before we clear
		 * the owner; this closes a race against perf_release() where
		 * we need to serialize on the owner->perf_event_mutex.
		 */
		smp_store_release(&event->owner, NULL);
	}
	mutex_unlock(&task->perf_event_mutex);

	perf_event_exit_task_context(task, true);

	/*
	 * The perf_event_exit_task_context calls perf_event_task
	 * with task's task_ctx, which generates EXIT events for
	 * task contexts and sets task->perf_event_ctxp to NULL.
	 * At this point we need to send EXIT events to cpu contexts.
	 */
	perf_event_task(task, NULL, 0);

	/*
	 * Detach the perf_ctx_data for the system-wide event.
	 */
	guard(percpu_read)(&global_ctx_data_rwsem);
	detach_task_ctx_data(task);
}

/*
 * Free a context as created by inheritance by perf_event_init_task() below,
 * used by fork() in case of failure.
 *
 * Even though the task has never lived, the context and events have been
 * exposed through the child_list, so we must take care tearing it all down.
 */
void perf_event_free_task(struct task_struct *task)
{
	perf_event_exit_task_context(task, false);
}

void perf_event_delayed_put(struct task_struct *task)
{
	WARN_ON_ONCE(task->perf_event_ctxp);
}

struct file *perf_event_get(unsigned int fd)
{
	struct file *file = fget(fd);
	if (!file)
		return ERR_PTR(-EBADF);

	if (file->f_op != &perf_fops) {
		fput(file);
		return ERR_PTR(-EBADF);
	}

	return file;
}

const struct perf_event *perf_get_event(struct file *file)
{
	if (file->f_op != &perf_fops)
		return ERR_PTR(-EINVAL);

	return file->private_data;
}

const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
{
	if (!event)
		return ERR_PTR(-EINVAL);

	return &event->attr;
}
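
/*
 * Illustrative sketch (not part of this file): how another subsystem can
 * resolve a perf event fd handed in from user space. perf_event_get() takes a
 * reference on the file and validates it, and perf_get_event() returns the
 * event behind an already-validated perf file. example_inspect_perf_fd() is
 * an assumption made up for this sketch.
 */
#if 0
static int example_inspect_perf_fd(unsigned int fd)
{
	const struct perf_event *event;
	struct file *file;

	file = perf_event_get(fd);
	if (IS_ERR(file))
		return PTR_ERR(file);

	event = perf_get_event(file);
	if (IS_ERR(event)) {
		fput(file);
		return PTR_ERR(event);
	}

	pr_debug("perf event: type=%u config=0x%llx\n",
		 event->attr.type, event->attr.config);

	fput(file);
	return 0;
}
#endif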

int perf_allow_kernel(void)
{
	if (sysctl_perf_event_paranoid > 1 && !perfmon_capable())
		return -EACCES;

	return security_perf_event_open(PERF_SECURITY_KERNEL);
}
EXPORT_SYMBOL_GPL(perf_allow_kernel);
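
/*
 * Illustrative sketch (not part of this file): a PMU driver that can expose
 * kernel-space state would typically gate that capability on
 * perf_allow_kernel() in its event_init() callback. example_pmu_event_init()
 * is an assumption made up for this sketch.
 */
#if 0
static int example_pmu_event_init(struct perf_event *event)
{
	int ret;

	if (event->attr.exclude_kernel)
		return 0;

	/* Kernel profiling requested: honour perf_event_paranoid and LSM. */
	ret = perf_allow_kernel();
	if (ret)
		return ret;

	return 0;
}
#endif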
14357
14358
/*
14359
* Inherit an event from parent task to child task.
14360
*
14361
* Returns:
14362
* - valid pointer on success
14363
* - NULL for orphaned events
14364
* - IS_ERR() on error
14365
*/
14366
static struct perf_event *
14367
inherit_event(struct perf_event *parent_event,
14368
struct task_struct *parent,
14369
struct perf_event_context *parent_ctx,
14370
struct task_struct *child,
14371
struct perf_event *group_leader,
14372
struct perf_event_context *child_ctx)
14373
{
14374
enum perf_event_state parent_state = parent_event->state;
14375
struct perf_event_pmu_context *pmu_ctx;
14376
struct perf_event *child_event;
14377
unsigned long flags;
14378
14379
/*
14380
* Instead of creating recursive hierarchies of events,
14381
* we link inherited events back to the original parent,
14382
* which has a filp for sure, which we use as the reference
14383
* count:
14384
*/
14385
if (parent_event->parent)
14386
parent_event = parent_event->parent;
14387
14388
if (parent_event->state <= PERF_EVENT_STATE_REVOKED)
14389
return NULL;
14390
14391
/*
14392
* Event creation should be under SRCU, see perf_pmu_unregister().
14393
*/
14394
guard(srcu)(&pmus_srcu);
14395
14396
child_event = perf_event_alloc(&parent_event->attr,
14397
parent_event->cpu,
14398
child,
14399
group_leader, parent_event,
14400
NULL, NULL, -1);
14401
if (IS_ERR(child_event))
14402
return child_event;
14403
14404
get_ctx(child_ctx);
14405
child_event->ctx = child_ctx;
14406
14407
pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event);
14408
if (IS_ERR(pmu_ctx)) {
14409
free_event(child_event);
14410
return ERR_CAST(pmu_ctx);
14411
}
14412
child_event->pmu_ctx = pmu_ctx;
14413
14414
/*
14415
* is_orphaned_event() and list_add_tail(&parent_event->child_list)
14416
* must be under the same lock in order to serialize against
14417
* perf_event_release_kernel(), such that either we must observe
14418
* is_orphaned_event() or they will observe us on the child_list.
14419
*/
14420
mutex_lock(&parent_event->child_mutex);
14421
if (is_orphaned_event(parent_event) ||
14422
!atomic_long_inc_not_zero(&parent_event->refcount)) {
14423
mutex_unlock(&parent_event->child_mutex);
14424
free_event(child_event);
14425
return NULL;
14426
}
14427
14428
/*
14429
* Make the child state follow the state of the parent event,
14430
* not its attr.disabled bit. We hold the parent's mutex,
14431
* so we won't race with perf_event_{en, dis}able_family.
14432
*/
14433
if (parent_state >= PERF_EVENT_STATE_INACTIVE)
14434
child_event->state = PERF_EVENT_STATE_INACTIVE;
14435
else
14436
child_event->state = PERF_EVENT_STATE_OFF;
14437
14438
if (parent_event->attr.freq) {
14439
u64 sample_period = parent_event->hw.sample_period;
14440
struct hw_perf_event *hwc = &child_event->hw;
14441
14442
hwc->sample_period = sample_period;
14443
hwc->last_period = sample_period;
14444
14445
local64_set(&hwc->period_left, sample_period);
14446
}
14447
14448
child_event->overflow_handler = parent_event->overflow_handler;
14449
child_event->overflow_handler_context
14450
= parent_event->overflow_handler_context;
14451
14452
/*
14453
* Precalculate sample_data sizes
14454
*/
14455
perf_event__header_size(child_event);
14456
perf_event__id_header_size(child_event);
14457
14458
/*
14459
* Link it up in the child's context:
14460
*/
14461
raw_spin_lock_irqsave(&child_ctx->lock, flags);
14462
add_event_to_ctx(child_event, child_ctx);
14463
child_event->attach_state |= PERF_ATTACH_CHILD;
14464
raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
14465
14466
/*
14467
* Link this into the parent event's child list
14468
*/
14469
list_add_tail(&child_event->child_list, &parent_event->child_list);
14470
mutex_unlock(&parent_event->child_mutex);
14471
14472
return child_event;
14473
}

/*
 * Inherits an event group.
 *
 * This will quietly suppress orphaned events; !inherit_event() is not an error.
 * This matches with perf_event_release_kernel() removing all child events.
 *
 * Returns:
 *  - 0 on success
 *  - <0 on error
 */
static int inherit_group(struct perf_event *parent_event,
			 struct task_struct *parent,
			 struct perf_event_context *parent_ctx,
			 struct task_struct *child,
			 struct perf_event_context *child_ctx)
{
	struct perf_event *leader;
	struct perf_event *sub;
	struct perf_event *child_ctr;

	leader = inherit_event(parent_event, parent, parent_ctx,
			       child, NULL, child_ctx);
	if (IS_ERR(leader))
		return PTR_ERR(leader);
	/*
	 * @leader can be NULL here because of is_orphaned_event(). In this
	 * case inherit_event() will create individual events, similar to what
	 * perf_group_detach() would do anyway.
	 */
	for_each_sibling_event(sub, parent_event) {
		child_ctr = inherit_event(sub, parent, parent_ctx,
					  child, leader, child_ctx);
		if (IS_ERR(child_ctr))
			return PTR_ERR(child_ctr);

		if (sub->aux_event == parent_event && child_ctr &&
		    !perf_get_aux_event(child_ctr, leader))
			return -EINVAL;
	}
	if (leader)
		leader->group_generation = parent_event->group_generation;
	return 0;
}

/*
 * Creates the child task context and tries to inherit the event-group.
 *
 * Clears @inherited_all on !attr.inherited or error. Note that we'll leave
 * inherited_all set when we 'fail' to inherit an orphaned event; this is
 * consistent with perf_event_release_kernel() removing all child events.
 *
 * Returns:
 *  - 0 on success
 *  - <0 on error
 */
static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
		   struct perf_event_context *parent_ctx,
		   struct task_struct *child,
		   u64 clone_flags, int *inherited_all)
{
	struct perf_event_context *child_ctx;
	int ret;

	if (!event->attr.inherit ||
	    (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) ||
	    /* Do not inherit if sigtrap and signal handlers were cleared. */
	    (event->attr.sigtrap && (clone_flags & CLONE_CLEAR_SIGHAND))) {
		*inherited_all = 0;
		return 0;
	}

	child_ctx = child->perf_event_ctxp;
	if (!child_ctx) {
		/*
		 * This is executed from the parent task context, so
		 * inherit events that have been marked for cloning.
		 * First allocate and initialize a context for the
		 * child.
		 */
		child_ctx = alloc_perf_context(child);
		if (!child_ctx)
			return -ENOMEM;

		child->perf_event_ctxp = child_ctx;
	}

	ret = inherit_group(event, parent, parent_ctx, child, child_ctx);
	if (ret)
		*inherited_all = 0;

	return ret;
}

/*
 * Initialize the perf_event context in task_struct
 */
static int perf_event_init_context(struct task_struct *child, u64 clone_flags)
{
	struct perf_event_context *child_ctx, *parent_ctx;
	struct perf_event_context *cloned_ctx;
	struct perf_event *event;
	struct task_struct *parent = current;
	int inherited_all = 1;
	unsigned long flags;
	int ret = 0;

	if (likely(!parent->perf_event_ctxp))
		return 0;

	/*
	 * If the parent's context is a clone, pin it so it won't get
	 * swapped under us.
	 */
	parent_ctx = perf_pin_task_context(parent);
	if (!parent_ctx)
		return 0;

	/*
	 * No need to check if parent_ctx != NULL here; since we saw
	 * it non-NULL earlier, the only reason for it to become NULL
	 * is if we exit, and since we're currently in the middle of
	 * a fork we can't be exiting at the same time.
	 */

	/*
	 * Lock the parent list. No need to lock the child - not PID
	 * hashed yet and not running, so nobody can access it.
	 */
	mutex_lock(&parent_ctx->mutex);

	/*
	 * We don't have to disable NMIs - we are only looking at
	 * the list, not manipulating it:
	 */
	perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
		ret = inherit_task_group(event, parent, parent_ctx,
					 child, clone_flags, &inherited_all);
		if (ret)
			goto out_unlock;
	}

	/*
	 * We can't hold ctx->lock when iterating the ->flexible_group list due
	 * to allocations, but we need to prevent rotation because
	 * rotate_ctx() will change the list from interrupt context.
	 */
	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
	parent_ctx->rotate_disable = 1;
	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);

	perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
		ret = inherit_task_group(event, parent, parent_ctx,
					 child, clone_flags, &inherited_all);
		if (ret)
			goto out_unlock;
	}

	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
	parent_ctx->rotate_disable = 0;

	child_ctx = child->perf_event_ctxp;

	if (child_ctx && inherited_all) {
		/*
		 * Mark the child context as a clone of the parent
		 * context, or of whatever the parent is a clone of.
		 *
		 * Note that if the parent is a clone, the holding of
		 * parent_ctx->lock prevents it from being uncloned.
		 */
		cloned_ctx = parent_ctx->parent_ctx;
		if (cloned_ctx) {
			child_ctx->parent_ctx = cloned_ctx;
			child_ctx->parent_gen = parent_ctx->parent_gen;
		} else {
			child_ctx->parent_ctx = parent_ctx;
			child_ctx->parent_gen = parent_ctx->generation;
		}
		get_ctx(child_ctx->parent_ctx);
	}

	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
out_unlock:
	mutex_unlock(&parent_ctx->mutex);

	perf_unpin_context(parent_ctx);
	put_ctx(parent_ctx);

	return ret;
}

/*
 * Initialize the perf_event context in task_struct
 */
int perf_event_init_task(struct task_struct *child, u64 clone_flags)
{
	int ret;

	memset(child->perf_recursion, 0, sizeof(child->perf_recursion));
	child->perf_event_ctxp = NULL;
	mutex_init(&child->perf_event_mutex);
	INIT_LIST_HEAD(&child->perf_event_list);
	child->perf_ctx_data = NULL;

	ret = perf_event_init_context(child, clone_flags);
	if (ret) {
		perf_event_free_task(child);
		return ret;
	}

	return 0;
}

static void __init perf_event_init_all_cpus(void)
{
	struct swevent_htable *swhash;
	struct perf_cpu_context *cpuctx;
	int cpu;

	zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
	zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL);
	zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL);
	zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL);
	zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL);
	zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL);

	for_each_possible_cpu(cpu) {
		swhash = &per_cpu(swevent_htable, cpu);
		mutex_init(&swhash->hlist_mutex);

		INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
		raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));

		INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));

		cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
		__perf_event_init_context(&cpuctx->ctx);
		lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
		lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
		cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
		cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
		cpuctx->heap = cpuctx->heap_default;
	}
}

static void perf_swevent_init_cpu(unsigned int cpu)
{
	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);

	mutex_lock(&swhash->hlist_mutex);
	if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
		struct swevent_hlist *hlist;

		hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
		WARN_ON(!hlist);
		rcu_assign_pointer(swhash->swevent_hlist, hlist);
	}
	mutex_unlock(&swhash->hlist_mutex);
}

#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
static void __perf_event_exit_context(void *__info)
{
	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
	struct perf_event_context *ctx = __info;
	struct perf_event *event;

	raw_spin_lock(&ctx->lock);
	ctx_sched_out(ctx, NULL, EVENT_TIME);
	list_for_each_entry(event, &ctx->event_list, event_entry)
		__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
	raw_spin_unlock(&ctx->lock);
}

static void perf_event_clear_cpumask(unsigned int cpu)
{
	int target[PERF_PMU_MAX_SCOPE];
	unsigned int scope;
	struct pmu *pmu;

	cpumask_clear_cpu(cpu, perf_online_mask);

	for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
		const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
		struct cpumask *pmu_cpumask = perf_scope_cpumask(scope);

		target[scope] = -1;
		if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
			continue;

		if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask))
			continue;
		target[scope] = cpumask_any_but(cpumask, cpu);
		if (target[scope] < nr_cpu_ids)
			cpumask_set_cpu(target[scope], pmu_cpumask);
	}

	/* migrate */
	list_for_each_entry(pmu, &pmus, entry) {
		if (pmu->scope == PERF_PMU_SCOPE_NONE ||
		    WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE))
			continue;

		if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids)
			perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]);
	}
}

static void perf_event_exit_cpu_context(int cpu)
{
	struct perf_cpu_context *cpuctx;
	struct perf_event_context *ctx;

	// XXX simplify cpuctx->online
	mutex_lock(&pmus_lock);
	/*
	 * Clear the cpumasks, and migrate to other CPUs if possible.
	 * Must be invoked before the __perf_event_exit_context.
	 */
	perf_event_clear_cpumask(cpu);
	cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
	ctx = &cpuctx->ctx;

	mutex_lock(&ctx->mutex);
	smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
	cpuctx->online = 0;
	mutex_unlock(&ctx->mutex);
	mutex_unlock(&pmus_lock);
}
#else

static void perf_event_exit_cpu_context(int cpu) { }

#endif

static void perf_event_setup_cpumask(unsigned int cpu)
{
	struct cpumask *pmu_cpumask;
	unsigned int scope;

	/*
	 * Early boot stage, the cpumask hasn't been set yet.
	 * The perf_online_<domain>_masks include the first CPU of each domain.
	 * Unconditionally set the boot CPU in the perf_online_<domain>_masks.
	 */
	if (cpumask_empty(perf_online_mask)) {
		for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
			pmu_cpumask = perf_scope_cpumask(scope);
			if (WARN_ON_ONCE(!pmu_cpumask))
				continue;
			cpumask_set_cpu(cpu, pmu_cpumask);
		}
		goto end;
	}

	for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
		const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);

		pmu_cpumask = perf_scope_cpumask(scope);

		if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
			continue;

		if (!cpumask_empty(cpumask) &&
		    cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids)
			cpumask_set_cpu(cpu, pmu_cpumask);
	}
end:
	cpumask_set_cpu(cpu, perf_online_mask);
}

int perf_event_init_cpu(unsigned int cpu)
{
	struct perf_cpu_context *cpuctx;
	struct perf_event_context *ctx;

	perf_swevent_init_cpu(cpu);

	mutex_lock(&pmus_lock);
	perf_event_setup_cpumask(cpu);
	cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
	ctx = &cpuctx->ctx;

	mutex_lock(&ctx->mutex);
	cpuctx->online = 1;
	mutex_unlock(&ctx->mutex);
	mutex_unlock(&pmus_lock);

	return 0;
}

int perf_event_exit_cpu(unsigned int cpu)
{
	perf_event_exit_cpu_context(cpu);
	return 0;
}
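
/*
 * Illustrative sketch (not part of this file): perf_event_init_cpu() /
 * perf_event_exit_cpu() form an online/offline pair. A driver with a similar
 * pair would commonly register it with cpuhp_setup_state(); the example_*
 * names here are assumptions made up for this sketch, and the perf core
 * itself is wired up through the static hotplug state table instead.
 */
#if 0
static int __init example_register_hotplug(void)
{
	int ret;

	/* example_cpu_online()/example_cpu_offline() are hypothetical. */
	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "perf/example:online",
				example_cpu_online, example_cpu_offline);
	return ret < 0 ? ret : 0;
}
#endif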

static int
perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
{
	int cpu;

	for_each_online_cpu(cpu)
		perf_event_exit_cpu(cpu);

	return NOTIFY_OK;
}

/*
 * Run the perf reboot notifier at the very last possible moment so that
 * the generic watchdog code runs as long as possible.
 */
static struct notifier_block perf_reboot_notifier = {
	.notifier_call = perf_reboot,
	.priority = INT_MIN,
};

void __init perf_event_init(void)
{
	int ret;

	idr_init(&pmu_idr);

	unwind_deferred_init(&perf_unwind_work,
			     perf_unwind_deferred_callback);

	perf_event_init_all_cpus();
	init_srcu_struct(&pmus_srcu);
	perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
	perf_pmu_register(&perf_cpu_clock, "cpu_clock", -1);
	perf_pmu_register(&perf_task_clock, "task_clock", -1);
	perf_tp_register();
	perf_event_init_cpu(smp_processor_id());
	register_reboot_notifier(&perf_reboot_notifier);

	ret = init_hw_breakpoint();
	WARN(ret, "hw_breakpoint initialization failed with: %d", ret);

	perf_event_cache = KMEM_CACHE(perf_event, SLAB_PANIC);

	/*
	 * Build time assertion that we keep the data_head at the intended
	 * location. IOW, validation we got the __reserved[] size right.
	 */
	BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
		     != 1024);
}

ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
			      char *page)
{
	struct perf_pmu_events_attr *pmu_attr =
		container_of(attr, struct perf_pmu_events_attr, attr);

	if (pmu_attr->event_str)
		return sprintf(page, "%s\n", pmu_attr->event_str);

	return 0;
}
EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
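
/*
 * Illustrative sketch (not part of this file): how a PMU driver typically
 * exposes named events through sysfs with perf_event_sysfs_show() as the show
 * callback, via the PMU_EVENT_ATTR_STRING() helper. The event name and config
 * string below are assumptions made up for this sketch.
 */
#if 0
PMU_EVENT_ATTR_STRING(example_cycles, example_attr_cycles, "event=0x3c");

static struct attribute *example_pmu_events[] = {
	&example_attr_cycles.attr.attr,
	NULL,
};

static const struct attribute_group example_pmu_events_group = {
	.name  = "events",
	.attrs = example_pmu_events,
};
#endif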

static int __init perf_event_sysfs_init(void)
{
	struct pmu *pmu;
	int ret;

	mutex_lock(&pmus_lock);

	ret = bus_register(&pmu_bus);
	if (ret)
		goto unlock;

	list_for_each_entry(pmu, &pmus, entry) {
		if (pmu->dev)
			continue;

		ret = pmu_dev_alloc(pmu);
		WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
	}
	pmu_bus_running = 1;
	ret = 0;

unlock:
	mutex_unlock(&pmus_lock);

	return ret;
}
device_initcall(perf_event_sysfs_init);

#ifdef CONFIG_CGROUP_PERF
static struct cgroup_subsys_state *
perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct perf_cgroup *jc;

	jc = kzalloc(sizeof(*jc), GFP_KERNEL);
	if (!jc)
		return ERR_PTR(-ENOMEM);

	jc->info = alloc_percpu(struct perf_cgroup_info);
	if (!jc->info) {
		kfree(jc);
		return ERR_PTR(-ENOMEM);
	}

	return &jc->css;
}

static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
{
	struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);

	free_percpu(jc->info);
	kfree(jc);
}

static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
{
	perf_event_cgroup(css->cgroup);
	return 0;
}

static int __perf_cgroup_move(void *info)
{
	struct task_struct *task = info;

	preempt_disable();
	perf_cgroup_switch(task);
	preempt_enable();

	return 0;
}

static void perf_cgroup_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *css;

	cgroup_taskset_for_each(task, css, tset)
		task_function_call(task, __perf_cgroup_move, task);
}

struct cgroup_subsys perf_event_cgrp_subsys = {
	.css_alloc	= perf_cgroup_css_alloc,
	.css_free	= perf_cgroup_css_free,
	.css_online	= perf_cgroup_css_online,
	.attach		= perf_cgroup_attach,
	/*
	 * Implicitly enable on dfl hierarchy so that perf events can
	 * always be filtered by cgroup2 path as long as perf_event
	 * controller is not mounted on a legacy hierarchy.
	 */
	.implicit_on_dfl = true,
	.threaded	= true,
};
#endif /* CONFIG_CGROUP_PERF */

DEFINE_STATIC_CALL_RET0(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);
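
/*
 * Illustrative sketch (not part of this file): perf_snapshot_branch_stack is
 * a static call that defaults to returning 0. An architecture PMU driver with
 * a branch-stack snapshot capability would typically point it at its own
 * helper via static_call_update(). The example_* names are assumptions made
 * up for this sketch.
 */
#if 0
static int example_snapshot_branch_stack(struct perf_branch_entry *entries,
					 unsigned int cnt)
{
	/* Fill @entries with up to @cnt captured branches; sketch only. */
	return 0;
}

static void example_enable_branch_snapshot(void)
{
	static_call_update(perf_snapshot_branch_stack,
			   example_snapshot_branch_stack);
}
#endif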