Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/kernel/events/core.c
48988 views
1
// SPDX-License-Identifier: GPL-2.0
2
/*
3
* Performance events core code:
4
*
5
* Copyright (C) 2008 Linutronix GmbH, Thomas Gleixner <[email protected]>
6
* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
7
* Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
8
* Copyright © 2009 Paul Mackerras, IBM Corp. <[email protected]>
9
*/
10
11
#include <linux/fs.h>
12
#include <linux/mm.h>
13
#include <linux/cpu.h>
14
#include <linux/smp.h>
15
#include <linux/idr.h>
16
#include <linux/file.h>
17
#include <linux/poll.h>
18
#include <linux/slab.h>
19
#include <linux/hash.h>
20
#include <linux/tick.h>
21
#include <linux/sysfs.h>
22
#include <linux/dcache.h>
23
#include <linux/percpu.h>
24
#include <linux/ptrace.h>
25
#include <linux/reboot.h>
26
#include <linux/vmstat.h>
27
#include <linux/device.h>
28
#include <linux/export.h>
29
#include <linux/vmalloc.h>
30
#include <linux/hardirq.h>
31
#include <linux/hugetlb.h>
32
#include <linux/rculist.h>
33
#include <linux/uaccess.h>
34
#include <linux/syscalls.h>
35
#include <linux/anon_inodes.h>
36
#include <linux/kernel_stat.h>
37
#include <linux/cgroup.h>
38
#include <linux/perf_event.h>
39
#include <linux/trace_events.h>
40
#include <linux/hw_breakpoint.h>
41
#include <linux/mm_types.h>
42
#include <linux/module.h>
43
#include <linux/mman.h>
44
#include <linux/compat.h>
45
#include <linux/bpf.h>
46
#include <linux/filter.h>
47
#include <linux/namei.h>
48
#include <linux/parser.h>
49
#include <linux/sched/clock.h>
50
#include <linux/sched/mm.h>
51
#include <linux/proc_ns.h>
52
#include <linux/mount.h>
53
#include <linux/min_heap.h>
54
#include <linux/highmem.h>
55
#include <linux/pgtable.h>
56
#include <linux/buildid.h>
57
#include <linux/task_work.h>
58
#include <linux/percpu-rwsem.h>
59
#include <linux/unwind_deferred.h>
60
61
#include "internal.h"
62
63
#include <asm/irq_regs.h>
64
65
typedef int (*remote_function_f)(void *);
66
67
struct remote_function_call {
68
struct task_struct *p;
69
remote_function_f func;
70
void *info;
71
int ret;
72
};
73
74
static void remote_function(void *data)
75
{
76
struct remote_function_call *tfc = data;
77
struct task_struct *p = tfc->p;
78
79
if (p) {
80
/* -EAGAIN */
81
if (task_cpu(p) != smp_processor_id())
82
return;
83
84
/*
85
* Now that we're on right CPU with IRQs disabled, we can test
86
* if we hit the right task without races.
87
*/
88
89
tfc->ret = -ESRCH; /* No such (running) process */
90
if (p != current)
91
return;
92
}
93
94
tfc->ret = tfc->func(tfc->info);
95
}
96
97
/**
98
* task_function_call - call a function on the cpu on which a task runs
99
* @p: the task to evaluate
100
* @func: the function to be called
101
* @info: the function call argument
102
*
103
* Calls the function @func when the task is currently running. This might
104
* be on the current CPU, which just calls the function directly. This will
105
* retry due to any failures in smp_call_function_single(), such as if the
106
* task_cpu() goes offline concurrently.
107
*
108
* returns @func return value or -ESRCH or -ENXIO when the process isn't running
109
*/
110
static int
111
task_function_call(struct task_struct *p, remote_function_f func, void *info)
112
{
113
struct remote_function_call data = {
114
.p = p,
115
.func = func,
116
.info = info,
117
.ret = -EAGAIN,
118
};
119
int ret;
120
121
for (;;) {
122
ret = smp_call_function_single(task_cpu(p), remote_function,
123
&data, 1);
124
if (!ret)
125
ret = data.ret;
126
127
if (ret != -EAGAIN)
128
break;
129
130
cond_resched();
131
}
132
133
return ret;
134
}
135
136
/**
137
* cpu_function_call - call a function on the cpu
138
* @cpu: target cpu to queue this function
139
* @func: the function to be called
140
* @info: the function call argument
141
*
142
* Calls the function @func on the remote cpu.
143
*
144
* returns: @func return value or -ENXIO when the cpu is offline
145
*/
146
static int cpu_function_call(int cpu, remote_function_f func, void *info)
147
{
148
struct remote_function_call data = {
149
.p = NULL,
150
.func = func,
151
.info = info,
152
.ret = -ENXIO, /* No such CPU */
153
};
154
155
smp_call_function_single(cpu, remote_function, &data, 1);
156
157
return data.ret;
158
}
159
160
enum event_type_t {
161
EVENT_FLEXIBLE = 0x01,
162
EVENT_PINNED = 0x02,
163
EVENT_TIME = 0x04,
164
EVENT_FROZEN = 0x08,
165
/* see ctx_resched() for details */
166
EVENT_CPU = 0x10,
167
EVENT_CGROUP = 0x20,
168
169
/* compound helpers */
170
EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
171
EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN,
172
};
173
174
static inline void __perf_ctx_lock(struct perf_event_context *ctx)
175
{
176
raw_spin_lock(&ctx->lock);
177
WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN);
178
}
179
180
static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
181
struct perf_event_context *ctx)
182
{
183
__perf_ctx_lock(&cpuctx->ctx);
184
if (ctx)
185
__perf_ctx_lock(ctx);
186
}
187
188
static inline void __perf_ctx_unlock(struct perf_event_context *ctx)
189
{
190
/*
191
* If ctx_sched_in() didn't again set any ALL flags, clean up
192
* after ctx_sched_out() by clearing is_active.
193
*/
194
if (ctx->is_active & EVENT_FROZEN) {
195
if (!(ctx->is_active & EVENT_ALL))
196
ctx->is_active = 0;
197
else
198
ctx->is_active &= ~EVENT_FROZEN;
199
}
200
raw_spin_unlock(&ctx->lock);
201
}
202
203
static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
204
struct perf_event_context *ctx)
205
{
206
if (ctx)
207
__perf_ctx_unlock(ctx);
208
__perf_ctx_unlock(&cpuctx->ctx);
209
}
210
211
typedef struct {
212
struct perf_cpu_context *cpuctx;
213
struct perf_event_context *ctx;
214
} class_perf_ctx_lock_t;
215
216
static inline void class_perf_ctx_lock_destructor(class_perf_ctx_lock_t *_T)
217
{ perf_ctx_unlock(_T->cpuctx, _T->ctx); }
218
219
static inline class_perf_ctx_lock_t
220
class_perf_ctx_lock_constructor(struct perf_cpu_context *cpuctx,
221
struct perf_event_context *ctx)
222
{ perf_ctx_lock(cpuctx, ctx); return (class_perf_ctx_lock_t){ cpuctx, ctx }; }
223
224
#define TASK_TOMBSTONE ((void *)-1L)
225
226
static bool is_kernel_event(struct perf_event *event)
227
{
228
return READ_ONCE(event->owner) == TASK_TOMBSTONE;
229
}
230
231
static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
232
233
struct perf_event_context *perf_cpu_task_ctx(void)
234
{
235
lockdep_assert_irqs_disabled();
236
return this_cpu_ptr(&perf_cpu_context)->task_ctx;
237
}
238
239
/*
240
* On task ctx scheduling...
241
*
242
* When !ctx->nr_events a task context will not be scheduled. This means
243
* we can disable the scheduler hooks (for performance) without leaving
244
* pending task ctx state.
245
*
246
* This however results in two special cases:
247
*
248
* - removing the last event from a task ctx; this is relatively straight
249
* forward and is done in __perf_remove_from_context.
250
*
251
* - adding the first event to a task ctx; this is tricky because we cannot
252
* rely on ctx->is_active and therefore cannot use event_function_call().
253
* See perf_install_in_context().
254
*
255
* If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
256
*/
257
258
typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
259
struct perf_event_context *, void *);
260
261
struct event_function_struct {
262
struct perf_event *event;
263
event_f func;
264
void *data;
265
};
266
267
static int event_function(void *info)
268
{
269
struct event_function_struct *efs = info;
270
struct perf_event *event = efs->event;
271
struct perf_event_context *ctx = event->ctx;
272
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
273
struct perf_event_context *task_ctx = cpuctx->task_ctx;
274
int ret = 0;
275
276
lockdep_assert_irqs_disabled();
277
278
perf_ctx_lock(cpuctx, task_ctx);
279
/*
280
* Since we do the IPI call without holding ctx->lock things can have
281
* changed, double check we hit the task we set out to hit.
282
*/
283
if (ctx->task) {
284
if (ctx->task != current) {
285
ret = -ESRCH;
286
goto unlock;
287
}
288
289
/*
290
* We only use event_function_call() on established contexts,
291
* and event_function() is only ever called when active (or
292
* rather, we'll have bailed in task_function_call() or the
293
* above ctx->task != current test), therefore we must have
294
* ctx->is_active here.
295
*/
296
WARN_ON_ONCE(!ctx->is_active);
297
/*
298
* And since we have ctx->is_active, cpuctx->task_ctx must
299
* match.
300
*/
301
WARN_ON_ONCE(task_ctx != ctx);
302
} else {
303
WARN_ON_ONCE(&cpuctx->ctx != ctx);
304
}
305
306
efs->func(event, cpuctx, ctx, efs->data);
307
unlock:
308
perf_ctx_unlock(cpuctx, task_ctx);
309
310
return ret;
311
}
312
313
static void event_function_call(struct perf_event *event, event_f func, void *data)
314
{
315
struct perf_event_context *ctx = event->ctx;
316
struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
317
struct perf_cpu_context *cpuctx;
318
struct event_function_struct efs = {
319
.event = event,
320
.func = func,
321
.data = data,
322
};
323
324
if (!event->parent) {
325
/*
326
* If this is a !child event, we must hold ctx::mutex to
327
* stabilize the event->ctx relation. See
328
* perf_event_ctx_lock().
329
*/
330
lockdep_assert_held(&ctx->mutex);
331
}
332
333
if (!task) {
334
cpu_function_call(event->cpu, event_function, &efs);
335
return;
336
}
337
338
if (task == TASK_TOMBSTONE)
339
return;
340
341
again:
342
if (!task_function_call(task, event_function, &efs))
343
return;
344
345
local_irq_disable();
346
cpuctx = this_cpu_ptr(&perf_cpu_context);
347
perf_ctx_lock(cpuctx, ctx);
348
/*
349
* Reload the task pointer, it might have been changed by
350
* a concurrent perf_event_context_sched_out().
351
*/
352
task = ctx->task;
353
if (task == TASK_TOMBSTONE)
354
goto unlock;
355
if (ctx->is_active) {
356
perf_ctx_unlock(cpuctx, ctx);
357
local_irq_enable();
358
goto again;
359
}
360
func(event, NULL, ctx, data);
361
unlock:
362
perf_ctx_unlock(cpuctx, ctx);
363
local_irq_enable();
364
}
365
366
/*
367
* Similar to event_function_call() + event_function(), but hard assumes IRQs
368
* are already disabled and we're on the right CPU.
369
*/
370
static void event_function_local(struct perf_event *event, event_f func, void *data)
371
{
372
struct perf_event_context *ctx = event->ctx;
373
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
374
struct task_struct *task = READ_ONCE(ctx->task);
375
struct perf_event_context *task_ctx = NULL;
376
377
lockdep_assert_irqs_disabled();
378
379
if (task) {
380
if (task == TASK_TOMBSTONE)
381
return;
382
383
task_ctx = ctx;
384
}
385
386
perf_ctx_lock(cpuctx, task_ctx);
387
388
task = ctx->task;
389
if (task == TASK_TOMBSTONE)
390
goto unlock;
391
392
if (task) {
393
/*
394
* We must be either inactive or active and the right task,
395
* otherwise we're screwed, since we cannot IPI to somewhere
396
* else.
397
*/
398
if (ctx->is_active) {
399
if (WARN_ON_ONCE(task != current))
400
goto unlock;
401
402
if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
403
goto unlock;
404
}
405
} else {
406
WARN_ON_ONCE(&cpuctx->ctx != ctx);
407
}
408
409
func(event, cpuctx, ctx, data);
410
unlock:
411
perf_ctx_unlock(cpuctx, task_ctx);
412
}
413
414
#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
415
PERF_FLAG_FD_OUTPUT |\
416
PERF_FLAG_PID_CGROUP |\
417
PERF_FLAG_FD_CLOEXEC)
418
419
/*
420
* branch priv levels that need permission checks
421
*/
422
#define PERF_SAMPLE_BRANCH_PERM_PLM \
423
(PERF_SAMPLE_BRANCH_KERNEL |\
424
PERF_SAMPLE_BRANCH_HV)
425
426
/*
427
* perf_sched_events : >0 events exist
428
*/
429
430
static void perf_sched_delayed(struct work_struct *work);
431
DEFINE_STATIC_KEY_FALSE(perf_sched_events);
432
static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
433
static DEFINE_MUTEX(perf_sched_mutex);
434
static atomic_t perf_sched_count;
435
436
static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
437
438
static atomic_t nr_mmap_events __read_mostly;
439
static atomic_t nr_comm_events __read_mostly;
440
static atomic_t nr_namespaces_events __read_mostly;
441
static atomic_t nr_task_events __read_mostly;
442
static atomic_t nr_freq_events __read_mostly;
443
static atomic_t nr_switch_events __read_mostly;
444
static atomic_t nr_ksymbol_events __read_mostly;
445
static atomic_t nr_bpf_events __read_mostly;
446
static atomic_t nr_cgroup_events __read_mostly;
447
static atomic_t nr_text_poke_events __read_mostly;
448
static atomic_t nr_build_id_events __read_mostly;
449
450
static LIST_HEAD(pmus);
451
static DEFINE_MUTEX(pmus_lock);
452
static struct srcu_struct pmus_srcu;
453
static cpumask_var_t perf_online_mask;
454
static cpumask_var_t perf_online_core_mask;
455
static cpumask_var_t perf_online_die_mask;
456
static cpumask_var_t perf_online_cluster_mask;
457
static cpumask_var_t perf_online_pkg_mask;
458
static cpumask_var_t perf_online_sys_mask;
459
static struct kmem_cache *perf_event_cache;
460
461
/*
462
* perf event paranoia level:
463
* -1 - not paranoid at all
464
* 0 - disallow raw tracepoint access for unpriv
465
* 1 - disallow cpu events for unpriv
466
* 2 - disallow kernel profiling for unpriv
467
*/
468
int sysctl_perf_event_paranoid __read_mostly = 2;
469
470
/* Minimum for 512 kiB + 1 user control page. 'free' kiB per user. */
471
static int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024);
472
473
/*
474
* max perf event sample rate
475
*/
476
#define DEFAULT_MAX_SAMPLE_RATE 100000
477
#define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
478
#define DEFAULT_CPU_TIME_MAX_PERCENT 25
479
480
int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
481
static int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
482
483
static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
484
static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
485
486
static int perf_sample_allowed_ns __read_mostly =
487
DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
488
489
static void update_perf_cpu_limits(void)
490
{
491
u64 tmp = perf_sample_period_ns;
492
493
tmp *= sysctl_perf_cpu_time_max_percent;
494
tmp = div_u64(tmp, 100);
495
if (!tmp)
496
tmp = 1;
497
498
WRITE_ONCE(perf_sample_allowed_ns, tmp);
499
}
500
501
static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);
502
503
static int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write,
504
void *buffer, size_t *lenp, loff_t *ppos)
505
{
506
int ret;
507
int perf_cpu = sysctl_perf_cpu_time_max_percent;
508
/*
509
* If throttling is disabled don't allow the write:
510
*/
511
if (write && (perf_cpu == 100 || perf_cpu == 0))
512
return -EINVAL;
513
514
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
515
if (ret || !write)
516
return ret;
517
518
max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
519
perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
520
update_perf_cpu_limits();
521
522
return 0;
523
}
524
525
static int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write,
526
void *buffer, size_t *lenp, loff_t *ppos)
527
{
528
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
529
530
if (ret || !write)
531
return ret;
532
533
if (sysctl_perf_cpu_time_max_percent == 100 ||
534
sysctl_perf_cpu_time_max_percent == 0) {
535
printk(KERN_WARNING
536
"perf: Dynamic interrupt throttling disabled, can hang your system!\n");
537
WRITE_ONCE(perf_sample_allowed_ns, 0);
538
} else {
539
update_perf_cpu_limits();
540
}
541
542
return 0;
543
}
544
545
static const struct ctl_table events_core_sysctl_table[] = {
546
/*
547
* User-space relies on this file as a feature check for
548
* perf_events being enabled. It's an ABI, do not remove!
549
*/
550
{
551
.procname = "perf_event_paranoid",
552
.data = &sysctl_perf_event_paranoid,
553
.maxlen = sizeof(sysctl_perf_event_paranoid),
554
.mode = 0644,
555
.proc_handler = proc_dointvec,
556
},
557
{
558
.procname = "perf_event_mlock_kb",
559
.data = &sysctl_perf_event_mlock,
560
.maxlen = sizeof(sysctl_perf_event_mlock),
561
.mode = 0644,
562
.proc_handler = proc_dointvec,
563
},
564
{
565
.procname = "perf_event_max_sample_rate",
566
.data = &sysctl_perf_event_sample_rate,
567
.maxlen = sizeof(sysctl_perf_event_sample_rate),
568
.mode = 0644,
569
.proc_handler = perf_event_max_sample_rate_handler,
570
.extra1 = SYSCTL_ONE,
571
},
572
{
573
.procname = "perf_cpu_time_max_percent",
574
.data = &sysctl_perf_cpu_time_max_percent,
575
.maxlen = sizeof(sysctl_perf_cpu_time_max_percent),
576
.mode = 0644,
577
.proc_handler = perf_cpu_time_max_percent_handler,
578
.extra1 = SYSCTL_ZERO,
579
.extra2 = SYSCTL_ONE_HUNDRED,
580
},
581
};
582
583
static int __init init_events_core_sysctls(void)
584
{
585
register_sysctl_init("kernel", events_core_sysctl_table);
586
return 0;
587
}
588
core_initcall(init_events_core_sysctls);
589
590
591
/*
592
* perf samples are done in some very critical code paths (NMIs).
593
* If they take too much CPU time, the system can lock up and not
594
* get any real work done. This will drop the sample rate when
595
* we detect that events are taking too long.
596
*/
597
#define NR_ACCUMULATED_SAMPLES 128
598
static DEFINE_PER_CPU(u64, running_sample_length);
599
600
static u64 __report_avg;
601
static u64 __report_allowed;
602
603
static void perf_duration_warn(struct irq_work *w)
604
{
605
printk_ratelimited(KERN_INFO
606
"perf: interrupt took too long (%lld > %lld), lowering "
607
"kernel.perf_event_max_sample_rate to %d\n",
608
__report_avg, __report_allowed,
609
sysctl_perf_event_sample_rate);
610
}
611
612
static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
613
614
void perf_sample_event_took(u64 sample_len_ns)
615
{
616
u64 max_len = READ_ONCE(perf_sample_allowed_ns);
617
u64 running_len;
618
u64 avg_len;
619
u32 max;
620
621
if (max_len == 0)
622
return;
623
624
/* Decay the counter by 1 average sample. */
625
running_len = __this_cpu_read(running_sample_length);
626
running_len -= running_len/NR_ACCUMULATED_SAMPLES;
627
running_len += sample_len_ns;
628
__this_cpu_write(running_sample_length, running_len);
629
630
/*
631
* Note: this will be biased artificially low until we have
632
* seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
633
* from having to maintain a count.
634
*/
635
avg_len = running_len/NR_ACCUMULATED_SAMPLES;
636
if (avg_len <= max_len)
637
return;
638
639
__report_avg = avg_len;
640
__report_allowed = max_len;
641
642
/*
643
* Compute a throttle threshold 25% below the current duration.
644
*/
645
avg_len += avg_len / 4;
646
max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
647
if (avg_len < max)
648
max /= (u32)avg_len;
649
else
650
max = 1;
651
652
WRITE_ONCE(perf_sample_allowed_ns, avg_len);
653
WRITE_ONCE(max_samples_per_tick, max);
654
655
sysctl_perf_event_sample_rate = max * HZ;
656
perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
657
658
if (!irq_work_queue(&perf_duration_work)) {
659
early_printk("perf: interrupt took too long (%lld > %lld), lowering "
660
"kernel.perf_event_max_sample_rate to %d\n",
661
__report_avg, __report_allowed,
662
sysctl_perf_event_sample_rate);
663
}
664
}
665
666
static atomic64_t perf_event_id;
667
668
static void update_context_time(struct perf_event_context *ctx);
669
static u64 perf_event_time(struct perf_event *event);
670
671
void __weak perf_event_print_debug(void) { }
672
673
static inline u64 perf_clock(void)
674
{
675
return local_clock();
676
}
677
678
static inline u64 perf_event_clock(struct perf_event *event)
679
{
680
return event->clock();
681
}
682
683
/*
684
* State based event timekeeping...
685
*
686
* The basic idea is to use event->state to determine which (if any) time
687
* fields to increment with the current delta. This means we only need to
688
* update timestamps when we change state or when they are explicitly requested
689
* (read).
690
*
691
* Event groups make things a little more complicated, but not terribly so. The
692
* rules for a group are that if the group leader is OFF the entire group is
693
* OFF, irrespective of what the group member states are. This results in
694
* __perf_effective_state().
695
*
696
* A further ramification is that when a group leader flips between OFF and
697
* !OFF, we need to update all group member times.
698
*
699
*
700
* NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
701
* need to make sure the relevant context time is updated before we try and
702
* update our timestamps.
703
*/
704
705
static __always_inline enum perf_event_state
706
__perf_effective_state(struct perf_event *event)
707
{
708
struct perf_event *leader = event->group_leader;
709
710
if (leader->state <= PERF_EVENT_STATE_OFF)
711
return leader->state;
712
713
return event->state;
714
}
715
716
static __always_inline void
717
__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
718
{
719
enum perf_event_state state = __perf_effective_state(event);
720
u64 delta = now - event->tstamp;
721
722
*enabled = event->total_time_enabled;
723
if (state >= PERF_EVENT_STATE_INACTIVE)
724
*enabled += delta;
725
726
*running = event->total_time_running;
727
if (state >= PERF_EVENT_STATE_ACTIVE)
728
*running += delta;
729
}
730
731
static void perf_event_update_time(struct perf_event *event)
732
{
733
u64 now = perf_event_time(event);
734
735
__perf_update_times(event, now, &event->total_time_enabled,
736
&event->total_time_running);
737
event->tstamp = now;
738
}
739
740
static void perf_event_update_sibling_time(struct perf_event *leader)
741
{
742
struct perf_event *sibling;
743
744
for_each_sibling_event(sibling, leader)
745
perf_event_update_time(sibling);
746
}
747
748
static void
749
perf_event_set_state(struct perf_event *event, enum perf_event_state state)
750
{
751
if (event->state == state)
752
return;
753
754
perf_event_update_time(event);
755
/*
756
* If a group leader gets enabled/disabled all its siblings
757
* are affected too.
758
*/
759
if ((event->state < 0) ^ (state < 0))
760
perf_event_update_sibling_time(event);
761
762
WRITE_ONCE(event->state, state);
763
}
764
765
/*
766
* UP store-release, load-acquire
767
*/
768
769
#define __store_release(ptr, val) \
770
do { \
771
barrier(); \
772
WRITE_ONCE(*(ptr), (val)); \
773
} while (0)
774
775
#define __load_acquire(ptr) \
776
({ \
777
__unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr)); \
778
barrier(); \
779
___p; \
780
})
781
782
#define for_each_epc(_epc, _ctx, _pmu, _cgroup) \
783
list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \
784
if (_cgroup && !_epc->nr_cgroups) \
785
continue; \
786
else if (_pmu && _epc->pmu != _pmu) \
787
continue; \
788
else
789
790
static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
791
{
792
struct perf_event_pmu_context *pmu_ctx;
793
794
for_each_epc(pmu_ctx, ctx, NULL, cgroup)
795
perf_pmu_disable(pmu_ctx->pmu);
796
}
797
798
static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
799
{
800
struct perf_event_pmu_context *pmu_ctx;
801
802
for_each_epc(pmu_ctx, ctx, NULL, cgroup)
803
perf_pmu_enable(pmu_ctx->pmu);
804
}
805
806
static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
807
static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
808
809
#ifdef CONFIG_CGROUP_PERF
810
811
static inline bool
812
perf_cgroup_match(struct perf_event *event)
813
{
814
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
815
816
/* @event doesn't care about cgroup */
817
if (!event->cgrp)
818
return true;
819
820
/* wants specific cgroup scope but @cpuctx isn't associated with any */
821
if (!cpuctx->cgrp)
822
return false;
823
824
/*
825
* Cgroup scoping is recursive. An event enabled for a cgroup is
826
* also enabled for all its descendant cgroups. If @cpuctx's
827
* cgroup is a descendant of @event's (the test covers identity
828
* case), it's a match.
829
*/
830
return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
831
event->cgrp->css.cgroup);
832
}
833
834
static inline void perf_detach_cgroup(struct perf_event *event)
835
{
836
css_put(&event->cgrp->css);
837
event->cgrp = NULL;
838
}
839
840
static inline int is_cgroup_event(struct perf_event *event)
841
{
842
return event->cgrp != NULL;
843
}
844
845
static inline u64 perf_cgroup_event_time(struct perf_event *event)
846
{
847
struct perf_cgroup_info *t;
848
849
t = per_cpu_ptr(event->cgrp->info, event->cpu);
850
return t->time;
851
}
852
853
static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
854
{
855
struct perf_cgroup_info *t;
856
857
t = per_cpu_ptr(event->cgrp->info, event->cpu);
858
if (!__load_acquire(&t->active))
859
return t->time;
860
now += READ_ONCE(t->timeoffset);
861
return now;
862
}
863
864
static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv)
865
{
866
if (adv)
867
info->time += now - info->timestamp;
868
info->timestamp = now;
869
/*
870
* see update_context_time()
871
*/
872
WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
873
}
874
875
static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
876
{
877
struct perf_cgroup *cgrp = cpuctx->cgrp;
878
struct cgroup_subsys_state *css;
879
struct perf_cgroup_info *info;
880
881
if (cgrp) {
882
u64 now = perf_clock();
883
884
for (css = &cgrp->css; css; css = css->parent) {
885
cgrp = container_of(css, struct perf_cgroup, css);
886
info = this_cpu_ptr(cgrp->info);
887
888
__update_cgrp_time(info, now, true);
889
if (final)
890
__store_release(&info->active, 0);
891
}
892
}
893
}
894
895
static inline void update_cgrp_time_from_event(struct perf_event *event)
896
{
897
struct perf_cgroup_info *info;
898
899
/*
900
* ensure we access cgroup data only when needed and
901
* when we know the cgroup is pinned (css_get)
902
*/
903
if (!is_cgroup_event(event))
904
return;
905
906
info = this_cpu_ptr(event->cgrp->info);
907
/*
908
* Do not update time when cgroup is not active
909
*/
910
if (info->active)
911
__update_cgrp_time(info, perf_clock(), true);
912
}
913
914
static inline void
915
perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
916
{
917
struct perf_event_context *ctx = &cpuctx->ctx;
918
struct perf_cgroup *cgrp = cpuctx->cgrp;
919
struct perf_cgroup_info *info;
920
struct cgroup_subsys_state *css;
921
922
/*
923
* ctx->lock held by caller
924
* ensure we do not access cgroup data
925
* unless we have the cgroup pinned (css_get)
926
*/
927
if (!cgrp)
928
return;
929
930
WARN_ON_ONCE(!ctx->nr_cgroups);
931
932
for (css = &cgrp->css; css; css = css->parent) {
933
cgrp = container_of(css, struct perf_cgroup, css);
934
info = this_cpu_ptr(cgrp->info);
935
__update_cgrp_time(info, ctx->timestamp, false);
936
__store_release(&info->active, 1);
937
}
938
}
939
940
/*
941
* reschedule events based on the cgroup constraint of task.
942
*/
943
static void perf_cgroup_switch(struct task_struct *task)
944
{
945
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
946
struct perf_cgroup *cgrp;
947
948
/*
949
* cpuctx->cgrp is set when the first cgroup event enabled,
950
* and is cleared when the last cgroup event disabled.
951
*/
952
if (READ_ONCE(cpuctx->cgrp) == NULL)
953
return;
954
955
cgrp = perf_cgroup_from_task(task, NULL);
956
if (READ_ONCE(cpuctx->cgrp) == cgrp)
957
return;
958
959
guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx);
960
/*
961
* Re-check, could've raced vs perf_remove_from_context().
962
*/
963
if (READ_ONCE(cpuctx->cgrp) == NULL)
964
return;
965
966
WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
967
968
perf_ctx_disable(&cpuctx->ctx, true);
969
970
ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
971
/*
972
* must not be done before ctxswout due
973
* to update_cgrp_time_from_cpuctx() in
974
* ctx_sched_out()
975
*/
976
cpuctx->cgrp = cgrp;
977
/*
978
* set cgrp before ctxsw in to allow
979
* perf_cgroup_set_timestamp() in ctx_sched_in()
980
* to not have to pass task around
981
*/
982
ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
983
984
perf_ctx_enable(&cpuctx->ctx, true);
985
}
986
987
static int perf_cgroup_ensure_storage(struct perf_event *event,
988
struct cgroup_subsys_state *css)
989
{
990
struct perf_cpu_context *cpuctx;
991
struct perf_event **storage;
992
int cpu, heap_size, ret = 0;
993
994
/*
995
* Allow storage to have sufficient space for an iterator for each
996
* possibly nested cgroup plus an iterator for events with no cgroup.
997
*/
998
for (heap_size = 1; css; css = css->parent)
999
heap_size++;
1000
1001
for_each_possible_cpu(cpu) {
1002
cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
1003
if (heap_size <= cpuctx->heap_size)
1004
continue;
1005
1006
storage = kmalloc_node(heap_size * sizeof(struct perf_event *),
1007
GFP_KERNEL, cpu_to_node(cpu));
1008
if (!storage) {
1009
ret = -ENOMEM;
1010
break;
1011
}
1012
1013
raw_spin_lock_irq(&cpuctx->ctx.lock);
1014
if (cpuctx->heap_size < heap_size) {
1015
swap(cpuctx->heap, storage);
1016
if (storage == cpuctx->heap_default)
1017
storage = NULL;
1018
cpuctx->heap_size = heap_size;
1019
}
1020
raw_spin_unlock_irq(&cpuctx->ctx.lock);
1021
1022
kfree(storage);
1023
}
1024
1025
return ret;
1026
}
1027
1028
static inline int perf_cgroup_connect(int fd, struct perf_event *event,
1029
struct perf_event_attr *attr,
1030
struct perf_event *group_leader)
1031
{
1032
struct perf_cgroup *cgrp;
1033
struct cgroup_subsys_state *css;
1034
CLASS(fd, f)(fd);
1035
int ret = 0;
1036
1037
if (fd_empty(f))
1038
return -EBADF;
1039
1040
css = css_tryget_online_from_dir(fd_file(f)->f_path.dentry,
1041
&perf_event_cgrp_subsys);
1042
if (IS_ERR(css))
1043
return PTR_ERR(css);
1044
1045
ret = perf_cgroup_ensure_storage(event, css);
1046
if (ret)
1047
return ret;
1048
1049
cgrp = container_of(css, struct perf_cgroup, css);
1050
event->cgrp = cgrp;
1051
1052
/*
1053
* all events in a group must monitor
1054
* the same cgroup because a task belongs
1055
* to only one perf cgroup at a time
1056
*/
1057
if (group_leader && group_leader->cgrp != cgrp) {
1058
perf_detach_cgroup(event);
1059
ret = -EINVAL;
1060
}
1061
return ret;
1062
}
1063
1064
static inline void
1065
perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
1066
{
1067
struct perf_cpu_context *cpuctx;
1068
1069
if (!is_cgroup_event(event))
1070
return;
1071
1072
event->pmu_ctx->nr_cgroups++;
1073
1074
/*
1075
* Because cgroup events are always per-cpu events,
1076
* @ctx == &cpuctx->ctx.
1077
*/
1078
cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
1079
1080
if (ctx->nr_cgroups++)
1081
return;
1082
1083
cpuctx->cgrp = perf_cgroup_from_task(current, ctx);
1084
}
1085
1086
static inline void
1087
perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
1088
{
1089
struct perf_cpu_context *cpuctx;
1090
1091
if (!is_cgroup_event(event))
1092
return;
1093
1094
event->pmu_ctx->nr_cgroups--;
1095
1096
/*
1097
* Because cgroup events are always per-cpu events,
1098
* @ctx == &cpuctx->ctx.
1099
*/
1100
cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
1101
1102
if (--ctx->nr_cgroups)
1103
return;
1104
1105
cpuctx->cgrp = NULL;
1106
}
1107
1108
#else /* !CONFIG_CGROUP_PERF */
1109
1110
static inline bool
1111
perf_cgroup_match(struct perf_event *event)
1112
{
1113
return true;
1114
}
1115
1116
static inline void perf_detach_cgroup(struct perf_event *event)
1117
{}
1118
1119
static inline int is_cgroup_event(struct perf_event *event)
1120
{
1121
return 0;
1122
}
1123
1124
static inline void update_cgrp_time_from_event(struct perf_event *event)
1125
{
1126
}
1127
1128
static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
1129
bool final)
1130
{
1131
}
1132
1133
static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
1134
struct perf_event_attr *attr,
1135
struct perf_event *group_leader)
1136
{
1137
return -EINVAL;
1138
}
1139
1140
static inline void
1141
perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
1142
{
1143
}
1144
1145
static inline u64 perf_cgroup_event_time(struct perf_event *event)
1146
{
1147
return 0;
1148
}
1149
1150
static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
1151
{
1152
return 0;
1153
}
1154
1155
static inline void
1156
perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
1157
{
1158
}
1159
1160
static inline void
1161
perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
1162
{
1163
}
1164
1165
static void perf_cgroup_switch(struct task_struct *task)
1166
{
1167
}
1168
#endif
1169
1170
/*
1171
* set default to be dependent on timer tick just
1172
* like original code
1173
*/
1174
#define PERF_CPU_HRTIMER (1000 / HZ)
1175
/*
1176
* function must be called with interrupts disabled
1177
*/
1178
static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
1179
{
1180
struct perf_cpu_pmu_context *cpc;
1181
bool rotations;
1182
1183
lockdep_assert_irqs_disabled();
1184
1185
cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer);
1186
rotations = perf_rotate_context(cpc);
1187
1188
raw_spin_lock(&cpc->hrtimer_lock);
1189
if (rotations)
1190
hrtimer_forward_now(hr, cpc->hrtimer_interval);
1191
else
1192
cpc->hrtimer_active = 0;
1193
raw_spin_unlock(&cpc->hrtimer_lock);
1194
1195
return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
1196
}
1197
1198
static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu)
1199
{
1200
struct hrtimer *timer = &cpc->hrtimer;
1201
struct pmu *pmu = cpc->epc.pmu;
1202
u64 interval;
1203
1204
/*
1205
* check default is sane, if not set then force to
1206
* default interval (1/tick)
1207
*/
1208
interval = pmu->hrtimer_interval_ms;
1209
if (interval < 1)
1210
interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
1211
1212
cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
1213
1214
raw_spin_lock_init(&cpc->hrtimer_lock);
1215
hrtimer_setup(timer, perf_mux_hrtimer_handler, CLOCK_MONOTONIC,
1216
HRTIMER_MODE_ABS_PINNED_HARD);
1217
}
1218
1219
static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc)
1220
{
1221
struct hrtimer *timer = &cpc->hrtimer;
1222
unsigned long flags;
1223
1224
raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags);
1225
if (!cpc->hrtimer_active) {
1226
cpc->hrtimer_active = 1;
1227
hrtimer_forward_now(timer, cpc->hrtimer_interval);
1228
hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
1229
}
1230
raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags);
1231
1232
return 0;
1233
}
1234
1235
static int perf_mux_hrtimer_restart_ipi(void *arg)
1236
{
1237
return perf_mux_hrtimer_restart(arg);
1238
}
1239
1240
static __always_inline struct perf_cpu_pmu_context *this_cpc(struct pmu *pmu)
1241
{
1242
return *this_cpu_ptr(pmu->cpu_pmu_context);
1243
}
1244
1245
void perf_pmu_disable(struct pmu *pmu)
1246
{
1247
int *count = &this_cpc(pmu)->pmu_disable_count;
1248
if (!(*count)++)
1249
pmu->pmu_disable(pmu);
1250
}
1251
1252
void perf_pmu_enable(struct pmu *pmu)
1253
{
1254
int *count = &this_cpc(pmu)->pmu_disable_count;
1255
if (!--(*count))
1256
pmu->pmu_enable(pmu);
1257
}
1258
1259
static void perf_assert_pmu_disabled(struct pmu *pmu)
1260
{
1261
int *count = &this_cpc(pmu)->pmu_disable_count;
1262
WARN_ON_ONCE(*count == 0);
1263
}
1264
1265
static inline void perf_pmu_read(struct perf_event *event)
1266
{
1267
if (event->state == PERF_EVENT_STATE_ACTIVE)
1268
event->pmu->read(event);
1269
}
1270
1271
static void get_ctx(struct perf_event_context *ctx)
1272
{
1273
refcount_inc(&ctx->refcount);
1274
}
1275
1276
static void free_ctx(struct rcu_head *head)
1277
{
1278
struct perf_event_context *ctx;
1279
1280
ctx = container_of(head, struct perf_event_context, rcu_head);
1281
kfree(ctx);
1282
}
1283
1284
static void put_ctx(struct perf_event_context *ctx)
1285
{
1286
if (refcount_dec_and_test(&ctx->refcount)) {
1287
if (ctx->parent_ctx)
1288
put_ctx(ctx->parent_ctx);
1289
if (ctx->task && ctx->task != TASK_TOMBSTONE)
1290
put_task_struct(ctx->task);
1291
call_rcu(&ctx->rcu_head, free_ctx);
1292
} else {
1293
smp_mb__after_atomic(); /* pairs with wait_var_event() */
1294
if (ctx->task == TASK_TOMBSTONE)
1295
wake_up_var(&ctx->refcount);
1296
}
1297
}
1298
1299
/*
1300
* Because of perf_event::ctx migration in sys_perf_event_open::move_group and
1301
* perf_pmu_migrate_context() we need some magic.
1302
*
1303
* Those places that change perf_event::ctx will hold both
1304
* perf_event_ctx::mutex of the 'old' and 'new' ctx value.
1305
*
1306
* Lock ordering is by mutex address. There are two other sites where
1307
* perf_event_context::mutex nests and those are:
1308
*
1309
* - perf_event_exit_task_context() [ child , 0 ]
1310
* perf_event_exit_event()
1311
* put_event() [ parent, 1 ]
1312
*
1313
* - perf_event_init_context() [ parent, 0 ]
1314
* inherit_task_group()
1315
* inherit_group()
1316
* inherit_event()
1317
* perf_event_alloc()
1318
* perf_init_event()
1319
* perf_try_init_event() [ child , 1 ]
1320
*
1321
* While it appears there is an obvious deadlock here -- the parent and child
1322
* nesting levels are inverted between the two. This is in fact safe because
1323
* life-time rules separate them. That is an exiting task cannot fork, and a
1324
* spawning task cannot (yet) exit.
1325
*
1326
* But remember that these are parent<->child context relations, and
1327
* migration does not affect children, therefore these two orderings should not
1328
* interact.
1329
*
1330
* The change in perf_event::ctx does not affect children (as claimed above)
1331
* because the sys_perf_event_open() case will install a new event and break
1332
* the ctx parent<->child relation, and perf_pmu_migrate_context() is only
1333
* concerned with cpuctx and that doesn't have children.
1334
*
1335
* The places that change perf_event::ctx will issue:
1336
*
1337
* perf_remove_from_context();
1338
* synchronize_rcu();
1339
* perf_install_in_context();
1340
*
1341
* to affect the change. The remove_from_context() + synchronize_rcu() should
1342
* quiesce the event, after which we can install it in the new location. This
1343
* means that only external vectors (perf_fops, prctl) can perturb the event
1344
* while in transit. Therefore all such accessors should also acquire
1345
* perf_event_context::mutex to serialize against this.
1346
*
1347
* However; because event->ctx can change while we're waiting to acquire
1348
* ctx->mutex we must be careful and use the below perf_event_ctx_lock()
1349
* function.
1350
*
1351
* Lock order:
1352
* exec_update_lock
1353
* task_struct::perf_event_mutex
1354
* perf_event_context::mutex
1355
* perf_event::child_mutex;
1356
* perf_event_context::lock
1357
* mmap_lock
1358
* perf_event::mmap_mutex
1359
* perf_buffer::aux_mutex
1360
* perf_addr_filters_head::lock
1361
*
1362
* cpu_hotplug_lock
1363
* pmus_lock
1364
* cpuctx->mutex / perf_event_context::mutex
1365
*/
1366
static struct perf_event_context *
1367
perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
1368
{
1369
struct perf_event_context *ctx;
1370
1371
again:
1372
rcu_read_lock();
1373
ctx = READ_ONCE(event->ctx);
1374
if (!refcount_inc_not_zero(&ctx->refcount)) {
1375
rcu_read_unlock();
1376
goto again;
1377
}
1378
rcu_read_unlock();
1379
1380
mutex_lock_nested(&ctx->mutex, nesting);
1381
if (event->ctx != ctx) {
1382
mutex_unlock(&ctx->mutex);
1383
put_ctx(ctx);
1384
goto again;
1385
}
1386
1387
return ctx;
1388
}
1389
1390
static inline struct perf_event_context *
1391
perf_event_ctx_lock(struct perf_event *event)
1392
{
1393
return perf_event_ctx_lock_nested(event, 0);
1394
}
1395
1396
static void perf_event_ctx_unlock(struct perf_event *event,
1397
struct perf_event_context *ctx)
1398
{
1399
mutex_unlock(&ctx->mutex);
1400
put_ctx(ctx);
1401
}
1402
1403
/*
1404
* This must be done under the ctx->lock, such as to serialize against
1405
* context_equiv(), therefore we cannot call put_ctx() since that might end up
1406
* calling scheduler related locks and ctx->lock nests inside those.
1407
*/
1408
static __must_check struct perf_event_context *
1409
unclone_ctx(struct perf_event_context *ctx)
1410
{
1411
struct perf_event_context *parent_ctx = ctx->parent_ctx;
1412
1413
lockdep_assert_held(&ctx->lock);
1414
1415
if (parent_ctx)
1416
ctx->parent_ctx = NULL;
1417
ctx->generation++;
1418
1419
return parent_ctx;
1420
}
1421
1422
static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
1423
enum pid_type type)
1424
{
1425
u32 nr;
1426
/*
1427
* only top level events have the pid namespace they were created in
1428
*/
1429
if (event->parent)
1430
event = event->parent;
1431
1432
nr = __task_pid_nr_ns(p, type, event->ns);
1433
/* avoid -1 if it is idle thread or runs in another ns */
1434
if (!nr && !pid_alive(p))
1435
nr = -1;
1436
return nr;
1437
}
1438
1439
static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1440
{
1441
return perf_event_pid_type(event, p, PIDTYPE_TGID);
1442
}
1443
1444
static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1445
{
1446
return perf_event_pid_type(event, p, PIDTYPE_PID);
1447
}
1448
1449
/*
1450
* If we inherit events we want to return the parent event id
1451
* to userspace.
1452
*/
1453
static u64 primary_event_id(struct perf_event *event)
1454
{
1455
u64 id = event->id;
1456
1457
if (event->parent)
1458
id = event->parent->id;
1459
1460
return id;
1461
}
1462
1463
/*
1464
* Get the perf_event_context for a task and lock it.
1465
*
1466
* This has to cope with the fact that until it is locked,
1467
* the context could get moved to another task.
1468
*/
1469
static struct perf_event_context *
1470
perf_lock_task_context(struct task_struct *task, unsigned long *flags)
1471
{
1472
struct perf_event_context *ctx;
1473
1474
retry:
1475
/*
1476
* One of the few rules of preemptible RCU is that one cannot do
1477
* rcu_read_unlock() while holding a scheduler (or nested) lock when
1478
* part of the read side critical section was irqs-enabled -- see
1479
* rcu_read_unlock_special().
1480
*
1481
* Since ctx->lock nests under rq->lock we must ensure the entire read
1482
* side critical section has interrupts disabled.
1483
*/
1484
local_irq_save(*flags);
1485
rcu_read_lock();
1486
ctx = rcu_dereference(task->perf_event_ctxp);
1487
if (ctx) {
1488
/*
1489
* If this context is a clone of another, it might
1490
* get swapped for another underneath us by
1491
* perf_event_task_sched_out, though the
1492
* rcu_read_lock() protects us from any context
1493
* getting freed. Lock the context and check if it
1494
* got swapped before we could get the lock, and retry
1495
* if so. If we locked the right context, then it
1496
* can't get swapped on us any more.
1497
*/
1498
raw_spin_lock(&ctx->lock);
1499
if (ctx != rcu_dereference(task->perf_event_ctxp)) {
1500
raw_spin_unlock(&ctx->lock);
1501
rcu_read_unlock();
1502
local_irq_restore(*flags);
1503
goto retry;
1504
}
1505
1506
if (ctx->task == TASK_TOMBSTONE ||
1507
!refcount_inc_not_zero(&ctx->refcount)) {
1508
raw_spin_unlock(&ctx->lock);
1509
ctx = NULL;
1510
} else {
1511
WARN_ON_ONCE(ctx->task != task);
1512
}
1513
}
1514
rcu_read_unlock();
1515
if (!ctx)
1516
local_irq_restore(*flags);
1517
return ctx;
1518
}
1519
1520
/*
1521
* Get the context for a task and increment its pin_count so it
1522
* can't get swapped to another task. This also increments its
1523
* reference count so that the context can't get freed.
1524
*/
1525
static struct perf_event_context *
1526
perf_pin_task_context(struct task_struct *task)
1527
{
1528
struct perf_event_context *ctx;
1529
unsigned long flags;
1530
1531
ctx = perf_lock_task_context(task, &flags);
1532
if (ctx) {
1533
++ctx->pin_count;
1534
raw_spin_unlock_irqrestore(&ctx->lock, flags);
1535
}
1536
return ctx;
1537
}
1538
1539
static void perf_unpin_context(struct perf_event_context *ctx)
1540
{
1541
unsigned long flags;
1542
1543
raw_spin_lock_irqsave(&ctx->lock, flags);
1544
--ctx->pin_count;
1545
raw_spin_unlock_irqrestore(&ctx->lock, flags);
1546
}
1547
1548
/*
1549
* Update the record of the current time in a context.
1550
*/
1551
static void __update_context_time(struct perf_event_context *ctx, bool adv)
1552
{
1553
u64 now = perf_clock();
1554
1555
lockdep_assert_held(&ctx->lock);
1556
1557
if (adv)
1558
ctx->time += now - ctx->timestamp;
1559
ctx->timestamp = now;
1560
1561
/*
1562
* The above: time' = time + (now - timestamp), can be re-arranged
1563
* into: time` = now + (time - timestamp), which gives a single value
1564
* offset to compute future time without locks on.
1565
*
1566
* See perf_event_time_now(), which can be used from NMI context where
1567
* it's (obviously) not possible to acquire ctx->lock in order to read
1568
* both the above values in a consistent manner.
1569
*/
1570
WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
1571
}
1572
1573
static void update_context_time(struct perf_event_context *ctx)
1574
{
1575
__update_context_time(ctx, true);
1576
}
1577
1578
static u64 perf_event_time(struct perf_event *event)
1579
{
1580
struct perf_event_context *ctx = event->ctx;
1581
1582
if (unlikely(!ctx))
1583
return 0;
1584
1585
if (is_cgroup_event(event))
1586
return perf_cgroup_event_time(event);
1587
1588
return ctx->time;
1589
}
1590
1591
static u64 perf_event_time_now(struct perf_event *event, u64 now)
1592
{
1593
struct perf_event_context *ctx = event->ctx;
1594
1595
if (unlikely(!ctx))
1596
return 0;
1597
1598
if (is_cgroup_event(event))
1599
return perf_cgroup_event_time_now(event, now);
1600
1601
if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
1602
return ctx->time;
1603
1604
now += READ_ONCE(ctx->timeoffset);
1605
return now;
1606
}
1607
1608
static enum event_type_t get_event_type(struct perf_event *event)
1609
{
1610
struct perf_event_context *ctx = event->ctx;
1611
enum event_type_t event_type;
1612
1613
lockdep_assert_held(&ctx->lock);
1614
1615
/*
1616
* It's 'group type', really, because if our group leader is
1617
* pinned, so are we.
1618
*/
1619
if (event->group_leader != event)
1620
event = event->group_leader;
1621
1622
event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
1623
if (!ctx->task)
1624
event_type |= EVENT_CPU;
1625
1626
return event_type;
1627
}
1628
1629
/*
1630
* Helper function to initialize event group nodes.
1631
*/
1632
static void init_event_group(struct perf_event *event)
1633
{
1634
RB_CLEAR_NODE(&event->group_node);
1635
event->group_index = 0;
1636
}
1637
1638
/*
1639
* Extract pinned or flexible groups from the context
1640
* based on event attrs bits.
1641
*/
1642
static struct perf_event_groups *
1643
get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
1644
{
1645
if (event->attr.pinned)
1646
return &ctx->pinned_groups;
1647
else
1648
return &ctx->flexible_groups;
1649
}
1650
1651
/*
1652
* Helper function to initializes perf_event_group trees.
1653
*/
1654
static void perf_event_groups_init(struct perf_event_groups *groups)
1655
{
1656
groups->tree = RB_ROOT;
1657
groups->index = 0;
1658
}
1659
1660
static inline struct cgroup *event_cgroup(const struct perf_event *event)
1661
{
1662
struct cgroup *cgroup = NULL;
1663
1664
#ifdef CONFIG_CGROUP_PERF
1665
if (event->cgrp)
1666
cgroup = event->cgrp->css.cgroup;
1667
#endif
1668
1669
return cgroup;
1670
}
1671
1672
/*
1673
* Compare function for event groups;
1674
*
1675
* Implements complex key that first sorts by CPU and then by virtual index
1676
* which provides ordering when rotating groups for the same CPU.
1677
*/
1678
static __always_inline int
1679
perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu,
1680
const struct cgroup *left_cgroup, const u64 left_group_index,
1681
const struct perf_event *right)
1682
{
1683
if (left_cpu < right->cpu)
1684
return -1;
1685
if (left_cpu > right->cpu)
1686
return 1;
1687
1688
if (left_pmu) {
1689
if (left_pmu < right->pmu_ctx->pmu)
1690
return -1;
1691
if (left_pmu > right->pmu_ctx->pmu)
1692
return 1;
1693
}
1694
1695
#ifdef CONFIG_CGROUP_PERF
1696
{
1697
const struct cgroup *right_cgroup = event_cgroup(right);
1698
1699
if (left_cgroup != right_cgroup) {
1700
if (!left_cgroup) {
1701
/*
1702
* Left has no cgroup but right does, no
1703
* cgroups come first.
1704
*/
1705
return -1;
1706
}
1707
if (!right_cgroup) {
1708
/*
1709
* Right has no cgroup but left does, no
1710
* cgroups come first.
1711
*/
1712
return 1;
1713
}
1714
/* Two dissimilar cgroups, order by id. */
1715
if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup))
1716
return -1;
1717
1718
return 1;
1719
}
1720
}
1721
#endif
1722
1723
if (left_group_index < right->group_index)
1724
return -1;
1725
if (left_group_index > right->group_index)
1726
return 1;
1727
1728
return 0;
1729
}
1730
1731
#define __node_2_pe(node) \
1732
rb_entry((node), struct perf_event, group_node)
1733
1734
static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
1735
{
1736
struct perf_event *e = __node_2_pe(a);
1737
return perf_event_groups_cmp(e->cpu, e->pmu_ctx->pmu, event_cgroup(e),
1738
e->group_index, __node_2_pe(b)) < 0;
1739
}
1740
1741
struct __group_key {
1742
int cpu;
1743
struct pmu *pmu;
1744
struct cgroup *cgroup;
1745
};
1746
1747
static inline int __group_cmp(const void *key, const struct rb_node *node)
1748
{
1749
const struct __group_key *a = key;
1750
const struct perf_event *b = __node_2_pe(node);
1751
1752
/* partial/subtree match: @cpu, @pmu, @cgroup; ignore: @group_index */
1753
return perf_event_groups_cmp(a->cpu, a->pmu, a->cgroup, b->group_index, b);
1754
}
1755
1756
static inline int
1757
__group_cmp_ignore_cgroup(const void *key, const struct rb_node *node)
1758
{
1759
const struct __group_key *a = key;
1760
const struct perf_event *b = __node_2_pe(node);
1761
1762
/* partial/subtree match: @cpu, @pmu, ignore: @cgroup, @group_index */
1763
return perf_event_groups_cmp(a->cpu, a->pmu, event_cgroup(b),
1764
b->group_index, b);
1765
}
1766
1767
/*
1768
* Insert @event into @groups' tree; using
1769
* {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index}
1770
* as key. This places it last inside the {cpu,pmu,cgroup} subtree.
1771
*/
1772
static void
1773
perf_event_groups_insert(struct perf_event_groups *groups,
1774
struct perf_event *event)
1775
{
1776
event->group_index = ++groups->index;
1777
1778
rb_add(&event->group_node, &groups->tree, __group_less);
1779
}
1780
1781
/*
1782
* Helper function to insert event into the pinned or flexible groups.
1783
*/
1784
static void
1785
add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
1786
{
1787
struct perf_event_groups *groups;
1788
1789
groups = get_event_groups(event, ctx);
1790
perf_event_groups_insert(groups, event);
1791
}
1792
1793
/*
1794
* Delete a group from a tree.
1795
*/
1796
static void
1797
perf_event_groups_delete(struct perf_event_groups *groups,
1798
struct perf_event *event)
1799
{
1800
WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
1801
RB_EMPTY_ROOT(&groups->tree));
1802
1803
rb_erase(&event->group_node, &groups->tree);
1804
init_event_group(event);
1805
}
1806
1807
/*
1808
* Helper function to delete event from its groups.
1809
*/
1810
static void
1811
del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
1812
{
1813
struct perf_event_groups *groups;
1814
1815
groups = get_event_groups(event, ctx);
1816
perf_event_groups_delete(groups, event);
1817
}
1818
1819
/*
1820
* Get the leftmost event in the {cpu,pmu,cgroup} subtree.
1821
*/
1822
static struct perf_event *
1823
perf_event_groups_first(struct perf_event_groups *groups, int cpu,
1824
struct pmu *pmu, struct cgroup *cgrp)
1825
{
1826
struct __group_key key = {
1827
.cpu = cpu,
1828
.pmu = pmu,
1829
.cgroup = cgrp,
1830
};
1831
struct rb_node *node;
1832
1833
node = rb_find_first(&key, &groups->tree, __group_cmp);
1834
if (node)
1835
return __node_2_pe(node);
1836
1837
return NULL;
1838
}
1839
1840
static struct perf_event *
1841
perf_event_groups_next(struct perf_event *event, struct pmu *pmu)
1842
{
1843
struct __group_key key = {
1844
.cpu = event->cpu,
1845
.pmu = pmu,
1846
.cgroup = event_cgroup(event),
1847
};
1848
struct rb_node *next;
1849
1850
next = rb_next_match(&key, &event->group_node, __group_cmp);
1851
if (next)
1852
return __node_2_pe(next);
1853
1854
return NULL;
1855
}
1856
1857
#define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) \
1858
for (event = perf_event_groups_first(groups, cpu, pmu, NULL); \
1859
event; event = perf_event_groups_next(event, pmu))
1860
1861
/*
1862
* Iterate through the whole groups tree.
1863
*/
1864
#define perf_event_groups_for_each(event, groups) \
1865
for (event = rb_entry_safe(rb_first(&((groups)->tree)), \
1866
typeof(*event), group_node); event; \
1867
event = rb_entry_safe(rb_next(&event->group_node), \
1868
typeof(*event), group_node))
1869
1870
/*
1871
* Does the event attribute request inherit with PERF_SAMPLE_READ
1872
*/
1873
static inline bool has_inherit_and_sample_read(struct perf_event_attr *attr)
1874
{
1875
return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ);
1876
}
1877
1878
/*
1879
* Add an event from the lists for its context.
1880
* Must be called with ctx->mutex and ctx->lock held.
1881
*/
1882
static void
1883
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1884
{
1885
lockdep_assert_held(&ctx->lock);
1886
1887
WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1888
event->attach_state |= PERF_ATTACH_CONTEXT;
1889
1890
event->tstamp = perf_event_time(event);
1891
1892
/*
1893
* If we're a stand alone event or group leader, we go to the context
1894
* list, group events are kept attached to the group so that
1895
* perf_group_detach can, at all times, locate all siblings.
1896
*/
1897
if (event->group_leader == event) {
1898
event->group_caps = event->event_caps;
1899
add_event_to_groups(event, ctx);
1900
}
1901
1902
list_add_rcu(&event->event_entry, &ctx->event_list);
1903
ctx->nr_events++;
1904
if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
1905
ctx->nr_user++;
1906
if (event->attr.inherit_stat)
1907
ctx->nr_stat++;
1908
if (has_inherit_and_sample_read(&event->attr))
1909
local_inc(&ctx->nr_no_switch_fast);
1910
1911
if (event->state > PERF_EVENT_STATE_OFF)
1912
perf_cgroup_event_enable(event, ctx);
1913
1914
ctx->generation++;
1915
event->pmu_ctx->nr_events++;
1916
}
1917
1918
/*
1919
* Initialize event state based on the perf_event_attr::disabled.
1920
*/
1921
static inline void perf_event__state_init(struct perf_event *event)
1922
{
1923
event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1924
PERF_EVENT_STATE_INACTIVE;
1925
}
1926
1927
static int __perf_event_read_size(u64 read_format, int nr_siblings)
1928
{
1929
int entry = sizeof(u64); /* value */
1930
int size = 0;
1931
int nr = 1;
1932
1933
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1934
size += sizeof(u64);
1935
1936
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1937
size += sizeof(u64);
1938
1939
if (read_format & PERF_FORMAT_ID)
1940
entry += sizeof(u64);
1941
1942
if (read_format & PERF_FORMAT_LOST)
1943
entry += sizeof(u64);
1944
1945
if (read_format & PERF_FORMAT_GROUP) {
1946
nr += nr_siblings;
1947
size += sizeof(u64);
1948
}
1949
1950
/*
1951
* Since perf_event_validate_size() limits this to 16k and inhibits
1952
* adding more siblings, this will never overflow.
1953
*/
1954
return size + nr * entry;
1955
}
1956
1957
static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1958
{
1959
struct perf_sample_data *data;
1960
u16 size = 0;
1961
1962
if (sample_type & PERF_SAMPLE_IP)
1963
size += sizeof(data->ip);
1964
1965
if (sample_type & PERF_SAMPLE_ADDR)
1966
size += sizeof(data->addr);
1967
1968
if (sample_type & PERF_SAMPLE_PERIOD)
1969
size += sizeof(data->period);
1970
1971
if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
1972
size += sizeof(data->weight.full);
1973
1974
if (sample_type & PERF_SAMPLE_READ)
1975
size += event->read_size;
1976
1977
if (sample_type & PERF_SAMPLE_DATA_SRC)
1978
size += sizeof(data->data_src.val);
1979
1980
if (sample_type & PERF_SAMPLE_TRANSACTION)
1981
size += sizeof(data->txn);
1982
1983
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
1984
size += sizeof(data->phys_addr);
1985
1986
if (sample_type & PERF_SAMPLE_CGROUP)
1987
size += sizeof(data->cgroup);
1988
1989
if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
1990
size += sizeof(data->data_page_size);
1991
1992
if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
1993
size += sizeof(data->code_page_size);
1994
1995
event->header_size = size;
1996
}
1997
1998
/*
1999
* Called at perf_event creation and when events are attached/detached from a
2000
* group.
2001
*/
2002
static void perf_event__header_size(struct perf_event *event)
2003
{
2004
event->read_size =
2005
__perf_event_read_size(event->attr.read_format,
2006
event->group_leader->nr_siblings);
2007
__perf_event_header_size(event, event->attr.sample_type);
2008
}
2009
2010
static void perf_event__id_header_size(struct perf_event *event)
2011
{
2012
struct perf_sample_data *data;
2013
u64 sample_type = event->attr.sample_type;
2014
u16 size = 0;
2015
2016
if (sample_type & PERF_SAMPLE_TID)
2017
size += sizeof(data->tid_entry);
2018
2019
if (sample_type & PERF_SAMPLE_TIME)
2020
size += sizeof(data->time);
2021
2022
if (sample_type & PERF_SAMPLE_IDENTIFIER)
2023
size += sizeof(data->id);
2024
2025
if (sample_type & PERF_SAMPLE_ID)
2026
size += sizeof(data->id);
2027
2028
if (sample_type & PERF_SAMPLE_STREAM_ID)
2029
size += sizeof(data->stream_id);
2030
2031
if (sample_type & PERF_SAMPLE_CPU)
2032
size += sizeof(data->cpu_entry);
2033
2034
event->id_header_size = size;
2035
}
2036
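/*
 * For reference, id_header_size corresponds to the struct sample_id trailer
 * described in perf_event_open(2), appended to records when sample_id_all is
 * set; a sketch of the fields counted above:
 *
 *	struct sample_id {
 *		{ u32 pid, tid;  }	// PERF_SAMPLE_TID
 *		{ u64 time;      }	// PERF_SAMPLE_TIME
 *		{ u64 id;        }	// PERF_SAMPLE_ID
 *		{ u64 stream_id; }	// PERF_SAMPLE_STREAM_ID
 *		{ u32 cpu, res;  }	// PERF_SAMPLE_CPU
 *		{ u64 id;        }	// PERF_SAMPLE_IDENTIFIER
 *	};
 */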
2037
/*
2038
* Check that adding an event to the group does not result in anybody
2039
* overflowing the 64k event limit imposed by the output buffer.
2040
*
2041
* Specifically, check that the read_size for the event does not exceed 16k,
2042
* read_size being the one term that grows with groups size. Since read_size
2043
* depends on per-event read_format, also (re)check the existing events.
2044
*
2045
* This leaves 48k for the constant size fields and things like callchains,
2046
* branch stacks and register sets.
2047
*/
2048
static bool perf_event_validate_size(struct perf_event *event)
2049
{
2050
struct perf_event *sibling, *group_leader = event->group_leader;
2051
2052
if (__perf_event_read_size(event->attr.read_format,
2053
group_leader->nr_siblings + 1) > 16*1024)
2054
return false;
2055
2056
if (__perf_event_read_size(group_leader->attr.read_format,
2057
group_leader->nr_siblings + 1) > 16*1024)
2058
return false;
2059
2060
/*
2061
* When creating a new group leader, group_leader->ctx is initialized
2062
* after the size has been validated, but we cannot safely use
2063
* for_each_sibling_event() until group_leader->ctx is set. A new group
2064
* leader cannot have any siblings yet, so we can safely skip checking
2065
* the non-existent siblings.
2066
*/
2067
if (event == group_leader)
2068
return true;
2069
2070
for_each_sibling_event(sibling, group_leader) {
2071
if (__perf_event_read_size(sibling->attr.read_format,
2072
group_leader->nr_siblings + 1) > 16*1024)
2073
return false;
2074
}
2075
2076
return true;
2077
}
2078
2079
static void perf_group_attach(struct perf_event *event)
2080
{
2081
struct perf_event *group_leader = event->group_leader, *pos;
2082
2083
lockdep_assert_held(&event->ctx->lock);
2084
2085
/*
2086
* We can have double attach due to group movement (move_group) in
2087
* perf_event_open().
2088
*/
2089
if (event->attach_state & PERF_ATTACH_GROUP)
2090
return;
2091
2092
event->attach_state |= PERF_ATTACH_GROUP;
2093
2094
if (group_leader == event)
2095
return;
2096
2097
WARN_ON_ONCE(group_leader->ctx != event->ctx);
2098
2099
group_leader->group_caps &= event->event_caps;
2100
2101
list_add_tail(&event->sibling_list, &group_leader->sibling_list);
2102
group_leader->nr_siblings++;
2103
group_leader->group_generation++;
2104
2105
perf_event__header_size(group_leader);
2106
2107
for_each_sibling_event(pos, group_leader)
2108
perf_event__header_size(pos);
2109
}
2110
2111
/*
2112
* Remove an event from the lists for its context.
2113
* Must be called with ctx->mutex and ctx->lock held.
2114
*/
2115
static void
2116
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
2117
{
2118
WARN_ON_ONCE(event->ctx != ctx);
2119
lockdep_assert_held(&ctx->lock);
2120
2121
/*
2122
* We can have double detach due to exit/hot-unplug + close.
2123
*/
2124
if (!(event->attach_state & PERF_ATTACH_CONTEXT))
2125
return;
2126
2127
event->attach_state &= ~PERF_ATTACH_CONTEXT;
2128
2129
ctx->nr_events--;
2130
if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
2131
ctx->nr_user--;
2132
if (event->attr.inherit_stat)
2133
ctx->nr_stat--;
2134
if (has_inherit_and_sample_read(&event->attr))
2135
local_dec(&ctx->nr_no_switch_fast);
2136
2137
list_del_rcu(&event->event_entry);
2138
2139
if (event->group_leader == event)
2140
del_event_from_groups(event, ctx);
2141
2142
ctx->generation++;
2143
event->pmu_ctx->nr_events--;
2144
}
2145
2146
static int
2147
perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
2148
{
2149
if (!has_aux(aux_event))
2150
return 0;
2151
2152
if (!event->pmu->aux_output_match)
2153
return 0;
2154
2155
return event->pmu->aux_output_match(aux_event);
2156
}
2157
2158
static void put_event(struct perf_event *event);
2159
static void __event_disable(struct perf_event *event,
2160
struct perf_event_context *ctx,
2161
enum perf_event_state state);
2162
2163
static void perf_put_aux_event(struct perf_event *event)
2164
{
2165
struct perf_event_context *ctx = event->ctx;
2166
struct perf_event *iter;
2167
2168
/*
2169
* If event uses aux_event tear down the link
2170
*/
2171
if (event->aux_event) {
2172
iter = event->aux_event;
2173
event->aux_event = NULL;
2174
put_event(iter);
2175
return;
2176
}
2177
2178
/*
2179
* If the event is an aux_event, tear down all links to
2180
* it from other events.
2181
*/
2182
for_each_sibling_event(iter, event) {
2183
if (iter->aux_event != event)
2184
continue;
2185
2186
iter->aux_event = NULL;
2187
put_event(event);
2188
2189
/*
2190
* If it's ACTIVE, schedule it out and put it into ERROR
2191
* state so that we don't try to schedule it again. Note
2192
* that perf_event_enable() will clear the ERROR status.
2193
*/
2194
__event_disable(iter, ctx, PERF_EVENT_STATE_ERROR);
2195
}
2196
}
2197
2198
static bool perf_need_aux_event(struct perf_event *event)
2199
{
2200
return event->attr.aux_output || has_aux_action(event);
2201
}
2202
2203
static int perf_get_aux_event(struct perf_event *event,
2204
struct perf_event *group_leader)
2205
{
2206
/*
2207
* Our group leader must be an aux event if we want to be
2208
* an aux_output. This way, the aux event will precede its
2209
* aux_output events in the group, and therefore will always
2210
* schedule first.
2211
*/
2212
if (!group_leader)
2213
return 0;
2214
2215
/*
2216
* aux_output and aux_sample_size are mutually exclusive.
2217
*/
2218
if (event->attr.aux_output && event->attr.aux_sample_size)
2219
return 0;
2220
2221
if (event->attr.aux_output &&
2222
!perf_aux_output_match(event, group_leader))
2223
return 0;
2224
2225
if ((event->attr.aux_pause || event->attr.aux_resume) &&
2226
!(group_leader->pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE))
2227
return 0;
2228
2229
if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
2230
return 0;
2231
2232
if (!atomic_long_inc_not_zero(&group_leader->refcount))
2233
return 0;
2234
2235
/*
2236
* Link aux_outputs to their aux event; this is undone in
2237
* perf_group_detach() by perf_put_aux_event(). When the
2238
* group is torn down, the aux_output events lose their
2239
* link to the aux_event and can't schedule any more.
2240
*/
2241
event->aux_event = group_leader;
2242
2243
return 1;
2244
}
2245
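/*
 * Illustrative sketch (not kernel code, assumptions noted inline): how
 * userspace typically requests the aux_output link set up above - an
 * AUX-producing group leader (for example an intel_pt event, whose PMU type
 * is read from sysfs; shown here as a placeholder) with an aux_output
 * sibling:
 *
 *	struct perf_event_attr pt = {
 *		.type = <type from /sys/bus/event_source/devices/intel_pt/type>,
 *		.size = sizeof(pt),
 *	};
 *	struct perf_event_attr out = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.config		= PERF_COUNT_HW_INSTRUCTIONS,
 *		.size		= sizeof(out),
 *		.sample_period	= 100000,
 *		.aux_output	= 1,
 *	};
 *	int leader = syscall(__NR_perf_event_open, &pt, 0, -1, -1, 0);
 *	int output = syscall(__NR_perf_event_open, &out, 0, -1, leader, 0);
 */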
2246
static inline struct list_head *get_event_list(struct perf_event *event)
2247
{
2248
return event->attr.pinned ? &event->pmu_ctx->pinned_active :
2249
&event->pmu_ctx->flexible_active;
2250
}
2251
2252
static void perf_group_detach(struct perf_event *event)
2253
{
2254
struct perf_event *leader = event->group_leader;
2255
struct perf_event *sibling, *tmp;
2256
struct perf_event_context *ctx = event->ctx;
2257
2258
lockdep_assert_held(&ctx->lock);
2259
2260
/*
2261
* We can have double detach due to exit/hot-unplug + close.
2262
*/
2263
if (!(event->attach_state & PERF_ATTACH_GROUP))
2264
return;
2265
2266
event->attach_state &= ~PERF_ATTACH_GROUP;
2267
2268
perf_put_aux_event(event);
2269
2270
/*
2271
* If this is a sibling, remove it from its group.
2272
*/
2273
if (leader != event) {
2274
list_del_init(&event->sibling_list);
2275
event->group_leader->nr_siblings--;
2276
event->group_leader->group_generation++;
2277
goto out;
2278
}
2279
2280
/*
2281
* If this was a group event with sibling events then
2282
* upgrade the siblings to singleton events by adding them
2283
* to whatever list we are on.
2284
*/
2285
list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
2286
2287
/*
2288
* Events that have PERF_EV_CAP_SIBLING require being part of
2289
* a group and cannot exist on their own; schedule them out
2290
* and move them into the ERROR state. Also see
2291
* _perf_event_enable(), it will not be able to recover this
2292
* ERROR state.
2293
*/
2294
if (sibling->event_caps & PERF_EV_CAP_SIBLING)
2295
__event_disable(sibling, ctx, PERF_EVENT_STATE_ERROR);
2296
2297
sibling->group_leader = sibling;
2298
list_del_init(&sibling->sibling_list);
2299
2300
/* Inherit group flags from the previous leader */
2301
sibling->group_caps = event->group_caps;
2302
2303
if (sibling->attach_state & PERF_ATTACH_CONTEXT) {
2304
add_event_to_groups(sibling, event->ctx);
2305
2306
if (sibling->state == PERF_EVENT_STATE_ACTIVE)
2307
list_add_tail(&sibling->active_list, get_event_list(sibling));
2308
}
2309
2310
WARN_ON_ONCE(sibling->ctx != event->ctx);
2311
}
2312
2313
out:
2314
for_each_sibling_event(tmp, leader)
2315
perf_event__header_size(tmp);
2316
2317
perf_event__header_size(leader);
2318
}
2319
2320
static void perf_child_detach(struct perf_event *event)
2321
{
2322
struct perf_event *parent_event = event->parent;
2323
2324
if (!(event->attach_state & PERF_ATTACH_CHILD))
2325
return;
2326
2327
event->attach_state &= ~PERF_ATTACH_CHILD;
2328
2329
if (WARN_ON_ONCE(!parent_event))
2330
return;
2331
2332
/*
2333
* Can't check this from an IPI, the holder is likely another CPU.
2334
*
2335
lockdep_assert_held(&parent_event->child_mutex);
2336
*/
2337
2338
list_del_init(&event->child_list);
2339
}
2340
2341
static bool is_orphaned_event(struct perf_event *event)
2342
{
2343
return event->state == PERF_EVENT_STATE_DEAD;
2344
}
2345
2346
static inline int
2347
event_filter_match(struct perf_event *event)
2348
{
2349
return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
2350
perf_cgroup_match(event);
2351
}
2352
2353
static inline bool is_event_in_freq_mode(struct perf_event *event)
2354
{
2355
return event->attr.freq && event->attr.sample_freq;
2356
}
2357
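/*
 * A minimal userspace sketch of "frequency mode" as tested above (values are
 * examples only): instead of a fixed period, the kernel adjusts the period to
 * approximate attr.sample_freq samples per second, subject to
 * /proc/sys/kernel/perf_event_max_sample_rate:
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.config		= PERF_COUNT_HW_CPU_CYCLES,
 *		.size		= sizeof(attr),
 *		.freq		= 1,
 *		.sample_freq	= 4000,		// ~4000 samples/sec
 *		.sample_type	= PERF_SAMPLE_IP,
 *	};
 */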
2358
static void
2359
event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
2360
{
2361
struct perf_event_pmu_context *epc = event->pmu_ctx;
2362
struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu);
2363
enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
2364
2365
// XXX cpc serialization, probably per-cpu IRQ disabled
2366
2367
WARN_ON_ONCE(event->ctx != ctx);
2368
lockdep_assert_held(&ctx->lock);
2369
2370
if (event->state != PERF_EVENT_STATE_ACTIVE)
2371
return;
2372
2373
/*
2374
* Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
2375
* we can schedule events _OUT_ individually through things like
2376
* __perf_remove_from_context().
2377
*/
2378
list_del_init(&event->active_list);
2379
2380
perf_pmu_disable(event->pmu);
2381
2382
event->pmu->del(event, 0);
2383
event->oncpu = -1;
2384
2385
if (event->pending_disable) {
2386
event->pending_disable = 0;
2387
perf_cgroup_event_disable(event, ctx);
2388
state = PERF_EVENT_STATE_OFF;
2389
}
2390
2391
perf_event_set_state(event, state);
2392
2393
if (!is_software_event(event))
2394
cpc->active_oncpu--;
2395
if (is_event_in_freq_mode(event)) {
2396
ctx->nr_freq--;
2397
epc->nr_freq--;
2398
}
2399
if (event->attr.exclusive || !cpc->active_oncpu)
2400
cpc->exclusive = 0;
2401
2402
perf_pmu_enable(event->pmu);
2403
}
2404
2405
static void
2406
group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
2407
{
2408
struct perf_event *event;
2409
2410
if (group_event->state != PERF_EVENT_STATE_ACTIVE)
2411
return;
2412
2413
perf_assert_pmu_disabled(group_event->pmu_ctx->pmu);
2414
2415
event_sched_out(group_event, ctx);
2416
2417
/*
2418
* Schedule out siblings (if any):
2419
*/
2420
for_each_sibling_event(event, group_event)
2421
event_sched_out(event, ctx);
2422
}
2423
2424
static inline void
2425
__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final)
2426
{
2427
if (ctx->is_active & EVENT_TIME) {
2428
if (ctx->is_active & EVENT_FROZEN)
2429
return;
2430
update_context_time(ctx);
2431
update_cgrp_time_from_cpuctx(cpuctx, final);
2432
}
2433
}
2434
2435
static inline void
2436
ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
2437
{
2438
__ctx_time_update(cpuctx, ctx, false);
2439
}
2440
2441
/*
2442
* To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock().
2443
*/
2444
static inline void
2445
ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
2446
{
2447
ctx_time_update(cpuctx, ctx);
2448
if (ctx->is_active & EVENT_TIME)
2449
ctx->is_active |= EVENT_FROZEN;
2450
}
2451
2452
static inline void
2453
ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
2454
{
2455
if (ctx->is_active & EVENT_TIME) {
2456
if (ctx->is_active & EVENT_FROZEN)
2457
return;
2458
update_context_time(ctx);
2459
update_cgrp_time_from_event(event);
2460
}
2461
}
2462
2463
#define DETACH_GROUP 0x01UL
2464
#define DETACH_CHILD 0x02UL
2465
#define DETACH_EXIT 0x04UL
2466
#define DETACH_REVOKE 0x08UL
2467
#define DETACH_DEAD 0x10UL
2468
2469
/*
2470
* Cross CPU call to remove a performance event
2471
*
2472
* We disable the event on the hardware level first. After that we
2473
* remove it from the context list.
2474
*/
2475
static void
2476
__perf_remove_from_context(struct perf_event *event,
2477
struct perf_cpu_context *cpuctx,
2478
struct perf_event_context *ctx,
2479
void *info)
2480
{
2481
struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
2482
enum perf_event_state state = PERF_EVENT_STATE_OFF;
2483
unsigned long flags = (unsigned long)info;
2484
2485
ctx_time_update(cpuctx, ctx);
2486
2487
/*
2488
* Ensure event_sched_out() switches to OFF, at the very least
2489
* this avoids raising perf_pending_task() at this time.
2490
*/
2491
if (flags & DETACH_EXIT)
2492
state = PERF_EVENT_STATE_EXIT;
2493
if (flags & DETACH_REVOKE)
2494
state = PERF_EVENT_STATE_REVOKED;
2495
if (flags & DETACH_DEAD)
2496
state = PERF_EVENT_STATE_DEAD;
2497
2498
event_sched_out(event, ctx);
2499
2500
if (event->state > PERF_EVENT_STATE_OFF)
2501
perf_cgroup_event_disable(event, ctx);
2502
2503
perf_event_set_state(event, min(event->state, state));
2504
2505
if (flags & DETACH_GROUP)
2506
perf_group_detach(event);
2507
if (flags & DETACH_CHILD)
2508
perf_child_detach(event);
2509
list_del_event(event, ctx);
2510
2511
if (!pmu_ctx->nr_events) {
2512
pmu_ctx->rotate_necessary = 0;
2513
2514
if (ctx->task && ctx->is_active) {
2515
struct perf_cpu_pmu_context *cpc = this_cpc(pmu_ctx->pmu);
2516
2517
WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
2518
cpc->task_epc = NULL;
2519
}
2520
}
2521
2522
if (!ctx->nr_events && ctx->is_active) {
2523
if (ctx == &cpuctx->ctx)
2524
update_cgrp_time_from_cpuctx(cpuctx, true);
2525
2526
ctx->is_active = 0;
2527
if (ctx->task) {
2528
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2529
cpuctx->task_ctx = NULL;
2530
}
2531
}
2532
}
2533
2534
/*
2535
* Remove the event from a task's (or a CPU's) list of events.
2536
*
2537
* If event->ctx is a cloned context, callers must make sure that
2538
* every task struct that event->ctx->task could possibly point to
2539
* remains valid. This is OK when called from perf_release since
2540
* that only calls us on the top-level context, which can't be a clone.
2541
* When called from perf_event_exit_task, it's OK because the
2542
* context has been detached from its task.
2543
*/
2544
static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
2545
{
2546
struct perf_event_context *ctx = event->ctx;
2547
2548
lockdep_assert_held(&ctx->mutex);
2549
2550
/*
2551
* Because of perf_event_exit_task(), perf_remove_from_context() ought
2552
* to work in the face of TASK_TOMBSTONE, unlike every other
2553
* event_function_call() user.
2554
*/
2555
raw_spin_lock_irq(&ctx->lock);
2556
if (!ctx->is_active) {
2557
__perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context),
2558
ctx, (void *)flags);
2559
raw_spin_unlock_irq(&ctx->lock);
2560
return;
2561
}
2562
raw_spin_unlock_irq(&ctx->lock);
2563
2564
event_function_call(event, __perf_remove_from_context, (void *)flags);
2565
}
2566
2567
static void __event_disable(struct perf_event *event,
2568
struct perf_event_context *ctx,
2569
enum perf_event_state state)
2570
{
2571
event_sched_out(event, ctx);
2572
perf_cgroup_event_disable(event, ctx);
2573
perf_event_set_state(event, state);
2574
}
2575
2576
/*
2577
* Cross CPU call to disable a performance event
2578
*/
2579
static void __perf_event_disable(struct perf_event *event,
2580
struct perf_cpu_context *cpuctx,
2581
struct perf_event_context *ctx,
2582
void *info)
2583
{
2584
if (event->state < PERF_EVENT_STATE_INACTIVE)
2585
return;
2586
2587
perf_pmu_disable(event->pmu_ctx->pmu);
2588
ctx_time_update_event(ctx, event);
2589
2590
/*
2591
* When disabling a group leader, the whole group becomes ineligible
2592
* to run, so schedule out the full group.
2593
*/
2594
if (event == event->group_leader)
2595
group_sched_out(event, ctx);
2596
2597
/*
2598
* But only mark the leader OFF; the siblings will remain
2599
* INACTIVE.
2600
*/
2601
__event_disable(event, ctx, PERF_EVENT_STATE_OFF);
2602
2603
perf_pmu_enable(event->pmu_ctx->pmu);
2604
}
2605
2606
/*
2607
* Disable an event.
2608
*
2609
* If event->ctx is a cloned context, callers must make sure that
2610
* every task struct that event->ctx->task could possibly point to
2611
* remains valid. This condition is satisfied when called through
2612
* perf_event_for_each_child or perf_event_for_each because they
2613
* hold the top-level event's child_mutex, so any descendant that
2614
* goes to exit will block in perf_event_exit_event().
2615
*
2616
* When called from perf_pending_disable it's OK because event->ctx
2617
* is the current context on this CPU and preemption is disabled,
2618
* hence we can't get into perf_event_task_sched_out for this context.
2619
*/
2620
static void _perf_event_disable(struct perf_event *event)
2621
{
2622
struct perf_event_context *ctx = event->ctx;
2623
2624
raw_spin_lock_irq(&ctx->lock);
2625
if (event->state <= PERF_EVENT_STATE_OFF) {
2626
raw_spin_unlock_irq(&ctx->lock);
2627
return;
2628
}
2629
raw_spin_unlock_irq(&ctx->lock);
2630
2631
event_function_call(event, __perf_event_disable, NULL);
2632
}
2633
2634
void perf_event_disable_local(struct perf_event *event)
2635
{
2636
event_function_local(event, __perf_event_disable, NULL);
2637
}
2638
2639
/*
2640
* Strictly speaking kernel users cannot create groups and therefore this
2641
* interface does not need the perf_event_ctx_lock() magic.
2642
*/
2643
void perf_event_disable(struct perf_event *event)
2644
{
2645
struct perf_event_context *ctx;
2646
2647
ctx = perf_event_ctx_lock(event);
2648
_perf_event_disable(event);
2649
perf_event_ctx_unlock(event, ctx);
2650
}
2651
EXPORT_SYMBOL_GPL(perf_event_disable);
2652
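/*
 * Illustrative sketch (not kernel code): the userspace side of the
 * enable/disable paths above, via the perf_event_open(2) ioctls. A disabled
 * event is created, the workload region is bracketed, then the count read;
 * run_workload() is a hypothetical helper:
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.config		= PERF_COUNT_HW_INSTRUCTIONS,
 *		.size		= sizeof(attr),
 *		.disabled	= 1,
 *	};
 *	int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 *
 *	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 *	run_workload();
 *	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
 *
 *	__u64 count;
 *	read(fd, &count, sizeof(count));
 */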
2653
void perf_event_disable_inatomic(struct perf_event *event)
2654
{
2655
event->pending_disable = 1;
2656
irq_work_queue(&event->pending_disable_irq);
2657
}
2658
2659
#define MAX_INTERRUPTS (~0ULL)
2660
2661
static void perf_log_throttle(struct perf_event *event, int enable);
2662
static void perf_log_itrace_start(struct perf_event *event);
2663
2664
static void perf_event_unthrottle(struct perf_event *event, bool start)
2665
{
2666
if (event->state != PERF_EVENT_STATE_ACTIVE)
2667
return;
2668
2669
event->hw.interrupts = 0;
2670
if (start)
2671
event->pmu->start(event, 0);
2672
if (event == event->group_leader)
2673
perf_log_throttle(event, 1);
2674
}
2675
2676
static void perf_event_throttle(struct perf_event *event)
2677
{
2678
if (event->state != PERF_EVENT_STATE_ACTIVE)
2679
return;
2680
2681
event->hw.interrupts = MAX_INTERRUPTS;
2682
event->pmu->stop(event, 0);
2683
if (event == event->group_leader)
2684
perf_log_throttle(event, 0);
2685
}
2686
2687
static void perf_event_unthrottle_group(struct perf_event *event, bool skip_start_event)
2688
{
2689
struct perf_event *sibling, *leader = event->group_leader;
2690
2691
perf_event_unthrottle(leader, skip_start_event ? leader != event : true);
2692
for_each_sibling_event(sibling, leader)
2693
perf_event_unthrottle(sibling, skip_start_event ? sibling != event : true);
2694
}
2695
2696
static void perf_event_throttle_group(struct perf_event *event)
2697
{
2698
struct perf_event *sibling, *leader = event->group_leader;
2699
2700
perf_event_throttle(leader);
2701
for_each_sibling_event(sibling, leader)
2702
perf_event_throttle(sibling);
2703
}
2704
2705
static int
2706
event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
2707
{
2708
struct perf_event_pmu_context *epc = event->pmu_ctx;
2709
struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu);
2710
int ret = 0;
2711
2712
WARN_ON_ONCE(event->ctx != ctx);
2713
2714
lockdep_assert_held(&ctx->lock);
2715
2716
if (event->state <= PERF_EVENT_STATE_OFF)
2717
return 0;
2718
2719
WRITE_ONCE(event->oncpu, smp_processor_id());
2720
/*
2721
* Order event::oncpu write to happen before the ACTIVE state is
2722
* visible. This allows perf_event_{stop,read}() to observe the correct
2723
* ->oncpu if it sees ACTIVE.
2724
*/
2725
smp_wmb();
2726
perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
2727
2728
/*
2729
* Unthrottle events: since we were just scheduled, we might have missed several
2730
* ticks already, and for a heavily scheduling task there is little
2731
* guarantee it'll get a tick in a timely manner.
2732
*/
2733
if (unlikely(event->hw.interrupts == MAX_INTERRUPTS))
2734
perf_event_unthrottle(event, false);
2735
2736
perf_pmu_disable(event->pmu);
2737
2738
perf_log_itrace_start(event);
2739
2740
if (event->pmu->add(event, PERF_EF_START)) {
2741
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2742
event->oncpu = -1;
2743
ret = -EAGAIN;
2744
goto out;
2745
}
2746
2747
if (!is_software_event(event))
2748
cpc->active_oncpu++;
2749
if (is_event_in_freq_mode(event)) {
2750
ctx->nr_freq++;
2751
epc->nr_freq++;
2752
}
2753
if (event->attr.exclusive)
2754
cpc->exclusive = 1;
2755
2756
out:
2757
perf_pmu_enable(event->pmu);
2758
2759
return ret;
2760
}
2761
2762
static int
2763
group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx)
2764
{
2765
struct perf_event *event, *partial_group = NULL;
2766
struct pmu *pmu = group_event->pmu_ctx->pmu;
2767
2768
if (group_event->state == PERF_EVENT_STATE_OFF)
2769
return 0;
2770
2771
pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
2772
2773
if (event_sched_in(group_event, ctx))
2774
goto error;
2775
2776
/*
2777
* Schedule in siblings as one group (if any):
2778
*/
2779
for_each_sibling_event(event, group_event) {
2780
if (event_sched_in(event, ctx)) {
2781
partial_group = event;
2782
goto group_error;
2783
}
2784
}
2785
2786
if (!pmu->commit_txn(pmu))
2787
return 0;
2788
2789
group_error:
2790
/*
2791
* Groups can be scheduled in as one unit only, so undo any
2792
* partial group before returning:
2793
* The events up to the failed event are scheduled out normally.
2794
*/
2795
for_each_sibling_event(event, group_event) {
2796
if (event == partial_group)
2797
break;
2798
2799
event_sched_out(event, ctx);
2800
}
2801
event_sched_out(group_event, ctx);
2802
2803
error:
2804
pmu->cancel_txn(pmu);
2805
return -EAGAIN;
2806
}
2807
2808
/*
2809
* Work out whether we can put this event group on the CPU now.
2810
*/
2811
static int group_can_go_on(struct perf_event *event, int can_add_hw)
2812
{
2813
struct perf_event_pmu_context *epc = event->pmu_ctx;
2814
struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu);
2815
2816
/*
2817
* Groups consisting entirely of software events can always go on.
2818
*/
2819
if (event->group_caps & PERF_EV_CAP_SOFTWARE)
2820
return 1;
2821
/*
2822
* If an exclusive group is already on, no other hardware
2823
* events can go on.
2824
*/
2825
if (cpc->exclusive)
2826
return 0;
2827
/*
2828
* If this group is exclusive and there are already
2829
* events on the CPU, it can't go on.
2830
*/
2831
if (event->attr.exclusive && !list_empty(get_event_list(event)))
2832
return 0;
2833
/*
2834
* Otherwise, try to add it if all previous groups were able
2835
* to go on.
2836
*/
2837
return can_add_hw;
2838
}
2839
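/*
 * For reference, the attr bits feeding the decisions above (pinned list
 * selection in get_event_list(), exclusive scheduling here) are set by
 * userspace; a small sketch, values are examples only:
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.config		= PERF_COUNT_HW_CACHE_MISSES,
 *		.size		= sizeof(attr),
 *		.pinned		= 1,	// always scheduled, or goes to ERROR
 *		.exclusive	= 1,	// group must be alone on the PMU
 *	};
 */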
2840
static void add_event_to_ctx(struct perf_event *event,
2841
struct perf_event_context *ctx)
2842
{
2843
list_add_event(event, ctx);
2844
perf_group_attach(event);
2845
}
2846
2847
static void task_ctx_sched_out(struct perf_event_context *ctx,
2848
struct pmu *pmu,
2849
enum event_type_t event_type)
2850
{
2851
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
2852
2853
if (!cpuctx->task_ctx)
2854
return;
2855
2856
if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2857
return;
2858
2859
ctx_sched_out(ctx, pmu, event_type);
2860
}
2861
2862
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2863
struct perf_event_context *ctx,
2864
struct pmu *pmu)
2865
{
2866
ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED);
2867
if (ctx)
2868
ctx_sched_in(ctx, pmu, EVENT_PINNED);
2869
ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
2870
if (ctx)
2871
ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE);
2872
}
2873
2874
/*
2875
* We want to maintain the following priority of scheduling:
2876
* - CPU pinned (EVENT_CPU | EVENT_PINNED)
2877
* - task pinned (EVENT_PINNED)
2878
* - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
2879
* - task flexible (EVENT_FLEXIBLE).
2880
*
2881
* In order to avoid unscheduling and scheduling back in everything every
2882
* time an event is added, only do it for the groups of equal priority and
2883
* below.
2884
*
2885
* This can be called after a batch operation on task events, in which case
2886
* event_type is a bit mask of the types of events involved. For CPU events,
2887
* event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
2888
*/
2889
static void ctx_resched(struct perf_cpu_context *cpuctx,
2890
struct perf_event_context *task_ctx,
2891
struct pmu *pmu, enum event_type_t event_type)
2892
{
2893
bool cpu_event = !!(event_type & EVENT_CPU);
2894
struct perf_event_pmu_context *epc;
2895
2896
/*
2897
* If pinned groups are involved, flexible groups also need to be
2898
* scheduled out.
2899
*/
2900
if (event_type & EVENT_PINNED)
2901
event_type |= EVENT_FLEXIBLE;
2902
2903
event_type &= EVENT_ALL;
2904
2905
for_each_epc(epc, &cpuctx->ctx, pmu, false)
2906
perf_pmu_disable(epc->pmu);
2907
2908
if (task_ctx) {
2909
for_each_epc(epc, task_ctx, pmu, false)
2910
perf_pmu_disable(epc->pmu);
2911
2912
task_ctx_sched_out(task_ctx, pmu, event_type);
2913
}
2914
2915
/*
2916
* Decide which cpu ctx groups to schedule out based on the types
2917
* of events that caused rescheduling:
2918
* - EVENT_CPU: schedule out corresponding groups;
2919
* - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
2920
* - otherwise, do nothing more.
2921
*/
2922
if (cpu_event)
2923
ctx_sched_out(&cpuctx->ctx, pmu, event_type);
2924
else if (event_type & EVENT_PINNED)
2925
ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
2926
2927
perf_event_sched_in(cpuctx, task_ctx, pmu);
2928
2929
for_each_epc(epc, &cpuctx->ctx, pmu, false)
2930
perf_pmu_enable(epc->pmu);
2931
2932
if (task_ctx) {
2933
for_each_epc(epc, task_ctx, pmu, false)
2934
perf_pmu_enable(epc->pmu);
2935
}
2936
}
2937
2938
void perf_pmu_resched(struct pmu *pmu)
2939
{
2940
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
2941
struct perf_event_context *task_ctx = cpuctx->task_ctx;
2942
2943
perf_ctx_lock(cpuctx, task_ctx);
2944
ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU);
2945
perf_ctx_unlock(cpuctx, task_ctx);
2946
}
2947
2948
/*
2949
* Cross CPU call to install and enable a performance event
2950
*
2951
* Very similar to remote_function() + event_function() but cannot assume that
2952
* things like ctx->is_active and cpuctx->task_ctx are set.
2953
*/
2954
static int __perf_install_in_context(void *info)
2955
{
2956
struct perf_event *event = info;
2957
struct perf_event_context *ctx = event->ctx;
2958
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
2959
struct perf_event_context *task_ctx = cpuctx->task_ctx;
2960
bool reprogram = true;
2961
int ret = 0;
2962
2963
raw_spin_lock(&cpuctx->ctx.lock);
2964
if (ctx->task) {
2965
raw_spin_lock(&ctx->lock);
2966
task_ctx = ctx;
2967
2968
reprogram = (ctx->task == current);
2969
2970
/*
2971
* If the task is running, it must be running on this CPU,
2972
* otherwise we cannot reprogram things.
2973
*
2974
* If it's not running, we don't care; ctx->lock will
2975
* serialize against it becoming runnable.
2976
*/
2977
if (task_curr(ctx->task) && !reprogram) {
2978
ret = -ESRCH;
2979
goto unlock;
2980
}
2981
2982
WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
2983
} else if (task_ctx) {
2984
raw_spin_lock(&task_ctx->lock);
2985
}
2986
2987
#ifdef CONFIG_CGROUP_PERF
2988
if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
2989
/*
2990
* If the current cgroup doesn't match the event's
2991
* cgroup, we should not try to schedule it.
2992
*/
2993
struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
2994
reprogram = cgroup_is_descendant(cgrp->css.cgroup,
2995
event->cgrp->css.cgroup);
2996
}
2997
#endif
2998
2999
if (reprogram) {
3000
ctx_time_freeze(cpuctx, ctx);
3001
add_event_to_ctx(event, ctx);
3002
ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
3003
get_event_type(event));
3004
} else {
3005
add_event_to_ctx(event, ctx);
3006
}
3007
3008
unlock:
3009
perf_ctx_unlock(cpuctx, task_ctx);
3010
3011
return ret;
3012
}
3013
3014
static bool exclusive_event_installable(struct perf_event *event,
3015
struct perf_event_context *ctx);
3016
3017
/*
3018
* Attach a performance event to a context.
3019
*
3020
* Very similar to event_function_call, see comment there.
3021
*/
3022
static void
3023
perf_install_in_context(struct perf_event_context *ctx,
3024
struct perf_event *event,
3025
int cpu)
3026
{
3027
struct task_struct *task = READ_ONCE(ctx->task);
3028
3029
lockdep_assert_held(&ctx->mutex);
3030
3031
WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
3032
3033
if (event->cpu != -1)
3034
WARN_ON_ONCE(event->cpu != cpu);
3035
3036
/*
3037
* Ensures that if we can observe event->ctx, both the event and ctx
3038
* will be 'complete'. See perf_iterate_sb_cpu().
3039
*/
3040
smp_store_release(&event->ctx, ctx);
3041
3042
/*
3043
* perf_event_attr::disabled events will not run and can be initialized
3044
* without IPI. Except when this is the first event for the context, in
3045
* that case we need the magic of the IPI to set ctx->is_active.
3046
*
3047
* The IOC_ENABLE that is sure to follow the creation of a disabled
3048
* event will issue the IPI and reprogram the hardware.
3049
*/
3050
if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF &&
3051
ctx->nr_events && !is_cgroup_event(event)) {
3052
raw_spin_lock_irq(&ctx->lock);
3053
if (ctx->task == TASK_TOMBSTONE) {
3054
raw_spin_unlock_irq(&ctx->lock);
3055
return;
3056
}
3057
add_event_to_ctx(event, ctx);
3058
raw_spin_unlock_irq(&ctx->lock);
3059
return;
3060
}
3061
3062
if (!task) {
3063
cpu_function_call(cpu, __perf_install_in_context, event);
3064
return;
3065
}
3066
3067
/*
3068
* Should not happen, we validate the ctx is still alive before calling.
3069
*/
3070
if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
3071
return;
3072
3073
/*
3074
* Installing events is tricky because we cannot rely on ctx->is_active
3075
* to be set in case this is the nr_events 0 -> 1 transition.
3076
*
3077
* Instead we use task_curr(), which tells us if the task is running.
3078
* However, since we use task_curr() outside of rq::lock, we can race
3079
* against the actual state. This means the result can be wrong.
3080
*
3081
* If we get a false positive, we retry, this is harmless.
3082
*
3083
* If we get a false negative, things are complicated. If we are after
3084
* perf_event_context_sched_in() ctx::lock will serialize us, and the
3085
* value must be correct. If we're before, it doesn't matter since
3086
* perf_event_context_sched_in() will program the counter.
3087
*
3088
* However, this hinges on the remote context switch having observed
3089
* our task->perf_event_ctxp[] store, such that it will in fact take
3090
* ctx::lock in perf_event_context_sched_in().
3091
*
3092
* We do this by task_function_call(), if the IPI fails to hit the task
3093
* we know any future context switch of task must see the
3094
* perf_event_ctxp[] store.
3095
*/
3096
3097
/*
3098
* This smp_mb() orders the task->perf_event_ctxp[] store with the
3099
* task_cpu() load, such that if the IPI then does not find the task
3100
* running, a future context switch of that task must observe the
3101
* store.
3102
*/
3103
smp_mb();
3104
again:
3105
if (!task_function_call(task, __perf_install_in_context, event))
3106
return;
3107
3108
raw_spin_lock_irq(&ctx->lock);
3109
task = ctx->task;
3110
if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
3111
/*
3112
* Cannot happen because we already checked above (which also
3113
* cannot happen), and we hold ctx->mutex, which serializes us
3114
* against perf_event_exit_task_context().
3115
*/
3116
raw_spin_unlock_irq(&ctx->lock);
3117
return;
3118
}
3119
/*
3120
* If the task is not running, ctx->lock will avoid it becoming so,
3121
* thus we can safely install the event.
3122
*/
3123
if (task_curr(task)) {
3124
raw_spin_unlock_irq(&ctx->lock);
3125
goto again;
3126
}
3127
add_event_to_ctx(event, ctx);
3128
raw_spin_unlock_irq(&ctx->lock);
3129
}
3130
3131
/*
3132
* Cross CPU call to enable a performance event
3133
*/
3134
static void __perf_event_enable(struct perf_event *event,
3135
struct perf_cpu_context *cpuctx,
3136
struct perf_event_context *ctx,
3137
void *info)
3138
{
3139
struct perf_event *leader = event->group_leader;
3140
struct perf_event_context *task_ctx;
3141
3142
if (event->state >= PERF_EVENT_STATE_INACTIVE ||
3143
event->state <= PERF_EVENT_STATE_ERROR)
3144
return;
3145
3146
ctx_time_freeze(cpuctx, ctx);
3147
3148
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
3149
perf_cgroup_event_enable(event, ctx);
3150
3151
if (!ctx->is_active)
3152
return;
3153
3154
if (!event_filter_match(event))
3155
return;
3156
3157
/*
3158
* If the event is in a group and isn't the group leader,
3159
* then don't put it on unless the group is on.
3160
*/
3161
if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
3162
return;
3163
3164
task_ctx = cpuctx->task_ctx;
3165
if (ctx->task)
3166
WARN_ON_ONCE(task_ctx != ctx);
3167
3168
ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event));
3169
}
3170
3171
/*
3172
* Enable an event.
3173
*
3174
* If event->ctx is a cloned context, callers must make sure that
3175
* every task struct that event->ctx->task could possibly point to
3176
* remains valid. This condition is satisfied when called through
3177
* perf_event_for_each_child or perf_event_for_each as described
3178
* for perf_event_disable.
3179
*/
3180
static void _perf_event_enable(struct perf_event *event)
3181
{
3182
struct perf_event_context *ctx = event->ctx;
3183
3184
raw_spin_lock_irq(&ctx->lock);
3185
if (event->state >= PERF_EVENT_STATE_INACTIVE ||
3186
event->state < PERF_EVENT_STATE_ERROR) {
3187
out:
3188
raw_spin_unlock_irq(&ctx->lock);
3189
return;
3190
}
3191
3192
/*
3193
* If the event is in error state, clear that first.
3194
*
3195
* That way, if we see the event in error state below, we know that it
3196
* has gone back into error state, as distinct from the task having
3197
* been scheduled away before the cross-call arrived.
3198
*/
3199
if (event->state == PERF_EVENT_STATE_ERROR) {
3200
/*
3201
* Detached SIBLING events cannot leave ERROR state.
3202
*/
3203
if (event->event_caps & PERF_EV_CAP_SIBLING &&
3204
event->group_leader == event)
3205
goto out;
3206
3207
event->state = PERF_EVENT_STATE_OFF;
3208
}
3209
raw_spin_unlock_irq(&ctx->lock);
3210
3211
event_function_call(event, __perf_event_enable, NULL);
3212
}
3213
3214
/*
3215
* See perf_event_disable();
3216
*/
3217
void perf_event_enable(struct perf_event *event)
3218
{
3219
struct perf_event_context *ctx;
3220
3221
ctx = perf_event_ctx_lock(event);
3222
_perf_event_enable(event);
3223
perf_event_ctx_unlock(event, ctx);
3224
}
3225
EXPORT_SYMBOL_GPL(perf_event_enable);
3226
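/*
 * In-kernel users reach _perf_event_enable()/_perf_event_disable() through
 * the exported wrappers. A hedged sketch of that usage, assuming the
 * perf_event_create_kernel_counter() API declared in <linux/perf_event.h>
 * (cpu 0 is a placeholder):
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.config		= PERF_COUNT_HW_CPU_CYCLES,
 *		.size		= sizeof(attr),
 *		.disabled	= 1,
 *	};
 *	struct perf_event *ev;
 *
 *	ev = perf_event_create_kernel_counter(&attr, 0, NULL, NULL, NULL);
 *	if (!IS_ERR(ev)) {
 *		perf_event_enable(ev);
 *		// ... measure ...
 *		perf_event_disable(ev);
 *		perf_event_release_kernel(ev);
 *	}
 */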
3227
struct stop_event_data {
3228
struct perf_event *event;
3229
unsigned int restart;
3230
};
3231
3232
static int __perf_event_stop(void *info)
3233
{
3234
struct stop_event_data *sd = info;
3235
struct perf_event *event = sd->event;
3236
3237
/* if it's already INACTIVE, do nothing */
3238
if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
3239
return 0;
3240
3241
/* matches smp_wmb() in event_sched_in() */
3242
smp_rmb();
3243
3244
/*
3245
* There is a window with interrupts enabled before we get here,
3246
* so we need to check again lest we try to stop another CPU's event.
3247
*/
3248
if (READ_ONCE(event->oncpu) != smp_processor_id())
3249
return -EAGAIN;
3250
3251
event->pmu->stop(event, PERF_EF_UPDATE);
3252
3253
/*
3254
* May race with the actual stop (through perf_pmu_output_stop()),
3255
* but it is only used for events with AUX ring buffer, and such
3256
* events will refuse to restart because of rb::aux_mmap_count==0,
3257
* see comments in perf_aux_output_begin().
3258
*
3259
* Since this is happening on an event-local CPU, no trace is lost
3260
* while restarting.
3261
*/
3262
if (sd->restart)
3263
event->pmu->start(event, 0);
3264
3265
return 0;
3266
}
3267
3268
static int perf_event_stop(struct perf_event *event, int restart)
3269
{
3270
struct stop_event_data sd = {
3271
.event = event,
3272
.restart = restart,
3273
};
3274
int ret = 0;
3275
3276
do {
3277
if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
3278
return 0;
3279
3280
/* matches smp_wmb() in event_sched_in() */
3281
smp_rmb();
3282
3283
/*
3284
* We only want to restart ACTIVE events, so if the event goes
3285
* inactive here (event->oncpu==-1), there's nothing more to do;
3286
* fall through with ret==-ENXIO.
3287
*/
3288
ret = cpu_function_call(READ_ONCE(event->oncpu),
3289
__perf_event_stop, &sd);
3290
} while (ret == -EAGAIN);
3291
3292
return ret;
3293
}
3294
3295
/*
3296
* In order to contain the amount of raciness and trickiness in the address filter
3297
* configuration management, it is a two-part process:
3298
*
3299
* (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
3300
* we update the addresses of corresponding vmas in
3301
* event::addr_filter_ranges array and bump the event::addr_filters_gen;
3302
* (p2) when an event is scheduled in (pmu::add), it calls
3303
* perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
3304
* if the generation has changed since the previous call.
3305
*
3306
* If (p1) happens while the event is active, we restart it to force (p2).
3307
*
3308
* (1) perf_addr_filters_apply(): adjusting filters' offsets based on
3309
* pre-existing mappings, called once when new filters arrive via SET_FILTER
3310
* ioctl;
3311
* (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
3312
* registered mapping, called for every new mmap(), with mm::mmap_lock down
3313
* for reading;
3314
* (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
3315
* of exec.
3316
*/
3317
void perf_event_addr_filters_sync(struct perf_event *event)
3318
{
3319
struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
3320
3321
if (!has_addr_filter(event))
3322
return;
3323
3324
raw_spin_lock(&ifh->lock);
3325
if (event->addr_filters_gen != event->hw.addr_filters_gen) {
3326
event->pmu->addr_filters_sync(event);
3327
event->hw.addr_filters_gen = event->addr_filters_gen;
3328
}
3329
raw_spin_unlock(&ifh->lock);
3330
}
3331
EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
3332
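/*
 * Illustrative sketch (not kernel code): address filters enter via the
 * SET_FILTER ioctl, case (1) in the comment above. The filter string format
 * is roughly "filter|start|stop <start>[/<size>][@</object file>]"; here 'fd'
 * is assumed to be an event fd on an AUX-capable PMU such as intel_pt, and
 * the addresses and path are placeholders:
 *
 *	ioctl(fd, PERF_EVENT_IOC_SET_FILTER,
 *	      "filter 0x1000/0x2000@/path/to/binary");
 */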
3333
static int _perf_event_refresh(struct perf_event *event, int refresh)
3334
{
3335
/*
3336
* not supported on inherited events
3337
*/
3338
if (event->attr.inherit || !is_sampling_event(event))
3339
return -EINVAL;
3340
3341
atomic_add(refresh, &event->event_limit);
3342
_perf_event_enable(event);
3343
3344
return 0;
3345
}
3346
3347
/*
3348
* See perf_event_disable()
3349
*/
3350
int perf_event_refresh(struct perf_event *event, int refresh)
3351
{
3352
struct perf_event_context *ctx;
3353
int ret;
3354
3355
ctx = perf_event_ctx_lock(event);
3356
ret = _perf_event_refresh(event, refresh);
3357
perf_event_ctx_unlock(event, ctx);
3358
3359
return ret;
3360
}
3361
EXPORT_SYMBOL_GPL(perf_event_refresh);
3362
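/*
 * A hedged userspace sketch of the refresh mechanism wrapped above:
 * PERF_EVENT_IOC_REFRESH adds to event_limit and enables the event, so it
 * self-disables after that many overflows (commonly paired with
 * signal-driven I/O on the event fd; signal setup omitted here, values are
 * examples only):
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.config		= PERF_COUNT_HW_CPU_CYCLES,
 *		.size		= sizeof(attr),
 *		.sample_period	= 1000000,
 *		.disabled	= 1,
 *		// .inherit must stay 0, see the -EINVAL check above
 *	};
 *	int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 *	ioctl(fd, PERF_EVENT_IOC_REFRESH, 1);	// enable for one overflow
 */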
3363
static int perf_event_modify_breakpoint(struct perf_event *bp,
3364
struct perf_event_attr *attr)
3365
{
3366
int err;
3367
3368
_perf_event_disable(bp);
3369
3370
err = modify_user_hw_breakpoint_check(bp, attr, true);
3371
3372
if (!bp->attr.disabled)
3373
_perf_event_enable(bp);
3374
3375
return err;
3376
}
3377
3378
/*
3379
* Copy event-type-independent attributes that may be modified.
3380
*/
3381
static void perf_event_modify_copy_attr(struct perf_event_attr *to,
3382
const struct perf_event_attr *from)
3383
{
3384
to->sig_data = from->sig_data;
3385
}
3386
3387
static int perf_event_modify_attr(struct perf_event *event,
3388
struct perf_event_attr *attr)
3389
{
3390
int (*func)(struct perf_event *, struct perf_event_attr *);
3391
struct perf_event *child;
3392
int err;
3393
3394
if (event->attr.type != attr->type)
3395
return -EINVAL;
3396
3397
switch (event->attr.type) {
3398
case PERF_TYPE_BREAKPOINT:
3399
func = perf_event_modify_breakpoint;
3400
break;
3401
default:
3402
/* Place holder for future additions. */
3403
return -EOPNOTSUPP;
3404
}
3405
3406
WARN_ON_ONCE(event->ctx->parent_ctx);
3407
3408
mutex_lock(&event->child_mutex);
3409
/*
3410
* Event-type-independent attributes must be copied before event-type
3411
* modification, which will validate that final attributes match the
3412
* source attributes after all relevant attributes have been copied.
3413
*/
3414
perf_event_modify_copy_attr(&event->attr, attr);
3415
err = func(event, attr);
3416
if (err)
3417
goto out;
3418
list_for_each_entry(child, &event->child_list, child_list) {
3419
perf_event_modify_copy_attr(&child->attr, attr);
3420
err = func(child, attr);
3421
if (err)
3422
goto out;
3423
}
3424
out:
3425
mutex_unlock(&event->child_mutex);
3426
return err;
3427
}
3428
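/*
 * Illustrative sketch (not kernel code): perf_event_modify_attr() is reached
 * via the PERF_EVENT_IOC_MODIFY_ATTRIBUTES ioctl, and per the switch above
 * only breakpoint events are currently accepted. Moving an existing
 * watchpoint to a new address might look like this; 'bp_fd' and
 * 'watched_variable' are hypothetical placeholders:
 *
 *	struct perf_event_attr new_attr = {
 *		.type		= PERF_TYPE_BREAKPOINT,
 *		.size		= sizeof(new_attr),
 *		.bp_type	= HW_BREAKPOINT_W,
 *		.bp_addr	= (__u64)&watched_variable,
 *		.bp_len		= HW_BREAKPOINT_LEN_8,
 *	};
 *	ioctl(bp_fd, PERF_EVENT_IOC_MODIFY_ATTRIBUTES, &new_attr);
 */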
3429
static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
3430
enum event_type_t event_type)
3431
{
3432
struct perf_event_context *ctx = pmu_ctx->ctx;
3433
struct perf_event *event, *tmp;
3434
struct pmu *pmu = pmu_ctx->pmu;
3435
3436
if (ctx->task && !(ctx->is_active & EVENT_ALL)) {
3437
struct perf_cpu_pmu_context *cpc = this_cpc(pmu);
3438
3439
WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
3440
cpc->task_epc = NULL;
3441
}
3442
3443
if (!(event_type & EVENT_ALL))
3444
return;
3445
3446
perf_pmu_disable(pmu);
3447
if (event_type & EVENT_PINNED) {
3448
list_for_each_entry_safe(event, tmp,
3449
&pmu_ctx->pinned_active,
3450
active_list)
3451
group_sched_out(event, ctx);
3452
}
3453
3454
if (event_type & EVENT_FLEXIBLE) {
3455
list_for_each_entry_safe(event, tmp,
3456
&pmu_ctx->flexible_active,
3457
active_list)
3458
group_sched_out(event, ctx);
3459
/*
3460
* Since we cleared EVENT_FLEXIBLE, also clear
3461
* rotate_necessary, it will be reset by
3462
* ctx_flexible_sched_in() when needed.
3463
*/
3464
pmu_ctx->rotate_necessary = 0;
3465
}
3466
perf_pmu_enable(pmu);
3467
}
3468
3469
/*
3470
* Be very careful with the @pmu argument since this will change ctx state.
3471
* The @pmu argument works for ctx_resched(), because that is symmetric in
3472
* ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant.
3473
*
3474
* However, if you were to be asymmetrical, you could end up with messed up
3475
* state, eg. ctx->is_active cleared even though most EPCs would still actually
3476
* be active.
3477
*/
3478
static void
3479
ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
3480
{
3481
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
3482
struct perf_event_pmu_context *pmu_ctx;
3483
int is_active = ctx->is_active;
3484
bool cgroup = event_type & EVENT_CGROUP;
3485
3486
event_type &= ~EVENT_CGROUP;
3487
3488
lockdep_assert_held(&ctx->lock);
3489
3490
if (likely(!ctx->nr_events)) {
3491
/*
3492
* See __perf_remove_from_context().
3493
*/
3494
WARN_ON_ONCE(ctx->is_active);
3495
if (ctx->task)
3496
WARN_ON_ONCE(cpuctx->task_ctx);
3497
return;
3498
}
3499
3500
/*
3501
* Always update time if it was set; not only when it changes.
3502
* Otherwise we can 'forget' to update time for any but the last
3503
* context we sched out. For example:
3504
*
3505
* ctx_sched_out(.event_type = EVENT_FLEXIBLE)
3506
* ctx_sched_out(.event_type = EVENT_PINNED)
3507
*
3508
* would only update time for the pinned events.
3509
*/
3510
__ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx);
3511
3512
/*
3513
* CPU-release for the below ->is_active store,
3514
* see __load_acquire() in perf_event_time_now()
3515
*/
3516
barrier();
3517
ctx->is_active &= ~event_type;
3518
3519
if (!(ctx->is_active & EVENT_ALL)) {
3520
/*
3521
* For FROZEN, preserve TIME|FROZEN such that perf_event_time_now()
3522
* does not observe a hole. perf_ctx_unlock() will clean up.
3523
*/
3524
if (ctx->is_active & EVENT_FROZEN)
3525
ctx->is_active &= EVENT_TIME_FROZEN;
3526
else
3527
ctx->is_active = 0;
3528
}
3529
3530
if (ctx->task) {
3531
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3532
if (!(ctx->is_active & EVENT_ALL))
3533
cpuctx->task_ctx = NULL;
3534
}
3535
3536
is_active ^= ctx->is_active; /* changed bits */
3537
3538
for_each_epc(pmu_ctx, ctx, pmu, cgroup)
3539
__pmu_ctx_sched_out(pmu_ctx, is_active);
3540
}
3541
3542
/*
3543
* Test whether two contexts are equivalent, i.e. whether they have both been
3544
* cloned from the same version of the same context.
3545
*
3546
* Equivalence is measured using a generation number in the context that is
3547
* incremented on each modification to it; see unclone_ctx(), list_add_event()
3548
* and list_del_event().
3549
*/
3550
static int context_equiv(struct perf_event_context *ctx1,
3551
struct perf_event_context *ctx2)
3552
{
3553
lockdep_assert_held(&ctx1->lock);
3554
lockdep_assert_held(&ctx2->lock);
3555
3556
/* Pinning disables the swap optimization */
3557
if (ctx1->pin_count || ctx2->pin_count)
3558
return 0;
3559
3560
/* If ctx1 is the parent of ctx2 */
3561
if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
3562
return 1;
3563
3564
/* If ctx2 is the parent of ctx1 */
3565
if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
3566
return 1;
3567
3568
/*
3569
* If ctx1 and ctx2 have the same parent; we flatten the parent
3570
* hierarchy, see perf_event_init_context().
3571
*/
3572
if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
3573
ctx1->parent_gen == ctx2->parent_gen)
3574
return 1;
3575
3576
/* Unmatched */
3577
return 0;
3578
}
3579
3580
static void __perf_event_sync_stat(struct perf_event *event,
3581
struct perf_event *next_event)
3582
{
3583
u64 value;
3584
3585
if (!event->attr.inherit_stat)
3586
return;
3587
3588
/*
3589
* Update the event value, we cannot use perf_event_read()
3590
* because we're in the middle of a context switch and have IRQs
3591
* disabled, which upsets smp_call_function_single(), however
3592
* we know the event must be on the current CPU, therefore we
3593
* don't need to use it.
3594
*/
3595
perf_pmu_read(event);
3596
3597
perf_event_update_time(event);
3598
3599
/*
3600
* In order to keep per-task stats reliable we need to flip the event
3601
* values when we flip the contexts.
3602
*/
3603
value = local64_read(&next_event->count);
3604
value = local64_xchg(&event->count, value);
3605
local64_set(&next_event->count, value);
3606
3607
swap(event->total_time_enabled, next_event->total_time_enabled);
3608
swap(event->total_time_running, next_event->total_time_running);
3609
3610
/*
3611
* Since we swizzled the values, update the user visible data too.
3612
*/
3613
perf_event_update_userpage(event);
3614
perf_event_update_userpage(next_event);
3615
}
3616
3617
static void perf_event_sync_stat(struct perf_event_context *ctx,
3618
struct perf_event_context *next_ctx)
3619
{
3620
struct perf_event *event, *next_event;
3621
3622
if (!ctx->nr_stat)
3623
return;
3624
3625
update_context_time(ctx);
3626
3627
event = list_first_entry(&ctx->event_list,
3628
struct perf_event, event_entry);
3629
3630
next_event = list_first_entry(&next_ctx->event_list,
3631
struct perf_event, event_entry);
3632
3633
while (&event->event_entry != &ctx->event_list &&
3634
&next_event->event_entry != &next_ctx->event_list) {
3635
3636
__perf_event_sync_stat(event, next_event);
3637
3638
event = list_next_entry(event, event_entry);
3639
next_event = list_next_entry(next_event, event_entry);
3640
}
3641
}
3642
3643
static void perf_ctx_sched_task_cb(struct perf_event_context *ctx,
3644
struct task_struct *task, bool sched_in)
3645
{
3646
struct perf_event_pmu_context *pmu_ctx;
3647
struct perf_cpu_pmu_context *cpc;
3648
3649
list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
3650
cpc = this_cpc(pmu_ctx->pmu);
3651
3652
if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task)
3653
pmu_ctx->pmu->sched_task(pmu_ctx, task, sched_in);
3654
}
3655
}
3656
3657
static void
3658
perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
3659
{
3660
struct perf_event_context *ctx = task->perf_event_ctxp;
3661
struct perf_event_context *next_ctx;
3662
struct perf_event_context *parent, *next_parent;
3663
int do_switch = 1;
3664
3665
if (likely(!ctx))
3666
return;
3667
3668
rcu_read_lock();
3669
next_ctx = rcu_dereference(next->perf_event_ctxp);
3670
if (!next_ctx)
3671
goto unlock;
3672
3673
parent = rcu_dereference(ctx->parent_ctx);
3674
next_parent = rcu_dereference(next_ctx->parent_ctx);
3675
3676
/* If neither context have a parent context; they cannot be clones. */
3677
if (!parent && !next_parent)
3678
goto unlock;
3679
3680
if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
3681
/*
3682
* Looks like the two contexts are clones, so we might be
3683
* able to optimize the context switch. We lock both
3684
* contexts and check that they are clones under the
3685
* lock (including re-checking that neither has been
3686
* uncloned in the meantime). It doesn't matter which
3687
* order we take the locks because no other cpu could
3688
* be trying to lock both of these tasks.
3689
*/
3690
raw_spin_lock(&ctx->lock);
3691
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
3692
if (context_equiv(ctx, next_ctx)) {
3693
3694
perf_ctx_disable(ctx, false);
3695
3696
/* PMIs are disabled; ctx->nr_no_switch_fast is stable. */
3697
if (local_read(&ctx->nr_no_switch_fast) ||
3698
local_read(&next_ctx->nr_no_switch_fast)) {
3699
/*
3700
* Must not swap out ctx when there's pending
3701
* events that rely on the ctx->task relation.
3702
*
3703
* Likewise, when a context contains inherit +
3704
* SAMPLE_READ events they should be switched
3705
* out using the slow path so that they are
3706
* treated as if they were distinct contexts.
3707
*/
3708
raw_spin_unlock(&next_ctx->lock);
3709
rcu_read_unlock();
3710
goto inside_switch;
3711
}
3712
3713
WRITE_ONCE(ctx->task, next);
3714
WRITE_ONCE(next_ctx->task, task);
3715
3716
perf_ctx_sched_task_cb(ctx, task, false);
3717
3718
perf_ctx_enable(ctx, false);
3719
3720
/*
3721
* RCU_INIT_POINTER here is safe because we've not
3722
* modified the ctx and the above modification of
3723
* ctx->task is immaterial since this value is
3724
* always verified under ctx->lock which we're now
3725
* holding.
3726
*/
3727
RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx);
3728
RCU_INIT_POINTER(next->perf_event_ctxp, ctx);
3729
3730
do_switch = 0;
3731
3732
perf_event_sync_stat(ctx, next_ctx);
3733
}
3734
raw_spin_unlock(&next_ctx->lock);
3735
raw_spin_unlock(&ctx->lock);
3736
}
3737
unlock:
3738
rcu_read_unlock();
3739
3740
if (do_switch) {
3741
raw_spin_lock(&ctx->lock);
3742
perf_ctx_disable(ctx, false);
3743
3744
inside_switch:
3745
perf_ctx_sched_task_cb(ctx, task, false);
3746
task_ctx_sched_out(ctx, NULL, EVENT_ALL);
3747
3748
perf_ctx_enable(ctx, false);
3749
raw_spin_unlock(&ctx->lock);
3750
}
3751
}
3752
3753
static DEFINE_PER_CPU(struct list_head, sched_cb_list);
3754
static DEFINE_PER_CPU(int, perf_sched_cb_usages);
3755
3756
void perf_sched_cb_dec(struct pmu *pmu)
3757
{
3758
struct perf_cpu_pmu_context *cpc = this_cpc(pmu);
3759
3760
this_cpu_dec(perf_sched_cb_usages);
3761
barrier();
3762
3763
if (!--cpc->sched_cb_usage)
3764
list_del(&cpc->sched_cb_entry);
3765
}
3766
3767
3768
void perf_sched_cb_inc(struct pmu *pmu)
3769
{
3770
struct perf_cpu_pmu_context *cpc = this_cpc(pmu);
3771
3772
if (!cpc->sched_cb_usage++)
3773
list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
3774
3775
barrier();
3776
this_cpu_inc(perf_sched_cb_usages);
3777
}
3778
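/*
 * A hedged sketch (not an actual driver) of how a PMU driver would use the
 * pair above: bump the count while at least one of its events needs
 * context-switch callbacks, so perf_pmu_sched_task() below invokes
 * pmu::sched_task for it. needs_sched_callback() is a hypothetical
 * driver-side test:
 *
 *	static void example_pmu_add_event(struct perf_event *event)
 *	{
 *		if (needs_sched_callback(event))
 *			perf_sched_cb_inc(event->pmu);
 *	}
 *
 *	static void example_pmu_del_event(struct perf_event *event)
 *	{
 *		if (needs_sched_callback(event))
 *			perf_sched_cb_dec(event->pmu);
 *	}
 */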
3779
/*
3780
* This function provides the context switch callback to the lower code
3781
* layer. It is invoked ONLY when the context switch callback is enabled.
3782
*
3783
* This callback is relevant even to per-cpu events; for example multi event
3784
* PEBS requires this to provide PID/TID information. This requires we flush
3785
* all queued PEBS records before we context switch to a new task.
3786
*/
3787
static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc,
3788
struct task_struct *task, bool sched_in)
3789
{
3790
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
3791
struct pmu *pmu;
3792
3793
pmu = cpc->epc.pmu;
3794
3795
/* software PMUs will not have sched_task */
3796
if (WARN_ON_ONCE(!pmu->sched_task))
3797
return;
3798
3799
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3800
perf_pmu_disable(pmu);
3801
3802
pmu->sched_task(cpc->task_epc, task, sched_in);
3803
3804
perf_pmu_enable(pmu);
3805
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3806
}
3807
3808
static void perf_pmu_sched_task(struct task_struct *prev,
3809
struct task_struct *next,
3810
bool sched_in)
3811
{
3812
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
3813
struct perf_cpu_pmu_context *cpc;
3814
3815
/* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */
3816
if (prev == next || cpuctx->task_ctx)
3817
return;
3818
3819
list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry)
3820
__perf_pmu_sched_task(cpc, sched_in ? next : prev, sched_in);
3821
}
3822
3823
static void perf_event_switch(struct task_struct *task,
3824
struct task_struct *next_prev, bool sched_in);
3825
3826
/*
3827
* Called from scheduler to remove the events of the current task,
3828
* with interrupts disabled.
3829
*
3830
* We stop each event and update the event value in event->count.
3831
*
3832
* This does not protect us against NMI, but disable()
3833
* sets the disabled bit in the control field of event _before_
3834
* accessing the event control register. If a NMI hits, then it will
3835
* not restart the event.
3836
*/
3837
void __perf_event_task_sched_out(struct task_struct *task,
3838
struct task_struct *next)
3839
{
3840
if (__this_cpu_read(perf_sched_cb_usages))
3841
perf_pmu_sched_task(task, next, false);
3842
3843
if (atomic_read(&nr_switch_events))
3844
perf_event_switch(task, next, false);
3845
3846
perf_event_context_sched_out(task, next);
3847
3848
/*
3849
* if cgroup events exist on this CPU, then we need
3850
* to check if we have to switch out PMU state.
3851
* cgroup events are in system-wide mode only
3852
*/
3853
perf_cgroup_switch(next);
3854
}
3855
3856
static bool perf_less_group_idx(const void *l, const void *r, void __always_unused *args)
3857
{
3858
const struct perf_event *le = *(const struct perf_event **)l;
3859
const struct perf_event *re = *(const struct perf_event **)r;
3860
3861
return le->group_index < re->group_index;
3862
}
3863
3864
DEFINE_MIN_HEAP(struct perf_event *, perf_event_min_heap);
3865
3866
static const struct min_heap_callbacks perf_min_heap = {
3867
.less = perf_less_group_idx,
3868
.swp = NULL,
3869
};
3870
3871
static void __heap_add(struct perf_event_min_heap *heap, struct perf_event *event)
3872
{
3873
struct perf_event **itrs = heap->data;
3874
3875
if (event) {
3876
itrs[heap->nr] = event;
3877
heap->nr++;
3878
}
3879
}
3880
3881
static void __link_epc(struct perf_event_pmu_context *pmu_ctx)
3882
{
3883
struct perf_cpu_pmu_context *cpc;
3884
3885
if (!pmu_ctx->ctx->task)
3886
return;
3887
3888
cpc = this_cpc(pmu_ctx->pmu);
3889
WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
3890
cpc->task_epc = pmu_ctx;
3891
}
3892
3893
static noinline int visit_groups_merge(struct perf_event_context *ctx,
3894
struct perf_event_groups *groups, int cpu,
3895
struct pmu *pmu,
3896
int (*func)(struct perf_event *, void *),
3897
void *data)
3898
{
3899
#ifdef CONFIG_CGROUP_PERF
3900
struct cgroup_subsys_state *css = NULL;
3901
#endif
3902
struct perf_cpu_context *cpuctx = NULL;
3903
/* Space for per CPU and/or any CPU event iterators. */
3904
struct perf_event *itrs[2];
3905
struct perf_event_min_heap event_heap;
3906
struct perf_event **evt;
3907
int ret;
3908
3909
if (pmu->filter && pmu->filter(pmu, cpu))
3910
return 0;
3911
3912
if (!ctx->task) {
3913
cpuctx = this_cpu_ptr(&perf_cpu_context);
3914
event_heap = (struct perf_event_min_heap){
3915
.data = cpuctx->heap,
3916
.nr = 0,
3917
.size = cpuctx->heap_size,
3918
};
3919
3920
lockdep_assert_held(&cpuctx->ctx.lock);
3921
3922
#ifdef CONFIG_CGROUP_PERF
3923
if (cpuctx->cgrp)
3924
css = &cpuctx->cgrp->css;
3925
#endif
3926
} else {
3927
event_heap = (struct perf_event_min_heap){
3928
.data = itrs,
3929
.nr = 0,
3930
.size = ARRAY_SIZE(itrs),
3931
};
3932
/* Events not within a CPU context may be on any CPU. */
3933
__heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL));
3934
}
3935
evt = event_heap.data;
3936
3937
__heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL));
3938
3939
#ifdef CONFIG_CGROUP_PERF
3940
for (; css; css = css->parent)
3941
__heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup));
3942
#endif
3943
3944
if (event_heap.nr) {
3945
__link_epc((*evt)->pmu_ctx);
3946
perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu);
3947
}
3948
3949
min_heapify_all_inline(&event_heap, &perf_min_heap, NULL);
3950
3951
while (event_heap.nr) {
3952
ret = func(*evt, data);
3953
if (ret)
3954
return ret;
3955
3956
*evt = perf_event_groups_next(*evt, pmu);
3957
if (*evt)
3958
min_heap_sift_down_inline(&event_heap, 0, &perf_min_heap, NULL);
3959
else
3960
min_heap_pop_inline(&event_heap, &perf_min_heap, NULL);
3961
}
3962
3963
return 0;
3964
}
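/*
 * Worked example (illustrative): for a CPU context on CPU 1 whose cgroup A
 * has parent B, the heap above is seeded with up to one iterator per event
 * source: {cpu=1, no cgroup}, {cpu=1, A} and {cpu=1, B}; task contexts
 * instead get the {cpu=-1} and {cpu=this} pair. If the iterators' head
 * events have group_index 7, 3 and 9, the heap hands @func the event with
 * index 3 first, then that iterator's successor, and so on, so events are
 * always visited in global group_index (i.e. creation) order across all of
 * the merged lists.
 */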
3965
3966
/*
3967
* Because the userpage is strictly per-event (there is no concept of context,
3968
* so there cannot be a context indirection), every userpage must be updated
3969
* when context time starts :-(
3970
*
3971
* IOW, we must not miss EVENT_TIME edges.
3972
*/
3973
static inline bool event_update_userpage(struct perf_event *event)
3974
{
3975
if (likely(!refcount_read(&event->mmap_count)))
3976
return false;
3977
3978
perf_event_update_time(event);
3979
perf_event_update_userpage(event);
3980
3981
return true;
3982
}
3983
3984
static inline void group_update_userpage(struct perf_event *group_event)
3985
{
3986
struct perf_event *event;
3987
3988
if (!event_update_userpage(group_event))
3989
return;
3990
3991
for_each_sibling_event(event, group_event)
3992
event_update_userpage(event);
3993
}
3994
3995
static int merge_sched_in(struct perf_event *event, void *data)
3996
{
3997
struct perf_event_context *ctx = event->ctx;
3998
int *can_add_hw = data;
3999
4000
if (event->state <= PERF_EVENT_STATE_OFF)
4001
return 0;
4002
4003
if (!event_filter_match(event))
4004
return 0;
4005
4006
if (group_can_go_on(event, *can_add_hw)) {
4007
if (!group_sched_in(event, ctx))
4008
list_add_tail(&event->active_list, get_event_list(event));
4009
}
4010
4011
if (event->state == PERF_EVENT_STATE_INACTIVE) {
4012
*can_add_hw = 0;
4013
if (event->attr.pinned) {
4014
perf_cgroup_event_disable(event, ctx);
4015
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
4016
4017
if (*perf_event_fasync(event))
4018
event->pending_kill = POLL_ERR;
4019
4020
perf_event_wakeup(event);
4021
} else {
4022
struct perf_cpu_pmu_context *cpc = this_cpc(event->pmu_ctx->pmu);
4023
4024
event->pmu_ctx->rotate_necessary = 1;
4025
perf_mux_hrtimer_restart(cpc);
4026
group_update_userpage(event);
4027
}
4028
}
4029
4030
return 0;
4031
}
4032
4033
static void pmu_groups_sched_in(struct perf_event_context *ctx,
4034
struct perf_event_groups *groups,
4035
struct pmu *pmu)
4036
{
4037
int can_add_hw = 1;
4038
visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
4039
merge_sched_in, &can_add_hw);
4040
}
4041
4042
static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
4043
enum event_type_t event_type)
4044
{
4045
struct perf_event_context *ctx = pmu_ctx->ctx;
4046
4047
if (event_type & EVENT_PINNED)
4048
pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu);
4049
if (event_type & EVENT_FLEXIBLE)
4050
pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu);
4051
}
4052
4053
static void
4054
ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
4055
{
4056
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
4057
struct perf_event_pmu_context *pmu_ctx;
4058
int is_active = ctx->is_active;
4059
bool cgroup = event_type & EVENT_CGROUP;
4060
4061
event_type &= ~EVENT_CGROUP;
4062
4063
lockdep_assert_held(&ctx->lock);
4064
4065
if (likely(!ctx->nr_events))
4066
return;
4067
4068
if (!(is_active & EVENT_TIME)) {
4069
/* start ctx time */
4070
__update_context_time(ctx, false);
4071
perf_cgroup_set_timestamp(cpuctx);
4072
/*
4073
* CPU-release for the below ->is_active store,
4074
* see __load_acquire() in perf_event_time_now()
4075
*/
4076
barrier();
4077
}
4078
4079
ctx->is_active |= (event_type | EVENT_TIME);
4080
if (ctx->task) {
4081
if (!(is_active & EVENT_ALL))
4082
cpuctx->task_ctx = ctx;
4083
else
4084
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
4085
}
4086
4087
is_active ^= ctx->is_active; /* changed bits */
4088
4089
/*
4090
* First go through the list and put on any pinned groups
4091
* in order to give them the best chance of going on.
4092
*/
4093
if (is_active & EVENT_PINNED) {
4094
for_each_epc(pmu_ctx, ctx, pmu, cgroup)
4095
__pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED);
4096
}
4097
4098
/* Then walk through the lower prio flexible groups */
4099
if (is_active & EVENT_FLEXIBLE) {
4100
for_each_epc(pmu_ctx, ctx, pmu, cgroup)
4101
__pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE);
4102
}
4103
}
4104
4105
static void perf_event_context_sched_in(struct task_struct *task)
4106
{
4107
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
4108
struct perf_event_context *ctx;
4109
4110
rcu_read_lock();
4111
ctx = rcu_dereference(task->perf_event_ctxp);
4112
if (!ctx)
4113
goto rcu_unlock;
4114
4115
if (cpuctx->task_ctx == ctx) {
4116
perf_ctx_lock(cpuctx, ctx);
4117
perf_ctx_disable(ctx, false);
4118
4119
perf_ctx_sched_task_cb(ctx, task, true);
4120
4121
perf_ctx_enable(ctx, false);
4122
perf_ctx_unlock(cpuctx, ctx);
4123
goto rcu_unlock;
4124
}
4125
4126
perf_ctx_lock(cpuctx, ctx);
4127
/*
4128
* We must check ctx->nr_events while holding ctx->lock, such
4129
* that we serialize against perf_install_in_context().
4130
*/
4131
if (!ctx->nr_events)
4132
goto unlock;
4133
4134
perf_ctx_disable(ctx, false);
4135
/*
4136
* We want to keep the following priority order:
4137
* cpu pinned (that don't need to move), task pinned,
4138
* cpu flexible, task flexible.
4139
*
4140
* However, if the task's ctx is not carrying any pinned
4141
* events, there is no need to flip the cpuctx's events around.
4142
*/
4143
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
4144
perf_ctx_disable(&cpuctx->ctx, false);
4145
ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE);
4146
}
4147
4148
perf_event_sched_in(cpuctx, ctx, NULL);
4149
4150
perf_ctx_sched_task_cb(cpuctx->task_ctx, task, true);
4151
4152
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
4153
perf_ctx_enable(&cpuctx->ctx, false);
4154
4155
perf_ctx_enable(ctx, false);
4156
4157
unlock:
4158
perf_ctx_unlock(cpuctx, ctx);
4159
rcu_unlock:
4160
rcu_read_unlock();
4161
}
4162
4163
/*
4164
* Called from scheduler to add the events of the current task
4165
* with interrupts disabled.
4166
*
4167
* We restore the event value and then enable it.
4168
*
4169
* This does not protect us against NMI, but enable()
4170
* sets the enabled bit in the control field of event _before_
4171
* accessing the event control register. If an NMI hits, then it will
4172
* keep the event running.
4173
*/
4174
void __perf_event_task_sched_in(struct task_struct *prev,
4175
struct task_struct *task)
4176
{
4177
perf_event_context_sched_in(task);
4178
4179
if (atomic_read(&nr_switch_events))
4180
perf_event_switch(task, prev, true);
4181
4182
if (__this_cpu_read(perf_sched_cb_usages))
4183
perf_pmu_sched_task(prev, task, true);
4184
}
4185
4186
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
4187
{
4188
u64 frequency = event->attr.sample_freq;
4189
u64 sec = NSEC_PER_SEC;
4190
u64 divisor, dividend;
4191
4192
int count_fls, nsec_fls, frequency_fls, sec_fls;
4193
4194
count_fls = fls64(count);
4195
nsec_fls = fls64(nsec);
4196
frequency_fls = fls64(frequency);
4197
sec_fls = 30;
4198
4199
/*
4200
* We got @count in @nsec, with a target of sample_freq HZ;
4201
* the target period becomes:
4202
*
4203
*	period = (@count * 10^9) / (@nsec * sample_freq)
4206
*
4207
*/
4208
4209
/*
4210
* Reduce accuracy by one bit such that @a and @b converge
4211
* to a similar magnitude.
4212
*/
4213
#define REDUCE_FLS(a, b) \
4214
do { \
4215
if (a##_fls > b##_fls) { \
4216
a >>= 1; \
4217
a##_fls--; \
4218
} else { \
4219
b >>= 1; \
4220
b##_fls--; \
4221
} \
4222
} while (0)
4223
4224
/*
4225
* Reduce accuracy until either term fits in a u64, then proceed with
4226
* the other, so that finally we can do a u64/u64 division.
4227
*/
4228
while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
4229
REDUCE_FLS(nsec, frequency);
4230
REDUCE_FLS(sec, count);
4231
}
4232
4233
if (count_fls + sec_fls > 64) {
4234
divisor = nsec * frequency;
4235
4236
while (count_fls + sec_fls > 64) {
4237
REDUCE_FLS(count, sec);
4238
divisor >>= 1;
4239
}
4240
4241
dividend = count * sec;
4242
} else {
4243
dividend = count * sec;
4244
4245
while (nsec_fls + frequency_fls > 64) {
4246
REDUCE_FLS(nsec, frequency);
4247
dividend >>= 1;
4248
}
4249
4250
divisor = nsec * frequency;
4251
}
4252
4253
if (!divisor)
4254
return dividend;
4255
4256
return div64_u64(dividend, divisor);
4257
}
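/*
 * Worked example (illustrative numbers): with @count = 2,000,000 events
 * observed over @nsec = 4,000,000 ns and sample_freq = 1000, the event is
 * firing at 5 * 10^8 events/sec, so:
 *
 *	period = (2,000,000 * 10^9) / (4,000,000 * 1000) = 500,000
 *
 * i.e. one sample every 500,000 events gives roughly 1000 samples/sec.
 * The REDUCE_FLS() dance above only costs precision when the intermediate
 * products would not fit in 64 bits; it is what lets this stay a u64/u64
 * division instead of a 128-bit one.
 */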
4258
4259
static DEFINE_PER_CPU(int, perf_throttled_count);
4260
static DEFINE_PER_CPU(u64, perf_throttled_seq);
4261
4262
static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
4263
{
4264
struct hw_perf_event *hwc = &event->hw;
4265
s64 period, sample_period;
4266
s64 delta;
4267
4268
period = perf_calculate_period(event, nsec, count);
4269
4270
delta = (s64)(period - hwc->sample_period);
4271
if (delta >= 0)
4272
delta += 7;
4273
else
4274
delta -= 7;
4275
delta /= 8; /* low pass filter */
4276
4277
sample_period = hwc->sample_period + delta;
4278
4279
if (!sample_period)
4280
sample_period = 1;
4281
4282
hwc->sample_period = sample_period;
4283
4284
if (local64_read(&hwc->period_left) > 8*sample_period) {
4285
if (disable)
4286
event->pmu->stop(event, PERF_EF_UPDATE);
4287
4288
local64_set(&hwc->period_left, 0);
4289
4290
if (disable)
4291
event->pmu->start(event, PERF_EF_RELOAD);
4292
}
4293
}
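/*
 * Worked example (illustrative numbers): if hwc->sample_period is 100,000
 * and perf_calculate_period() now suggests 108,000, then delta = 8,000,
 * which the filter above turns into (8,000 + 7) / 8 = 1,000, so the period
 * only moves to 101,000. Dividing by 8 damps the reaction to short bursts,
 * while the +/-7 rounds away from zero so that small corrections are not
 * lost to integer truncation.
 */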
4294
4295
static void perf_adjust_freq_unthr_events(struct list_head *event_list)
4296
{
4297
struct perf_event *event;
4298
struct hw_perf_event *hwc;
4299
u64 now, period = TICK_NSEC;
4300
s64 delta;
4301
4302
list_for_each_entry(event, event_list, active_list) {
4303
if (event->state != PERF_EVENT_STATE_ACTIVE)
4304
continue;
4305
4306
// XXX use visit thingy to avoid the -1,cpu match
4307
if (!event_filter_match(event))
4308
continue;
4309
4310
hwc = &event->hw;
4311
4312
if (hwc->interrupts == MAX_INTERRUPTS)
4313
perf_event_unthrottle_group(event, is_event_in_freq_mode(event));
4314
4315
if (!is_event_in_freq_mode(event))
4316
continue;
4317
4318
/*
4319
* stop the event and update event->count
4320
*/
4321
event->pmu->stop(event, PERF_EF_UPDATE);
4322
4323
now = local64_read(&event->count);
4324
delta = now - hwc->freq_count_stamp;
4325
hwc->freq_count_stamp = now;
4326
4327
/*
4328
* restart the event;
4329
* reload only if the value has changed.
4330
* We have stopped the event, so tell that
4331
* to perf_adjust_period() to avoid stopping it
4332
* twice.
4333
*/
4334
if (delta > 0)
4335
perf_adjust_period(event, period, delta, false);
4336
4337
event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
4338
}
4339
}
4340
4341
/*
4342
* combine freq adjustment with unthrottling to avoid two passes over the
4343
* events. At the same time, make sure that having freq events does not change
4344
* the rate of unthrottling as that would introduce bias.
4345
*/
4346
static void
4347
perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
4348
{
4349
struct perf_event_pmu_context *pmu_ctx;
4350
4351
/*
4352
* only need to iterate over all events iff:
4353
* - the context has events in frequency mode (needs freq adjust)
4354
* - there are events to unthrottle on this cpu
4355
*/
4356
if (!(ctx->nr_freq || unthrottle))
4357
return;
4358
4359
raw_spin_lock(&ctx->lock);
4360
4361
list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
4362
if (!(pmu_ctx->nr_freq || unthrottle))
4363
continue;
4364
if (!perf_pmu_ctx_is_active(pmu_ctx))
4365
continue;
4366
if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
4367
continue;
4368
4369
perf_pmu_disable(pmu_ctx->pmu);
4370
perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
4371
perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
4372
perf_pmu_enable(pmu_ctx->pmu);
4373
}
4374
4375
raw_spin_unlock(&ctx->lock);
4376
}
4377
4378
/*
4379
* Move @event to the tail of @ctx's eligible events.
4380
*/
4381
static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
4382
{
4383
/*
4384
* Rotate the first entry to the end of the non-pinned groups. Rotation might be
4385
* disabled by the inheritance code.
4386
*/
4387
if (ctx->rotate_disable)
4388
return;
4389
4390
perf_event_groups_delete(&ctx->flexible_groups, event);
4391
perf_event_groups_insert(&ctx->flexible_groups, event);
4392
}
4393
4394
/* pick an event from the flexible_groups to rotate */
4395
static inline struct perf_event *
4396
ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx)
4397
{
4398
struct perf_event *event;
4399
struct rb_node *node;
4400
struct rb_root *tree;
4401
struct __group_key key = {
4402
.pmu = pmu_ctx->pmu,
4403
};
4404
4405
/* pick the first active flexible event */
4406
event = list_first_entry_or_null(&pmu_ctx->flexible_active,
4407
struct perf_event, active_list);
4408
if (event)
4409
goto out;
4410
4411
/* if no active flexible event, pick the first event */
4412
tree = &pmu_ctx->ctx->flexible_groups.tree;
4413
4414
if (!pmu_ctx->ctx->task) {
4415
key.cpu = smp_processor_id();
4416
4417
node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
4418
if (node)
4419
event = __node_2_pe(node);
4420
goto out;
4421
}
4422
4423
key.cpu = -1;
4424
node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
4425
if (node) {
4426
event = __node_2_pe(node);
4427
goto out;
4428
}
4429
4430
key.cpu = smp_processor_id();
4431
node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
4432
if (node)
4433
event = __node_2_pe(node);
4434
4435
out:
4436
/*
4437
* Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
4438
* finds there are unschedulable events, it will set it again.
4439
*/
4440
pmu_ctx->rotate_necessary = 0;
4441
4442
return event;
4443
}
4444
4445
static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
4446
{
4447
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
4448
struct perf_event_pmu_context *cpu_epc, *task_epc = NULL;
4449
struct perf_event *cpu_event = NULL, *task_event = NULL;
4450
int cpu_rotate, task_rotate;
4451
struct pmu *pmu;
4452
4453
/*
4454
* Since we run this from IRQ context, nobody can install new
4455
* events, thus the event count values are stable.
4456
*/
4457
4458
cpu_epc = &cpc->epc;
4459
pmu = cpu_epc->pmu;
4460
task_epc = cpc->task_epc;
4461
4462
cpu_rotate = cpu_epc->rotate_necessary;
4463
task_rotate = task_epc ? task_epc->rotate_necessary : 0;
4464
4465
if (!(cpu_rotate || task_rotate))
4466
return false;
4467
4468
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
4469
perf_pmu_disable(pmu);
4470
4471
if (task_rotate)
4472
task_event = ctx_event_to_rotate(task_epc);
4473
if (cpu_rotate)
4474
cpu_event = ctx_event_to_rotate(cpu_epc);
4475
4476
/*
4477
* As per the order given at ctx_resched(), first 'pop' task flexible
4479
* and then, if needed, CPU flexible.
4479
*/
4480
if (task_event || (task_epc && cpu_event)) {
4481
update_context_time(task_epc->ctx);
4482
__pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE);
4483
}
4484
4485
if (cpu_event) {
4486
update_context_time(&cpuctx->ctx);
4487
__pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
4488
rotate_ctx(&cpuctx->ctx, cpu_event);
4489
__pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE);
4490
}
4491
4492
if (task_event)
4493
rotate_ctx(task_epc->ctx, task_event);
4494
4495
if (task_event || (task_epc && cpu_event))
4496
__pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
4497
4498
perf_pmu_enable(pmu);
4499
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
4500
4501
return true;
4502
}
4503
4504
void perf_event_task_tick(void)
4505
{
4506
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
4507
struct perf_event_context *ctx;
4508
int throttled;
4509
4510
lockdep_assert_irqs_disabled();
4511
4512
__this_cpu_inc(perf_throttled_seq);
4513
throttled = __this_cpu_xchg(perf_throttled_count, 0);
4514
tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
4515
4516
perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled);
4517
4518
rcu_read_lock();
4519
ctx = rcu_dereference(current->perf_event_ctxp);
4520
if (ctx)
4521
perf_adjust_freq_unthr_context(ctx, !!throttled);
4522
rcu_read_unlock();
4523
}
4524
4525
static int event_enable_on_exec(struct perf_event *event,
4526
struct perf_event_context *ctx)
4527
{
4528
if (!event->attr.enable_on_exec)
4529
return 0;
4530
4531
event->attr.enable_on_exec = 0;
4532
if (event->state >= PERF_EVENT_STATE_INACTIVE)
4533
return 0;
4534
4535
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
4536
4537
return 1;
4538
}
4539
4540
/*
4541
* Enable all of a task's events that have been marked enable-on-exec.
4542
* This expects task == current.
4543
*/
4544
static void perf_event_enable_on_exec(struct perf_event_context *ctx)
4545
{
4546
struct perf_event_context *clone_ctx = NULL;
4547
enum event_type_t event_type = 0;
4548
struct perf_cpu_context *cpuctx;
4549
struct perf_event *event;
4550
unsigned long flags;
4551
int enabled = 0;
4552
4553
local_irq_save(flags);
4554
if (WARN_ON_ONCE(current->perf_event_ctxp != ctx))
4555
goto out;
4556
4557
if (!ctx->nr_events)
4558
goto out;
4559
4560
cpuctx = this_cpu_ptr(&perf_cpu_context);
4561
perf_ctx_lock(cpuctx, ctx);
4562
ctx_time_freeze(cpuctx, ctx);
4563
4564
list_for_each_entry(event, &ctx->event_list, event_entry) {
4565
enabled |= event_enable_on_exec(event, ctx);
4566
event_type |= get_event_type(event);
4567
}
4568
4569
/*
4570
* Unclone and reschedule this context if we enabled any event.
4571
*/
4572
if (enabled) {
4573
clone_ctx = unclone_ctx(ctx);
4574
ctx_resched(cpuctx, ctx, NULL, event_type);
4575
}
4576
perf_ctx_unlock(cpuctx, ctx);
4577
4578
out:
4579
local_irq_restore(flags);
4580
4581
if (clone_ctx)
4582
put_ctx(clone_ctx);
4583
}
4584
4585
static void perf_remove_from_owner(struct perf_event *event);
4586
static void perf_event_exit_event(struct perf_event *event,
4587
struct perf_event_context *ctx,
4588
struct task_struct *task,
4589
bool revoke);
4590
4591
/*
4592
* Removes all events from the current task that have been marked
4593
* remove-on-exec, and feeds their values back to parent events.
4594
*/
4595
static void perf_event_remove_on_exec(struct perf_event_context *ctx)
4596
{
4597
struct perf_event_context *clone_ctx = NULL;
4598
struct perf_event *event, *next;
4599
unsigned long flags;
4600
bool modified = false;
4601
4602
mutex_lock(&ctx->mutex);
4603
4604
if (WARN_ON_ONCE(ctx->task != current))
4605
goto unlock;
4606
4607
list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) {
4608
if (!event->attr.remove_on_exec)
4609
continue;
4610
4611
if (!is_kernel_event(event))
4612
perf_remove_from_owner(event);
4613
4614
modified = true;
4615
4616
perf_event_exit_event(event, ctx, ctx->task, false);
4617
}
4618
4619
raw_spin_lock_irqsave(&ctx->lock, flags);
4620
if (modified)
4621
clone_ctx = unclone_ctx(ctx);
4622
raw_spin_unlock_irqrestore(&ctx->lock, flags);
4623
4624
unlock:
4625
mutex_unlock(&ctx->mutex);
4626
4627
if (clone_ctx)
4628
put_ctx(clone_ctx);
4629
}
4630
4631
struct perf_read_data {
4632
struct perf_event *event;
4633
bool group;
4634
int ret;
4635
};
4636
4637
static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu);
4638
4639
static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
4640
{
4641
int local_cpu = smp_processor_id();
4642
u16 local_pkg, event_pkg;
4643
4644
if ((unsigned)event_cpu >= nr_cpu_ids)
4645
return event_cpu;
4646
4647
if (event->group_caps & PERF_EV_CAP_READ_SCOPE) {
4648
const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(event->pmu->scope, event_cpu);
4649
4650
if (cpumask && cpumask_test_cpu(local_cpu, cpumask))
4651
return local_cpu;
4652
}
4653
4654
if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
4655
event_pkg = topology_physical_package_id(event_cpu);
4656
local_pkg = topology_physical_package_id(local_cpu);
4657
4658
if (event_pkg == local_pkg)
4659
return local_cpu;
4660
}
4661
4662
return event_cpu;
4663
}
4664
4665
/*
4666
* Cross CPU call to read the hardware event
4667
*/
4668
static void __perf_event_read(void *info)
4669
{
4670
struct perf_read_data *data = info;
4671
struct perf_event *sub, *event = data->event;
4672
struct perf_event_context *ctx = event->ctx;
4673
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
4674
struct pmu *pmu = event->pmu;
4675
4676
/*
4677
* If this is a task context, we need to check whether it is
4678
* the current task context of this CPU. If not, it has been
4679
* scheduled out before the smp call arrived. In that case
4680
* event->count would have been updated to a recent sample
4681
* when the event was scheduled out.
4682
*/
4683
if (ctx->task && cpuctx->task_ctx != ctx)
4684
return;
4685
4686
raw_spin_lock(&ctx->lock);
4687
ctx_time_update_event(ctx, event);
4688
4689
perf_event_update_time(event);
4690
if (data->group)
4691
perf_event_update_sibling_time(event);
4692
4693
if (event->state != PERF_EVENT_STATE_ACTIVE)
4694
goto unlock;
4695
4696
if (!data->group) {
4697
pmu->read(event);
4698
data->ret = 0;
4699
goto unlock;
4700
}
4701
4702
pmu->start_txn(pmu, PERF_PMU_TXN_READ);
4703
4704
pmu->read(event);
4705
4706
for_each_sibling_event(sub, event)
4707
perf_pmu_read(sub);
4708
4709
data->ret = pmu->commit_txn(pmu);
4710
4711
unlock:
4712
raw_spin_unlock(&ctx->lock);
4713
}
4714
4715
static inline u64 perf_event_count(struct perf_event *event, bool self)
4716
{
4717
if (self)
4718
return local64_read(&event->count);
4719
4720
return local64_read(&event->count) + atomic64_read(&event->child_count);
4721
}
4722
4723
static void calc_timer_values(struct perf_event *event,
4724
u64 *now,
4725
u64 *enabled,
4726
u64 *running)
4727
{
4728
u64 ctx_time;
4729
4730
*now = perf_clock();
4731
ctx_time = perf_event_time_now(event, *now);
4732
__perf_update_times(event, ctx_time, enabled, running);
4733
}
4734
4735
/*
4736
* NMI-safe method to read a local event, that is an event that
4737
* is:
4738
* - either for the current task, or for this CPU
4739
* - does not have inherit set, for inherited task events
4740
* will not be local and we cannot read them atomically
4741
* - must not have a pmu::count method
4742
*/
4743
int perf_event_read_local(struct perf_event *event, u64 *value,
4744
u64 *enabled, u64 *running)
4745
{
4746
unsigned long flags;
4747
int event_oncpu;
4748
int event_cpu;
4749
int ret = 0;
4750
4751
/*
4752
* Disabling interrupts avoids all counter scheduling (context
4753
* switches, timer based rotation and IPIs).
4754
*/
4755
local_irq_save(flags);
4756
4757
/*
4758
* It must not be an event with inherit set; we cannot read
4759
* all child counters from atomic context.
4760
*/
4761
if (event->attr.inherit) {
4762
ret = -EOPNOTSUPP;
4763
goto out;
4764
}
4765
4766
/* If this is a per-task event, it must be for current */
4767
if ((event->attach_state & PERF_ATTACH_TASK) &&
4768
event->hw.target != current) {
4769
ret = -EINVAL;
4770
goto out;
4771
}
4772
4773
/*
4774
* Get the event CPU numbers, and adjust them to local if the event is
4775
* a per-package event that can be read locally
4776
*/
4777
event_oncpu = __perf_event_read_cpu(event, event->oncpu);
4778
event_cpu = __perf_event_read_cpu(event, event->cpu);
4779
4780
/* If this is a per-CPU event, it must be for this CPU */
4781
if (!(event->attach_state & PERF_ATTACH_TASK) &&
4782
event_cpu != smp_processor_id()) {
4783
ret = -EINVAL;
4784
goto out;
4785
}
4786
4787
/* If this is a pinned event it must be running on this CPU */
4788
if (event->attr.pinned && event_oncpu != smp_processor_id()) {
4789
ret = -EBUSY;
4790
goto out;
4791
}
4792
4793
/*
4794
* If the event is currently on this CPU, it's either a per-task event,
4795
* or local to this CPU. Furthermore, it means it's ACTIVE (otherwise
4796
* oncpu == -1).
4797
*/
4798
if (event_oncpu == smp_processor_id())
4799
event->pmu->read(event);
4800
4801
*value = local64_read(&event->count);
4802
if (enabled || running) {
4803
u64 __enabled, __running, __now;
4804
4805
calc_timer_values(event, &__now, &__enabled, &__running);
4806
if (enabled)
4807
*enabled = __enabled;
4808
if (running)
4809
*running = __running;
4810
}
4811
out:
4812
local_irq_restore(flags);
4813
4814
return ret;
4815
}
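/*
 * Usage sketch (illustrative, not part of this file): because this helper
 * is NMI-safe, in-kernel users such as BPF helpers can read an event they
 * own straight from IRQ or NMI context. "ev" stands for a hypothetical
 * event created for the current task or for this CPU:
 *
 *	u64 value, enabled, running;
 *
 *	if (!perf_event_read_local(ev, &value, &enabled, &running))
 *		... use value, scaling by enabled/running if the event
 *		    was multiplexed ...
 */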
4816
4817
static int perf_event_read(struct perf_event *event, bool group)
4818
{
4819
enum perf_event_state state = READ_ONCE(event->state);
4820
int event_cpu, ret = 0;
4821
4822
/*
4823
* If event is enabled and currently active on a CPU, update the
4824
* value in the event structure:
4825
*/
4826
again:
4827
if (state == PERF_EVENT_STATE_ACTIVE) {
4828
struct perf_read_data data;
4829
4830
/*
4831
* Orders the ->state and ->oncpu loads such that if we see
4832
* ACTIVE we must also see the right ->oncpu.
4833
*
4834
* Matches the smp_wmb() from event_sched_in().
4835
*/
4836
smp_rmb();
4837
4838
event_cpu = READ_ONCE(event->oncpu);
4839
if ((unsigned)event_cpu >= nr_cpu_ids)
4840
return 0;
4841
4842
data = (struct perf_read_data){
4843
.event = event,
4844
.group = group,
4845
.ret = 0,
4846
};
4847
4848
preempt_disable();
4849
event_cpu = __perf_event_read_cpu(event, event_cpu);
4850
4851
/*
4852
* Purposely ignore the smp_call_function_single() return
4853
* value.
4854
*
4855
* If event_cpu isn't a valid CPU it means the event got
4856
* scheduled out and that will have updated the event count.
4857
*
4858
* Therefore, either way, we'll have an up-to-date event count
4859
* after this.
4860
*/
4861
(void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
4862
preempt_enable();
4863
ret = data.ret;
4864
4865
} else if (state == PERF_EVENT_STATE_INACTIVE) {
4866
struct perf_event_context *ctx = event->ctx;
4867
unsigned long flags;
4868
4869
raw_spin_lock_irqsave(&ctx->lock, flags);
4870
state = event->state;
4871
if (state != PERF_EVENT_STATE_INACTIVE) {
4872
raw_spin_unlock_irqrestore(&ctx->lock, flags);
4873
goto again;
4874
}
4875
4876
/*
4877
* May read while context is not active (e.g., thread is
4878
* blocked); in that case we cannot update the context time
4879
*/
4880
ctx_time_update_event(ctx, event);
4881
4882
perf_event_update_time(event);
4883
if (group)
4884
perf_event_update_sibling_time(event);
4885
raw_spin_unlock_irqrestore(&ctx->lock, flags);
4886
}
4887
4888
return ret;
4889
}
4890
4891
/*
4892
* Initialize the perf_event context in a task_struct:
4893
*/
4894
static void __perf_event_init_context(struct perf_event_context *ctx)
4895
{
4896
raw_spin_lock_init(&ctx->lock);
4897
mutex_init(&ctx->mutex);
4898
INIT_LIST_HEAD(&ctx->pmu_ctx_list);
4899
perf_event_groups_init(&ctx->pinned_groups);
4900
perf_event_groups_init(&ctx->flexible_groups);
4901
INIT_LIST_HEAD(&ctx->event_list);
4902
refcount_set(&ctx->refcount, 1);
4903
}
4904
4905
static void
4906
__perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu)
4907
{
4908
epc->pmu = pmu;
4909
INIT_LIST_HEAD(&epc->pmu_ctx_entry);
4910
INIT_LIST_HEAD(&epc->pinned_active);
4911
INIT_LIST_HEAD(&epc->flexible_active);
4912
atomic_set(&epc->refcount, 1);
4913
}
4914
4915
static struct perf_event_context *
4916
alloc_perf_context(struct task_struct *task)
4917
{
4918
struct perf_event_context *ctx;
4919
4920
ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4921
if (!ctx)
4922
return NULL;
4923
4924
__perf_event_init_context(ctx);
4925
if (task)
4926
ctx->task = get_task_struct(task);
4927
4928
return ctx;
4929
}
4930
4931
static struct task_struct *
4932
find_lively_task_by_vpid(pid_t vpid)
4933
{
4934
struct task_struct *task;
4935
4936
rcu_read_lock();
4937
if (!vpid)
4938
task = current;
4939
else
4940
task = find_task_by_vpid(vpid);
4941
if (task)
4942
get_task_struct(task);
4943
rcu_read_unlock();
4944
4945
if (!task)
4946
return ERR_PTR(-ESRCH);
4947
4948
return task;
4949
}
4950
4951
/*
4952
* Returns a matching context with refcount and pincount.
4953
*/
4954
static struct perf_event_context *
4955
find_get_context(struct task_struct *task, struct perf_event *event)
4956
{
4957
struct perf_event_context *ctx, *clone_ctx = NULL;
4958
struct perf_cpu_context *cpuctx;
4959
unsigned long flags;
4960
int err;
4961
4962
if (!task) {
4963
/* Must be root to operate on a CPU event: */
4964
err = perf_allow_cpu();
4965
if (err)
4966
return ERR_PTR(err);
4967
4968
cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
4969
ctx = &cpuctx->ctx;
4970
get_ctx(ctx);
4971
raw_spin_lock_irqsave(&ctx->lock, flags);
4972
++ctx->pin_count;
4973
raw_spin_unlock_irqrestore(&ctx->lock, flags);
4974
4975
return ctx;
4976
}
4977
4978
err = -EINVAL;
4979
retry:
4980
ctx = perf_lock_task_context(task, &flags);
4981
if (ctx) {
4982
clone_ctx = unclone_ctx(ctx);
4983
++ctx->pin_count;
4984
4985
raw_spin_unlock_irqrestore(&ctx->lock, flags);
4986
4987
if (clone_ctx)
4988
put_ctx(clone_ctx);
4989
} else {
4990
ctx = alloc_perf_context(task);
4991
err = -ENOMEM;
4992
if (!ctx)
4993
goto errout;
4994
4995
err = 0;
4996
mutex_lock(&task->perf_event_mutex);
4997
/*
4998
* If it has already passed perf_event_exit_task(),
4999
* we must see PF_EXITING; it takes this mutex too.
5000
*/
5001
if (task->flags & PF_EXITING)
5002
err = -ESRCH;
5003
else if (task->perf_event_ctxp)
5004
err = -EAGAIN;
5005
else {
5006
get_ctx(ctx);
5007
++ctx->pin_count;
5008
rcu_assign_pointer(task->perf_event_ctxp, ctx);
5009
}
5010
mutex_unlock(&task->perf_event_mutex);
5011
5012
if (unlikely(err)) {
5013
put_ctx(ctx);
5014
5015
if (err == -EAGAIN)
5016
goto retry;
5017
goto errout;
5018
}
5019
}
5020
5021
return ctx;
5022
5023
errout:
5024
return ERR_PTR(err);
5025
}
5026
5027
static struct perf_event_pmu_context *
5028
find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
5029
struct perf_event *event)
5030
{
5031
struct perf_event_pmu_context *new = NULL, *pos = NULL, *epc;
5032
5033
if (!ctx->task) {
5034
/*
5035
* perf_pmu_migrate_context() / __perf_pmu_install_event()
5036
* rely on the fact that find_get_pmu_context() cannot fail
5037
* for CPU contexts.
5038
*/
5039
struct perf_cpu_pmu_context *cpc;
5040
5041
cpc = *per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
5042
epc = &cpc->epc;
5043
raw_spin_lock_irq(&ctx->lock);
5044
if (!epc->ctx) {
5045
/*
5046
* One extra reference for the pmu; see perf_pmu_free().
5047
*/
5048
atomic_set(&epc->refcount, 2);
5049
epc->embedded = 1;
5050
list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
5051
epc->ctx = ctx;
5052
} else {
5053
WARN_ON_ONCE(epc->ctx != ctx);
5054
atomic_inc(&epc->refcount);
5055
}
5056
raw_spin_unlock_irq(&ctx->lock);
5057
return epc;
5058
}
5059
5060
new = kzalloc(sizeof(*epc), GFP_KERNEL);
5061
if (!new)
5062
return ERR_PTR(-ENOMEM);
5063
5064
__perf_init_event_pmu_context(new, pmu);
5065
5066
/*
5067
* XXX
5068
*
5069
* lockdep_assert_held(&ctx->mutex);
5070
*
5071
* can't because perf_event_init_task() doesn't actually hold the
5072
* child_ctx->mutex.
5073
*/
5074
5075
raw_spin_lock_irq(&ctx->lock);
5076
list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
5077
if (epc->pmu == pmu) {
5078
WARN_ON_ONCE(epc->ctx != ctx);
5079
atomic_inc(&epc->refcount);
5080
goto found_epc;
5081
}
5082
/* Make sure the pmu_ctx_list is sorted by PMU type: */
5083
if (!pos && epc->pmu->type > pmu->type)
5084
pos = epc;
5085
}
5086
5087
epc = new;
5088
new = NULL;
5089
5090
if (!pos)
5091
list_add_tail(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
5092
else
5093
list_add(&epc->pmu_ctx_entry, pos->pmu_ctx_entry.prev);
5094
5095
epc->ctx = ctx;
5096
5097
found_epc:
5098
raw_spin_unlock_irq(&ctx->lock);
5099
kfree(new);
5100
5101
return epc;
5102
}
5103
5104
static void get_pmu_ctx(struct perf_event_pmu_context *epc)
5105
{
5106
WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount));
5107
}
5108
5109
static void free_cpc_rcu(struct rcu_head *head)
5110
{
5111
struct perf_cpu_pmu_context *cpc =
5112
container_of(head, typeof(*cpc), epc.rcu_head);
5113
5114
kfree(cpc);
5115
}
5116
5117
static void free_epc_rcu(struct rcu_head *head)
5118
{
5119
struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head);
5120
5121
kfree(epc);
5122
}
5123
5124
static void put_pmu_ctx(struct perf_event_pmu_context *epc)
5125
{
5126
struct perf_event_context *ctx = epc->ctx;
5127
unsigned long flags;
5128
5129
/*
5130
* XXX
5131
*
5132
* lockdep_assert_held(&ctx->mutex);
5133
*
5134
* can't because of the call-site in _free_event()/put_event()
5135
* which isn't always called under ctx->mutex.
5136
*/
5137
if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags))
5138
return;
5139
5140
WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry));
5141
5142
list_del_init(&epc->pmu_ctx_entry);
5143
epc->ctx = NULL;
5144
5145
WARN_ON_ONCE(!list_empty(&epc->pinned_active));
5146
WARN_ON_ONCE(!list_empty(&epc->flexible_active));
5147
5148
raw_spin_unlock_irqrestore(&ctx->lock, flags);
5149
5150
if (epc->embedded) {
5151
call_rcu(&epc->rcu_head, free_cpc_rcu);
5152
return;
5153
}
5154
5155
call_rcu(&epc->rcu_head, free_epc_rcu);
5156
}
5157
5158
static void perf_event_free_filter(struct perf_event *event);
5159
5160
static void free_event_rcu(struct rcu_head *head)
5161
{
5162
struct perf_event *event = container_of(head, typeof(*event), rcu_head);
5163
5164
if (event->ns)
5165
put_pid_ns(event->ns);
5166
perf_event_free_filter(event);
5167
kmem_cache_free(perf_event_cache, event);
5168
}
5169
5170
static void ring_buffer_attach(struct perf_event *event,
5171
struct perf_buffer *rb);
5172
5173
static void detach_sb_event(struct perf_event *event)
5174
{
5175
struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
5176
5177
raw_spin_lock(&pel->lock);
5178
list_del_rcu(&event->sb_list);
5179
raw_spin_unlock(&pel->lock);
5180
}
5181
5182
static bool is_sb_event(struct perf_event *event)
5183
{
5184
struct perf_event_attr *attr = &event->attr;
5185
5186
if (event->parent)
5187
return false;
5188
5189
if (event->attach_state & PERF_ATTACH_TASK)
5190
return false;
5191
5192
if (attr->mmap || attr->mmap_data || attr->mmap2 ||
5193
attr->comm || attr->comm_exec ||
5194
attr->task || attr->ksymbol ||
5195
attr->context_switch || attr->text_poke ||
5196
attr->bpf_event)
5197
return true;
5198
5199
return false;
5200
}
5201
5202
static void unaccount_pmu_sb_event(struct perf_event *event)
5203
{
5204
if (is_sb_event(event))
5205
detach_sb_event(event);
5206
}
5207
5208
#ifdef CONFIG_NO_HZ_FULL
5209
static DEFINE_SPINLOCK(nr_freq_lock);
5210
#endif
5211
5212
static void unaccount_freq_event_nohz(void)
5213
{
5214
#ifdef CONFIG_NO_HZ_FULL
5215
spin_lock(&nr_freq_lock);
5216
if (atomic_dec_and_test(&nr_freq_events))
5217
tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
5218
spin_unlock(&nr_freq_lock);
5219
#endif
5220
}
5221
5222
static void unaccount_freq_event(void)
5223
{
5224
if (tick_nohz_full_enabled())
5225
unaccount_freq_event_nohz();
5226
else
5227
atomic_dec(&nr_freq_events);
5228
}
5229
5230
5231
static struct perf_ctx_data *
5232
alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global)
5233
{
5234
struct perf_ctx_data *cd;
5235
5236
cd = kzalloc(sizeof(*cd), GFP_KERNEL);
5237
if (!cd)
5238
return NULL;
5239
5240
cd->data = kmem_cache_zalloc(ctx_cache, GFP_KERNEL);
5241
if (!cd->data) {
5242
kfree(cd);
5243
return NULL;
5244
}
5245
5246
cd->global = global;
5247
cd->ctx_cache = ctx_cache;
5248
refcount_set(&cd->refcount, 1);
5249
5250
return cd;
5251
}
5252
5253
static void free_perf_ctx_data(struct perf_ctx_data *cd)
5254
{
5255
kmem_cache_free(cd->ctx_cache, cd->data);
5256
kfree(cd);
5257
}
5258
5259
static void __free_perf_ctx_data_rcu(struct rcu_head *rcu_head)
5260
{
5261
struct perf_ctx_data *cd;
5262
5263
cd = container_of(rcu_head, struct perf_ctx_data, rcu_head);
5264
free_perf_ctx_data(cd);
5265
}
5266
5267
static inline void perf_free_ctx_data_rcu(struct perf_ctx_data *cd)
5268
{
5269
call_rcu(&cd->rcu_head, __free_perf_ctx_data_rcu);
5270
}
5271
5272
static int
5273
attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache,
5274
bool global)
5275
{
5276
struct perf_ctx_data *cd, *old = NULL;
5277
5278
cd = alloc_perf_ctx_data(ctx_cache, global);
5279
if (!cd)
5280
return -ENOMEM;
5281
5282
for (;;) {
5283
if (try_cmpxchg((struct perf_ctx_data **)&task->perf_ctx_data, &old, cd)) {
5284
if (old)
5285
perf_free_ctx_data_rcu(old);
5286
return 0;
5287
}
5288
5289
if (!old) {
5290
/*
5291
* After seeing a dead @old, we raced with
5292
* removal and lost; try again to install @cd.
5293
*/
5294
continue;
5295
}
5296
5297
if (refcount_inc_not_zero(&old->refcount)) {
5298
free_perf_ctx_data(cd); /* unused */
5299
return 0;
5300
}
5301
5302
/*
5303
* @old is a dead object, refcount==0 is stable, try and
5304
* replace it with @cd.
5305
*/
5306
}
5307
return 0;
5308
}
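/*
 * Concurrency walk-through (illustrative): if two CPUs race to attach data
 * to the same task, both allocate a @cd and try_cmpxchg() against a NULL
 * perf_ctx_data pointer; exactly one wins. The loser re-reads the winner's
 * object through @old, takes a reference with refcount_inc_not_zero() and
 * frees its own, now unused, allocation. Only when @old turns out to be
 * dead (a concurrent detach dropped the last reference) does the loop go
 * around again and try to install @cd in its place.
 */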
5309
5310
static void __detach_global_ctx_data(void);
5311
DEFINE_STATIC_PERCPU_RWSEM(global_ctx_data_rwsem);
5312
static refcount_t global_ctx_data_ref;
5313
5314
static int
5315
attach_global_ctx_data(struct kmem_cache *ctx_cache)
5316
{
5317
struct task_struct *g, *p;
5318
struct perf_ctx_data *cd;
5319
int ret;
5320
5321
if (refcount_inc_not_zero(&global_ctx_data_ref))
5322
return 0;
5323
5324
guard(percpu_write)(&global_ctx_data_rwsem);
5325
if (refcount_inc_not_zero(&global_ctx_data_ref))
5326
return 0;
5327
again:
5328
/* Allocate everything */
5329
scoped_guard (rcu) {
5330
for_each_process_thread(g, p) {
5331
cd = rcu_dereference(p->perf_ctx_data);
5332
if (cd && !cd->global) {
5333
cd->global = 1;
5334
if (!refcount_inc_not_zero(&cd->refcount))
5335
cd = NULL;
5336
}
5337
if (!cd) {
5338
get_task_struct(p);
5339
goto alloc;
5340
}
5341
}
5342
}
5343
5344
refcount_set(&global_ctx_data_ref, 1);
5345
5346
return 0;
5347
alloc:
5348
ret = attach_task_ctx_data(p, ctx_cache, true);
5349
put_task_struct(p);
5350
if (ret) {
5351
__detach_global_ctx_data();
5352
return ret;
5353
}
5354
goto again;
5355
}
5356
5357
static int
5358
attach_perf_ctx_data(struct perf_event *event)
5359
{
5360
struct task_struct *task = event->hw.target;
5361
struct kmem_cache *ctx_cache = event->pmu->task_ctx_cache;
5362
int ret;
5363
5364
if (!ctx_cache)
5365
return -ENOMEM;
5366
5367
if (task)
5368
return attach_task_ctx_data(task, ctx_cache, false);
5369
5370
ret = attach_global_ctx_data(ctx_cache);
5371
if (ret)
5372
return ret;
5373
5374
event->attach_state |= PERF_ATTACH_GLOBAL_DATA;
5375
return 0;
5376
}
5377
5378
static void
5379
detach_task_ctx_data(struct task_struct *p)
5380
{
5381
struct perf_ctx_data *cd;
5382
5383
scoped_guard (rcu) {
5384
cd = rcu_dereference(p->perf_ctx_data);
5385
if (!cd || !refcount_dec_and_test(&cd->refcount))
5386
return;
5387
}
5388
5389
/*
5390
* The old ctx_data may be lost because of the race.
5391
* Nothing needs to be done in that case.
5392
* See attach_task_ctx_data().
5393
*/
5394
if (try_cmpxchg((struct perf_ctx_data **)&p->perf_ctx_data, &cd, NULL))
5395
perf_free_ctx_data_rcu(cd);
5396
}
5397
5398
static void __detach_global_ctx_data(void)
5399
{
5400
struct task_struct *g, *p;
5401
struct perf_ctx_data *cd;
5402
5403
again:
5404
scoped_guard (rcu) {
5405
for_each_process_thread(g, p) {
5406
cd = rcu_dereference(p->perf_ctx_data);
5407
if (!cd || !cd->global)
5408
continue;
5409
cd->global = 0;
5410
get_task_struct(p);
5411
goto detach;
5412
}
5413
}
5414
return;
5415
detach:
5416
detach_task_ctx_data(p);
5417
put_task_struct(p);
5418
goto again;
5419
}
5420
5421
static void detach_global_ctx_data(void)
5422
{
5423
if (refcount_dec_not_one(&global_ctx_data_ref))
5424
return;
5425
5426
guard(percpu_write)(&global_ctx_data_rwsem);
5427
if (!refcount_dec_and_test(&global_ctx_data_ref))
5428
return;
5429
5430
/* remove everything */
5431
__detach_global_ctx_data();
5432
}
5433
5434
static void detach_perf_ctx_data(struct perf_event *event)
5435
{
5436
struct task_struct *task = event->hw.target;
5437
5438
event->attach_state &= ~PERF_ATTACH_TASK_DATA;
5439
5440
if (task)
5441
return detach_task_ctx_data(task);
5442
5443
if (event->attach_state & PERF_ATTACH_GLOBAL_DATA) {
5444
detach_global_ctx_data();
5445
event->attach_state &= ~PERF_ATTACH_GLOBAL_DATA;
5446
}
5447
}
5448
5449
static void unaccount_event(struct perf_event *event)
5450
{
5451
bool dec = false;
5452
5453
if (event->parent)
5454
return;
5455
5456
if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
5457
dec = true;
5458
if (event->attr.mmap || event->attr.mmap_data)
5459
atomic_dec(&nr_mmap_events);
5460
if (event->attr.build_id)
5461
atomic_dec(&nr_build_id_events);
5462
if (event->attr.comm)
5463
atomic_dec(&nr_comm_events);
5464
if (event->attr.namespaces)
5465
atomic_dec(&nr_namespaces_events);
5466
if (event->attr.cgroup)
5467
atomic_dec(&nr_cgroup_events);
5468
if (event->attr.task)
5469
atomic_dec(&nr_task_events);
5470
if (event->attr.freq)
5471
unaccount_freq_event();
5472
if (event->attr.context_switch) {
5473
dec = true;
5474
atomic_dec(&nr_switch_events);
5475
}
5476
if (is_cgroup_event(event))
5477
dec = true;
5478
if (has_branch_stack(event))
5479
dec = true;
5480
if (event->attr.ksymbol)
5481
atomic_dec(&nr_ksymbol_events);
5482
if (event->attr.bpf_event)
5483
atomic_dec(&nr_bpf_events);
5484
if (event->attr.text_poke)
5485
atomic_dec(&nr_text_poke_events);
5486
5487
if (dec) {
5488
if (!atomic_add_unless(&perf_sched_count, -1, 1))
5489
schedule_delayed_work(&perf_sched_work, HZ);
5490
}
5491
5492
unaccount_pmu_sb_event(event);
5493
}
5494
5495
static void perf_sched_delayed(struct work_struct *work)
5496
{
5497
mutex_lock(&perf_sched_mutex);
5498
if (atomic_dec_and_test(&perf_sched_count))
5499
static_branch_disable(&perf_sched_events);
5500
mutex_unlock(&perf_sched_mutex);
5501
}
5502
5503
/*
5504
* The following implement mutual exclusion of events on "exclusive" pmus
5505
* (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
5506
* at a time, so we disallow creating events that might conflict, namely:
5507
*
5508
* 1) cpu-wide events in the presence of per-task events,
5509
* 2) per-task events in the presence of cpu-wide events,
5510
* 3) two matching events on the same perf_event_context.
5511
*
5512
* The former two cases are handled in the allocation path (perf_event_alloc(),
5513
* _free_event()), the latter -- before the first perf_install_in_context().
5514
*/
5515
static int exclusive_event_init(struct perf_event *event)
5516
{
5517
struct pmu *pmu = event->pmu;
5518
5519
if (!is_exclusive_pmu(pmu))
5520
return 0;
5521
5522
/*
5523
* Prevent co-existence of per-task and cpu-wide events on the
5524
* same exclusive pmu.
5525
*
5526
* Negative pmu::exclusive_cnt means there are cpu-wide
5527
* events on this "exclusive" pmu, positive means there are
5528
* per-task events.
5529
*
5530
* Since this is called in perf_event_alloc() path, event::ctx
5531
* doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
5532
* to mean "per-task event", because unlike other attach states it
5533
* never gets cleared.
5534
*/
5535
if (event->attach_state & PERF_ATTACH_TASK) {
5536
if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
5537
return -EBUSY;
5538
} else {
5539
if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
5540
return -EBUSY;
5541
}
5542
5543
event->attach_state |= PERF_ATTACH_EXCLUSIVE;
5544
5545
return 0;
5546
}
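/*
 * Illustrative scenario: on an exclusive PMU, opening a per-task event
 * first drives pmu::exclusive_cnt from 0 to +1; a later attempt to open a
 * cpu-wide event on the same PMU then fails atomic_dec_unless_positive()
 * above and the caller sees -EBUSY (and vice versa for the opposite
 * ordering). Any number of events of the same kind may pass this check;
 * case 3) of the comment above is only enforced later, against a concrete
 * context, by exclusive_event_installable().
 */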
5547
5548
static void exclusive_event_destroy(struct perf_event *event)
5549
{
5550
struct pmu *pmu = event->pmu;
5551
5552
/* see comment in exclusive_event_init() */
5553
if (event->attach_state & PERF_ATTACH_TASK)
5554
atomic_dec(&pmu->exclusive_cnt);
5555
else
5556
atomic_inc(&pmu->exclusive_cnt);
5557
5558
event->attach_state &= ~PERF_ATTACH_EXCLUSIVE;
5559
}
5560
5561
static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
5562
{
5563
if ((e1->pmu == e2->pmu) &&
5564
(e1->cpu == e2->cpu ||
5565
e1->cpu == -1 ||
5566
e2->cpu == -1))
5567
return true;
5568
return false;
5569
}
5570
5571
static bool exclusive_event_installable(struct perf_event *event,
5572
struct perf_event_context *ctx)
5573
{
5574
struct perf_event *iter_event;
5575
struct pmu *pmu = event->pmu;
5576
5577
lockdep_assert_held(&ctx->mutex);
5578
5579
if (!is_exclusive_pmu(pmu))
5580
return true;
5581
5582
list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
5583
if (exclusive_event_match(iter_event, event))
5584
return false;
5585
}
5586
5587
return true;
5588
}
5589
5590
static void perf_free_addr_filters(struct perf_event *event);
5591
5592
/* vs perf_event_alloc() error */
5593
static void __free_event(struct perf_event *event)
5594
{
5595
struct pmu *pmu = event->pmu;
5596
5597
if (event->attach_state & PERF_ATTACH_CALLCHAIN)
5598
put_callchain_buffers();
5599
5600
kfree(event->addr_filter_ranges);
5601
5602
if (event->attach_state & PERF_ATTACH_EXCLUSIVE)
5603
exclusive_event_destroy(event);
5604
5605
if (is_cgroup_event(event))
5606
perf_detach_cgroup(event);
5607
5608
if (event->attach_state & PERF_ATTACH_TASK_DATA)
5609
detach_perf_ctx_data(event);
5610
5611
if (event->destroy)
5612
event->destroy(event);
5613
5614
/*
5615
* Must be after ->destroy(), due to uprobe_perf_close() using
5616
* hw.target.
5617
*/
5618
if (event->hw.target)
5619
put_task_struct(event->hw.target);
5620
5621
if (event->pmu_ctx) {
5622
/*
5623
* put_pmu_ctx() needs an event->ctx reference, because of
5624
* epc->ctx.
5625
*/
5626
WARN_ON_ONCE(!pmu);
5627
WARN_ON_ONCE(!event->ctx);
5628
WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx);
5629
put_pmu_ctx(event->pmu_ctx);
5630
}
5631
5632
/*
5633
* perf_event_free_task() relies on put_ctx() being 'last'; in
5634
* particular, all task references must be cleaned up.
5635
*/
5636
if (event->ctx)
5637
put_ctx(event->ctx);
5638
5639
if (pmu) {
5640
module_put(pmu->module);
5641
scoped_guard (spinlock, &pmu->events_lock) {
5642
list_del(&event->pmu_list);
5643
wake_up_var(pmu);
5644
}
5645
}
5646
5647
call_rcu(&event->rcu_head, free_event_rcu);
5648
}
5649
5650
DEFINE_FREE(__free_event, struct perf_event *, if (_T) __free_event(_T))
5651
5652
/* vs perf_event_alloc() success */
5653
static void _free_event(struct perf_event *event)
5654
{
5655
irq_work_sync(&event->pending_irq);
5656
irq_work_sync(&event->pending_disable_irq);
5657
5658
unaccount_event(event);
5659
5660
security_perf_event_free(event);
5661
5662
if (event->rb) {
5663
/*
5664
* Can happen when we close an event with re-directed output.
5665
*
5666
* Since we have a 0 refcount, perf_mmap_close() will skip
5667
* over us; possibly making our ring_buffer_put() the last.
5668
*/
5669
mutex_lock(&event->mmap_mutex);
5670
ring_buffer_attach(event, NULL);
5671
mutex_unlock(&event->mmap_mutex);
5672
}
5673
5674
perf_event_free_bpf_prog(event);
5675
perf_free_addr_filters(event);
5676
5677
__free_event(event);
5678
}
5679
5680
/*
5681
* Used to free events which have a known refcount of 1, such as in error paths
5682
* of inherited events.
5683
*/
5684
static void free_event(struct perf_event *event)
5685
{
5686
if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
5687
"unexpected event refcount: %ld; ptr=%p\n",
5688
atomic_long_read(&event->refcount), event)) {
5689
/* leak to avoid use-after-free */
5690
return;
5691
}
5692
5693
_free_event(event);
5694
}
5695
5696
/*
5697
* Remove user event from the owner task.
5698
*/
5699
static void perf_remove_from_owner(struct perf_event *event)
5700
{
5701
struct task_struct *owner;
5702
5703
rcu_read_lock();
5704
/*
5705
* Matches the smp_store_release() in perf_event_exit_task(). If we
5706
* observe !owner, it means the list deletion is complete and we can
5707
* indeed free this event, otherwise we need to serialize on
5708
* owner->perf_event_mutex.
5709
*/
5710
owner = READ_ONCE(event->owner);
5711
if (owner) {
5712
/*
5713
* Since delayed_put_task_struct() also drops the last
5714
* task reference we can safely take a new reference
5715
* while holding the rcu_read_lock().
5716
*/
5717
get_task_struct(owner);
5718
}
5719
rcu_read_unlock();
5720
5721
if (owner) {
5722
/*
5723
* If we're here through perf_event_exit_task() we're already
5724
* holding ctx->mutex, which would be an inversion wrt. the
5725
* normal lock order.
5726
*
5727
* However, we can safely take this lock because it's the child
5728
* ctx->mutex.
5729
*/
5730
mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
5731
5732
/*
5733
* We have to re-check the event->owner field; if it is cleared
5734
* we raced with perf_event_exit_task(). Acquiring the mutex
5735
* ensured they're done, and we can proceed with freeing the
5736
* event.
5737
*/
5738
if (event->owner) {
5739
list_del_init(&event->owner_entry);
5740
smp_store_release(&event->owner, NULL);
5741
}
5742
mutex_unlock(&owner->perf_event_mutex);
5743
put_task_struct(owner);
5744
}
5745
}
5746
5747
static void put_event(struct perf_event *event)
5748
{
5749
struct perf_event *parent;
5750
5751
if (!atomic_long_dec_and_test(&event->refcount))
5752
return;
5753
5754
parent = event->parent;
5755
_free_event(event);
5756
5757
/* Matches the refcount bump in inherit_event() */
5758
if (parent)
5759
put_event(parent);
5760
}
5761
5762
/*
5763
* Kill an event dead; while event::refcount will preserve the event
5764
* object, it will not preserve its functionality. Once the last 'user'
5765
* gives up the object, we'll destroy the thing.
5766
*/
5767
int perf_event_release_kernel(struct perf_event *event)
5768
{
5769
struct perf_event_context *ctx = event->ctx;
5770
struct perf_event *child, *tmp;
5771
5772
/*
5773
* If we got here through err_alloc: free_event(event); we will not
5774
* have attached to a context yet.
5775
*/
5776
if (!ctx) {
5777
WARN_ON_ONCE(event->attach_state &
5778
(PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
5779
goto no_ctx;
5780
}
5781
5782
if (!is_kernel_event(event))
5783
perf_remove_from_owner(event);
5784
5785
ctx = perf_event_ctx_lock(event);
5786
WARN_ON_ONCE(ctx->parent_ctx);
5787
5788
/*
5789
* Mark this event as STATE_DEAD, there is no external reference to it
5790
* anymore.
5791
*
5792
* Anybody acquiring event->child_mutex after the below loop _must_
5793
* also see this, most importantly inherit_event() which will avoid
5794
* placing more children on the list.
5795
*
5796
* Thus this guarantees that we will in fact observe and kill _ALL_
5797
* child events.
5798
*/
5799
if (event->state > PERF_EVENT_STATE_REVOKED) {
5800
perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
5801
} else {
5802
event->state = PERF_EVENT_STATE_DEAD;
5803
}
5804
5805
perf_event_ctx_unlock(event, ctx);
5806
5807
again:
5808
mutex_lock(&event->child_mutex);
5809
list_for_each_entry(child, &event->child_list, child_list) {
5810
/*
5811
* Cannot change, child events are not migrated, see the
5812
* comment with perf_event_ctx_lock_nested().
5813
*/
5814
ctx = READ_ONCE(child->ctx);
5815
/*
5816
* Since child_mutex nests inside ctx::mutex, we must jump
5817
* through hoops. We start by grabbing a reference on the ctx.
5818
*
5819
* Since the event cannot get freed while we hold the
5820
* child_mutex, the context must also exist and have a !0
5821
* reference count.
5822
*/
5823
get_ctx(ctx);
5824
5825
/*
5826
* Now that we have a ctx ref, we can drop child_mutex, and
5827
* acquire ctx::mutex without fear of it going away. Then we
5828
* can re-acquire child_mutex.
5829
*/
5830
mutex_unlock(&event->child_mutex);
5831
mutex_lock(&ctx->mutex);
5832
mutex_lock(&event->child_mutex);
5833
5834
/*
5835
* Now that we hold ctx::mutex and child_mutex, revalidate our
5836
* state, if child is still the first entry, it didn't get freed
5837
* and we can continue doing so.
5838
*/
5839
tmp = list_first_entry_or_null(&event->child_list,
5840
struct perf_event, child_list);
5841
if (tmp == child) {
5842
perf_remove_from_context(child, DETACH_GROUP | DETACH_CHILD);
5843
} else {
5844
child = NULL;
5845
}
5846
5847
mutex_unlock(&event->child_mutex);
5848
mutex_unlock(&ctx->mutex);
5849
5850
if (child) {
5851
/* Last reference unless ->pending_task work is pending */
5852
put_event(child);
5853
}
5854
put_ctx(ctx);
5855
5856
goto again;
5857
}
5858
mutex_unlock(&event->child_mutex);
5859
5860
no_ctx:
5861
/*
5862
* Last reference unless ->pending_task work is pending on this event
5863
* or any of its children.
5864
*/
5865
put_event(event);
5866
return 0;
5867
}
5868
EXPORT_SYMBOL_GPL(perf_event_release_kernel);
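/*
 * Usage sketch (illustrative, not from this file): in-kernel users pair
 * this with perf_event_create_kernel_counter(). A driver counting cycles
 * on CPU 0, for example, might do something like:
 *
 *	struct perf_event_attr attr = {
 *		.type	= PERF_TYPE_HARDWARE,
 *		.config	= PERF_COUNT_HW_CPU_CYCLES,
 *		.size	= sizeof(attr),
 *	};
 *	struct perf_event *ev;
 *
 *	ev = perf_event_create_kernel_counter(&attr, 0, NULL, NULL, NULL);
 *	if (!IS_ERR(ev)) {
 *		... use the event ...
 *		perf_event_release_kernel(ev);
 *	}
 */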
5869
5870
/*
5871
* Called when the last reference to the file is gone.
5872
*/
5873
static int perf_release(struct inode *inode, struct file *file)
5874
{
5875
perf_event_release_kernel(file->private_data);
5876
return 0;
5877
}
5878
5879
static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
5880
{
5881
struct perf_event *child;
5882
u64 total = 0;
5883
5884
*enabled = 0;
5885
*running = 0;
5886
5887
mutex_lock(&event->child_mutex);
5888
5889
(void)perf_event_read(event, false);
5890
total += perf_event_count(event, false);
5891
5892
*enabled += event->total_time_enabled +
5893
atomic64_read(&event->child_total_time_enabled);
5894
*running += event->total_time_running +
5895
atomic64_read(&event->child_total_time_running);
5896
5897
list_for_each_entry(child, &event->child_list, child_list) {
5898
(void)perf_event_read(child, false);
5899
total += perf_event_count(child, false);
5900
*enabled += child->total_time_enabled;
5901
*running += child->total_time_running;
5902
}
5903
mutex_unlock(&event->child_mutex);
5904
5905
return total;
5906
}
5907
5908
u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
5909
{
5910
struct perf_event_context *ctx;
5911
u64 count;
5912
5913
ctx = perf_event_ctx_lock(event);
5914
count = __perf_event_read_value(event, enabled, running);
5915
perf_event_ctx_unlock(event, ctx);
5916
5917
return count;
5918
}
5919
EXPORT_SYMBOL_GPL(perf_event_read_value);
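/*
 * Usage sketch (illustrative): the enabled/running times returned here let
 * in-kernel callers compensate for multiplexing, much like userspace does
 * with PERF_FORMAT_TOTAL_TIME_ENABLED/RUNNING. "ev" is a hypothetical
 * event obtained elsewhere (e.g. from perf_event_create_kernel_counter()):
 *
 *	u64 enabled, running, count;
 *
 *	count = perf_event_read_value(ev, &enabled, &running);
 *	if (running && running < enabled)
 *		... count under-reports; scale by enabled/running ...
 */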
5920
5921
static int __perf_read_group_add(struct perf_event *leader,
5922
u64 read_format, u64 *values)
5923
{
5924
struct perf_event_context *ctx = leader->ctx;
5925
struct perf_event *sub, *parent;
5926
unsigned long flags;
5927
int n = 1; /* skip @nr */
5928
int ret;
5929
5930
ret = perf_event_read(leader, true);
5931
if (ret)
5932
return ret;
5933
5934
raw_spin_lock_irqsave(&ctx->lock, flags);
5935
/*
5936
* Verify the grouping between the parent and child (inherited)
5937
* events is still intact.
5938
*
5939
* Specifically:
5940
* - leader->ctx->lock pins leader->sibling_list
5941
* - parent->child_mutex pins parent->child_list
5942
* - parent->ctx->mutex pins parent->sibling_list
5943
*
5944
* Because parent->ctx != leader->ctx (and child_list nests inside
5945
* ctx->mutex), group destruction is not atomic between children; also
5946
* see perf_event_release_kernel(). Additionally, parent can grow the
5947
* group.
5948
*
5949
* Therefore, it is possible to have parent and child groups in a
5950
* different configuration, and summing over such a beast makes no sense
5951
* whatsoever.
5952
*
5953
* Reject this.
5954
*/
5955
parent = leader->parent;
5956
if (parent &&
5957
(parent->group_generation != leader->group_generation ||
5958
parent->nr_siblings != leader->nr_siblings)) {
5959
ret = -ECHILD;
5960
goto unlock;
5961
}
5962
5963
/*
5964
* Since we co-schedule groups, {enabled,running} times of siblings
5965
* will be identical to those of the leader, so we only publish one
5966
* set.
5967
*/
5968
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5969
values[n++] += leader->total_time_enabled +
5970
atomic64_read(&leader->child_total_time_enabled);
5971
}
5972
5973
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5974
values[n++] += leader->total_time_running +
5975
atomic64_read(&leader->child_total_time_running);
5976
}
5977
5978
/*
5979
* Write {count,id} tuples for every sibling.
5980
*/
5981
values[n++] += perf_event_count(leader, false);
5982
if (read_format & PERF_FORMAT_ID)
5983
values[n++] = primary_event_id(leader);
5984
if (read_format & PERF_FORMAT_LOST)
5985
values[n++] = atomic64_read(&leader->lost_samples);
5986
5987
for_each_sibling_event(sub, leader) {
5988
values[n++] += perf_event_count(sub, false);
5989
if (read_format & PERF_FORMAT_ID)
5990
values[n++] = primary_event_id(sub);
5991
if (read_format & PERF_FORMAT_LOST)
5992
values[n++] = atomic64_read(&sub->lost_samples);
5993
}
5994
5995
unlock:
5996
raw_spin_unlock_irqrestore(&ctx->lock, flags);
5997
return ret;
5998
}
5999
6000
static int perf_read_group(struct perf_event *event,
6001
u64 read_format, char __user *buf)
6002
{
6003
struct perf_event *leader = event->group_leader, *child;
6004
struct perf_event_context *ctx = leader->ctx;
6005
int ret;
6006
u64 *values;
6007
6008
lockdep_assert_held(&ctx->mutex);
6009
6010
values = kzalloc(event->read_size, GFP_KERNEL);
6011
if (!values)
6012
return -ENOMEM;
6013
6014
values[0] = 1 + leader->nr_siblings;
6015
6016
mutex_lock(&leader->child_mutex);
6017
6018
ret = __perf_read_group_add(leader, read_format, values);
6019
if (ret)
6020
goto unlock;
6021
6022
list_for_each_entry(child, &leader->child_list, child_list) {
6023
ret = __perf_read_group_add(child, read_format, values);
6024
if (ret)
6025
goto unlock;
6026
}
6027
6028
mutex_unlock(&leader->child_mutex);
6029
6030
ret = event->read_size;
6031
if (copy_to_user(buf, values, event->read_size))
6032
ret = -EFAULT;
6033
goto out;
6034
6035
unlock:
6036
mutex_unlock(&leader->child_mutex);
6037
out:
6038
kfree(values);
6039
return ret;
6040
}
6041
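/*
 * Editor-added illustrative sketch, not part of this file: parsing the
 * buffer that perf_read_group() above copies to user space.  This layout
 * assumes attr.read_format set to PERF_FORMAT_GROUP |
 * PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING |
 * PERF_FORMAT_ID | PERF_FORMAT_LOST; "group_fd" is assumed to be the
 * group leader's fd from perf_event_open().
 */
#if 0	/* userspace example */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

struct group_entry { uint64_t value, id, lost; };

static void dump_group(int group_fd, size_t max_events)
{
	size_t bufsz = 3 * sizeof(uint64_t) +
		       max_events * sizeof(struct group_entry);
	uint64_t *buf = malloc(bufsz);

	if (buf && read(group_fd, buf, bufsz) > 0) {
		/* buf[0] = nr, buf[1] = time_enabled, buf[2] = time_running */
		struct group_entry *e = (struct group_entry *)&buf[3];

		for (uint64_t i = 0; i < buf[0]; i++)
			printf("id %llu: %llu (lost %llu)\n",
			       (unsigned long long)e[i].id,
			       (unsigned long long)e[i].value,
			       (unsigned long long)e[i].lost);
	}
	free(buf);
}
#endif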
6042
static int perf_read_one(struct perf_event *event,
6043
u64 read_format, char __user *buf)
6044
{
6045
u64 enabled, running;
6046
u64 values[5];
6047
int n = 0;
6048
6049
values[n++] = __perf_event_read_value(event, &enabled, &running);
6050
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
6051
values[n++] = enabled;
6052
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
6053
values[n++] = running;
6054
if (read_format & PERF_FORMAT_ID)
6055
values[n++] = primary_event_id(event);
6056
if (read_format & PERF_FORMAT_LOST)
6057
values[n++] = atomic64_read(&event->lost_samples);
6058
6059
if (copy_to_user(buf, values, n * sizeof(u64)))
6060
return -EFAULT;
6061
6062
return n * sizeof(u64);
6063
}
6064
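/*
 * Editor-added illustrative sketch, not part of this file: the layout that
 * perf_read_one() above produces for a single (non-group) event, assuming
 * attr.read_format enables all four optional fields.  "fd" is assumed to
 * come from perf_event_open().
 */
#if 0	/* userspace example */
#include <stdint.h>
#include <unistd.h>

struct single_read {
	uint64_t value;		/* always present */
	uint64_t time_enabled;	/* PERF_FORMAT_TOTAL_TIME_ENABLED */
	uint64_t time_running;	/* PERF_FORMAT_TOTAL_TIME_RUNNING */
	uint64_t id;		/* PERF_FORMAT_ID */
	uint64_t lost;		/* PERF_FORMAT_LOST */
};

static int read_single_counter(int fd, struct single_read *out)
{
	return read(fd, out, sizeof(*out)) == sizeof(*out) ? 0 : -1;
}
#endif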
6065
static bool is_event_hup(struct perf_event *event)
6066
{
6067
bool no_children;
6068
6069
if (event->state > PERF_EVENT_STATE_EXIT)
6070
return false;
6071
6072
mutex_lock(&event->child_mutex);
6073
no_children = list_empty(&event->child_list);
6074
mutex_unlock(&event->child_mutex);
6075
return no_children;
6076
}
6077
6078
/*
6079
* Read the performance event - simple non-blocking version for now
6080
*/
6081
static ssize_t
6082
__perf_read(struct perf_event *event, char __user *buf, size_t count)
6083
{
6084
u64 read_format = event->attr.read_format;
6085
int ret;
6086
6087
/*
6088
* Return end-of-file for a read on an event that is in
6089
* error state (i.e. because it was pinned but it couldn't be
6090
* scheduled on to the CPU at some point).
6091
*/
6092
if (event->state == PERF_EVENT_STATE_ERROR)
6093
return 0;
6094
6095
if (count < event->read_size)
6096
return -ENOSPC;
6097
6098
WARN_ON_ONCE(event->ctx->parent_ctx);
6099
if (read_format & PERF_FORMAT_GROUP)
6100
ret = perf_read_group(event, read_format, buf);
6101
else
6102
ret = perf_read_one(event, read_format, buf);
6103
6104
return ret;
6105
}
6106
6107
static ssize_t
6108
perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
6109
{
6110
struct perf_event *event = file->private_data;
6111
struct perf_event_context *ctx;
6112
int ret;
6113
6114
ret = security_perf_event_read(event);
6115
if (ret)
6116
return ret;
6117
6118
ctx = perf_event_ctx_lock(event);
6119
ret = __perf_read(event, buf, count);
6120
perf_event_ctx_unlock(event, ctx);
6121
6122
return ret;
6123
}
6124
6125
static __poll_t perf_poll(struct file *file, poll_table *wait)
6126
{
6127
struct perf_event *event = file->private_data;
6128
struct perf_buffer *rb;
6129
__poll_t events = EPOLLHUP;
6130
6131
if (event->state <= PERF_EVENT_STATE_REVOKED)
6132
return EPOLLERR;
6133
6134
poll_wait(file, &event->waitq, wait);
6135
6136
if (event->state <= PERF_EVENT_STATE_REVOKED)
6137
return EPOLLERR;
6138
6139
if (is_event_hup(event))
6140
return events;
6141
6142
if (unlikely(READ_ONCE(event->state) == PERF_EVENT_STATE_ERROR &&
6143
event->attr.pinned))
6144
return EPOLLERR;
6145
6146
/*
6147
* Pin the event->rb by taking event->mmap_mutex; otherwise
6148
* perf_event_set_output() can swizzle our rb and make us miss wakeups.
6149
*/
6150
mutex_lock(&event->mmap_mutex);
6151
rb = event->rb;
6152
if (rb)
6153
events = atomic_xchg(&rb->poll, 0);
6154
mutex_unlock(&event->mmap_mutex);
6155
return events;
6156
}
6157
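/*
 * Editor-added illustrative sketch, not part of this file: waiting for
 * ring-buffer wakeups via poll(), which is served by perf_poll() above.
 * POLLHUP is reported once the event has exited and has no children left.
 * "fd" is assumed to be a perf event fd with an mmap()ed buffer.
 */
#if 0	/* userspace example */
#include <poll.h>

static int wait_for_samples(int fd, int timeout_ms)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	int ret = poll(&pfd, 1, timeout_ms);

	if (ret > 0 && (pfd.revents & POLLHUP))
		return 0;	/* event is done; drain what's left and stop */

	return ret;		/* >0: data ready, 0: timeout, <0: error */
}
#endif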
6158
static void _perf_event_reset(struct perf_event *event)
6159
{
6160
(void)perf_event_read(event, false);
6161
local64_set(&event->count, 0);
6162
perf_event_update_userpage(event);
6163
}
6164
6165
/* Assume it's not an event with inherit set. */
6166
u64 perf_event_pause(struct perf_event *event, bool reset)
6167
{
6168
struct perf_event_context *ctx;
6169
u64 count;
6170
6171
ctx = perf_event_ctx_lock(event);
6172
WARN_ON_ONCE(event->attr.inherit);
6173
_perf_event_disable(event);
6174
count = local64_read(&event->count);
6175
if (reset)
6176
local64_set(&event->count, 0);
6177
perf_event_ctx_unlock(event, ctx);
6178
6179
return count;
6180
}
6181
EXPORT_SYMBOL_GPL(perf_event_pause);
6182
6183
/*
6184
* Holding the top-level event's child_mutex means that any
6185
* descendant process that has inherited this event will block
6186
* in perf_event_exit_event() if it goes to exit, thus satisfying the
6187
* task existence requirements of perf_event_enable/disable.
6188
*/
6189
static void perf_event_for_each_child(struct perf_event *event,
6190
void (*func)(struct perf_event *))
6191
{
6192
struct perf_event *child;
6193
6194
WARN_ON_ONCE(event->ctx->parent_ctx);
6195
6196
mutex_lock(&event->child_mutex);
6197
func(event);
6198
list_for_each_entry(child, &event->child_list, child_list)
6199
func(child);
6200
mutex_unlock(&event->child_mutex);
6201
}
6202
6203
static void perf_event_for_each(struct perf_event *event,
6204
void (*func)(struct perf_event *))
6205
{
6206
struct perf_event_context *ctx = event->ctx;
6207
struct perf_event *sibling;
6208
6209
lockdep_assert_held(&ctx->mutex);
6210
6211
event = event->group_leader;
6212
6213
perf_event_for_each_child(event, func);
6214
for_each_sibling_event(sibling, event)
6215
perf_event_for_each_child(sibling, func);
6216
}
6217
6218
static void __perf_event_period(struct perf_event *event,
6219
struct perf_cpu_context *cpuctx,
6220
struct perf_event_context *ctx,
6221
void *info)
6222
{
6223
u64 value = *((u64 *)info);
6224
bool active;
6225
6226
if (event->attr.freq) {
6227
event->attr.sample_freq = value;
6228
} else {
6229
event->attr.sample_period = value;
6230
event->hw.sample_period = value;
6231
}
6232
6233
active = (event->state == PERF_EVENT_STATE_ACTIVE);
6234
if (active) {
6235
perf_pmu_disable(event->pmu);
6236
event->pmu->stop(event, PERF_EF_UPDATE);
6237
}
6238
6239
local64_set(&event->hw.period_left, 0);
6240
6241
if (active) {
6242
event->pmu->start(event, PERF_EF_RELOAD);
6243
/*
6244
* Once the period is force-reset, the event starts immediately.
6245
* But the event/group could be throttled. Unthrottle the
6246
* event/group now to avoid the next tick trying to unthrottle
6247
* while we already re-started the event/group.
6248
*/
6249
if (event->hw.interrupts == MAX_INTERRUPTS)
6250
perf_event_unthrottle_group(event, true);
6251
perf_pmu_enable(event->pmu);
6252
}
6253
}
6254
6255
static int perf_event_check_period(struct perf_event *event, u64 value)
6256
{
6257
return event->pmu->check_period(event, value);
6258
}
6259
6260
static int _perf_event_period(struct perf_event *event, u64 value)
6261
{
6262
if (!is_sampling_event(event))
6263
return -EINVAL;
6264
6265
if (!value)
6266
return -EINVAL;
6267
6268
if (event->attr.freq) {
6269
if (value > sysctl_perf_event_sample_rate)
6270
return -EINVAL;
6271
} else {
6272
if (perf_event_check_period(event, value))
6273
return -EINVAL;
6274
if (value & (1ULL << 63))
6275
return -EINVAL;
6276
}
6277
6278
event_function_call(event, __perf_event_period, &value);
6279
6280
return 0;
6281
}
6282
6283
int perf_event_period(struct perf_event *event, u64 value)
6284
{
6285
struct perf_event_context *ctx;
6286
int ret;
6287
6288
ctx = perf_event_ctx_lock(event);
6289
ret = _perf_event_period(event, value);
6290
perf_event_ctx_unlock(event, ctx);
6291
6292
return ret;
6293
}
6294
EXPORT_SYMBOL_GPL(perf_event_period);
6295
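/*
 * Editor-added illustrative sketch, not part of this file: the user-space
 * path into _perf_event_period() above is PERF_EVENT_IOC_PERIOD, which takes
 * a pointer to the new u64 period.  "fd" is assumed to be a sampling event
 * fd from perf_event_open().
 */
#if 0	/* userspace example */
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>

static int set_sample_period(int fd, uint64_t period)
{
	/* Fails with EINVAL for non-sampling events or a zero period. */
	return ioctl(fd, PERF_EVENT_IOC_PERIOD, &period);
}
#endif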
6296
static const struct file_operations perf_fops;
6297
6298
static inline bool is_perf_file(struct fd f)
6299
{
6300
return !fd_empty(f) && fd_file(f)->f_op == &perf_fops;
6301
}
6302
6303
static int perf_event_set_output(struct perf_event *event,
6304
struct perf_event *output_event);
6305
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
6306
static int perf_copy_attr(struct perf_event_attr __user *uattr,
6307
struct perf_event_attr *attr);
6308
static int __perf_event_set_bpf_prog(struct perf_event *event,
6309
struct bpf_prog *prog,
6310
u64 bpf_cookie);
6311
6312
static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
6313
{
6314
void (*func)(struct perf_event *);
6315
u32 flags = arg;
6316
6317
if (event->state <= PERF_EVENT_STATE_REVOKED)
6318
return -ENODEV;
6319
6320
switch (cmd) {
6321
case PERF_EVENT_IOC_ENABLE:
6322
func = _perf_event_enable;
6323
break;
6324
case PERF_EVENT_IOC_DISABLE:
6325
func = _perf_event_disable;
6326
break;
6327
case PERF_EVENT_IOC_RESET:
6328
func = _perf_event_reset;
6329
break;
6330
6331
case PERF_EVENT_IOC_REFRESH:
6332
return _perf_event_refresh(event, arg);
6333
6334
case PERF_EVENT_IOC_PERIOD:
6335
{
6336
u64 value;
6337
6338
if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
6339
return -EFAULT;
6340
6341
return _perf_event_period(event, value);
6342
}
6343
case PERF_EVENT_IOC_ID:
6344
{
6345
u64 id = primary_event_id(event);
6346
6347
if (copy_to_user((void __user *)arg, &id, sizeof(id)))
6348
return -EFAULT;
6349
return 0;
6350
}
6351
6352
case PERF_EVENT_IOC_SET_OUTPUT:
6353
{
6354
CLASS(fd, output)(arg); // arg == -1 => empty
6355
struct perf_event *output_event = NULL;
6356
if (arg != -1) {
6357
if (!is_perf_file(output))
6358
return -EBADF;
6359
output_event = fd_file(output)->private_data;
6360
}
6361
return perf_event_set_output(event, output_event);
6362
}
6363
6364
case PERF_EVENT_IOC_SET_FILTER:
6365
return perf_event_set_filter(event, (void __user *)arg);
6366
6367
case PERF_EVENT_IOC_SET_BPF:
6368
{
6369
struct bpf_prog *prog;
6370
int err;
6371
6372
prog = bpf_prog_get(arg);
6373
if (IS_ERR(prog))
6374
return PTR_ERR(prog);
6375
6376
err = __perf_event_set_bpf_prog(event, prog, 0);
6377
if (err) {
6378
bpf_prog_put(prog);
6379
return err;
6380
}
6381
6382
return 0;
6383
}
6384
6385
case PERF_EVENT_IOC_PAUSE_OUTPUT: {
6386
struct perf_buffer *rb;
6387
6388
rcu_read_lock();
6389
rb = rcu_dereference(event->rb);
6390
if (!rb || !rb->nr_pages) {
6391
rcu_read_unlock();
6392
return -EINVAL;
6393
}
6394
rb_toggle_paused(rb, !!arg);
6395
rcu_read_unlock();
6396
return 0;
6397
}
6398
6399
case PERF_EVENT_IOC_QUERY_BPF:
6400
return perf_event_query_prog_array(event, (void __user *)arg);
6401
6402
case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
6403
struct perf_event_attr new_attr;
6404
int err = perf_copy_attr((struct perf_event_attr __user *)arg,
6405
&new_attr);
6406
6407
if (err)
6408
return err;
6409
6410
return perf_event_modify_attr(event, &new_attr);
6411
}
6412
default:
6413
return -ENOTTY;
6414
}
6415
6416
if (flags & PERF_IOC_FLAG_GROUP)
6417
perf_event_for_each(event, func);
6418
else
6419
perf_event_for_each_child(event, func);
6420
6421
return 0;
6422
}
6423
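/*
 * Editor-added illustrative sketch, not part of this file: redirecting one
 * event's output into another event's ring buffer through
 * PERF_EVENT_IOC_SET_OUTPUT, handled in _perf_ioctl() above.  Both fds are
 * assumed to come from perf_event_open() on compatible contexts.
 */
#if 0	/* userspace example */
#include <sys/ioctl.h>
#include <linux/perf_event.h>

static int redirect_output(int event_fd, int target_fd)
{
	/* target_fd == -1 clears a previous redirection. */
	return ioctl(event_fd, PERF_EVENT_IOC_SET_OUTPUT, target_fd);
}
#endif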
6424
static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
6425
{
6426
struct perf_event *event = file->private_data;
6427
struct perf_event_context *ctx;
6428
long ret;
6429
6430
/* Treat ioctl like writes as it is likely a mutating operation. */
6431
ret = security_perf_event_write(event);
6432
if (ret)
6433
return ret;
6434
6435
ctx = perf_event_ctx_lock(event);
6436
ret = _perf_ioctl(event, cmd, arg);
6437
perf_event_ctx_unlock(event, ctx);
6438
6439
return ret;
6440
}
6441
6442
#ifdef CONFIG_COMPAT
6443
static long perf_compat_ioctl(struct file *file, unsigned int cmd,
6444
unsigned long arg)
6445
{
6446
switch (_IOC_NR(cmd)) {
6447
case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
6448
case _IOC_NR(PERF_EVENT_IOC_ID):
6449
case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
6450
case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
6451
/* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case) */
6452
if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
6453
cmd &= ~IOCSIZE_MASK;
6454
cmd |= sizeof(void *) << IOCSIZE_SHIFT;
6455
}
6456
break;
6457
}
6458
return perf_ioctl(file, cmd, arg);
6459
}
6460
#else
6461
# define perf_compat_ioctl NULL
6462
#endif
6463
6464
int perf_event_task_enable(void)
6465
{
6466
struct perf_event_context *ctx;
6467
struct perf_event *event;
6468
6469
mutex_lock(&current->perf_event_mutex);
6470
list_for_each_entry(event, &current->perf_event_list, owner_entry) {
6471
ctx = perf_event_ctx_lock(event);
6472
perf_event_for_each_child(event, _perf_event_enable);
6473
perf_event_ctx_unlock(event, ctx);
6474
}
6475
mutex_unlock(&current->perf_event_mutex);
6476
6477
return 0;
6478
}
6479
6480
int perf_event_task_disable(void)
6481
{
6482
struct perf_event_context *ctx;
6483
struct perf_event *event;
6484
6485
mutex_lock(&current->perf_event_mutex);
6486
list_for_each_entry(event, &current->perf_event_list, owner_entry) {
6487
ctx = perf_event_ctx_lock(event);
6488
perf_event_for_each_child(event, _perf_event_disable);
6489
perf_event_ctx_unlock(event, ctx);
6490
}
6491
mutex_unlock(&current->perf_event_mutex);
6492
6493
return 0;
6494
}
6495
6496
static int perf_event_index(struct perf_event *event)
6497
{
6498
if (event->hw.state & PERF_HES_STOPPED)
6499
return 0;
6500
6501
if (event->state != PERF_EVENT_STATE_ACTIVE)
6502
return 0;
6503
6504
return event->pmu->event_idx(event);
6505
}
6506
6507
static void perf_event_init_userpage(struct perf_event *event)
6508
{
6509
struct perf_event_mmap_page *userpg;
6510
struct perf_buffer *rb;
6511
6512
rcu_read_lock();
6513
rb = rcu_dereference(event->rb);
6514
if (!rb)
6515
goto unlock;
6516
6517
userpg = rb->user_page;
6518
6519
/* Allow new userspace to detect that bit 0 is deprecated */
6520
userpg->cap_bit0_is_deprecated = 1;
6521
userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
6522
userpg->data_offset = PAGE_SIZE;
6523
userpg->data_size = perf_data_size(rb);
6524
6525
unlock:
6526
rcu_read_unlock();
6527
}
6528
6529
void __weak arch_perf_update_userpage(
6530
struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
6531
{
6532
}
6533
6534
/*
6535
* Callers need to ensure there can be no nesting of this function, otherwise
6536
* the seqlock logic goes bad. We cannot serialize this because the arch
6537
* code calls this from NMI context.
6538
*/
6539
void perf_event_update_userpage(struct perf_event *event)
6540
{
6541
struct perf_event_mmap_page *userpg;
6542
struct perf_buffer *rb;
6543
u64 enabled, running, now;
6544
6545
rcu_read_lock();
6546
rb = rcu_dereference(event->rb);
6547
if (!rb)
6548
goto unlock;
6549
6550
/*
6551
* compute total_time_enabled, total_time_running
6552
* based on snapshot values taken when the event
6553
* was last scheduled in.
6554
*
6555
* we cannot simply call update_context_time()
6556
* because of a locking issue, as we can be called in
6557
* NMI context
6558
*/
6559
calc_timer_values(event, &now, &enabled, &running);
6560
6561
userpg = rb->user_page;
6562
/*
6563
* Disable preemption to guarantee consistent time stamps are stored to
6564
* the user page.
6565
*/
6566
preempt_disable();
6567
++userpg->lock;
6568
barrier();
6569
userpg->index = perf_event_index(event);
6570
userpg->offset = perf_event_count(event, false);
6571
if (userpg->index)
6572
userpg->offset -= local64_read(&event->hw.prev_count);
6573
6574
userpg->time_enabled = enabled +
6575
atomic64_read(&event->child_total_time_enabled);
6576
6577
userpg->time_running = running +
6578
atomic64_read(&event->child_total_time_running);
6579
6580
arch_perf_update_userpage(event, userpg, now);
6581
6582
barrier();
6583
++userpg->lock;
6584
preempt_enable();
6585
unlock:
6586
rcu_read_unlock();
6587
}
6588
EXPORT_SYMBOL_GPL(perf_event_update_userpage);
6589
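/*
 * Editor-added illustrative sketch, not part of this file: the user-space
 * side of the ++userpg->lock / barrier() pairs above.  A reader of the
 * mmap()ed control page must retry until it sees the same sequence count
 * before and after reading; "pc" is assumed to be the first page of a perf
 * mmap() region.
 */
#if 0	/* userspace example */
#include <stdint.h>
#include <linux/perf_event.h>

static void read_times(volatile struct perf_event_mmap_page *pc,
		       uint64_t *enabled, uint64_t *running)
{
	uint32_t seq;

	do {
		seq = pc->lock;
		__sync_synchronize();	/* pairs with the kernel's barrier() */

		*enabled = pc->time_enabled;
		*running = pc->time_running;

		__sync_synchronize();
	} while (pc->lock != seq);
}
#endif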
6590
static void ring_buffer_attach(struct perf_event *event,
6591
struct perf_buffer *rb)
6592
{
6593
struct perf_buffer *old_rb = NULL;
6594
unsigned long flags;
6595
6596
WARN_ON_ONCE(event->parent);
6597
6598
if (event->rb) {
6599
/*
6600
* Should be impossible, we set this when removing
6601
* event->rb_entry and wait/clear when adding event->rb_entry.
6602
*/
6603
WARN_ON_ONCE(event->rcu_pending);
6604
6605
old_rb = event->rb;
6606
spin_lock_irqsave(&old_rb->event_lock, flags);
6607
list_del_rcu(&event->rb_entry);
6608
spin_unlock_irqrestore(&old_rb->event_lock, flags);
6609
6610
event->rcu_batches = get_state_synchronize_rcu();
6611
event->rcu_pending = 1;
6612
}
6613
6614
if (rb) {
6615
if (event->rcu_pending) {
6616
cond_synchronize_rcu(event->rcu_batches);
6617
event->rcu_pending = 0;
6618
}
6619
6620
spin_lock_irqsave(&rb->event_lock, flags);
6621
list_add_rcu(&event->rb_entry, &rb->event_list);
6622
spin_unlock_irqrestore(&rb->event_lock, flags);
6623
}
6624
6625
/*
6626
* Avoid racing with perf_mmap_close(AUX): stop the event
6627
* before swizzling the event::rb pointer; if it's getting
6628
* unmapped, its aux_mmap_count will be 0 and it won't
6629
* restart. See the comment in __perf_pmu_output_stop().
6630
*
6631
* Data will inevitably be lost when set_output is done in
6632
* mid-air, but then again, whoever does it like this is
6633
* not in for the data anyway.
6634
*/
6635
if (has_aux(event))
6636
perf_event_stop(event, 0);
6637
6638
rcu_assign_pointer(event->rb, rb);
6639
6640
if (old_rb) {
6641
ring_buffer_put(old_rb);
6642
/*
6643
* Because we detached before setting the new rb (so that we
6644
* could attach the new rb), we could have missed a wakeup.
6645
* Provide it now.
6646
*/
6647
wake_up_all(&event->waitq);
6648
}
6649
}
6650
6651
static void ring_buffer_wakeup(struct perf_event *event)
6652
{
6653
struct perf_buffer *rb;
6654
6655
if (event->parent)
6656
event = event->parent;
6657
6658
rcu_read_lock();
6659
rb = rcu_dereference(event->rb);
6660
if (rb) {
6661
list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
6662
wake_up_all(&event->waitq);
6663
}
6664
rcu_read_unlock();
6665
}
6666
6667
struct perf_buffer *ring_buffer_get(struct perf_event *event)
6668
{
6669
struct perf_buffer *rb;
6670
6671
if (event->parent)
6672
event = event->parent;
6673
6674
rcu_read_lock();
6675
rb = rcu_dereference(event->rb);
6676
if (rb) {
6677
if (!refcount_inc_not_zero(&rb->refcount))
6678
rb = NULL;
6679
}
6680
rcu_read_unlock();
6681
6682
return rb;
6683
}
6684
6685
void ring_buffer_put(struct perf_buffer *rb)
6686
{
6687
if (!refcount_dec_and_test(&rb->refcount))
6688
return;
6689
6690
WARN_ON_ONCE(!list_empty(&rb->event_list));
6691
6692
call_rcu(&rb->rcu_head, rb_free_rcu);
6693
}
6694
6695
typedef void (*mapped_f)(struct perf_event *event, struct mm_struct *mm);
6696
6697
#define get_mapped(event, func) \
6698
({ struct pmu *pmu; \
6699
mapped_f f = NULL; \
6700
guard(rcu)(); \
6701
pmu = READ_ONCE(event->pmu); \
6702
if (pmu) \
6703
f = pmu->func; \
6704
f; \
6705
})
6706
6707
static void perf_mmap_open(struct vm_area_struct *vma)
6708
{
6709
struct perf_event *event = vma->vm_file->private_data;
6710
mapped_f mapped = get_mapped(event, event_mapped);
6711
6712
refcount_inc(&event->mmap_count);
6713
refcount_inc(&event->rb->mmap_count);
6714
6715
if (vma->vm_pgoff)
6716
refcount_inc(&event->rb->aux_mmap_count);
6717
6718
if (mapped)
6719
mapped(event, vma->vm_mm);
6720
}
6721
6722
static void perf_pmu_output_stop(struct perf_event *event);
6723
6724
/*
6725
* A buffer can be mmap()ed multiple times; either directly through the same
6726
* event, or through other events by use of perf_event_set_output().
6727
*
6728
* In order to undo the VM accounting done by perf_mmap() we need to destroy
6729
* the buffer here, where we still have a VM context. This means we need
6730
* to detach all events redirecting to us.
6731
*/
6732
static void perf_mmap_close(struct vm_area_struct *vma)
6733
{
6734
struct perf_event *event = vma->vm_file->private_data;
6735
mapped_f unmapped = get_mapped(event, event_unmapped);
6736
struct perf_buffer *rb = ring_buffer_get(event);
6737
struct user_struct *mmap_user = rb->mmap_user;
6738
int mmap_locked = rb->mmap_locked;
6739
unsigned long size = perf_data_size(rb);
6740
bool detach_rest = false;
6741
6742
/* FIXIES vs perf_pmu_unregister() */
6743
if (unmapped)
6744
unmapped(event, vma->vm_mm);
6745
6746
/*
6747
* The AUX buffer is strictly a sub-buffer, serialize using aux_mutex
6748
* to avoid complications.
6749
*/
6750
if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
6751
refcount_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) {
6752
/*
6753
* Stop all AUX events that are writing to this buffer,
6754
* so that we can free its AUX pages and corresponding PMU
6755
* data. Note that after rb::aux_mmap_count dropped to zero,
6756
* they won't start any more (see perf_aux_output_begin()).
6757
*/
6758
perf_pmu_output_stop(event);
6759
6760
/* now it's safe to free the pages */
6761
atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
6762
atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
6763
6764
/* this has to be the last one */
6765
rb_free_aux(rb);
6766
WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
6767
6768
mutex_unlock(&rb->aux_mutex);
6769
}
6770
6771
if (refcount_dec_and_test(&rb->mmap_count))
6772
detach_rest = true;
6773
6774
if (!refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
6775
goto out_put;
6776
6777
ring_buffer_attach(event, NULL);
6778
mutex_unlock(&event->mmap_mutex);
6779
6780
/* If there's still other mmap()s of this buffer, we're done. */
6781
if (!detach_rest)
6782
goto out_put;
6783
6784
/*
6785
* No other mmap()s, detach from all other events that might redirect
6786
* into the now unreachable buffer. Somewhat complicated by the
6787
* fact that rb::event_lock otherwise nests inside mmap_mutex.
6788
*/
6789
again:
6790
rcu_read_lock();
6791
list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
6792
if (!atomic_long_inc_not_zero(&event->refcount)) {
6793
/*
6794
* This event is en-route to free_event() which will
6795
* detach it and remove it from the list.
6796
*/
6797
continue;
6798
}
6799
rcu_read_unlock();
6800
6801
mutex_lock(&event->mmap_mutex);
6802
/*
6803
* Check we didn't race with perf_event_set_output() which can
6804
* swizzle the rb from under us while we were waiting to
6805
* acquire mmap_mutex.
6806
*
6807
* If we find a different rb, ignore this event; the next
6808
* iteration will no longer find it on the list. We have to
6809
* still restart the iteration to make sure we're not now
6810
* iterating the wrong list.
6811
*/
6812
if (event->rb == rb)
6813
ring_buffer_attach(event, NULL);
6814
6815
mutex_unlock(&event->mmap_mutex);
6816
put_event(event);
6817
6818
/*
6819
* Restart the iteration; either we're on the wrong list or
6820
* we destroyed its integrity by doing a deletion.
6821
*/
6822
goto again;
6823
}
6824
rcu_read_unlock();
6825
6826
/*
6827
* It could be that there are still a few 0-ref events on the list; they'll
6828
* get cleaned up by free_event() -- they'll also still have their
6829
* ref on the rb and will free it whenever they are done with it.
6830
*
6831
* Aside from that, this buffer is 'fully' detached and unmapped,
6832
* undo the VM accounting.
6833
*/
6834
6835
atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
6836
&mmap_user->locked_vm);
6837
atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
6838
free_uid(mmap_user);
6839
6840
out_put:
6841
ring_buffer_put(rb); /* could be last */
6842
}
6843
6844
static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf)
6845
{
6846
/* The first page is the user control page, others are read-only. */
6847
return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS;
6848
}
6849
6850
static int perf_mmap_may_split(struct vm_area_struct *vma, unsigned long addr)
6851
{
6852
/*
6853
* Forbid splitting perf mappings to prevent refcount leaks due to
6854
* the resulting non-matching offsets and sizes. See open()/close().
6855
*/
6856
return -EINVAL;
6857
}
6858
6859
static const struct vm_operations_struct perf_mmap_vmops = {
6860
.open = perf_mmap_open,
6861
.close = perf_mmap_close, /* non mergeable */
6862
.pfn_mkwrite = perf_mmap_pfn_mkwrite,
6863
.may_split = perf_mmap_may_split,
6864
};
6865
6866
static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma)
6867
{
6868
unsigned long nr_pages = vma_pages(vma);
6869
int err = 0;
6870
unsigned long pagenum;
6871
6872
/*
6873
* We map this as a VM_PFNMAP VMA.
6874
*
6875
* This is not ideal as this is designed broadly for mappings of PFNs
6876
* referencing memory-mapped I/O ranges or non-system RAM i.e. for which
6877
* !pfn_valid(pfn).
6878
*
6879
* We are mapping kernel-allocated memory (memory we manage ourselves)
6880
* which would more ideally be mapped using vm_insert_page() or a
6881
* similar mechanism, that is as a VM_MIXEDMAP mapping.
6882
*
6883
* However this won't work here, because:
6884
*
6885
* 1. It uses vma->vm_page_prot, but this field has not been completely
6886
* set up at the point of the f_op->mmap() hook, so we are unable to
6887
* indicate that this should be mapped CoW in order that the
6888
* mkwrite() hook can be invoked to make the first page R/W and the
6889
* rest R/O as desired.
6890
*
6891
* 2. Anything other than a VM_PFNMAP of valid PFNs will result in
6892
* vm_normal_page() returning a struct page * pointer, which means
6893
* vm_ops->page_mkwrite() will be invoked rather than
6894
* vm_ops->pfn_mkwrite(), and this means we have to set page->mapping
6895
* to work around retry logic in the fault handler; however, this
6896
* field is no longer allowed to be used within struct page.
6897
*
6898
* 3. Having a struct page * made available in the fault logic also
6899
* means that the page gets put on the rmap and becomes
6900
* inappropriately accessible and subject to map and ref counting.
6901
*
6902
* Ideally we would have a mechanism that could explicitly express our
6903
* desires, but this is not currently the case, so we instead use
6904
* VM_PFNMAP.
6905
*
6906
* We manage the lifetime of these mappings with internal refcounts (see
6907
* perf_mmap_open() and perf_mmap_close()) so we ensure the lifetime of
6908
* this mapping is maintained correctly.
6909
*/
6910
for (pagenum = 0; pagenum < nr_pages; pagenum++) {
6911
unsigned long va = vma->vm_start + PAGE_SIZE * pagenum;
6912
struct page *page = perf_mmap_to_page(rb, vma->vm_pgoff + pagenum);
6913
6914
if (page == NULL) {
6915
err = -EINVAL;
6916
break;
6917
}
6918
6919
/* Map readonly, perf_mmap_pfn_mkwrite() called on write fault. */
6920
err = remap_pfn_range(vma, va, page_to_pfn(page), PAGE_SIZE,
6921
vm_get_page_prot(vma->vm_flags & ~VM_SHARED));
6922
if (err)
6923
break;
6924
}
6925
6926
#ifdef CONFIG_MMU
6927
/* Clear any partial mappings on error. */
6928
if (err)
6929
zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE, NULL);
6930
#endif
6931
6932
return err;
6933
}
6934
6935
static bool perf_mmap_calc_limits(struct vm_area_struct *vma, long *user_extra, long *extra)
6936
{
6937
unsigned long user_locked, user_lock_limit, locked, lock_limit;
6938
struct user_struct *user = current_user();
6939
6940
user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
6941
/* Increase the limit linearly with more CPUs */
6942
user_lock_limit *= num_online_cpus();
6943
6944
user_locked = atomic_long_read(&user->locked_vm);
6945
6946
/*
6947
* sysctl_perf_event_mlock may have changed, so that
6948
* user->locked_vm > user_lock_limit
6949
*/
6950
if (user_locked > user_lock_limit)
6951
user_locked = user_lock_limit;
6952
user_locked += *user_extra;
6953
6954
if (user_locked > user_lock_limit) {
6955
/*
6956
* charge locked_vm until it hits user_lock_limit;
6957
* charge the rest from pinned_vm
6958
*/
6959
*extra = user_locked - user_lock_limit;
6960
*user_extra -= *extra;
6961
}
6962
6963
lock_limit = rlimit(RLIMIT_MEMLOCK);
6964
lock_limit >>= PAGE_SHIFT;
6965
locked = atomic64_read(&vma->vm_mm->pinned_vm) + *extra;
6966
6967
return locked <= lock_limit || !perf_is_paranoid() || capable(CAP_IPC_LOCK);
6968
}
6969
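/*
 * Editor-added illustrative sketch, not part of this file: the allowance
 * that perf_mmap_calc_limits() above charges against the user's locked_vm,
 * recomputed from user space.  Pages beyond this are charged to the mm's
 * pinned_vm and must fit within RLIMIT_MEMLOCK unless the caller is
 * privileged.  The sysctl value would be read from
 * /proc/sys/kernel/perf_event_mlock_kb.
 */
#if 0	/* userspace example */
#include <unistd.h>

static long mlock_allowance_pages(long perf_event_mlock_kb)
{
	long page_kb = sysconf(_SC_PAGESIZE) >> 10;
	long ncpus   = sysconf(_SC_NPROCESSORS_ONLN);

	/* mirrors: user_lock_limit = sysctl >> (PAGE_SHIFT - 10), then *= ncpus */
	return (perf_event_mlock_kb / page_kb) * ncpus;
}
#endif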
6970
static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long extra)
6971
{
6972
struct user_struct *user = current_user();
6973
6974
atomic_long_add(user_extra, &user->locked_vm);
6975
atomic64_add(extra, &vma->vm_mm->pinned_vm);
6976
}
6977
6978
static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event,
6979
unsigned long nr_pages)
6980
{
6981
long extra = 0, user_extra = nr_pages;
6982
struct perf_buffer *rb;
6983
int rb_flags = 0;
6984
6985
nr_pages -= 1;
6986
6987
/*
6988
* If we have rb pages ensure they're a power-of-two number, so we
6989
* can do bitmasks instead of modulo.
6990
*/
6991
if (nr_pages != 0 && !is_power_of_2(nr_pages))
6992
return -EINVAL;
6993
6994
WARN_ON_ONCE(event->ctx->parent_ctx);
6995
6996
if (event->rb) {
6997
if (data_page_nr(event->rb) != nr_pages)
6998
return -EINVAL;
6999
7000
/*
7001
* If this event doesn't have mmap_count, we're attempting to
7002
* create an alias of another event's mmap(); this would mean
7003
* both events will end up scribbling over the same user_page,
7004
* which makes no sense.
7005
*/
7006
if (!refcount_read(&event->mmap_count))
7007
return -EBUSY;
7008
7009
if (refcount_inc_not_zero(&event->rb->mmap_count)) {
7010
/*
7011
* Success -- managed to mmap() the same buffer
7012
* multiple times.
7013
*/
7014
perf_mmap_account(vma, user_extra, extra);
7015
refcount_inc(&event->mmap_count);
7016
return 0;
7017
}
7018
7019
/*
7020
* Raced against perf_mmap_close()'s
7021
* refcount_dec_and_mutex_lock(): detach the rb from this
7022
* event and continue as if !event->rb.
7023
*/
7024
ring_buffer_attach(event, NULL);
7025
}
7026
7027
if (!perf_mmap_calc_limits(vma, &user_extra, &extra))
7028
return -EPERM;
7029
7030
if (vma->vm_flags & VM_WRITE)
7031
rb_flags |= RING_BUFFER_WRITABLE;
7032
7033
rb = rb_alloc(nr_pages,
7034
event->attr.watermark ? event->attr.wakeup_watermark : 0,
7035
event->cpu, rb_flags);
7036
7037
if (!rb)
7038
return -ENOMEM;
7039
7040
refcount_set(&rb->mmap_count, 1);
7041
rb->mmap_user = get_current_user();
7042
rb->mmap_locked = extra;
7043
7044
ring_buffer_attach(event, rb);
7045
7046
perf_event_update_time(event);
7047
perf_event_init_userpage(event);
7048
perf_event_update_userpage(event);
7049
7050
perf_mmap_account(vma, user_extra, extra);
7051
refcount_set(&event->mmap_count, 1);
7052
7053
return 0;
7054
}
7055
7056
static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event,
7057
unsigned long nr_pages)
7058
{
7059
long extra = 0, user_extra = nr_pages;
7060
u64 aux_offset, aux_size;
7061
struct perf_buffer *rb;
7062
int ret, rb_flags = 0;
7063
7064
rb = event->rb;
7065
if (!rb)
7066
return -EINVAL;
7067
7068
guard(mutex)(&rb->aux_mutex);
7069
7070
/*
7071
* AUX area mapping: if rb->aux_nr_pages != 0, it's already
7072
* mapped; all subsequent mappings should have the same size
7073
* and offset. Must be above the normal perf buffer.
7074
*/
7075
aux_offset = READ_ONCE(rb->user_page->aux_offset);
7076
aux_size = READ_ONCE(rb->user_page->aux_size);
7077
7078
if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
7079
return -EINVAL;
7080
7081
if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
7082
return -EINVAL;
7083
7084
/* already mapped with a different offset */
7085
if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
7086
return -EINVAL;
7087
7088
if (aux_size != nr_pages * PAGE_SIZE)
7089
return -EINVAL;
7090
7091
/* already mapped with a different size */
7092
if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
7093
return -EINVAL;
7094
7095
if (!is_power_of_2(nr_pages))
7096
return -EINVAL;
7097
7098
if (!refcount_inc_not_zero(&rb->mmap_count))
7099
return -EINVAL;
7100
7101
if (rb_has_aux(rb)) {
7102
refcount_inc(&rb->aux_mmap_count);
7103
7104
} else {
7105
if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) {
7106
refcount_dec(&rb->mmap_count);
7107
return -EPERM;
7108
}
7109
7110
WARN_ON(!rb && event->rb);
7111
7112
if (vma->vm_flags & VM_WRITE)
7113
rb_flags |= RING_BUFFER_WRITABLE;
7114
7115
ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
7116
event->attr.aux_watermark, rb_flags);
7117
if (ret) {
7118
refcount_dec(&rb->mmap_count);
7119
return ret;
7120
}
7121
7122
refcount_set(&rb->aux_mmap_count, 1);
7123
rb->aux_mmap_locked = extra;
7124
}
7125
7126
perf_mmap_account(vma, user_extra, extra);
7127
refcount_inc(&event->mmap_count);
7128
7129
return 0;
7130
}
7131
7132
static int perf_mmap(struct file *file, struct vm_area_struct *vma)
7133
{
7134
struct perf_event *event = file->private_data;
7135
unsigned long vma_size, nr_pages;
7136
mapped_f mapped;
7137
int ret;
7138
7139
/*
7140
* Don't allow mmap() of inherited per-task counters. This would
7141
* create a performance issue due to all children writing to the
7142
* same rb.
7143
*/
7144
if (event->cpu == -1 && event->attr.inherit)
7145
return -EINVAL;
7146
7147
if (!(vma->vm_flags & VM_SHARED))
7148
return -EINVAL;
7149
7150
ret = security_perf_event_read(event);
7151
if (ret)
7152
return ret;
7153
7154
vma_size = vma->vm_end - vma->vm_start;
7155
nr_pages = vma_size / PAGE_SIZE;
7156
7157
if (nr_pages > INT_MAX)
7158
return -ENOMEM;
7159
7160
if (vma_size != PAGE_SIZE * nr_pages)
7161
return -EINVAL;
7162
7163
scoped_guard (mutex, &event->mmap_mutex) {
7164
/*
7165
* This relies on __pmu_detach_event() taking mmap_mutex after marking
7166
* the event REVOKED. Either we observe the state, or __pmu_detach_event()
7167
* will detach the rb created here.
7168
*/
7169
if (event->state <= PERF_EVENT_STATE_REVOKED)
7170
return -ENODEV;
7171
7172
if (vma->vm_pgoff == 0)
7173
ret = perf_mmap_rb(vma, event, nr_pages);
7174
else
7175
ret = perf_mmap_aux(vma, event, nr_pages);
7176
if (ret)
7177
return ret;
7178
}
7179
7180
/*
7181
* Since pinned accounting is per vm we cannot allow fork() to copy our
7182
* vma.
7183
*/
7184
vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
7185
vma->vm_ops = &perf_mmap_vmops;
7186
7187
mapped = get_mapped(event, event_mapped);
7188
if (mapped)
7189
mapped(event, vma->vm_mm);
7190
7191
/*
7192
* Try to map it into the page table. On fail, invoke
7193
* perf_mmap_close() to undo the above, as the callsite expects
7194
* full cleanup in this case and therefore does not invoke
7195
* vmops::close().
7196
*/
7197
ret = map_range(event->rb, vma);
7198
if (ret)
7199
perf_mmap_close(vma);
7200
7201
return ret;
7202
}
7203
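/*
 * Editor-added illustrative sketch, not part of this file: the mmap()
 * geometry expected by perf_mmap_rb()/perf_mmap_aux() above.  The data area
 * is mapped at file offset 0 with 1 + 2^n pages (user page + data); the AUX
 * area is a second mapping whose offset and size must first be published in
 * the user page.  Sizes are examples only, and the event's PMU must support
 * an AUX area for the second mmap() to succeed.
 */
#if 0	/* userspace example */
#include <sys/mman.h>
#include <unistd.h>
#include <linux/perf_event.h>

static void *map_ring_and_aux(int fd, void **aux)
{
	long psz = sysconf(_SC_PAGESIZE);
	size_t data_pages = 8, aux_pages = 16;	/* must be powers of two */
	struct perf_event_mmap_page *pc;
	void *base;

	/* user page + data pages, at pgoff 0 */
	base = mmap(NULL, (1 + data_pages) * psz, PROT_READ | PROT_WRITE,
		    MAP_SHARED, fd, 0);
	if (base == MAP_FAILED)
		return NULL;

	pc = base;
	pc->aux_offset = (1 + data_pages) * psz;	/* above the data area */
	pc->aux_size   = aux_pages * psz;

	*aux = mmap(NULL, pc->aux_size, PROT_READ | PROT_WRITE,
		    MAP_SHARED, fd, pc->aux_offset);
	if (*aux == MAP_FAILED)
		*aux = NULL;

	return base;
}
#endif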
7204
static int perf_fasync(int fd, struct file *filp, int on)
7205
{
7206
struct inode *inode = file_inode(filp);
7207
struct perf_event *event = filp->private_data;
7208
int retval;
7209
7210
if (event->state <= PERF_EVENT_STATE_REVOKED)
7211
return -ENODEV;
7212
7213
inode_lock(inode);
7214
retval = fasync_helper(fd, filp, on, &event->fasync);
7215
inode_unlock(inode);
7216
7217
if (retval < 0)
7218
return retval;
7219
7220
return 0;
7221
}
7222
7223
static const struct file_operations perf_fops = {
7224
.release = perf_release,
7225
.read = perf_read,
7226
.poll = perf_poll,
7227
.unlocked_ioctl = perf_ioctl,
7228
.compat_ioctl = perf_compat_ioctl,
7229
.mmap = perf_mmap,
7230
.fasync = perf_fasync,
7231
};
7232
7233
/*
7234
* Perf event wakeup
7235
*
7236
* If there's data, ensure we set the poll() state and publish everything
7237
* to user-space before waking everybody up.
7238
*/
7239
7240
void perf_event_wakeup(struct perf_event *event)
7241
{
7242
ring_buffer_wakeup(event);
7243
7244
if (event->pending_kill) {
7245
kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
7246
event->pending_kill = 0;
7247
}
7248
}
7249
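/*
 * Editor-added illustrative sketch, not part of this file: opting in to the
 * SIGIO that kill_fasync() above delivers on wakeups.  The fasync state is
 * set up via perf_fasync(); user space enables it with F_SETOWN + O_ASYNC.
 * "fd" is assumed to be a perf event fd.
 */
#if 0	/* userspace example */
#include <fcntl.h>
#include <unistd.h>

static int enable_sigio(int fd)
{
	if (fcntl(fd, F_SETOWN, getpid()) < 0)
		return -1;

	return fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC);
}
#endif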
7250
static void perf_sigtrap(struct perf_event *event)
7251
{
7252
/*
7253
* Both perf_pending_task() and perf_pending_irq() can race with the
7254
* task exiting.
7255
*/
7256
if (current->flags & PF_EXITING)
7257
return;
7258
7259
/*
7260
* We'd expect this to only occur if the irq_work is delayed and either
7261
* ctx->task or current has changed in the meantime. This can be the
7262
* case on architectures that do not implement arch_irq_work_raise().
7263
*/
7264
if (WARN_ON_ONCE(event->ctx->task != current))
7265
return;
7266
7267
send_sig_perf((void __user *)event->pending_addr,
7268
event->orig_type, event->attr.sig_data);
7269
}
7270
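/*
 * Editor-added illustrative sketch, not part of this file: catching the
 * synchronous SIGTRAP that perf_sigtrap() above sends for events created
 * with attr.sigtrap = 1.  The si_perf_data field carries attr.sig_data and
 * si_addr carries pending_addr; this needs a libc recent enough to expose
 * the si_perf_* siginfo fields.
 */
#if 0	/* userspace example */
#include <signal.h>

static volatile unsigned long long perf_sig_data;
static volatile const void *perf_sig_addr;

static void perf_sigtrap_handler(int sig, siginfo_t *info, void *ucontext)
{
	if (info->si_code == TRAP_PERF) {
		perf_sig_addr = info->si_addr;
		perf_sig_data = info->si_perf_data;
	}
}

static int install_handler(void)
{
	struct sigaction sa = {
		.sa_sigaction = perf_sigtrap_handler,
		.sa_flags = SA_SIGINFO,
	};

	return sigaction(SIGTRAP, &sa, NULL);
}
#endif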
7271
/*
7272
* Deliver the pending work in-event-context or follow the context.
7273
*/
7274
static void __perf_pending_disable(struct perf_event *event)
7275
{
7276
int cpu = READ_ONCE(event->oncpu);
7277
7278
/*
7279
* If the event isn't running, we're done. event_sched_out() will have
7280
* taken care of things.
7281
*/
7282
if (cpu < 0)
7283
return;
7284
7285
/*
7286
* Yay, we hit home and are in the context of the event.
7287
*/
7288
if (cpu == smp_processor_id()) {
7289
if (event->pending_disable) {
7290
event->pending_disable = 0;
7291
perf_event_disable_local(event);
7292
}
7293
return;
7294
}
7295
7296
/*
7297
* CPU-A CPU-B
7298
*
7299
* perf_event_disable_inatomic()
7300
* @pending_disable = 1;
7301
* irq_work_queue();
7302
*
7303
* sched-out
7304
* @pending_disable = 0;
7305
*
7306
* sched-in
7307
* perf_event_disable_inatomic()
7308
* @pending_disable = 1;
7309
* irq_work_queue(); // FAILS
7310
*
7311
* irq_work_run()
7312
* perf_pending_disable()
7313
*
7314
* But the event runs on CPU-B and wants disabling there.
7315
*/
7316
irq_work_queue_on(&event->pending_disable_irq, cpu);
7317
}
7318
7319
static void perf_pending_disable(struct irq_work *entry)
7320
{
7321
struct perf_event *event = container_of(entry, struct perf_event, pending_disable_irq);
7322
int rctx;
7323
7324
/*
7325
* If we 'fail' here, that's OK, it means recursion is already disabled
7326
* and we won't recurse 'further'.
7327
*/
7328
rctx = perf_swevent_get_recursion_context();
7329
__perf_pending_disable(event);
7330
if (rctx >= 0)
7331
perf_swevent_put_recursion_context(rctx);
7332
}
7333
7334
static void perf_pending_irq(struct irq_work *entry)
7335
{
7336
struct perf_event *event = container_of(entry, struct perf_event, pending_irq);
7337
int rctx;
7338
7339
/*
7340
* If we 'fail' here, that's OK, it means recursion is already disabled
7341
* and we won't recurse 'further'.
7342
*/
7343
rctx = perf_swevent_get_recursion_context();
7344
7345
/*
7346
* The wakeup isn't bound to the context of the event -- it can happen
7347
* irrespective of where the event is.
7348
*/
7349
if (event->pending_wakeup) {
7350
event->pending_wakeup = 0;
7351
perf_event_wakeup(event);
7352
}
7353
7354
if (rctx >= 0)
7355
perf_swevent_put_recursion_context(rctx);
7356
}
7357
7358
static void perf_pending_task(struct callback_head *head)
7359
{
7360
struct perf_event *event = container_of(head, struct perf_event, pending_task);
7361
int rctx;
7362
7363
/*
7364
* If we 'fail' here, that's OK, it means recursion is already disabled
7365
* and we won't recurse 'further'.
7366
*/
7367
rctx = perf_swevent_get_recursion_context();
7368
7369
if (event->pending_work) {
7370
event->pending_work = 0;
7371
perf_sigtrap(event);
7372
local_dec(&event->ctx->nr_no_switch_fast);
7373
}
7374
put_event(event);
7375
7376
if (rctx >= 0)
7377
perf_swevent_put_recursion_context(rctx);
7378
}
7379
7380
#ifdef CONFIG_GUEST_PERF_EVENTS
7381
struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
7382
7383
DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state);
7384
DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip);
7385
DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr);
7386
7387
void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
7388
{
7389
if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs)))
7390
return;
7391
7392
rcu_assign_pointer(perf_guest_cbs, cbs);
7393
static_call_update(__perf_guest_state, cbs->state);
7394
static_call_update(__perf_guest_get_ip, cbs->get_ip);
7395
7396
/* Implementing ->handle_intel_pt_intr is optional. */
7397
if (cbs->handle_intel_pt_intr)
7398
static_call_update(__perf_guest_handle_intel_pt_intr,
7399
cbs->handle_intel_pt_intr);
7400
}
7401
EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
7402
7403
void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
7404
{
7405
if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs))
7406
return;
7407
7408
rcu_assign_pointer(perf_guest_cbs, NULL);
7409
static_call_update(__perf_guest_state, (void *)&__static_call_return0);
7410
static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0);
7411
static_call_update(__perf_guest_handle_intel_pt_intr,
7412
(void *)&__static_call_return0);
7413
synchronize_rcu();
7414
}
7415
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
7416
#endif
7417
7418
static bool should_sample_guest(struct perf_event *event)
7419
{
7420
return !event->attr.exclude_guest && perf_guest_state();
7421
}
7422
7423
unsigned long perf_misc_flags(struct perf_event *event,
7424
struct pt_regs *regs)
7425
{
7426
if (should_sample_guest(event))
7427
return perf_arch_guest_misc_flags(regs);
7428
7429
return perf_arch_misc_flags(regs);
7430
}
7431
7432
unsigned long perf_instruction_pointer(struct perf_event *event,
7433
struct pt_regs *regs)
7434
{
7435
if (should_sample_guest(event))
7436
return perf_guest_get_ip();
7437
7438
return perf_arch_instruction_pointer(regs);
7439
}
7440
7441
static void
7442
perf_output_sample_regs(struct perf_output_handle *handle,
7443
struct pt_regs *regs, u64 mask)
7444
{
7445
int bit;
7446
DECLARE_BITMAP(_mask, 64);
7447
7448
bitmap_from_u64(_mask, mask);
7449
for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
7450
u64 val;
7451
7452
val = perf_reg_value(regs, bit);
7453
perf_output_put(handle, val);
7454
}
7455
}
7456
7457
static void perf_sample_regs_user(struct perf_regs *regs_user,
7458
struct pt_regs *regs)
7459
{
7460
if (user_mode(regs)) {
7461
regs_user->abi = perf_reg_abi(current);
7462
regs_user->regs = regs;
7463
} else if (is_user_task(current)) {
7464
perf_get_regs_user(regs_user, regs);
7465
} else {
7466
regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
7467
regs_user->regs = NULL;
7468
}
7469
}
7470
7471
static void perf_sample_regs_intr(struct perf_regs *regs_intr,
7472
struct pt_regs *regs)
7473
{
7474
regs_intr->regs = regs;
7475
regs_intr->abi = perf_reg_abi(current);
7476
}
7477
7478
7479
/*
7480
* Get remaining task size from user stack pointer.
7481
*
7482
* It'd be better to look up the stack VMA and limit this more
7484
* precisely, but there's no way to do that safely under interrupt,
7485
* so we use TASK_SIZE as the limit.
7485
*/
7486
static u64 perf_ustack_task_size(struct pt_regs *regs)
7487
{
7488
unsigned long addr = perf_user_stack_pointer(regs);
7489
7490
if (!addr || addr >= TASK_SIZE)
7491
return 0;
7492
7493
return TASK_SIZE - addr;
7494
}
7495
7496
static u16
7497
perf_sample_ustack_size(u16 stack_size, u16 header_size,
7498
struct pt_regs *regs)
7499
{
7500
u64 task_size;
7501
7502
/* No regs, no stack pointer, no dump. */
7503
if (!regs)
7504
return 0;
7505
7506
/* No mm, no stack, no dump. */
7507
if (!current->mm)
7508
return 0;
7509
7510
/*
7511
* Check if we fit in with the requested stack size into the:
7512
* - TASK_SIZE
7513
* If we don't, we limit the size to the TASK_SIZE.
7514
*
7515
* - remaining sample size
7516
* If we don't, we customize the stack size to
7517
* fit in to the remaining sample size.
7518
*/
7519
7520
task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
7521
stack_size = min(stack_size, (u16) task_size);
7522
7523
/* Current header size plus static size and dynamic size. */
7524
header_size += 2 * sizeof(u64);
7525
7526
/* Do we fit in with the current stack dump size? */
7527
if ((u16) (header_size + stack_size) < header_size) {
7528
/*
7529
* If we overflow the maximum size for the sample,
7530
* we customize the stack dump size to fit in.
7531
*/
7532
stack_size = USHRT_MAX - header_size - sizeof(u64);
7533
stack_size = round_up(stack_size, sizeof(u64));
7534
}
7535
7536
return stack_size;
7537
}
7538
7539
static void
7540
perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
7541
struct pt_regs *regs)
7542
{
7543
/* Case of a kernel thread, nothing to dump */
7544
if (!regs) {
7545
u64 size = 0;
7546
perf_output_put(handle, size);
7547
} else {
7548
unsigned long sp;
7549
unsigned int rem;
7550
u64 dyn_size;
7551
7552
/*
7553
* We dump:
7554
* static size
7555
* - the size requested by user or the best one we can fit
7556
* in to the sample max size
7557
* data
7558
* - user stack dump data
7559
* dynamic size
7560
* - the actual dumped size
7561
*/
7562
7563
/* Static size. */
7564
perf_output_put(handle, dump_size);
7565
7566
/* Data. */
7567
sp = perf_user_stack_pointer(regs);
7568
rem = __output_copy_user(handle, (void *) sp, dump_size);
7569
dyn_size = dump_size - rem;
7570
7571
perf_output_skip(handle, rem);
7572
7573
/* Dynamic size. */
7574
perf_output_put(handle, dyn_size);
7575
}
7576
}
7577
7578
static unsigned long perf_prepare_sample_aux(struct perf_event *event,
7579
struct perf_sample_data *data,
7580
size_t size)
7581
{
7582
struct perf_event *sampler = event->aux_event;
7583
struct perf_buffer *rb;
7584
7585
data->aux_size = 0;
7586
7587
if (!sampler)
7588
goto out;
7589
7590
if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
7591
goto out;
7592
7593
if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
7594
goto out;
7595
7596
rb = ring_buffer_get(sampler);
7597
if (!rb)
7598
goto out;
7599
7600
/*
7601
* If this is an NMI hit inside sampling code, don't take
7602
* the sample. See also perf_aux_sample_output().
7603
*/
7604
if (READ_ONCE(rb->aux_in_sampling)) {
7605
data->aux_size = 0;
7606
} else {
7607
size = min_t(size_t, size, perf_aux_size(rb));
7608
data->aux_size = ALIGN(size, sizeof(u64));
7609
}
7610
ring_buffer_put(rb);
7611
7612
out:
7613
return data->aux_size;
7614
}
7615
7616
static long perf_pmu_snapshot_aux(struct perf_buffer *rb,
7617
struct perf_event *event,
7618
struct perf_output_handle *handle,
7619
unsigned long size)
7620
{
7621
unsigned long flags;
7622
long ret;
7623
7624
/*
7625
* Normal ->start()/->stop() callbacks run in IRQ mode in scheduler
7626
* paths. If we start calling them in NMI context, they may race with
7627
* the IRQ ones, that is, for example, re-starting an event that's just
7628
* been stopped, which is why we're using a separate callback that
7629
* doesn't change the event state.
7630
*
7631
* IRQs need to be disabled to prevent IPIs from racing with us.
7632
*/
7633
local_irq_save(flags);
7634
/*
7635
* Guard against NMI hits inside the critical section;
7636
* see also perf_prepare_sample_aux().
7637
*/
7638
WRITE_ONCE(rb->aux_in_sampling, 1);
7639
barrier();
7640
7641
ret = event->pmu->snapshot_aux(event, handle, size);
7642
7643
barrier();
7644
WRITE_ONCE(rb->aux_in_sampling, 0);
7645
local_irq_restore(flags);
7646
7647
return ret;
7648
}
7649
7650
static void perf_aux_sample_output(struct perf_event *event,
7651
struct perf_output_handle *handle,
7652
struct perf_sample_data *data)
7653
{
7654
struct perf_event *sampler = event->aux_event;
7655
struct perf_buffer *rb;
7656
unsigned long pad;
7657
long size;
7658
7659
if (WARN_ON_ONCE(!sampler || !data->aux_size))
7660
return;
7661
7662
rb = ring_buffer_get(sampler);
7663
if (!rb)
7664
return;
7665
7666
size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);
7667
7668
/*
7669
* An error here means that perf_output_copy() failed (returned a
7670
* non-zero surplus that it didn't copy), which in its current
7671
* enlightened implementation is not possible. If that changes, we'd
7672
* like to know.
7673
*/
7674
if (WARN_ON_ONCE(size < 0))
7675
goto out_put;
7676
7677
/*
7678
* The pad comes from ALIGN()ing data->aux_size up to u64 in
7679
* perf_prepare_sample_aux(), so it should not be more than that.
7680
*/
7681
pad = data->aux_size - size;
7682
if (WARN_ON_ONCE(pad >= sizeof(u64)))
7683
pad = 8;
7684
7685
if (pad) {
7686
u64 zero = 0;
7687
perf_output_copy(handle, &zero, pad);
7688
}
7689
7690
out_put:
7691
ring_buffer_put(rb);
7692
}
7693
7694
/*
7695
* A set of common sample data types saved even for non-sample records
7696
* when event->attr.sample_id_all is set.
7697
*/
7698
#define PERF_SAMPLE_ID_ALL (PERF_SAMPLE_TID | PERF_SAMPLE_TIME | \
7699
PERF_SAMPLE_ID | PERF_SAMPLE_STREAM_ID | \
7700
PERF_SAMPLE_CPU | PERF_SAMPLE_IDENTIFIER)
7701
7702
static void __perf_event_header__init_id(struct perf_sample_data *data,
7703
struct perf_event *event,
7704
u64 sample_type)
7705
{
7706
data->type = event->attr.sample_type;
7707
data->sample_flags |= data->type & PERF_SAMPLE_ID_ALL;
7708
7709
if (sample_type & PERF_SAMPLE_TID) {
7710
/* namespace issues */
7711
data->tid_entry.pid = perf_event_pid(event, current);
7712
data->tid_entry.tid = perf_event_tid(event, current);
7713
}
7714
7715
if (sample_type & PERF_SAMPLE_TIME)
7716
data->time = perf_event_clock(event);
7717
7718
if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
7719
data->id = primary_event_id(event);
7720
7721
if (sample_type & PERF_SAMPLE_STREAM_ID)
7722
data->stream_id = event->id;
7723
7724
if (sample_type & PERF_SAMPLE_CPU) {
7725
data->cpu_entry.cpu = raw_smp_processor_id();
7726
data->cpu_entry.reserved = 0;
7727
}
7728
}
7729
7730
void perf_event_header__init_id(struct perf_event_header *header,
7731
struct perf_sample_data *data,
7732
struct perf_event *event)
7733
{
7734
if (event->attr.sample_id_all) {
7735
header->size += event->id_header_size;
7736
__perf_event_header__init_id(data, event, event->attr.sample_type);
7737
}
7738
}
7739
7740
static void __perf_event__output_id_sample(struct perf_output_handle *handle,
7741
struct perf_sample_data *data)
7742
{
7743
u64 sample_type = data->type;
7744
7745
if (sample_type & PERF_SAMPLE_TID)
7746
perf_output_put(handle, data->tid_entry);
7747
7748
if (sample_type & PERF_SAMPLE_TIME)
7749
perf_output_put(handle, data->time);
7750
7751
if (sample_type & PERF_SAMPLE_ID)
7752
perf_output_put(handle, data->id);
7753
7754
if (sample_type & PERF_SAMPLE_STREAM_ID)
7755
perf_output_put(handle, data->stream_id);
7756
7757
if (sample_type & PERF_SAMPLE_CPU)
7758
perf_output_put(handle, data->cpu_entry);
7759
7760
if (sample_type & PERF_SAMPLE_IDENTIFIER)
7761
perf_output_put(handle, data->id);
7762
}
7763
7764
void perf_event__output_id_sample(struct perf_event *event,
7765
struct perf_output_handle *handle,
7766
struct perf_sample_data *sample)
7767
{
7768
if (event->attr.sample_id_all)
7769
__perf_event__output_id_sample(handle, sample);
7770
}
7771
7772
static void perf_output_read_one(struct perf_output_handle *handle,
7773
struct perf_event *event,
7774
u64 enabled, u64 running)
7775
{
7776
u64 read_format = event->attr.read_format;
7777
u64 values[5];
7778
int n = 0;
7779
7780
values[n++] = perf_event_count(event, has_inherit_and_sample_read(&event->attr));
7781
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
7782
values[n++] = enabled +
7783
atomic64_read(&event->child_total_time_enabled);
7784
}
7785
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
7786
values[n++] = running +
7787
atomic64_read(&event->child_total_time_running);
7788
}
7789
if (read_format & PERF_FORMAT_ID)
7790
values[n++] = primary_event_id(event);
7791
if (read_format & PERF_FORMAT_LOST)
7792
values[n++] = atomic64_read(&event->lost_samples);
7793
7794
__output_copy(handle, values, n * sizeof(u64));
7795
}
7796
7797
static void perf_output_read_group(struct perf_output_handle *handle,
7798
struct perf_event *event,
7799
u64 enabled, u64 running)
7800
{
7801
struct perf_event *leader = event->group_leader, *sub;
7802
u64 read_format = event->attr.read_format;
7803
unsigned long flags;
7804
u64 values[6];
7805
int n = 0;
7806
bool self = has_inherit_and_sample_read(&event->attr);
7807
7808
/*
7809
* Disabling interrupts avoids all counter scheduling
7810
* (context switches, timer based rotation and IPIs).
7811
*/
7812
local_irq_save(flags);
7813
7814
values[n++] = 1 + leader->nr_siblings;
7815
7816
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
7817
values[n++] = enabled;
7818
7819
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
7820
values[n++] = running;
7821
7822
if ((leader != event) && !handle->skip_read)
7823
perf_pmu_read(leader);
7824
7825
values[n++] = perf_event_count(leader, self);
7826
if (read_format & PERF_FORMAT_ID)
7827
values[n++] = primary_event_id(leader);
7828
if (read_format & PERF_FORMAT_LOST)
7829
values[n++] = atomic64_read(&leader->lost_samples);
7830
7831
__output_copy(handle, values, n * sizeof(u64));
7832
7833
for_each_sibling_event(sub, leader) {
7834
n = 0;
7835
7836
if ((sub != event) && !handle->skip_read)
7837
perf_pmu_read(sub);
7838
7839
values[n++] = perf_event_count(sub, self);
7840
if (read_format & PERF_FORMAT_ID)
7841
values[n++] = primary_event_id(sub);
7842
if (read_format & PERF_FORMAT_LOST)
7843
values[n++] = atomic64_read(&sub->lost_samples);
7844
7845
__output_copy(handle, values, n * sizeof(u64));
7846
}
7847
7848
local_irq_restore(flags);
7849
}
7850
7851
#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
7852
PERF_FORMAT_TOTAL_TIME_RUNNING)
7853
7854
/*
7855
* XXX PERF_SAMPLE_READ vs inherited events seems difficult.
7856
*
7857
* The problem is that it's both hard and excessively expensive to iterate the
7858
* child list, not to mention that it's impossible to IPI the children running
7859
* on another CPU, from interrupt/NMI context.
7860
*
7861
* Instead the combination of PERF_SAMPLE_READ and inherit will track per-thread
7862
* counts rather than attempting to accumulate some value across all children on
7863
* all cores.
7864
*/
7865
static void perf_output_read(struct perf_output_handle *handle,
7866
struct perf_event *event)
7867
{
7868
u64 enabled = 0, running = 0, now;
7869
u64 read_format = event->attr.read_format;
7870
7871
/*
7872
* compute total_time_enabled, total_time_running
7873
* based on snapshot values taken when the event
7874
* was last scheduled in.
7875
*
7876
* we cannot simply called update_context_time()
7877
* because of locking issue as we are called in
7878
* NMI context
7879
*/
7880
if (read_format & PERF_FORMAT_TOTAL_TIMES)
7881
calc_timer_values(event, &now, &enabled, &running);
7882
7883
if (event->attr.read_format & PERF_FORMAT_GROUP)
7884
perf_output_read_group(handle, event, enabled, running);
7885
else
7886
perf_output_read_one(handle, event, enabled, running);
7887
}
7888
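/*
 * Editor-added illustrative sketch, not part of this file: attribute setup
 * that requests the READ field emitted by perf_output_read() above.  As the
 * comment before it explains, combining PERF_SAMPLE_READ with inherit yields
 * per-thread counts; older kernels reject this combination outright.  The
 * period and event choice are examples only.
 */
#if 0	/* userspace example */
#include <string.h>
#include <linux/perf_event.h>

static void init_attr_sample_read(struct perf_event_attr *attr)
{
	memset(attr, 0, sizeof(*attr));
	attr->size          = sizeof(*attr);
	attr->type          = PERF_TYPE_HARDWARE;
	attr->config        = PERF_COUNT_HW_CPU_CYCLES;
	attr->sample_period = 100000;
	attr->sample_type   = PERF_SAMPLE_TID | PERF_SAMPLE_READ;
	attr->read_format   = PERF_FORMAT_TOTAL_TIME_ENABLED |
			      PERF_FORMAT_TOTAL_TIME_RUNNING |
			      PERF_FORMAT_ID;
	attr->inherit       = 1;	/* per-thread values with PERF_SAMPLE_READ */
}
#endif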
7889
void perf_output_sample(struct perf_output_handle *handle,
7890
struct perf_event_header *header,
7891
struct perf_sample_data *data,
7892
struct perf_event *event)
7893
{
7894
u64 sample_type = data->type;
7895
7896
if (data->sample_flags & PERF_SAMPLE_READ)
7897
handle->skip_read = 1;
7898
7899
perf_output_put(handle, *header);
7900
7901
if (sample_type & PERF_SAMPLE_IDENTIFIER)
7902
perf_output_put(handle, data->id);
7903
7904
if (sample_type & PERF_SAMPLE_IP)
7905
perf_output_put(handle, data->ip);
7906
7907
if (sample_type & PERF_SAMPLE_TID)
7908
perf_output_put(handle, data->tid_entry);
7909
7910
if (sample_type & PERF_SAMPLE_TIME)
7911
perf_output_put(handle, data->time);
7912
7913
if (sample_type & PERF_SAMPLE_ADDR)
7914
perf_output_put(handle, data->addr);
7915
7916
if (sample_type & PERF_SAMPLE_ID)
7917
perf_output_put(handle, data->id);
7918
7919
if (sample_type & PERF_SAMPLE_STREAM_ID)
7920
perf_output_put(handle, data->stream_id);
7921
7922
if (sample_type & PERF_SAMPLE_CPU)
7923
perf_output_put(handle, data->cpu_entry);
7924
7925
if (sample_type & PERF_SAMPLE_PERIOD)
7926
perf_output_put(handle, data->period);
7927
7928
if (sample_type & PERF_SAMPLE_READ)
7929
perf_output_read(handle, event);
7930
7931
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
7932
int size = 1;
7933
7934
size += data->callchain->nr;
7935
size *= sizeof(u64);
7936
__output_copy(handle, data->callchain, size);
7937
}
7938
7939
if (sample_type & PERF_SAMPLE_RAW) {
7940
struct perf_raw_record *raw = data->raw;
7941
7942
if (raw) {
7943
struct perf_raw_frag *frag = &raw->frag;
7944
7945
perf_output_put(handle, raw->size);
7946
do {
7947
if (frag->copy) {
7948
__output_custom(handle, frag->copy,
7949
frag->data, frag->size);
7950
} else {
7951
__output_copy(handle, frag->data,
7952
frag->size);
7953
}
7954
if (perf_raw_frag_last(frag))
7955
break;
7956
frag = frag->next;
7957
} while (1);
7958
if (frag->pad)
7959
__output_skip(handle, NULL, frag->pad);
7960
} else {
7961
struct {
7962
u32 size;
7963
u32 data;
7964
} raw = {
7965
.size = sizeof(u32),
7966
.data = 0,
7967
};
7968
perf_output_put(handle, raw);
7969
}
7970
}
7971
7972
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
7973
if (data->br_stack) {
7974
size_t size;
7975
7976
size = data->br_stack->nr
7977
* sizeof(struct perf_branch_entry);
7978
7979
perf_output_put(handle, data->br_stack->nr);
7980
if (branch_sample_hw_index(event))
7981
perf_output_put(handle, data->br_stack->hw_idx);
7982
perf_output_copy(handle, data->br_stack->entries, size);
7983
/*
7984
* Add the extension space which is appended
7985
* right after the struct perf_branch_stack.
7986
*/
7987
if (data->br_stack_cntr) {
7988
size = data->br_stack->nr * sizeof(u64);
7989
perf_output_copy(handle, data->br_stack_cntr, size);
7990
}
7991
} else {
7992
/*
7993
* we always store at least the value of nr
7994
*/
7995
u64 nr = 0;
7996
perf_output_put(handle, nr);
7997
}
7998
}
7999
8000
if (sample_type & PERF_SAMPLE_REGS_USER) {
8001
u64 abi = data->regs_user.abi;
8002
8003
/*
8004
* If there are no regs to dump, notice it through the
8005
* first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
8006
*/
8007
perf_output_put(handle, abi);
8008
8009
if (abi) {
8010
u64 mask = event->attr.sample_regs_user;
8011
perf_output_sample_regs(handle,
8012
data->regs_user.regs,
8013
mask);
8014
}
8015
}
8016
8017
if (sample_type & PERF_SAMPLE_STACK_USER) {
8018
perf_output_sample_ustack(handle,
8019
data->stack_user_size,
8020
data->regs_user.regs);
8021
}
8022
8023
if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
8024
perf_output_put(handle, data->weight.full);
8025
8026
if (sample_type & PERF_SAMPLE_DATA_SRC)
8027
perf_output_put(handle, data->data_src.val);
8028
8029
if (sample_type & PERF_SAMPLE_TRANSACTION)
8030
perf_output_put(handle, data->txn);
8031
8032
if (sample_type & PERF_SAMPLE_REGS_INTR) {
8033
u64 abi = data->regs_intr.abi;
8034
/*
8035
* If there are no regs to dump, notice it through the
8036
* first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
8037
*/
8038
perf_output_put(handle, abi);
8039
8040
if (abi) {
8041
u64 mask = event->attr.sample_regs_intr;
8042
8043
perf_output_sample_regs(handle,
8044
data->regs_intr.regs,
8045
mask);
8046
}
8047
}
8048
8049
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
8050
perf_output_put(handle, data->phys_addr);
8051
8052
if (sample_type & PERF_SAMPLE_CGROUP)
8053
perf_output_put(handle, data->cgroup);
8054
8055
if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
8056
perf_output_put(handle, data->data_page_size);
8057
8058
if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
8059
perf_output_put(handle, data->code_page_size);
8060
8061
if (sample_type & PERF_SAMPLE_AUX) {
8062
perf_output_put(handle, data->aux_size);
8063
8064
if (data->aux_size)
8065
perf_aux_sample_output(event, handle, data);
8066
}
8067
8068
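/*
 * Wakeup accounting: when the event counts wakeup_events rather than
 * using a byte watermark, each emitted sample bumps rb->events and a
 * ring-buffer wakeup is requested once attr.wakeup_events samples have
 * accumulated.
 */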
if (!event->attr.watermark) {
8069
int wakeup_events = event->attr.wakeup_events;
8070
8071
if (wakeup_events) {
8072
struct perf_buffer *rb = handle->rb;
8073
int events = local_inc_return(&rb->events);
8074
8075
if (events >= wakeup_events) {
8076
local_sub(wakeup_events, &rb->events);
8077
local_inc(&rb->wakeup);
8078
}
8079
}
8080
}
8081
}
8082
8083
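/*
 * Best-effort translation of a sampled virtual address to a physical
 * address: kernel linear-map addresses are translated directly (vmalloc
 * addresses are left at 0), user addresses are resolved with an IRQ-safe
 * GUP-fast lookup, and 0 is returned whenever no safe translation exists.
 */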
static u64 perf_virt_to_phys(u64 virt)
8084
{
8085
u64 phys_addr = 0;
8086
8087
if (!virt)
8088
return 0;
8089
8090
if (virt >= TASK_SIZE) {
8091
/* If it's vmalloc()d memory, leave phys_addr as 0 */
8092
if (virt_addr_valid((void *)(uintptr_t)virt) &&
8093
!(virt >= VMALLOC_START && virt < VMALLOC_END))
8094
phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
8095
} else {
8096
/*
8097
* Walking the page tables for a user address.
8098
* Interrupts are disabled, which prevents any teardown
8099
* of the page tables.
8100
* Try IRQ-safe get_user_page_fast_only first.
8101
* If that fails, leave phys_addr as 0.
8102
*/
8103
if (is_user_task(current)) {
8104
struct page *p;
8105
8106
pagefault_disable();
8107
if (get_user_page_fast_only(virt, 0, &p)) {
8108
phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
8109
put_page(p);
8110
}
8111
pagefault_enable();
8112
}
8113
}
8114
8115
return phys_addr;
8116
}
8117
8118
/*
8119
* Return the pagetable size of a given virtual address.
8120
*/
8121
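/*
 * This is a lockless walk (pgd -> p4d -> pud -> pmd -> pte) that returns
 * the leaf size at whichever level the mapping terminates, or 0 if the
 * address is not mapped. The caller is expected to have IRQs disabled so
 * the page tables cannot be freed underneath us (see perf_get_page_size()).
 */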
static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
8122
{
8123
u64 size = 0;
8124
8125
#ifdef CONFIG_HAVE_GUP_FAST
8126
pgd_t *pgdp, pgd;
8127
p4d_t *p4dp, p4d;
8128
pud_t *pudp, pud;
8129
pmd_t *pmdp, pmd;
8130
pte_t *ptep, pte;
8131
8132
pgdp = pgd_offset(mm, addr);
8133
pgd = READ_ONCE(*pgdp);
8134
if (pgd_none(pgd))
8135
return 0;
8136
8137
if (pgd_leaf(pgd))
8138
return pgd_leaf_size(pgd);
8139
8140
p4dp = p4d_offset_lockless(pgdp, pgd, addr);
8141
p4d = READ_ONCE(*p4dp);
8142
if (!p4d_present(p4d))
8143
return 0;
8144
8145
if (p4d_leaf(p4d))
8146
return p4d_leaf_size(p4d);
8147
8148
pudp = pud_offset_lockless(p4dp, p4d, addr);
8149
pud = READ_ONCE(*pudp);
8150
if (!pud_present(pud))
8151
return 0;
8152
8153
if (pud_leaf(pud))
8154
return pud_leaf_size(pud);
8155
8156
pmdp = pmd_offset_lockless(pudp, pud, addr);
8157
again:
8158
pmd = pmdp_get_lockless(pmdp);
8159
if (!pmd_present(pmd))
8160
return 0;
8161
8162
if (pmd_leaf(pmd))
8163
return pmd_leaf_size(pmd);
8164
8165
ptep = pte_offset_map(&pmd, addr);
8166
if (!ptep)
8167
goto again;
8168
8169
pte = ptep_get_lockless(ptep);
8170
if (pte_present(pte))
8171
size = __pte_leaf_size(pmd, pte);
8172
pte_unmap(ptep);
8173
#endif /* CONFIG_HAVE_GUP_FAST */
8174
8175
return size;
8176
}
8177
8178
static u64 perf_get_page_size(unsigned long addr)
8179
{
8180
struct mm_struct *mm;
8181
unsigned long flags;
8182
u64 size;
8183
8184
if (!addr)
8185
return 0;
8186
8187
/*
8188
* Software page-table walkers must disable IRQs,
8189
* which prevents any tear down of the page tables.
8190
*/
8191
local_irq_save(flags);
8192
8193
mm = current->mm;
8194
if (!mm) {
8195
/*
8196
* For kernel threads and the like, use init_mm so that
8197
* we can find kernel memory.
8198
*/
8199
mm = &init_mm;
8200
}
8201
8202
size = perf_get_pgtable_size(mm, addr);
8203
8204
local_irq_restore(flags);
8205
8206
return size;
8207
}
8208
8209
static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
8210
8211
static struct unwind_work perf_unwind_work;
8212
8213
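/*
 * Note on deferred user unwinding: when the event asked for it
 * (attr.defer_callchain) and a deferred request can be queued, only a
 * cookie is recorded with the sample here; the user part of the callchain
 * is emitted later as a PERF_RECORD_CALLCHAIN_DEFERRED record from
 * perf_unwind_deferred_callback().
 */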
struct perf_callchain_entry *
8214
perf_callchain(struct perf_event *event, struct pt_regs *regs)
8215
{
8216
bool kernel = !event->attr.exclude_callchain_kernel;
8217
bool user = !event->attr.exclude_callchain_user &&
8218
is_user_task(current);
8219
/* Disallow cross-task user callchains. */
8220
bool crosstask = event->ctx->task && event->ctx->task != current;
8221
bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user &&
8222
event->attr.defer_callchain;
8223
const u32 max_stack = event->attr.sample_max_stack;
8224
struct perf_callchain_entry *callchain;
8225
u64 defer_cookie;
8226
8227
if (!current->mm)
8228
user = false;
8229
8230
if (!kernel && !user)
8231
return &__empty_callchain;
8232
8233
if (!(user && defer_user && !crosstask &&
8234
unwind_deferred_request(&perf_unwind_work, &defer_cookie) >= 0))
8235
defer_cookie = 0;
8236
8237
callchain = get_perf_callchain(regs, kernel, user, max_stack,
8238
crosstask, true, defer_cookie);
8239
8240
return callchain ?: &__empty_callchain;
8241
}
8242
8243
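/*
 * Branchless helper: returns @d when any of the bits in @s are set in
 * @flags, and 0 otherwise. Used below to add sample bits that are implied
 * by other requested sample bits.
 */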
static __always_inline u64 __cond_set(u64 flags, u64 s, u64 d)
8244
{
8245
return d * !!(flags & s);
8246
}
8247
8248
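/*
 * Fill in whatever parts of the sample the PMU driver did not already
 * provide (tracked via data->sample_flags), and grow data->dyn_size for
 * the variable-sized pieces so that perf_prepare_header() can compute the
 * final header size.
 */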
void perf_prepare_sample(struct perf_sample_data *data,
8249
struct perf_event *event,
8250
struct pt_regs *regs)
8251
{
8252
u64 sample_type = event->attr.sample_type;
8253
u64 filtered_sample_type;
8254
8255
/*
8256
* Add the sample flags that are dependent on others, and clear the
8257
* sample flags that have already been filled in by the PMU driver.
8258
*/
8259
filtered_sample_type = sample_type;
8260
filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_CODE_PAGE_SIZE,
8261
PERF_SAMPLE_IP);
8262
filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_DATA_PAGE_SIZE |
8263
PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR);
8264
filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_STACK_USER,
8265
PERF_SAMPLE_REGS_USER);
8266
filtered_sample_type &= ~data->sample_flags;
8267
8268
if (filtered_sample_type == 0) {
8269
/* Make sure it has the correct data->type for output */
8270
data->type = event->attr.sample_type;
8271
return;
8272
}
8273
8274
__perf_event_header__init_id(data, event, filtered_sample_type);
8275
8276
if (filtered_sample_type & PERF_SAMPLE_IP) {
8277
data->ip = perf_instruction_pointer(event, regs);
8278
data->sample_flags |= PERF_SAMPLE_IP;
8279
}
8280
8281
if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN)
8282
perf_sample_save_callchain(data, event, regs);
8283
8284
if (filtered_sample_type & PERF_SAMPLE_RAW) {
8285
data->raw = NULL;
8286
data->dyn_size += sizeof(u64);
8287
data->sample_flags |= PERF_SAMPLE_RAW;
8288
}
8289
8290
if (filtered_sample_type & PERF_SAMPLE_BRANCH_STACK) {
8291
data->br_stack = NULL;
8292
data->dyn_size += sizeof(u64);
8293
data->sample_flags |= PERF_SAMPLE_BRANCH_STACK;
8294
}
8295
8296
if (filtered_sample_type & PERF_SAMPLE_REGS_USER)
8297
perf_sample_regs_user(&data->regs_user, regs);
8298
8299
/*
8300
* We cannot use filtered_sample_type here, as REGS_USER can be set
8301
* by STACK_USER (using __cond_set() above) and we don't want to update
8302
* the dyn_size if it's not requested by users.
8303
*/
8304
if ((sample_type & ~data->sample_flags) & PERF_SAMPLE_REGS_USER) {
8305
/* regs dump ABI info */
8306
int size = sizeof(u64);
8307
8308
if (data->regs_user.regs) {
8309
u64 mask = event->attr.sample_regs_user;
8310
size += hweight64(mask) * sizeof(u64);
8311
}
8312
8313
data->dyn_size += size;
8314
data->sample_flags |= PERF_SAMPLE_REGS_USER;
8315
}
8316
8317
if (filtered_sample_type & PERF_SAMPLE_STACK_USER) {
8318
/*
8319
* Either we need the PERF_SAMPLE_STACK_USER bit to always be
8320
* processed as the last one, or an additional check added
8321
* in case a new sample type is added, because we could eat
8322
* up the rest of the sample size.
8323
*/
8324
u16 stack_size = event->attr.sample_stack_user;
8325
u16 header_size = perf_sample_data_size(data, event);
8326
u16 size = sizeof(u64);
8327
8328
stack_size = perf_sample_ustack_size(stack_size, header_size,
8329
data->regs_user.regs);
8330
8331
/*
8332
* If there is something to dump, add space for the dump
8333
* itself and for the field that tells the dynamic size,
8334
* which is how many have been actually dumped.
8335
*/
8336
if (stack_size)
8337
size += sizeof(u64) + stack_size;
8338
8339
data->stack_user_size = stack_size;
8340
data->dyn_size += size;
8341
data->sample_flags |= PERF_SAMPLE_STACK_USER;
8342
}
8343
8344
if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
8345
data->weight.full = 0;
8346
data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
8347
}
8348
8349
if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) {
8350
data->data_src.val = PERF_MEM_NA;
8351
data->sample_flags |= PERF_SAMPLE_DATA_SRC;
8352
}
8353
8354
if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) {
8355
data->txn = 0;
8356
data->sample_flags |= PERF_SAMPLE_TRANSACTION;
8357
}
8358
8359
if (filtered_sample_type & PERF_SAMPLE_ADDR) {
8360
data->addr = 0;
8361
data->sample_flags |= PERF_SAMPLE_ADDR;
8362
}
8363
8364
if (filtered_sample_type & PERF_SAMPLE_REGS_INTR) {
8365
/* regs dump ABI info */
8366
int size = sizeof(u64);
8367
8368
perf_sample_regs_intr(&data->regs_intr, regs);
8369
8370
if (data->regs_intr.regs) {
8371
u64 mask = event->attr.sample_regs_intr;
8372
8373
size += hweight64(mask) * sizeof(u64);
8374
}
8375
8376
data->dyn_size += size;
8377
data->sample_flags |= PERF_SAMPLE_REGS_INTR;
8378
}
8379
8380
if (filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) {
8381
data->phys_addr = perf_virt_to_phys(data->addr);
8382
data->sample_flags |= PERF_SAMPLE_PHYS_ADDR;
8383
}
8384
8385
#ifdef CONFIG_CGROUP_PERF
8386
if (filtered_sample_type & PERF_SAMPLE_CGROUP) {
8387
struct cgroup *cgrp;
8388
8389
/* protected by RCU */
8390
cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
8391
data->cgroup = cgroup_id(cgrp);
8392
data->sample_flags |= PERF_SAMPLE_CGROUP;
8393
}
8394
#endif
8395
8396
/*
8397
* PERF_SAMPLE_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user doesn't
8398
* require PERF_SAMPLE_ADDR, the kernel implicitly retrieves data->addr,
8399
* but the value will not be dumped to userspace.
8400
*/
8401
if (filtered_sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) {
8402
data->data_page_size = perf_get_page_size(data->addr);
8403
data->sample_flags |= PERF_SAMPLE_DATA_PAGE_SIZE;
8404
}
8405
8406
if (filtered_sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) {
8407
data->code_page_size = perf_get_page_size(data->ip);
8408
data->sample_flags |= PERF_SAMPLE_CODE_PAGE_SIZE;
8409
}
8410
8411
if (filtered_sample_type & PERF_SAMPLE_AUX) {
8412
u64 size;
8413
u16 header_size = perf_sample_data_size(data, event);
8414
8415
header_size += sizeof(u64); /* size */
8416
8417
/*
8418
* Given the 16bit nature of header::size, an AUX sample can
8419
* easily overflow it, what with all the preceding sample bits.
8420
* Make sure this doesn't happen by using up to U16_MAX bytes
8421
* per sample in total (rounded down to 8 byte boundary).
8422
*/
8423
size = min_t(size_t, U16_MAX - header_size,
8424
event->attr.aux_sample_size);
8425
size = rounddown(size, 8);
8426
size = perf_prepare_sample_aux(event, data, size);
8427
8428
WARN_ON_ONCE(size + header_size > U16_MAX);
8429
data->dyn_size += size + sizeof(u64); /* size above */
8430
data->sample_flags |= PERF_SAMPLE_AUX;
8431
}
8432
}
8433
8434
void perf_prepare_header(struct perf_event_header *header,
8435
struct perf_sample_data *data,
8436
struct perf_event *event,
8437
struct pt_regs *regs)
8438
{
8439
header->type = PERF_RECORD_SAMPLE;
8440
header->size = perf_sample_data_size(data, event);
8441
header->misc = perf_misc_flags(event, regs);
8442
8443
/*
8444
* If you're adding more sample types here, you likely need to do
8445
* something about the overflowing header::size, like repurpose the
8446
* lowest 3 bits of size, which should always be zero at the moment.
8447
* This raises a more important question: do we really need 512k sized
8448
* samples, and why? Good argumentation is in order for whatever you
8449
* do here next.
8450
*/
8451
WARN_ON_ONCE(header->size & 7);
8452
}
8453
8454
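/*
 * Pause/resume the AUX area of an event by stopping/starting it with
 * PERF_EF_PAUSE/PERF_EF_RESUME; hw.aux_paused makes the operation
 * idempotent.
 */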
static void __perf_event_aux_pause(struct perf_event *event, bool pause)
8455
{
8456
if (pause) {
8457
if (!event->hw.aux_paused) {
8458
event->hw.aux_paused = 1;
8459
event->pmu->stop(event, PERF_EF_PAUSE);
8460
}
8461
} else {
8462
if (event->hw.aux_paused) {
8463
event->hw.aux_paused = 0;
8464
event->pmu->start(event, PERF_EF_RESUME);
8465
}
8466
}
8467
}
8468
8469
static void perf_event_aux_pause(struct perf_event *event, bool pause)
8470
{
8471
struct perf_buffer *rb;
8472
8473
if (WARN_ON_ONCE(!event))
8474
return;
8475
8476
rb = ring_buffer_get(event);
8477
if (!rb)
8478
return;
8479
8480
scoped_guard (irqsave) {
8481
/*
8482
* Guard against self-recursion here. Another event could trip
8483
* this same path from NMI context.
8484
*/
8485
if (READ_ONCE(rb->aux_in_pause_resume))
8486
break;
8487
8488
WRITE_ONCE(rb->aux_in_pause_resume, 1);
8489
barrier();
8490
__perf_event_aux_pause(event, pause);
8491
barrier();
8492
WRITE_ONCE(rb->aux_in_pause_resume, 0);
8493
}
8494
ring_buffer_put(rb);
8495
}
8496
8497
static __always_inline int
8498
__perf_event_output(struct perf_event *event,
8499
struct perf_sample_data *data,
8500
struct pt_regs *regs,
8501
int (*output_begin)(struct perf_output_handle *,
8502
struct perf_sample_data *,
8503
struct perf_event *,
8504
unsigned int))
8505
{
8506
struct perf_output_handle handle;
8507
struct perf_event_header header;
8508
int err;
8509
8510
/* protect the callchain buffers */
8511
rcu_read_lock();
8512
8513
perf_prepare_sample(data, event, regs);
8514
perf_prepare_header(&header, data, event, regs);
8515
8516
err = output_begin(&handle, data, event, header.size);
8517
if (err)
8518
goto exit;
8519
8520
perf_output_sample(&handle, &header, data, event);
8521
8522
perf_output_end(&handle);
8523
8524
exit:
8525
rcu_read_unlock();
8526
return err;
8527
}
8528
8529
void
8530
perf_event_output_forward(struct perf_event *event,
8531
struct perf_sample_data *data,
8532
struct pt_regs *regs)
8533
{
8534
__perf_event_output(event, data, regs, perf_output_begin_forward);
8535
}
8536
8537
void
8538
perf_event_output_backward(struct perf_event *event,
8539
struct perf_sample_data *data,
8540
struct pt_regs *regs)
8541
{
8542
__perf_event_output(event, data, regs, perf_output_begin_backward);
8543
}
8544
8545
int
8546
perf_event_output(struct perf_event *event,
8547
struct perf_sample_data *data,
8548
struct pt_regs *regs)
8549
{
8550
return __perf_event_output(event, data, regs, perf_output_begin);
8551
}
8552
8553
/*
8554
* read event_id
8555
*/
8556
8557
struct perf_read_event {
8558
struct perf_event_header header;
8559
8560
u32 pid;
8561
u32 tid;
8562
};
8563
8564
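/*
 * Emit a PERF_RECORD_READ record for @event as observed by @task:
 * a header, pid/tid, the read_format payload and the optional sample_id
 * trailer.
 */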
static void
8565
perf_event_read_event(struct perf_event *event,
8566
struct task_struct *task)
8567
{
8568
struct perf_output_handle handle;
8569
struct perf_sample_data sample;
8570
struct perf_read_event read_event = {
8571
.header = {
8572
.type = PERF_RECORD_READ,
8573
.misc = 0,
8574
.size = sizeof(read_event) + event->read_size,
8575
},
8576
.pid = perf_event_pid(event, task),
8577
.tid = perf_event_tid(event, task),
8578
};
8579
int ret;
8580
8581
perf_event_header__init_id(&read_event.header, &sample, event);
8582
ret = perf_output_begin(&handle, &sample, event, read_event.header.size);
8583
if (ret)
8584
return;
8585
8586
perf_output_put(&handle, read_event);
8587
perf_output_read(&handle, event);
8588
perf_event__output_id_sample(event, &handle, &sample);
8589
8590
perf_output_end(&handle);
8591
}
8592
8593
typedef void (perf_iterate_f)(struct perf_event *event, void *data);
8594
8595
static void
8596
perf_iterate_ctx(struct perf_event_context *ctx,
8597
perf_iterate_f output,
8598
void *data, bool all)
8599
{
8600
struct perf_event *event;
8601
8602
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
8603
if (!all) {
8604
if (event->state < PERF_EVENT_STATE_INACTIVE)
8605
continue;
8606
if (!event_filter_match(event))
8607
continue;
8608
}
8609
8610
output(event, data);
8611
}
8612
}
8613
8614
static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
8615
{
8616
struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
8617
struct perf_event *event;
8618
8619
list_for_each_entry_rcu(event, &pel->list, sb_list) {
8620
/*
8621
* Skip events that are not fully formed yet; ensure that
8622
* if we observe event->ctx, both event and ctx will be
8623
* complete enough. See perf_install_in_context().
8624
*/
8625
if (!smp_load_acquire(&event->ctx))
8626
continue;
8627
8628
if (event->state < PERF_EVENT_STATE_INACTIVE)
8629
continue;
8630
if (!event_filter_match(event))
8631
continue;
8632
output(event, data);
8633
}
8634
}
8635
8636
/*
8637
* Iterate all events that need to receive side-band events.
8638
*
8639
* For new callers: ensure that account_pmu_sb_event() includes
8640
* your event, otherwise it might not get delivered.
8641
*/
8642
static void
8643
perf_iterate_sb(perf_iterate_f output, void *data,
8644
struct perf_event_context *task_ctx)
8645
{
8646
struct perf_event_context *ctx;
8647
8648
rcu_read_lock();
8649
preempt_disable();
8650
8651
/*
8652
* If we have task_ctx != NULL we only notify the task context itself.
8653
* The task_ctx is set only for EXIT events before releasing task
8654
* context.
8655
*/
8656
if (task_ctx) {
8657
perf_iterate_ctx(task_ctx, output, data, false);
8658
goto done;
8659
}
8660
8661
perf_iterate_sb_cpu(output, data);
8662
8663
ctx = rcu_dereference(current->perf_event_ctxp);
8664
if (ctx)
8665
perf_iterate_ctx(ctx, output, data, false);
8666
done:
8667
preempt_enable();
8668
rcu_read_unlock();
8669
}
8670
8671
/*
8672
* Clear all file-based filters at exec, they'll have to be
8673
* re-instated when/if these objects are mmapped again.
8674
*/
8675
static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
8676
{
8677
struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
8678
struct perf_addr_filter *filter;
8679
unsigned int restart = 0, count = 0;
8680
unsigned long flags;
8681
8682
if (!has_addr_filter(event))
8683
return;
8684
8685
raw_spin_lock_irqsave(&ifh->lock, flags);
8686
list_for_each_entry(filter, &ifh->list, entry) {
8687
if (filter->path.dentry) {
8688
event->addr_filter_ranges[count].start = 0;
8689
event->addr_filter_ranges[count].size = 0;
8690
restart++;
8691
}
8692
8693
count++;
8694
}
8695
8696
if (restart)
8697
event->addr_filters_gen++;
8698
raw_spin_unlock_irqrestore(&ifh->lock, flags);
8699
8700
if (restart)
8701
perf_event_stop(event, 1);
8702
}
8703
8704
void perf_event_exec(void)
8705
{
8706
struct perf_event_context *ctx;
8707
8708
ctx = perf_pin_task_context(current);
8709
if (!ctx)
8710
return;
8711
8712
perf_event_enable_on_exec(ctx);
8713
perf_event_remove_on_exec(ctx);
8714
scoped_guard(rcu)
8715
perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true);
8716
8717
perf_unpin_context(ctx);
8718
put_ctx(ctx);
8719
}
8720
8721
struct remote_output {
8722
struct perf_buffer *rb;
8723
int err;
8724
};
8725
8726
static void __perf_event_output_stop(struct perf_event *event, void *data)
8727
{
8728
struct perf_event *parent = event->parent;
8729
struct remote_output *ro = data;
8730
struct perf_buffer *rb = ro->rb;
8731
struct stop_event_data sd = {
8732
.event = event,
8733
};
8734
8735
if (!has_aux(event))
8736
return;
8737
8738
if (!parent)
8739
parent = event;
8740
8741
/*
8742
* In case of inheritance, it will be the parent that links to the
8743
* ring-buffer, but it will be the child that's actually using it.
8744
*
8745
* We are using event::rb to determine if the event should be stopped,
8746
* however, this may race with ring_buffer_attach() (through set_output),
8747
* which will make us skip the event that actually needs to be stopped.
8748
* So ring_buffer_attach() has to stop an aux event before re-assigning
8749
* its rb pointer.
8750
*/
8751
if (rcu_dereference(parent->rb) == rb)
8752
ro->err = __perf_event_stop(&sd);
8753
}
8754
8755
static int __perf_pmu_output_stop(void *info)
8756
{
8757
struct perf_event *event = info;
8758
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
8759
struct remote_output ro = {
8760
.rb = event->rb,
8761
};
8762
8763
rcu_read_lock();
8764
perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
8765
if (cpuctx->task_ctx)
8766
perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
8767
&ro, false);
8768
rcu_read_unlock();
8769
8770
return ro.err;
8771
}
8772
8773
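/*
 * Stop all events writing to this ring-buffer, on every CPU that may be
 * running one. A racing change reported as -EAGAIN causes rb->event_list
 * to be re-walked from the start.
 */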
static void perf_pmu_output_stop(struct perf_event *event)
8774
{
8775
struct perf_event *iter;
8776
int err, cpu;
8777
8778
restart:
8779
rcu_read_lock();
8780
list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
8781
/*
8782
* For per-CPU events, we need to make sure that neither they
8783
* nor their children are running; for cpu==-1 events it's
8784
* sufficient to stop the event itself if it's active, since
8785
* it can't have children.
8786
*/
8787
cpu = iter->cpu;
8788
if (cpu == -1)
8789
cpu = READ_ONCE(iter->oncpu);
8790
8791
if (cpu == -1)
8792
continue;
8793
8794
err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
8795
if (err == -EAGAIN) {
8796
rcu_read_unlock();
8797
goto restart;
8798
}
8799
}
8800
rcu_read_unlock();
8801
}
8802
8803
/*
8804
* task tracking -- fork/exit
8805
*
8806
* enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
8807
*/
8808
8809
struct perf_task_event {
8810
struct task_struct *task;
8811
struct perf_event_context *task_ctx;
8812
8813
struct {
8814
struct perf_event_header header;
8815
8816
u32 pid;
8817
u32 ppid;
8818
u32 tid;
8819
u32 ptid;
8820
u64 time;
8821
} event_id;
8822
};
8823
8824
static int perf_event_task_match(struct perf_event *event)
8825
{
8826
return event->attr.comm || event->attr.mmap ||
8827
event->attr.mmap2 || event->attr.mmap_data ||
8828
event->attr.task;
8829
}
8830
8831
static void perf_event_task_output(struct perf_event *event,
8832
void *data)
8833
{
8834
struct perf_task_event *task_event = data;
8835
struct perf_output_handle handle;
8836
struct perf_sample_data sample;
8837
struct task_struct *task = task_event->task;
8838
int ret, size = task_event->event_id.header.size;
8839
8840
if (!perf_event_task_match(event))
8841
return;
8842
8843
perf_event_header__init_id(&task_event->event_id.header, &sample, event);
8844
8845
ret = perf_output_begin(&handle, &sample, event,
8846
task_event->event_id.header.size);
8847
if (ret)
8848
goto out;
8849
8850
task_event->event_id.pid = perf_event_pid(event, task);
8851
task_event->event_id.tid = perf_event_tid(event, task);
8852
8853
if (task_event->event_id.header.type == PERF_RECORD_EXIT) {
8854
task_event->event_id.ppid = perf_event_pid(event,
8855
task->real_parent);
8856
task_event->event_id.ptid = perf_event_pid(event,
8857
task->real_parent);
8858
} else { /* PERF_RECORD_FORK */
8859
task_event->event_id.ppid = perf_event_pid(event, current);
8860
task_event->event_id.ptid = perf_event_tid(event, current);
8861
}
8862
8863
task_event->event_id.time = perf_event_clock(event);
8864
8865
perf_output_put(&handle, task_event->event_id);
8866
8867
perf_event__output_id_sample(event, &handle, &sample);
8868
8869
perf_output_end(&handle);
8870
out:
8871
task_event->event_id.header.size = size;
8872
}
8873
8874
static void perf_event_task(struct task_struct *task,
8875
struct perf_event_context *task_ctx,
8876
int new)
8877
{
8878
struct perf_task_event task_event;
8879
8880
if (!atomic_read(&nr_comm_events) &&
8881
!atomic_read(&nr_mmap_events) &&
8882
!atomic_read(&nr_task_events))
8883
return;
8884
8885
task_event = (struct perf_task_event){
8886
.task = task,
8887
.task_ctx = task_ctx,
8888
.event_id = {
8889
.header = {
8890
.type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
8891
.misc = 0,
8892
.size = sizeof(task_event.event_id),
8893
},
8894
/* .pid */
8895
/* .ppid */
8896
/* .tid */
8897
/* .ptid */
8898
/* .time */
8899
},
8900
};
8901
8902
perf_iterate_sb(perf_event_task_output,
8903
&task_event,
8904
task_ctx);
8905
}
8906
8907
/*
8908
* Allocate data for a new task when profiling system-wide
8909
* events which require PMU specific data
8910
*/
8911
static void
8912
perf_event_alloc_task_data(struct task_struct *child,
8913
struct task_struct *parent)
8914
{
8915
struct kmem_cache *ctx_cache = NULL;
8916
struct perf_ctx_data *cd;
8917
8918
if (!refcount_read(&global_ctx_data_ref))
8919
return;
8920
8921
scoped_guard (rcu) {
8922
cd = rcu_dereference(parent->perf_ctx_data);
8923
if (cd)
8924
ctx_cache = cd->ctx_cache;
8925
}
8926
8927
if (!ctx_cache)
8928
return;
8929
8930
guard(percpu_read)(&global_ctx_data_rwsem);
8931
scoped_guard (rcu) {
8932
cd = rcu_dereference(child->perf_ctx_data);
8933
if (!cd) {
8934
/*
8935
* A system-wide event may have been unaccounted
8936
* while attaching the perf_ctx_data.
8937
*/
8938
if (!refcount_read(&global_ctx_data_ref))
8939
return;
8940
goto attach;
8941
}
8942
8943
if (!cd->global) {
8944
cd->global = 1;
8945
refcount_inc(&cd->refcount);
8946
}
8947
}
8948
8949
return;
8950
attach:
8951
attach_task_ctx_data(child, ctx_cache, true);
8952
}
8953
8954
void perf_event_fork(struct task_struct *task)
8955
{
8956
perf_event_task(task, NULL, 1);
8957
perf_event_namespaces(task);
8958
perf_event_alloc_task_data(task, current);
8959
}
8960
8961
/*
8962
* comm tracking
8963
*/
8964
8965
struct perf_comm_event {
8966
struct task_struct *task;
8967
char *comm;
8968
int comm_size;
8969
8970
struct {
8971
struct perf_event_header header;
8972
8973
u32 pid;
8974
u32 tid;
8975
} event_id;
8976
};
8977
8978
static int perf_event_comm_match(struct perf_event *event)
8979
{
8980
return event->attr.comm;
8981
}
8982
8983
static void perf_event_comm_output(struct perf_event *event,
8984
void *data)
8985
{
8986
struct perf_comm_event *comm_event = data;
8987
struct perf_output_handle handle;
8988
struct perf_sample_data sample;
8989
int size = comm_event->event_id.header.size;
8990
int ret;
8991
8992
if (!perf_event_comm_match(event))
8993
return;
8994
8995
perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
8996
ret = perf_output_begin(&handle, &sample, event,
8997
comm_event->event_id.header.size);
8998
8999
if (ret)
9000
goto out;
9001
9002
comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
9003
comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
9004
9005
perf_output_put(&handle, comm_event->event_id);
9006
__output_copy(&handle, comm_event->comm,
9007
comm_event->comm_size);
9008
9009
perf_event__output_id_sample(event, &handle, &sample);
9010
9011
perf_output_end(&handle);
9012
out:
9013
comm_event->event_id.header.size = size;
9014
}
9015
9016
static void perf_event_comm_event(struct perf_comm_event *comm_event)
9017
{
9018
char comm[TASK_COMM_LEN];
9019
unsigned int size;
9020
9021
memset(comm, 0, sizeof(comm));
9022
strscpy(comm, comm_event->task->comm);
9023
size = ALIGN(strlen(comm)+1, sizeof(u64));
9024
9025
comm_event->comm = comm;
9026
comm_event->comm_size = size;
9027
9028
comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
9029
9030
perf_iterate_sb(perf_event_comm_output,
9031
comm_event,
9032
NULL);
9033
}
9034
9035
void perf_event_comm(struct task_struct *task, bool exec)
9036
{
9037
struct perf_comm_event comm_event;
9038
9039
if (!atomic_read(&nr_comm_events))
9040
return;
9041
9042
comm_event = (struct perf_comm_event){
9043
.task = task,
9044
/* .comm */
9045
/* .comm_size */
9046
.event_id = {
9047
.header = {
9048
.type = PERF_RECORD_COMM,
9049
.misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
9050
/* .size */
9051
},
9052
/* .pid */
9053
/* .tid */
9054
},
9055
};
9056
9057
perf_event_comm_event(&comm_event);
9058
}
9059
9060
/*
9061
* namespaces tracking
9062
*/
9063
9064
struct perf_namespaces_event {
9065
struct task_struct *task;
9066
9067
struct {
9068
struct perf_event_header header;
9069
9070
u32 pid;
9071
u32 tid;
9072
u64 nr_namespaces;
9073
struct perf_ns_link_info link_info[NR_NAMESPACES];
9074
} event_id;
9075
};
9076
9077
static int perf_event_namespaces_match(struct perf_event *event)
9078
{
9079
return event->attr.namespaces;
9080
}
9081
9082
static void perf_event_namespaces_output(struct perf_event *event,
9083
void *data)
9084
{
9085
struct perf_namespaces_event *namespaces_event = data;
9086
struct perf_output_handle handle;
9087
struct perf_sample_data sample;
9088
u16 header_size = namespaces_event->event_id.header.size;
9089
int ret;
9090
9091
if (!perf_event_namespaces_match(event))
9092
return;
9093
9094
perf_event_header__init_id(&namespaces_event->event_id.header,
9095
&sample, event);
9096
ret = perf_output_begin(&handle, &sample, event,
9097
namespaces_event->event_id.header.size);
9098
if (ret)
9099
goto out;
9100
9101
namespaces_event->event_id.pid = perf_event_pid(event,
9102
namespaces_event->task);
9103
namespaces_event->event_id.tid = perf_event_tid(event,
9104
namespaces_event->task);
9105
9106
perf_output_put(&handle, namespaces_event->event_id);
9107
9108
perf_event__output_id_sample(event, &handle, &sample);
9109
9110
perf_output_end(&handle);
9111
out:
9112
namespaces_event->event_id.header.size = header_size;
9113
}
9114
9115
static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
9116
struct task_struct *task,
9117
const struct proc_ns_operations *ns_ops)
9118
{
9119
struct path ns_path;
9120
struct inode *ns_inode;
9121
int error;
9122
9123
error = ns_get_path(&ns_path, task, ns_ops);
9124
if (!error) {
9125
ns_inode = ns_path.dentry->d_inode;
9126
ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
9127
ns_link_info->ino = ns_inode->i_ino;
9128
path_put(&ns_path);
9129
}
9130
}
9131
9132
void perf_event_namespaces(struct task_struct *task)
9133
{
9134
struct perf_namespaces_event namespaces_event;
9135
struct perf_ns_link_info *ns_link_info;
9136
9137
if (!atomic_read(&nr_namespaces_events))
9138
return;
9139
9140
namespaces_event = (struct perf_namespaces_event){
9141
.task = task,
9142
.event_id = {
9143
.header = {
9144
.type = PERF_RECORD_NAMESPACES,
9145
.misc = 0,
9146
.size = sizeof(namespaces_event.event_id),
9147
},
9148
/* .pid */
9149
/* .tid */
9150
.nr_namespaces = NR_NAMESPACES,
9151
/* .link_info[NR_NAMESPACES] */
9152
},
9153
};
9154
9155
ns_link_info = namespaces_event.event_id.link_info;
9156
9157
perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
9158
task, &mntns_operations);
9159
9160
#ifdef CONFIG_USER_NS
9161
perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
9162
task, &userns_operations);
9163
#endif
9164
#ifdef CONFIG_NET_NS
9165
perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
9166
task, &netns_operations);
9167
#endif
9168
#ifdef CONFIG_UTS_NS
9169
perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
9170
task, &utsns_operations);
9171
#endif
9172
#ifdef CONFIG_IPC_NS
9173
perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
9174
task, &ipcns_operations);
9175
#endif
9176
#ifdef CONFIG_PID_NS
9177
perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
9178
task, &pidns_operations);
9179
#endif
9180
#ifdef CONFIG_CGROUPS
9181
perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
9182
task, &cgroupns_operations);
9183
#endif
9184
9185
perf_iterate_sb(perf_event_namespaces_output,
9186
&namespaces_event,
9187
NULL);
9188
}
9189
9190
/*
9191
* cgroup tracking
9192
*/
9193
#ifdef CONFIG_CGROUP_PERF
9194
9195
struct perf_cgroup_event {
9196
char *path;
9197
int path_size;
9198
struct {
9199
struct perf_event_header header;
9200
u64 id;
9201
char path[];
9202
} event_id;
9203
};
9204
9205
static int perf_event_cgroup_match(struct perf_event *event)
9206
{
9207
return event->attr.cgroup;
9208
}
9209
9210
static void perf_event_cgroup_output(struct perf_event *event, void *data)
9211
{
9212
struct perf_cgroup_event *cgroup_event = data;
9213
struct perf_output_handle handle;
9214
struct perf_sample_data sample;
9215
u16 header_size = cgroup_event->event_id.header.size;
9216
int ret;
9217
9218
if (!perf_event_cgroup_match(event))
9219
return;
9220
9221
perf_event_header__init_id(&cgroup_event->event_id.header,
9222
&sample, event);
9223
ret = perf_output_begin(&handle, &sample, event,
9224
cgroup_event->event_id.header.size);
9225
if (ret)
9226
goto out;
9227
9228
perf_output_put(&handle, cgroup_event->event_id);
9229
__output_copy(&handle, cgroup_event->path, cgroup_event->path_size);
9230
9231
perf_event__output_id_sample(event, &handle, &sample);
9232
9233
perf_output_end(&handle);
9234
out:
9235
cgroup_event->event_id.header.size = header_size;
9236
}
9237
9238
static void perf_event_cgroup(struct cgroup *cgrp)
9239
{
9240
struct perf_cgroup_event cgroup_event;
9241
char path_enomem[16] = "//enomem";
9242
char *pathname;
9243
size_t size;
9244
9245
if (!atomic_read(&nr_cgroup_events))
9246
return;
9247
9248
cgroup_event = (struct perf_cgroup_event){
9249
.event_id = {
9250
.header = {
9251
.type = PERF_RECORD_CGROUP,
9252
.misc = 0,
9253
.size = sizeof(cgroup_event.event_id),
9254
},
9255
.id = cgroup_id(cgrp),
9256
},
9257
};
9258
9259
pathname = kmalloc(PATH_MAX, GFP_KERNEL);
9260
if (pathname == NULL) {
9261
cgroup_event.path = path_enomem;
9262
} else {
9263
/* just to be sure we have enough space for the alignment padding */
9264
cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64));
9265
cgroup_event.path = pathname;
9266
}
9267
9268
/*
9269
* Since our buffer works in 8 byte units we need to align our string
9270
* size to a multiple of 8. However, we must guarantee the tail end is
9271
* zero'd out to avoid leaking random bits to userspace.
9272
*/
9273
size = strlen(cgroup_event.path) + 1;
9274
while (!IS_ALIGNED(size, sizeof(u64)))
9275
cgroup_event.path[size++] = '\0';
9276
9277
cgroup_event.event_id.header.size += size;
9278
cgroup_event.path_size = size;
9279
9280
perf_iterate_sb(perf_event_cgroup_output,
9281
&cgroup_event,
9282
NULL);
9283
9284
kfree(pathname);
9285
}
9286
9287
#endif
9288
9289
/*
9290
* mmap tracking
9291
*/
9292
9293
struct perf_mmap_event {
9294
struct vm_area_struct *vma;
9295
9296
const char *file_name;
9297
int file_size;
9298
int maj, min;
9299
u64 ino;
9300
u64 ino_generation;
9301
u32 prot, flags;
9302
u8 build_id[BUILD_ID_SIZE_MAX];
9303
u32 build_id_size;
9304
9305
struct {
9306
struct perf_event_header header;
9307
9308
u32 pid;
9309
u32 tid;
9310
u64 start;
9311
u64 len;
9312
u64 pgoff;
9313
} event_id;
9314
};
9315
9316
static int perf_event_mmap_match(struct perf_event *event,
9317
void *data)
9318
{
9319
struct perf_mmap_event *mmap_event = data;
9320
struct vm_area_struct *vma = mmap_event->vma;
9321
int executable = vma->vm_flags & VM_EXEC;
9322
9323
return (!executable && event->attr.mmap_data) ||
9324
(executable && (event->attr.mmap || event->attr.mmap2));
9325
}
9326
9327
static void perf_event_mmap_output(struct perf_event *event,
9328
void *data)
9329
{
9330
struct perf_mmap_event *mmap_event = data;
9331
struct perf_output_handle handle;
9332
struct perf_sample_data sample;
9333
int size = mmap_event->event_id.header.size;
9334
u32 type = mmap_event->event_id.header.type;
9335
bool use_build_id;
9336
int ret;
9337
9338
if (!perf_event_mmap_match(event, data))
9339
return;
9340
9341
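/*
 * Note: when a build ID is emitted instead of maj/min/ino/ino_generation,
 * it occupies the same 24 bytes (4-byte length + BUILD_ID_SIZE_MAX), so
 * the size accounting below is valid for both variants.
 */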
if (event->attr.mmap2) {
9342
mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
9343
mmap_event->event_id.header.size += sizeof(mmap_event->maj);
9344
mmap_event->event_id.header.size += sizeof(mmap_event->min);
9345
mmap_event->event_id.header.size += sizeof(mmap_event->ino);
9346
mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
9347
mmap_event->event_id.header.size += sizeof(mmap_event->prot);
9348
mmap_event->event_id.header.size += sizeof(mmap_event->flags);
9349
}
9350
9351
perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
9352
ret = perf_output_begin(&handle, &sample, event,
9353
mmap_event->event_id.header.size);
9354
if (ret)
9355
goto out;
9356
9357
mmap_event->event_id.pid = perf_event_pid(event, current);
9358
mmap_event->event_id.tid = perf_event_tid(event, current);
9359
9360
use_build_id = event->attr.build_id && mmap_event->build_id_size;
9361
9362
if (event->attr.mmap2 && use_build_id)
9363
mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID;
9364
9365
perf_output_put(&handle, mmap_event->event_id);
9366
9367
if (event->attr.mmap2) {
9368
if (use_build_id) {
9369
u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 };
9370
9371
__output_copy(&handle, size, 4);
9372
__output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX);
9373
} else {
9374
perf_output_put(&handle, mmap_event->maj);
9375
perf_output_put(&handle, mmap_event->min);
9376
perf_output_put(&handle, mmap_event->ino);
9377
perf_output_put(&handle, mmap_event->ino_generation);
9378
}
9379
perf_output_put(&handle, mmap_event->prot);
9380
perf_output_put(&handle, mmap_event->flags);
9381
}
9382
9383
__output_copy(&handle, mmap_event->file_name,
9384
mmap_event->file_size);
9385
9386
perf_event__output_id_sample(event, &handle, &sample);
9387
9388
perf_output_end(&handle);
9389
out:
9390
mmap_event->event_id.header.size = size;
9391
mmap_event->event_id.header.type = type;
9392
}
9393
9394
static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
9395
{
9396
struct vm_area_struct *vma = mmap_event->vma;
9397
struct file *file = vma->vm_file;
9398
int maj = 0, min = 0;
9399
u64 ino = 0, gen = 0;
9400
u32 prot = 0, flags = 0;
9401
unsigned int size;
9402
char tmp[16];
9403
char *buf = NULL;
9404
char *name = NULL;
9405
9406
if (vma->vm_flags & VM_READ)
9407
prot |= PROT_READ;
9408
if (vma->vm_flags & VM_WRITE)
9409
prot |= PROT_WRITE;
9410
if (vma->vm_flags & VM_EXEC)
9411
prot |= PROT_EXEC;
9412
9413
if (vma->vm_flags & VM_MAYSHARE)
9414
flags = MAP_SHARED;
9415
else
9416
flags = MAP_PRIVATE;
9417
9418
if (vma->vm_flags & VM_LOCKED)
9419
flags |= MAP_LOCKED;
9420
if (is_vm_hugetlb_page(vma))
9421
flags |= MAP_HUGETLB;
9422
9423
if (file) {
9424
const struct inode *inode;
9425
dev_t dev;
9426
9427
buf = kmalloc(PATH_MAX, GFP_KERNEL);
9428
if (!buf) {
9429
name = "//enomem";
9430
goto cpy_name;
9431
}
9432
/*
9433
* d_path() works from the end of the buffer backwards, so we
9434
* need to add enough zero bytes after the string to handle
9435
* the 64bit alignment we do later.
9436
*/
9437
name = d_path(file_user_path(file), buf, PATH_MAX - sizeof(u64));
9438
if (IS_ERR(name)) {
9439
name = "//toolong";
9440
goto cpy_name;
9441
}
9442
inode = file_user_inode(vma->vm_file);
9443
dev = inode->i_sb->s_dev;
9444
ino = inode->i_ino;
9445
gen = inode->i_generation;
9446
maj = MAJOR(dev);
9447
min = MINOR(dev);
9448
9449
goto got_name;
9450
} else {
9451
if (vma->vm_ops && vma->vm_ops->name)
9452
name = (char *) vma->vm_ops->name(vma);
9453
if (!name)
9454
name = (char *)arch_vma_name(vma);
9455
if (!name) {
9456
if (vma_is_initial_heap(vma))
9457
name = "[heap]";
9458
else if (vma_is_initial_stack(vma))
9459
name = "[stack]";
9460
else
9461
name = "//anon";
9462
}
9463
}
9464
9465
cpy_name:
9466
strscpy(tmp, name);
9467
name = tmp;
9468
got_name:
9469
/*
9470
* Since our buffer works in 8 byte units we need to align our string
9471
* size to a multiple of 8. However, we must guarantee the tail end is
9472
* zero'd out to avoid leaking random bits to userspace.
9473
*/
9474
size = strlen(name)+1;
9475
while (!IS_ALIGNED(size, sizeof(u64)))
9476
name[size++] = '\0';
9477
9478
mmap_event->file_name = name;
9479
mmap_event->file_size = size;
9480
mmap_event->maj = maj;
9481
mmap_event->min = min;
9482
mmap_event->ino = ino;
9483
mmap_event->ino_generation = gen;
9484
mmap_event->prot = prot;
9485
mmap_event->flags = flags;
9486
9487
if (!(vma->vm_flags & VM_EXEC))
9488
mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
9489
9490
mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
9491
9492
if (atomic_read(&nr_build_id_events))
9493
build_id_parse_nofault(vma, mmap_event->build_id, &mmap_event->build_id_size);
9494
9495
perf_iterate_sb(perf_event_mmap_output,
9496
mmap_event,
9497
NULL);
9498
9499
kfree(buf);
9500
}
9501
9502
/*
9503
* Check whether inode and address range match filter criteria.
9504
*/
9505
static bool perf_addr_filter_match(struct perf_addr_filter *filter,
9506
struct file *file, unsigned long offset,
9507
unsigned long size)
9508
{
9509
/* d_inode(NULL) won't be equal to any mapped user-space file */
9510
if (!filter->path.dentry)
9511
return false;
9512
9513
if (d_inode(filter->path.dentry) != file_user_inode(file))
9514
return false;
9515
9516
if (filter->offset > offset + size)
9517
return false;
9518
9519
if (filter->offset + filter->size < offset)
9520
return false;
9521
9522
return true;
9523
}
9524
9525
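/*
 * Translate a file-offset based filter into a virtual address range
 * within @vma: if the filter starts before the mapped file range, the
 * range starts at vma->vm_start, otherwise at the corresponding offset
 * inside the vma; the size is clamped to both the filter and the vma.
 */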
static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter,
9526
struct vm_area_struct *vma,
9527
struct perf_addr_filter_range *fr)
9528
{
9529
unsigned long vma_size = vma->vm_end - vma->vm_start;
9530
unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
9531
struct file *file = vma->vm_file;
9532
9533
if (!perf_addr_filter_match(filter, file, off, vma_size))
9534
return false;
9535
9536
if (filter->offset < off) {
9537
fr->start = vma->vm_start;
9538
fr->size = min(vma_size, filter->size - (off - filter->offset));
9539
} else {
9540
fr->start = vma->vm_start + filter->offset - off;
9541
fr->size = min(vma->vm_end - fr->start, filter->size);
9542
}
9543
9544
return true;
9545
}
9546
9547
static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
9548
{
9549
struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
9550
struct vm_area_struct *vma = data;
9551
struct perf_addr_filter *filter;
9552
unsigned int restart = 0, count = 0;
9553
unsigned long flags;
9554
9555
if (!has_addr_filter(event))
9556
return;
9557
9558
if (!vma->vm_file)
9559
return;
9560
9561
raw_spin_lock_irqsave(&ifh->lock, flags);
9562
list_for_each_entry(filter, &ifh->list, entry) {
9563
if (perf_addr_filter_vma_adjust(filter, vma,
9564
&event->addr_filter_ranges[count]))
9565
restart++;
9566
9567
count++;
9568
}
9569
9570
if (restart)
9571
event->addr_filters_gen++;
9572
raw_spin_unlock_irqrestore(&ifh->lock, flags);
9573
9574
if (restart)
9575
perf_event_stop(event, 1);
9576
}
9577
9578
/*
9579
* Adjust all task's events' filters to the new vma
9580
*/
9581
static void perf_addr_filters_adjust(struct vm_area_struct *vma)
9582
{
9583
struct perf_event_context *ctx;
9584
9585
/*
9586
* Data tracing isn't supported yet and as such there is no need
9587
* to keep track of anything that isn't related to executable code:
9588
*/
9589
if (!(vma->vm_flags & VM_EXEC))
9590
return;
9591
9592
rcu_read_lock();
9593
ctx = rcu_dereference(current->perf_event_ctxp);
9594
if (ctx)
9595
perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
9596
rcu_read_unlock();
9597
}
9598
9599
void perf_event_mmap(struct vm_area_struct *vma)
9600
{
9601
struct perf_mmap_event mmap_event;
9602
9603
if (!atomic_read(&nr_mmap_events))
9604
return;
9605
9606
mmap_event = (struct perf_mmap_event){
9607
.vma = vma,
9608
/* .file_name */
9609
/* .file_size */
9610
.event_id = {
9611
.header = {
9612
.type = PERF_RECORD_MMAP,
9613
.misc = PERF_RECORD_MISC_USER,
9614
/* .size */
9615
},
9616
/* .pid */
9617
/* .tid */
9618
.start = vma->vm_start,
9619
.len = vma->vm_end - vma->vm_start,
9620
.pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
9621
},
9622
/* .maj (attr_mmap2 only) */
9623
/* .min (attr_mmap2 only) */
9624
/* .ino (attr_mmap2 only) */
9625
/* .ino_generation (attr_mmap2 only) */
9626
/* .prot (attr_mmap2 only) */
9627
/* .flags (attr_mmap2 only) */
9628
};
9629
9630
perf_addr_filters_adjust(vma);
9631
perf_event_mmap_event(&mmap_event);
9632
}
9633
9634
void perf_event_aux_event(struct perf_event *event, unsigned long head,
9635
unsigned long size, u64 flags)
9636
{
9637
struct perf_output_handle handle;
9638
struct perf_sample_data sample;
9639
struct perf_aux_event {
9640
struct perf_event_header header;
9641
u64 offset;
9642
u64 size;
9643
u64 flags;
9644
} rec = {
9645
.header = {
9646
.type = PERF_RECORD_AUX,
9647
.misc = 0,
9648
.size = sizeof(rec),
9649
},
9650
.offset = head,
9651
.size = size,
9652
.flags = flags,
9653
};
9654
int ret;
9655
9656
perf_event_header__init_id(&rec.header, &sample, event);
9657
ret = perf_output_begin(&handle, &sample, event, rec.header.size);
9658
9659
if (ret)
9660
return;
9661
9662
perf_output_put(&handle, rec);
9663
perf_event__output_id_sample(event, &handle, &sample);
9664
9665
perf_output_end(&handle);
9666
}
9667
9668
/*
9669
* Lost/dropped samples logging
9670
*/
9671
void perf_log_lost_samples(struct perf_event *event, u64 lost)
9672
{
9673
struct perf_output_handle handle;
9674
struct perf_sample_data sample;
9675
int ret;
9676
9677
struct {
9678
struct perf_event_header header;
9679
u64 lost;
9680
} lost_samples_event = {
9681
.header = {
9682
.type = PERF_RECORD_LOST_SAMPLES,
9683
.misc = 0,
9684
.size = sizeof(lost_samples_event),
9685
},
9686
.lost = lost,
9687
};
9688
9689
perf_event_header__init_id(&lost_samples_event.header, &sample, event);
9690
9691
ret = perf_output_begin(&handle, &sample, event,
9692
lost_samples_event.header.size);
9693
if (ret)
9694
return;
9695
9696
perf_output_put(&handle, lost_samples_event);
9697
perf_event__output_id_sample(event, &handle, &sample);
9698
perf_output_end(&handle);
9699
}
9700
9701
/*
9702
* context_switch tracking
9703
*/
9704
9705
struct perf_switch_event {
9706
struct task_struct *task;
9707
struct task_struct *next_prev;
9708
9709
struct {
9710
struct perf_event_header header;
9711
u32 next_prev_pid;
9712
u32 next_prev_tid;
9713
} event_id;
9714
};
9715
9716
static int perf_event_switch_match(struct perf_event *event)
9717
{
9718
return event->attr.context_switch;
9719
}
9720
9721
static void perf_event_switch_output(struct perf_event *event, void *data)
9722
{
9723
struct perf_switch_event *se = data;
9724
struct perf_output_handle handle;
9725
struct perf_sample_data sample;
9726
int ret;
9727
9728
if (!perf_event_switch_match(event))
9729
return;
9730
9731
/* Only CPU-wide events are allowed to see next/prev pid/tid */
9732
if (event->ctx->task) {
9733
se->event_id.header.type = PERF_RECORD_SWITCH;
9734
se->event_id.header.size = sizeof(se->event_id.header);
9735
} else {
9736
se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
9737
se->event_id.header.size = sizeof(se->event_id);
9738
se->event_id.next_prev_pid =
9739
perf_event_pid(event, se->next_prev);
9740
se->event_id.next_prev_tid =
9741
perf_event_tid(event, se->next_prev);
9742
}
9743
9744
perf_event_header__init_id(&se->event_id.header, &sample, event);
9745
9746
ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size);
9747
if (ret)
9748
return;
9749
9750
if (event->ctx->task)
9751
perf_output_put(&handle, se->event_id.header);
9752
else
9753
perf_output_put(&handle, se->event_id);
9754
9755
perf_event__output_id_sample(event, &handle, &sample);
9756
9757
perf_output_end(&handle);
9758
}
9759
9760
static void perf_event_switch(struct task_struct *task,
9761
struct task_struct *next_prev, bool sched_in)
9762
{
9763
struct perf_switch_event switch_event;
9764
9765
/* N.B. caller checks nr_switch_events != 0 */
9766
9767
switch_event = (struct perf_switch_event){
9768
.task = task,
9769
.next_prev = next_prev,
9770
.event_id = {
9771
.header = {
9772
/* .type */
9773
.misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
9774
/* .size */
9775
},
9776
/* .next_prev_pid */
9777
/* .next_prev_tid */
9778
},
9779
};
9780
9781
if (!sched_in && task_is_runnable(task)) {
9782
switch_event.event_id.header.misc |=
9783
PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
9784
}
9785
9786
perf_iterate_sb(perf_event_switch_output, &switch_event, NULL);
9787
}
9788
9789
/*
9790
* IRQ throttle logging
9791
*/
9792
9793
static void perf_log_throttle(struct perf_event *event, int enable)
9794
{
9795
struct perf_output_handle handle;
9796
struct perf_sample_data sample;
9797
int ret;
9798
9799
struct {
9800
struct perf_event_header header;
9801
u64 time;
9802
u64 id;
9803
u64 stream_id;
9804
} throttle_event = {
9805
.header = {
9806
.type = PERF_RECORD_THROTTLE,
9807
.misc = 0,
9808
.size = sizeof(throttle_event),
9809
},
9810
.time = perf_event_clock(event),
9811
.id = primary_event_id(event),
9812
.stream_id = event->id,
9813
};
9814
9815
if (enable)
9816
throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
9817
9818
perf_event_header__init_id(&throttle_event.header, &sample, event);
9819
9820
ret = perf_output_begin(&handle, &sample, event,
9821
throttle_event.header.size);
9822
if (ret)
9823
return;
9824
9825
perf_output_put(&handle, throttle_event);
9826
perf_event__output_id_sample(event, &handle, &sample);
9827
perf_output_end(&handle);
9828
}
9829
9830
/*
9831
* ksymbol register/unregister tracking
9832
*/
9833
9834
struct perf_ksymbol_event {
9835
const char *name;
9836
int name_len;
9837
struct {
9838
struct perf_event_header header;
9839
u64 addr;
9840
u32 len;
9841
u16 ksym_type;
9842
u16 flags;
9843
} event_id;
9844
};
9845
9846
static int perf_event_ksymbol_match(struct perf_event *event)
9847
{
9848
return event->attr.ksymbol;
9849
}
9850
9851
static void perf_event_ksymbol_output(struct perf_event *event, void *data)
9852
{
9853
struct perf_ksymbol_event *ksymbol_event = data;
9854
struct perf_output_handle handle;
9855
struct perf_sample_data sample;
9856
int ret;
9857
9858
if (!perf_event_ksymbol_match(event))
9859
return;
9860
9861
perf_event_header__init_id(&ksymbol_event->event_id.header,
9862
&sample, event);
9863
ret = perf_output_begin(&handle, &sample, event,
9864
ksymbol_event->event_id.header.size);
9865
if (ret)
9866
return;
9867
9868
perf_output_put(&handle, ksymbol_event->event_id);
9869
__output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);
9870
perf_event__output_id_sample(event, &handle, &sample);
9871
9872
perf_output_end(&handle);
9873
}
9874
9875
void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
9876
const char *sym)
9877
{
9878
struct perf_ksymbol_event ksymbol_event;
9879
char name[KSYM_NAME_LEN];
9880
u16 flags = 0;
9881
int name_len;
9882
9883
if (!atomic_read(&nr_ksymbol_events))
9884
return;
9885
9886
if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
9887
ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
9888
goto err;
9889
9890
strscpy(name, sym);
9891
name_len = strlen(name) + 1;
9892
while (!IS_ALIGNED(name_len, sizeof(u64)))
9893
name[name_len++] = '\0';
9894
BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));
9895
9896
if (unregister)
9897
flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;
9898
9899
ksymbol_event = (struct perf_ksymbol_event){
9900
.name = name,
9901
.name_len = name_len,
9902
.event_id = {
9903
.header = {
9904
.type = PERF_RECORD_KSYMBOL,
9905
.size = sizeof(ksymbol_event.event_id) +
9906
name_len,
9907
},
9908
.addr = addr,
9909
.len = len,
9910
.ksym_type = ksym_type,
9911
.flags = flags,
9912
},
9913
};
9914
9915
perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
9916
return;
9917
err:
9918
WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
9919
}
9920
9921
/*
9922
* bpf program load/unload tracking
9923
*/
9924
9925
struct perf_bpf_event {
9926
struct bpf_prog *prog;
9927
struct {
9928
struct perf_event_header header;
9929
u16 type;
9930
u16 flags;
9931
u32 id;
9932
u8 tag[BPF_TAG_SIZE];
9933
} event_id;
9934
};
9935
9936
static int perf_event_bpf_match(struct perf_event *event)
9937
{
9938
return event->attr.bpf_event;
9939
}
9940
9941
static void perf_event_bpf_output(struct perf_event *event, void *data)
9942
{
9943
struct perf_bpf_event *bpf_event = data;
9944
struct perf_output_handle handle;
9945
struct perf_sample_data sample;
9946
int ret;
9947
9948
if (!perf_event_bpf_match(event))
9949
return;
9950
9951
perf_event_header__init_id(&bpf_event->event_id.header,
9952
&sample, event);
9953
ret = perf_output_begin(&handle, &sample, event,
9954
bpf_event->event_id.header.size);
9955
if (ret)
9956
return;
9957
9958
perf_output_put(&handle, bpf_event->event_id);
9959
perf_event__output_id_sample(event, &handle, &sample);
9960
9961
perf_output_end(&handle);
9962
}
9963
9964
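/*
 * Emit KSYMBOL records for a BPF program image. The loop over the
 * sub-programs starts at index 1 because, as far as this code is
 * concerned, func[0] corresponds to the main image already reported via
 * prog->bpf_func above.
 */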
static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
9965
enum perf_bpf_event_type type)
9966
{
9967
bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
9968
int i;
9969
9970
perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
9971
(u64)(unsigned long)prog->bpf_func,
9972
prog->jited_len, unregister,
9973
prog->aux->ksym.name);
9974
9975
for (i = 1; i < prog->aux->func_cnt; i++) {
9976
struct bpf_prog *subprog = prog->aux->func[i];
9977
9978
perf_event_ksymbol(
9979
PERF_RECORD_KSYMBOL_TYPE_BPF,
9980
(u64)(unsigned long)subprog->bpf_func,
9981
subprog->jited_len, unregister,
9982
subprog->aux->ksym.name);
9983
}
9984
}
9985
9986
void perf_event_bpf_event(struct bpf_prog *prog,
9987
enum perf_bpf_event_type type,
9988
u16 flags)
9989
{
9990
struct perf_bpf_event bpf_event;
9991
9992
switch (type) {
9993
case PERF_BPF_EVENT_PROG_LOAD:
9994
case PERF_BPF_EVENT_PROG_UNLOAD:
9995
if (atomic_read(&nr_ksymbol_events))
9996
perf_event_bpf_emit_ksymbols(prog, type);
9997
break;
9998
default:
9999
return;
10000
}
10001
10002
if (!atomic_read(&nr_bpf_events))
10003
return;
10004
10005
bpf_event = (struct perf_bpf_event){
10006
.prog = prog,
10007
.event_id = {
10008
.header = {
10009
.type = PERF_RECORD_BPF_EVENT,
10010
.size = sizeof(bpf_event.event_id),
10011
},
10012
.type = type,
10013
.flags = flags,
10014
.id = prog->aux->id,
10015
},
10016
};
10017
10018
BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));
10019
10020
memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);
10021
perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
10022
}
10023
10024
struct perf_callchain_deferred_event {
10025
struct unwind_stacktrace *trace;
10026
struct {
10027
struct perf_event_header header;
10028
u64 cookie;
10029
u64 nr;
10030
u64 ips[];
10031
} event;
10032
};
10033
10034
static void perf_callchain_deferred_output(struct perf_event *event, void *data)
10035
{
10036
struct perf_callchain_deferred_event *deferred_event = data;
10037
struct perf_output_handle handle;
10038
struct perf_sample_data sample;
10039
int ret, size = deferred_event->event.header.size;
10040
10041
if (!event->attr.defer_output)
10042
return;
10043
10044
/* XXX do we really need sample_id_all for this ??? */
10045
perf_event_header__init_id(&deferred_event->event.header, &sample, event);
10046
10047
ret = perf_output_begin(&handle, &sample, event,
10048
deferred_event->event.header.size);
10049
if (ret)
10050
goto out;
10051
10052
perf_output_put(&handle, deferred_event->event);
10053
for (int i = 0; i < deferred_event->trace->nr; i++) {
10054
u64 entry = deferred_event->trace->entries[i];
10055
perf_output_put(&handle, entry);
10056
}
10057
perf_event__output_id_sample(event, &handle, &sample);
10058
10059
perf_output_end(&handle);
10060
out:
10061
deferred_event->event.header.size = size;
10062
}
10063
10064
static void perf_unwind_deferred_callback(struct unwind_work *work,
10065
struct unwind_stacktrace *trace, u64 cookie)
10066
{
10067
struct perf_callchain_deferred_event deferred_event = {
10068
.trace = trace,
10069
.event = {
10070
.header = {
10071
.type = PERF_RECORD_CALLCHAIN_DEFERRED,
10072
.misc = PERF_RECORD_MISC_USER,
10073
.size = sizeof(deferred_event.event) +
10074
(trace->nr * sizeof(u64)),
10075
},
10076
.cookie = cookie,
10077
.nr = trace->nr,
10078
},
10079
};
10080
10081
perf_iterate_sb(perf_callchain_deferred_output, &deferred_event, NULL);
10082
}
10083
10084
struct perf_text_poke_event {
10085
const void *old_bytes;
10086
const void *new_bytes;
10087
size_t pad;
10088
u16 old_len;
10089
u16 new_len;
10090
10091
struct {
10092
struct perf_event_header header;
10093
10094
u64 addr;
10095
} event_id;
10096
};
10097
10098
static int perf_event_text_poke_match(struct perf_event *event)
10099
{
10100
return event->attr.text_poke;
10101
}
10102
10103
static void perf_event_text_poke_output(struct perf_event *event, void *data)
10104
{
10105
struct perf_text_poke_event *text_poke_event = data;
10106
struct perf_output_handle handle;
10107
struct perf_sample_data sample;
10108
u64 padding = 0;
10109
int ret;
10110
10111
if (!perf_event_text_poke_match(event))
10112
return;
10113
10114
perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event);
10115
10116
ret = perf_output_begin(&handle, &sample, event,
10117
text_poke_event->event_id.header.size);
10118
if (ret)
10119
return;
10120
10121
perf_output_put(&handle, text_poke_event->event_id);
10122
perf_output_put(&handle, text_poke_event->old_len);
10123
perf_output_put(&handle, text_poke_event->new_len);
10124
10125
__output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len);
10126
__output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len);
10127
10128
if (text_poke_event->pad)
10129
__output_copy(&handle, &padding, text_poke_event->pad);
10130
10131
perf_event__output_id_sample(event, &handle, &sample);
10132
10133
perf_output_end(&handle);
10134
}
10135
10136
void perf_event_text_poke(const void *addr, const void *old_bytes,
10137
size_t old_len, const void *new_bytes, size_t new_len)
10138
{
10139
struct perf_text_poke_event text_poke_event;
10140
size_t tot, pad;
10141
10142
if (!atomic_read(&nr_text_poke_events))
10143
return;
10144
10145
tot = sizeof(text_poke_event.old_len) + old_len;
10146
tot += sizeof(text_poke_event.new_len) + new_len;
10147
pad = ALIGN(tot, sizeof(u64)) - tot;
10148
10149
text_poke_event = (struct perf_text_poke_event){
10150
.old_bytes = old_bytes,
10151
.new_bytes = new_bytes,
10152
.pad = pad,
10153
.old_len = old_len,
10154
.new_len = new_len,
10155
.event_id = {
10156
.header = {
10157
.type = PERF_RECORD_TEXT_POKE,
10158
.misc = PERF_RECORD_MISC_KERNEL,
10159
.size = sizeof(text_poke_event.event_id) + tot + pad,
10160
},
10161
.addr = (unsigned long)addr,
10162
},
10163
};
10164
10165
perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL);
10166
}
10167
10168
void perf_event_itrace_started(struct perf_event *event)
10169
{
10170
WRITE_ONCE(event->attach_state, event->attach_state | PERF_ATTACH_ITRACE);
10171
}
10172
10173
static void perf_log_itrace_start(struct perf_event *event)
10174
{
10175
struct perf_output_handle handle;
10176
struct perf_sample_data sample;
10177
struct perf_aux_event {
10178
struct perf_event_header header;
10179
u32 pid;
10180
u32 tid;
10181
} rec;
10182
int ret;
10183
10184
if (event->parent)
10185
event = event->parent;
10186
10187
if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
10188
event->attach_state & PERF_ATTACH_ITRACE)
10189
return;
10190
10191
rec.header.type = PERF_RECORD_ITRACE_START;
10192
rec.header.misc = 0;
10193
rec.header.size = sizeof(rec);
10194
rec.pid = perf_event_pid(event, current);
10195
rec.tid = perf_event_tid(event, current);
10196
10197
perf_event_header__init_id(&rec.header, &sample, event);
10198
ret = perf_output_begin(&handle, &sample, event, rec.header.size);
10199
10200
if (ret)
10201
return;
10202
10203
perf_output_put(&handle, rec);
10204
perf_event__output_id_sample(event, &handle, &sample);
10205
10206
perf_output_end(&handle);
10207
}
10208
10209
void perf_report_aux_output_id(struct perf_event *event, u64 hw_id)
10210
{
10211
struct perf_output_handle handle;
10212
struct perf_sample_data sample;
10213
struct perf_aux_event {
10214
struct perf_event_header header;
10215
u64 hw_id;
10216
} rec;
10217
int ret;
10218
10219
if (event->parent)
10220
event = event->parent;
10221
10222
rec.header.type = PERF_RECORD_AUX_OUTPUT_HW_ID;
10223
rec.header.misc = 0;
10224
rec.header.size = sizeof(rec);
10225
rec.hw_id = hw_id;
10226
10227
perf_event_header__init_id(&rec.header, &sample, event);
10228
ret = perf_output_begin(&handle, &sample, event, rec.header.size);
10229
10230
if (ret)
10231
return;
10232
10233
perf_output_put(&handle, rec);
10234
perf_event__output_id_sample(event, &handle, &sample);
10235
10236
perf_output_end(&handle);
10237
}
10238
EXPORT_SYMBOL_GPL(perf_report_aux_output_id);
10239
10240
static int
10241
__perf_event_account_interrupt(struct perf_event *event, int throttle)
10242
{
10243
struct hw_perf_event *hwc = &event->hw;
10244
int ret = 0;
10245
u64 seq;
10246
10247
seq = __this_cpu_read(perf_throttled_seq);
10248
if (seq != hwc->interrupts_seq) {
10249
hwc->interrupts_seq = seq;
10250
hwc->interrupts = 1;
10251
} else {
10252
hwc->interrupts++;
10253
}
10254
10255
if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) {
10256
__this_cpu_inc(perf_throttled_count);
10257
tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
10258
perf_event_throttle_group(event);
10259
ret = 1;
10260
}
10261
10262
if (event->attr.freq) {
10263
u64 now = perf_clock();
10264
s64 delta = now - hwc->freq_time_stamp;
10265
10266
hwc->freq_time_stamp = now;
10267
10268
if (delta > 0 && delta < 2*TICK_NSEC)
10269
perf_adjust_period(event, delta, hwc->last_period, true);
10270
}
10271
10272
return ret;
10273
}
10274
10275
int perf_event_account_interrupt(struct perf_event *event)
10276
{
10277
return __perf_event_account_interrupt(event, 1);
10278
}
10279
10280
static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
10281
{
10282
/*
10283
* Due to interrupt latency (AKA "skid"), we may enter the
10284
* kernel before taking an overflow, even if the PMU is only
10285
* counting user events.
10286
*/
10287
if (event->attr.exclude_kernel && !user_mode(regs))
10288
return false;
10289
10290
return true;
10291
}
10292
10293
#ifdef CONFIG_BPF_SYSCALL
10294
static int bpf_overflow_handler(struct perf_event *event,
10295
struct perf_sample_data *data,
10296
struct pt_regs *regs)
10297
{
10298
struct bpf_perf_event_data_kern ctx = {
10299
.data = data,
10300
.event = event,
10301
};
10302
struct bpf_prog *prog;
10303
int ret = 0;
10304
10305
ctx.regs = perf_arch_bpf_user_pt_regs(regs);
10306
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
10307
goto out;
10308
rcu_read_lock();
10309
prog = READ_ONCE(event->prog);
10310
if (prog) {
10311
perf_prepare_sample(data, event, regs);
10312
ret = bpf_prog_run(prog, &ctx);
10313
}
10314
rcu_read_unlock();
10315
out:
10316
__this_cpu_dec(bpf_prog_active);
10317
10318
return ret;
10319
}
10320
10321
static inline int perf_event_set_bpf_handler(struct perf_event *event,
10322
struct bpf_prog *prog,
10323
u64 bpf_cookie)
10324
{
10325
if (event->overflow_handler_context)
10326
/* hw breakpoint or kernel counter */
10327
return -EINVAL;
10328
10329
if (event->prog)
10330
return -EEXIST;
10331
10332
if (prog->type != BPF_PROG_TYPE_PERF_EVENT)
10333
return -EINVAL;
10334
10335
if (event->attr.precise_ip &&
10336
prog->call_get_stack &&
10337
(!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) ||
10338
event->attr.exclude_callchain_kernel ||
10339
event->attr.exclude_callchain_user)) {
10340
/*
10341
* On perf_event with precise_ip, calling bpf_get_stack()
10342
* may trigger unwinder warnings and occasional crashes.
10343
* bpf_get_[stack|stackid] works around this issue by using
10344
* callchain attached to perf_sample_data. If the
10345
* perf_event does not full (kernel and user) callchain
10346
* attached to perf_sample_data, do not allow attaching BPF
10347
* program that calls bpf_get_[stack|stackid].
10348
*/
10349
return -EPROTO;
10350
}
10351
10352
event->prog = prog;
10353
event->bpf_cookie = bpf_cookie;
10354
return 0;
10355
}
10356
10357
static inline void perf_event_free_bpf_handler(struct perf_event *event)
10358
{
10359
struct bpf_prog *prog = event->prog;
10360
10361
if (!prog)
10362
return;
10363
10364
event->prog = NULL;
10365
bpf_prog_put(prog);
10366
}
10367
#else
10368
static inline int bpf_overflow_handler(struct perf_event *event,
10369
struct perf_sample_data *data,
10370
struct pt_regs *regs)
10371
{
10372
return 1;
10373
}
10374
10375
static inline int perf_event_set_bpf_handler(struct perf_event *event,
10376
struct bpf_prog *prog,
10377
u64 bpf_cookie)
10378
{
10379
return -EOPNOTSUPP;
10380
}
10381
10382
static inline void perf_event_free_bpf_handler(struct perf_event *event)
10383
{
10384
}
10385
#endif
10386
10387
/*
10388
* Generic event overflow handling, sampling.
10389
*/
10390
10391
static int __perf_event_overflow(struct perf_event *event,
10392
int throttle, struct perf_sample_data *data,
10393
struct pt_regs *regs)
10394
{
10395
int events = atomic_read(&event->event_limit);
10396
int ret = 0;
10397
10398
/*
10399
* Non-sampling counters might still use the PMI to fold short
10400
* hardware counters, ignore those.
10401
*/
10402
if (unlikely(!is_sampling_event(event)))
10403
return 0;
10404
10405
ret = __perf_event_account_interrupt(event, throttle);
10406
10407
if (event->attr.aux_pause)
10408
perf_event_aux_pause(event->aux_event, true);
10409
10410
if (event->prog && event->prog->type == BPF_PROG_TYPE_PERF_EVENT &&
10411
!bpf_overflow_handler(event, data, regs))
10412
goto out;
10413
10414
/*
10415
* XXX event_limit might not quite work as expected on inherited
10416
* events
10417
*/
10418
10419
event->pending_kill = POLL_IN;
10420
if (events && atomic_dec_and_test(&event->event_limit)) {
10421
ret = 1;
10422
event->pending_kill = POLL_HUP;
10423
perf_event_disable_inatomic(event);
10424
event->pmu->stop(event, 0);
10425
}
10426
10427
if (event->attr.sigtrap) {
10428
/*
10429
* The desired behaviour of sigtrap vs invalid samples is a bit
10430
* tricky; on the one hand, one should not loose the SIGTRAP if
10431
* it is the first event, on the other hand, we should also not
10432
* trigger the WARN or override the data address.
10433
*/
10434
bool valid_sample = sample_is_allowed(event, regs);
10435
unsigned int pending_id = 1;
10436
enum task_work_notify_mode notify_mode;
10437
10438
if (regs)
10439
pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1;
10440
10441
notify_mode = in_nmi() ? TWA_NMI_CURRENT : TWA_RESUME;
10442
10443
if (!event->pending_work &&
10444
!task_work_add(current, &event->pending_task, notify_mode)) {
10445
event->pending_work = pending_id;
10446
local_inc(&event->ctx->nr_no_switch_fast);
10447
WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
10448
10449
event->pending_addr = 0;
10450
if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR))
10451
event->pending_addr = data->addr;
10452
10453
} else if (event->attr.exclude_kernel && valid_sample) {
10454
/*
10455
* Should not be able to return to user space without
10456
* consuming pending_work; with exceptions:
10457
*
10458
* 1. Where !exclude_kernel, events can overflow again
10459
* in the kernel without returning to user space.
10460
*
10461
* 2. Events that can overflow again before the IRQ-
10462
* work without user space progress (e.g. hrtimer).
10463
* To approximate progress (with false negatives),
10464
* check 32-bit hash of the current IP.
10465
*/
10466
WARN_ON_ONCE(event->pending_work != pending_id);
10467
}
10468
}
10469
10470
READ_ONCE(event->overflow_handler)(event, data, regs);
10471
10472
if (*perf_event_fasync(event) && event->pending_kill) {
10473
event->pending_wakeup = 1;
10474
irq_work_queue(&event->pending_irq);
10475
}
10476
out:
10477
if (event->attr.aux_resume)
10478
perf_event_aux_pause(event->aux_event, false);
10479
10480
return ret;
10481
}
10482
10483
int perf_event_overflow(struct perf_event *event,
10484
struct perf_sample_data *data,
10485
struct pt_regs *regs)
10486
{
10487
return __perf_event_overflow(event, 1, data, regs);
10488
}
10489
10490
/*
10491
* Generic software event infrastructure
10492
*/
10493
10494
struct swevent_htable {
10495
struct swevent_hlist *swevent_hlist;
10496
struct mutex hlist_mutex;
10497
int hlist_refcount;
10498
};
10499
static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
10500
10501
/*
10502
* We directly increment event->count and keep a second value in
10503
* event->hw.period_left to count intervals. This period event
10504
* is kept in the range [-sample_period, 0] so that we can use the
10505
* sign as trigger.
10506
*/
10507
10508
u64 perf_swevent_set_period(struct perf_event *event)
10509
{
10510
struct hw_perf_event *hwc = &event->hw;
10511
u64 period = hwc->last_period;
10512
u64 nr, offset;
10513
s64 old, val;
10514
10515
hwc->last_period = hwc->sample_period;
10516
10517
old = local64_read(&hwc->period_left);
10518
do {
10519
val = old;
10520
if (val < 0)
10521
return 0;
10522
10523
nr = div64_u64(period + val, period);
10524
offset = nr * period;
10525
val -= offset;
10526
} while (!local64_try_cmpxchg(&hwc->period_left, &old, val));
10527
10528
return nr;
10529
}
10530
10531
static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
10532
struct perf_sample_data *data,
10533
struct pt_regs *regs)
10534
{
10535
struct hw_perf_event *hwc = &event->hw;
10536
int throttle = 0;
10537
10538
if (!overflow)
10539
overflow = perf_swevent_set_period(event);
10540
10541
if (hwc->interrupts == MAX_INTERRUPTS)
10542
return;
10543
10544
for (; overflow; overflow--) {
10545
if (__perf_event_overflow(event, throttle,
10546
data, regs)) {
10547
/*
10548
* We inhibit the overflow from happening when
10549
* hwc->interrupts == MAX_INTERRUPTS.
10550
*/
10551
break;
10552
}
10553
throttle = 1;
10554
}
10555
}
10556
10557
static void perf_swevent_event(struct perf_event *event, u64 nr,
10558
struct perf_sample_data *data,
10559
struct pt_regs *regs)
10560
{
10561
struct hw_perf_event *hwc = &event->hw;
10562
10563
local64_add(nr, &event->count);
10564
10565
if (!regs)
10566
return;
10567
10568
if (!is_sampling_event(event))
10569
return;
10570
10571
if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
10572
data->period = nr;
10573
return perf_swevent_overflow(event, 1, data, regs);
10574
} else
10575
data->period = event->hw.last_period;
10576
10577
if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
10578
return perf_swevent_overflow(event, 1, data, regs);
10579
10580
if (local64_add_negative(nr, &hwc->period_left))
10581
return;
10582
10583
perf_swevent_overflow(event, 0, data, regs);
10584
}
10585
10586
int perf_exclude_event(struct perf_event *event, struct pt_regs *regs)
10587
{
10588
if (event->hw.state & PERF_HES_STOPPED)
10589
return 1;
10590
10591
if (regs) {
10592
if (event->attr.exclude_user && user_mode(regs))
10593
return 1;
10594
10595
if (event->attr.exclude_kernel && !user_mode(regs))
10596
return 1;
10597
}
10598
10599
return 0;
10600
}
10601
10602
static int perf_swevent_match(struct perf_event *event,
10603
enum perf_type_id type,
10604
u32 event_id,
10605
struct perf_sample_data *data,
10606
struct pt_regs *regs)
10607
{
10608
if (event->attr.type != type)
10609
return 0;
10610
10611
if (event->attr.config != event_id)
10612
return 0;
10613
10614
if (perf_exclude_event(event, regs))
10615
return 0;
10616
10617
return 1;
10618
}
10619
10620
static inline u64 swevent_hash(u64 type, u32 event_id)
10621
{
10622
u64 val = event_id | (type << 32);
10623
10624
return hash_64(val, SWEVENT_HLIST_BITS);
10625
}
10626
10627
static inline struct hlist_head *
10628
__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
10629
{
10630
u64 hash = swevent_hash(type, event_id);
10631
10632
return &hlist->heads[hash];
10633
}
10634
10635
/* For the read side: events when they trigger */
10636
static inline struct hlist_head *
10637
find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
10638
{
10639
struct swevent_hlist *hlist;
10640
10641
hlist = rcu_dereference(swhash->swevent_hlist);
10642
if (!hlist)
10643
return NULL;
10644
10645
return __find_swevent_head(hlist, type, event_id);
10646
}
10647
10648
/* For the event head insertion and removal in the hlist */
10649
static inline struct hlist_head *
10650
find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
10651
{
10652
struct swevent_hlist *hlist;
10653
u32 event_id = event->attr.config;
10654
u64 type = event->attr.type;
10655
10656
/*
10657
* Event scheduling is always serialized against hlist allocation
10658
* and release. Which makes the protected version suitable here.
10659
* The context lock guarantees that.
10660
*/
10661
hlist = rcu_dereference_protected(swhash->swevent_hlist,
10662
lockdep_is_held(&event->ctx->lock));
10663
if (!hlist)
10664
return NULL;
10665
10666
return __find_swevent_head(hlist, type, event_id);
10667
}
10668
10669
static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
10670
u64 nr,
10671
struct perf_sample_data *data,
10672
struct pt_regs *regs)
10673
{
10674
struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
10675
struct perf_event *event;
10676
struct hlist_head *head;
10677
10678
rcu_read_lock();
10679
head = find_swevent_head_rcu(swhash, type, event_id);
10680
if (!head)
10681
goto end;
10682
10683
hlist_for_each_entry_rcu(event, head, hlist_entry) {
10684
if (perf_swevent_match(event, type, event_id, data, regs))
10685
perf_swevent_event(event, nr, data, regs);
10686
}
10687
end:
10688
rcu_read_unlock();
10689
}
10690
10691
DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
10692
10693
int perf_swevent_get_recursion_context(void)
10694
{
10695
return get_recursion_context(current->perf_recursion);
10696
}
10697
EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
10698
10699
void perf_swevent_put_recursion_context(int rctx)
10700
{
10701
put_recursion_context(current->perf_recursion, rctx);
10702
}
10703
10704
void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
10705
{
10706
struct perf_sample_data data;
10707
10708
if (WARN_ON_ONCE(!regs))
10709
return;
10710
10711
perf_sample_data_init(&data, addr, 0);
10712
do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
10713
}
10714
10715
void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
10716
{
10717
int rctx;
10718
10719
preempt_disable_notrace();
10720
rctx = perf_swevent_get_recursion_context();
10721
if (unlikely(rctx < 0))
10722
goto fail;
10723
10724
___perf_sw_event(event_id, nr, regs, addr);
10725
10726
perf_swevent_put_recursion_context(rctx);
10727
fail:
10728
preempt_enable_notrace();
10729
}
10730
10731
static void perf_swevent_read(struct perf_event *event)
10732
{
10733
}
10734
10735
static int perf_swevent_add(struct perf_event *event, int flags)
10736
{
10737
struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
10738
struct hw_perf_event *hwc = &event->hw;
10739
struct hlist_head *head;
10740
10741
if (is_sampling_event(event)) {
10742
hwc->last_period = hwc->sample_period;
10743
perf_swevent_set_period(event);
10744
}
10745
10746
hwc->state = !(flags & PERF_EF_START);
10747
10748
head = find_swevent_head(swhash, event);
10749
if (WARN_ON_ONCE(!head))
10750
return -EINVAL;
10751
10752
hlist_add_head_rcu(&event->hlist_entry, head);
10753
perf_event_update_userpage(event);
10754
10755
return 0;
10756
}
10757
10758
static void perf_swevent_del(struct perf_event *event, int flags)
10759
{
10760
hlist_del_rcu(&event->hlist_entry);
10761
}
10762
10763
static void perf_swevent_start(struct perf_event *event, int flags)
10764
{
10765
event->hw.state = 0;
10766
}
10767
10768
static void perf_swevent_stop(struct perf_event *event, int flags)
10769
{
10770
event->hw.state = PERF_HES_STOPPED;
10771
}
10772
10773
/* Deref the hlist from the update side */
10774
static inline struct swevent_hlist *
10775
swevent_hlist_deref(struct swevent_htable *swhash)
10776
{
10777
return rcu_dereference_protected(swhash->swevent_hlist,
10778
lockdep_is_held(&swhash->hlist_mutex));
10779
}
10780
10781
static void swevent_hlist_release(struct swevent_htable *swhash)
10782
{
10783
struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
10784
10785
if (!hlist)
10786
return;
10787
10788
RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
10789
kfree_rcu(hlist, rcu_head);
10790
}
10791
10792
static void swevent_hlist_put_cpu(int cpu)
10793
{
10794
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
10795
10796
mutex_lock(&swhash->hlist_mutex);
10797
10798
if (!--swhash->hlist_refcount)
10799
swevent_hlist_release(swhash);
10800
10801
mutex_unlock(&swhash->hlist_mutex);
10802
}
10803
10804
static void swevent_hlist_put(void)
10805
{
10806
int cpu;
10807
10808
for_each_possible_cpu(cpu)
10809
swevent_hlist_put_cpu(cpu);
10810
}
10811
10812
static int swevent_hlist_get_cpu(int cpu)
10813
{
10814
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
10815
int err = 0;
10816
10817
mutex_lock(&swhash->hlist_mutex);
10818
if (!swevent_hlist_deref(swhash) &&
10819
cpumask_test_cpu(cpu, perf_online_mask)) {
10820
struct swevent_hlist *hlist;
10821
10822
hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
10823
if (!hlist) {
10824
err = -ENOMEM;
10825
goto exit;
10826
}
10827
rcu_assign_pointer(swhash->swevent_hlist, hlist);
10828
}
10829
swhash->hlist_refcount++;
10830
exit:
10831
mutex_unlock(&swhash->hlist_mutex);
10832
10833
return err;
10834
}
10835
10836
static int swevent_hlist_get(void)
10837
{
10838
int err, cpu, failed_cpu;
10839
10840
mutex_lock(&pmus_lock);
10841
for_each_possible_cpu(cpu) {
10842
err = swevent_hlist_get_cpu(cpu);
10843
if (err) {
10844
failed_cpu = cpu;
10845
goto fail;
10846
}
10847
}
10848
mutex_unlock(&pmus_lock);
10849
return 0;
10850
fail:
10851
for_each_possible_cpu(cpu) {
10852
if (cpu == failed_cpu)
10853
break;
10854
swevent_hlist_put_cpu(cpu);
10855
}
10856
mutex_unlock(&pmus_lock);
10857
return err;
10858
}
10859
10860
struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
10861
10862
static void sw_perf_event_destroy(struct perf_event *event)
10863
{
10864
u64 event_id = event->attr.config;
10865
10866
WARN_ON(event->parent);
10867
10868
static_key_slow_dec(&perf_swevent_enabled[event_id]);
10869
swevent_hlist_put();
10870
}
10871
10872
static struct pmu perf_cpu_clock; /* fwd declaration */
10873
static struct pmu perf_task_clock;
10874
10875
static int perf_swevent_init(struct perf_event *event)
10876
{
10877
u64 event_id = event->attr.config;
10878
10879
if (event->attr.type != PERF_TYPE_SOFTWARE)
10880
return -ENOENT;
10881
10882
/*
10883
* no branch sampling for software events
10884
*/
10885
if (has_branch_stack(event))
10886
return -EOPNOTSUPP;
10887
10888
switch (event_id) {
10889
case PERF_COUNT_SW_CPU_CLOCK:
10890
event->attr.type = perf_cpu_clock.type;
10891
return -ENOENT;
10892
case PERF_COUNT_SW_TASK_CLOCK:
10893
event->attr.type = perf_task_clock.type;
10894
return -ENOENT;
10895
10896
default:
10897
break;
10898
}
10899
10900
if (event_id >= PERF_COUNT_SW_MAX)
10901
return -ENOENT;
10902
10903
if (!event->parent) {
10904
int err;
10905
10906
err = swevent_hlist_get();
10907
if (err)
10908
return err;
10909
10910
static_key_slow_inc(&perf_swevent_enabled[event_id]);
10911
event->destroy = sw_perf_event_destroy;
10912
}
10913
10914
return 0;
10915
}
10916
10917
static struct pmu perf_swevent = {
10918
.task_ctx_nr = perf_sw_context,
10919
10920
.capabilities = PERF_PMU_CAP_NO_NMI,
10921
10922
.event_init = perf_swevent_init,
10923
.add = perf_swevent_add,
10924
.del = perf_swevent_del,
10925
.start = perf_swevent_start,
10926
.stop = perf_swevent_stop,
10927
.read = perf_swevent_read,
10928
};
10929
10930
#ifdef CONFIG_EVENT_TRACING
10931
10932
static void tp_perf_event_destroy(struct perf_event *event)
10933
{
10934
perf_trace_destroy(event);
10935
}
10936
10937
static int perf_tp_event_init(struct perf_event *event)
10938
{
10939
int err;
10940
10941
if (event->attr.type != PERF_TYPE_TRACEPOINT)
10942
return -ENOENT;
10943
10944
/*
10945
* no branch sampling for tracepoint events
10946
*/
10947
if (has_branch_stack(event))
10948
return -EOPNOTSUPP;
10949
10950
err = perf_trace_init(event);
10951
if (err)
10952
return err;
10953
10954
event->destroy = tp_perf_event_destroy;
10955
10956
return 0;
10957
}
10958
10959
static struct pmu perf_tracepoint = {
10960
.task_ctx_nr = perf_sw_context,
10961
10962
.event_init = perf_tp_event_init,
10963
.add = perf_trace_add,
10964
.del = perf_trace_del,
10965
.start = perf_swevent_start,
10966
.stop = perf_swevent_stop,
10967
.read = perf_swevent_read,
10968
};
10969
10970
static int perf_tp_filter_match(struct perf_event *event,
10971
struct perf_raw_record *raw)
10972
{
10973
void *record = raw->frag.data;
10974
10975
/* only top level events have filters set */
10976
if (event->parent)
10977
event = event->parent;
10978
10979
if (likely(!event->filter) || filter_match_preds(event->filter, record))
10980
return 1;
10981
return 0;
10982
}
10983
10984
static int perf_tp_event_match(struct perf_event *event,
10985
struct perf_raw_record *raw,
10986
struct pt_regs *regs)
10987
{
10988
if (event->hw.state & PERF_HES_STOPPED)
10989
return 0;
10990
/*
10991
* If exclude_kernel, only trace user-space tracepoints (uprobes)
10992
*/
10993
if (event->attr.exclude_kernel && !user_mode(regs))
10994
return 0;
10995
10996
if (!perf_tp_filter_match(event, raw))
10997
return 0;
10998
10999
return 1;
11000
}
11001
11002
void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
11003
struct trace_event_call *call, u64 count,
11004
struct pt_regs *regs, struct hlist_head *head,
11005
struct task_struct *task)
11006
{
11007
if (bpf_prog_array_valid(call)) {
11008
*(struct pt_regs **)raw_data = regs;
11009
if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
11010
perf_swevent_put_recursion_context(rctx);
11011
return;
11012
}
11013
}
11014
perf_tp_event(call->event.type, count, raw_data, size, regs, head,
11015
rctx, task);
11016
}
11017
EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
11018
11019
static void __perf_tp_event_target_task(u64 count, void *record,
11020
struct pt_regs *regs,
11021
struct perf_sample_data *data,
11022
struct perf_raw_record *raw,
11023
struct perf_event *event)
11024
{
11025
struct trace_entry *entry = record;
11026
11027
if (event->attr.config != entry->type)
11028
return;
11029
/* Cannot deliver synchronous signal to other task. */
11030
if (event->attr.sigtrap)
11031
return;
11032
if (perf_tp_event_match(event, raw, regs)) {
11033
perf_sample_data_init(data, 0, 0);
11034
perf_sample_save_raw_data(data, event, raw);
11035
perf_swevent_event(event, count, data, regs);
11036
}
11037
}
11038
11039
static void perf_tp_event_target_task(u64 count, void *record,
11040
struct pt_regs *regs,
11041
struct perf_sample_data *data,
11042
struct perf_raw_record *raw,
11043
struct perf_event_context *ctx)
11044
{
11045
unsigned int cpu = smp_processor_id();
11046
struct pmu *pmu = &perf_tracepoint;
11047
struct perf_event *event, *sibling;
11048
11049
perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) {
11050
__perf_tp_event_target_task(count, record, regs, data, raw, event);
11051
for_each_sibling_event(sibling, event)
11052
__perf_tp_event_target_task(count, record, regs, data, raw, sibling);
11053
}
11054
11055
perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) {
11056
__perf_tp_event_target_task(count, record, regs, data, raw, event);
11057
for_each_sibling_event(sibling, event)
11058
__perf_tp_event_target_task(count, record, regs, data, raw, sibling);
11059
}
11060
}
11061
11062
void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
11063
struct pt_regs *regs, struct hlist_head *head, int rctx,
11064
struct task_struct *task)
11065
{
11066
struct perf_sample_data data;
11067
struct perf_event *event;
11068
11069
struct perf_raw_record raw = {
11070
.frag = {
11071
.size = entry_size,
11072
.data = record,
11073
},
11074
};
11075
11076
perf_trace_buf_update(record, event_type);
11077
11078
hlist_for_each_entry_rcu(event, head, hlist_entry) {
11079
if (perf_tp_event_match(event, &raw, regs)) {
11080
/*
11081
* Here use the same on-stack perf_sample_data,
11082
* some members in data are event-specific and
11083
* need to be re-computed for different sweveents.
11084
* Re-initialize data->sample_flags safely to avoid
11085
* the problem that next event skips preparing data
11086
* because data->sample_flags is set.
11087
*/
11088
perf_sample_data_init(&data, 0, 0);
11089
perf_sample_save_raw_data(&data, event, &raw);
11090
perf_swevent_event(event, count, &data, regs);
11091
}
11092
}
11093
11094
/*
11095
* If we got specified a target task, also iterate its context and
11096
* deliver this event there too.
11097
*/
11098
if (task && task != current) {
11099
struct perf_event_context *ctx;
11100
11101
rcu_read_lock();
11102
ctx = rcu_dereference(task->perf_event_ctxp);
11103
if (!ctx)
11104
goto unlock;
11105
11106
raw_spin_lock(&ctx->lock);
11107
perf_tp_event_target_task(count, record, regs, &data, &raw, ctx);
11108
raw_spin_unlock(&ctx->lock);
11109
unlock:
11110
rcu_read_unlock();
11111
}
11112
11113
perf_swevent_put_recursion_context(rctx);
11114
}
11115
EXPORT_SYMBOL_GPL(perf_tp_event);
11116
11117
#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
11118
/*
11119
* Flags in config, used by dynamic PMU kprobe and uprobe
11120
* The flags should match following PMU_FORMAT_ATTR().
11121
*
11122
* PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
11123
* if not set, create kprobe/uprobe
11124
*
11125
* The following values specify a reference counter (or semaphore in the
11126
* terminology of tools like dtrace, systemtap, etc.) Userspace Statically
11127
* Defined Tracepoints (USDT). Currently, we use 40 bit for the offset.
11128
*
11129
* PERF_UPROBE_REF_CTR_OFFSET_BITS # of bits in config as th offset
11130
* PERF_UPROBE_REF_CTR_OFFSET_SHIFT # of bits to shift left
11131
*/
11132
enum perf_probe_config {
11133
PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */
11134
PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,
11135
PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,
11136
};
11137
11138
PMU_FORMAT_ATTR(retprobe, "config:0");
11139
#endif
11140
11141
#ifdef CONFIG_KPROBE_EVENTS
11142
static struct attribute *kprobe_attrs[] = {
11143
&format_attr_retprobe.attr,
11144
NULL,
11145
};
11146
11147
static struct attribute_group kprobe_format_group = {
11148
.name = "format",
11149
.attrs = kprobe_attrs,
11150
};
11151
11152
static const struct attribute_group *kprobe_attr_groups[] = {
11153
&kprobe_format_group,
11154
NULL,
11155
};
11156
11157
static int perf_kprobe_event_init(struct perf_event *event);
11158
static struct pmu perf_kprobe = {
11159
.task_ctx_nr = perf_sw_context,
11160
.event_init = perf_kprobe_event_init,
11161
.add = perf_trace_add,
11162
.del = perf_trace_del,
11163
.start = perf_swevent_start,
11164
.stop = perf_swevent_stop,
11165
.read = perf_swevent_read,
11166
.attr_groups = kprobe_attr_groups,
11167
};
11168
11169
static int perf_kprobe_event_init(struct perf_event *event)
11170
{
11171
int err;
11172
bool is_retprobe;
11173
11174
if (event->attr.type != perf_kprobe.type)
11175
return -ENOENT;
11176
11177
if (!perfmon_capable())
11178
return -EACCES;
11179
11180
/*
11181
* no branch sampling for probe events
11182
*/
11183
if (has_branch_stack(event))
11184
return -EOPNOTSUPP;
11185
11186
is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
11187
err = perf_kprobe_init(event, is_retprobe);
11188
if (err)
11189
return err;
11190
11191
event->destroy = perf_kprobe_destroy;
11192
11193
return 0;
11194
}
11195
#endif /* CONFIG_KPROBE_EVENTS */
11196
11197
#ifdef CONFIG_UPROBE_EVENTS
11198
PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63");
11199
11200
static struct attribute *uprobe_attrs[] = {
11201
&format_attr_retprobe.attr,
11202
&format_attr_ref_ctr_offset.attr,
11203
NULL,
11204
};
11205
11206
static struct attribute_group uprobe_format_group = {
11207
.name = "format",
11208
.attrs = uprobe_attrs,
11209
};
11210
11211
static const struct attribute_group *uprobe_attr_groups[] = {
11212
&uprobe_format_group,
11213
NULL,
11214
};
11215
11216
static int perf_uprobe_event_init(struct perf_event *event);
11217
static struct pmu perf_uprobe = {
11218
.task_ctx_nr = perf_sw_context,
11219
.event_init = perf_uprobe_event_init,
11220
.add = perf_trace_add,
11221
.del = perf_trace_del,
11222
.start = perf_swevent_start,
11223
.stop = perf_swevent_stop,
11224
.read = perf_swevent_read,
11225
.attr_groups = uprobe_attr_groups,
11226
};
11227
11228
static int perf_uprobe_event_init(struct perf_event *event)
11229
{
11230
int err;
11231
unsigned long ref_ctr_offset;
11232
bool is_retprobe;
11233
11234
if (event->attr.type != perf_uprobe.type)
11235
return -ENOENT;
11236
11237
if (!capable(CAP_SYS_ADMIN))
11238
return -EACCES;
11239
11240
/*
11241
* no branch sampling for probe events
11242
*/
11243
if (has_branch_stack(event))
11244
return -EOPNOTSUPP;
11245
11246
is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
11247
ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT;
11248
err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe);
11249
if (err)
11250
return err;
11251
11252
event->destroy = perf_uprobe_destroy;
11253
11254
return 0;
11255
}
11256
#endif /* CONFIG_UPROBE_EVENTS */
11257
11258
static inline void perf_tp_register(void)
11259
{
11260
perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
11261
#ifdef CONFIG_KPROBE_EVENTS
11262
perf_pmu_register(&perf_kprobe, "kprobe", -1);
11263
#endif
11264
#ifdef CONFIG_UPROBE_EVENTS
11265
perf_pmu_register(&perf_uprobe, "uprobe", -1);
11266
#endif
11267
}
11268
11269
static void perf_event_free_filter(struct perf_event *event)
11270
{
11271
ftrace_profile_free_filter(event);
11272
}
11273
11274
/*
11275
* returns true if the event is a tracepoint, or a kprobe/upprobe created
11276
* with perf_event_open()
11277
*/
11278
static inline bool perf_event_is_tracing(struct perf_event *event)
11279
{
11280
if (event->pmu == &perf_tracepoint)
11281
return true;
11282
#ifdef CONFIG_KPROBE_EVENTS
11283
if (event->pmu == &perf_kprobe)
11284
return true;
11285
#endif
11286
#ifdef CONFIG_UPROBE_EVENTS
11287
if (event->pmu == &perf_uprobe)
11288
return true;
11289
#endif
11290
return false;
11291
}
11292
11293
static int __perf_event_set_bpf_prog(struct perf_event *event,
11294
struct bpf_prog *prog,
11295
u64 bpf_cookie)
11296
{
11297
bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp;
11298
11299
if (event->state <= PERF_EVENT_STATE_REVOKED)
11300
return -ENODEV;
11301
11302
if (!perf_event_is_tracing(event))
11303
return perf_event_set_bpf_handler(event, prog, bpf_cookie);
11304
11305
is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE;
11306
is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE;
11307
is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
11308
is_syscall_tp = is_syscall_trace_event(event->tp_event);
11309
if (!is_kprobe && !is_uprobe && !is_tracepoint && !is_syscall_tp)
11310
/* bpf programs can only be attached to u/kprobe or tracepoint */
11311
return -EINVAL;
11312
11313
if (((is_kprobe || is_uprobe) && prog->type != BPF_PROG_TYPE_KPROBE) ||
11314
(is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
11315
(is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT))
11316
return -EINVAL;
11317
11318
if (prog->type == BPF_PROG_TYPE_KPROBE && prog->sleepable && !is_uprobe)
11319
/* only uprobe programs are allowed to be sleepable */
11320
return -EINVAL;
11321
11322
/* Kprobe override only works for kprobes, not uprobes. */
11323
if (prog->kprobe_override && !is_kprobe)
11324
return -EINVAL;
11325
11326
/* Writing to context allowed only for uprobes. */
11327
if (prog->aux->kprobe_write_ctx && !is_uprobe)
11328
return -EINVAL;
11329
11330
if (is_tracepoint || is_syscall_tp) {
11331
int off = trace_event_get_offsets(event->tp_event);
11332
11333
if (prog->aux->max_ctx_offset > off)
11334
return -EACCES;
11335
}
11336
11337
return perf_event_attach_bpf_prog(event, prog, bpf_cookie);
11338
}
11339
11340
int perf_event_set_bpf_prog(struct perf_event *event,
11341
struct bpf_prog *prog,
11342
u64 bpf_cookie)
11343
{
11344
struct perf_event_context *ctx;
11345
int ret;
11346
11347
ctx = perf_event_ctx_lock(event);
11348
ret = __perf_event_set_bpf_prog(event, prog, bpf_cookie);
11349
perf_event_ctx_unlock(event, ctx);
11350
11351
return ret;
11352
}
11353
11354
void perf_event_free_bpf_prog(struct perf_event *event)
11355
{
11356
if (!event->prog)
11357
return;
11358
11359
if (!perf_event_is_tracing(event)) {
11360
perf_event_free_bpf_handler(event);
11361
return;
11362
}
11363
perf_event_detach_bpf_prog(event);
11364
}
11365
11366
#else
11367
11368
static inline void perf_tp_register(void)
11369
{
11370
}
11371
11372
static void perf_event_free_filter(struct perf_event *event)
11373
{
11374
}
11375
11376
static int __perf_event_set_bpf_prog(struct perf_event *event,
11377
struct bpf_prog *prog,
11378
u64 bpf_cookie)
11379
{
11380
return -ENOENT;
11381
}
11382
11383
int perf_event_set_bpf_prog(struct perf_event *event,
11384
struct bpf_prog *prog,
11385
u64 bpf_cookie)
11386
{
11387
return -ENOENT;
11388
}
11389
11390
void perf_event_free_bpf_prog(struct perf_event *event)
11391
{
11392
}
11393
#endif /* CONFIG_EVENT_TRACING */
11394
11395
#ifdef CONFIG_HAVE_HW_BREAKPOINT
11396
void perf_bp_event(struct perf_event *bp, void *data)
11397
{
11398
struct perf_sample_data sample;
11399
struct pt_regs *regs = data;
11400
11401
perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
11402
11403
if (!bp->hw.state && !perf_exclude_event(bp, regs))
11404
perf_swevent_event(bp, 1, &sample, regs);
11405
}
11406
#endif
11407
11408
/*
11409
* Allocate a new address filter
11410
*/
11411
static struct perf_addr_filter *
11412
perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
11413
{
11414
int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
11415
struct perf_addr_filter *filter;
11416
11417
filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
11418
if (!filter)
11419
return NULL;
11420
11421
INIT_LIST_HEAD(&filter->entry);
11422
list_add_tail(&filter->entry, filters);
11423
11424
return filter;
11425
}
11426
11427
static void free_filters_list(struct list_head *filters)
11428
{
11429
struct perf_addr_filter *filter, *iter;
11430
11431
list_for_each_entry_safe(filter, iter, filters, entry) {
11432
path_put(&filter->path);
11433
list_del(&filter->entry);
11434
kfree(filter);
11435
}
11436
}
11437
11438
/*
11439
* Free existing address filters and optionally install new ones
11440
*/
11441
static void perf_addr_filters_splice(struct perf_event *event,
11442
struct list_head *head)
11443
{
11444
unsigned long flags;
11445
LIST_HEAD(list);
11446
11447
if (!has_addr_filter(event))
11448
return;
11449
11450
/* don't bother with children, they don't have their own filters */
11451
if (event->parent)
11452
return;
11453
11454
raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
11455
11456
list_splice_init(&event->addr_filters.list, &list);
11457
if (head)
11458
list_splice(head, &event->addr_filters.list);
11459
11460
raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
11461
11462
free_filters_list(&list);
11463
}
11464
11465
static void perf_free_addr_filters(struct perf_event *event)
11466
{
11467
/*
11468
* Used during free paths, there is no concurrency.
11469
*/
11470
if (list_empty(&event->addr_filters.list))
11471
return;
11472
11473
perf_addr_filters_splice(event, NULL);
11474
}
11475
11476
/*
11477
* Scan through mm's vmas and see if one of them matches the
11478
* @filter; if so, adjust filter's address range.
11479
* Called with mm::mmap_lock down for reading.
11480
*/
11481
static void perf_addr_filter_apply(struct perf_addr_filter *filter,
11482
struct mm_struct *mm,
11483
struct perf_addr_filter_range *fr)
11484
{
11485
struct vm_area_struct *vma;
11486
VMA_ITERATOR(vmi, mm, 0);
11487
11488
for_each_vma(vmi, vma) {
11489
if (!vma->vm_file)
11490
continue;
11491
11492
if (perf_addr_filter_vma_adjust(filter, vma, fr))
11493
return;
11494
}
11495
}
11496
11497
/*
11498
* Update event's address range filters based on the
11499
* task's existing mappings, if any.
11500
*/
11501
static void perf_event_addr_filters_apply(struct perf_event *event)
11502
{
11503
struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
11504
struct task_struct *task = READ_ONCE(event->ctx->task);
11505
struct perf_addr_filter *filter;
11506
struct mm_struct *mm = NULL;
11507
unsigned int count = 0;
11508
unsigned long flags;
11509
11510
/*
11511
* We may observe TASK_TOMBSTONE, which means that the event tear-down
11512
* will stop on the parent's child_mutex that our caller is also holding
11513
*/
11514
if (task == TASK_TOMBSTONE)
11515
return;
11516
11517
if (ifh->nr_file_filters) {
11518
mm = get_task_mm(task);
11519
if (!mm)
11520
goto restart;
11521
11522
mmap_read_lock(mm);
11523
}
11524
11525
raw_spin_lock_irqsave(&ifh->lock, flags);
11526
list_for_each_entry(filter, &ifh->list, entry) {
11527
if (filter->path.dentry) {
11528
/*
11529
* Adjust base offset if the filter is associated to a
11530
* binary that needs to be mapped:
11531
*/
11532
event->addr_filter_ranges[count].start = 0;
11533
event->addr_filter_ranges[count].size = 0;
11534
11535
perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);
11536
} else {
11537
event->addr_filter_ranges[count].start = filter->offset;
11538
event->addr_filter_ranges[count].size = filter->size;
11539
}
11540
11541
count++;
11542
}
11543
11544
event->addr_filters_gen++;
11545
raw_spin_unlock_irqrestore(&ifh->lock, flags);
11546
11547
if (ifh->nr_file_filters) {
11548
mmap_read_unlock(mm);
11549
11550
mmput(mm);
11551
}
11552
11553
restart:
11554
perf_event_stop(event, 1);
11555
}
11556
11557
/*
11558
* Address range filtering: limiting the data to certain
11559
* instruction address ranges. Filters are ioctl()ed to us from
11560
* userspace as ascii strings.
11561
*
11562
* Filter string format:
11563
*
11564
* ACTION RANGE_SPEC
11565
* where ACTION is one of the
11566
* * "filter": limit the trace to this region
11567
* * "start": start tracing from this address
11568
* * "stop": stop tracing at this address/region;
11569
* RANGE_SPEC is
11570
* * for kernel addresses: <start address>[/<size>]
11571
* * for object files: <start address>[/<size>]@</path/to/object/file>
11572
*
11573
* if <size> is not specified or is zero, the range is treated as a single
11574
* address; not valid for ACTION=="filter".
11575
*/
11576
enum {
11577
IF_ACT_NONE = -1,
11578
IF_ACT_FILTER,
11579
IF_ACT_START,
11580
IF_ACT_STOP,
11581
IF_SRC_FILE,
11582
IF_SRC_KERNEL,
11583
IF_SRC_FILEADDR,
11584
IF_SRC_KERNELADDR,
11585
};
11586
11587
enum {
11588
IF_STATE_ACTION = 0,
11589
IF_STATE_SOURCE,
11590
IF_STATE_END,
11591
};
11592
11593
static const match_table_t if_tokens = {
11594
{ IF_ACT_FILTER, "filter" },
11595
{ IF_ACT_START, "start" },
11596
{ IF_ACT_STOP, "stop" },
11597
{ IF_SRC_FILE, "%u/%u@%s" },
11598
{ IF_SRC_KERNEL, "%u/%u" },
11599
{ IF_SRC_FILEADDR, "%u@%s" },
11600
{ IF_SRC_KERNELADDR, "%u" },
11601
{ IF_ACT_NONE, NULL },
11602
};
11603
11604
/*
11605
* Address filter string parser
11606
*/
11607
static int
11608
perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
11609
struct list_head *filters)
11610
{
11611
struct perf_addr_filter *filter = NULL;
11612
char *start, *orig, *filename = NULL;
11613
substring_t args[MAX_OPT_ARGS];
11614
int state = IF_STATE_ACTION, token;
11615
unsigned int kernel = 0;
11616
int ret = -EINVAL;
11617
11618
orig = fstr = kstrdup(fstr, GFP_KERNEL);
11619
if (!fstr)
11620
return -ENOMEM;
11621
11622
while ((start = strsep(&fstr, " ,\n")) != NULL) {
11623
static const enum perf_addr_filter_action_t actions[] = {
11624
[IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER,
11625
[IF_ACT_START] = PERF_ADDR_FILTER_ACTION_START,
11626
[IF_ACT_STOP] = PERF_ADDR_FILTER_ACTION_STOP,
11627
};
11628
ret = -EINVAL;
11629
11630
if (!*start)
11631
continue;
11632
11633
/* filter definition begins */
11634
if (state == IF_STATE_ACTION) {
11635
filter = perf_addr_filter_new(event, filters);
11636
if (!filter)
11637
goto fail;
11638
}
11639
11640
token = match_token(start, if_tokens, args);
11641
switch (token) {
11642
case IF_ACT_FILTER:
11643
case IF_ACT_START:
11644
case IF_ACT_STOP:
11645
if (state != IF_STATE_ACTION)
11646
goto fail;
11647
11648
filter->action = actions[token];
11649
state = IF_STATE_SOURCE;
11650
break;
11651
11652
case IF_SRC_KERNELADDR:
11653
case IF_SRC_KERNEL:
11654
kernel = 1;
11655
fallthrough;
11656
11657
case IF_SRC_FILEADDR:
11658
case IF_SRC_FILE:
11659
if (state != IF_STATE_SOURCE)
11660
goto fail;
11661
11662
*args[0].to = 0;
11663
ret = kstrtoul(args[0].from, 0, &filter->offset);
11664
if (ret)
11665
goto fail;
11666
11667
if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) {
11668
*args[1].to = 0;
11669
ret = kstrtoul(args[1].from, 0, &filter->size);
11670
if (ret)
11671
goto fail;
11672
}
11673
11674
if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
11675
int fpos = token == IF_SRC_FILE ? 2 : 1;
11676
11677
kfree(filename);
11678
filename = match_strdup(&args[fpos]);
11679
if (!filename) {
11680
ret = -ENOMEM;
11681
goto fail;
11682
}
11683
}
11684
11685
state = IF_STATE_END;
11686
break;
11687
11688
default:
11689
goto fail;
11690
}
11691
11692
/*
11693
* Filter definition is fully parsed, validate and install it.
11694
* Make sure that it doesn't contradict itself or the event's
11695
* attribute.
11696
*/
11697
if (state == IF_STATE_END) {
11698
ret = -EINVAL;
11699
11700
/*
11701
* ACTION "filter" must have a non-zero length region
11702
* specified.
11703
*/
11704
if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER &&
11705
!filter->size)
11706
goto fail;
11707
11708
if (!kernel) {
11709
if (!filename)
11710
goto fail;
11711
11712
/*
11713
* For now, we only support file-based filters
11714
* in per-task events; doing so for CPU-wide
11715
* events requires additional context switching
11716
* trickery, since same object code will be
11717
* mapped at different virtual addresses in
11718
* different processes.
11719
*/
11720
ret = -EOPNOTSUPP;
11721
if (!event->ctx->task)
11722
goto fail;
11723
11724
/* look up the path and grab its inode */
11725
ret = kern_path(filename, LOOKUP_FOLLOW,
11726
&filter->path);
11727
if (ret)
11728
goto fail;
11729
11730
ret = -EINVAL;
11731
if (!filter->path.dentry ||
11732
!S_ISREG(d_inode(filter->path.dentry)
11733
->i_mode))
11734
goto fail;
11735
11736
event->addr_filters.nr_file_filters++;
11737
}
11738
11739
/* ready to consume more filters */
11740
kfree(filename);
11741
filename = NULL;
11742
state = IF_STATE_ACTION;
11743
filter = NULL;
11744
kernel = 0;
11745
}
11746
}
11747
11748
if (state != IF_STATE_ACTION)
11749
goto fail;
11750
11751
kfree(filename);
11752
kfree(orig);
11753
11754
return 0;
11755
11756
fail:
11757
kfree(filename);
11758
free_filters_list(filters);
11759
kfree(orig);
11760
11761
return ret;
11762
}
11763
11764
static int
11765
perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
11766
{
11767
LIST_HEAD(filters);
11768
int ret;
11769
11770
/*
11771
* Since this is called in perf_ioctl() path, we're already holding
11772
* ctx::mutex.
11773
*/
11774
lockdep_assert_held(&event->ctx->mutex);
11775
11776
if (WARN_ON_ONCE(event->parent))
11777
return -EINVAL;
11778
11779
ret = perf_event_parse_addr_filter(event, filter_str, &filters);
11780
if (ret)
11781
goto fail_clear_files;
11782
11783
ret = event->pmu->addr_filters_validate(&filters);
11784
if (ret)
11785
goto fail_free_filters;
11786
11787
/* remove existing filters, if any */
11788
perf_addr_filters_splice(event, &filters);
11789
11790
/* install new filters */
11791
perf_event_for_each_child(event, perf_event_addr_filters_apply);
11792
11793
return ret;
11794
11795
fail_free_filters:
11796
free_filters_list(&filters);
11797
11798
fail_clear_files:
11799
event->addr_filters.nr_file_filters = 0;
11800
11801
return ret;
11802
}
11803
11804
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
11805
{
11806
int ret = -EINVAL;
11807
char *filter_str;
11808
11809
filter_str = strndup_user(arg, PAGE_SIZE);
11810
if (IS_ERR(filter_str))
11811
return PTR_ERR(filter_str);
11812
11813
#ifdef CONFIG_EVENT_TRACING
11814
if (perf_event_is_tracing(event)) {
11815
struct perf_event_context *ctx = event->ctx;
11816
11817
/*
11818
* Beware, here be dragons!!
11819
*
11820
* the tracepoint muck will deadlock against ctx->mutex, but
11821
* the tracepoint stuff does not actually need it. So
11822
* temporarily drop ctx->mutex. As per perf_event_ctx_lock() we
11823
* already have a reference on ctx.
11824
*
11825
* This can result in event getting moved to a different ctx,
11826
* but that does not affect the tracepoint state.
11827
*/
11828
mutex_unlock(&ctx->mutex);
11829
ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
11830
mutex_lock(&ctx->mutex);
11831
} else
11832
#endif
11833
if (has_addr_filter(event))
11834
ret = perf_event_set_addr_filter(event, filter_str);
11835
11836
kfree(filter_str);
11837
return ret;
11838
}
11839
11840
/*
11841
* hrtimer based swevent callback
11842
*/
11843
11844
static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
11845
{
11846
enum hrtimer_restart ret = HRTIMER_RESTART;
11847
struct perf_sample_data data;
11848
struct pt_regs *regs;
11849
struct perf_event *event;
11850
u64 period;
11851
11852
event = container_of(hrtimer, struct perf_event, hw.hrtimer);
11853
11854
if (event->state != PERF_EVENT_STATE_ACTIVE ||
11855
event->hw.state & PERF_HES_STOPPED)
11856
return HRTIMER_NORESTART;
11857
11858
event->pmu->read(event);
11859
11860
perf_sample_data_init(&data, 0, event->hw.last_period);
11861
regs = get_irq_regs();
11862
11863
if (regs && !perf_exclude_event(event, regs)) {
11864
if (!(event->attr.exclude_idle && is_idle_task(current)))
11865
if (__perf_event_overflow(event, 1, &data, regs))
11866
ret = HRTIMER_NORESTART;
11867
}
11868
11869
period = max_t(u64, 10000, event->hw.sample_period);
11870
hrtimer_forward_now(hrtimer, ns_to_ktime(period));
11871
11872
return ret;
11873
}
11874
11875
static void perf_swevent_start_hrtimer(struct perf_event *event)
11876
{
11877
struct hw_perf_event *hwc = &event->hw;
11878
s64 period;
11879
11880
if (!is_sampling_event(event))
11881
return;
11882
11883
period = local64_read(&hwc->period_left);
11884
if (period) {
11885
if (period < 0)
11886
period = 10000;
11887
11888
local64_set(&hwc->period_left, 0);
11889
} else {
11890
period = max_t(u64, 10000, hwc->sample_period);
11891
}
11892
hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
11893
HRTIMER_MODE_REL_PINNED_HARD);
11894
}
11895
11896
static void perf_swevent_cancel_hrtimer(struct perf_event *event)
11897
{
11898
struct hw_perf_event *hwc = &event->hw;
11899
11900
/*
11901
* Careful: this function can be triggered in the hrtimer handler,
11902
* for cpu-clock events, so hrtimer_cancel() would cause a
11903
* deadlock.
11904
*
11905
* So use hrtimer_try_to_cancel() to try to stop the hrtimer,
11906
* and the cpu-clock handler also sets the PERF_HES_STOPPED flag,
11907
* which guarantees that perf_swevent_hrtimer() will stop the
11908
* hrtimer once it sees the PERF_HES_STOPPED flag.
11909
*/
11910
if (is_sampling_event(event) && (hwc->interrupts != MAX_INTERRUPTS)) {
11911
ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
11912
local64_set(&hwc->period_left, ktime_to_ns(remaining));
11913
11914
hrtimer_try_to_cancel(&hwc->hrtimer);
11915
}
11916
}
11917
11918
static void perf_swevent_destroy_hrtimer(struct perf_event *event)
11919
{
11920
hrtimer_cancel(&event->hw.hrtimer);
11921
}
11922
11923
static void perf_swevent_init_hrtimer(struct perf_event *event)
11924
{
11925
struct hw_perf_event *hwc = &event->hw;
11926
11927
if (!is_sampling_event(event))
11928
return;
11929
11930
hrtimer_setup(&hwc->hrtimer, perf_swevent_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
11931
event->destroy = perf_swevent_destroy_hrtimer;
11932
11933
/*
11934
* Since hrtimers have a fixed rate, we can do a static freq->period
11935
* mapping and avoid the whole period adjust feedback stuff.
11936
*/
11937
if (event->attr.freq) {
11938
long freq = event->attr.sample_freq;
11939
11940
event->attr.sample_period = NSEC_PER_SEC / freq;
11941
hwc->sample_period = event->attr.sample_period;
11942
local64_set(&hwc->period_left, hwc->sample_period);
11943
hwc->last_period = hwc->sample_period;
11944
event->attr.freq = 0;
11945
}
11946
}
11947
11948
/*
11949
* Software event: cpu wall time clock
11950
*/
11951
11952
static void cpu_clock_event_update(struct perf_event *event)
11953
{
11954
s64 prev;
11955
u64 now;
11956
11957
now = local_clock();
11958
prev = local64_xchg(&event->hw.prev_count, now);
11959
local64_add(now - prev, &event->count);
11960
}
11961
11962
static void cpu_clock_event_start(struct perf_event *event, int flags)
11963
{
11964
event->hw.state = 0;
11965
local64_set(&event->hw.prev_count, local_clock());
11966
perf_swevent_start_hrtimer(event);
11967
}
11968
11969
static void cpu_clock_event_stop(struct perf_event *event, int flags)
11970
{
11971
event->hw.state = PERF_HES_STOPPED;
11972
perf_swevent_cancel_hrtimer(event);
11973
if (flags & PERF_EF_UPDATE)
11974
cpu_clock_event_update(event);
11975
}
11976
11977
static int cpu_clock_event_add(struct perf_event *event, int flags)
11978
{
11979
if (flags & PERF_EF_START)
11980
cpu_clock_event_start(event, flags);
11981
perf_event_update_userpage(event);
11982
11983
return 0;
11984
}
11985
11986
static void cpu_clock_event_del(struct perf_event *event, int flags)
11987
{
11988
cpu_clock_event_stop(event, PERF_EF_UPDATE);
11989
}
11990
11991
static void cpu_clock_event_read(struct perf_event *event)
11992
{
11993
cpu_clock_event_update(event);
11994
}
11995
11996
static int cpu_clock_event_init(struct perf_event *event)
11997
{
11998
if (event->attr.type != perf_cpu_clock.type)
11999
return -ENOENT;
12000
12001
if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
12002
return -ENOENT;
12003
12004
/*
12005
* no branch sampling for software events
12006
*/
12007
if (has_branch_stack(event))
12008
return -EOPNOTSUPP;
12009
12010
perf_swevent_init_hrtimer(event);
12011
12012
return 0;
12013
}
12014
12015
static struct pmu perf_cpu_clock = {
12016
.task_ctx_nr = perf_sw_context,
12017
12018
.capabilities = PERF_PMU_CAP_NO_NMI,
12019
.dev = PMU_NULL_DEV,
12020
12021
.event_init = cpu_clock_event_init,
12022
.add = cpu_clock_event_add,
12023
.del = cpu_clock_event_del,
12024
.start = cpu_clock_event_start,
12025
.stop = cpu_clock_event_stop,
12026
.read = cpu_clock_event_read,
12027
};
12028
12029
/*
12030
* Software event: task time clock
12031
*/
12032
12033
static void task_clock_event_update(struct perf_event *event, u64 now)
12034
{
12035
u64 prev;
12036
s64 delta;
12037
12038
prev = local64_xchg(&event->hw.prev_count, now);
12039
delta = now - prev;
12040
local64_add(delta, &event->count);
12041
}
12042
12043
static void task_clock_event_start(struct perf_event *event, int flags)
12044
{
12045
event->hw.state = 0;
12046
local64_set(&event->hw.prev_count, event->ctx->time);
12047
perf_swevent_start_hrtimer(event);
12048
}
12049
12050
static void task_clock_event_stop(struct perf_event *event, int flags)
12051
{
12052
event->hw.state = PERF_HES_STOPPED;
12053
perf_swevent_cancel_hrtimer(event);
12054
if (flags & PERF_EF_UPDATE)
12055
task_clock_event_update(event, event->ctx->time);
12056
}
12057
12058
static int task_clock_event_add(struct perf_event *event, int flags)
12059
{
12060
if (flags & PERF_EF_START)
12061
task_clock_event_start(event, flags);
12062
perf_event_update_userpage(event);
12063
12064
return 0;
12065
}
12066
12067
static void task_clock_event_del(struct perf_event *event, int flags)
12068
{
12069
task_clock_event_stop(event, PERF_EF_UPDATE);
12070
}
12071
12072
static void task_clock_event_read(struct perf_event *event)
12073
{
12074
u64 now = perf_clock();
12075
u64 delta = now - event->ctx->timestamp;
12076
u64 time = event->ctx->time + delta;
12077
12078
task_clock_event_update(event, time);
12079
}
12080
12081
static int task_clock_event_init(struct perf_event *event)
12082
{
12083
if (event->attr.type != perf_task_clock.type)
12084
return -ENOENT;
12085
12086
if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
12087
return -ENOENT;
12088
12089
/*
12090
* no branch sampling for software events
12091
*/
12092
if (has_branch_stack(event))
12093
return -EOPNOTSUPP;
12094
12095
perf_swevent_init_hrtimer(event);
12096
12097
return 0;
12098
}
12099
12100
static struct pmu perf_task_clock = {
12101
.task_ctx_nr = perf_sw_context,
12102
12103
.capabilities = PERF_PMU_CAP_NO_NMI,
12104
.dev = PMU_NULL_DEV,
12105
12106
.event_init = task_clock_event_init,
12107
.add = task_clock_event_add,
12108
.del = task_clock_event_del,
12109
.start = task_clock_event_start,
12110
.stop = task_clock_event_stop,
12111
.read = task_clock_event_read,
12112
};
12113
12114
static void perf_pmu_nop_void(struct pmu *pmu)
12115
{
12116
}
12117
12118
static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
12119
{
12120
}
12121
12122
static int perf_pmu_nop_int(struct pmu *pmu)
12123
{
12124
return 0;
12125
}
12126
12127
static int perf_event_nop_int(struct perf_event *event, u64 value)
12128
{
12129
return 0;
12130
}
12131
12132
static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
12133
12134
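/*
 * Note (added): these are the default transaction helpers, installed by
 * perf_pmu_register() when a PMU provides pmu_enable()/pmu_disable() but
 * no transaction callbacks of its own. A TXN_ADD transaction simply
 * brackets the add()s with pmu_disable()/pmu_enable() so the hardware is
 * reprogrammed once; other transaction types (e.g. TXN_READ) pass
 * straight through.
 */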
static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
12135
{
12136
__this_cpu_write(nop_txn_flags, flags);
12137
12138
if (flags & ~PERF_PMU_TXN_ADD)
12139
return;
12140
12141
perf_pmu_disable(pmu);
12142
}
12143
12144
static int perf_pmu_commit_txn(struct pmu *pmu)
12145
{
12146
unsigned int flags = __this_cpu_read(nop_txn_flags);
12147
12148
__this_cpu_write(nop_txn_flags, 0);
12149
12150
if (flags & ~PERF_PMU_TXN_ADD)
12151
return 0;
12152
12153
perf_pmu_enable(pmu);
12154
return 0;
12155
}
12156
12157
static void perf_pmu_cancel_txn(struct pmu *pmu)
12158
{
12159
unsigned int flags = __this_cpu_read(nop_txn_flags);
12160
12161
__this_cpu_write(nop_txn_flags, 0);
12162
12163
if (flags & ~PERF_PMU_TXN_ADD)
12164
return;
12165
12166
perf_pmu_enable(pmu);
12167
}
12168
12169
static int perf_event_idx_default(struct perf_event *event)
12170
{
12171
return 0;
12172
}
12173
12174
/*
12175
* Let userspace know that this PMU supports address range filtering:
12176
*/
12177
static ssize_t nr_addr_filters_show(struct device *dev,
12178
struct device_attribute *attr,
12179
char *page)
12180
{
12181
struct pmu *pmu = dev_get_drvdata(dev);
12182
12183
return sysfs_emit(page, "%d\n", pmu->nr_addr_filters);
12184
}
12185
DEVICE_ATTR_RO(nr_addr_filters);
12186
12187
static struct idr pmu_idr;
12188
12189
static ssize_t
12190
type_show(struct device *dev, struct device_attribute *attr, char *page)
12191
{
12192
struct pmu *pmu = dev_get_drvdata(dev);
12193
12194
return sysfs_emit(page, "%d\n", pmu->type);
12195
}
12196
static DEVICE_ATTR_RO(type);
12197
12198
static ssize_t
12199
perf_event_mux_interval_ms_show(struct device *dev,
12200
struct device_attribute *attr,
12201
char *page)
12202
{
12203
struct pmu *pmu = dev_get_drvdata(dev);
12204
12205
return sysfs_emit(page, "%d\n", pmu->hrtimer_interval_ms);
12206
}
12207
12208
static DEFINE_MUTEX(mux_interval_mutex);
12209
12210
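/*
 * Illustrative usage (assuming an x86 "cpu" core PMU):
 *   echo 2 > /sys/bus/event_source/devices/cpu/perf_event_mux_interval_ms
 * shortens the multiplexing interval of that PMU to 2ms on all online
 * CPUs via the store handler below.
 */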
static ssize_t
12211
perf_event_mux_interval_ms_store(struct device *dev,
12212
struct device_attribute *attr,
12213
const char *buf, size_t count)
12214
{
12215
struct pmu *pmu = dev_get_drvdata(dev);
12216
int timer, cpu, ret;
12217
12218
ret = kstrtoint(buf, 0, &timer);
12219
if (ret)
12220
return ret;
12221
12222
if (timer < 1)
12223
return -EINVAL;
12224
12225
/* same value, nothing to do */
12226
if (timer == pmu->hrtimer_interval_ms)
12227
return count;
12228
12229
mutex_lock(&mux_interval_mutex);
12230
pmu->hrtimer_interval_ms = timer;
12231
12232
/* update all cpuctx for this PMU */
12233
cpus_read_lock();
12234
for_each_online_cpu(cpu) {
12235
struct perf_cpu_pmu_context *cpc;
12236
cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu);
12237
cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
12238
12239
cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc);
12240
}
12241
cpus_read_unlock();
12242
mutex_unlock(&mux_interval_mutex);
12243
12244
return count;
12245
}
12246
static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
12247
12248
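/*
 * Note (added): map a PMU scope to the topology cpumask that @cpu
 * belongs to; e.g. PERF_PMU_SCOPE_CORE yields the SMT sibling mask of
 * @cpu, while PERF_PMU_SCOPE_SYS_WIDE covers all online CPUs.
 */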
static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu)
12249
{
12250
switch (scope) {
12251
case PERF_PMU_SCOPE_CORE:
12252
return topology_sibling_cpumask(cpu);
12253
case PERF_PMU_SCOPE_DIE:
12254
return topology_die_cpumask(cpu);
12255
case PERF_PMU_SCOPE_CLUSTER:
12256
return topology_cluster_cpumask(cpu);
12257
case PERF_PMU_SCOPE_PKG:
12258
return topology_core_cpumask(cpu);
12259
case PERF_PMU_SCOPE_SYS_WIDE:
12260
return cpu_online_mask;
12261
}
12262
12263
return NULL;
12264
}
12265
12266
static inline struct cpumask *perf_scope_cpumask(unsigned int scope)
12267
{
12268
switch (scope) {
12269
case PERF_PMU_SCOPE_CORE:
12270
return perf_online_core_mask;
12271
case PERF_PMU_SCOPE_DIE:
12272
return perf_online_die_mask;
12273
case PERF_PMU_SCOPE_CLUSTER:
12274
return perf_online_cluster_mask;
12275
case PERF_PMU_SCOPE_PKG:
12276
return perf_online_pkg_mask;
12277
case PERF_PMU_SCOPE_SYS_WIDE:
12278
return perf_online_sys_mask;
12279
}
12280
12281
return NULL;
12282
}
12283
12284
static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
12285
char *buf)
12286
{
12287
struct pmu *pmu = dev_get_drvdata(dev);
12288
struct cpumask *mask = perf_scope_cpumask(pmu->scope);
12289
12290
if (mask)
12291
return cpumap_print_to_pagebuf(true, buf, mask);
12292
return 0;
12293
}
12294
12295
static DEVICE_ATTR_RO(cpumask);
12296
12297
static struct attribute *pmu_dev_attrs[] = {
	&dev_attr_type.attr,
	&dev_attr_perf_event_mux_interval_ms.attr,
	&dev_attr_nr_addr_filters.attr,
	&dev_attr_cpumask.attr,
	NULL,
};
12304
12305
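/*
 * Note (added): attribute visibility follows the pmu_dev_attrs[] order
 * above: index 2 (nr_addr_filters) is shown only when the PMU has
 * address filters, index 3 (cpumask) only when the PMU declares a scope.
 */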
static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int n)
12306
{
12307
struct device *dev = kobj_to_dev(kobj);
12308
struct pmu *pmu = dev_get_drvdata(dev);
12309
12310
if (n == 2 && !pmu->nr_addr_filters)
12311
return 0;
12312
12313
/* cpumask */
12314
if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE)
12315
return 0;
12316
12317
return a->mode;
12318
}
12319
12320
static struct attribute_group pmu_dev_attr_group = {
12321
.is_visible = pmu_dev_is_visible,
12322
.attrs = pmu_dev_attrs,
12323
};
12324
12325
static const struct attribute_group *pmu_dev_groups[] = {
12326
&pmu_dev_attr_group,
12327
NULL,
12328
};
12329
12330
static int pmu_bus_running;
static const struct bus_type pmu_bus = {
	.name		= "event_source",
	.dev_groups	= pmu_dev_groups,
};
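/*
 * Note (added): PMUs registered while this bus is up show up under
 * /sys/bus/event_source/devices/<name>/, which is where the attributes
 * above (type, perf_event_mux_interval_ms, nr_addr_filters, cpumask)
 * are exposed.
 */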
12335
12336
static void pmu_dev_release(struct device *dev)
12337
{
12338
kfree(dev);
12339
}
12340
12341
static int pmu_dev_alloc(struct pmu *pmu)
12342
{
12343
int ret = -ENOMEM;
12344
12345
pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
12346
if (!pmu->dev)
12347
goto out;
12348
12349
pmu->dev->groups = pmu->attr_groups;
12350
device_initialize(pmu->dev);
12351
12352
dev_set_drvdata(pmu->dev, pmu);
12353
pmu->dev->bus = &pmu_bus;
12354
pmu->dev->parent = pmu->parent;
12355
pmu->dev->release = pmu_dev_release;
12356
12357
ret = dev_set_name(pmu->dev, "%s", pmu->name);
12358
if (ret)
12359
goto free_dev;
12360
12361
ret = device_add(pmu->dev);
12362
if (ret)
12363
goto free_dev;
12364
12365
if (pmu->attr_update) {
12366
ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
12367
if (ret)
12368
goto del_dev;
12369
}
12370
12371
out:
12372
return ret;
12373
12374
del_dev:
12375
device_del(pmu->dev);
12376
12377
free_dev:
12378
put_device(pmu->dev);
12379
pmu->dev = NULL;
12380
goto out;
12381
}
12382
12383
static struct lock_class_key cpuctx_mutex;
12384
static struct lock_class_key cpuctx_lock;
12385
12386
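/*
 * Note (added): replace @old with @new at @id only if the slot still
 * holds @old, mirroring a cmpxchg() on the pmu_idr entry. Both callers
 * hold pmus_lock, which serializes the lookup and the replace; this is
 * how a PMU is published to / unpublished from perf_init_event().
 */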
static bool idr_cmpxchg(struct idr *idr, unsigned long id, void *old, void *new)
12387
{
12388
void *tmp, *val = idr_find(idr, id);
12389
12390
if (val != old)
12391
return false;
12392
12393
tmp = idr_replace(idr, new, id);
12394
if (IS_ERR(tmp))
12395
return false;
12396
12397
WARN_ON_ONCE(tmp != val);
12398
return true;
12399
}
12400
12401
static void perf_pmu_free(struct pmu *pmu)
12402
{
12403
if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) {
12404
if (pmu->nr_addr_filters)
12405
device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
12406
device_del(pmu->dev);
12407
put_device(pmu->dev);
12408
}
12409
12410
if (pmu->cpu_pmu_context) {
12411
int cpu;
12412
12413
for_each_possible_cpu(cpu) {
12414
struct perf_cpu_pmu_context *cpc;
12415
12416
cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu);
12417
if (!cpc)
12418
continue;
12419
if (cpc->epc.embedded) {
12420
/* refcount managed */
12421
put_pmu_ctx(&cpc->epc);
12422
continue;
12423
}
12424
kfree(cpc);
12425
}
12426
free_percpu(pmu->cpu_pmu_context);
12427
}
12428
}
12429
12430
DEFINE_FREE(pmu_unregister, struct pmu *, if (_T) perf_pmu_free(_T))
12431
12432
int perf_pmu_register(struct pmu *_pmu, const char *name, int type)
12433
{
12434
int cpu, max = PERF_TYPE_MAX;
12435
12436
struct pmu *pmu __free(pmu_unregister) = _pmu;
12437
guard(mutex)(&pmus_lock);
12438
12439
if (WARN_ONCE(!name, "Can not register anonymous pmu.\n"))
12440
return -EINVAL;
12441
12442
if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE,
12443
"Can not register a pmu with an invalid scope.\n"))
12444
return -EINVAL;
12445
12446
pmu->name = name;
12447
12448
if (type >= 0)
12449
max = type;
12450
12451
CLASS(idr_alloc, pmu_type)(&pmu_idr, NULL, max, 0, GFP_KERNEL);
12452
if (pmu_type.id < 0)
12453
return pmu_type.id;
12454
12455
WARN_ON(type >= 0 && pmu_type.id != type);
12456
12457
pmu->type = pmu_type.id;
12458
atomic_set(&pmu->exclusive_cnt, 0);
12459
12460
if (pmu_bus_running && !pmu->dev) {
12461
int ret = pmu_dev_alloc(pmu);
12462
if (ret)
12463
return ret;
12464
}
12465
12466
pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context *);
12467
if (!pmu->cpu_pmu_context)
12468
return -ENOMEM;
12469
12470
for_each_possible_cpu(cpu) {
12471
struct perf_cpu_pmu_context *cpc =
12472
kmalloc_node(sizeof(struct perf_cpu_pmu_context),
12473
GFP_KERNEL | __GFP_ZERO,
12474
cpu_to_node(cpu));
12475
12476
if (!cpc)
12477
return -ENOMEM;
12478
12479
*per_cpu_ptr(pmu->cpu_pmu_context, cpu) = cpc;
12480
__perf_init_event_pmu_context(&cpc->epc, pmu);
12481
__perf_mux_hrtimer_init(cpc, cpu);
12482
}
12483
12484
if (!pmu->start_txn) {
12485
if (pmu->pmu_enable) {
12486
/*
12487
* If we have pmu_enable/pmu_disable calls, install
12488
* transaction stubs that use them to try to batch
12489
* hardware accesses.
12490
*/
12491
pmu->start_txn = perf_pmu_start_txn;
12492
pmu->commit_txn = perf_pmu_commit_txn;
12493
pmu->cancel_txn = perf_pmu_cancel_txn;
12494
} else {
12495
pmu->start_txn = perf_pmu_nop_txn;
12496
pmu->commit_txn = perf_pmu_nop_int;
12497
pmu->cancel_txn = perf_pmu_nop_void;
12498
}
12499
}
12500
12501
if (!pmu->pmu_enable) {
12502
pmu->pmu_enable = perf_pmu_nop_void;
12503
pmu->pmu_disable = perf_pmu_nop_void;
12504
}
12505
12506
if (!pmu->check_period)
12507
pmu->check_period = perf_event_nop_int;
12508
12509
if (!pmu->event_idx)
12510
pmu->event_idx = perf_event_idx_default;
12511
12512
INIT_LIST_HEAD(&pmu->events);
12513
spin_lock_init(&pmu->events_lock);
12514
12515
/*
12516
* Now that the PMU is complete, make it visible to perf_try_init_event().
12517
*/
12518
if (!idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu))
12519
return -EINVAL;
12520
list_add_rcu(&pmu->entry, &pmus);
12521
12522
take_idr_id(pmu_type);
12523
_pmu = no_free_ptr(pmu); // let it rip
12524
return 0;
12525
}
12526
EXPORT_SYMBOL_GPL(perf_pmu_register);
12527
12528
static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event,
12529
struct perf_event_context *ctx)
12530
{
12531
/*
12532
* De-schedule the event and mark it REVOKED.
12533
*/
12534
perf_event_exit_event(event, ctx, ctx->task, true);
12535
12536
/*
12537
* All _free_event() bits that rely on event->pmu:
12538
*
12539
* Notably, perf_mmap() relies on the ordering here.
12540
*/
12541
scoped_guard (mutex, &event->mmap_mutex) {
12542
WARN_ON_ONCE(pmu->event_unmapped);
12543
/*
12544
* Mostly an empty lock sequence, such that perf_mmap(), which
12545
* relies on mmap_mutex, is sure to observe the state change.
12546
*/
12547
}
12548
12549
perf_event_free_bpf_prog(event);
12550
perf_free_addr_filters(event);
12551
12552
if (event->destroy) {
12553
event->destroy(event);
12554
event->destroy = NULL;
12555
}
12556
12557
if (event->pmu_ctx) {
12558
put_pmu_ctx(event->pmu_ctx);
12559
event->pmu_ctx = NULL;
12560
}
12561
12562
exclusive_event_destroy(event);
12563
module_put(pmu->module);
12564
12565
event->pmu = NULL; /* force fault instead of UAF */
12566
}
12567
12568
static void pmu_detach_event(struct pmu *pmu, struct perf_event *event)
12569
{
12570
struct perf_event_context *ctx;
12571
12572
ctx = perf_event_ctx_lock(event);
12573
__pmu_detach_event(pmu, event, ctx);
12574
perf_event_ctx_unlock(event, ctx);
12575
12576
scoped_guard (spinlock, &pmu->events_lock)
12577
list_del(&event->pmu_list);
12578
}
12579
12580
static struct perf_event *pmu_get_event(struct pmu *pmu)
12581
{
12582
struct perf_event *event;
12583
12584
guard(spinlock)(&pmu->events_lock);
12585
list_for_each_entry(event, &pmu->events, pmu_list) {
12586
if (atomic_long_inc_not_zero(&event->refcount))
12587
return event;
12588
}
12589
12590
return NULL;
12591
}
12592
12593
static bool pmu_empty(struct pmu *pmu)
12594
{
12595
guard(spinlock)(&pmu->events_lock);
12596
return list_empty(&pmu->events);
12597
}
12598
12599
static void pmu_detach_events(struct pmu *pmu)
12600
{
12601
struct perf_event *event;
12602
12603
for (;;) {
12604
event = pmu_get_event(pmu);
12605
if (!event)
12606
break;
12607
12608
pmu_detach_event(pmu, event);
12609
put_event(event);
12610
}
12611
12612
/*
12613
* wait for pending _free_event()s
12614
*/
12615
wait_var_event(pmu, pmu_empty(pmu));
12616
}
12617
12618
int perf_pmu_unregister(struct pmu *pmu)
12619
{
12620
scoped_guard (mutex, &pmus_lock) {
12621
if (!idr_cmpxchg(&pmu_idr, pmu->type, pmu, NULL))
12622
return -EINVAL;
12623
12624
list_del_rcu(&pmu->entry);
12625
}
12626
12627
/*
12628
* We dereference the pmu list under both SRCU and regular RCU, so
12629
* synchronize against both of those.
12630
*
12631
* Notably, the entirety of event creation, from perf_init_event()
12632
* (which will now fail, because of the above) until
12633
* perf_install_in_context() should be under SRCU such that
12634
* this synchronizes against event creation. This avoids trying to
12635
* detach events that are not fully formed.
12636
*/
12637
synchronize_srcu(&pmus_srcu);
12638
synchronize_rcu();
12639
12640
if (pmu->event_unmapped && !pmu_empty(pmu)) {
12641
/*
12642
* Can't force remove events when pmu::event_unmapped()
12643
* is used in perf_mmap_close().
12644
*/
12645
guard(mutex)(&pmus_lock);
12646
idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu);
12647
list_add_rcu(&pmu->entry, &pmus);
12648
return -EBUSY;
12649
}
12650
12651
scoped_guard (mutex, &pmus_lock)
12652
idr_remove(&pmu_idr, pmu->type);
12653
12654
/*
12655
* PMU is removed from the pmus list, so no new events will
12656
* be created, now take care of the existing ones.
12657
*/
12658
pmu_detach_events(pmu);
12659
12660
/*
12661
* PMU is unused, make it go away.
12662
*/
12663
perf_pmu_free(pmu);
12664
return 0;
12665
}
12666
EXPORT_SYMBOL_GPL(perf_pmu_unregister);
12667
12668
static inline bool has_extended_regs(struct perf_event *event)
12669
{
12670
return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
12671
(event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
12672
}
12673
12674
static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
12675
{
12676
struct perf_event_context *ctx = NULL;
12677
int ret;
12678
12679
if (!try_module_get(pmu->module))
12680
return -ENODEV;
12681
12682
/*
12683
* A number of pmu->event_init() methods iterate the sibling_list to,
12684
* for example, validate if the group fits on the PMU. Therefore,
12685
* if this is a sibling event, acquire the ctx->mutex to protect
12686
* the sibling_list.
12687
*/
12688
if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
12689
/*
12690
* This ctx->mutex can nest when we're called through
12691
* inheritance. See the perf_event_ctx_lock_nested() comment.
12692
*/
12693
ctx = perf_event_ctx_lock_nested(event->group_leader,
12694
SINGLE_DEPTH_NESTING);
12695
BUG_ON(!ctx);
12696
}
12697
12698
event->pmu = pmu;
12699
ret = pmu->event_init(event);
12700
12701
if (ctx)
12702
perf_event_ctx_unlock(event->group_leader, ctx);
12703
12704
if (ret)
12705
goto err_pmu;
12706
12707
if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
12708
has_extended_regs(event)) {
12709
ret = -EOPNOTSUPP;
12710
goto err_destroy;
12711
}
12712
12713
if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
12714
event_has_any_exclude_flag(event)) {
12715
ret = -EINVAL;
12716
goto err_destroy;
12717
}
12718
12719
if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) {
12720
const struct cpumask *cpumask;
12721
struct cpumask *pmu_cpumask;
12722
int cpu;
12723
12724
cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu);
12725
pmu_cpumask = perf_scope_cpumask(pmu->scope);
12726
12727
ret = -ENODEV;
12728
if (!pmu_cpumask || !cpumask)
12729
goto err_destroy;
12730
12731
cpu = cpumask_any_and(pmu_cpumask, cpumask);
12732
if (cpu >= nr_cpu_ids)
12733
goto err_destroy;
12734
12735
event->event_caps |= PERF_EV_CAP_READ_SCOPE;
12736
}
12737
12738
return 0;
12739
12740
err_destroy:
12741
if (event->destroy) {
12742
event->destroy(event);
12743
event->destroy = NULL;
12744
}
12745
12746
err_pmu:
12747
event->pmu = NULL;
12748
module_put(pmu->module);
12749
return ret;
12750
}
12751
12752
static struct pmu *perf_init_event(struct perf_event *event)
12753
{
12754
bool extended_type = false;
12755
struct pmu *pmu;
12756
int type, ret;
12757
12758
guard(srcu)(&pmus_srcu); /* pmu idr/list access */
12759
12760
/*
12761
* Save the original type before calling pmu->event_init(), since certain
* pmus overwrite event->attr.type to forward the event to another pmu.
12763
*/
12764
event->orig_type = event->attr.type;
12765
12766
/* Try parent's PMU first: */
12767
if (event->parent && event->parent->pmu) {
12768
pmu = event->parent->pmu;
12769
ret = perf_try_init_event(pmu, event);
12770
if (!ret)
12771
return pmu;
12772
}
12773
12774
/*
12775
* PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
12776
* are often aliases for PERF_TYPE_RAW.
12777
*/
12778
type = event->attr.type;
12779
if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) {
12780
type = event->attr.config >> PERF_PMU_TYPE_SHIFT;
12781
if (!type) {
12782
type = PERF_TYPE_RAW;
12783
} else {
12784
extended_type = true;
12785
event->attr.config &= PERF_HW_EVENT_MASK;
12786
}
12787
}
12788
12789
again:
12790
scoped_guard (rcu)
12791
pmu = idr_find(&pmu_idr, type);
12792
if (pmu) {
12793
if (event->attr.type != type && type != PERF_TYPE_RAW &&
12794
!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE))
12795
return ERR_PTR(-ENOENT);
12796
12797
ret = perf_try_init_event(pmu, event);
12798
if (ret == -ENOENT && event->attr.type != type && !extended_type) {
12799
type = event->attr.type;
12800
goto again;
12801
}
12802
12803
if (ret)
12804
return ERR_PTR(ret);
12805
12806
return pmu;
12807
}
12808
12809
list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
12810
ret = perf_try_init_event(pmu, event);
12811
if (!ret)
12812
return pmu;
12813
12814
if (ret != -ENOENT)
12815
return ERR_PTR(ret);
12816
}
12817
12818
return ERR_PTR(-ENOENT);
12819
}
12820
12821
static void attach_sb_event(struct perf_event *event)
12822
{
12823
struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
12824
12825
raw_spin_lock(&pel->lock);
12826
list_add_rcu(&event->sb_list, &pel->list);
12827
raw_spin_unlock(&pel->lock);
12828
}
12829
12830
/*
12831
* We keep a list of all !task (and therefore per-cpu) events
12832
* that need to receive side-band records.
12833
*
12834
* This avoids having to scan all the various PMU per-cpu contexts
12835
* looking for them.
12836
*/
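/*
 * Note (added): "side-band" here means the non-sample record types
 * (PERF_RECORD_MMAP, PERF_RECORD_COMM, PERF_RECORD_FORK, ...) that such
 * per-cpu events want to receive even though they are not attached to
 * the task that generates them.
 */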
12837
static void account_pmu_sb_event(struct perf_event *event)
12838
{
12839
if (is_sb_event(event))
12840
attach_sb_event(event);
12841
}
12842
12843
/* Freq events need the tick to stay alive (see perf_event_task_tick). */
12844
static void account_freq_event_nohz(void)
12845
{
12846
#ifdef CONFIG_NO_HZ_FULL
12847
/* Lock so we don't race with concurrent unaccount */
12848
spin_lock(&nr_freq_lock);
12849
if (atomic_inc_return(&nr_freq_events) == 1)
12850
tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
12851
spin_unlock(&nr_freq_lock);
12852
#endif
12853
}
12854
12855
static void account_freq_event(void)
12856
{
12857
if (tick_nohz_full_enabled())
12858
account_freq_event_nohz();
12859
else
12860
atomic_inc(&nr_freq_events);
12861
}
12862
12863
12864
static void account_event(struct perf_event *event)
12865
{
12866
bool inc = false;
12867
12868
if (event->parent)
12869
return;
12870
12871
if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
12872
inc = true;
12873
if (event->attr.mmap || event->attr.mmap_data)
12874
atomic_inc(&nr_mmap_events);
12875
if (event->attr.build_id)
12876
atomic_inc(&nr_build_id_events);
12877
if (event->attr.comm)
12878
atomic_inc(&nr_comm_events);
12879
if (event->attr.namespaces)
12880
atomic_inc(&nr_namespaces_events);
12881
if (event->attr.cgroup)
12882
atomic_inc(&nr_cgroup_events);
12883
if (event->attr.task)
12884
atomic_inc(&nr_task_events);
12885
if (event->attr.freq)
12886
account_freq_event();
12887
if (event->attr.context_switch) {
12888
atomic_inc(&nr_switch_events);
12889
inc = true;
12890
}
12891
if (has_branch_stack(event))
12892
inc = true;
12893
if (is_cgroup_event(event))
12894
inc = true;
12895
if (event->attr.ksymbol)
12896
atomic_inc(&nr_ksymbol_events);
12897
if (event->attr.bpf_event)
12898
atomic_inc(&nr_bpf_events);
12899
if (event->attr.text_poke)
12900
atomic_inc(&nr_text_poke_events);
12901
12902
if (inc) {
12903
/*
12904
* We need the mutex here because static_branch_enable()
12905
* must complete *before* the perf_sched_count increment
12906
* becomes visible.
12907
*/
12908
if (atomic_inc_not_zero(&perf_sched_count))
12909
goto enabled;
12910
12911
mutex_lock(&perf_sched_mutex);
12912
if (!atomic_read(&perf_sched_count)) {
12913
static_branch_enable(&perf_sched_events);
12914
/*
12915
* Guarantee that all CPUs observe the key change and
12916
* call the perf scheduling hooks before proceeding to
12917
* install events that need them.
12918
*/
12919
synchronize_rcu();
12920
}
12921
/*
12922
* Now that we have waited for the sync_sched(), allow further
12923
* increments to by-pass the mutex.
12924
*/
12925
atomic_inc(&perf_sched_count);
12926
mutex_unlock(&perf_sched_mutex);
12927
}
12928
enabled:
12929
12930
account_pmu_sb_event(event);
12931
}
12932
12933
/*
12934
* Allocate and initialize an event structure
12935
*/
12936
static struct perf_event *
12937
perf_event_alloc(struct perf_event_attr *attr, int cpu,
12938
struct task_struct *task,
12939
struct perf_event *group_leader,
12940
struct perf_event *parent_event,
12941
perf_overflow_handler_t overflow_handler,
12942
void *context, int cgroup_fd)
12943
{
12944
struct pmu *pmu;
12945
struct hw_perf_event *hwc;
12946
long err = -EINVAL;
12947
int node;
12948
12949
if ((unsigned)cpu >= nr_cpu_ids) {
12950
if (!task || cpu != -1)
12951
return ERR_PTR(-EINVAL);
12952
}
12953
if (attr->sigtrap && !task) {
12954
/* Requires a task: avoid signalling random tasks. */
12955
return ERR_PTR(-EINVAL);
12956
}
12957
12958
node = (cpu >= 0) ? cpu_to_node(cpu) : -1;
12959
struct perf_event *event __free(__free_event) =
12960
kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, node);
12961
if (!event)
12962
return ERR_PTR(-ENOMEM);
12963
12964
/*
12965
* Single events are their own group leaders, with an
12966
* empty sibling list:
12967
*/
12968
if (!group_leader)
12969
group_leader = event;
12970
12971
mutex_init(&event->child_mutex);
12972
INIT_LIST_HEAD(&event->child_list);
12973
12974
INIT_LIST_HEAD(&event->event_entry);
12975
INIT_LIST_HEAD(&event->sibling_list);
12976
INIT_LIST_HEAD(&event->active_list);
12977
init_event_group(event);
12978
INIT_LIST_HEAD(&event->rb_entry);
12979
INIT_LIST_HEAD(&event->active_entry);
12980
INIT_LIST_HEAD(&event->addr_filters.list);
12981
INIT_HLIST_NODE(&event->hlist_entry);
12982
INIT_LIST_HEAD(&event->pmu_list);
12983
12984
12985
init_waitqueue_head(&event->waitq);
12986
init_irq_work(&event->pending_irq, perf_pending_irq);
12987
event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable);
12988
init_task_work(&event->pending_task, perf_pending_task);
12989
12990
mutex_init(&event->mmap_mutex);
12991
raw_spin_lock_init(&event->addr_filters.lock);
12992
12993
atomic_long_set(&event->refcount, 1);
12994
event->cpu = cpu;
12995
event->attr = *attr;
12996
event->group_leader = group_leader;
12997
event->pmu = NULL;
12998
event->oncpu = -1;
12999
13000
event->parent = parent_event;
13001
13002
event->ns = get_pid_ns(task_active_pid_ns(current));
13003
event->id = atomic64_inc_return(&perf_event_id);
13004
13005
event->state = PERF_EVENT_STATE_INACTIVE;
13006
13007
if (parent_event)
13008
event->event_caps = parent_event->event_caps;
13009
13010
if (task) {
13011
event->attach_state = PERF_ATTACH_TASK;
13012
/*
13013
* XXX pmu::event_init needs to know what task to account to
13014
* and we cannot use the ctx information because we need the
13015
* pmu before we get a ctx.
13016
*/
13017
event->hw.target = get_task_struct(task);
13018
}
13019
13020
event->clock = &local_clock;
13021
if (parent_event)
13022
event->clock = parent_event->clock;
13023
13024
if (!overflow_handler && parent_event) {
13025
overflow_handler = parent_event->overflow_handler;
13026
context = parent_event->overflow_handler_context;
13027
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
13028
if (parent_event->prog) {
13029
struct bpf_prog *prog = parent_event->prog;
13030
13031
bpf_prog_inc(prog);
13032
event->prog = prog;
13033
}
13034
#endif
13035
}
13036
13037
if (overflow_handler) {
13038
event->overflow_handler = overflow_handler;
13039
event->overflow_handler_context = context;
13040
} else if (is_write_backward(event)) {
13041
event->overflow_handler = perf_event_output_backward;
13042
event->overflow_handler_context = NULL;
13043
} else {
13044
event->overflow_handler = perf_event_output_forward;
13045
event->overflow_handler_context = NULL;
13046
}
13047
13048
perf_event__state_init(event);
13049
13050
pmu = NULL;
13051
13052
hwc = &event->hw;
13053
hwc->sample_period = attr->sample_period;
13054
if (is_event_in_freq_mode(event))
13055
hwc->sample_period = 1;
13056
hwc->last_period = hwc->sample_period;
13057
13058
local64_set(&hwc->period_left, hwc->sample_period);
13059
13060
/*
13061
* We do not support PERF_SAMPLE_READ on inherited events unless
13062
* PERF_SAMPLE_TID is also selected, which allows inherited events to
13063
* collect per-thread samples.
13064
* See perf_output_read().
13065
*/
13066
if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID))
13067
return ERR_PTR(-EINVAL);
13068
13069
if (!has_branch_stack(event))
13070
event->attr.branch_sample_type = 0;
13071
13072
pmu = perf_init_event(event);
13073
if (IS_ERR(pmu))
13074
return (void*)pmu;
13075
13076
/*
13077
* PERF_ATTACH_TASK_DATA is set in event_init()->hw_config(), so the
* attach must happen right after perf_init_event(). Otherwise,
* __free_event() would mistakenly detach a nonexistent perf_ctx_data
* if one of the steps in between fails.
13081
*/
13082
if (event->attach_state & PERF_ATTACH_TASK_DATA) {
13083
err = attach_perf_ctx_data(event);
13084
if (err)
13085
return ERR_PTR(err);
13086
}
13087
13088
/*
13089
* Disallow uncore-task events. Similarly, disallow uncore-cgroup
13090
* events (they don't make sense as the cgroup will be different
13091
* on other CPUs in the uncore mask).
13092
*/
13093
if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1))
13094
return ERR_PTR(-EINVAL);
13095
13096
if (event->attr.aux_output &&
13097
(!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) ||
13098
event->attr.aux_pause || event->attr.aux_resume))
13099
return ERR_PTR(-EOPNOTSUPP);
13100
13101
if (event->attr.aux_pause && event->attr.aux_resume)
13102
return ERR_PTR(-EINVAL);
13103
13104
if (event->attr.aux_start_paused) {
13105
if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE))
13106
return ERR_PTR(-EOPNOTSUPP);
13107
event->hw.aux_paused = 1;
13108
}
13109
13110
if (cgroup_fd != -1) {
13111
err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
13112
if (err)
13113
return ERR_PTR(err);
13114
}
13115
13116
err = exclusive_event_init(event);
13117
if (err)
13118
return ERR_PTR(err);
13119
13120
if (has_addr_filter(event)) {
13121
event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
13122
sizeof(struct perf_addr_filter_range),
13123
GFP_KERNEL);
13124
if (!event->addr_filter_ranges)
13125
return ERR_PTR(-ENOMEM);
13126
13127
/*
13128
* Clone the parent's vma offsets: they are valid until exec()
13129
* even if the mm is not shared with the parent.
13130
*/
13131
if (event->parent) {
13132
struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
13133
13134
raw_spin_lock_irq(&ifh->lock);
13135
memcpy(event->addr_filter_ranges,
13136
event->parent->addr_filter_ranges,
13137
pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range));
13138
raw_spin_unlock_irq(&ifh->lock);
13139
}
13140
13141
/* force hw sync on the address filters */
13142
event->addr_filters_gen = 1;
13143
}
13144
13145
if (!event->parent) {
13146
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
13147
err = get_callchain_buffers(attr->sample_max_stack);
13148
if (err)
13149
return ERR_PTR(err);
13150
event->attach_state |= PERF_ATTACH_CALLCHAIN;
13151
}
13152
}
13153
13154
err = security_perf_event_alloc(event);
13155
if (err)
13156
return ERR_PTR(err);
13157
13158
/* symmetric to unaccount_event() in _free_event() */
13159
account_event(event);
13160
13161
/*
13162
* Event creation should be under SRCU, see perf_pmu_unregister().
13163
*/
13164
lockdep_assert_held(&pmus_srcu);
13165
scoped_guard (spinlock, &pmu->events_lock)
13166
list_add(&event->pmu_list, &pmu->events);
13167
13168
return_ptr(event);
13169
}
13170
13171
static int perf_copy_attr(struct perf_event_attr __user *uattr,
13172
struct perf_event_attr *attr)
13173
{
13174
u32 size;
13175
int ret;
13176
13177
/* Zero the full structure, so that a short copy leaves the rest zeroed. */
13178
memset(attr, 0, sizeof(*attr));
13179
13180
ret = get_user(size, &uattr->size);
13181
if (ret)
13182
return ret;
13183
13184
/* ABI compatibility quirk: */
13185
if (!size)
13186
size = PERF_ATTR_SIZE_VER0;
13187
if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE)
13188
goto err_size;
13189
13190
ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
13191
if (ret) {
13192
if (ret == -E2BIG)
13193
goto err_size;
13194
return ret;
13195
}
13196
13197
attr->size = size;
13198
13199
if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
13200
return -EINVAL;
13201
13202
if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
13203
return -EINVAL;
13204
13205
if (attr->read_format & ~(PERF_FORMAT_MAX-1))
13206
return -EINVAL;
13207
13208
if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
13209
u64 mask = attr->branch_sample_type;
13210
13211
/* only using defined bits */
13212
if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
13213
return -EINVAL;
13214
13215
/* at least one branch bit must be set */
13216
if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
13217
return -EINVAL;
13218
13219
/* propagate priv level, when not set for branch */
13220
if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
13221
13222
/* exclude_kernel checked on syscall entry */
13223
if (!attr->exclude_kernel)
13224
mask |= PERF_SAMPLE_BRANCH_KERNEL;
13225
13226
if (!attr->exclude_user)
13227
mask |= PERF_SAMPLE_BRANCH_USER;
13228
13229
if (!attr->exclude_hv)
13230
mask |= PERF_SAMPLE_BRANCH_HV;
13231
/*
13232
* adjust user setting (for HW filter setup)
13233
*/
13234
attr->branch_sample_type = mask;
13235
}
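		/*
		 * Example (added, illustrative): with only
		 * PERF_SAMPLE_BRANCH_ANY requested and exclude_kernel set,
		 * mask ends up as ANY | BRANCH_USER | BRANCH_HV here, so the
		 * HW branch filter mirrors the event's own privilege settings.
		 */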
13236
/* privileged levels capture (kernel, hv): check permissions */
13237
if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
13238
ret = perf_allow_kernel();
13239
if (ret)
13240
return ret;
13241
}
13242
}
13243
13244
if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
13245
ret = perf_reg_validate(attr->sample_regs_user);
13246
if (ret)
13247
return ret;
13248
}
13249
13250
if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
13251
if (!arch_perf_have_user_stack_dump())
13252
return -ENOSYS;
13253
13254
/*
13255
* We have __u32 type for the size, but so far
13256
* we can only use __u16 as maximum due to the
13257
* __u16 sample size limit.
13258
*/
13259
if (attr->sample_stack_user >= USHRT_MAX)
13260
return -EINVAL;
13261
else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
13262
return -EINVAL;
13263
}
13264
13265
if (!attr->sample_max_stack)
13266
attr->sample_max_stack = sysctl_perf_event_max_stack;
13267
13268
if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
13269
ret = perf_reg_validate(attr->sample_regs_intr);
13270
13271
#ifndef CONFIG_CGROUP_PERF
13272
if (attr->sample_type & PERF_SAMPLE_CGROUP)
13273
return -EINVAL;
13274
#endif
13275
if ((attr->sample_type & PERF_SAMPLE_WEIGHT) &&
13276
(attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT))
13277
return -EINVAL;
13278
13279
if (!attr->inherit && attr->inherit_thread)
13280
return -EINVAL;
13281
13282
if (attr->remove_on_exec && attr->enable_on_exec)
13283
return -EINVAL;
13284
13285
if (attr->sigtrap && !attr->remove_on_exec)
13286
return -EINVAL;
13287
13288
out:
13289
return ret;
13290
13291
err_size:
13292
put_user(sizeof(*attr), &uattr->size);
13293
ret = -E2BIG;
13294
goto out;
13295
}
13296
13297
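/*
 * Note (added): lock two mutexes in a globally consistent order (by
 * ascending address) so that concurrent callers locking the same pair
 * in the opposite order cannot deadlock.
 */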
static void mutex_lock_double(struct mutex *a, struct mutex *b)
{
	if (b < a)
		swap(a, b);

	mutex_lock(a);
	mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
}
13305
13306
static int
13307
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
13308
{
13309
struct perf_buffer *rb = NULL;
13310
int ret = -EINVAL;
13311
13312
if (!output_event) {
13313
mutex_lock(&event->mmap_mutex);
13314
goto set;
13315
}
13316
13317
/* don't allow circular references */
13318
if (event == output_event)
13319
goto out;
13320
13321
/*
13322
* Don't allow cross-cpu buffers
13323
*/
13324
if (output_event->cpu != event->cpu)
13325
goto out;
13326
13327
/*
13328
* If it's not a per-cpu rb, it must be the same task.
13329
*/
13330
if (output_event->cpu == -1 && output_event->hw.target != event->hw.target)
13331
goto out;
13332
13333
/*
13334
* Mixing clocks in the same buffer is trouble you don't need.
13335
*/
13336
if (output_event->clock != event->clock)
13337
goto out;
13338
13339
/*
13340
* The ring buffer is written either from the beginning or from the end.
* Mixing the two is not allowed.
13342
*/
13343
if (is_write_backward(output_event) != is_write_backward(event))
13344
goto out;
13345
13346
/*
13347
* If both events generate aux data, they must be on the same PMU
13348
*/
13349
if (has_aux(event) && has_aux(output_event) &&
13350
event->pmu != output_event->pmu)
13351
goto out;
13352
13353
/*
13354
* Hold both mmap_mutex to serialize against perf_mmap_close(). Since
13355
* output_event is already on rb->event_list, and the list iteration
13356
* restarts after every removal, it is guaranteed this new event is
13357
* observed *OR* if output_event is already removed, it's guaranteed we
13358
* observe !rb->mmap_count.
13359
*/
13360
mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
13361
set:
13362
/* Can't redirect output if we've got an active mmap() */
13363
if (refcount_read(&event->mmap_count))
13364
goto unlock;
13365
13366
if (output_event) {
13367
if (output_event->state <= PERF_EVENT_STATE_REVOKED)
13368
goto unlock;
13369
13370
/* get the rb we want to redirect to */
13371
rb = ring_buffer_get(output_event);
13372
if (!rb)
13373
goto unlock;
13374
13375
/* did we race against perf_mmap_close() */
13376
if (!refcount_read(&rb->mmap_count)) {
13377
ring_buffer_put(rb);
13378
goto unlock;
13379
}
13380
}
13381
13382
ring_buffer_attach(event, rb);
13383
13384
ret = 0;
13385
unlock:
13386
mutex_unlock(&event->mmap_mutex);
13387
if (output_event)
13388
mutex_unlock(&output_event->mmap_mutex);
13389
13390
out:
13391
return ret;
13392
}
13393
13394
static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
13395
{
13396
bool nmi_safe = false;
13397
13398
switch (clk_id) {
13399
case CLOCK_MONOTONIC:
13400
event->clock = &ktime_get_mono_fast_ns;
13401
nmi_safe = true;
13402
break;
13403
13404
case CLOCK_MONOTONIC_RAW:
13405
event->clock = &ktime_get_raw_fast_ns;
13406
nmi_safe = true;
13407
break;
13408
13409
case CLOCK_REALTIME:
13410
event->clock = &ktime_get_real_ns;
13411
break;
13412
13413
case CLOCK_BOOTTIME:
13414
event->clock = &ktime_get_boottime_ns;
13415
break;
13416
13417
case CLOCK_TAI:
13418
event->clock = &ktime_get_clocktai_ns;
13419
break;
13420
13421
default:
13422
return -EINVAL;
13423
}
13424
13425
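	/*
	 * Note (added): a PMU that may deliver events from NMI context
	 * (i.e. one without PERF_PMU_CAP_NO_NMI) must read the clock from
	 * NMI as well, so only the NMI-safe fast clocks are allowed there.
	 */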
if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
13426
return -EINVAL;
13427
13428
return 0;
13429
}
13430
13431
static bool
13432
perf_check_permission(struct perf_event_attr *attr, struct task_struct *task)
13433
{
13434
unsigned int ptrace_mode = PTRACE_MODE_READ_REALCREDS;
13435
bool is_capable = perfmon_capable();
13436
13437
if (attr->sigtrap) {
13438
/*
13439
* perf_event_attr::sigtrap sends signals to the other task.
13440
* Require the current task to also have CAP_KILL.
13441
*/
13442
rcu_read_lock();
13443
is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL);
13444
rcu_read_unlock();
13445
13446
/*
13447
* If the required capabilities aren't available, fall back to the
* ptrace permission check: upgrade to ATTACH, since sending signals
13449
* can effectively change the target task.
13450
*/
13451
ptrace_mode = PTRACE_MODE_ATTACH_REALCREDS;
13452
}
13453
13454
/*
13455
* Preserve ptrace permission check for backwards compatibility. The
13456
* ptrace check also includes checks that the current task and other
13457
* task have matching uids, and is therefore not done here explicitly.
13458
*/
13459
return is_capable || ptrace_may_access(task, ptrace_mode);
13460
}
13461
13462
/**
13463
* sys_perf_event_open - open a performance event, associate it to a task/cpu
13464
*
13465
* @attr_uptr: event_id type attributes for monitoring/sampling
13466
* @pid: target pid
13467
* @cpu: target cpu
13468
* @group_fd: group leader event fd
13469
* @flags: perf event open flags
13470
*/
13471
SYSCALL_DEFINE5(perf_event_open,
13472
struct perf_event_attr __user *, attr_uptr,
13473
pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
13474
{
13475
struct perf_event *group_leader = NULL, *output_event = NULL;
13476
struct perf_event_pmu_context *pmu_ctx;
13477
struct perf_event *event, *sibling;
13478
struct perf_event_attr attr;
13479
struct perf_event_context *ctx;
13480
struct file *event_file = NULL;
13481
struct task_struct *task = NULL;
13482
struct pmu *pmu;
13483
int event_fd;
13484
int move_group = 0;
13485
int err;
13486
int f_flags = O_RDWR;
13487
int cgroup_fd = -1;
13488
13489
/* for future expandability... */
13490
if (flags & ~PERF_FLAG_ALL)
13491
return -EINVAL;
13492
13493
err = perf_copy_attr(attr_uptr, &attr);
13494
if (err)
13495
return err;
13496
13497
/* Do we allow access to perf_event_open(2) ? */
13498
err = security_perf_event_open(PERF_SECURITY_OPEN);
13499
if (err)
13500
return err;
13501
13502
if (!attr.exclude_kernel) {
13503
err = perf_allow_kernel();
13504
if (err)
13505
return err;
13506
}
13507
13508
if (attr.namespaces) {
13509
if (!perfmon_capable())
13510
return -EACCES;
13511
}
13512
13513
if (attr.freq) {
13514
if (attr.sample_freq > sysctl_perf_event_sample_rate)
13515
return -EINVAL;
13516
} else {
13517
if (attr.sample_period & (1ULL << 63))
13518
return -EINVAL;
13519
}
13520
13521
/* Only privileged users can get physical addresses */
13522
if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
13523
err = perf_allow_kernel();
13524
if (err)
13525
return err;
13526
}
13527
13528
/* REGS_INTR can leak data, lockdown must prevent this */
13529
if (attr.sample_type & PERF_SAMPLE_REGS_INTR) {
13530
err = security_locked_down(LOCKDOWN_PERF);
13531
if (err)
13532
return err;
13533
}
13534
13535
/*
13536
* In cgroup mode, the pid argument is used to pass the fd
13537
* opened to the cgroup directory in cgroupfs. The cpu argument
13538
* designates the cpu on which to monitor threads from that
13539
* cgroup.
13540
*/
13541
if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
13542
return -EINVAL;
13543
13544
if (flags & PERF_FLAG_FD_CLOEXEC)
13545
f_flags |= O_CLOEXEC;
13546
13547
event_fd = get_unused_fd_flags(f_flags);
13548
if (event_fd < 0)
13549
return event_fd;
13550
13551
/*
13552
* Event creation should be under SRCU, see perf_pmu_unregister().
13553
*/
13554
guard(srcu)(&pmus_srcu);
13555
13556
CLASS(fd, group)(group_fd); // group_fd == -1 => empty
13557
if (group_fd != -1) {
13558
if (!is_perf_file(group)) {
13559
err = -EBADF;
13560
goto err_fd;
13561
}
13562
group_leader = fd_file(group)->private_data;
13563
if (group_leader->state <= PERF_EVENT_STATE_REVOKED) {
13564
err = -ENODEV;
13565
goto err_fd;
13566
}
13567
if (flags & PERF_FLAG_FD_OUTPUT)
13568
output_event = group_leader;
13569
if (flags & PERF_FLAG_FD_NO_GROUP)
13570
group_leader = NULL;
13571
}
13572
13573
if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
13574
task = find_lively_task_by_vpid(pid);
13575
if (IS_ERR(task)) {
13576
err = PTR_ERR(task);
13577
goto err_fd;
13578
}
13579
}
13580
13581
if (task && group_leader &&
13582
group_leader->attr.inherit != attr.inherit) {
13583
err = -EINVAL;
13584
goto err_task;
13585
}
13586
13587
if (flags & PERF_FLAG_PID_CGROUP)
13588
cgroup_fd = pid;
13589
13590
event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
13591
NULL, NULL, cgroup_fd);
13592
if (IS_ERR(event)) {
13593
err = PTR_ERR(event);
13594
goto err_task;
13595
}
13596
13597
if (is_sampling_event(event)) {
13598
if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
13599
err = -EOPNOTSUPP;
13600
goto err_alloc;
13601
}
13602
}
13603
13604
/*
13605
* Special case software events and allow them to be part of
13606
* any hardware group.
13607
*/
13608
pmu = event->pmu;
13609
13610
if (attr.use_clockid) {
13611
err = perf_event_set_clock(event, attr.clockid);
13612
if (err)
13613
goto err_alloc;
13614
}
13615
13616
if (pmu->task_ctx_nr == perf_sw_context)
13617
event->event_caps |= PERF_EV_CAP_SOFTWARE;
13618
13619
if (task) {
13620
err = down_read_interruptible(&task->signal->exec_update_lock);
13621
if (err)
13622
goto err_alloc;
13623
13624
/*
13625
* We must hold exec_update_lock across this and any potential
13626
* perf_install_in_context() call for this new event to
13627
* serialize against exec() altering our credentials (and the
13628
* perf_event_exit_task() that could imply).
13629
*/
13630
err = -EACCES;
13631
if (!perf_check_permission(&attr, task))
13632
goto err_cred;
13633
}
13634
13635
/*
13636
* Get the target context (task or percpu):
13637
*/
13638
ctx = find_get_context(task, event);
13639
if (IS_ERR(ctx)) {
13640
err = PTR_ERR(ctx);
13641
goto err_cred;
13642
}
13643
13644
mutex_lock(&ctx->mutex);
13645
13646
if (ctx->task == TASK_TOMBSTONE) {
13647
err = -ESRCH;
13648
goto err_locked;
13649
}
13650
13651
if (!task) {
13652
/*
13653
* Check if the @cpu we're creating an event for is online.
13654
*
13655
* We use the perf_cpu_context::ctx::mutex to serialize against
13656
* the hotplug notifiers. See perf_event_{init,exit}_cpu().
13657
*/
13658
struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
13659
13660
if (!cpuctx->online) {
13661
err = -ENODEV;
13662
goto err_locked;
13663
}
13664
}
13665
13666
if (group_leader) {
13667
err = -EINVAL;
13668
13669
/*
13670
* Do not allow a recursive hierarchy (this new sibling
13671
* becoming part of another group-sibling):
13672
*/
13673
if (group_leader->group_leader != group_leader)
13674
goto err_locked;
13675
13676
/* All events in a group should have the same clock */
13677
if (group_leader->clock != event->clock)
13678
goto err_locked;
13679
13680
/*
13681
* Make sure both events are for the same CPU;
* grouping events for different CPUs is broken, since
* you can never schedule them concurrently anyhow.
13684
*/
13685
if (group_leader->cpu != event->cpu)
13686
goto err_locked;
13687
13688
/*
13689
* Make sure we're both on the same context; either task or cpu.
13690
*/
13691
if (group_leader->ctx != ctx)
13692
goto err_locked;
13693
13694
/*
13695
* Only a group leader can be exclusive or pinned
13696
*/
13697
if (attr.exclusive || attr.pinned)
13698
goto err_locked;
13699
13700
if (is_software_event(event) &&
13701
!in_software_context(group_leader)) {
13702
/*
13703
* If the event is a sw event, but the group_leader
13704
* is on hw context.
13705
*
13706
* Allow the addition of software events to hw
13707
* groups, this is safe because software events
13708
* never fail to schedule.
13709
*
13710
* Note the comment that goes with struct
13711
* perf_event_pmu_context.
13712
*/
13713
pmu = group_leader->pmu_ctx->pmu;
13714
} else if (!is_software_event(event)) {
13715
if (is_software_event(group_leader) &&
13716
(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
13717
/*
13718
* In case the group is a pure software group, and we
13719
* try to add a hardware event, move the whole group to
13720
* the hardware context.
13721
*/
13722
move_group = 1;
13723
}
13724
13725
/* Don't allow group of multiple hw events from different pmus */
13726
if (!in_software_context(group_leader) &&
13727
group_leader->pmu_ctx->pmu != pmu)
13728
goto err_locked;
13729
}
13730
}
13731
13732
/*
13733
* Now that we're certain of the pmu, find the pmu_ctx.
13734
*/
13735
pmu_ctx = find_get_pmu_context(pmu, ctx, event);
13736
if (IS_ERR(pmu_ctx)) {
13737
err = PTR_ERR(pmu_ctx);
13738
goto err_locked;
13739
}
13740
event->pmu_ctx = pmu_ctx;
13741
13742
if (output_event) {
13743
err = perf_event_set_output(event, output_event);
13744
if (err)
13745
goto err_context;
13746
}
13747
13748
if (!perf_event_validate_size(event)) {
13749
err = -E2BIG;
13750
goto err_context;
13751
}
13752
13753
if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
13754
err = -EINVAL;
13755
goto err_context;
13756
}
13757
13758
/*
13759
* Must be under the same ctx::mutex as perf_install_in_context(),
13760
* because we need to serialize with concurrent event creation.
13761
*/
13762
if (!exclusive_event_installable(event, ctx)) {
13763
err = -EBUSY;
13764
goto err_context;
13765
}
13766
13767
WARN_ON_ONCE(ctx->parent_ctx);
13768
13769
event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags);
13770
if (IS_ERR(event_file)) {
13771
err = PTR_ERR(event_file);
13772
event_file = NULL;
13773
goto err_context;
13774
}
13775
13776
/*
13777
* This is the point of no return; we cannot fail hereafter. This is
13778
* where we start modifying current state.
13779
*/
13780
13781
if (move_group) {
13782
perf_remove_from_context(group_leader, 0);
13783
put_pmu_ctx(group_leader->pmu_ctx);
13784
13785
for_each_sibling_event(sibling, group_leader) {
13786
perf_remove_from_context(sibling, 0);
13787
put_pmu_ctx(sibling->pmu_ctx);
13788
}
13789
13790
/*
13791
* Install the group siblings before the group leader.
13792
*
13793
* Because a group leader will try to install the entire group
* (through the sibling list, which is still intact), we can
13795
* end up with siblings installed in the wrong context.
13796
*
13797
* By installing siblings first we NO-OP because they're not
13798
* reachable through the group lists.
13799
*/
13800
for_each_sibling_event(sibling, group_leader) {
13801
sibling->pmu_ctx = pmu_ctx;
13802
get_pmu_ctx(pmu_ctx);
13803
perf_event__state_init(sibling);
13804
perf_install_in_context(ctx, sibling, sibling->cpu);
13805
}
13806
13807
/*
13808
* Removing from the context ends up with a disabled
* event. What we want here is an event in the initial
* startup state, ready to be added into the new context.
13811
*/
13812
group_leader->pmu_ctx = pmu_ctx;
13813
get_pmu_ctx(pmu_ctx);
13814
perf_event__state_init(group_leader);
13815
perf_install_in_context(ctx, group_leader, group_leader->cpu);
13816
}
13817
13818
/*
13819
* Precalculate sample_data sizes; do while holding ctx::mutex such
13820
* that we're serialized against further additions and before
13821
* perf_install_in_context() which is the point the event is active and
13822
* can use these values.
13823
*/
13824
perf_event__header_size(event);
13825
perf_event__id_header_size(event);
13826
13827
event->owner = current;
13828
13829
perf_install_in_context(ctx, event, event->cpu);
13830
perf_unpin_context(ctx);
13831
13832
mutex_unlock(&ctx->mutex);
13833
13834
if (task) {
13835
up_read(&task->signal->exec_update_lock);
13836
put_task_struct(task);
13837
}
13838
13839
mutex_lock(&current->perf_event_mutex);
13840
list_add_tail(&event->owner_entry, &current->perf_event_list);
13841
mutex_unlock(&current->perf_event_mutex);
13842
13843
/*
13844
* File reference in group guarantees that group_leader has been
13845
* kept alive until we place the new event on the sibling_list.
13846
* This ensures destruction of the group leader will find
13847
* the pointer to itself in perf_group_detach().
13848
*/
13849
fd_install(event_fd, event_file);
13850
return event_fd;
13851
13852
err_context:
13853
put_pmu_ctx(event->pmu_ctx);
13854
event->pmu_ctx = NULL; /* _free_event() */
13855
err_locked:
13856
mutex_unlock(&ctx->mutex);
13857
perf_unpin_context(ctx);
13858
put_ctx(ctx);
13859
err_cred:
13860
if (task)
13861
up_read(&task->signal->exec_update_lock);
13862
err_alloc:
13863
put_event(event);
13864
err_task:
13865
if (task)
13866
put_task_struct(task);
13867
err_fd:
13868
put_unused_fd(event_fd);
13869
return err;
13870
}
13871
13872
/**
13873
* perf_event_create_kernel_counter
13874
*
13875
* @attr: attributes of the counter to create
13876
* @cpu: cpu in which the counter is bound
13877
* @task: task to profile (NULL for percpu)
13878
* @overflow_handler: callback to trigger when we hit the event
13879
* @context: context data could be used in overflow_handler callback
13880
*/
13881
struct perf_event *
13882
perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
13883
struct task_struct *task,
13884
perf_overflow_handler_t overflow_handler,
13885
void *context)
13886
{
13887
struct perf_event_pmu_context *pmu_ctx;
13888
struct perf_event_context *ctx;
13889
struct perf_event *event;
13890
struct pmu *pmu;
13891
int err;
13892
13893
/*
13894
* Grouping is not supported for kernel events, and neither is 'AUX';
* make sure the caller's intentions are adjusted.
13896
*/
13897
if (attr->aux_output || attr->aux_action)
13898
return ERR_PTR(-EINVAL);
13899
13900
/*
13901
* Event creation should be under SRCU, see perf_pmu_unregister().
13902
*/
13903
guard(srcu)(&pmus_srcu);
13904
13905
event = perf_event_alloc(attr, cpu, task, NULL, NULL,
13906
overflow_handler, context, -1);
13907
if (IS_ERR(event)) {
13908
err = PTR_ERR(event);
13909
goto err;
13910
}
13911
13912
/* Mark owner so we could distinguish it from user events. */
13913
event->owner = TASK_TOMBSTONE;
13914
pmu = event->pmu;
13915
13916
if (pmu->task_ctx_nr == perf_sw_context)
13917
event->event_caps |= PERF_EV_CAP_SOFTWARE;
13918
13919
/*
13920
* Get the target context (task or percpu):
13921
*/
13922
ctx = find_get_context(task, event);
13923
if (IS_ERR(ctx)) {
13924
err = PTR_ERR(ctx);
13925
goto err_alloc;
13926
}
13927
13928
WARN_ON_ONCE(ctx->parent_ctx);
13929
mutex_lock(&ctx->mutex);
13930
if (ctx->task == TASK_TOMBSTONE) {
13931
err = -ESRCH;
13932
goto err_unlock;
13933
}
13934
13935
pmu_ctx = find_get_pmu_context(pmu, ctx, event);
13936
if (IS_ERR(pmu_ctx)) {
13937
err = PTR_ERR(pmu_ctx);
13938
goto err_unlock;
13939
}
13940
event->pmu_ctx = pmu_ctx;
13941
13942
if (!task) {
13943
/*
13944
* Check if the @cpu we're creating an event for is online.
13945
*
13946
* We use the perf_cpu_context::ctx::mutex to serialize against
13947
* the hotplug notifiers. See perf_event_{init,exit}_cpu().
13948
*/
13949
struct perf_cpu_context *cpuctx =
13950
container_of(ctx, struct perf_cpu_context, ctx);
13951
if (!cpuctx->online) {
13952
err = -ENODEV;
13953
goto err_pmu_ctx;
13954
}
13955
}
13956
13957
if (!exclusive_event_installable(event, ctx)) {
13958
err = -EBUSY;
13959
goto err_pmu_ctx;
13960
}
13961
13962
perf_install_in_context(ctx, event, event->cpu);
13963
perf_unpin_context(ctx);
13964
mutex_unlock(&ctx->mutex);
13965
13966
return event;
13967
13968
err_pmu_ctx:
13969
put_pmu_ctx(pmu_ctx);
13970
event->pmu_ctx = NULL; /* _free_event() */
13971
err_unlock:
13972
mutex_unlock(&ctx->mutex);
13973
perf_unpin_context(ctx);
13974
put_ctx(ctx);
13975
err_alloc:
13976
put_event(event);
13977
err:
13978
return ERR_PTR(err);
13979
}
13980
EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
13981
13982
static void __perf_pmu_remove(struct perf_event_context *ctx,
13983
int cpu, struct pmu *pmu,
13984
struct perf_event_groups *groups,
13985
struct list_head *events)
13986
{
13987
struct perf_event *event, *sibling;
13988
13989
perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) {
13990
perf_remove_from_context(event, 0);
13991
put_pmu_ctx(event->pmu_ctx);
13992
list_add(&event->migrate_entry, events);
13993
13994
for_each_sibling_event(sibling, event) {
13995
perf_remove_from_context(sibling, 0);
13996
put_pmu_ctx(sibling->pmu_ctx);
13997
list_add(&sibling->migrate_entry, events);
13998
}
13999
}
14000
}
14001
14002
static void __perf_pmu_install_event(struct pmu *pmu,
14003
struct perf_event_context *ctx,
14004
int cpu, struct perf_event *event)
14005
{
14006
struct perf_event_pmu_context *epc;
14007
struct perf_event_context *old_ctx = event->ctx;
14008
14009
get_ctx(ctx); /* normally find_get_context() */
14010
14011
event->cpu = cpu;
14012
epc = find_get_pmu_context(pmu, ctx, event);
14013
event->pmu_ctx = epc;
14014
14015
if (event->state >= PERF_EVENT_STATE_OFF)
14016
event->state = PERF_EVENT_STATE_INACTIVE;
14017
perf_install_in_context(ctx, event, cpu);
14018
14019
/*
14020
* Now that event->ctx is updated and visible, put the old ctx.
14021
*/
14022
put_ctx(old_ctx);
14023
}
14024
14025
static void __perf_pmu_install(struct perf_event_context *ctx,
14026
int cpu, struct pmu *pmu, struct list_head *events)
14027
{
14028
struct perf_event *event, *tmp;
14029
14030
/*
14031
* Re-instate events in 2 passes.
14032
*
14033
* Skip over group leaders and only install siblings on this first
14034
* pass; siblings will not get enabled without a leader. However, a
14035
* leader will enable its siblings, even if those are still on the old
14036
* context.
14037
*/
14038
list_for_each_entry_safe(event, tmp, events, migrate_entry) {
14039
if (event->group_leader == event)
14040
continue;
14041
14042
list_del(&event->migrate_entry);
14043
__perf_pmu_install_event(pmu, ctx, cpu, event);
14044
}
14045
14046
/*
14047
* Once all the siblings are setup properly, install the group leaders
14048
* to make it go.
14049
*/
14050
list_for_each_entry_safe(event, tmp, events, migrate_entry) {
14051
list_del(&event->migrate_entry);
14052
__perf_pmu_install_event(pmu, ctx, cpu, event);
14053
}
14054
}
14055
14056
void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
14057
{
14058
struct perf_event_context *src_ctx, *dst_ctx;
14059
LIST_HEAD(events);
14060
14061
/*
14062
* Since per-cpu context is persistent, no need to grab an extra
14063
* reference.
14064
*/
14065
src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx;
14066
dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx;
14067
14068
/*
14069
* See perf_event_ctx_lock() for comments on the details
14070
* of swizzling perf_event::ctx.
14071
*/
14072
mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
14073
14074
__perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events);
14075
__perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events);
14076
14077
if (!list_empty(&events)) {
14078
/*
14079
* Wait for the events to quiesce before re-instating them.
14080
*/
14081
synchronize_rcu();
14082
14083
__perf_pmu_install(dst_ctx, dst_cpu, pmu, &events);
14084
}
14085
14086
mutex_unlock(&dst_ctx->mutex);
14087
mutex_unlock(&src_ctx->mutex);
14088
}
14089
EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
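
/*
 * Illustrative sketch (not part of this file): an uncore-style PMU driver
 * commonly calls perf_pmu_migrate_context() from its CPU-offline callback to
 * move the events of a dying CPU to a surviving CPU in the same domain. The
 * example_pmu and example_find_new_cpu() names below are assumptions made up
 * for this sketch.
 */
#if 0
static int example_pmu_offline_cpu(unsigned int cpu)
{
	int target = example_find_new_cpu(cpu);	/* hypothetical helper */

	if (target < nr_cpu_ids)
		perf_pmu_migrate_context(&example_pmu, cpu, target);
	return 0;
}
#endif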

static void sync_child_event(struct perf_event *child_event,
			     struct task_struct *task)
{
	struct perf_event *parent_event = child_event->parent;
	u64 child_val;

	if (child_event->attr.inherit_stat) {
		if (task && task != TASK_TOMBSTONE)
			perf_event_read_event(child_event, task);
	}

	child_val = perf_event_count(child_event, false);

	/*
	 * Add back the child's count to the parent's count:
	 */
	atomic64_add(child_val, &parent_event->child_count);
	atomic64_add(child_event->total_time_enabled,
		     &parent_event->child_total_time_enabled);
	atomic64_add(child_event->total_time_running,
		     &parent_event->child_total_time_running);
}

static void
perf_event_exit_event(struct perf_event *event,
		      struct perf_event_context *ctx,
		      struct task_struct *task,
		      bool revoke)
{
	struct perf_event *parent_event = event->parent;
	unsigned long detach_flags = DETACH_EXIT;
	unsigned int attach_state;

	if (parent_event) {
		/*
		 * Do not destroy the 'original' grouping; because of the
		 * context switch optimization the original events could've
		 * ended up in a random child task.
		 *
		 * If we were to destroy the original group, all group related
		 * operations would cease to function properly after this
		 * random child dies.
		 *
		 * Do destroy all inherited groups, we don't care about those
		 * and being thorough is better.
		 */
		detach_flags |= DETACH_GROUP | DETACH_CHILD;
		mutex_lock(&parent_event->child_mutex);
		/* PERF_ATTACH_ITRACE might be set concurrently */
		attach_state = READ_ONCE(event->attach_state);

		if (attach_state & PERF_ATTACH_CHILD)
			sync_child_event(event, task);
	}

	if (revoke)
		detach_flags |= DETACH_GROUP | DETACH_REVOKE;

	perf_remove_from_context(event, detach_flags);
	/*
	 * Child events can be freed.
	 */
	if (parent_event) {
		mutex_unlock(&parent_event->child_mutex);

		/*
		 * Match the refcount initialization. Make sure it doesn't happen
		 * twice if pmu_detach_event() calls it on an already exited task.
		 */
		if (attach_state & PERF_ATTACH_CHILD) {
			/*
			 * Kick perf_poll() for is_event_hup();
			 */
			perf_event_wakeup(parent_event);
			/*
			 * pmu_detach_event() will have an extra refcount.
			 * perf_pending_task() might have one too.
			 */
			put_event(event);
		}

		return;
	}

	/*
	 * Parent events are governed by their filedesc, retain them.
	 */
	perf_event_wakeup(event);
}

static void perf_event_exit_task_context(struct task_struct *task, bool exit)
{
	struct perf_event_context *ctx, *clone_ctx = NULL;
	struct perf_event *child_event, *next;

	ctx = perf_pin_task_context(task);
	if (!ctx)
		return;

	/*
	 * In order to reduce the amount of trickery in ctx tear-down, we hold
	 * ctx::mutex over the entire thing. This serializes against almost
	 * everything that wants to access the ctx.
	 *
	 * The exception is sys_perf_event_open() /
	 * perf_event_create_kernel_counter() which does find_get_context()
	 * without ctx::mutex (it cannot because of the move_group double mutex
	 * lock thing). See the comments in perf_install_in_context().
	 */
	mutex_lock(&ctx->mutex);

	/*
	 * In a single ctx::lock section, de-schedule the events and detach the
	 * context from the task such that we cannot ever get it scheduled back
	 * in.
	 */
	raw_spin_lock_irq(&ctx->lock);
	if (exit)
		task_ctx_sched_out(ctx, NULL, EVENT_ALL);

	/*
	 * Now that the context is inactive, destroy the task <-> ctx relation
	 * and mark the context dead.
	 */
	RCU_INIT_POINTER(task->perf_event_ctxp, NULL);
	put_ctx(ctx); /* cannot be last */
	WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
	put_task_struct(task); /* cannot be last */

	clone_ctx = unclone_ctx(ctx);
	raw_spin_unlock_irq(&ctx->lock);

	if (clone_ctx)
		put_ctx(clone_ctx);

	/*
	 * Report the task dead after unscheduling the events so that we
	 * won't get any samples after PERF_RECORD_EXIT. We can however still
	 * get a few PERF_RECORD_READ events.
	 */
	if (exit)
		perf_event_task(task, ctx, 0);

	list_for_each_entry_safe(child_event, next, &ctx->event_list, event_entry)
		perf_event_exit_event(child_event, ctx, exit ? task : NULL, false);

	mutex_unlock(&ctx->mutex);

	if (!exit) {
		/*
		 * perf_event_release_kernel() could still have a reference on
		 * this context. In that case we must wait for these events to
		 * have been freed (in particular all their references to this
		 * task must've been dropped).
		 *
		 * Without this, copy_process() will unconditionally free this
		 * task (irrespective of its reference count) and
		 * _free_event()'s put_task_struct(event->hw.target) will be a
		 * use-after-free.
		 *
		 * Wait for all events to drop their context reference.
		 */
		wait_var_event(&ctx->refcount,
			       refcount_read(&ctx->refcount) == 1);
	}
	put_ctx(ctx);
}

/*
 * When a task exits, feed back event values to parent events.
 *
 * Can be called with exec_update_lock held when called from
 * setup_new_exec().
 */
void perf_event_exit_task(struct task_struct *task)
{
	struct perf_event *event, *tmp;

	WARN_ON_ONCE(task != current);

	mutex_lock(&task->perf_event_mutex);
	list_for_each_entry_safe(event, tmp, &task->perf_event_list,
				 owner_entry) {
		list_del_init(&event->owner_entry);

		/*
		 * Ensure the list deletion is visible before we clear
		 * the owner; this closes a race against perf_release() where
		 * we need to serialize on the owner->perf_event_mutex.
		 */
		smp_store_release(&event->owner, NULL);
	}
	mutex_unlock(&task->perf_event_mutex);

	perf_event_exit_task_context(task, true);

	/*
	 * The perf_event_exit_task_context calls perf_event_task
	 * with task's task_ctx, which generates EXIT events for
	 * task contexts and sets task->perf_event_ctxp to NULL.
	 * At this point we need to send EXIT events to cpu contexts.
	 */
	perf_event_task(task, NULL, 0);

	/*
	 * Detach the perf_ctx_data for the system-wide event.
	 */
	guard(percpu_read)(&global_ctx_data_rwsem);
	detach_task_ctx_data(task);
}

/*
 * Free a context as created by inheritance by perf_event_init_task() below,
 * used by fork() in case of failure.
 *
 * Even though the task has never lived, the context and events have been
 * exposed through the child_list, so we must take care tearing it all down.
 */
void perf_event_free_task(struct task_struct *task)
{
	perf_event_exit_task_context(task, false);
}

void perf_event_delayed_put(struct task_struct *task)
{
	WARN_ON_ONCE(task->perf_event_ctxp);
}

struct file *perf_event_get(unsigned int fd)
{
	struct file *file = fget(fd);
	if (!file)
		return ERR_PTR(-EBADF);

	if (file->f_op != &perf_fops) {
		fput(file);
		return ERR_PTR(-EBADF);
	}

	return file;
}

const struct perf_event *perf_get_event(struct file *file)
{
	if (file->f_op != &perf_fops)
		return ERR_PTR(-EINVAL);

	return file->private_data;
}

const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
{
	if (!event)
		return ERR_PTR(-EINVAL);

	return &event->attr;
}
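
/*
 * Illustrative sketch (not part of this file): how another subsystem can
 * resolve a perf event fd handed in from user space. perf_event_get() takes a
 * reference on the file and validates it, and perf_get_event() returns the
 * event behind an already-validated perf file. example_inspect_perf_fd() is
 * an assumption made up for this sketch.
 */
#if 0
static int example_inspect_perf_fd(unsigned int fd)
{
	const struct perf_event *event;
	struct file *file;

	file = perf_event_get(fd);
	if (IS_ERR(file))
		return PTR_ERR(file);

	event = perf_get_event(file);
	if (IS_ERR(event)) {
		fput(file);
		return PTR_ERR(event);
	}

	pr_debug("perf event: type=%u config=0x%llx\n",
		 event->attr.type, event->attr.config);

	fput(file);
	return 0;
}
#endif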

int perf_allow_kernel(void)
{
	if (sysctl_perf_event_paranoid > 1 && !perfmon_capable())
		return -EACCES;

	return security_perf_event_open(PERF_SECURITY_KERNEL);
}
EXPORT_SYMBOL_GPL(perf_allow_kernel);
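
/*
 * Illustrative sketch (not part of this file): a PMU driver that can expose
 * kernel-space state would typically gate that capability on
 * perf_allow_kernel() in its event_init() callback. example_pmu_event_init()
 * is an assumption made up for this sketch.
 */
#if 0
static int example_pmu_event_init(struct perf_event *event)
{
	int ret;

	if (event->attr.exclude_kernel)
		return 0;

	/* Kernel profiling requested: honour perf_event_paranoid and LSM. */
	ret = perf_allow_kernel();
	if (ret)
		return ret;

	return 0;
}
#endif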
14357
14358
/*
14359
* Inherit an event from parent task to child task.
14360
*
14361
* Returns:
14362
* - valid pointer on success
14363
* - NULL for orphaned events
14364
* - IS_ERR() on error
14365
*/
14366
static struct perf_event *
14367
inherit_event(struct perf_event *parent_event,
14368
struct task_struct *parent,
14369
struct perf_event_context *parent_ctx,
14370
struct task_struct *child,
14371
struct perf_event *group_leader,
14372
struct perf_event_context *child_ctx)
14373
{
14374
enum perf_event_state parent_state = parent_event->state;
14375
struct perf_event_pmu_context *pmu_ctx;
14376
struct perf_event *child_event;
14377
unsigned long flags;
14378
14379
/*
14380
* Instead of creating recursive hierarchies of events,
14381
* we link inherited events back to the original parent,
14382
* which has a filp for sure, which we use as the reference
14383
* count:
14384
*/
14385
if (parent_event->parent)
14386
parent_event = parent_event->parent;
14387
14388
if (parent_event->state <= PERF_EVENT_STATE_REVOKED)
14389
return NULL;
14390
14391
/*
14392
* Event creation should be under SRCU, see perf_pmu_unregister().
14393
*/
14394
guard(srcu)(&pmus_srcu);
14395
14396
child_event = perf_event_alloc(&parent_event->attr,
14397
parent_event->cpu,
14398
child,
14399
group_leader, parent_event,
14400
NULL, NULL, -1);
14401
if (IS_ERR(child_event))
14402
return child_event;
14403
14404
get_ctx(child_ctx);
14405
child_event->ctx = child_ctx;
14406
14407
pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event);
14408
if (IS_ERR(pmu_ctx)) {
14409
free_event(child_event);
14410
return ERR_CAST(pmu_ctx);
14411
}
14412
child_event->pmu_ctx = pmu_ctx;
14413
14414
/*
14415
* is_orphaned_event() and list_add_tail(&parent_event->child_list)
14416
* must be under the same lock in order to serialize against
14417
* perf_event_release_kernel(), such that either we must observe
14418
* is_orphaned_event() or they will observe us on the child_list.
14419
*/
14420
mutex_lock(&parent_event->child_mutex);
14421
if (is_orphaned_event(parent_event) ||
14422
!atomic_long_inc_not_zero(&parent_event->refcount)) {
14423
mutex_unlock(&parent_event->child_mutex);
14424
free_event(child_event);
14425
return NULL;
14426
}
14427
14428
/*
14429
* Make the child state follow the state of the parent event,
14430
* not its attr.disabled bit. We hold the parent's mutex,
14431
* so we won't race with perf_event_{en, dis}able_family.
14432
*/
14433
if (parent_state >= PERF_EVENT_STATE_INACTIVE)
14434
child_event->state = PERF_EVENT_STATE_INACTIVE;
14435
else
14436
child_event->state = PERF_EVENT_STATE_OFF;
14437
14438
if (parent_event->attr.freq) {
14439
u64 sample_period = parent_event->hw.sample_period;
14440
struct hw_perf_event *hwc = &child_event->hw;
14441
14442
hwc->sample_period = sample_period;
14443
hwc->last_period = sample_period;
14444
14445
local64_set(&hwc->period_left, sample_period);
14446
}
14447
14448
child_event->overflow_handler = parent_event->overflow_handler;
14449
child_event->overflow_handler_context
14450
= parent_event->overflow_handler_context;
14451
14452
/*
14453
* Precalculate sample_data sizes
14454
*/
14455
perf_event__header_size(child_event);
14456
perf_event__id_header_size(child_event);
14457
14458
/*
14459
* Link it up in the child's context:
14460
*/
14461
raw_spin_lock_irqsave(&child_ctx->lock, flags);
14462
add_event_to_ctx(child_event, child_ctx);
14463
child_event->attach_state |= PERF_ATTACH_CHILD;
14464
raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
14465
14466
/*
14467
* Link this into the parent event's child list
14468
*/
14469
list_add_tail(&child_event->child_list, &parent_event->child_list);
14470
mutex_unlock(&parent_event->child_mutex);
14471
14472
return child_event;
14473
}

/*
 * Inherits an event group.
 *
 * This will quietly suppress orphaned events; !inherit_event() is not an error.
 * This matches with perf_event_release_kernel() removing all child events.
 *
 * Returns:
 *  - 0 on success
 *  - <0 on error
 */
static int inherit_group(struct perf_event *parent_event,
			 struct task_struct *parent,
			 struct perf_event_context *parent_ctx,
			 struct task_struct *child,
			 struct perf_event_context *child_ctx)
{
	struct perf_event *leader;
	struct perf_event *sub;
	struct perf_event *child_ctr;

	leader = inherit_event(parent_event, parent, parent_ctx,
			       child, NULL, child_ctx);
	if (IS_ERR(leader))
		return PTR_ERR(leader);
	/*
	 * @leader can be NULL here because of is_orphaned_event(). In this
	 * case inherit_event() will create individual events, similar to what
	 * perf_group_detach() would do anyway.
	 */
	for_each_sibling_event(sub, parent_event) {
		child_ctr = inherit_event(sub, parent, parent_ctx,
					  child, leader, child_ctx);
		if (IS_ERR(child_ctr))
			return PTR_ERR(child_ctr);

		if (sub->aux_event == parent_event && child_ctr &&
		    !perf_get_aux_event(child_ctr, leader))
			return -EINVAL;
	}
	if (leader)
		leader->group_generation = parent_event->group_generation;
	return 0;
}

/*
 * Creates the child task context and tries to inherit the event-group.
 *
 * Clears @inherited_all on !attr.inherited or error. Note that we'll leave
 * inherited_all set when we 'fail' to inherit an orphaned event; this is
 * consistent with perf_event_release_kernel() removing all child events.
 *
 * Returns:
 *  - 0 on success
 *  - <0 on error
 */
static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
		   struct perf_event_context *parent_ctx,
		   struct task_struct *child,
		   u64 clone_flags, int *inherited_all)
{
	struct perf_event_context *child_ctx;
	int ret;

	if (!event->attr.inherit ||
	    (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) ||
	    /* Do not inherit if sigtrap and signal handlers were cleared. */
	    (event->attr.sigtrap && (clone_flags & CLONE_CLEAR_SIGHAND))) {
		*inherited_all = 0;
		return 0;
	}

	child_ctx = child->perf_event_ctxp;
	if (!child_ctx) {
		/*
		 * This is executed from the parent task context, so
		 * inherit events that have been marked for cloning.
		 * First allocate and initialize a context for the
		 * child.
		 */
		child_ctx = alloc_perf_context(child);
		if (!child_ctx)
			return -ENOMEM;

		child->perf_event_ctxp = child_ctx;
	}

	ret = inherit_group(event, parent, parent_ctx, child, child_ctx);
	if (ret)
		*inherited_all = 0;

	return ret;
}

/*
 * Initialize the perf_event context in task_struct
 */
static int perf_event_init_context(struct task_struct *child, u64 clone_flags)
{
	struct perf_event_context *child_ctx, *parent_ctx;
	struct perf_event_context *cloned_ctx;
	struct perf_event *event;
	struct task_struct *parent = current;
	int inherited_all = 1;
	unsigned long flags;
	int ret = 0;

	if (likely(!parent->perf_event_ctxp))
		return 0;

	/*
	 * If the parent's context is a clone, pin it so it won't get
	 * swapped under us.
	 */
	parent_ctx = perf_pin_task_context(parent);
	if (!parent_ctx)
		return 0;

	/*
	 * No need to check if parent_ctx != NULL here; since we saw
	 * it non-NULL earlier, the only reason for it to become NULL
	 * is if we exit, and since we're currently in the middle of
	 * a fork we can't be exiting at the same time.
	 */

	/*
	 * Lock the parent list. No need to lock the child - not PID
	 * hashed yet and not running, so nobody can access it.
	 */
	mutex_lock(&parent_ctx->mutex);

	/*
	 * We don't have to disable NMIs - we are only looking at
	 * the list, not manipulating it:
	 */
	perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
		ret = inherit_task_group(event, parent, parent_ctx,
					 child, clone_flags, &inherited_all);
		if (ret)
			goto out_unlock;
	}

	/*
	 * We can't hold ctx->lock when iterating the ->flexible_group list due
	 * to allocations, but we need to prevent rotation because
	 * rotate_ctx() will change the list from interrupt context.
	 */
	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
	parent_ctx->rotate_disable = 1;
	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);

	perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
		ret = inherit_task_group(event, parent, parent_ctx,
					 child, clone_flags, &inherited_all);
		if (ret)
			goto out_unlock;
	}

	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
	parent_ctx->rotate_disable = 0;

	child_ctx = child->perf_event_ctxp;

	if (child_ctx && inherited_all) {
		/*
		 * Mark the child context as a clone of the parent
		 * context, or of whatever the parent is a clone of.
		 *
		 * Note that if the parent is a clone, the holding of
		 * parent_ctx->lock prevents it from being uncloned.
		 */
		cloned_ctx = parent_ctx->parent_ctx;
		if (cloned_ctx) {
			child_ctx->parent_ctx = cloned_ctx;
			child_ctx->parent_gen = parent_ctx->parent_gen;
		} else {
			child_ctx->parent_ctx = parent_ctx;
			child_ctx->parent_gen = parent_ctx->generation;
		}
		get_ctx(child_ctx->parent_ctx);
	}

	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
out_unlock:
	mutex_unlock(&parent_ctx->mutex);

	perf_unpin_context(parent_ctx);
	put_ctx(parent_ctx);

	return ret;
}

/*
 * Initialize the perf_event context in task_struct
 */
int perf_event_init_task(struct task_struct *child, u64 clone_flags)
{
	int ret;

	memset(child->perf_recursion, 0, sizeof(child->perf_recursion));
	child->perf_event_ctxp = NULL;
	mutex_init(&child->perf_event_mutex);
	INIT_LIST_HEAD(&child->perf_event_list);
	child->perf_ctx_data = NULL;

	ret = perf_event_init_context(child, clone_flags);
	if (ret) {
		perf_event_free_task(child);
		return ret;
	}

	return 0;
}

static void __init perf_event_init_all_cpus(void)
{
	struct swevent_htable *swhash;
	struct perf_cpu_context *cpuctx;
	int cpu;

	zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
	zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL);
	zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL);
	zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL);
	zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL);
	zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL);

	for_each_possible_cpu(cpu) {
		swhash = &per_cpu(swevent_htable, cpu);
		mutex_init(&swhash->hlist_mutex);

		INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
		raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));

		INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));

		cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
		__perf_event_init_context(&cpuctx->ctx);
		lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
		lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
		cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
		cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
		cpuctx->heap = cpuctx->heap_default;
	}
}

static void perf_swevent_init_cpu(unsigned int cpu)
{
	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);

	mutex_lock(&swhash->hlist_mutex);
	if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
		struct swevent_hlist *hlist;

		hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
		WARN_ON(!hlist);
		rcu_assign_pointer(swhash->swevent_hlist, hlist);
	}
	mutex_unlock(&swhash->hlist_mutex);
}

#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
static void __perf_event_exit_context(void *__info)
{
	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
	struct perf_event_context *ctx = __info;
	struct perf_event *event;

	raw_spin_lock(&ctx->lock);
	ctx_sched_out(ctx, NULL, EVENT_TIME);
	list_for_each_entry(event, &ctx->event_list, event_entry)
		__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
	raw_spin_unlock(&ctx->lock);
}

static void perf_event_clear_cpumask(unsigned int cpu)
{
	int target[PERF_PMU_MAX_SCOPE];
	unsigned int scope;
	struct pmu *pmu;

	cpumask_clear_cpu(cpu, perf_online_mask);

	for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
		const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
		struct cpumask *pmu_cpumask = perf_scope_cpumask(scope);

		target[scope] = -1;
		if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
			continue;

		if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask))
			continue;
		target[scope] = cpumask_any_but(cpumask, cpu);
		if (target[scope] < nr_cpu_ids)
			cpumask_set_cpu(target[scope], pmu_cpumask);
	}

	/* migrate */
	list_for_each_entry(pmu, &pmus, entry) {
		if (pmu->scope == PERF_PMU_SCOPE_NONE ||
		    WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE))
			continue;

		if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids)
			perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]);
	}
}

static void perf_event_exit_cpu_context(int cpu)
{
	struct perf_cpu_context *cpuctx;
	struct perf_event_context *ctx;

	// XXX simplify cpuctx->online
	mutex_lock(&pmus_lock);
	/*
	 * Clear the cpumasks, and migrate to other CPUs if possible.
	 * Must be invoked before the __perf_event_exit_context.
	 */
	perf_event_clear_cpumask(cpu);
	cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
	ctx = &cpuctx->ctx;

	mutex_lock(&ctx->mutex);
	smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
	cpuctx->online = 0;
	mutex_unlock(&ctx->mutex);
	mutex_unlock(&pmus_lock);
}
#else

static void perf_event_exit_cpu_context(int cpu) { }

#endif

static void perf_event_setup_cpumask(unsigned int cpu)
{
	struct cpumask *pmu_cpumask;
	unsigned int scope;

	/*
	 * Early boot stage, the cpumask hasn't been set yet.
	 * The perf_online_<domain>_masks include the first CPU of each domain.
	 * Unconditionally set the boot CPU in the perf_online_<domain>_masks.
	 */
	if (cpumask_empty(perf_online_mask)) {
		for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
			pmu_cpumask = perf_scope_cpumask(scope);
			if (WARN_ON_ONCE(!pmu_cpumask))
				continue;
			cpumask_set_cpu(cpu, pmu_cpumask);
		}
		goto end;
	}

	for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
		const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);

		pmu_cpumask = perf_scope_cpumask(scope);

		if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
			continue;

		if (!cpumask_empty(cpumask) &&
		    cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids)
			cpumask_set_cpu(cpu, pmu_cpumask);
	}
end:
	cpumask_set_cpu(cpu, perf_online_mask);
}

int perf_event_init_cpu(unsigned int cpu)
{
	struct perf_cpu_context *cpuctx;
	struct perf_event_context *ctx;

	perf_swevent_init_cpu(cpu);

	mutex_lock(&pmus_lock);
	perf_event_setup_cpumask(cpu);
	cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
	ctx = &cpuctx->ctx;

	mutex_lock(&ctx->mutex);
	cpuctx->online = 1;
	mutex_unlock(&ctx->mutex);
	mutex_unlock(&pmus_lock);

	return 0;
}

int perf_event_exit_cpu(unsigned int cpu)
{
	perf_event_exit_cpu_context(cpu);
	return 0;
}
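
/*
 * Illustrative sketch (not part of this file): perf_event_init_cpu() /
 * perf_event_exit_cpu() form an online/offline pair. A driver with a similar
 * pair would commonly register it with cpuhp_setup_state(); the example_*
 * names here are assumptions made up for this sketch, and the perf core
 * itself is wired up through the static hotplug state table instead.
 */
#if 0
static int __init example_register_hotplug(void)
{
	int ret;

	/* example_cpu_online()/example_cpu_offline() are hypothetical. */
	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "perf/example:online",
				example_cpu_online, example_cpu_offline);
	return ret < 0 ? ret : 0;
}
#endif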

static int
perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
{
	int cpu;

	for_each_online_cpu(cpu)
		perf_event_exit_cpu(cpu);

	return NOTIFY_OK;
}

/*
 * Run the perf reboot notifier at the very last possible moment so that
 * the generic watchdog code runs as long as possible.
 */
static struct notifier_block perf_reboot_notifier = {
	.notifier_call = perf_reboot,
	.priority = INT_MIN,
};

void __init perf_event_init(void)
{
	int ret;

	idr_init(&pmu_idr);

	unwind_deferred_init(&perf_unwind_work,
			     perf_unwind_deferred_callback);

	perf_event_init_all_cpus();
	init_srcu_struct(&pmus_srcu);
	perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
	perf_pmu_register(&perf_cpu_clock, "cpu_clock", -1);
	perf_pmu_register(&perf_task_clock, "task_clock", -1);
	perf_tp_register();
	perf_event_init_cpu(smp_processor_id());
	register_reboot_notifier(&perf_reboot_notifier);

	ret = init_hw_breakpoint();
	WARN(ret, "hw_breakpoint initialization failed with: %d", ret);

	perf_event_cache = KMEM_CACHE(perf_event, SLAB_PANIC);

	/*
	 * Build time assertion that we keep the data_head at the intended
	 * location. IOW, validation we got the __reserved[] size right.
	 */
	BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
		     != 1024);
}

ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
			      char *page)
{
	struct perf_pmu_events_attr *pmu_attr =
		container_of(attr, struct perf_pmu_events_attr, attr);

	if (pmu_attr->event_str)
		return sprintf(page, "%s\n", pmu_attr->event_str);

	return 0;
}
EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
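
/*
 * Illustrative sketch (not part of this file): how a PMU driver typically
 * exposes named events through sysfs with perf_event_sysfs_show() as the show
 * callback, via the PMU_EVENT_ATTR_STRING() helper. The event name and config
 * string below are assumptions made up for this sketch.
 */
#if 0
PMU_EVENT_ATTR_STRING(example_cycles, example_attr_cycles, "event=0x3c");

static struct attribute *example_pmu_events[] = {
	&example_attr_cycles.attr.attr,
	NULL,
};

static const struct attribute_group example_pmu_events_group = {
	.name  = "events",
	.attrs = example_pmu_events,
};
#endif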

static int __init perf_event_sysfs_init(void)
{
	struct pmu *pmu;
	int ret;

	mutex_lock(&pmus_lock);

	ret = bus_register(&pmu_bus);
	if (ret)
		goto unlock;

	list_for_each_entry(pmu, &pmus, entry) {
		if (pmu->dev)
			continue;

		ret = pmu_dev_alloc(pmu);
		WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
	}
	pmu_bus_running = 1;
	ret = 0;

unlock:
	mutex_unlock(&pmus_lock);

	return ret;
}
device_initcall(perf_event_sysfs_init);

#ifdef CONFIG_CGROUP_PERF
static struct cgroup_subsys_state *
perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct perf_cgroup *jc;

	jc = kzalloc(sizeof(*jc), GFP_KERNEL);
	if (!jc)
		return ERR_PTR(-ENOMEM);

	jc->info = alloc_percpu(struct perf_cgroup_info);
	if (!jc->info) {
		kfree(jc);
		return ERR_PTR(-ENOMEM);
	}

	return &jc->css;
}

static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
{
	struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);

	free_percpu(jc->info);
	kfree(jc);
}

static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
{
	perf_event_cgroup(css->cgroup);
	return 0;
}

static int __perf_cgroup_move(void *info)
{
	struct task_struct *task = info;

	preempt_disable();
	perf_cgroup_switch(task);
	preempt_enable();

	return 0;
}

static void perf_cgroup_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *css;

	cgroup_taskset_for_each(task, css, tset)
		task_function_call(task, __perf_cgroup_move, task);
}

struct cgroup_subsys perf_event_cgrp_subsys = {
	.css_alloc	= perf_cgroup_css_alloc,
	.css_free	= perf_cgroup_css_free,
	.css_online	= perf_cgroup_css_online,
	.attach		= perf_cgroup_attach,
	/*
	 * Implicitly enable on dfl hierarchy so that perf events can
	 * always be filtered by cgroup2 path as long as perf_event
	 * controller is not mounted on a legacy hierarchy.
	 */
	.implicit_on_dfl = true,
	.threaded	= true,
};
#endif /* CONFIG_CGROUP_PERF */

DEFINE_STATIC_CALL_RET0(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);
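
/*
 * Illustrative sketch (not part of this file): perf_snapshot_branch_stack is
 * a static call that defaults to returning 0. An architecture PMU driver with
 * a branch-stack snapshot capability would typically point it at its own
 * helper via static_call_update(). The example_* names are assumptions made
 * up for this sketch.
 */
#if 0
static int example_snapshot_branch_stack(struct perf_branch_entry *entries,
					 unsigned int cnt)
{
	/* Fill @entries with up to @cnt captured branches; sketch only. */
	return 0;
}

static void example_enable_branch_snapshot(void)
{
	static_call_update(perf_snapshot_branch_stack,
			   example_snapshot_branch_stack);
}
#endif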