GitHub Repository: torvalds/linux
Path: blob/master/tools/perf/builtin-record.c
// SPDX-License-Identifier: GPL-2.0
/*
 * builtin-record.c
 *
 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
 */
#include "builtin.h"

#include "util/build-id.h"
#include <subcmd/parse-options.h>
#include <internal/xyarray.h>
#include "util/parse-events.h"
#include "util/config.h"

#include "util/callchain.h"
#include "util/cgroup.h"
#include "util/header.h"
#include "util/event.h"
#include "util/evlist.h"
#include "util/evsel.h"
#include "util/debug.h"
#include "util/mmap.h"
#include "util/mutex.h"
#include "util/target.h"
#include "util/session.h"
#include "util/tool.h"
#include "util/stat.h"
#include "util/symbol.h"
#include "util/record.h"
#include "util/cpumap.h"
#include "util/thread_map.h"
#include "util/data.h"
#include "util/perf_regs.h"
#include "util/auxtrace.h"
#include "util/tsc.h"
#include "util/parse-branch-options.h"
#include "util/parse-regs-options.h"
#include "util/perf_api_probe.h"
#include "util/trigger.h"
#include "util/perf-hooks.h"
#include "util/cpu-set-sched.h"
#include "util/synthetic-events.h"
#include "util/time-utils.h"
#include "util/units.h"
#include "util/bpf-event.h"
#include "util/util.h"
#include "util/pfm.h"
#include "util/pmu.h"
#include "util/pmus.h"
#include "util/clockid.h"
#include "util/off_cpu.h"
#include "util/bpf-filter.h"
#include "util/strbuf.h"
#include "asm/bug.h"
#include "perf.h"
#include "cputopo.h"

#include <errno.h>
#include <inttypes.h>
#include <locale.h>
#include <poll.h>
#include <pthread.h>
#include <unistd.h>
#ifndef HAVE_GETTID
#include <syscall.h>
#endif
#include <sched.h>
#include <signal.h>
#ifdef HAVE_EVENTFD_SUPPORT
#include <sys/eventfd.h>
#endif
#include <sys/mman.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <linux/err.h>
#include <linux/string.h>
#include <linux/time64.h>
#include <linux/zalloc.h>
#include <linux/bitmap.h>
#include <sys/time.h>

struct switch_output {
        bool enabled;
        bool signal;
        unsigned long size;
        unsigned long time;
        const char *str;
        bool set;
        char **filenames;
        int num_files;
        int cur_file;
};

struct thread_mask {
        struct mmap_cpu_mask maps;
        struct mmap_cpu_mask affinity;
};

struct record_thread {
        pid_t tid;
        struct thread_mask *mask;
        struct {
                int msg[2];
                int ack[2];
        } pipes;
        struct fdarray pollfd;
        int ctlfd_pos;
        int nr_mmaps;
        struct mmap **maps;
        struct mmap **overwrite_maps;
        struct record *rec;
        unsigned long long samples;
        unsigned long waking;
        u64 bytes_written;
        u64 bytes_transferred;
        u64 bytes_compressed;
};

static __thread struct record_thread *thread;

enum thread_msg {
        THREAD_MSG__UNDEFINED = 0,
        THREAD_MSG__READY,
        THREAD_MSG__MAX,
};

static const char *thread_msg_tags[THREAD_MSG__MAX] = {
        "UNDEFINED", "READY"
};

enum thread_spec {
        THREAD_SPEC__UNDEFINED = 0,
        THREAD_SPEC__CPU,
        THREAD_SPEC__CORE,
        THREAD_SPEC__PACKAGE,
        THREAD_SPEC__NUMA,
        THREAD_SPEC__USER,
        THREAD_SPEC__MAX,
};

static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
        "undefined", "cpu", "core", "package", "numa", "user"
};

struct pollfd_index_map {
        int evlist_pollfd_index;
        int thread_pollfd_index;
};

struct record {
        struct perf_tool tool;
        struct record_opts opts;
        u64 bytes_written;
        u64 thread_bytes_written;
        struct perf_data data;
        struct auxtrace_record *itr;
        struct evlist *evlist;
        struct perf_session *session;
        struct evlist *sb_evlist;
        pthread_t thread_id;
        int realtime_prio;
        bool latency;
        bool switch_output_event_set;
        bool no_buildid;
        bool no_buildid_set;
        bool no_buildid_cache;
        bool no_buildid_cache_set;
        bool buildid_all;
        bool buildid_mmap;
        bool buildid_mmap_set;
        bool timestamp_filename;
        bool timestamp_boundary;
        bool off_cpu;
        const char *filter_action;
        const char *uid_str;
        struct switch_output switch_output;
        unsigned long long samples;
        unsigned long output_max_size; /* = 0: unlimited */
        struct perf_debuginfod debuginfod;
        int nr_threads;
        struct thread_mask *thread_masks;
        struct record_thread *thread_data;
        struct pollfd_index_map *index_map;
        size_t index_map_sz;
        size_t index_map_cnt;
};

static volatile int done;

static volatile int auxtrace_record__snapshot_started;
static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
static DEFINE_TRIGGER(switch_output_trigger);

static const char *affinity_tags[PERF_AFFINITY_MAX] = {
        "SYS", "NODE", "CPU"
};

static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event,
                                  struct perf_sample *sample, struct machine *machine);
static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event,
                                   struct perf_sample *sample, struct machine *machine);
static int process_timestamp_boundary(const struct perf_tool *tool,
                                      union perf_event *event,
                                      struct perf_sample *sample,
                                      struct machine *machine);

#ifndef HAVE_GETTID
static inline pid_t gettid(void)
{
        return (pid_t)syscall(__NR_gettid);
}
#endif

static int record__threads_enabled(struct record *rec)
{
        return rec->opts.threads_spec;
}

static bool switch_output_signal(struct record *rec)
{
        return rec->switch_output.signal &&
               trigger_is_ready(&switch_output_trigger);
}

static bool switch_output_size(struct record *rec)
{
        return rec->switch_output.size &&
               trigger_is_ready(&switch_output_trigger) &&
               (rec->bytes_written >= rec->switch_output.size);
}

static bool switch_output_time(struct record *rec)
{
        return rec->switch_output.time &&
               trigger_is_ready(&switch_output_trigger);
}

static u64 record__bytes_written(struct record *rec)
{
        return rec->bytes_written + rec->thread_bytes_written;
}

static bool record__output_max_size_exceeded(struct record *rec)
{
        return rec->output_max_size &&
               (record__bytes_written(rec) >= rec->output_max_size);
}

static int record__write(struct record *rec, struct mmap *map __maybe_unused,
                         void *bf, size_t size)
{
        struct perf_data_file *file = &rec->session->data->file;

        if (map && map->file)
                file = map->file;

        if (perf_data_file__write(file, bf, size) < 0) {
                pr_err("failed to write perf data, error: %m\n");
                return -1;
        }

        if (map && map->file) {
                thread->bytes_written += size;
                rec->thread_bytes_written += size;
        } else {
                rec->bytes_written += size;
        }

        if (record__output_max_size_exceeded(rec) && !done) {
                fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
                                " stopping session ]\n",
                                record__bytes_written(rec) >> 10);
                done = 1;
        }

        if (switch_output_size(rec))
                trigger_hit(&switch_output_trigger);

        return 0;
}

static int record__aio_enabled(struct record *rec);
static int record__comp_enabled(struct record *rec);
static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
                             void *dst, size_t dst_size, void *src, size_t src_size);

#ifdef HAVE_AIO_SUPPORT
static int record__aio_write(struct aiocb *cblock, int trace_fd,
                             void *buf, size_t size, off_t off)
{
        int rc;

        cblock->aio_fildes = trace_fd;
        cblock->aio_buf = buf;
        cblock->aio_nbytes = size;
        cblock->aio_offset = off;
        cblock->aio_sigevent.sigev_notify = SIGEV_NONE;

        do {
                rc = aio_write(cblock);
                if (rc == 0) {
                        break;
                } else if (errno != EAGAIN) {
                        cblock->aio_fildes = -1;
                        pr_err("failed to queue perf data, error: %m\n");
                        break;
                }
        } while (1);

        return rc;
}

static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
{
        void *rem_buf;
        off_t rem_off;
        size_t rem_size;
        int rc, aio_errno;
        ssize_t aio_ret, written;

        aio_errno = aio_error(cblock);
        if (aio_errno == EINPROGRESS)
                return 0;

        written = aio_ret = aio_return(cblock);
        if (aio_ret < 0) {
                if (aio_errno != EINTR)
                        pr_err("failed to write perf data, error: %m\n");
                written = 0;
        }

        rem_size = cblock->aio_nbytes - written;

        if (rem_size == 0) {
                cblock->aio_fildes = -1;
                /*
                 * md->refcount is incremented in record__aio_pushfn() for
                 * every aio write request started in record__aio_push() so
                 * decrement it because the request is now complete.
                 */
                perf_mmap__put(&md->core);
                rc = 1;
        } else {
                /*
                 * aio write request may require restart with the
                 * remainder if the kernel didn't write whole
                 * chunk at once.
                 */
                rem_off = cblock->aio_offset + written;
                rem_buf = (void *)(cblock->aio_buf + written);
                record__aio_write(cblock, cblock->aio_fildes,
                                  rem_buf, rem_size, rem_off);
                rc = 0;
        }

        return rc;
}

static int record__aio_sync(struct mmap *md, bool sync_all)
{
        struct aiocb **aiocb = md->aio.aiocb;
        struct aiocb *cblocks = md->aio.cblocks;
        struct timespec timeout = { 0, 1000 * 1000 * 1 }; /* 1ms */
        int i, do_suspend;

        do {
                do_suspend = 0;
                for (i = 0; i < md->aio.nr_cblocks; ++i) {
                        if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
                                if (sync_all)
                                        aiocb[i] = NULL;
                                else
                                        return i;
                        } else {
                                /*
                                 * Started aio write is not complete yet
                                 * so it has to be waited before the
                                 * next allocation.
                                 */
                                aiocb[i] = &cblocks[i];
                                do_suspend = 1;
                        }
                }
                if (!do_suspend)
                        return -1;

                while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
                        if (!(errno == EAGAIN || errno == EINTR))
                                pr_err("failed to sync perf data, error: %m\n");
                }
        } while (1);
}

struct record_aio {
        struct record *rec;
        void *data;
        size_t size;
};

static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
{
        struct record_aio *aio = to;

        /*
         * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
         * to release space in the kernel buffer as fast as possible, calling
         * perf_mmap__consume() from perf_mmap__push() function.
         *
         * That lets the kernel proceed with storing more profiling data into
         * the kernel buffer earlier than other per-cpu kernel buffers are handled.
         *
         * Copying can be done in two steps in case the chunk of profiling data
         * crosses the upper bound of the kernel buffer. In this case we first move
         * part of data from map->start till the upper bound and then the remainder
         * from the beginning of the kernel buffer till the end of the data chunk.
         */

        if (record__comp_enabled(aio->rec)) {
                ssize_t compressed = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
                                                   mmap__mmap_len(map) - aio->size,
                                                   buf, size);
                if (compressed < 0)
                        return (int)compressed;

                size = compressed;
        } else {
                memcpy(aio->data + aio->size, buf, size);
        }

        if (!aio->size) {
                /*
                 * Increment map->refcount to guard map->aio.data[] buffer
                 * from premature deallocation because map object can be
                 * released earlier than aio write request started on
                 * map->aio.data[] buffer is complete.
                 *
                 * perf_mmap__put() is done at record__aio_complete()
                 * after started aio request completion or at record__aio_push()
                 * if the request failed to start.
                 */
                perf_mmap__get(&map->core);
        }

        aio->size += size;

        return size;
}

static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
{
        int ret, idx;
        int trace_fd = rec->session->data->file.fd;
        struct record_aio aio = { .rec = rec, .size = 0 };

        /*
         * Call record__aio_sync() to wait till map->aio.data[] buffer
         * becomes available after previous aio write operation.
         */

        idx = record__aio_sync(map, false);
        aio.data = map->aio.data[idx];
        ret = perf_mmap__push(map, &aio, record__aio_pushfn);
        if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
                return ret;

        rec->samples++;
        ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
        if (!ret) {
                *off += aio.size;
                rec->bytes_written += aio.size;
                if (switch_output_size(rec))
                        trigger_hit(&switch_output_trigger);
        } else {
                /*
                 * Decrement map->refcount incremented in record__aio_pushfn()
                 * back if record__aio_write() operation failed to start, otherwise
                 * map->refcount is decremented in record__aio_complete() after
                 * aio write operation finishes successfully.
                 */
                perf_mmap__put(&map->core);
        }

        return ret;
}

static off_t record__aio_get_pos(int trace_fd)
{
        return lseek(trace_fd, 0, SEEK_CUR);
}

static void record__aio_set_pos(int trace_fd, off_t pos)
{
        lseek(trace_fd, pos, SEEK_SET);
}

static void record__aio_mmap_read_sync(struct record *rec)
{
        int i;
        struct evlist *evlist = rec->evlist;
        struct mmap *maps = evlist->mmap;

        if (!record__aio_enabled(rec))
                return;

        for (i = 0; i < evlist->core.nr_mmaps; i++) {
                struct mmap *map = &maps[i];

                if (map->core.base)
                        record__aio_sync(map, true);
        }
}

static int nr_cblocks_default = 1;
static int nr_cblocks_max = 4;

static int record__aio_parse(const struct option *opt,
                             const char *str,
                             int unset)
{
        struct record_opts *opts = (struct record_opts *)opt->value;

        if (unset) {
                opts->nr_cblocks = 0;
        } else {
                if (str)
                        opts->nr_cblocks = strtol(str, NULL, 0);
                if (!opts->nr_cblocks)
                        opts->nr_cblocks = nr_cblocks_default;
        }

        return 0;
}
#else /* HAVE_AIO_SUPPORT */
static int nr_cblocks_max = 0;

static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
                            off_t *off __maybe_unused)
{
        return -1;
}

static off_t record__aio_get_pos(int trace_fd __maybe_unused)
{
        return -1;
}

static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
{
}

static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
{
}
#endif

static int record__aio_enabled(struct record *rec)
{
        return rec->opts.nr_cblocks > 0;
}

#define MMAP_FLUSH_DEFAULT 1
static int record__mmap_flush_parse(const struct option *opt,
                                    const char *str,
                                    int unset)
{
        int flush_max;
        struct record_opts *opts = (struct record_opts *)opt->value;
        static struct parse_tag tags[] = {
                { .tag = 'B', .mult = 1 },
                { .tag = 'K', .mult = 1 << 10 },
                { .tag = 'M', .mult = 1 << 20 },
                { .tag = 'G', .mult = 1 << 30 },
                { .tag = 0 },
        };

        if (unset)
                return 0;

        if (str) {
                opts->mmap_flush = parse_tag_value(str, tags);
                if (opts->mmap_flush == (int)-1)
                        opts->mmap_flush = strtol(str, NULL, 0);
        }

        if (!opts->mmap_flush)
                opts->mmap_flush = MMAP_FLUSH_DEFAULT;

        flush_max = evlist__mmap_size(opts->mmap_pages);
        flush_max /= 4;
        if (opts->mmap_flush > flush_max)
                opts->mmap_flush = flush_max;

        return 0;
}

#ifdef HAVE_ZSTD_SUPPORT
static unsigned int comp_level_default = 1;

static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
{
        struct record_opts *opts = opt->value;

        if (unset) {
                opts->comp_level = 0;
        } else {
                if (str)
                        opts->comp_level = strtol(str, NULL, 0);
                if (!opts->comp_level)
                        opts->comp_level = comp_level_default;
        }

        return 0;
}
#endif
static unsigned int comp_level_max = 22;

static int record__comp_enabled(struct record *rec)
{
        return rec->opts.comp_level > 0;
}

static int process_synthesized_event(const struct perf_tool *tool,
                                     union perf_event *event,
                                     struct perf_sample *sample __maybe_unused,
                                     struct machine *machine __maybe_unused)
{
        struct record *rec = container_of(tool, struct record, tool);
        return record__write(rec, NULL, event, event->header.size);
}

static struct mutex synth_lock;

static int process_locked_synthesized_event(const struct perf_tool *tool,
                                            union perf_event *event,
                                            struct perf_sample *sample __maybe_unused,
                                            struct machine *machine __maybe_unused)
{
        int ret;

        mutex_lock(&synth_lock);
        ret = process_synthesized_event(tool, event, sample, machine);
        mutex_unlock(&synth_lock);
        return ret;
}

static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
{
        struct record *rec = to;

        if (record__comp_enabled(rec)) {
                struct perf_record_compressed2 *event = map->data;
                size_t padding = 0;
                u8 pad[8] = {0};
                ssize_t compressed = zstd_compress(rec->session, map, map->data,
                                                   mmap__mmap_len(map), bf, size);

                if (compressed < 0)
                        return (int)compressed;

                bf = event;
                thread->samples++;

                /*
                 * The record from `zstd_compress` is not 8 bytes aligned, which would cause asan
                 * error. We make it aligned here.
                 */
                event->data_size = compressed - sizeof(struct perf_record_compressed2);
                event->header.size = PERF_ALIGN(compressed, sizeof(u64));
                padding = event->header.size - compressed;
                return record__write(rec, map, bf, compressed) ||
                       record__write(rec, map, &pad, padding);
        }

        thread->samples++;
        return record__write(rec, map, bf, size);
}

682
static volatile sig_atomic_t signr = -1;
683
static volatile sig_atomic_t child_finished;
684
#ifdef HAVE_EVENTFD_SUPPORT
685
static volatile sig_atomic_t done_fd = -1;
686
#endif
687
688
static void sig_handler(int sig)
689
{
690
if (sig == SIGCHLD)
691
child_finished = 1;
692
else
693
signr = sig;
694
695
done = 1;
696
#ifdef HAVE_EVENTFD_SUPPORT
697
if (done_fd >= 0) {
698
u64 tmp = 1;
699
int orig_errno = errno;
700
701
/*
702
* It is possible for this signal handler to run after done is
703
* checked in the main loop, but before the perf counter fds are
704
* polled. If this happens, the poll() will continue to wait
705
* even though done is set, and will only break out if either
706
* another signal is received, or the counters are ready for
707
* read. To ensure the poll() doesn't sleep when done is set,
708
* use an eventfd (done_fd) to wake up the poll().
709
*/
710
if (write(done_fd, &tmp, sizeof(tmp)) < 0)
711
pr_err("failed to signal wakeup fd, error: %m\n");
712
713
errno = orig_errno;
714
}
715
#endif // HAVE_EVENTFD_SUPPORT
716
}
717
718
static void sigsegv_handler(int sig)
719
{
720
perf_hooks__recover();
721
sighandler_dump_stack(sig);
722
}
723
724
static void record__sig_exit(void)
725
{
726
if (signr == -1)
727
return;
728
729
signal(signr, SIG_DFL);
730
raise(signr);
731
}
732
733
static int record__process_auxtrace(const struct perf_tool *tool,
734
struct mmap *map,
735
union perf_event *event, void *data1,
736
size_t len1, void *data2, size_t len2)
737
{
738
struct record *rec = container_of(tool, struct record, tool);
739
struct perf_data *data = &rec->data;
740
size_t padding;
741
u8 pad[8] = {0};
742
743
if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
744
off_t file_offset;
745
int fd = perf_data__fd(data);
746
int err;
747
748
file_offset = lseek(fd, 0, SEEK_CUR);
749
if (file_offset == -1)
750
return -1;
751
err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
752
event, file_offset);
753
if (err)
754
return err;
755
}
756
757
/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
758
padding = (len1 + len2) & 7;
759
if (padding)
760
padding = 8 - padding;
761
762
record__write(rec, map, event, event->header.size);
763
record__write(rec, map, data1, len1);
764
if (len2)
765
record__write(rec, map, data2, len2);
766
record__write(rec, map, &pad, padding);
767
768
return 0;
769
}
770
771
static int record__auxtrace_mmap_read(struct record *rec,
772
struct mmap *map)
773
{
774
int ret;
775
776
ret = auxtrace_mmap__read(map, rec->itr,
777
perf_session__env(rec->session),
778
&rec->tool,
779
record__process_auxtrace);
780
if (ret < 0)
781
return ret;
782
783
if (ret)
784
rec->samples++;
785
786
return 0;
787
}
788
789
static int record__auxtrace_mmap_read_snapshot(struct record *rec,
790
struct mmap *map)
791
{
792
int ret;
793
794
ret = auxtrace_mmap__read_snapshot(map, rec->itr,
795
perf_session__env(rec->session),
796
&rec->tool,
797
record__process_auxtrace,
798
rec->opts.auxtrace_snapshot_size);
799
if (ret < 0)
800
return ret;
801
802
if (ret)
803
rec->samples++;
804
805
return 0;
806
}
807
808
static int record__auxtrace_read_snapshot_all(struct record *rec)
809
{
810
int i;
811
int rc = 0;
812
813
for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
814
struct mmap *map = &rec->evlist->mmap[i];
815
816
if (!map->auxtrace_mmap.base)
817
continue;
818
819
if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
820
rc = -1;
821
goto out;
822
}
823
}
824
out:
825
return rc;
826
}
827
828
static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
829
{
830
pr_debug("Recording AUX area tracing snapshot\n");
831
if (record__auxtrace_read_snapshot_all(rec) < 0) {
832
trigger_error(&auxtrace_snapshot_trigger);
833
} else {
834
if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
835
trigger_error(&auxtrace_snapshot_trigger);
836
else
837
trigger_ready(&auxtrace_snapshot_trigger);
838
}
839
}
840
841
static int record__auxtrace_snapshot_exit(struct record *rec)
842
{
843
if (trigger_is_error(&auxtrace_snapshot_trigger))
844
return 0;
845
846
if (!auxtrace_record__snapshot_started &&
847
auxtrace_record__snapshot_start(rec->itr))
848
return -1;
849
850
record__read_auxtrace_snapshot(rec, true);
851
if (trigger_is_error(&auxtrace_snapshot_trigger))
852
return -1;
853
854
return 0;
855
}
856
857
static int record__auxtrace_init(struct record *rec)
858
{
859
int err;
860
861
if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
862
&& record__threads_enabled(rec)) {
863
pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
864
return -EINVAL;
865
}
866
867
if (!rec->itr) {
868
rec->itr = auxtrace_record__init(rec->evlist, &err);
869
if (err)
870
return err;
871
}
872
873
err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
874
rec->opts.auxtrace_snapshot_opts);
875
if (err)
876
return err;
877
878
err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
879
rec->opts.auxtrace_sample_opts);
880
if (err)
881
return err;
882
883
err = auxtrace_parse_aux_action(rec->evlist);
884
if (err)
885
return err;
886
887
return auxtrace_parse_filters(rec->evlist);
888
}
889
890
static int record__config_text_poke(struct evlist *evlist)
891
{
892
struct evsel *evsel;
893
894
/* Nothing to do if text poke is already configured */
895
evlist__for_each_entry(evlist, evsel) {
896
if (evsel->core.attr.text_poke)
897
return 0;
898
}
899
900
evsel = evlist__add_dummy_on_all_cpus(evlist);
901
if (!evsel)
902
return -ENOMEM;
903
904
evsel->core.attr.text_poke = 1;
905
evsel->core.attr.ksymbol = 1;
906
evsel->immediate = true;
907
evsel__set_sample_bit(evsel, TIME);
908
909
return 0;
910
}
911
912
static int record__config_off_cpu(struct record *rec)
913
{
914
return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
915
}
916
917
static bool record__tracking_system_wide(struct record *rec)
918
{
919
struct evlist *evlist = rec->evlist;
920
struct evsel *evsel;
921
922
/*
923
* If non-dummy evsel exists, system_wide sideband is need to
924
* help parse sample information.
925
* For example, PERF_EVENT_MMAP event to help parse symbol,
926
* and PERF_EVENT_COMM event to help parse task executable name.
927
*/
928
evlist__for_each_entry(evlist, evsel) {
929
if (!evsel__is_dummy_event(evsel))
930
return true;
931
}
932
933
return false;
934
}
935
936
static int record__config_tracking_events(struct record *rec)
937
{
938
struct record_opts *opts = &rec->opts;
939
struct evlist *evlist = rec->evlist;
940
bool system_wide = false;
941
struct evsel *evsel;
942
943
/*
944
* For initial_delay, system wide or a hybrid system, we need to add
945
* tracking event so that we can track PERF_RECORD_MMAP to cover the
946
* delay of waiting or event synthesis.
947
*/
948
if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
949
perf_pmus__num_core_pmus() > 1) {
950
/*
951
* User space tasks can migrate between CPUs, so when tracing
952
* selected CPUs, sideband for all CPUs is still needed.
953
*/
954
if (!!opts->target.cpu_list && record__tracking_system_wide(rec))
955
system_wide = true;
956
957
evsel = evlist__findnew_tracking_event(evlist, system_wide);
958
if (!evsel)
959
return -ENOMEM;
960
961
/*
962
* Enable the tracking event when the process is forked for
963
* initial_delay, immediately for system wide.
964
*/
965
if (opts->target.initial_delay && !evsel->immediate &&
966
!target__has_cpu(&opts->target))
967
evsel->core.attr.enable_on_exec = 1;
968
else
969
evsel->immediate = 1;
970
}
971
972
return 0;
973
}
974
975
static bool record__kcore_readable(struct machine *machine)
976
{
977
char kcore[PATH_MAX];
978
int fd;
979
980
scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
981
982
fd = open(kcore, O_RDONLY);
983
if (fd < 0)
984
return false;
985
986
close(fd);
987
988
return true;
989
}
990
991
static int record__kcore_copy(struct machine *machine, struct perf_data *data)
992
{
993
char from_dir[PATH_MAX];
994
char kcore_dir[PATH_MAX];
995
int ret;
996
997
snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
998
999
ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
1000
if (ret)
1001
return ret;
1002
1003
return kcore_copy(from_dir, kcore_dir);
1004
}
1005
1006
static void record__thread_data_init_pipes(struct record_thread *thread_data)
1007
{
1008
thread_data->pipes.msg[0] = -1;
1009
thread_data->pipes.msg[1] = -1;
1010
thread_data->pipes.ack[0] = -1;
1011
thread_data->pipes.ack[1] = -1;
1012
}
1013
1014
static int record__thread_data_open_pipes(struct record_thread *thread_data)
1015
{
1016
if (pipe(thread_data->pipes.msg))
1017
return -EINVAL;
1018
1019
if (pipe(thread_data->pipes.ack)) {
1020
close(thread_data->pipes.msg[0]);
1021
thread_data->pipes.msg[0] = -1;
1022
close(thread_data->pipes.msg[1]);
1023
thread_data->pipes.msg[1] = -1;
1024
return -EINVAL;
1025
}
1026
1027
pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
1028
thread_data->pipes.msg[0], thread_data->pipes.msg[1],
1029
thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
1030
1031
return 0;
1032
}
1033
1034
static void record__thread_data_close_pipes(struct record_thread *thread_data)
1035
{
1036
if (thread_data->pipes.msg[0] != -1) {
1037
close(thread_data->pipes.msg[0]);
1038
thread_data->pipes.msg[0] = -1;
1039
}
1040
if (thread_data->pipes.msg[1] != -1) {
1041
close(thread_data->pipes.msg[1]);
1042
thread_data->pipes.msg[1] = -1;
1043
}
1044
if (thread_data->pipes.ack[0] != -1) {
1045
close(thread_data->pipes.ack[0]);
1046
thread_data->pipes.ack[0] = -1;
1047
}
1048
if (thread_data->pipes.ack[1] != -1) {
1049
close(thread_data->pipes.ack[1]);
1050
thread_data->pipes.ack[1] = -1;
1051
}
1052
}
1053
1054
static bool evlist__per_thread(struct evlist *evlist)
1055
{
1056
return cpu_map__is_dummy(evlist->core.user_requested_cpus);
1057
}
1058
1059
static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
1060
{
1061
int m, tm, nr_mmaps = evlist->core.nr_mmaps;
1062
struct mmap *mmap = evlist->mmap;
1063
struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1064
struct perf_cpu_map *cpus = evlist->core.all_cpus;
1065
bool per_thread = evlist__per_thread(evlist);
1066
1067
if (per_thread)
1068
thread_data->nr_mmaps = nr_mmaps;
1069
else
1070
thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1071
thread_data->mask->maps.nbits);
1072
if (mmap) {
1073
thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1074
if (!thread_data->maps)
1075
return -ENOMEM;
1076
}
1077
if (overwrite_mmap) {
1078
thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1079
if (!thread_data->overwrite_maps) {
1080
zfree(&thread_data->maps);
1081
return -ENOMEM;
1082
}
1083
}
1084
pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1085
thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1086
1087
for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1088
if (per_thread ||
1089
test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1090
if (thread_data->maps) {
1091
thread_data->maps[tm] = &mmap[m];
1092
pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1093
thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1094
}
1095
if (thread_data->overwrite_maps) {
1096
thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1097
pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1098
thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1099
}
1100
tm++;
1101
}
1102
}
1103
1104
return 0;
1105
}
1106
1107
static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1108
{
1109
int f, tm, pos;
1110
struct mmap *map, *overwrite_map;
1111
1112
fdarray__init(&thread_data->pollfd, 64);
1113
1114
for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1115
map = thread_data->maps ? thread_data->maps[tm] : NULL;
1116
overwrite_map = thread_data->overwrite_maps ?
1117
thread_data->overwrite_maps[tm] : NULL;
1118
1119
for (f = 0; f < evlist->core.pollfd.nr; f++) {
1120
void *ptr = evlist->core.pollfd.priv[f].ptr;
1121
1122
if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1123
pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1124
&evlist->core.pollfd);
1125
if (pos < 0)
1126
return pos;
1127
pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1128
thread_data, pos, evlist->core.pollfd.entries[f].fd);
1129
}
1130
}
1131
}
1132
1133
return 0;
1134
}
1135
1136
static void record__free_thread_data(struct record *rec)
1137
{
1138
int t;
1139
struct record_thread *thread_data = rec->thread_data;
1140
1141
if (thread_data == NULL)
1142
return;
1143
1144
for (t = 0; t < rec->nr_threads; t++) {
1145
record__thread_data_close_pipes(&thread_data[t]);
1146
zfree(&thread_data[t].maps);
1147
zfree(&thread_data[t].overwrite_maps);
1148
fdarray__exit(&thread_data[t].pollfd);
1149
}
1150
1151
zfree(&rec->thread_data);
1152
}
1153
1154
static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1155
int evlist_pollfd_index,
1156
int thread_pollfd_index)
1157
{
1158
size_t x = rec->index_map_cnt;
1159
1160
if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1161
return -ENOMEM;
1162
rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1163
rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1164
rec->index_map_cnt += 1;
1165
return 0;
1166
}
1167
1168
static int record__update_evlist_pollfd_from_thread(struct record *rec,
1169
struct evlist *evlist,
1170
struct record_thread *thread_data)
1171
{
1172
struct pollfd *e_entries = evlist->core.pollfd.entries;
1173
struct pollfd *t_entries = thread_data->pollfd.entries;
1174
int err = 0;
1175
size_t i;
1176
1177
for (i = 0; i < rec->index_map_cnt; i++) {
1178
int e_pos = rec->index_map[i].evlist_pollfd_index;
1179
int t_pos = rec->index_map[i].thread_pollfd_index;
1180
1181
if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1182
e_entries[e_pos].events != t_entries[t_pos].events) {
1183
pr_err("Thread and evlist pollfd index mismatch\n");
1184
err = -EINVAL;
1185
continue;
1186
}
1187
e_entries[e_pos].revents = t_entries[t_pos].revents;
1188
}
1189
return err;
1190
}
1191
1192
static int record__dup_non_perf_events(struct record *rec,
1193
struct evlist *evlist,
1194
struct record_thread *thread_data)
1195
{
1196
struct fdarray *fda = &evlist->core.pollfd;
1197
int i, ret;
1198
1199
for (i = 0; i < fda->nr; i++) {
1200
if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1201
continue;
1202
ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1203
if (ret < 0) {
1204
pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1205
return ret;
1206
}
1207
pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1208
thread_data, ret, fda->entries[i].fd);
1209
ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1210
if (ret < 0) {
1211
pr_err("Failed to map thread and evlist pollfd indexes\n");
1212
return ret;
1213
}
1214
}
1215
return 0;
1216
}
1217
1218
static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1219
{
1220
int t, ret;
1221
struct record_thread *thread_data;
1222
1223
rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1224
if (!rec->thread_data) {
1225
pr_err("Failed to allocate thread data\n");
1226
return -ENOMEM;
1227
}
1228
thread_data = rec->thread_data;
1229
1230
for (t = 0; t < rec->nr_threads; t++)
1231
record__thread_data_init_pipes(&thread_data[t]);
1232
1233
for (t = 0; t < rec->nr_threads; t++) {
1234
thread_data[t].rec = rec;
1235
thread_data[t].mask = &rec->thread_masks[t];
1236
ret = record__thread_data_init_maps(&thread_data[t], evlist);
1237
if (ret) {
1238
pr_err("Failed to initialize thread[%d] maps\n", t);
1239
goto out_free;
1240
}
1241
ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1242
if (ret) {
1243
pr_err("Failed to initialize thread[%d] pollfd\n", t);
1244
goto out_free;
1245
}
1246
if (t) {
1247
thread_data[t].tid = -1;
1248
ret = record__thread_data_open_pipes(&thread_data[t]);
1249
if (ret) {
1250
pr_err("Failed to open thread[%d] communication pipes\n", t);
1251
goto out_free;
1252
}
1253
ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1254
POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1255
if (ret < 0) {
1256
pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1257
goto out_free;
1258
}
1259
thread_data[t].ctlfd_pos = ret;
1260
pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1261
thread_data, thread_data[t].ctlfd_pos,
1262
thread_data[t].pipes.msg[0]);
1263
} else {
1264
thread_data[t].tid = gettid();
1265
1266
ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1267
if (ret < 0)
1268
goto out_free;
1269
1270
thread_data[t].ctlfd_pos = -1; /* Not used */
1271
}
1272
}
1273
1274
return 0;
1275
1276
out_free:
1277
record__free_thread_data(rec);
1278
1279
return ret;
1280
}
1281
1282
static int record__mmap_evlist(struct record *rec,
1283
struct evlist *evlist)
1284
{
1285
int i, ret;
1286
struct record_opts *opts = &rec->opts;
1287
bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1288
opts->auxtrace_sample_mode;
1289
char msg[512];
1290
1291
if (opts->affinity != PERF_AFFINITY_SYS)
1292
cpu__setup_cpunode_map();
1293
1294
if (evlist__mmap_ex(evlist, opts->mmap_pages,
1295
opts->auxtrace_mmap_pages,
1296
auxtrace_overwrite,
1297
opts->nr_cblocks, opts->affinity,
1298
opts->mmap_flush, opts->comp_level) < 0) {
1299
if (errno == EPERM) {
1300
pr_err("Permission error mapping pages.\n"
1301
"Consider increasing "
1302
"/proc/sys/kernel/perf_event_mlock_kb,\n"
1303
"or try again with a smaller value of -m/--mmap_pages.\n"
1304
"(current value: %u,%u)\n",
1305
opts->mmap_pages, opts->auxtrace_mmap_pages);
1306
return -errno;
1307
} else {
1308
pr_err("failed to mmap with %d (%s)\n", errno,
1309
str_error_r(errno, msg, sizeof(msg)));
1310
if (errno)
1311
return -errno;
1312
else
1313
return -EINVAL;
1314
}
1315
}
1316
1317
if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1318
return -1;
1319
1320
ret = record__alloc_thread_data(rec, evlist);
1321
if (ret)
1322
return ret;
1323
1324
if (record__threads_enabled(rec)) {
1325
ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1326
if (ret) {
1327
pr_err("Failed to create data directory: %s\n", strerror(-ret));
1328
return ret;
1329
}
1330
for (i = 0; i < evlist->core.nr_mmaps; i++) {
1331
if (evlist->mmap)
1332
evlist->mmap[i].file = &rec->data.dir.files[i];
1333
if (evlist->overwrite_mmap)
1334
evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1335
}
1336
}
1337
1338
return 0;
1339
}
1340
1341
static int record__mmap(struct record *rec)
1342
{
1343
return record__mmap_evlist(rec, rec->evlist);
1344
}
1345
1346
static int record__open(struct record *rec)
1347
{
1348
char msg[BUFSIZ];
1349
struct evsel *pos;
1350
struct evlist *evlist = rec->evlist;
1351
struct perf_session *session = rec->session;
1352
struct record_opts *opts = &rec->opts;
1353
int rc = 0;
1354
bool skipped = false;
1355
bool removed_tracking = false;
1356
1357
evlist__for_each_entry(evlist, pos) {
1358
if (removed_tracking) {
1359
/*
1360
* Normally the head of the list has tracking enabled
1361
* for sideband data like mmaps. If this event is
1362
* removed, make sure to add tracking to the next
1363
* processed event.
1364
*/
1365
if (!pos->tracking) {
1366
pos->tracking = true;
1367
evsel__config(pos, opts, &callchain_param);
1368
}
1369
removed_tracking = false;
1370
}
1371
try_again:
1372
if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1373
bool report_error = true;
1374
1375
if (evsel__fallback(pos, &opts->target, errno, msg, sizeof(msg))) {
1376
if (verbose > 0)
1377
ui__warning("%s\n", msg);
1378
goto try_again;
1379
}
1380
if ((errno == EINVAL || errno == EBADF) &&
1381
pos->core.leader != &pos->core &&
1382
pos->weak_group) {
1383
pos = evlist__reset_weak_group(evlist, pos, true);
1384
goto try_again;
1385
}
1386
#if defined(__aarch64__) || defined(__arm__)
1387
if (strstr(evsel__name(pos), "cycles")) {
1388
struct evsel *pos2;
1389
/*
1390
* Unfortunately ARM has many events named
1391
* "cycles" on PMUs like the system-level (L3)
1392
* cache which don't support sampling. Only
1393
* display such failures to open when there is
1394
* only 1 cycles event or verbose is enabled.
1395
*/
1396
evlist__for_each_entry(evlist, pos2) {
1397
if (pos2 == pos)
1398
continue;
1399
if (strstr(evsel__name(pos2), "cycles")) {
1400
report_error = false;
1401
break;
1402
}
1403
}
1404
}
1405
#endif
1406
if (report_error || verbose > 0) {
1407
ui__error("Failure to open event '%s' on PMU '%s' which will be "
1408
"removed.\n%s\n",
1409
evsel__name(pos), evsel__pmu_name(pos), msg);
1410
}
1411
if (pos->tracking)
1412
removed_tracking = true;
1413
pos->skippable = true;
1414
skipped = true;
1415
}
1416
}
1417
1418
if (skipped) {
1419
struct evsel *tmp;
1420
int idx = 0;
1421
bool evlist_empty = true;
1422
1423
/* Remove evsels that failed to open and update indices. */
1424
evlist__for_each_entry_safe(evlist, tmp, pos) {
1425
if (pos->skippable) {
1426
evlist__remove(evlist, pos);
1427
continue;
1428
}
1429
1430
/*
1431
* Note, dummy events may be command line parsed or
1432
* added by the tool. We care about supporting `perf
1433
* record -e dummy` which may be used as a permission
1434
* check. Dummy events that are added to the command
1435
* line and opened along with other events that fail,
1436
* will still fail as if the dummy events were tool
1437
* added events for the sake of code simplicity.
1438
*/
1439
if (!evsel__is_dummy_event(pos))
1440
evlist_empty = false;
1441
}
1442
evlist__for_each_entry(evlist, pos) {
1443
pos->core.idx = idx++;
1444
}
1445
/* If list is empty then fail. */
1446
if (evlist_empty) {
1447
ui__error("Failure to open any events for recording.\n");
1448
rc = -1;
1449
goto out;
1450
}
1451
}
1452
if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1453
pr_warning(
1454
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1455
"check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1456
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1457
"file is not found in the buildid cache or in the vmlinux path.\n\n"
1458
"Samples in kernel modules won't be resolved at all.\n\n"
1459
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1460
"even with a suitable vmlinux or kallsyms file.\n\n");
1461
}
1462
1463
if (evlist__apply_filters(evlist, &pos, &opts->target)) {
1464
pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1465
pos->filter ?: "BPF", evsel__name(pos), errno,
1466
str_error_r(errno, msg, sizeof(msg)));
1467
rc = -1;
1468
goto out;
1469
}
1470
1471
rc = record__mmap(rec);
1472
if (rc)
1473
goto out;
1474
1475
session->evlist = evlist;
1476
perf_session__set_id_hdr_size(session);
1477
out:
1478
return rc;
1479
}
1480
1481
static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1482
{
1483
if (rec->evlist->first_sample_time == 0)
1484
rec->evlist->first_sample_time = sample_time;
1485
1486
if (sample_time)
1487
rec->evlist->last_sample_time = sample_time;
1488
}
1489
1490
static int process_sample_event(const struct perf_tool *tool,
1491
union perf_event *event,
1492
struct perf_sample *sample,
1493
struct evsel *evsel,
1494
struct machine *machine)
1495
{
1496
struct record *rec = container_of(tool, struct record, tool);
1497
1498
set_timestamp_boundary(rec, sample->time);
1499
1500
if (rec->buildid_all)
1501
return 0;
1502
1503
rec->samples++;
1504
return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1505
}
1506
1507
static int process_buildids(struct record *rec)
1508
{
1509
struct perf_session *session = rec->session;
1510
1511
if (perf_data__size(&rec->data) == 0)
1512
return 0;
1513
1514
/*
1515
* During this process, it'll load kernel map and replace the
1516
* dso->long_name to a real pathname it found. In this case
1517
* we prefer the vmlinux path like
1518
* /lib/modules/3.16.4/build/vmlinux
1519
*
1520
* rather than build-id path (in debug directory).
1521
* $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1522
*/
1523
symbol_conf.ignore_vmlinux_buildid = true;
1524
1525
/*
1526
* If --buildid-all is given, it marks all DSO regardless of hits,
1527
* so no need to process samples. But if timestamp_boundary is enabled,
1528
* it still needs to walk on all samples to get the timestamps of
1529
* first/last samples.
1530
*/
1531
if (rec->buildid_all && !rec->timestamp_boundary)
1532
rec->tool.sample = process_event_sample_stub;
1533
1534
return perf_session__process_events(session);
1535
}
1536
1537
static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1538
{
1539
int err;
1540
struct perf_tool *tool = data;
1541
/*
1542
*As for guest kernel when processing subcommand record&report,
1543
*we arrange module mmap prior to guest kernel mmap and trigger
1544
*a preload dso because default guest module symbols are loaded
1545
*from guest kallsyms instead of /lib/modules/XXX/XXX. This
1546
*method is used to avoid symbol missing when the first addr is
1547
*in module instead of in guest kernel.
1548
*/
1549
err = perf_event__synthesize_modules(tool, process_synthesized_event,
1550
machine);
1551
if (err < 0)
1552
pr_err("Couldn't record guest kernel [%d]'s reference"
1553
" relocation symbol.\n", machine->pid);
1554
1555
/*
1556
* We use _stext for guest kernel because guest kernel's /proc/kallsyms
1557
* have no _text sometimes.
1558
*/
1559
err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1560
machine);
1561
if (err < 0)
1562
pr_err("Couldn't record guest kernel [%d]'s reference"
1563
" relocation symbol.\n", machine->pid);
1564
}
1565
1566
static struct perf_event_header finished_round_event = {
1567
.size = sizeof(struct perf_event_header),
1568
.type = PERF_RECORD_FINISHED_ROUND,
1569
};
1570
1571
static struct perf_event_header finished_init_event = {
1572
.size = sizeof(struct perf_event_header),
1573
.type = PERF_RECORD_FINISHED_INIT,
1574
};
1575
1576
static void record__adjust_affinity(struct record *rec, struct mmap *map)
1577
{
1578
if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1579
!bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1580
thread->mask->affinity.nbits)) {
1581
bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1582
bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1583
map->affinity_mask.bits, thread->mask->affinity.nbits);
1584
sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1585
(cpu_set_t *)thread->mask->affinity.bits);
1586
if (verbose == 2) {
1587
pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1588
mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1589
}
1590
}
1591
}
1592
1593
static size_t process_comp_header(void *record, size_t increment)
1594
{
1595
struct perf_record_compressed2 *event = record;
1596
size_t size = sizeof(*event);
1597
1598
if (increment) {
1599
event->header.size += increment;
1600
return increment;
1601
}
1602
1603
event->header.type = PERF_RECORD_COMPRESSED2;
1604
event->header.size = size;
1605
1606
return size;
1607
}
1608
1609
static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
1610
void *dst, size_t dst_size, void *src, size_t src_size)
1611
{
1612
ssize_t compressed;
1613
size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed2) - 1;
1614
struct zstd_data *zstd_data = &session->zstd_data;
1615
1616
if (map && map->file)
1617
zstd_data = &map->zstd_data;
1618
1619
compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1620
max_record_size, process_comp_header);
1621
if (compressed < 0)
1622
return compressed;
1623
1624
if (map && map->file) {
1625
thread->bytes_transferred += src_size;
1626
thread->bytes_compressed += compressed;
1627
} else {
1628
session->bytes_transferred += src_size;
1629
session->bytes_compressed += compressed;
1630
}
1631
1632
return compressed;
1633
}
1634
1635
static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1636
bool overwrite, bool synch)
1637
{
1638
u64 bytes_written = rec->bytes_written;
1639
int i;
1640
int rc = 0;
1641
int nr_mmaps;
1642
struct mmap **maps;
1643
int trace_fd = rec->data.file.fd;
1644
off_t off = 0;
1645
1646
if (!evlist)
1647
return 0;
1648
1649
nr_mmaps = thread->nr_mmaps;
1650
maps = overwrite ? thread->overwrite_maps : thread->maps;
1651
1652
if (!maps)
1653
return 0;
1654
1655
if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1656
return 0;
1657
1658
if (record__aio_enabled(rec))
1659
off = record__aio_get_pos(trace_fd);
1660
1661
for (i = 0; i < nr_mmaps; i++) {
1662
u64 flush = 0;
1663
struct mmap *map = maps[i];
1664
1665
if (map->core.base) {
1666
record__adjust_affinity(rec, map);
1667
if (synch) {
1668
flush = map->core.flush;
1669
map->core.flush = 1;
1670
}
1671
if (!record__aio_enabled(rec)) {
1672
if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1673
if (synch)
1674
map->core.flush = flush;
1675
rc = -1;
1676
goto out;
1677
}
1678
} else {
1679
if (record__aio_push(rec, map, &off) < 0) {
1680
record__aio_set_pos(trace_fd, off);
1681
if (synch)
1682
map->core.flush = flush;
1683
rc = -1;
1684
goto out;
1685
}
1686
}
1687
if (synch)
1688
map->core.flush = flush;
1689
}
1690
1691
if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1692
!rec->opts.auxtrace_sample_mode &&
1693
record__auxtrace_mmap_read(rec, map) != 0) {
1694
rc = -1;
1695
goto out;
1696
}
1697
}
1698
1699
if (record__aio_enabled(rec))
1700
record__aio_set_pos(trace_fd, off);
1701
1702
/*
1703
* Mark the round finished in case we wrote
1704
* at least one event.
1705
*
1706
* No need for round events in directory mode,
1707
* because per-cpu maps and files have data
1708
* sorted by kernel.
1709
*/
1710
if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1711
rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1712
1713
if (overwrite)
1714
evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1715
out:
1716
return rc;
1717
}
1718
1719
static int record__mmap_read_all(struct record *rec, bool synch)
1720
{
1721
int err;
1722
1723
err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1724
if (err)
1725
return err;
1726
1727
return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1728
}
1729
1730
static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1731
void *arg __maybe_unused)
1732
{
1733
struct perf_mmap *map = fda->priv[fd].ptr;
1734
1735
if (map)
1736
perf_mmap__put(map);
1737
}
1738
1739
static void *record__thread(void *arg)
1740
{
1741
enum thread_msg msg = THREAD_MSG__READY;
1742
bool terminate = false;
1743
struct fdarray *pollfd;
1744
int err, ctlfd_pos;
1745
1746
thread = arg;
1747
thread->tid = gettid();
1748
1749
err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1750
if (err == -1)
1751
pr_warning("threads[%d]: failed to notify on start: %s\n",
1752
thread->tid, strerror(errno));
1753
1754
pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1755
1756
pollfd = &thread->pollfd;
1757
ctlfd_pos = thread->ctlfd_pos;
1758
1759
for (;;) {
1760
unsigned long long hits = thread->samples;
1761
1762
if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1763
break;
1764
1765
if (hits == thread->samples) {
1766
1767
err = fdarray__poll(pollfd, -1);
1768
/*
1769
* Propagate error, only if there's any. Ignore positive
1770
* number of returned events and interrupt error.
1771
*/
1772
if (err > 0 || (err < 0 && errno == EINTR))
1773
err = 0;
1774
thread->waking++;
1775
1776
if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1777
record__thread_munmap_filtered, NULL) == 0)
1778
break;
1779
}
1780
1781
if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1782
terminate = true;
1783
close(thread->pipes.msg[0]);
1784
thread->pipes.msg[0] = -1;
1785
pollfd->entries[ctlfd_pos].fd = -1;
1786
pollfd->entries[ctlfd_pos].events = 0;
1787
}
1788
1789
pollfd->entries[ctlfd_pos].revents = 0;
1790
}
1791
record__mmap_read_all(thread->rec, true);
1792
1793
err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1794
if (err == -1)
1795
pr_warning("threads[%d]: failed to notify on termination: %s\n",
1796
thread->tid, strerror(errno));
1797
1798
return NULL;
1799
}
1800
1801
static void record__init_features(struct record *rec)
1802
{
1803
struct perf_session *session = rec->session;
1804
int feat;
1805
1806
for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1807
perf_header__set_feat(&session->header, feat);
1808
1809
if (rec->no_buildid)
1810
perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1811
1812
if (!have_tracepoints(&rec->evlist->core.entries))
1813
perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1814
1815
if (!rec->opts.branch_stack)
1816
perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1817
1818
if (!rec->opts.full_auxtrace)
1819
perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1820
1821
if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1822
perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1823
1824
if (!rec->opts.use_clockid)
1825
perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1826
1827
if (!record__threads_enabled(rec))
1828
perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1829
1830
if (!record__comp_enabled(rec))
1831
perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1832
1833
perf_header__clear_feat(&session->header, HEADER_STAT);
1834
}
1835
1836
static void
1837
record__finish_output(struct record *rec)
1838
{
1839
int i;
1840
struct perf_data *data = &rec->data;
1841
int fd = perf_data__fd(data);
1842
1843
if (data->is_pipe) {
1844
/* Just to display approx. size */
1845
data->file.size = rec->bytes_written;
1846
return;
1847
}
1848
1849
rec->session->header.data_size += rec->bytes_written;
1850
data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1851
if (record__threads_enabled(rec)) {
1852
for (i = 0; i < data->dir.nr; i++)
1853
data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1854
}
1855
1856
/* Buildid scanning disabled or build ID in kernel and synthesized map events. */
1857
if (!rec->no_buildid || !rec->no_buildid_cache) {
1858
process_buildids(rec);
1859
1860
if (rec->buildid_all)
1861
perf_session__dsos_hit_all(rec->session);
1862
}
1863
perf_session__write_header(rec->session, rec->evlist, fd, true);
1864
perf_session__cache_build_ids(rec->session);
1865
}
1866
1867
static int record__synthesize_workload(struct record *rec, bool tail)
1868
{
1869
int err;
1870
struct perf_thread_map *thread_map;
1871
bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1872
1873
if (rec->opts.tail_synthesize != tail)
1874
return 0;
1875
1876
thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1877
if (thread_map == NULL)
1878
return -1;
1879
1880
err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1881
process_synthesized_event,
1882
&rec->session->machines.host,
1883
needs_mmap,
1884
rec->opts.sample_address);
1885
perf_thread_map__put(thread_map);
1886
return err;
1887
}
1888
1889
static int write_finished_init(struct record *rec, bool tail)
1890
{
1891
if (rec->opts.tail_synthesize != tail)
1892
return 0;
1893
1894
return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1895
}
1896
1897
static int record__synthesize(struct record *rec, bool tail);
1898
1899
static int
1900
record__switch_output(struct record *rec, bool at_exit)
1901
{
1902
struct perf_data *data = &rec->data;
1903
char *new_filename = NULL;
1904
int fd, err;
1905
1906
/* Same Size: "2015122520103046"*/
1907
char timestamp[] = "InvalidTimestamp";
1908
1909
record__aio_mmap_read_sync(rec);
1910
1911
write_finished_init(rec, true);
1912
1913
record__synthesize(rec, true);
1914
if (target__none(&rec->opts.target))
1915
record__synthesize_workload(rec, true);
1916
1917
rec->samples = 0;
1918
record__finish_output(rec);
1919
err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1920
if (err) {
1921
pr_err("Failed to get current timestamp\n");
1922
return -EINVAL;
1923
}
1924
1925
fd = perf_data__switch(data, timestamp,
1926
rec->session->header.data_offset,
1927
at_exit, &new_filename);
1928
if (fd >= 0 && !at_exit) {
1929
rec->bytes_written = 0;
1930
rec->session->header.data_size = 0;
1931
}
1932
1933
if (!quiet) {
1934
fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1935
data->path, timestamp);
1936
}
1937
1938
if (rec->switch_output.num_files) {
1939
int n = rec->switch_output.cur_file + 1;
1940
1941
if (n >= rec->switch_output.num_files)
1942
n = 0;
1943
rec->switch_output.cur_file = n;
1944
if (rec->switch_output.filenames[n]) {
1945
remove(rec->switch_output.filenames[n]);
1946
zfree(&rec->switch_output.filenames[n]);
1947
}
1948
rec->switch_output.filenames[n] = new_filename;
1949
} else {
1950
free(new_filename);
1951
}
1952
1953
/* Output tracking events */
1954
if (!at_exit) {
1955
record__synthesize(rec, false);
1956
1957
/*
1958
* In 'perf record --switch-output' without -a,
1959
* record__synthesize() in record__switch_output() won't
1960
* generate tracking events because there's no thread_map
1961
* in evlist. Which causes newly created perf.data doesn't
1962
* contain map and comm information.
1963
* Create a fake thread_map and directly call
1964
* perf_event__synthesize_thread_map() for those events.
1965
*/
1966
if (target__none(&rec->opts.target))
1967
record__synthesize_workload(rec, false);
1968
write_finished_init(rec, false);
1969
}
1970
return fd;
1971
}
1972
1973
static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1974
struct perf_record_lost_samples *lost,
1975
int cpu_idx, int thread_idx, u64 lost_count,
1976
u16 misc_flag)
1977
{
1978
struct perf_sample_id *sid;
1979
struct perf_sample sample;
1980
int id_hdr_size;
1981
1982
perf_sample__init(&sample, /*all=*/true);
1983
lost->lost = lost_count;
1984
if (evsel->core.ids) {
1985
sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1986
sample.id = sid->id;
1987
}
1988
1989
id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1990
evsel->core.attr.sample_type, &sample);
1991
lost->header.size = sizeof(*lost) + id_hdr_size;
1992
lost->header.misc = misc_flag;
1993
record__write(rec, NULL, lost, lost->header.size);
1994
perf_sample__exit(&sample);
1995
}
1996
1997
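/*
 * Read the lost-sample counts from each event's counters and from the
 * BPF filter, emitting PERF_RECORD_LOST_SAMPLES events for non-zero
 * counts.
 */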
static void record__read_lost_samples(struct record *rec)
1998
{
1999
struct perf_session *session = rec->session;
2000
struct perf_record_lost_samples_and_ids lost;
2001
struct evsel *evsel;
2002
2003
/* there was an error during record__open */
2004
if (session->evlist == NULL)
2005
return;
2006
2007
evlist__for_each_entry(session->evlist, evsel) {
2008
struct xyarray *xy = evsel->core.sample_id;
2009
u64 lost_count;
2010
2011
if (xy == NULL || evsel->core.fd == NULL)
2012
continue;
2013
if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
2014
xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
2015
pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
2016
continue;
2017
}
2018
2019
for (int x = 0; x < xyarray__max_x(xy); x++) {
2020
for (int y = 0; y < xyarray__max_y(xy); y++) {
2021
struct perf_counts_values count;
2022
2023
if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
2024
pr_debug("read LOST count failed\n");
2025
return;
2026
}
2027
2028
if (count.lost) {
2029
memset(&lost, 0, sizeof(lost));
2030
lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
2031
__record__save_lost_samples(rec, evsel, &lost.lost,
2032
x, y, count.lost, 0);
2033
}
2034
}
2035
}
2036
2037
lost_count = perf_bpf_filter__lost_count(evsel);
2038
if (lost_count) {
2039
memset(&lost, 0, sizeof(lost));
2040
lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
2041
__record__save_lost_samples(rec, evsel, &lost.lost, 0, 0, lost_count,
2042
PERF_RECORD_MISC_LOST_SAMPLES_BPF);
2043
}
2044
}
2045
}
2046
2047
static volatile sig_atomic_t workload_exec_errno;
2048
2049
/*
2050
* evlist__prepare_workload will send a SIGUSR1
2051
* if the fork fails, since we asked for it by setting its
2052
* want_signal to true.
2053
*/
2054
static void workload_exec_failed_signal(int signo __maybe_unused,
2055
siginfo_t *info,
2056
void *ucontext __maybe_unused)
2057
{
2058
workload_exec_errno = info->si_value.sival_int;
2059
done = 1;
2060
child_finished = 1;
2061
}
2062
2063
static void snapshot_sig_handler(int sig);
2064
static void alarm_sig_handler(int sig);
2065
2066
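/*
 * Pick the first mmap'ed perf_event_mmap_page, used as the reference
 * for synthesizing the time conversion event below.
 */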
static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
2067
{
2068
if (evlist) {
2069
if (evlist->mmap && evlist->mmap[0].core.base)
2070
return evlist->mmap[0].core.base;
2071
if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
2072
return evlist->overwrite_mmap[0].core.base;
2073
}
2074
return NULL;
2075
}
2076
2077
static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
2078
{
2079
const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
2080
if (pc)
2081
return pc;
2082
return NULL;
2083
}
2084
2085
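/*
 * Synthesize the non-sample metadata events (time conversion, id index,
 * auxtrace info, kernel and module mmaps, thread and cpu maps, BPF and
 * cgroup events, existing threads); for pipe output the events needed
 * by the pipe header are written first. Runs at the start of the
 * record, or at the end when --tail-synthesize is used.
 */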
static int record__synthesize(struct record *rec, bool tail)
2086
{
2087
struct perf_session *session = rec->session;
2088
struct machine *machine = &session->machines.host;
2089
struct perf_data *data = &rec->data;
2090
struct record_opts *opts = &rec->opts;
2091
struct perf_tool *tool = &rec->tool;
2092
int err = 0;
2093
event_op f = process_synthesized_event;
2094
2095
if (rec->opts.tail_synthesize != tail)
2096
return 0;
2097
2098
if (data->is_pipe) {
2099
err = perf_event__synthesize_for_pipe(tool, session, data,
2100
process_synthesized_event);
2101
if (err < 0)
2102
goto out;
2103
2104
rec->bytes_written += err;
2105
}
2106
2107
err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
2108
process_synthesized_event, machine);
2109
if (err)
2110
goto out;
2111
2112
/* Synthesize id_index before auxtrace_info */
2113
err = perf_event__synthesize_id_index(tool,
2114
process_synthesized_event,
2115
session->evlist, machine);
2116
if (err)
2117
goto out;
2118
2119
if (rec->opts.full_auxtrace) {
2120
err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2121
session, process_synthesized_event);
2122
if (err)
2123
goto out;
2124
}
2125
2126
if (!evlist__exclude_kernel(rec->evlist)) {
2127
err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2128
machine);
2129
WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2130
"Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2131
"Check /proc/kallsyms permission or run as root.\n");
2132
2133
err = perf_event__synthesize_modules(tool, process_synthesized_event,
2134
machine);
2135
WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2136
"Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2137
"Check /proc/modules permission or run as root.\n");
2138
}
2139
2140
if (perf_guest) {
2141
machines__process_guests(&session->machines,
2142
perf_event__synthesize_guest_os, tool);
2143
}
2144
2145
err = perf_event__synthesize_extra_attr(&rec->tool,
2146
rec->evlist,
2147
process_synthesized_event,
2148
data->is_pipe);
2149
if (err)
2150
goto out;
2151
2152
err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2153
process_synthesized_event,
2154
NULL);
2155
if (err < 0) {
2156
pr_err("Couldn't synthesize thread map.\n");
2157
return err;
2158
}
2159
2160
err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2161
process_synthesized_event, NULL);
2162
if (err < 0) {
2163
pr_err("Couldn't synthesize cpu map.\n");
2164
return err;
2165
}
2166
2167
err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2168
machine, opts);
2169
if (err < 0) {
2170
pr_warning("Couldn't synthesize bpf events.\n");
2171
err = 0;
2172
}
2173
2174
if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2175
err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2176
machine);
2177
if (err < 0) {
2178
pr_warning("Couldn't synthesize cgroup events.\n");
2179
err = 0;
2180
}
2181
}
2182
2183
if (rec->opts.nr_threads_synthesize > 1) {
2184
mutex_init(&synth_lock);
2185
perf_set_multithreaded();
2186
f = process_locked_synthesized_event;
2187
}
2188
2189
if (rec->opts.synth & PERF_SYNTH_TASK) {
2190
bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2191
2192
err = __machine__synthesize_threads(machine, tool, &opts->target,
2193
rec->evlist->core.threads,
2194
f, needs_mmap, opts->sample_address,
2195
rec->opts.nr_threads_synthesize);
2196
}
2197
2198
if (rec->opts.nr_threads_synthesize > 1) {
2199
perf_set_singlethreaded();
2200
mutex_destroy(&synth_lock);
2201
}
2202
2203
out:
2204
return err;
2205
}
2206
2207
static void record__synthesize_final_bpf_metadata(struct record *rec __maybe_unused)
2208
{
2209
#ifdef HAVE_LIBBPF_SUPPORT
2210
perf_event__synthesize_final_bpf_metadata(rec->session,
2211
process_synthesized_event);
2212
#endif
2213
}
2214
2215
static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2216
{
2217
struct record *rec = data;
2218
pthread_kill(rec->thread_id, SIGUSR2);
2219
return 0;
2220
}
2221
2222
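/*
 * Set up the side band evlist: hook up the SIGUSR2 callback for
 * --switch-output-event and, unless --no-bpf-event, add the
 * PERF_RECORD_BPF_EVENT side band event, then start the side band
 * thread.
 */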
static int record__setup_sb_evlist(struct record *rec)
2223
{
2224
struct record_opts *opts = &rec->opts;
2225
2226
if (rec->sb_evlist != NULL) {
2227
/*
2228
* We get here if --switch-output-event populated the
2229
* sb_evlist, so associate a callback that will send a SIGUSR2
2230
* to the main thread.
2231
*/
2232
evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2233
rec->thread_id = pthread_self();
2234
}
2235
#ifdef HAVE_LIBBPF_SUPPORT
2236
if (!opts->no_bpf_event) {
2237
if (rec->sb_evlist == NULL) {
2238
rec->sb_evlist = evlist__new();
2239
2240
if (rec->sb_evlist == NULL) {
2241
pr_err("Couldn't create side band evlist.\n.");
2242
return -1;
2243
}
2244
}
2245
2246
if (evlist__add_bpf_sb_event(rec->sb_evlist, perf_session__env(rec->session))) {
2247
pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
2248
return -1;
2249
}
2250
}
2251
#endif
2252
if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2253
pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2254
opts->no_bpf_event = true;
2255
}
2256
2257
return 0;
2258
}
2259
2260
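/*
 * When --clockid is used, store the clockid and a pair of reference
 * timestamps (gettimeofday() and clock_gettime()) in the perf_env so
 * the two clocks can be related later.
 */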
static int record__init_clock(struct record *rec)
2261
{
2262
struct perf_session *session = rec->session;
2263
struct timespec ref_clockid;
2264
struct timeval ref_tod;
2265
struct perf_env *env = perf_session__env(session);
2266
u64 ref;
2267
2268
if (!rec->opts.use_clockid)
2269
return 0;
2270
2271
if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2272
env->clock.clockid_res_ns = rec->opts.clockid_res_ns;
2273
2274
env->clock.clockid = rec->opts.clockid;
2275
2276
if (gettimeofday(&ref_tod, NULL) != 0) {
2277
pr_err("gettimeofday failed, cannot set reference time.\n");
2278
return -1;
2279
}
2280
2281
if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2282
pr_err("clock_gettime failed, cannot set reference time.\n");
2283
return -1;
2284
}
2285
2286
ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2287
(u64) ref_tod.tv_usec * NSEC_PER_USEC;
2288
2289
env->clock.tod_ns = ref;
2290
2291
ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2292
(u64) ref_clockid.tv_nsec;
2293
2294
env->clock.clockid_ns = ref;
2295
return 0;
2296
}
2297
2298
static void hit_auxtrace_snapshot_trigger(struct record *rec)
2299
{
2300
if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2301
trigger_hit(&auxtrace_snapshot_trigger);
2302
auxtrace_record__snapshot_started = 1;
2303
if (auxtrace_record__snapshot_start(rec->itr))
2304
trigger_error(&auxtrace_snapshot_trigger);
2305
}
2306
}
2307
2308
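/*
 * Ask a streaming thread to terminate by closing the write end of its
 * message pipe, then wait for the THREAD_MSG__READY ack on its ack pipe.
 */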
static int record__terminate_thread(struct record_thread *thread_data)
2309
{
2310
int err;
2311
enum thread_msg ack = THREAD_MSG__UNDEFINED;
2312
pid_t tid = thread_data->tid;
2313
2314
close(thread_data->pipes.msg[1]);
2315
thread_data->pipes.msg[1] = -1;
2316
err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2317
if (err > 0)
2318
pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2319
else
2320
pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2321
thread->tid, tid);
2322
2323
return 0;
2324
}
2325
2326
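/*
 * Start the parallel trace streaming threads with all signals blocked,
 * pinning each one to its configured affinity mask, and wait for the
 * THREAD_MSG__READY notification from each before continuing.
 */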
static int record__start_threads(struct record *rec)
2327
{
2328
int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2329
struct record_thread *thread_data = rec->thread_data;
2330
sigset_t full, mask;
2331
pthread_t handle;
2332
pthread_attr_t attrs;
2333
2334
thread = &thread_data[0];
2335
2336
if (!record__threads_enabled(rec))
2337
return 0;
2338
2339
sigfillset(&full);
2340
if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2341
pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2342
return -1;
2343
}
2344
2345
pthread_attr_init(&attrs);
2346
pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2347
2348
for (t = 1; t < nr_threads; t++) {
2349
enum thread_msg msg = THREAD_MSG__UNDEFINED;
2350
2351
#ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2352
pthread_attr_setaffinity_np(&attrs,
2353
MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2354
(cpu_set_t *)(thread_data[t].mask->affinity.bits));
2355
#endif
2356
if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2357
for (tt = 1; tt < t; tt++)
2358
record__terminate_thread(&thread_data[t]);
2359
pr_err("Failed to start threads: %s\n", strerror(errno));
2360
ret = -1;
2361
goto out_err;
2362
}
2363
2364
err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2365
if (err > 0)
2366
pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2367
thread_msg_tags[msg]);
2368
else
2369
pr_warning("threads[%d]: failed to receive start notification from %d\n",
2370
thread->tid, rec->thread_data[t].tid);
2371
}
2372
2373
sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2374
(cpu_set_t *)thread->mask->affinity.bits);
2375
2376
pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2377
2378
out_err:
2379
pthread_attr_destroy(&attrs);
2380
2381
if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2382
pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2383
ret = -1;
2384
}
2385
2386
return ret;
2387
}
2388
2389
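/*
 * Terminate the streaming threads and fold their per-thread sample,
 * transferred and compressed byte counts back into the record/session
 * totals.
 */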
static int record__stop_threads(struct record *rec)
2390
{
2391
int t;
2392
struct record_thread *thread_data = rec->thread_data;
2393
2394
for (t = 1; t < rec->nr_threads; t++)
2395
record__terminate_thread(&thread_data[t]);
2396
2397
for (t = 0; t < rec->nr_threads; t++) {
2398
rec->samples += thread_data[t].samples;
2399
if (!record__threads_enabled(rec))
2400
continue;
2401
rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2402
rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2403
pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2404
thread_data[t].samples, thread_data[t].waking);
2405
if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2406
pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2407
thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2408
else
2409
pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2410
}
2411
2412
return 0;
2413
}
2414
2415
static unsigned long record__waking(struct record *rec)
2416
{
2417
int t;
2418
unsigned long waking = 0;
2419
struct record_thread *thread_data = rec->thread_data;
2420
2421
for (t = 0; t < rec->nr_threads; t++)
2422
waking += thread_data[t].waking;
2423
2424
return waking;
2425
}
2426
2427
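/*
 * The main body of 'perf record': set up signal handlers, the session,
 * the workload and the ring buffers, synthesize the initial metadata,
 * then loop reading the mmaps until done and write the final output.
 */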
static int __cmd_record(struct record *rec, int argc, const char **argv)
2428
{
2429
int err;
2430
int status = 0;
2431
const bool forks = argc > 0;
2432
struct perf_tool *tool = &rec->tool;
2433
struct record_opts *opts = &rec->opts;
2434
struct perf_data *data = &rec->data;
2435
struct perf_session *session;
2436
bool disabled = false, draining = false;
2437
int fd;
2438
float ratio = 0;
2439
enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2440
struct perf_env *env;
2441
2442
atexit(record__sig_exit);
2443
signal(SIGCHLD, sig_handler);
2444
signal(SIGINT, sig_handler);
2445
signal(SIGTERM, sig_handler);
2446
signal(SIGSEGV, sigsegv_handler);
2447
2448
if (rec->opts.record_cgroup) {
2449
#ifndef HAVE_FILE_HANDLE
2450
pr_err("cgroup tracking is not supported\n");
2451
return -1;
2452
#endif
2453
}
2454
2455
if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2456
signal(SIGUSR2, snapshot_sig_handler);
2457
if (rec->opts.auxtrace_snapshot_mode)
2458
trigger_on(&auxtrace_snapshot_trigger);
2459
if (rec->switch_output.enabled)
2460
trigger_on(&switch_output_trigger);
2461
} else {
2462
signal(SIGUSR2, SIG_IGN);
2463
}
2464
2465
perf_tool__init(tool, /*ordered_events=*/true);
2466
tool->sample = process_sample_event;
2467
tool->fork = perf_event__process_fork;
2468
tool->exit = perf_event__process_exit;
2469
tool->comm = perf_event__process_comm;
2470
tool->namespaces = perf_event__process_namespaces;
2471
tool->mmap = build_id__process_mmap;
2472
tool->mmap2 = build_id__process_mmap2;
2473
tool->itrace_start = process_timestamp_boundary;
2474
tool->aux = process_timestamp_boundary;
2475
tool->namespace_events = rec->opts.record_namespaces;
2476
tool->cgroup_events = rec->opts.record_cgroup;
2477
session = perf_session__new(data, tool);
2478
if (IS_ERR(session)) {
2479
pr_err("Perf session creation failed.\n");
2480
return PTR_ERR(session);
2481
}
2482
env = perf_session__env(session);
2483
if (record__threads_enabled(rec)) {
2484
if (perf_data__is_pipe(&rec->data)) {
2485
pr_err("Parallel trace streaming is not available in pipe mode.\n");
2486
return -1;
2487
}
2488
if (rec->opts.full_auxtrace) {
2489
pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2490
return -1;
2491
}
2492
}
2493
2494
fd = perf_data__fd(data);
2495
rec->session = session;
2496
2497
if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2498
pr_err("Compression initialization failed.\n");
2499
return -1;
2500
}
2501
#ifdef HAVE_EVENTFD_SUPPORT
2502
done_fd = eventfd(0, EFD_NONBLOCK);
2503
if (done_fd < 0) {
2504
pr_err("Failed to create wakeup eventfd, error: %m\n");
2505
status = -1;
2506
goto out_delete_session;
2507
}
2508
err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2509
if (err < 0) {
2510
pr_err("Failed to add wakeup eventfd to poll list\n");
2511
status = err;
2512
goto out_delete_session;
2513
}
2514
#endif // HAVE_EVENTFD_SUPPORT
2515
2516
env->comp_type = PERF_COMP_ZSTD;
2517
env->comp_level = rec->opts.comp_level;
2518
2519
if (rec->opts.kcore &&
2520
!record__kcore_readable(&session->machines.host)) {
2521
pr_err("ERROR: kcore is not readable.\n");
2522
return -1;
2523
}
2524
2525
if (record__init_clock(rec))
2526
return -1;
2527
2528
record__init_features(rec);
2529
2530
if (forks) {
2531
err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2532
workload_exec_failed_signal);
2533
if (err < 0) {
2534
pr_err("Couldn't run the workload!\n");
2535
status = err;
2536
goto out_delete_session;
2537
}
2538
}
2539
2540
/*
2541
* If we have just a single event and are sending data
2542
* through a pipe, we need to force the id allocation,
2543
* because we synthesize the event name through the pipe
2544
* and need the id for that.
2545
*/
2546
if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2547
rec->opts.sample_id = true;
2548
2549
if (rec->timestamp_filename && perf_data__is_pipe(data)) {
2550
rec->timestamp_filename = false;
2551
pr_warning("WARNING: --timestamp-filename option is not available in pipe mode.\n");
2552
}
2553
2554
/*
2555
* Use global stat_config that is zero meaning aggr_mode is AGGR_NONE
2556
* and hybrid_merge is false.
2557
*/
2558
evlist__uniquify_evsel_names(rec->evlist, &stat_config);
2559
2560
evlist__config(rec->evlist, opts, &callchain_param);
2561
2562
/* Debug message used by test scripts */
2563
pr_debug3("perf record opening and mmapping events\n");
2564
if (record__open(rec) != 0) {
2565
err = -1;
2566
goto out_free_threads;
2567
}
2568
/* Debug message used by test scripts */
2569
pr_debug3("perf record done opening and mmapping events\n");
2570
env->comp_mmap_len = session->evlist->core.mmap_len;
2571
2572
if (rec->opts.kcore) {
2573
err = record__kcore_copy(&session->machines.host, data);
2574
if (err) {
2575
pr_err("ERROR: Failed to copy kcore\n");
2576
goto out_free_threads;
2577
}
2578
}
2579
2580
/*
2581
* Normally perf_session__new would do this, but it doesn't have the
2582
* evlist.
2583
*/
2584
if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2585
pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2586
rec->tool.ordered_events = false;
2587
}
2588
2589
if (evlist__nr_groups(rec->evlist) == 0)
2590
perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2591
2592
if (data->is_pipe) {
2593
err = perf_header__write_pipe(fd);
2594
if (err < 0)
2595
goto out_free_threads;
2596
} else {
2597
err = perf_session__write_header(session, rec->evlist, fd, false);
2598
if (err < 0)
2599
goto out_free_threads;
2600
}
2601
2602
err = -1;
2603
if (!rec->no_buildid
2604
&& !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2605
pr_err("Couldn't generate buildids. "
2606
"Use --no-buildid to profile anyway.\n");
2607
goto out_free_threads;
2608
}
2609
2610
if (!evlist__needs_bpf_sb_event(rec->evlist))
2611
opts->no_bpf_event = true;
2612
2613
err = record__setup_sb_evlist(rec);
2614
if (err)
2615
goto out_free_threads;
2616
2617
err = record__synthesize(rec, false);
2618
if (err < 0)
2619
goto out_free_threads;
2620
2621
if (rec->realtime_prio) {
2622
struct sched_param param;
2623
2624
param.sched_priority = rec->realtime_prio;
2625
if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2626
pr_err("Could not set realtime priority.\n");
2627
err = -1;
2628
goto out_free_threads;
2629
}
2630
}
2631
2632
if (record__start_threads(rec))
2633
goto out_free_threads;
2634
2635
/*
2636
* When perf is starting the traced process, all the events
2637
* (apart from group members) have enable_on_exec=1 set,
2638
* so don't spoil it by prematurely enabling them.
2639
*/
2640
if (!target__none(&opts->target) && !opts->target.initial_delay)
2641
evlist__enable(rec->evlist);
2642
2643
/*
2644
* offcpu-time does not call execve, so enable_on_exe wouldn't work
2645
* when recording a workload, do it manually
2646
*/
2647
if (rec->off_cpu)
2648
evlist__enable_evsel(rec->evlist, (char *)OFFCPU_EVENT);
2649
2650
/*
2651
* Let the child rip
2652
*/
2653
if (forks) {
2654
struct machine *machine = &session->machines.host;
2655
union perf_event *event;
2656
pid_t tgid;
2657
2658
event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2659
if (event == NULL) {
2660
err = -ENOMEM;
2661
goto out_child;
2662
}
2663
2664
/*
2665
* Some H/W events are generated before the COMM event,
2666
* which is emitted during exec(), so perf script
2667
* cannot see a correct process name for those events.
2668
* Synthesize a COMM event to prevent that.
2669
*/
2670
tgid = perf_event__synthesize_comm(tool, event,
2671
rec->evlist->workload.pid,
2672
process_synthesized_event,
2673
machine);
2674
free(event);
2675
2676
if (tgid == -1)
2677
goto out_child;
2678
2679
event = malloc(sizeof(event->namespaces) +
2680
(NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2681
machine->id_hdr_size);
2682
if (event == NULL) {
2683
err = -ENOMEM;
2684
goto out_child;
2685
}
2686
2687
/*
2688
* Synthesize NAMESPACES event for the command specified.
2689
*/
2690
perf_event__synthesize_namespaces(tool, event,
2691
rec->evlist->workload.pid,
2692
tgid, process_synthesized_event,
2693
machine);
2694
free(event);
2695
2696
evlist__start_workload(rec->evlist);
2697
}
2698
2699
if (opts->target.initial_delay) {
2700
pr_info(EVLIST_DISABLED_MSG);
2701
if (opts->target.initial_delay > 0) {
2702
usleep(opts->target.initial_delay * USEC_PER_MSEC);
2703
evlist__enable(rec->evlist);
2704
pr_info(EVLIST_ENABLED_MSG);
2705
}
2706
}
2707
2708
err = event_enable_timer__start(rec->evlist->eet);
2709
if (err)
2710
goto out_child;
2711
2712
/* Debug message used by test scripts */
2713
pr_debug3("perf record has started\n");
2714
fflush(stderr);
2715
2716
trigger_ready(&auxtrace_snapshot_trigger);
2717
trigger_ready(&switch_output_trigger);
2718
perf_hooks__invoke_record_start();
2719
2720
/*
2721
* Must write FINISHED_INIT so it will be seen after all other
2722
* synthesized user events, but before any regular events.
2723
*/
2724
err = write_finished_init(rec, false);
2725
if (err < 0)
2726
goto out_child;
2727
2728
for (;;) {
2729
unsigned long long hits = thread->samples;
2730
2731
/*
2732
* rec->evlist->bkw_mmap_state is possible to be
2733
* BKW_MMAP_EMPTY here: when done == true and
2734
* hits != rec->samples in previous round.
2735
*
2736
* evlist__toggle_bkw_mmap ensure we never
2737
* convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2738
*/
2739
if (trigger_is_hit(&switch_output_trigger) || done || draining)
2740
evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2741
2742
if (record__mmap_read_all(rec, false) < 0) {
2743
trigger_error(&auxtrace_snapshot_trigger);
2744
trigger_error(&switch_output_trigger);
2745
err = -1;
2746
goto out_child;
2747
}
2748
2749
if (auxtrace_record__snapshot_started) {
2750
auxtrace_record__snapshot_started = 0;
2751
if (!trigger_is_error(&auxtrace_snapshot_trigger))
2752
record__read_auxtrace_snapshot(rec, false);
2753
if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2754
pr_err("AUX area tracing snapshot failed\n");
2755
err = -1;
2756
goto out_child;
2757
}
2758
}
2759
2760
if (trigger_is_hit(&switch_output_trigger)) {
2761
/*
2762
* If switch_output_trigger is hit, the data in
2763
* overwritable ring buffer should have been collected,
2764
* so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2765
*
2766
* If SIGUSR2 is raised after or during record__mmap_read_all(),
2767
* record__mmap_read_all() didn't collect data from the
2768
* overwritable ring buffer. Read again.
2769
*/
2770
if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2771
continue;
2772
trigger_ready(&switch_output_trigger);
2773
2774
/*
2775
* Reenable events in overwrite ring buffer after
2776
* record__mmap_read_all(): we should have collected
2777
* data from it.
2778
*/
2779
evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2780
2781
if (!quiet)
2782
fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2783
record__waking(rec));
2784
thread->waking = 0;
2785
fd = record__switch_output(rec, false);
2786
if (fd < 0) {
2787
pr_err("Failed to switch to new file\n");
2788
trigger_error(&switch_output_trigger);
2789
err = fd;
2790
goto out_child;
2791
}
2792
2793
/* re-arm the alarm */
2794
if (rec->switch_output.time)
2795
alarm(rec->switch_output.time);
2796
}
2797
2798
if (hits == thread->samples) {
2799
if (done || draining)
2800
break;
2801
err = fdarray__poll(&thread->pollfd, -1);
2802
/*
2803
* Propagate the error only if there is one. Ignore a positive
2804
* number of returned events and interrupt errors.
2805
*/
2806
if (err > 0 || (err < 0 && errno == EINTR))
2807
err = 0;
2808
thread->waking++;
2809
2810
if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2811
record__thread_munmap_filtered, NULL) == 0)
2812
draining = true;
2813
2814
err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2815
if (err)
2816
goto out_child;
2817
}
2818
2819
if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2820
switch (cmd) {
2821
case EVLIST_CTL_CMD_SNAPSHOT:
2822
hit_auxtrace_snapshot_trigger(rec);
2823
evlist__ctlfd_ack(rec->evlist);
2824
break;
2825
case EVLIST_CTL_CMD_STOP:
2826
done = 1;
2827
break;
2828
case EVLIST_CTL_CMD_ACK:
2829
case EVLIST_CTL_CMD_UNSUPPORTED:
2830
case EVLIST_CTL_CMD_ENABLE:
2831
case EVLIST_CTL_CMD_DISABLE:
2832
case EVLIST_CTL_CMD_EVLIST:
2833
case EVLIST_CTL_CMD_PING:
2834
default:
2835
break;
2836
}
2837
}
2838
2839
err = event_enable_timer__process(rec->evlist->eet);
2840
if (err < 0)
2841
goto out_child;
2842
if (err) {
2843
err = 0;
2844
done = 1;
2845
}
2846
2847
/*
2848
* When perf is starting the traced process, at the end events
2849
* die with the process and we wait for that. Thus no need to
2850
* disable events in this case.
2851
*/
2852
if (done && !disabled && !target__none(&opts->target)) {
2853
trigger_off(&auxtrace_snapshot_trigger);
2854
evlist__disable(rec->evlist);
2855
disabled = true;
2856
}
2857
}
2858
2859
trigger_off(&auxtrace_snapshot_trigger);
2860
trigger_off(&switch_output_trigger);
2861
2862
record__synthesize_final_bpf_metadata(rec);
2863
2864
if (opts->auxtrace_snapshot_on_exit)
2865
record__auxtrace_snapshot_exit(rec);
2866
2867
if (forks && workload_exec_errno) {
2868
char msg[STRERR_BUFSIZE];
2869
const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2870
struct strbuf sb = STRBUF_INIT;
2871
2872
evlist__format_evsels(rec->evlist, &sb, 2048);
2873
2874
pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2875
sb.buf, argv[0], emsg);
2876
strbuf_release(&sb);
2877
err = -1;
2878
goto out_child;
2879
}
2880
2881
if (!quiet)
2882
fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2883
record__waking(rec));
2884
2885
write_finished_init(rec, true);
2886
2887
if (target__none(&rec->opts.target))
2888
record__synthesize_workload(rec, true);
2889
2890
out_child:
2891
record__stop_threads(rec);
2892
record__mmap_read_all(rec, true);
2893
out_free_threads:
2894
record__free_thread_data(rec);
2895
evlist__finalize_ctlfd(rec->evlist);
2896
record__aio_mmap_read_sync(rec);
2897
2898
if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2899
ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2900
env->comp_ratio = ratio + 0.5;
2901
}
2902
2903
if (forks) {
2904
int exit_status;
2905
2906
if (!child_finished)
2907
kill(rec->evlist->workload.pid, SIGTERM);
2908
2909
wait(&exit_status);
2910
2911
if (err < 0)
2912
status = err;
2913
else if (WIFEXITED(exit_status))
2914
status = WEXITSTATUS(exit_status);
2915
else if (WIFSIGNALED(exit_status))
2916
signr = WTERMSIG(exit_status);
2917
} else
2918
status = err;
2919
2920
if (rec->off_cpu)
2921
rec->bytes_written += off_cpu_write(rec->session);
2922
2923
record__read_lost_samples(rec);
2924
/* this will be recalculated during process_buildids() */
2925
rec->samples = 0;
2926
2927
if (!err) {
2928
record__synthesize(rec, true);
2929
if (!rec->timestamp_filename) {
2930
record__finish_output(rec);
2931
} else {
2932
fd = record__switch_output(rec, true);
2933
if (fd < 0) {
2934
status = fd;
2935
goto out_delete_session;
2936
}
2937
}
2938
}
2939
2940
perf_hooks__invoke_record_end();
2941
2942
if (!err && !quiet) {
2943
char samples[128];
2944
const char *postfix = rec->timestamp_filename ?
2945
".<timestamp>" : "";
2946
2947
if (rec->samples && !rec->opts.full_auxtrace)
2948
scnprintf(samples, sizeof(samples),
2949
" (%" PRIu64 " samples)", rec->samples);
2950
else
2951
samples[0] = '\0';
2952
2953
fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
2954
perf_data__size(data) / 1024.0 / 1024.0,
2955
data->path, postfix, samples);
2956
if (ratio) {
2957
fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
2958
rec->session->bytes_transferred / 1024.0 / 1024.0,
2959
ratio);
2960
}
2961
fprintf(stderr, " ]\n");
2962
}
2963
2964
out_delete_session:
2965
#ifdef HAVE_EVENTFD_SUPPORT
2966
if (done_fd >= 0) {
2967
fd = done_fd;
2968
done_fd = -1;
2969
2970
close(fd);
2971
}
2972
#endif
2973
zstd_fini(&session->zstd_data);
2974
if (!opts->no_bpf_event)
2975
evlist__stop_sb_thread(rec->sb_evlist);
2976
2977
perf_session__delete(session);
2978
return status;
2979
}
2980
2981
static void callchain_debug(struct callchain_param *callchain)
2982
{
2983
static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2984
2985
pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2986
2987
if (callchain->record_mode == CALLCHAIN_DWARF)
2988
pr_debug("callchain: stack dump size %d\n",
2989
callchain->dump_size);
2990
}
2991
2992
int record_opts__parse_callchain(struct record_opts *record,
2993
struct callchain_param *callchain,
2994
const char *arg, bool unset)
2995
{
2996
int ret;
2997
callchain->enabled = !unset;
2998
2999
/* --no-call-graph */
3000
if (unset) {
3001
callchain->record_mode = CALLCHAIN_NONE;
3002
pr_debug("callchain: disabled\n");
3003
return 0;
3004
}
3005
3006
ret = parse_callchain_record_opt(arg, callchain);
3007
if (!ret) {
3008
/* Enable data address sampling for DWARF unwind. */
3009
if (callchain->record_mode == CALLCHAIN_DWARF)
3010
record->sample_address = true;
3011
callchain_debug(callchain);
3012
}
3013
3014
return ret;
3015
}
3016
3017
int record_parse_callchain_opt(const struct option *opt,
3018
const char *arg,
3019
int unset)
3020
{
3021
return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
3022
}
3023
3024
int record_callchain_opt(const struct option *opt,
3025
const char *arg __maybe_unused,
3026
int unset __maybe_unused)
3027
{
3028
struct callchain_param *callchain = opt->value;
3029
3030
callchain->enabled = true;
3031
3032
if (callchain->record_mode == CALLCHAIN_NONE)
3033
callchain->record_mode = CALLCHAIN_FP;
3034
3035
callchain_debug(callchain);
3036
return 0;
3037
}
3038
3039
static int perf_record_config(const char *var, const char *value, void *cb)
3040
{
3041
struct record *rec = cb;
3042
3043
if (!strcmp(var, "record.build-id")) {
3044
if (!strcmp(value, "cache"))
3045
rec->no_buildid_cache = false;
3046
else if (!strcmp(value, "no-cache"))
3047
rec->no_buildid_cache = true;
3048
else if (!strcmp(value, "skip"))
3049
rec->no_buildid = rec->no_buildid_cache = true;
3050
else if (!strcmp(value, "mmap"))
3051
rec->buildid_mmap = true;
3052
else if (!strcmp(value, "no-mmap"))
3053
rec->buildid_mmap = false;
3054
else
3055
return -1;
3056
return 0;
3057
}
3058
if (!strcmp(var, "record.call-graph")) {
3059
var = "call-graph.record-mode";
3060
return perf_default_config(var, value, cb);
3061
}
3062
#ifdef HAVE_AIO_SUPPORT
3063
if (!strcmp(var, "record.aio")) {
3064
rec->opts.nr_cblocks = strtol(value, NULL, 0);
3065
if (!rec->opts.nr_cblocks)
3066
rec->opts.nr_cblocks = nr_cblocks_default;
3067
}
3068
#endif
3069
if (!strcmp(var, "record.debuginfod")) {
3070
rec->debuginfod.urls = strdup(value);
3071
if (!rec->debuginfod.urls)
3072
return -ENOMEM;
3073
rec->debuginfod.set = true;
3074
}
3075
3076
return 0;
3077
}
3078
3079
static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
3080
{
3081
struct record *rec = (struct record *)opt->value;
3082
3083
return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
3084
}
3085
3086
static int record__parse_affinity(const struct option *opt, const char *str, int unset)
3087
{
3088
struct record_opts *opts = (struct record_opts *)opt->value;
3089
3090
if (unset || !str)
3091
return 0;
3092
3093
if (!strcasecmp(str, "node"))
3094
opts->affinity = PERF_AFFINITY_NODE;
3095
else if (!strcasecmp(str, "cpu"))
3096
opts->affinity = PERF_AFFINITY_CPU;
3097
3098
return 0;
3099
}
3100
3101
static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
3102
{
3103
mask->nbits = nr_bits;
3104
mask->bits = bitmap_zalloc(mask->nbits);
3105
if (!mask->bits)
3106
return -ENOMEM;
3107
3108
return 0;
3109
}
3110
3111
static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
3112
{
3113
bitmap_free(mask->bits);
3114
mask->nbits = 0;
3115
}
3116
3117
static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
3118
{
3119
int ret;
3120
3121
ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
3122
if (ret) {
3123
mask->affinity.bits = NULL;
3124
return ret;
3125
}
3126
3127
ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3128
if (ret) {
3129
record__mmap_cpu_mask_free(&mask->maps);
3130
mask->maps.bits = NULL;
3131
}
3132
3133
return ret;
3134
}
3135
3136
static void record__thread_mask_free(struct thread_mask *mask)
3137
{
3138
record__mmap_cpu_mask_free(&mask->maps);
3139
record__mmap_cpu_mask_free(&mask->affinity);
3140
}
3141
3142
static int record__parse_threads(const struct option *opt, const char *str, int unset)
3143
{
3144
int s;
3145
struct record_opts *opts = opt->value;
3146
3147
if (unset || !str || !strlen(str)) {
3148
opts->threads_spec = THREAD_SPEC__CPU;
3149
} else {
3150
for (s = 1; s < THREAD_SPEC__MAX; s++) {
3151
if (s == THREAD_SPEC__USER) {
3152
opts->threads_user_spec = strdup(str);
3153
if (!opts->threads_user_spec)
3154
return -ENOMEM;
3155
opts->threads_spec = THREAD_SPEC__USER;
3156
break;
3157
}
3158
if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3159
opts->threads_spec = s;
3160
break;
3161
}
3162
}
3163
}
3164
3165
if (opts->threads_spec == THREAD_SPEC__USER)
3166
pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3167
else
3168
pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3169
3170
return 0;
3171
}
3172
3173
static int parse_output_max_size(const struct option *opt,
3174
const char *str, int unset)
3175
{
3176
unsigned long *s = (unsigned long *)opt->value;
3177
static struct parse_tag tags_size[] = {
3178
{ .tag = 'B', .mult = 1 },
3179
{ .tag = 'K', .mult = 1 << 10 },
3180
{ .tag = 'M', .mult = 1 << 20 },
3181
{ .tag = 'G', .mult = 1 << 30 },
3182
{ .tag = 0 },
3183
};
3184
unsigned long val;
3185
3186
if (unset) {
3187
*s = 0;
3188
return 0;
3189
}
3190
3191
val = parse_tag_value(str, tags_size);
3192
if (val != (unsigned long) -1) {
3193
*s = val;
3194
return 0;
3195
}
3196
3197
return -1;
3198
}
3199
3200
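/* Parse the -m/--mmap-pages argument: "pages" or "pages,auxtrace_pages". */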
static int record__parse_mmap_pages(const struct option *opt,
3201
const char *str,
3202
int unset __maybe_unused)
3203
{
3204
struct record_opts *opts = opt->value;
3205
char *s, *p;
3206
unsigned int mmap_pages;
3207
int ret;
3208
3209
if (!str)
3210
return -EINVAL;
3211
3212
s = strdup(str);
3213
if (!s)
3214
return -ENOMEM;
3215
3216
p = strchr(s, ',');
3217
if (p)
3218
*p = '\0';
3219
3220
if (*s) {
3221
ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3222
if (ret)
3223
goto out_free;
3224
opts->mmap_pages = mmap_pages;
3225
}
3226
3227
if (!p) {
3228
ret = 0;
3229
goto out_free;
3230
}
3231
3232
ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3233
if (ret)
3234
goto out_free;
3235
3236
opts->auxtrace_mmap_pages = mmap_pages;
3237
3238
out_free:
3239
free(s);
3240
return ret;
3241
}
3242
3243
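/* Parse the --off-cpu-thresh value in milliseconds and store it in nanoseconds. */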
static int record__parse_off_cpu_thresh(const struct option *opt,
3244
const char *str,
3245
int unset __maybe_unused)
3246
{
3247
struct record_opts *opts = opt->value;
3248
char *endptr;
3249
u64 off_cpu_thresh_ms;
3250
3251
if (!str)
3252
return -EINVAL;
3253
3254
off_cpu_thresh_ms = strtoull(str, &endptr, 10);
3255
3256
/* strtoull() returned 0 but the string isn't "0": parsing failed */
3257
if (*endptr || (off_cpu_thresh_ms == 0 && strcmp(str, "0")))
3258
return -EINVAL;
3259
else
3260
opts->off_cpu_thresh_ns = off_cpu_thresh_ms * NSEC_PER_MSEC;
3261
3262
return 0;
3263
}
3264
3265
void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3266
{
3267
}
3268
3269
static int parse_control_option(const struct option *opt,
3270
const char *str,
3271
int unset __maybe_unused)
3272
{
3273
struct record_opts *opts = opt->value;
3274
3275
return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3276
}
3277
3278
static void switch_output_size_warn(struct record *rec)
3279
{
3280
u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3281
struct switch_output *s = &rec->switch_output;
3282
3283
wakeup_size /= 2;
3284
3285
if (s->size < wakeup_size) {
3286
char buf[100];
3287
3288
unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3289
pr_warning("WARNING: switch-output data size lower than "
3290
"wakeup kernel buffer size (%s) "
3291
"expect bigger perf.data sizes\n", buf);
3292
}
3293
}
3294
3295
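/*
 * Parse the --switch-output argument: "signal", a size threshold
 * (B/K/M/G) or a time threshold (s/m/h/d). Any of these implies
 * timestamped output file names.
 */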
static int switch_output_setup(struct record *rec)
3296
{
3297
struct switch_output *s = &rec->switch_output;
3298
static struct parse_tag tags_size[] = {
3299
{ .tag = 'B', .mult = 1 },
3300
{ .tag = 'K', .mult = 1 << 10 },
3301
{ .tag = 'M', .mult = 1 << 20 },
3302
{ .tag = 'G', .mult = 1 << 30 },
3303
{ .tag = 0 },
3304
};
3305
static struct parse_tag tags_time[] = {
3306
{ .tag = 's', .mult = 1 },
3307
{ .tag = 'm', .mult = 60 },
3308
{ .tag = 'h', .mult = 60*60 },
3309
{ .tag = 'd', .mult = 60*60*24 },
3310
{ .tag = 0 },
3311
};
3312
unsigned long val;
3313
3314
/*
3315
* If we're using --switch-output-event, then we imply
3316
* --switch-output=signal, as we'll send a SIGUSR2 from the side band
3317
* thread to its parent.
3318
*/
3319
if (rec->switch_output_event_set) {
3320
if (record__threads_enabled(rec)) {
3321
pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3322
return 0;
3323
}
3324
goto do_signal;
3325
}
3326
3327
if (!s->set)
3328
return 0;
3329
3330
if (record__threads_enabled(rec)) {
3331
pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3332
return 0;
3333
}
3334
3335
if (!strcmp(s->str, "signal")) {
3336
do_signal:
3337
s->signal = true;
3338
pr_debug("switch-output with SIGUSR2 signal\n");
3339
goto enabled;
3340
}
3341
3342
val = parse_tag_value(s->str, tags_size);
3343
if (val != (unsigned long) -1) {
3344
s->size = val;
3345
pr_debug("switch-output with %s size threshold\n", s->str);
3346
goto enabled;
3347
}
3348
3349
val = parse_tag_value(s->str, tags_time);
3350
if (val != (unsigned long) -1) {
3351
s->time = val;
3352
pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3353
s->str, s->time);
3354
goto enabled;
3355
}
3356
3357
return -1;
3358
3359
enabled:
3360
rec->timestamp_filename = true;
3361
s->enabled = true;
3362
3363
if (s->size && !rec->opts.no_buffering)
3364
switch_output_size_warn(rec);
3365
3366
return 0;
3367
}
3368
3369
static const char * const __record_usage[] = {
3370
"perf record [<options>] [<command>]",
3371
"perf record [<options>] -- <command> [<options>]",
3372
NULL
3373
};
3374
const char * const *record_usage = __record_usage;
3375
3376
static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event,
3377
struct perf_sample *sample, struct machine *machine)
3378
{
3379
/*
3380
* We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3381
* no need to add them twice.
3382
*/
3383
if (!(event->header.misc & PERF_RECORD_MISC_USER))
3384
return 0;
3385
return perf_event__process_mmap(tool, event, sample, machine);
3386
}
3387
3388
static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event,
3389
struct perf_sample *sample, struct machine *machine)
3390
{
3391
/*
3392
* We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3393
* no need to add them twice.
3394
*/
3395
if (!(event->header.misc & PERF_RECORD_MISC_USER))
3396
return 0;
3397
3398
return perf_event__process_mmap2(tool, event, sample, machine);
3399
}
3400
3401
static int process_timestamp_boundary(const struct perf_tool *tool,
3402
union perf_event *event __maybe_unused,
3403
struct perf_sample *sample,
3404
struct machine *machine __maybe_unused)
3405
{
3406
struct record *rec = container_of(tool, struct record, tool);
3407
3408
set_timestamp_boundary(rec, sample->time);
3409
return 0;
3410
}
3411
3412
static int parse_record_synth_option(const struct option *opt,
3413
const char *str,
3414
int unset __maybe_unused)
3415
{
3416
struct record_opts *opts = opt->value;
3417
char *p = strdup(str);
3418
3419
if (p == NULL)
3420
return -1;
3421
3422
opts->synth = parse_synth_opt(p);
3423
free(p);
3424
3425
if (opts->synth < 0) {
3426
pr_err("Invalid synth option: %s\n", str);
3427
return -1;
3428
}
3429
return 0;
3430
}
3431
3432
/*
3433
* XXX Ideally would be local to cmd_record() and passed to a record__new
3434
* because we need to have access to it in record__exit, that is called
3435
* after cmd_record() exits, but since record_options need to be accessible to
3436
* builtin-script, leave it here.
3437
*
3438
* At least we don't touch it in all the other functions here directly.
3439
*
3440
* Just say no to tons of global variables, sigh.
3441
*/
3442
static struct record record = {
3443
.opts = {
3444
.sample_time = true,
3445
.mmap_pages = UINT_MAX,
3446
.user_freq = UINT_MAX,
3447
.user_interval = ULLONG_MAX,
3448
.freq = 4000,
3449
.target = {
3450
.uses_mmap = true,
3451
.default_per_cpu = true,
3452
},
3453
.mmap_flush = MMAP_FLUSH_DEFAULT,
3454
.nr_threads_synthesize = 1,
3455
.ctl_fd = -1,
3456
.ctl_fd_ack = -1,
3457
.synth = PERF_SYNTH_ALL,
3458
.off_cpu_thresh_ns = OFFCPU_THRESH,
3459
},
3460
.buildid_mmap = true,
3461
};
3462
3463
const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3464
"\n\t\t\t\tDefault: fp";
3465
3466
static bool dry_run;
3467
3468
static struct parse_events_option_args parse_events_option_args = {
3469
.evlistp = &record.evlist,
3470
};
3471
3472
static struct parse_events_option_args switch_output_parse_events_option_args = {
3473
.evlistp = &record.sb_evlist,
3474
};
3475
3476
/*
3477
* XXX Will stay a global variable till we fix builtin-script.c to stop messing
3478
* with it and switch to use the library functions in perf_evlist that came
3479
* from builtin-record.c, i.e. use record_opts,
3480
* evlist__prepare_workload, etc instead of fork+exec'ing 'perf record',
3481
* using pipes, etc.
3482
*/
3483
static struct option __record_options[] = {
3484
OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
3485
"event selector. use 'perf list' to list available events",
3486
parse_events_option),
3487
OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3488
"event filter", parse_filter),
3489
OPT_BOOLEAN(0, "latency", &record.latency,
3490
"Enable data collection for latency profiling.\n"
3491
"\t\t\t Use perf report --latency for latency-centric profile."),
3492
OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3493
NULL, "don't record events from perf itself",
3494
exclude_perf),
3495
OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3496
"record events on existing process id"),
3497
OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3498
"record events on existing thread id"),
3499
OPT_INTEGER('r', "realtime", &record.realtime_prio,
3500
"collect data with this RT SCHED_FIFO priority"),
3501
OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3502
"collect data without buffering"),
3503
OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3504
"collect raw sample records from all opened counters"),
3505
OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3506
"system-wide collection from all CPUs"),
3507
OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3508
"list of cpus to monitor"),
3509
OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3510
OPT_STRING('o', "output", &record.data.path, "file",
3511
"output file name"),
3512
OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3513
&record.opts.no_inherit_set,
3514
"child tasks do not inherit counters"),
3515
OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3516
"synthesize non-sample events at the end of output"),
3517
OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3518
OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3519
OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3520
"Fail if the specified frequency can't be used"),
3521
OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3522
"profile at this frequency",
3523
record__parse_freq),
3524
OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3525
"number of mmap data pages and AUX area tracing mmap pages",
3526
record__parse_mmap_pages),
3527
OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3528
"Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3529
record__mmap_flush_parse),
3530
OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3531
NULL, "enables call-graph recording" ,
3532
&record_callchain_opt),
3533
OPT_CALLBACK(0, "call-graph", &record.opts,
3534
"record_mode[,record_size]", record_callchain_help,
3535
&record_parse_callchain_opt),
3536
OPT_INCR('v', "verbose", &verbose,
3537
"be more verbose (show counter open errors, etc)"),
3538
OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3539
OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3540
"per thread counts"),
3541
OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3542
OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3543
"Record the sample physical addresses"),
3544
OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3545
"Record the sampled data address data page size"),
3546
OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3547
"Record the sampled code address (ip) page size"),
3548
OPT_BOOLEAN(0, "sample-mem-info", &record.opts.sample_data_src,
3549
"Record the data source for memory operations"),
3550
OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3551
OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3552
"Record the sample identifier"),
3553
OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3554
&record.opts.sample_time_set,
3555
"Record the sample timestamps"),
3556
OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3557
"Record the sample period"),
3558
OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3559
"don't sample"),
3560
OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3561
&record.no_buildid_cache_set,
3562
"do not update the buildid cache"),
3563
OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3564
&record.no_buildid_set,
3565
"do not collect buildids in perf.data"),
3566
OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3567
"monitor event in cgroup name only",
3568
parse_cgroups),
3569
OPT_CALLBACK('D', "delay", &record, "ms",
3570
"ms to wait before starting measurement after program start (-1: start with events disabled), "
3571
"or ranges of time to enable events e.g. '-D 10-20,30-40'",
3572
record__parse_event_enable_time),
3573
OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3574
OPT_STRING('u', "uid", &record.uid_str, "user", "user to profile"),
3575
3576
OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3577
"branch any", "sample any taken branches",
3578
parse_branch_stack),
3579
3580
OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3581
"branch filter mask", "branch stack filter modes",
3582
parse_branch_stack),
3583
OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3584
"sample by weight (on special events only)"),
3585
OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3586
"sample transaction flags (special events only)"),
3587
OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3588
"use per-thread mmaps"),
3589
OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3590
"sample selected machine registers on interrupt,"
3591
" use '-I?' to list register names", parse_intr_regs),
3592
OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3593
"sample selected machine registers in user space,"
3594
" use '--user-regs=?' to list register names", parse_user_regs),
3595
OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3596
"Record running/enabled time of read (:S) events"),
3597
OPT_CALLBACK('k', "clockid", &record.opts,
3598
"clockid", "clockid to use for events, see clock_gettime()",
3599
parse_clockid),
3600
OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3601
"opts", "AUX area tracing Snapshot Mode", ""),
3602
OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3603
"opts", "sample AUX area", ""),
3604
OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3605
"per thread proc mmap processing timeout in ms"),
3606
OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3607
"Record namespaces events"),
3608
OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3609
"Record cgroup events"),
3610
OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3611
&record.opts.record_switch_events_set,
3612
"Record context switch events"),
3613
OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3614
"Configure all used events to run in kernel space.",
3615
PARSE_OPT_EXCLUSIVE),
3616
OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3617
"Configure all used events to run in user space.",
3618
PARSE_OPT_EXCLUSIVE),
3619
OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3620
"collect kernel callchains"),
3621
OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3622
"collect user callchains"),
3623
OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3624
"file", "vmlinux pathname"),
3625
OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3626
"Record build-id of all DSOs regardless of hits"),
3627
OPT_BOOLEAN_SET(0, "buildid-mmap", &record.buildid_mmap, &record.buildid_mmap_set,
3628
"Record build-id in mmap events and skip build-id processing."),
3629
OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3630
"append timestamp to output filename"),
3631
OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3632
"Record timestamp boundary (time of first/last samples)"),
3633
OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3634
&record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3635
"Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3636
"signal"),
3637
OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
3638
&record.switch_output_event_set, "switch output event",
3639
"switch output event selector. use 'perf list' to list available events",
3640
parse_events_option_new_evlist),
3641
OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3642
"Limit number of switch output generated files"),
3643
OPT_BOOLEAN(0, "dry-run", &dry_run,
3644
"Parse options then exit"),
3645
#ifdef HAVE_AIO_SUPPORT
3646
OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3647
&nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3648
record__aio_parse),
3649
#endif
3650
OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3651
"Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3652
record__parse_affinity),
3653
#ifdef HAVE_ZSTD_SUPPORT
3654
OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3655
"Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3656
record__parse_comp_level),
3657
#endif
3658
OPT_CALLBACK(0, "max-size", &record.output_max_size,
3659
"size", "Limit the maximum size of the output file", parse_output_max_size),
3660
OPT_UINTEGER(0, "num-thread-synthesize",
3661
&record.opts.nr_threads_synthesize,
3662
"number of threads to run for event synthesis"),
3663
#ifdef HAVE_LIBPFM
3664
OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3665
"libpfm4 event selector. use 'perf list' to list available events",
3666
parse_libpfm_events_option),
3667
#endif
3668
OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3669
"Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3670
"\t\t\t 'snapshot': AUX area tracing snapshot).\n"
3671
"\t\t\t Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3672
"\t\t\t Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3673
parse_control_option),
3674
OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3675
"Fine-tune event synthesis: default=all", parse_record_synth_option),
3676
OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3677
&record.debuginfod.set, "debuginfod urls",
3678
"Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3679
"system"),
3680
OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3681
"write collected trace data into several data files using parallel threads",
3682
record__parse_threads),
3683
OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3684
OPT_STRING(0, "setup-filter", &record.filter_action, "pin|unpin",
3685
"BPF filter action"),
3686
OPT_CALLBACK(0, "off-cpu-thresh", &record.opts, "ms",
3687
"Dump off-cpu samples if off-cpu time exceeds this threshold (in milliseconds). (Default: 500ms)",
3688
record__parse_off_cpu_thresh),
3689
OPT_END()
3690
};
3691
3692
struct option *record_options = __record_options;
3693
3694
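/* Set a bit in the mask for each CPU in the map; dummy maps are left empty. */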
static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3695
{
3696
struct perf_cpu cpu;
3697
int idx;
3698
3699
if (cpu_map__is_dummy(cpus))
3700
return 0;
3701
3702
perf_cpu_map__for_each_cpu_skip_any(cpu, idx, cpus) {
3703
/* Return -ENODEV if the input cpu is greater than the max cpu */
3704
if ((unsigned long)cpu.cpu > mask->nbits)
3705
return -ENODEV;
3706
__set_bit(cpu.cpu, mask->bits);
3707
}
3708
3709
return 0;
3710
}
3711
3712
static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3713
{
3714
struct perf_cpu_map *cpus;
3715
3716
cpus = perf_cpu_map__new(mask_spec);
3717
if (!cpus)
3718
return -ENOMEM;
3719
3720
bitmap_zero(mask->bits, mask->nbits);
3721
if (record__mmap_cpu_mask_init(mask, cpus))
3722
return -ENODEV;
3723
3724
perf_cpu_map__put(cpus);
3725
3726
return 0;
3727
}
3728
3729
static void record__free_thread_masks(struct record *rec, int nr_threads)
3730
{
3731
int t;
3732
3733
if (rec->thread_masks)
3734
for (t = 0; t < nr_threads; t++)
3735
record__thread_mask_free(&rec->thread_masks[t]);
3736
3737
zfree(&rec->thread_masks);
3738
}
3739
3740
static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
{
	int t, ret;

	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
	if (!rec->thread_masks) {
		pr_err("Failed to allocate thread masks\n");
		return -ENOMEM;
	}

	for (t = 0; t < nr_threads; t++) {
		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
		if (ret) {
			pr_err("Failed to allocate thread masks[%d]\n", t);
			goto out_free;
		}
	}

	return 0;

out_free:
	record__free_thread_masks(rec, nr_threads);

	return ret;
}

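/*
 * --threads=cpu: one data streaming thread per CPU in the evlist CPU map,
 * with both its maps and affinity masks set to that single CPU.
 */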
static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
{
	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);

	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
	if (ret)
		return ret;

	rec->nr_threads = nr_cpus;
	pr_debug("nr_threads: %d\n", rec->nr_threads);

	for (t = 0; t < rec->nr_threads; t++) {
		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
		if (verbose > 0) {
			pr_debug("thread_masks[%d]: ", t);
			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
			pr_debug("thread_masks[%d]: ", t);
			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
		}
	}

	return 0;
}

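/*
 * Build one thread mask per maps/affinity spec pair: each spec is parsed
 * as a CPU list, intersected with the CPUs being recorded, and rejected
 * if the result is empty or overlaps a previously accepted spec.
 */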
static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
					  const char **maps_spec, const char **affinity_spec,
					  u32 nr_spec)
{
	u32 s;
	int ret = 0, t = 0;
	struct mmap_cpu_mask cpus_mask;
	struct thread_mask thread_mask, full_mask, *thread_masks;

	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
	if (ret) {
		pr_err("Failed to allocate CPUs mask\n");
		return ret;
	}

	ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
	if (ret) {
		pr_err("Failed to init cpu mask\n");
		goto out_free_cpu_mask;
	}

	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
	if (ret) {
		pr_err("Failed to allocate full mask\n");
		goto out_free_cpu_mask;
	}

	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
	if (ret) {
		pr_err("Failed to allocate thread mask\n");
		goto out_free_full_and_cpu_masks;
	}

	for (s = 0; s < nr_spec; s++) {
		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
		if (ret) {
			pr_err("Failed to initialize maps thread mask\n");
			goto out_free;
		}
		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
		if (ret) {
			pr_err("Failed to initialize affinity thread mask\n");
			goto out_free;
		}

		/* ignore invalid CPUs but do not allow empty masks */
		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
				cpus_mask.bits, thread_mask.maps.nbits)) {
			pr_err("Empty maps mask: %s\n", maps_spec[s]);
			ret = -EINVAL;
			goto out_free;
		}
		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
				cpus_mask.bits, thread_mask.affinity.nbits)) {
			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
			ret = -EINVAL;
			goto out_free;
		}

		/* do not allow intersection with other masks (full_mask) */
		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
				      thread_mask.maps.nbits)) {
			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
			ret = -EINVAL;
			goto out_free;
		}
		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
				      thread_mask.affinity.nbits)) {
			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
			ret = -EINVAL;
			goto out_free;
		}

		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
			  thread_mask.maps.bits, full_mask.maps.nbits);
		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
			  thread_mask.affinity.bits, full_mask.maps.nbits);

		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
		if (!thread_masks) {
			pr_err("Failed to reallocate thread masks\n");
			ret = -ENOMEM;
			goto out_free;
		}
		rec->thread_masks = thread_masks;
		rec->thread_masks[t] = thread_mask;
		if (verbose > 0) {
			pr_debug("thread_masks[%d]: ", t);
			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
			pr_debug("thread_masks[%d]: ", t);
			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
		}
		t++;
		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
		if (ret) {
			pr_err("Failed to allocate thread mask\n");
			goto out_free_full_and_cpu_masks;
		}
	}
	rec->nr_threads = t;
	pr_debug("nr_threads: %d\n", rec->nr_threads);
	if (!rec->nr_threads)
		ret = -EINVAL;

out_free:
	record__thread_mask_free(&thread_mask);
out_free_full_and_cpu_masks:
	record__thread_mask_free(&full_mask);
out_free_cpu_mask:
	record__mmap_cpu_mask_free(&cpus_mask);

	return ret;
}

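/* --threads=core: one maps/affinity spec per core, taken from the CPU topology. */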
static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
{
	int ret;
	struct cpu_topology *topo;

	topo = cpu_topology__new();
	if (!topo) {
		pr_err("Failed to allocate CPU topology\n");
		return -ENOMEM;
	}

	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
					     topo->core_cpus_list, topo->core_cpus_lists);
	cpu_topology__delete(topo);

	return ret;
}

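/* --threads=package: one maps/affinity spec per physical package (socket). */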
static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
{
	int ret;
	struct cpu_topology *topo;

	topo = cpu_topology__new();
	if (!topo) {
		pr_err("Failed to allocate CPU topology\n");
		return -ENOMEM;
	}

	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
					     topo->package_cpus_list, topo->package_cpus_lists);
	cpu_topology__delete(topo);

	return ret;
}

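/* --threads=numa: one maps/affinity spec per NUMA node, using the node CPU lists. */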
static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
{
	u32 s;
	int ret;
	const char **spec;
	struct numa_topology *topo;

	topo = numa_topology__new();
	if (!topo) {
		pr_err("Failed to allocate NUMA topology\n");
		return -ENOMEM;
	}

	spec = zalloc(topo->nr * sizeof(char *));
	if (!spec) {
		pr_err("Failed to allocate NUMA spec\n");
		ret = -ENOMEM;
		goto out_delete_topo;
	}
	for (s = 0; s < topo->nr; s++)
		spec[s] = topo->nodes[s].cpus;

	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);

	zfree(&spec);

out_delete_topo:
	numa_topology__delete(topo);

	return ret;
}

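/*
 * --threads=<spec>: user-provided masks, written as colon-separated
 * "maps/affinity" pairs, e.g. "0-3/0-3:4-7/4-7".
 */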
static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
{
	int t, ret;
	u32 s, nr_spec = 0;
	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;

	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
		spec = strtok_r(user_spec, ":", &spec_ptr);
		if (spec == NULL)
			break;
		pr_debug2("threads_spec[%d]: %s\n", t, spec);
		mask = strtok_r(spec, "/", &mask_ptr);
		if (mask == NULL)
			break;
		pr_debug2(" maps mask: %s\n", mask);
		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
		if (!tmp_spec) {
			pr_err("Failed to reallocate maps spec\n");
			ret = -ENOMEM;
			goto out_free;
		}
		maps_spec = tmp_spec;
		maps_spec[nr_spec] = dup_mask = strdup(mask);
		if (!maps_spec[nr_spec]) {
			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
			ret = -ENOMEM;
			goto out_free;
		}
		mask = strtok_r(NULL, "/", &mask_ptr);
		if (mask == NULL) {
			pr_err("Invalid thread maps or affinity specs\n");
			ret = -EINVAL;
			goto out_free;
		}
		pr_debug2(" affinity mask: %s\n", mask);
		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
		if (!tmp_spec) {
			pr_err("Failed to reallocate affinity spec\n");
			ret = -ENOMEM;
			goto out_free;
		}
		affinity_spec = tmp_spec;
		affinity_spec[nr_spec] = strdup(mask);
		if (!affinity_spec[nr_spec]) {
			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
			ret = -ENOMEM;
			goto out_free;
		}
		dup_mask = NULL;
		nr_spec++;
	}

	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
					     (const char **)affinity_spec, nr_spec);

out_free:
	free(dup_mask);
	for (s = 0; s < nr_spec; s++) {
		if (maps_spec)
			free(maps_spec[s]);
		if (affinity_spec)
			free(affinity_spec[s]);
	}
	free(affinity_spec);
	free(maps_spec);

	return ret;
}

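/* No --threads: a single data streaming thread covering all CPUs being recorded. */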
static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
{
	int ret;

	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
	if (ret)
		return ret;

	if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
		return -ENODEV;

	rec->nr_threads = 1;

	return 0;
}

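/*
 * Pick the thread mask initialization matching the --threads mode, or fall
 * back to a single thread when parallel streaming is not enabled.
 */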
static int record__init_thread_masks(struct record *rec)
{
	int ret = 0;
	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;

	if (!record__threads_enabled(rec))
		return record__init_thread_default_masks(rec, cpus);

	if (evlist__per_thread(rec->evlist)) {
		pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
		return -EINVAL;
	}

	switch (rec->opts.threads_spec) {
	case THREAD_SPEC__CPU:
		ret = record__init_thread_cpu_masks(rec, cpus);
		break;
	case THREAD_SPEC__CORE:
		ret = record__init_thread_core_masks(rec, cpus);
		break;
	case THREAD_SPEC__PACKAGE:
		ret = record__init_thread_package_masks(rec, cpus);
		break;
	case THREAD_SPEC__NUMA:
		ret = record__init_thread_numa_masks(rec, cpus);
		break;
	case THREAD_SPEC__USER:
		ret = record__init_thread_user_masks(rec, cpus);
		break;
	default:
		break;
	}

	return ret;
}

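/*
 * Entry point of 'perf record': parse and sanity-check options, set up the
 * event list, target, auxtrace, build-id and threading configuration, then
 * hand control to __cmd_record().
 */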
int cmd_record(int argc, const char **argv)
{
	int err;
	struct record *rec = &record;
	char errbuf[BUFSIZ];

	setlocale(LC_ALL, "");

#ifndef HAVE_BPF_SKEL
# define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
# undef set_nobuild
#endif

	/* Disable eager loading of kernel symbols that adds overhead to perf record. */
	symbol_conf.lazy_load_kernel_maps = true;
	rec->opts.affinity = PERF_AFFINITY_SYS;

	rec->evlist = evlist__new();
	if (rec->evlist == NULL)
		return -ENOMEM;

	err = perf_config(perf_record_config, rec);
	if (err)
		return err;

	argc = parse_options(argc, argv, record_options, record_usage,
			     PARSE_OPT_STOP_AT_NON_OPTION);
	if (quiet)
		perf_quiet_option();

	err = symbol__validate_sym_arguments();
	if (err)
		return err;

	perf_debuginfod_setup(&record.debuginfod);

	/* Make system wide (-a) the default target. */
	if (!argc && target__none(&rec->opts.target))
		rec->opts.target.system_wide = true;

	if (nr_cgroups && !rec->opts.target.system_wide) {
		usage_with_options_msg(record_usage, record_options,
			"cgroup monitoring only available in system-wide mode");

	}

	if (record.latency) {
		/*
		 * There is no fundamental reason why latency profiling
		 * can't work for system-wide mode, but exact semantics
		 * and details are to be defined.
		 * See the following thread for details:
		 * https://lore.kernel.org/all/[email protected]/
		 */
		if (record.opts.target.system_wide) {
			pr_err("Failed: latency profiling is not supported with system-wide collection.\n");
			err = -EINVAL;
			goto out_opts;
		}
		record.opts.record_switch_events = true;
	}

	if (rec->buildid_mmap && !perf_can_record_build_id()) {
		pr_warning("Missing support for build id in kernel mmap events.\n"
			   "Disable this warning with --no-buildid-mmap\n");
		rec->buildid_mmap = false;
	}

	if (rec->buildid_mmap) {
		/* Enable perf_event_attr::build_id bit. */
		rec->opts.build_id = true;
		/* Disable build-ID table in the header. */
		rec->no_buildid = true;
	} else {
		pr_debug("Disabling build id in synthesized mmap2 events.\n");
		symbol_conf.no_buildid_mmap2 = true;
	}

	if (rec->no_buildid_set && rec->no_buildid) {
		/* -B implies -N for historic reasons. */
		rec->no_buildid_cache = true;
	}

	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
		pr_err("Kernel has no cgroup sampling support.\n");
		err = -EINVAL;
		goto out_opts;
	}

	if (rec->opts.kcore)
		rec->opts.text_poke = true;

	if (rec->opts.kcore || record__threads_enabled(rec))
		rec->data.is_dir = true;

	if (record__threads_enabled(rec)) {
		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
			pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
			goto out_opts;
		}
		if (record__aio_enabled(rec)) {
			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
			goto out_opts;
		}
	}

	if (rec->opts.comp_level != 0) {
		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
		rec->no_buildid = true;
	}

	if (rec->opts.record_switch_events &&
	    !perf_can_record_switch_events()) {
		ui__error("kernel does not support recording context switch events\n");
		parse_options_usage(record_usage, record_options, "switch-events", 0);
		err = -EINVAL;
		goto out_opts;
	}

	if (switch_output_setup(rec)) {
		parse_options_usage(record_usage, record_options, "switch-output", 0);
		err = -EINVAL;
		goto out_opts;
	}

	if (rec->switch_output.time) {
		signal(SIGALRM, alarm_sig_handler);
		alarm(rec->switch_output.time);
	}

	if (rec->switch_output.num_files) {
		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
						      sizeof(char *));
		if (!rec->switch_output.filenames) {
			err = -EINVAL;
			goto out_opts;
		}
	}

	if (rec->timestamp_filename && record__threads_enabled(rec)) {
		rec->timestamp_filename = false;
		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
	}

	if (rec->filter_action) {
		if (!strcmp(rec->filter_action, "pin"))
			err = perf_bpf_filter__pin();
		else if (!strcmp(rec->filter_action, "unpin"))
			err = perf_bpf_filter__unpin();
		else {
			pr_warning("Unknown BPF filter action: %s\n", rec->filter_action);
			err = -EINVAL;
		}
		goto out_opts;
	}

	/* For backward compatibility, -d implies --mem-info */
	if (rec->opts.sample_address)
		rec->opts.sample_data_src = true;

	/*
	 * Allow aliases to facilitate the lookup of symbols for address
	 * filters. Refer to auxtrace_parse_filters().
	 */
	symbol_conf.allow_aliases = true;

	symbol__init(NULL);

	err = record__auxtrace_init(rec);
	if (err)
		goto out;

	if (dry_run)
		goto out;

	err = -ENOMEM;

	if (rec->no_buildid_cache) {
		disable_buildid_cache();
	} else if (rec->switch_output.enabled) {
		/*
		 * In 'perf record --switch-output', disable buildid
		 * generation by default to reduce data file switching
		 * overhead. Still generate buildids if they are required
		 * explicitly using
		 *
		 * perf record --switch-output --no-no-buildid \
		 *              --no-no-buildid-cache
		 *
		 * The following code is equivalent to:
		 *
		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
		 *         disable_buildid_cache();
		 */
		bool disable = true;

		if (rec->no_buildid_set && !rec->no_buildid)
			disable = false;
		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
			disable = false;
		if (disable) {
			rec->no_buildid = true;
			rec->no_buildid_cache = true;
			disable_buildid_cache();
		}
	}

	if (record.opts.overwrite)
		record.opts.tail_synthesize = true;

	if (rec->evlist->core.nr_entries == 0) {
		struct evlist *def_evlist = evlist__new_default();

		if (!def_evlist)
			goto out;

		evlist__splice_list_tail(rec->evlist, &def_evlist->core.entries);
		evlist__delete(def_evlist);
	}

	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
		rec->opts.no_inherit = true;

	err = target__validate(&rec->opts.target);
	if (err) {
		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__warning("%s\n", errbuf);
	}

	if (rec->uid_str) {
		uid_t uid = parse_uid(rec->uid_str);

		if (uid == UINT_MAX) {
			ui__error("Invalid User: %s", rec->uid_str);
			err = -EINVAL;
			goto out;
		}
		err = parse_uid_filter(rec->evlist, uid);
		if (err)
			goto out;

		/* User ID filtering implies system wide. */
		rec->opts.target.system_wide = true;
	}

	/* Enable ignoring missing threads when -p option is defined. */
	rec->opts.ignore_missing_thread = rec->opts.target.pid;

	evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);

	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
		arch__add_leaf_frame_record_opts(&rec->opts);

	err = -ENOMEM;
	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
		if (rec->opts.target.pid != NULL) {
			pr_err("Couldn't create thread/CPU maps: %s\n",
				errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
			goto out;
		}
		else
			usage_with_options(record_usage, record_options);
	}

	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
	if (err)
		goto out;

	/*
	 * We take all buildids when the file contains AUX area tracing data
	 * because we do not decode the trace, as that would take too long.
	 */
	if (rec->opts.full_auxtrace)
		rec->buildid_all = true;

	if (rec->opts.text_poke) {
		err = record__config_text_poke(rec->evlist);
		if (err) {
			pr_err("record__config_text_poke failed, error %d\n", err);
			goto out;
		}
	}

	if (rec->off_cpu) {
		err = record__config_off_cpu(rec);
		if (err) {
			pr_err("record__config_off_cpu failed, error %d\n", err);
			goto out;
		}
	}

	if (record_opts__config(&rec->opts)) {
		err = -EINVAL;
		goto out;
	}

	err = record__config_tracking_events(rec);
	if (err) {
		pr_err("record__config_tracking_events failed, error %d\n", err);
		goto out;
	}

	err = record__init_thread_masks(rec);
	if (err) {
		pr_err("Failed to initialize parallel data streaming masks\n");
		goto out;
	}

	if (rec->opts.nr_cblocks > nr_cblocks_max)
		rec->opts.nr_cblocks = nr_cblocks_max;
	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);

	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);

	if (rec->opts.comp_level > comp_level_max)
		rec->opts.comp_level = comp_level_max;
	pr_debug("comp level: %d\n", rec->opts.comp_level);

	err = __cmd_record(&record, argc, argv);
out:
	record__free_thread_masks(rec, rec->nr_threads);
	rec->nr_threads = 0;
	symbol__exit();
	auxtrace_record__free(rec->itr);
out_opts:
	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
	evlist__delete(rec->evlist);
	return err;
}

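/*
 * Signal handler used to request an AUX area tracing snapshot and, when
 * --switch-output=signal is in effect, a switch to a new output file.
 */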
static void snapshot_sig_handler(int sig __maybe_unused)
{
	struct record *rec = &record;

	hit_auxtrace_snapshot_trigger(rec);

	if (switch_output_signal(rec))
		trigger_hit(&switch_output_trigger);
}

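/*
 * SIGALRM handler armed by the --switch-output time setting in cmd_record();
 * triggers a switch to a new output file when the timer fires.
 */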
static void alarm_sig_handler(int sig __maybe_unused)
{
	struct record *rec = &record;

	if (switch_output_time(rec))
		trigger_hit(&switch_output_trigger);
}