GitHub Repository: awilliam/linux-vfio
Path: blob/master/tools/perf/builtin-sched.c
#include "builtin.h"
#include "perf.h"

#include "util/util.h"
#include "util/cache.h"
#include "util/symbol.h"
#include "util/thread.h"
#include "util/header.h"
#include "util/session.h"

#include "util/parse-options.h"
#include "util/trace-event.h"

#include "util/debug.h"

#include <sys/prctl.h>

#include <semaphore.h>
#include <pthread.h>
#include <math.h>

static char const *input_name = "perf.data";

static char default_sort_order[] = "avg, max, switch, runtime";
static const char *sort_order = default_sort_order;

static int profile_cpu = -1;

#define PR_SET_NAME 15 /* Set process name */
#define MAX_CPUS 4096

static u64 run_measurement_overhead;
static u64 sleep_measurement_overhead;

#define COMM_LEN 20
#define SYM_LEN 129

#define MAX_PID 65536

static unsigned long nr_tasks;

struct sched_atom;

struct task_desc {
	unsigned long nr;
	unsigned long pid;
	char comm[COMM_LEN];

	unsigned long nr_events;
	unsigned long curr_event;
	struct sched_atom **atoms;

	pthread_t thread;
	sem_t sleep_sem;

	sem_t ready_for_work;
	sem_t work_done_sem;

	u64 cpu_usage;
};

enum sched_event_type {
	SCHED_EVENT_RUN,
	SCHED_EVENT_SLEEP,
	SCHED_EVENT_WAKEUP,
	SCHED_EVENT_MIGRATION,
};

struct sched_atom {
	enum sched_event_type type;
	int specific_wait;
	u64 timestamp;
	u64 duration;
	unsigned long nr;
	sem_t *wait_sem;
	struct task_desc *wakee;
};

static struct task_desc *pid_to_task[MAX_PID];

static struct task_desc **tasks;

static pthread_mutex_t start_work_mutex = PTHREAD_MUTEX_INITIALIZER;
static u64 start_time;

static pthread_mutex_t work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER;

static unsigned long nr_run_events;
static unsigned long nr_sleep_events;
static unsigned long nr_wakeup_events;

static unsigned long nr_sleep_corrections;
static unsigned long nr_run_events_optimized;

static unsigned long targetless_wakeups;
static unsigned long multitarget_wakeups;

static u64 cpu_usage;
static u64 runavg_cpu_usage;
static u64 parent_cpu_usage;
static u64 runavg_parent_cpu_usage;

static unsigned long nr_runs;
static u64 sum_runtime;
static u64 sum_fluct;
static u64 run_avg;

static unsigned int replay_repeat = 10;
static unsigned long nr_timestamps;
static unsigned long nr_unordered_timestamps;
static unsigned long nr_state_machine_bugs;
static unsigned long nr_context_switch_bugs;
static unsigned long nr_events;
static unsigned long nr_lost_chunks;
static unsigned long nr_lost_events;

#define TASK_STATE_TO_CHAR_STR "RSDTtZX"

enum thread_state {
	THREAD_SLEEPING = 0,
	THREAD_WAIT_CPU,
	THREAD_SCHED_IN,
	THREAD_IGNORE
};

struct work_atom {
	struct list_head list;
	enum thread_state state;
	u64 sched_out_time;
	u64 wake_up_time;
	u64 sched_in_time;
	u64 runtime;
};

struct work_atoms {
	struct list_head work_list;
	struct thread *thread;
	struct rb_node node;
	u64 max_lat;
	u64 max_lat_at;
	u64 total_lat;
	u64 nb_atoms;
	u64 total_runtime;
};

typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);

static struct rb_root atom_root, sorted_atom_root;

static u64 all_runtime;
static u64 all_count;


static u64 get_nsecs(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);

	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

static void burn_nsecs(u64 nsecs)
{
	u64 T0 = get_nsecs(), T1;

	do {
		T1 = get_nsecs();
	} while (T1 + run_measurement_overhead < T0 + nsecs);
}

static void sleep_nsecs(u64 nsecs)
{
	struct timespec ts;

	ts.tv_nsec = nsecs % 999999999;
	ts.tv_sec = nsecs / 999999999;

	nanosleep(&ts, NULL);
}

static void calibrate_run_measurement_overhead(void)
{
	u64 T0, T1, delta, min_delta = 1000000000ULL;
	int i;

	for (i = 0; i < 10; i++) {
		T0 = get_nsecs();
		burn_nsecs(0);
		T1 = get_nsecs();
		delta = T1-T0;
		min_delta = min(min_delta, delta);
	}
	run_measurement_overhead = min_delta;

	printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
}

static void calibrate_sleep_measurement_overhead(void)
{
	u64 T0, T1, delta, min_delta = 1000000000ULL;
	int i;

	for (i = 0; i < 10; i++) {
		T0 = get_nsecs();
		sleep_nsecs(10000);
		T1 = get_nsecs();
		delta = T1-T0;
		min_delta = min(min_delta, delta);
	}
	min_delta -= 10000;
	sleep_measurement_overhead = min_delta;

	printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
}

static struct sched_atom *
get_new_event(struct task_desc *task, u64 timestamp)
{
	struct sched_atom *event = zalloc(sizeof(*event));
	unsigned long idx = task->nr_events;
	size_t size;

	event->timestamp = timestamp;
	event->nr = idx;

	task->nr_events++;
	size = sizeof(struct sched_atom *) * task->nr_events;
	task->atoms = realloc(task->atoms, size);
	BUG_ON(!task->atoms);

	task->atoms[idx] = event;

	return event;
}

static struct sched_atom *last_event(struct task_desc *task)
{
	if (!task->nr_events)
		return NULL;

	return task->atoms[task->nr_events - 1];
}

static void
add_sched_event_run(struct task_desc *task, u64 timestamp, u64 duration)
{
	struct sched_atom *event, *curr_event = last_event(task);

	/*
	 * optimize an existing RUN event by merging this one
	 * to it:
	 */
	if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
		nr_run_events_optimized++;
		curr_event->duration += duration;
		return;
	}

	event = get_new_event(task, timestamp);

	event->type = SCHED_EVENT_RUN;
	event->duration = duration;

	nr_run_events++;
}

static void
add_sched_event_wakeup(struct task_desc *task, u64 timestamp,
		       struct task_desc *wakee)
{
	struct sched_atom *event, *wakee_event;

	event = get_new_event(task, timestamp);
	event->type = SCHED_EVENT_WAKEUP;
	event->wakee = wakee;

	wakee_event = last_event(wakee);
	if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
		targetless_wakeups++;
		return;
	}
	if (wakee_event->wait_sem) {
		multitarget_wakeups++;
		return;
	}

	wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
	sem_init(wakee_event->wait_sem, 0, 0);
	wakee_event->specific_wait = 1;
	event->wait_sem = wakee_event->wait_sem;

	nr_wakeup_events++;
}

static void
add_sched_event_sleep(struct task_desc *task, u64 timestamp,
		      u64 task_state __used)
{
	struct sched_atom *event = get_new_event(task, timestamp);

	event->type = SCHED_EVENT_SLEEP;

	nr_sleep_events++;
}

static struct task_desc *register_pid(unsigned long pid, const char *comm)
{
	struct task_desc *task;

	BUG_ON(pid >= MAX_PID);

	task = pid_to_task[pid];

	if (task)
		return task;

	task = zalloc(sizeof(*task));
	task->pid = pid;
	task->nr = nr_tasks;
	strcpy(task->comm, comm);
	/*
	 * every task starts in sleeping state - this gets ignored
	 * if there's no wakeup pointing to this sleep state:
	 */
	add_sched_event_sleep(task, 0, 0);

	pid_to_task[pid] = task;
	nr_tasks++;
	tasks = realloc(tasks, nr_tasks*sizeof(struct task_task *));
	BUG_ON(!tasks);
	tasks[task->nr] = task;

	if (verbose)
		printf("registered task #%ld, PID %ld (%s)\n", nr_tasks, pid, comm);

	return task;
}


static void print_task_traces(void)
{
	struct task_desc *task;
	unsigned long i;

	for (i = 0; i < nr_tasks; i++) {
		task = tasks[i];
		printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
			task->nr, task->comm, task->pid, task->nr_events);
	}
}

static void add_cross_task_wakeups(void)
{
	struct task_desc *task1, *task2;
	unsigned long i, j;

	for (i = 0; i < nr_tasks; i++) {
		task1 = tasks[i];
		j = i + 1;
		if (j == nr_tasks)
			j = 0;
		task2 = tasks[j];
		add_sched_event_wakeup(task1, 0, task2);
	}
}

static void
process_sched_event(struct task_desc *this_task __used, struct sched_atom *atom)
{
	int ret = 0;

	switch (atom->type) {
	case SCHED_EVENT_RUN:
		burn_nsecs(atom->duration);
		break;
	case SCHED_EVENT_SLEEP:
		if (atom->wait_sem)
			ret = sem_wait(atom->wait_sem);
		BUG_ON(ret);
		break;
	case SCHED_EVENT_WAKEUP:
		if (atom->wait_sem)
			ret = sem_post(atom->wait_sem);
		BUG_ON(ret);
		break;
	case SCHED_EVENT_MIGRATION:
		break;
	default:
		BUG_ON(1);
	}
}

static u64 get_cpu_usage_nsec_parent(void)
{
	struct rusage ru;
	u64 sum;
	int err;

	err = getrusage(RUSAGE_SELF, &ru);
	BUG_ON(err);

	sum = ru.ru_utime.tv_sec*1e9 + ru.ru_utime.tv_usec*1e3;
	sum += ru.ru_stime.tv_sec*1e9 + ru.ru_stime.tv_usec*1e3;

	return sum;
}

static int self_open_counters(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));

	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;

	fd = sys_perf_event_open(&attr, 0, -1, -1, 0);

	if (fd < 0)
		die("Error: sys_perf_event_open() syscall returned"
		    "with %d (%s)\n", fd, strerror(errno));
	return fd;
}

static u64 get_cpu_usage_nsec_self(int fd)
{
	u64 runtime;
	int ret;

	ret = read(fd, &runtime, sizeof(runtime));
	BUG_ON(ret != sizeof(runtime));

	return runtime;
}

static void *thread_func(void *ctx)
{
	struct task_desc *this_task = ctx;
	u64 cpu_usage_0, cpu_usage_1;
	unsigned long i, ret;
	char comm2[22];
	int fd;

	sprintf(comm2, ":%s", this_task->comm);
	prctl(PR_SET_NAME, comm2);
	fd = self_open_counters();

again:
	ret = sem_post(&this_task->ready_for_work);
	BUG_ON(ret);
	ret = pthread_mutex_lock(&start_work_mutex);
	BUG_ON(ret);
	ret = pthread_mutex_unlock(&start_work_mutex);
	BUG_ON(ret);

	cpu_usage_0 = get_cpu_usage_nsec_self(fd);

	for (i = 0; i < this_task->nr_events; i++) {
		this_task->curr_event = i;
		process_sched_event(this_task, this_task->atoms[i]);
	}

	cpu_usage_1 = get_cpu_usage_nsec_self(fd);
	this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
	ret = sem_post(&this_task->work_done_sem);
	BUG_ON(ret);

	ret = pthread_mutex_lock(&work_done_wait_mutex);
	BUG_ON(ret);
	ret = pthread_mutex_unlock(&work_done_wait_mutex);
	BUG_ON(ret);

	goto again;
}

static void create_tasks(void)
{
	struct task_desc *task;
	pthread_attr_t attr;
	unsigned long i;
	int err;

	err = pthread_attr_init(&attr);
	BUG_ON(err);
	err = pthread_attr_setstacksize(&attr,
			(size_t) max(16 * 1024, PTHREAD_STACK_MIN));
	BUG_ON(err);
	err = pthread_mutex_lock(&start_work_mutex);
	BUG_ON(err);
	err = pthread_mutex_lock(&work_done_wait_mutex);
	BUG_ON(err);
	for (i = 0; i < nr_tasks; i++) {
		task = tasks[i];
		sem_init(&task->sleep_sem, 0, 0);
		sem_init(&task->ready_for_work, 0, 0);
		sem_init(&task->work_done_sem, 0, 0);
		task->curr_event = 0;
		err = pthread_create(&task->thread, &attr, thread_func, task);
		BUG_ON(err);
	}
}

static void wait_for_tasks(void)
{
	u64 cpu_usage_0, cpu_usage_1;
	struct task_desc *task;
	unsigned long i, ret;

	start_time = get_nsecs();
	cpu_usage = 0;
	pthread_mutex_unlock(&work_done_wait_mutex);

	for (i = 0; i < nr_tasks; i++) {
		task = tasks[i];
		ret = sem_wait(&task->ready_for_work);
		BUG_ON(ret);
		sem_init(&task->ready_for_work, 0, 0);
	}
	ret = pthread_mutex_lock(&work_done_wait_mutex);
	BUG_ON(ret);

	cpu_usage_0 = get_cpu_usage_nsec_parent();

	pthread_mutex_unlock(&start_work_mutex);

	for (i = 0; i < nr_tasks; i++) {
		task = tasks[i];
		ret = sem_wait(&task->work_done_sem);
		BUG_ON(ret);
		sem_init(&task->work_done_sem, 0, 0);
		cpu_usage += task->cpu_usage;
		task->cpu_usage = 0;
	}

	cpu_usage_1 = get_cpu_usage_nsec_parent();
	if (!runavg_cpu_usage)
		runavg_cpu_usage = cpu_usage;
	runavg_cpu_usage = (runavg_cpu_usage*9 + cpu_usage)/10;

	parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
	if (!runavg_parent_cpu_usage)
		runavg_parent_cpu_usage = parent_cpu_usage;
	runavg_parent_cpu_usage = (runavg_parent_cpu_usage*9 +
				   parent_cpu_usage)/10;

	ret = pthread_mutex_lock(&start_work_mutex);
	BUG_ON(ret);

	for (i = 0; i < nr_tasks; i++) {
		task = tasks[i];
		sem_init(&task->sleep_sem, 0, 0);
		task->curr_event = 0;
	}
}

static void run_one_test(void)
{
	u64 T0, T1, delta, avg_delta, fluct;

	T0 = get_nsecs();
	wait_for_tasks();
	T1 = get_nsecs();

	delta = T1 - T0;
	sum_runtime += delta;
	nr_runs++;

	avg_delta = sum_runtime / nr_runs;
	if (delta < avg_delta)
		fluct = avg_delta - delta;
	else
		fluct = delta - avg_delta;
	sum_fluct += fluct;
	if (!run_avg)
		run_avg = delta;
	run_avg = (run_avg*9 + delta)/10;

	printf("#%-3ld: %0.3f, ",
		nr_runs, (double)delta/1000000.0);

	printf("ravg: %0.2f, ",
		(double)run_avg/1e6);

	printf("cpu: %0.2f / %0.2f",
		(double)cpu_usage/1e6, (double)runavg_cpu_usage/1e6);

#if 0
	/*
	 * rusage statistics done by the parent, these are less
	 * accurate than the sum_exec_runtime based statistics:
	 */
	printf(" [%0.2f / %0.2f]",
		(double)parent_cpu_usage/1e6,
		(double)runavg_parent_cpu_usage/1e6);
#endif

	printf("\n");

	if (nr_sleep_corrections)
		printf(" (%ld sleep corrections)\n", nr_sleep_corrections);
	nr_sleep_corrections = 0;
}

static void test_calibrations(void)
{
	u64 T0, T1;

	T0 = get_nsecs();
	burn_nsecs(1e6);
	T1 = get_nsecs();

	printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);

	T0 = get_nsecs();
	sleep_nsecs(1e6);
	T1 = get_nsecs();

	printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
}

#define FILL_FIELD(ptr, field, event, data)	\
	ptr.field = (typeof(ptr.field)) raw_field_value(event, #field, data)

#define FILL_ARRAY(ptr, array, event, data)			\
do {								\
	void *__array = raw_field_ptr(event, #array, data);	\
	memcpy(ptr.array, __array, sizeof(ptr.array));		\
} while(0)

#define FILL_COMMON_FIELDS(ptr, event, data)			\
do {								\
	FILL_FIELD(ptr, common_type, event, data);		\
	FILL_FIELD(ptr, common_flags, event, data);		\
	FILL_FIELD(ptr, common_preempt_count, event, data);	\
	FILL_FIELD(ptr, common_pid, event, data);		\
	FILL_FIELD(ptr, common_tgid, event, data);		\
} while (0)



struct trace_switch_event {
	u32 size;

	u16 common_type;
	u8 common_flags;
	u8 common_preempt_count;
	u32 common_pid;
	u32 common_tgid;

	char prev_comm[16];
	u32 prev_pid;
	u32 prev_prio;
	u64 prev_state;
	char next_comm[16];
	u32 next_pid;
	u32 next_prio;
};

struct trace_runtime_event {
	u32 size;

	u16 common_type;
	u8 common_flags;
	u8 common_preempt_count;
	u32 common_pid;
	u32 common_tgid;

	char comm[16];
	u32 pid;
	u64 runtime;
	u64 vruntime;
};

struct trace_wakeup_event {
	u32 size;

	u16 common_type;
	u8 common_flags;
	u8 common_preempt_count;
	u32 common_pid;
	u32 common_tgid;

	char comm[16];
	u32 pid;

	u32 prio;
	u32 success;
	u32 cpu;
};

struct trace_fork_event {
	u32 size;

	u16 common_type;
	u8 common_flags;
	u8 common_preempt_count;
	u32 common_pid;
	u32 common_tgid;

	char parent_comm[16];
	u32 parent_pid;
	char child_comm[16];
	u32 child_pid;
};

struct trace_migrate_task_event {
	u32 size;

	u16 common_type;
	u8 common_flags;
	u8 common_preempt_count;
	u32 common_pid;
	u32 common_tgid;

	char comm[16];
	u32 pid;

	u32 prio;
	u32 cpu;
};

struct trace_sched_handler {
	void (*switch_event)(struct trace_switch_event *,
			     struct perf_session *,
			     struct event *,
			     int cpu,
			     u64 timestamp,
			     struct thread *thread);

	void (*runtime_event)(struct trace_runtime_event *,
			      struct perf_session *,
			      struct event *,
			      int cpu,
			      u64 timestamp,
			      struct thread *thread);

	void (*wakeup_event)(struct trace_wakeup_event *,
			     struct perf_session *,
			     struct event *,
			     int cpu,
			     u64 timestamp,
			     struct thread *thread);

	void (*fork_event)(struct trace_fork_event *,
			   struct event *,
			   int cpu,
			   u64 timestamp,
			   struct thread *thread);

	void (*migrate_task_event)(struct trace_migrate_task_event *,
				   struct perf_session *session,
				   struct event *,
				   int cpu,
				   u64 timestamp,
				   struct thread *thread);
};


static void
replay_wakeup_event(struct trace_wakeup_event *wakeup_event,
		    struct perf_session *session __used,
		    struct event *event,
		    int cpu __used,
		    u64 timestamp __used,
		    struct thread *thread __used)
{
	struct task_desc *waker, *wakee;

	if (verbose) {
		printf("sched_wakeup event %p\n", event);

		printf(" ... pid %d woke up %s/%d\n",
			wakeup_event->common_pid,
			wakeup_event->comm,
			wakeup_event->pid);
	}

	waker = register_pid(wakeup_event->common_pid, "<unknown>");
	wakee = register_pid(wakeup_event->pid, wakeup_event->comm);

	add_sched_event_wakeup(waker, timestamp, wakee);
}

static u64 cpu_last_switched[MAX_CPUS];

static void
replay_switch_event(struct trace_switch_event *switch_event,
		    struct perf_session *session __used,
		    struct event *event,
		    int cpu,
		    u64 timestamp,
		    struct thread *thread __used)
{
	struct task_desc *prev, __used *next;
	u64 timestamp0;
	s64 delta;

	if (verbose)
		printf("sched_switch event %p\n", event);

	if (cpu >= MAX_CPUS || cpu < 0)
		return;

	timestamp0 = cpu_last_switched[cpu];
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

	if (delta < 0)
		die("hm, delta: %" PRIu64 " < 0 ?\n", delta);

	if (verbose) {
		printf(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
			switch_event->prev_comm, switch_event->prev_pid,
			switch_event->next_comm, switch_event->next_pid,
			delta);
	}

	prev = register_pid(switch_event->prev_pid, switch_event->prev_comm);
	next = register_pid(switch_event->next_pid, switch_event->next_comm);

	cpu_last_switched[cpu] = timestamp;

	add_sched_event_run(prev, timestamp, delta);
	add_sched_event_sleep(prev, timestamp, switch_event->prev_state);
}


static void
replay_fork_event(struct trace_fork_event *fork_event,
		  struct event *event,
		  int cpu __used,
		  u64 timestamp __used,
		  struct thread *thread __used)
{
	if (verbose) {
		printf("sched_fork event %p\n", event);
		printf("... parent: %s/%d\n", fork_event->parent_comm, fork_event->parent_pid);
		printf("... child: %s/%d\n", fork_event->child_comm, fork_event->child_pid);
	}
	register_pid(fork_event->parent_pid, fork_event->parent_comm);
	register_pid(fork_event->child_pid, fork_event->child_comm);
}

static struct trace_sched_handler replay_ops = {
	.wakeup_event = replay_wakeup_event,
	.switch_event = replay_switch_event,
	.fork_event = replay_fork_event,
};

struct sort_dimension {
	const char *name;
	sort_fn_t cmp;
	struct list_head list;
};

static LIST_HEAD(cmp_pid);

static int
thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
{
	struct sort_dimension *sort;
	int ret = 0;

	BUG_ON(list_empty(list));

	list_for_each_entry(sort, list, list) {
		ret = sort->cmp(l, r);
		if (ret)
			return ret;
	}

	return ret;
}

static struct work_atoms *
thread_atoms_search(struct rb_root *root, struct thread *thread,
		    struct list_head *sort_list)
{
	struct rb_node *node = root->rb_node;
	struct work_atoms key = { .thread = thread };

	while (node) {
		struct work_atoms *atoms;
		int cmp;

		atoms = container_of(node, struct work_atoms, node);

		cmp = thread_lat_cmp(sort_list, &key, atoms);
		if (cmp > 0)
			node = node->rb_left;
		else if (cmp < 0)
			node = node->rb_right;
		else {
			BUG_ON(thread != atoms->thread);
			return atoms;
		}
	}
	return NULL;
}

static void
__thread_latency_insert(struct rb_root *root, struct work_atoms *data,
			struct list_head *sort_list)
{
	struct rb_node **new = &(root->rb_node), *parent = NULL;

	while (*new) {
		struct work_atoms *this;
		int cmp;

		this = container_of(*new, struct work_atoms, node);
		parent = *new;

		cmp = thread_lat_cmp(sort_list, data, this);

		if (cmp > 0)
			new = &((*new)->rb_left);
		else
			new = &((*new)->rb_right);
	}

	rb_link_node(&data->node, parent, new);
	rb_insert_color(&data->node, root);
}

static void thread_atoms_insert(struct thread *thread)
{
	struct work_atoms *atoms = zalloc(sizeof(*atoms));
	if (!atoms)
		die("No memory");

	atoms->thread = thread;
	INIT_LIST_HEAD(&atoms->work_list);
	__thread_latency_insert(&atom_root, atoms, &cmp_pid);
}

static void
latency_fork_event(struct trace_fork_event *fork_event __used,
		   struct event *event __used,
		   int cpu __used,
		   u64 timestamp __used,
		   struct thread *thread __used)
{
	/* should insert the newcomer */
}

__used
static char sched_out_state(struct trace_switch_event *switch_event)
{
	const char *str = TASK_STATE_TO_CHAR_STR;

	return str[switch_event->prev_state];
}

static void
add_sched_out_event(struct work_atoms *atoms,
		    char run_state,
		    u64 timestamp)
{
	struct work_atom *atom = zalloc(sizeof(*atom));
	if (!atom)
		die("Non memory");

	atom->sched_out_time = timestamp;

	if (run_state == 'R') {
		atom->state = THREAD_WAIT_CPU;
		atom->wake_up_time = atom->sched_out_time;
	}

	list_add_tail(&atom->list, &atoms->work_list);
}

static void
add_runtime_event(struct work_atoms *atoms, u64 delta, u64 timestamp __used)
{
	struct work_atom *atom;

	BUG_ON(list_empty(&atoms->work_list));

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);

	atom->runtime += delta;
	atoms->total_runtime += delta;
}

static void
add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
{
	struct work_atom *atom;
	u64 delta;

	if (list_empty(&atoms->work_list))
		return;

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);

	if (atom->state != THREAD_WAIT_CPU)
		return;

	if (timestamp < atom->wake_up_time) {
		atom->state = THREAD_IGNORE;
		return;
	}

	atom->state = THREAD_SCHED_IN;
	atom->sched_in_time = timestamp;

	delta = atom->sched_in_time - atom->wake_up_time;
	atoms->total_lat += delta;
	if (delta > atoms->max_lat) {
		atoms->max_lat = delta;
		atoms->max_lat_at = timestamp;
	}
	atoms->nb_atoms++;
}

static void
latency_switch_event(struct trace_switch_event *switch_event,
		     struct perf_session *session,
		     struct event *event __used,
		     int cpu,
		     u64 timestamp,
		     struct thread *thread __used)
{
	struct work_atoms *out_events, *in_events;
	struct thread *sched_out, *sched_in;
	u64 timestamp0;
	s64 delta;

	BUG_ON(cpu >= MAX_CPUS || cpu < 0);

	timestamp0 = cpu_last_switched[cpu];
	cpu_last_switched[cpu] = timestamp;
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

	if (delta < 0)
		die("hm, delta: %" PRIu64 " < 0 ?\n", delta);


	sched_out = perf_session__findnew(session, switch_event->prev_pid);
	sched_in = perf_session__findnew(session, switch_event->next_pid);

	out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid);
	if (!out_events) {
		thread_atoms_insert(sched_out);
		out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid);
		if (!out_events)
			die("out-event: Internal tree error");
	}
	add_sched_out_event(out_events, sched_out_state(switch_event), timestamp);

	in_events = thread_atoms_search(&atom_root, sched_in, &cmp_pid);
	if (!in_events) {
		thread_atoms_insert(sched_in);
		in_events = thread_atoms_search(&atom_root, sched_in, &cmp_pid);
		if (!in_events)
			die("in-event: Internal tree error");
		/*
		 * Take came in we have not heard about yet,
		 * add in an initial atom in runnable state:
		 */
		add_sched_out_event(in_events, 'R', timestamp);
	}
	add_sched_in_event(in_events, timestamp);
}

static void
latency_runtime_event(struct trace_runtime_event *runtime_event,
		      struct perf_session *session,
		      struct event *event __used,
		      int cpu,
		      u64 timestamp,
		      struct thread *this_thread __used)
{
	struct thread *thread = perf_session__findnew(session, runtime_event->pid);
	struct work_atoms *atoms = thread_atoms_search(&atom_root, thread, &cmp_pid);

	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
	if (!atoms) {
		thread_atoms_insert(thread);
		atoms = thread_atoms_search(&atom_root, thread, &cmp_pid);
		if (!atoms)
			die("in-event: Internal tree error");
		add_sched_out_event(atoms, 'R', timestamp);
	}

	add_runtime_event(atoms, runtime_event->runtime, timestamp);
}

static void
latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
		     struct perf_session *session,
		     struct event *__event __used,
		     int cpu __used,
		     u64 timestamp,
		     struct thread *thread __used)
{
	struct work_atoms *atoms;
	struct work_atom *atom;
	struct thread *wakee;

	/* Note for later, it may be interesting to observe the failing cases */
	if (!wakeup_event->success)
		return;

	wakee = perf_session__findnew(session, wakeup_event->pid);
	atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid);
	if (!atoms) {
		thread_atoms_insert(wakee);
		atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid);
		if (!atoms)
			die("wakeup-event: Internal tree error");
		add_sched_out_event(atoms, 'S', timestamp);
	}

	BUG_ON(list_empty(&atoms->work_list));

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);

	/*
	 * You WILL be missing events if you've recorded only
	 * one CPU, or are only looking at only one, so don't
	 * make useless noise.
	 */
	if (profile_cpu == -1 && atom->state != THREAD_SLEEPING)
		nr_state_machine_bugs++;

	nr_timestamps++;
	if (atom->sched_out_time > timestamp) {
		nr_unordered_timestamps++;
		return;
	}

	atom->state = THREAD_WAIT_CPU;
	atom->wake_up_time = timestamp;
}

static void
latency_migrate_task_event(struct trace_migrate_task_event *migrate_task_event,
			   struct perf_session *session,
			   struct event *__event __used,
			   int cpu __used,
			   u64 timestamp,
			   struct thread *thread __used)
{
	struct work_atoms *atoms;
	struct work_atom *atom;
	struct thread *migrant;

	/*
	 * Only need to worry about migration when profiling one CPU.
	 */
	if (profile_cpu == -1)
		return;

	migrant = perf_session__findnew(session, migrate_task_event->pid);
	atoms = thread_atoms_search(&atom_root, migrant, &cmp_pid);
	if (!atoms) {
		thread_atoms_insert(migrant);
		register_pid(migrant->pid, migrant->comm);
		atoms = thread_atoms_search(&atom_root, migrant, &cmp_pid);
		if (!atoms)
			die("migration-event: Internal tree error");
		add_sched_out_event(atoms, 'R', timestamp);
	}

	BUG_ON(list_empty(&atoms->work_list));

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
	atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;

	nr_timestamps++;

	if (atom->sched_out_time > timestamp)
		nr_unordered_timestamps++;
}

static struct trace_sched_handler lat_ops = {
	.wakeup_event = latency_wakeup_event,
	.switch_event = latency_switch_event,
	.runtime_event = latency_runtime_event,
	.fork_event = latency_fork_event,
	.migrate_task_event = latency_migrate_task_event,
};

static void output_lat_thread(struct work_atoms *work_list)
{
	int i;
	int ret;
	u64 avg;

	if (!work_list->nb_atoms)
		return;
	/*
	 * Ignore idle threads:
	 */
	if (!strcmp(work_list->thread->comm, "swapper"))
		return;

	all_runtime += work_list->total_runtime;
	all_count += work_list->nb_atoms;

	ret = printf(" %s:%d ", work_list->thread->comm, work_list->thread->pid);

	for (i = 0; i < 24 - ret; i++)
		printf(" ");

	avg = work_list->total_lat / work_list->nb_atoms;

	printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %9.6f s\n",
		(double)work_list->total_runtime / 1e6,
		work_list->nb_atoms, (double)avg / 1e6,
		(double)work_list->max_lat / 1e6,
		(double)work_list->max_lat_at / 1e9);
}

static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
{
	if (l->thread->pid < r->thread->pid)
		return -1;
	if (l->thread->pid > r->thread->pid)
		return 1;

	return 0;
}

static struct sort_dimension pid_sort_dimension = {
	.name = "pid",
	.cmp = pid_cmp,
};

static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
{
	u64 avgl, avgr;

	if (!l->nb_atoms)
		return -1;

	if (!r->nb_atoms)
		return 1;

	avgl = l->total_lat / l->nb_atoms;
	avgr = r->total_lat / r->nb_atoms;

	if (avgl < avgr)
		return -1;
	if (avgl > avgr)
		return 1;

	return 0;
}

static struct sort_dimension avg_sort_dimension = {
	.name = "avg",
	.cmp = avg_cmp,
};

static int max_cmp(struct work_atoms *l, struct work_atoms *r)
{
	if (l->max_lat < r->max_lat)
		return -1;
	if (l->max_lat > r->max_lat)
		return 1;

	return 0;
}

static struct sort_dimension max_sort_dimension = {
	.name = "max",
	.cmp = max_cmp,
};

static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
{
	if (l->nb_atoms < r->nb_atoms)
		return -1;
	if (l->nb_atoms > r->nb_atoms)
		return 1;

	return 0;
}

static struct sort_dimension switch_sort_dimension = {
	.name = "switch",
	.cmp = switch_cmp,
};

static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
{
	if (l->total_runtime < r->total_runtime)
		return -1;
	if (l->total_runtime > r->total_runtime)
		return 1;

	return 0;
}

static struct sort_dimension runtime_sort_dimension = {
	.name = "runtime",
	.cmp = runtime_cmp,
};

static struct sort_dimension *available_sorts[] = {
	&pid_sort_dimension,
	&avg_sort_dimension,
	&max_sort_dimension,
	&switch_sort_dimension,
	&runtime_sort_dimension,
};

#define NB_AVAILABLE_SORTS	(int)(sizeof(available_sorts) / sizeof(struct sort_dimension *))

static LIST_HEAD(sort_list);

static int sort_dimension__add(const char *tok, struct list_head *list)
{
	int i;

	for (i = 0; i < NB_AVAILABLE_SORTS; i++) {
		if (!strcmp(available_sorts[i]->name, tok)) {
			list_add_tail(&available_sorts[i]->list, list);

			return 0;
		}
	}

	return -1;
}

static void setup_sorting(void);

static void sort_lat(void)
{
	struct rb_node *node;

	for (;;) {
		struct work_atoms *data;
		node = rb_first(&atom_root);
		if (!node)
			break;

		rb_erase(node, &atom_root);
		data = rb_entry(node, struct work_atoms, node);
		__thread_latency_insert(&sorted_atom_root, data, &sort_list);
	}
}

static struct trace_sched_handler *trace_handler;

static void
process_sched_wakeup_event(void *data, struct perf_session *session,
			   struct event *event,
			   int cpu __used,
			   u64 timestamp __used,
			   struct thread *thread __used)
{
	struct trace_wakeup_event wakeup_event;

	FILL_COMMON_FIELDS(wakeup_event, event, data);

	FILL_ARRAY(wakeup_event, comm, event, data);
	FILL_FIELD(wakeup_event, pid, event, data);
	FILL_FIELD(wakeup_event, prio, event, data);
	FILL_FIELD(wakeup_event, success, event, data);
	FILL_FIELD(wakeup_event, cpu, event, data);

	if (trace_handler->wakeup_event)
		trace_handler->wakeup_event(&wakeup_event, session, event,
					    cpu, timestamp, thread);
}

/*
 * Track the current task - that way we can know whether there's any
 * weird events, such as a task being switched away that is not current.
 */
static int max_cpu;

static u32 curr_pid[MAX_CPUS] = { [0 ... MAX_CPUS-1] = -1 };

static struct thread *curr_thread[MAX_CPUS];

static char next_shortname1 = 'A';
static char next_shortname2 = '0';

static void
map_switch_event(struct trace_switch_event *switch_event,
		 struct perf_session *session,
		 struct event *event __used,
		 int this_cpu,
		 u64 timestamp,
		 struct thread *thread __used)
{
	struct thread *sched_out __used, *sched_in;
	int new_shortname;
	u64 timestamp0;
	s64 delta;
	int cpu;

	BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);

	if (this_cpu > max_cpu)
		max_cpu = this_cpu;

	timestamp0 = cpu_last_switched[this_cpu];
	cpu_last_switched[this_cpu] = timestamp;
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

	if (delta < 0)
		die("hm, delta: %" PRIu64 " < 0 ?\n", delta);


	sched_out = perf_session__findnew(session, switch_event->prev_pid);
	sched_in = perf_session__findnew(session, switch_event->next_pid);

	curr_thread[this_cpu] = sched_in;

	printf(" ");

	new_shortname = 0;
	if (!sched_in->shortname[0]) {
		sched_in->shortname[0] = next_shortname1;
		sched_in->shortname[1] = next_shortname2;

		if (next_shortname1 < 'Z') {
			next_shortname1++;
		} else {
			next_shortname1='A';
			if (next_shortname2 < '9') {
				next_shortname2++;
			} else {
				next_shortname2='0';
			}
		}
		new_shortname = 1;
	}

	for (cpu = 0; cpu <= max_cpu; cpu++) {
		if (cpu != this_cpu)
			printf(" ");
		else
			printf("*");

		if (curr_thread[cpu]) {
			if (curr_thread[cpu]->pid)
				printf("%2s ", curr_thread[cpu]->shortname);
			else
				printf(". ");
		} else
			printf(" ");
	}

	printf(" %12.6f secs ", (double)timestamp/1e9);
	if (new_shortname) {
		printf("%s => %s:%d\n",
			sched_in->shortname, sched_in->comm, sched_in->pid);
	} else {
		printf("\n");
	}
}


static void
process_sched_switch_event(void *data, struct perf_session *session,
			   struct event *event,
			   int this_cpu,
			   u64 timestamp __used,
			   struct thread *thread __used)
{
	struct trace_switch_event switch_event;

	FILL_COMMON_FIELDS(switch_event, event, data);

	FILL_ARRAY(switch_event, prev_comm, event, data);
	FILL_FIELD(switch_event, prev_pid, event, data);
	FILL_FIELD(switch_event, prev_prio, event, data);
	FILL_FIELD(switch_event, prev_state, event, data);
	FILL_ARRAY(switch_event, next_comm, event, data);
	FILL_FIELD(switch_event, next_pid, event, data);
	FILL_FIELD(switch_event, next_prio, event, data);

	if (curr_pid[this_cpu] != (u32)-1) {
		/*
		 * Are we trying to switch away a PID that is
		 * not current?
		 */
		if (curr_pid[this_cpu] != switch_event.prev_pid)
			nr_context_switch_bugs++;
	}
	if (trace_handler->switch_event)
		trace_handler->switch_event(&switch_event, session, event,
					    this_cpu, timestamp, thread);

	curr_pid[this_cpu] = switch_event.next_pid;
}

static void
process_sched_runtime_event(void *data, struct perf_session *session,
			    struct event *event,
			    int cpu __used,
			    u64 timestamp __used,
			    struct thread *thread __used)
{
	struct trace_runtime_event runtime_event;

	FILL_ARRAY(runtime_event, comm, event, data);
	FILL_FIELD(runtime_event, pid, event, data);
	FILL_FIELD(runtime_event, runtime, event, data);
	FILL_FIELD(runtime_event, vruntime, event, data);

	if (trace_handler->runtime_event)
		trace_handler->runtime_event(&runtime_event, session, event, cpu, timestamp, thread);
}

static void
process_sched_fork_event(void *data,
			 struct event *event,
			 int cpu __used,
			 u64 timestamp __used,
			 struct thread *thread __used)
{
	struct trace_fork_event fork_event;

	FILL_COMMON_FIELDS(fork_event, event, data);

	FILL_ARRAY(fork_event, parent_comm, event, data);
	FILL_FIELD(fork_event, parent_pid, event, data);
	FILL_ARRAY(fork_event, child_comm, event, data);
	FILL_FIELD(fork_event, child_pid, event, data);

	if (trace_handler->fork_event)
		trace_handler->fork_event(&fork_event, event,
					  cpu, timestamp, thread);
}

static void
process_sched_exit_event(struct event *event,
			 int cpu __used,
			 u64 timestamp __used,
			 struct thread *thread __used)
{
	if (verbose)
		printf("sched_exit event %p\n", event);
}

static void
process_sched_migrate_task_event(void *data, struct perf_session *session,
				 struct event *event,
				 int cpu __used,
				 u64 timestamp __used,
				 struct thread *thread __used)
{
	struct trace_migrate_task_event migrate_task_event;

	FILL_COMMON_FIELDS(migrate_task_event, event, data);

	FILL_ARRAY(migrate_task_event, comm, event, data);
	FILL_FIELD(migrate_task_event, pid, event, data);
	FILL_FIELD(migrate_task_event, prio, event, data);
	FILL_FIELD(migrate_task_event, cpu, event, data);

	if (trace_handler->migrate_task_event)
		trace_handler->migrate_task_event(&migrate_task_event, session,
						  event, cpu, timestamp, thread);
}

static void process_raw_event(union perf_event *raw_event __used,
			      struct perf_session *session, void *data, int cpu,
			      u64 timestamp, struct thread *thread)
{
	struct event *event;
	int type;


	type = trace_parse_common_type(data);
	event = trace_find_event(type);

	if (!strcmp(event->name, "sched_switch"))
		process_sched_switch_event(data, session, event, cpu, timestamp, thread);
	if (!strcmp(event->name, "sched_stat_runtime"))
		process_sched_runtime_event(data, session, event, cpu, timestamp, thread);
	if (!strcmp(event->name, "sched_wakeup"))
		process_sched_wakeup_event(data, session, event, cpu, timestamp, thread);
	if (!strcmp(event->name, "sched_wakeup_new"))
		process_sched_wakeup_event(data, session, event, cpu, timestamp, thread);
	if (!strcmp(event->name, "sched_process_fork"))
		process_sched_fork_event(data, event, cpu, timestamp, thread);
	if (!strcmp(event->name, "sched_process_exit"))
		process_sched_exit_event(event, cpu, timestamp, thread);
	if (!strcmp(event->name, "sched_migrate_task"))
		process_sched_migrate_task_event(data, session, event, cpu, timestamp, thread);
}

static int process_sample_event(union perf_event *event,
				struct perf_sample *sample,
				struct perf_evsel *evsel __used,
				struct perf_session *session)
{
	struct thread *thread;

	if (!(session->sample_type & PERF_SAMPLE_RAW))
		return 0;

	thread = perf_session__findnew(session, sample->pid);
	if (thread == NULL) {
		pr_debug("problem processing %d event, skipping it.\n",
			 event->header.type);
		return -1;
	}

	dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);

	if (profile_cpu != -1 && profile_cpu != (int)sample->cpu)
		return 0;

	process_raw_event(event, session, sample->raw_data, sample->cpu,
			  sample->time, thread);

	return 0;
}

static struct perf_event_ops event_ops = {
	.sample = process_sample_event,
	.comm = perf_event__process_comm,
	.lost = perf_event__process_lost,
	.fork = perf_event__process_task,
	.ordered_samples = true,
};

static int read_events(void)
{
	int err = -EINVAL;
	struct perf_session *session = perf_session__new(input_name, O_RDONLY,
							 0, false, &event_ops);
	if (session == NULL)
		return -ENOMEM;

	if (perf_session__has_traces(session, "record -R")) {
		err = perf_session__process_events(session, &event_ops);
		nr_events = session->hists.stats.nr_events[0];
		nr_lost_events = session->hists.stats.total_lost;
		nr_lost_chunks = session->hists.stats.nr_events[PERF_RECORD_LOST];
	}

	perf_session__delete(session);
	return err;
}

static void print_bad_events(void)
{
	if (nr_unordered_timestamps && nr_timestamps) {
		printf(" INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
			(double)nr_unordered_timestamps/(double)nr_timestamps*100.0,
			nr_unordered_timestamps, nr_timestamps);
	}
	if (nr_lost_events && nr_events) {
		printf(" INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
			(double)nr_lost_events/(double)nr_events*100.0,
			nr_lost_events, nr_events, nr_lost_chunks);
	}
	if (nr_state_machine_bugs && nr_timestamps) {
		printf(" INFO: %.3f%% state machine bugs (%ld out of %ld)",
			(double)nr_state_machine_bugs/(double)nr_timestamps*100.0,
			nr_state_machine_bugs, nr_timestamps);
		if (nr_lost_events)
			printf(" (due to lost events?)");
		printf("\n");
	}
	if (nr_context_switch_bugs && nr_timestamps) {
		printf(" INFO: %.3f%% context switch bugs (%ld out of %ld)",
			(double)nr_context_switch_bugs/(double)nr_timestamps*100.0,
			nr_context_switch_bugs, nr_timestamps);
		if (nr_lost_events)
			printf(" (due to lost events?)");
		printf("\n");
	}
}

static void __cmd_lat(void)
{
	struct rb_node *next;

	setup_pager();
	read_events();
	sort_lat();

	printf("\n ---------------------------------------------------------------------------------------------------------------\n");
	printf(" Task | Runtime ms | Switches | Average delay ms | Maximum delay ms | Maximum delay at |\n");
	printf(" ---------------------------------------------------------------------------------------------------------------\n");

	next = rb_first(&sorted_atom_root);

	while (next) {
		struct work_atoms *work_list;

		work_list = rb_entry(next, struct work_atoms, node);
		output_lat_thread(work_list);
		next = rb_next(next);
	}

	printf(" -----------------------------------------------------------------------------------------\n");
	printf(" TOTAL: |%11.3f ms |%9" PRIu64 " |\n",
		(double)all_runtime/1e6, all_count);

	printf(" ---------------------------------------------------\n");

	print_bad_events();
	printf("\n");

}

static struct trace_sched_handler map_ops = {
	.wakeup_event = NULL,
	.switch_event = map_switch_event,
	.runtime_event = NULL,
	.fork_event = NULL,
};

static void __cmd_map(void)
{
	max_cpu = sysconf(_SC_NPROCESSORS_CONF);

	setup_pager();
	read_events();
	print_bad_events();
}

static void __cmd_replay(void)
{
	unsigned long i;

	calibrate_run_measurement_overhead();
	calibrate_sleep_measurement_overhead();

	test_calibrations();

	read_events();

	printf("nr_run_events: %ld\n", nr_run_events);
	printf("nr_sleep_events: %ld\n", nr_sleep_events);
	printf("nr_wakeup_events: %ld\n", nr_wakeup_events);

	if (targetless_wakeups)
		printf("target-less wakeups: %ld\n", targetless_wakeups);
	if (multitarget_wakeups)
		printf("multi-target wakeups: %ld\n", multitarget_wakeups);
	if (nr_run_events_optimized)
		printf("run atoms optimized: %ld\n",
			nr_run_events_optimized);

	print_task_traces();
	add_cross_task_wakeups();

	create_tasks();
	printf("------------------------------------------------------------\n");
	for (i = 0; i < replay_repeat; i++)
		run_one_test();
}


static const char * const sched_usage[] = {
	"perf sched [<options>] {record|latency|map|replay|trace}",
	NULL
};

static const struct option sched_options[] = {
	OPT_STRING('i', "input", &input_name, "file",
		   "input file name"),
	OPT_INCR('v', "verbose", &verbose,
		 "be more verbose (show symbol address, etc)"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_END()
};

static const char * const latency_usage[] = {
	"perf sched latency [<options>]",
	NULL
};

static const struct option latency_options[] = {
	OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
		   "sort by key(s): runtime, switch, avg, max"),
	OPT_INCR('v', "verbose", &verbose,
		 "be more verbose (show symbol address, etc)"),
	OPT_INTEGER('C', "CPU", &profile_cpu,
		    "CPU to profile on"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_END()
};

static const char * const replay_usage[] = {
	"perf sched replay [<options>]",
	NULL
};

static const struct option replay_options[] = {
	OPT_UINTEGER('r', "repeat", &replay_repeat,
		     "repeat the workload replay N times (-1: infinite)"),
	OPT_INCR('v', "verbose", &verbose,
		 "be more verbose (show symbol address, etc)"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_END()
};

static void setup_sorting(void)
{
	char *tmp, *tok, *str = strdup(sort_order);

	for (tok = strtok_r(str, ", ", &tmp);
			tok; tok = strtok_r(NULL, ", ", &tmp)) {
		if (sort_dimension__add(tok, &sort_list) < 0) {
			error("Unknown --sort key: `%s'", tok);
			usage_with_options(latency_usage, latency_options);
		}
	}

	free(str);

	sort_dimension__add("pid", &cmp_pid);
}

static const char *record_args[] = {
	"record",
	"-a",
	"-R",
	"-f",
	"-m", "1024",
	"-c", "1",
	"-e", "sched:sched_switch",
	"-e", "sched:sched_stat_wait",
	"-e", "sched:sched_stat_sleep",
	"-e", "sched:sched_stat_iowait",
	"-e", "sched:sched_stat_runtime",
	"-e", "sched:sched_process_exit",
	"-e", "sched:sched_process_fork",
	"-e", "sched:sched_wakeup",
	"-e", "sched:sched_migrate_task",
};

static int __cmd_record(int argc, const char **argv)
{
	unsigned int rec_argc, i, j;
	const char **rec_argv;

	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
	rec_argv = calloc(rec_argc + 1, sizeof(char *));

	if (rec_argv == NULL)
		return -ENOMEM;

	for (i = 0; i < ARRAY_SIZE(record_args); i++)
		rec_argv[i] = strdup(record_args[i]);

	for (j = 1; j < (unsigned int)argc; j++, i++)
		rec_argv[i] = argv[j];

	BUG_ON(i != rec_argc);

	return cmd_record(i, rec_argv, NULL);
}

int cmd_sched(int argc, const char **argv, const char *prefix __used)
{
	argc = parse_options(argc, argv, sched_options, sched_usage,
			     PARSE_OPT_STOP_AT_NON_OPTION);
	if (!argc)
		usage_with_options(sched_usage, sched_options);

	/*
	 * Aliased to 'perf script' for now:
	 */
	if (!strcmp(argv[0], "script"))
		return cmd_script(argc, argv, prefix);

	symbol__init();
	if (!strncmp(argv[0], "rec", 3)) {
		return __cmd_record(argc, argv);
	} else if (!strncmp(argv[0], "lat", 3)) {
		trace_handler = &lat_ops;
		if (argc > 1) {
			argc = parse_options(argc, argv, latency_options, latency_usage, 0);
			if (argc)
				usage_with_options(latency_usage, latency_options);
		}
		setup_sorting();
		__cmd_lat();
	} else if (!strcmp(argv[0], "map")) {
		trace_handler = &map_ops;
		setup_sorting();
		__cmd_map();
	} else if (!strncmp(argv[0], "rep", 3)) {
		trace_handler = &replay_ops;
		if (argc) {
			argc = parse_options(argc, argv, replay_options, replay_usage, 0);
			if (argc)
				usage_with_options(replay_usage, replay_options);
		}
		__cmd_replay();
	} else {
		usage_with_options(sched_usage, sched_options);
	}

	return 0;
}