Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
allendowney
GitHub Repository: allendowney/cpython
Path: blob/main/Tools/scripts/summarize_stats.py
12 views
1
"""Print a summary of specialization stats for all files in the
2
default stats folders.
3
"""
4
5
import argparse
6
import collections
7
import json
8
import os.path
9
import opcode
10
from datetime import date
11
import itertools
12
import sys
13
14
# Default location of the raw pystats files written by an interpreter
# built with --enable-pystats.
if os.name == "nt":
    # Windows
    DEFAULT_DIR = "c:\\temp\\py_stats\\"
else:
    # POSIX
    DEFAULT_DIR = "/tmp/py_stats/"
18
19
# Create the list of all instruction names, substituting the names of
# specialized instructions for the "<NNN>" placeholder slots in opcode.opname.
specialized = iter(opcode._specialized_instructions)
opname = ["<0>"]
for name in opcode.opname[1:]:
    if name.startswith("<"):
        # Unused slot: take the next specialized instruction name instead.
        try:
            name = next(specialized)
        except StopIteration:
            # Ran out of specialized names; keep the "<NNN>" placeholder.
            pass
    opname.append(name)

# opcode_name --> opcode
# Sort alphabetically.
opmap = {name: i for i, name in enumerate(opname)}
opmap = dict(sorted(opmap.items()))

# Counter keys whose values sum to a family's total instruction executions.
TOTAL = "specialization.hit", "specialization.miss", "execution_count"
36
37
def format_ratio(num, den):
    """
    Render num/den as a percentage string with one decimal place.
    Returns the empty string when the denominator is zero.
    """
    return "" if den == 0 else f"{num/den:.01%}"
46
47
def join_rows(a_rows, b_rows):
    """
    Produce one table from two tables that share a key in their first column.

    Rows are matched on that key; a key missing from one side is padded with
    empty strings.  A's row order is preserved, with B-only keys appended.
    Raises ValueError for ragged tables, mismatched widths or duplicate keys.
    """
    if not a_rows and not b_rows:
        return []

    def table_width(rows, label):
        # Every row in one table must have the same number of columns.
        widths = {len(row) for row in rows}
        if len(widths) != 1:
            raise ValueError(f"Table {label} is ragged")
        return widths.pop()

    a_width = table_width(a_rows, "a") if a_rows else None
    b_width = table_width(b_rows, "b") if b_rows else None

    if a_width is not None and b_width is not None and a_width != b_width:
        raise ValueError("Tables have different widths")

    ncols = a_width if a_width is not None else b_width
    padding = [""] * (ncols - 1)

    a_data = {row[0]: row[1:] for row in a_rows}
    b_data = {row[0]: row[1:] for row in b_rows}
    if len(a_data) != len(a_rows) or len(b_data) != len(b_rows):
        raise ValueError("Duplicate keys")

    # Preserve A's ordering, then append keys that only appear in B.
    keys = list(a_data) + [key for key in b_data if key not in a_data]
    return [
        (key, *a_data.get(key, padding), *b_data.get(key, padding))
        for key in keys
    ]
84
85
def calculate_specialization_stats(family_stats, total):
    """Build (kind, count, ratio) rows for one family's specialization
    counters, with ratios relative to *total* executions."""
    rows = []
    for key in sorted(family_stats):
        # Counters reported in other tables (attempts, failure kinds, pairs)
        # are skipped here.
        if key.startswith("specialization.failure_kinds") or key.startswith("pair"):
            continue
        if key in ("execution_count", "specialization.success",
                   "specialization.failure", "specializable"):
            continue
        if key in ("specialization.hit", "specialization.miss"):
            label = key.removeprefix("specialization.")
        else:
            label = key
        count = family_stats[key]
        rows.append((f"{label:>12}", f"{count:>12}", format_ratio(count, total)))
    return rows
102
103
def calculate_specialization_success_failure(family_stats):
    """Rows for specialization attempt outcomes (Success / Failure).

    Returns an empty list when no attempts were recorded.
    """
    outcome_keys = ("specialization.success", "specialization.failure")
    total_attempts = sum(family_stats.get(key, 0) for key in outcome_keys)
    if not total_attempts:
        return []
    rows = []
    for key in outcome_keys:
        label = key.removeprefix("specialization.").capitalize()
        count = family_stats.get(key, 0)
        rows.append((label, count, format_ratio(count, total_attempts)))
    return rows
115
116
def calculate_specialization_failure_kinds(name, family_stats, defines):
    """Rows describing why specialization of *name* failed, most frequent
    first; zero-count kinds are omitted."""
    total_failures = family_stats.get("specialization.failure", 0)
    # Indexed by failure-kind number; 40 slots covers all defined kinds.
    counts_by_kind = [0] * 40
    for key, value in family_stats.items():
        if key.startswith("specialization.failure_kind"):
            # Key looks like "specialization.failure_kinds[NN]".
            kind = int(key[:-1].partition("[")[2])
            counts_by_kind[kind] = value
    ranked = sorted(
        ((count, kind) for kind, count in enumerate(counts_by_kind)),
        reverse=True,
    )
    return [
        (kind_to_text(kind, defines, name), count,
         format_ratio(count, total_failures))
        for count, kind in ranked
        if count
    ]
133
134
def print_specialization_stats(name, family_stats, defines):
    """Emit the per-family specialization section for one instruction.

    Families that are not specializable, or that never executed, get no
    section at all.
    """
    if "specializable" not in family_stats:
        return
    total = sum(family_stats.get(kind, 0) for kind in TOTAL)
    if not total:
        return
    with Section(name, 3, f"specialization stats for {name} family"):
        emit_table(("Kind", "Count", "Ratio"),
                   calculate_specialization_stats(family_stats, total))
        attempt_rows = calculate_specialization_success_failure(family_stats)
        # Attempt and failure-kind tables appear only when attempts occurred.
        if attempt_rows:
            print_title("Specialization attempts", 4)
            emit_table(("", "Count:", "Ratio:"), attempt_rows)
            failure_rows = calculate_specialization_failure_kinds(
                name, family_stats, defines)
            emit_table(("Failure kind", "Count:", "Ratio:"), failure_rows)
149
150
def print_comparative_specialization_stats(name, base_family_stats, head_family_stats, defines):
    """Emit the per-family specialization section comparing base and head runs.

    Mirrors print_specialization_stats(), but every table carries
    side-by-side base/head columns joined on the first column.
    """
    # Only instruction families marked "specializable" in the base run get a
    # section.
    if "specializable" not in base_family_stats:
        return

    base_total = sum(base_family_stats.get(kind, 0) for kind in TOTAL)
    head_total = sum(head_family_stats.get(kind, 0) for kind in TOTAL)
    # Skip families that executed in neither run.
    if base_total + head_total == 0:
        return
    with Section(name, 3, f"specialization stats for {name} family"):
        base_rows = calculate_specialization_stats(base_family_stats, base_total)
        head_rows = calculate_specialization_stats(head_family_stats, head_total)
        emit_table(
            ("Kind", "Base Count", "Base Ratio", "Head Count", "Head Ratio"),
            join_rows(base_rows, head_rows)
        )
        base_rows = calculate_specialization_success_failure(base_family_stats)
        head_rows = calculate_specialization_success_failure(head_family_stats)
        rows = join_rows(base_rows, head_rows)
        # Attempt/failure tables only appear when at least one run recorded
        # specialization attempts.
        if rows:
            print_title("Specialization attempts", 4)
            emit_table(("", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"), rows)
            base_rows = calculate_specialization_failure_kinds(name, base_family_stats, defines)
            head_rows = calculate_specialization_failure_kinds(name, head_family_stats, defines)
            emit_table(
                ("Failure kind", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"),
                join_rows(base_rows, head_rows)
            )
177
178
def gather_stats(input):
    """
    Load stats from *input*.

    If *input* is a file, it is parsed as the JSON output of a previous
    --json-output run.  If it is a directory, every file in it is read as
    raw "key: value" pystats lines and the values are summed across files;
    the number of files read is recorded under "__nfiles__".

    Raises ValueError if *input* is neither a file nor a directory.
    Note: the output of this function must be JSON-serializable.
    """
    if os.path.isfile(input):
        with open(input, "r") as fd:
            return json.load(fd)
    elif os.path.isdir(input):
        stats = collections.Counter()
        for filename in os.listdir(input):
            with open(os.path.join(input, filename)) as fd:
                for line in fd:
                    try:
                        key, value = line.split(":")
                    except ValueError:
                        # Name the offending file so bad data can be traced.
                        print(
                            f"Unparsable line: '{line.strip()}' in {filename}",
                            file=sys.stderr,
                        )
                        continue
                    stats[key.strip()] += int(value)
            stats['__nfiles__'] += 1
        return stats
    else:
        # Was f"{input:r}", an invalid format spec that raised
        # "Invalid format specifier" instead of showing the path;
        # !r is the repr conversion.
        raise ValueError(f"{input!r} is not a file or directory path")
201
202
def extract_opcode_stats(stats):
    """Split the flat stats mapping into a 256-entry per-opcode list of dicts.

    A key of the form "opcode[N].field" lands in slot N under key "field".
    """
    per_opcode = [{} for _ in range(256)]
    for key, value in stats.items():
        if not key.startswith("opcode"):
            continue
        # key is "opcode[N].field"; len("opcode[") == 7.
        number, _, field = key[7:].partition("]")
        per_opcode[int(number)][field.strip(".")] = value
    return per_opcode
210
211
def parse_kinds(spec_src, prefix="SPEC_FAIL"):
    """
    Scan C source lines for "#define <prefix>_NAME VALUE" and return a
    mapping from VALUE (as int) to the list of NAMEs defined with it.
    """
    marker = f"#define {prefix}_"
    defines = collections.defaultdict(list)
    for raw_line in spec_src:
        stripped = raw_line.strip()
        if not stripped.startswith(marker):
            continue
        name, value = stripped[len(marker):].split()
        defines[int(value.strip())].append(name.strip())
    return defines
222
223
def pretty(defname):
    """Turn a C define name like "OUT_OF_RANGE" into "out of range"."""
    return defname.lower().replace("_", " ")
225
226
def kind_to_text(kind, defines, opname):
    """Translate a failure-kind number into readable text for *opname*."""
    # Kinds 0-8 are shared by all instruction families.
    if kind <= 8:
        return pretty(defines[kind][0])
    # Family-specific kinds use the define prefix of a canonical family name.
    if opname == "LOAD_SUPER_ATTR":
        opname = "SUPER"
    elif opname.endswith("ATTR"):
        opname = "ATTR"
    elif opname in ("FOR_ITER", "SEND"):
        opname = "ITER"
    elif opname.endswith("SUBSCR"):
        opname = "SUBSCR"
    for candidate in defines[kind]:
        if candidate.startswith(opname):
            # Drop "<OPNAME>_" from the front of the define name.
            return pretty(candidate[len(opname) + 1:])
    return f"kind {kind}"
241
242
def categorized_counts(opcode_stats):
    """Return (basic, not_specialized, specialized) execution counts summed
    over all opcodes."""
    basic = 0
    not_specialized = 0
    specialized_total = 0
    # Specialized instruction forms; names containing "__" are internal
    # pseudo-instructions and are not counted as specialized.
    specialized_names = {
        op for op in opcode._specialized_instructions if "__" not in op
    }
    for index, stat in enumerate(opcode_stats):
        if "execution_count" not in stat:
            continue
        count = stat["execution_count"]
        name = opname[index]
        if "specializable" in stat:
            # Adaptive instructions that stayed unspecialized.
            not_specialized += count
        elif name in specialized_names:
            # Misses took the de-optimized path, so they count as
            # not-specialized executions.
            miss = stat.get("specialization.miss", 0)
            not_specialized += miss
            specialized_total += count - miss
        else:
            basic += count
    return basic, not_specialized, specialized_total
263
264
def print_title(name, level=2):
265
print("#"*level, name)
266
print()
267
268
class Section:
    """Context manager that prints a Markdown heading plus an HTML
    <details>/<summary> wrapper around whatever its body prints."""

    def __init__(self, title, level=2, summary=None):
        # title: heading text; level: Markdown heading depth;
        # summary: text for the <summary> tag (defaults to the lowercased
        # title).
        self.title = title
        self.level = level
        if summary is None:
            self.summary = title.lower()
        else:
            self.summary = summary

    def __enter__(self):
        print_title(self.title, self.level)
        print("<details>")
        print("<summary>", self.summary, "</summary>")
        print()
        return self

    def __exit__(*args):
        # NOTE: no explicit `self` — it is absorbed into *args together with
        # the exception triple, all of which are ignored.
        print()
        print("</details>")
        print()
289
290
def to_str(x):
    """Render ints with thousands separators; everything else via str()."""
    if isinstance(x, int):
        return f"{x:,d}"
    return str(x)
295
296
def emit_table(header, rows):
    """
    Print a Markdown table.

    A header item ending in ":" is emitted right-aligned: the ":" is moved
    into the underline row and stripped from the displayed title.  Raises
    ValueError if any row's length differs from the header's.
    """
    width = len(header)
    header_line = "|"
    under_line = "|"
    for item in header:
        under = "---"
        if item.endswith(":"):
            item = item[:-1]
            under += ":"
        header_line += item + " | "
        under_line += under + "|"
    print(header_line)
    print(under_line)
    for row in rows:
        # `width` was previously also guarded by `width is not None`, but it
        # is always an int here; the dead check has been dropped.
        if len(row) != width:
            raise ValueError("Wrong number of elements in row '" + str(row) + "'")
        print("|", " | ".join(to_str(i) for i in row), "|")
    print()
314
315
def calculate_execution_counts(opcode_stats, total):
    """Rows of (name, count, self%, cumulative%, miss%) sorted by count,
    descending.  Ratios are relative to *total* executions."""
    counts = []
    for index, stat in enumerate(opcode_stats):
        if "execution_count" not in stat:
            continue
        count = stat["execution_count"]
        # Adaptive ("specializable") instructions have no miss counter here.
        miss = 0
        if "specializable" not in stat:
            miss = stat.get("specialization.miss")
        counts.append((count, opname[index], miss))
    counts.sort(reverse=True)
    rows = []
    cumulative = 0
    for count, name, miss in counts:
        cumulative += count
        miss_ratio = format_ratio(miss, count) if miss else ""
        rows.append(
            (name, count, format_ratio(count, total),
             format_ratio(cumulative, total), miss_ratio)
        )
    return rows
336
337
def emit_execution_counts(opcode_stats, total):
    """Emit the "Execution counts" section for a single run."""
    with Section("Execution counts", summary="execution counts for all instructions"):
        emit_table(
            ("Name", "Count:", "Self:", "Cumulative:", "Miss ratio:"),
            calculate_execution_counts(opcode_stats, total),
        )
344
345
def emit_comparative_execution_counts(
    base_opcode_stats, base_total, head_opcode_stats, head_total
):
    """Emit the "Execution counts" section comparing base and head runs,
    sorted by the magnitude of the relative change."""
    with Section("Execution counts", summary="execution counts for all instructions"):
        base_rows = calculate_execution_counts(base_opcode_stats, base_total)
        head_rows = calculate_execution_counts(head_opcode_stats, head_total)
        # Key each run's rows by opcode name; the remaining columns are
        # (count, self%, cumulative%, miss%).
        base_data = dict((x[0], x[1:]) for x in base_rows)
        head_data = dict((x[0], x[1:]) for x in head_rows)
        opcodes = set(base_data.keys()) | set(head_data.keys())

        rows = []
        default = [0, "0.0%", "0.0%", 0]
        for opcode in opcodes:
            base_entry = base_data.get(opcode, default)
            head_entry = head_data.get(opcode, default)
            if base_entry[0] == 0:
                # Opcode absent from the base run: report a 100% increase.
                change = 1
            else:
                change = (head_entry[0] - base_entry[0]) / base_entry[0]
            rows.append(
                (opcode, base_entry[0], head_entry[0],
                 f"{100*change:0.1f}%"))

        # Largest absolute percentage change first.
        rows.sort(key=lambda x: -abs(float(x[-1][:-1])))

        emit_table(
            ("Name", "Base Count:", "Head Count:", "Change:"),
            rows
        )
374
375
def get_defines():
    """Parse the SPEC_FAIL kind defines out of Python/specialize.c."""
    spec_path = os.path.join(os.path.dirname(__file__), "../../Python/specialize.c")
    with open(spec_path) as src:
        return parse_kinds(src)
380
381
def emit_specialization_stats(opcode_stats):
    """Emit per-family specialization sections for a single run."""
    defines = get_defines()
    with Section("Specialization stats", summary="specialization stats by family"):
        for index, stat in enumerate(opcode_stats):
            print_specialization_stats(opname[index], stat, defines)
387
388
def emit_comparative_specialization_stats(base_opcode_stats, head_opcode_stats):
    """Emit per-family specialization sections comparing two runs."""
    defines = get_defines()
    with Section("Specialization stats", summary="specialization stats by family"):
        pairs = zip(base_opcode_stats, head_opcode_stats)
        for index, (base_stat, head_stat) in enumerate(pairs):
            print_comparative_specialization_stats(
                opname[index], base_stat, head_stat, defines
            )
394
395
def calculate_specialization_effectiveness(opcode_stats, total):
    """Rows categorizing all executions as basic / not specialized /
    specialized, with ratios against *total*."""
    basic, not_specialized, specialized = categorized_counts(opcode_stats)
    return [
        (label, count, format_ratio(count, total))
        for label, count in (
            ("Basic", basic),
            ("Not specialized", not_specialized),
            ("Specialized", specialized),
        )
    ]
402
403
def emit_specialization_overview(opcode_stats, total):
    """Emit the "Specialization effectiveness" section, including top-10
    tables of deferred executions and misses by instruction."""
    with Section("Specialization effectiveness"):
        rows = calculate_specialization_effectiveness(opcode_stats, total)
        emit_table(("Instructions", "Count:", "Ratio:"), rows)
        for title, field in (("Deferred", "specialization.deferred"), ("Misses", "specialization.miss")):
            # NOTE: rebinds the `total` parameter — from here on it is the
            # running total of the current counter, not total executions.
            total = 0
            counts = []
            for i, opcode_stat in enumerate(opcode_stats):
                # Avoid double counting misses
                if title == "Misses" and "specializable" in opcode_stat:
                    continue
                value = opcode_stat.get(field, 0)
                counts.append((value, opname[i]))
                total += value
            counts.sort(reverse=True)
            if total:
                with Section(f"{title} by instruction", 3):
                    rows = [ (name, count, format_ratio(count, total)) for (count, name) in counts[:10] ]
                    emit_table(("Name", "Count:", "Ratio:"), rows)
422
423
def emit_comparative_specialization_overview(base_opcode_stats, base_total, head_opcode_stats, head_total):
    """Emit the specialization-effectiveness table comparing two runs."""
    with Section("Specialization effectiveness"):
        emit_table(
            ("Instructions", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"),
            join_rows(
                calculate_specialization_effectiveness(base_opcode_stats, base_total),
                calculate_specialization_effectiveness(head_opcode_stats, head_total),
            ),
        )
431
432
def get_stats_defines():
    """Parse the EVAL_CALL kind defines out of Include/pystats.h."""
    stats_path = os.path.join(os.path.dirname(__file__), "../../Include/pystats.h")
    with open(stats_path) as src:
        return parse_kinds(src, prefix="EVAL_CALL")
437
438
def calculate_call_stats(stats):
    """Rows for call and frame statistics.

    Ratios are relative to the sum of all "Calls to ..." counters.
    """
    defines = get_stats_defines()
    # The denominator counts only "Calls to ..." entries.
    total = sum(value for key, value in stats.items() if "Calls to" in key)
    rows = []
    for key, value in stats.items():
        if "Calls to" in key:
            rows.append((key, value, format_ratio(value, total)))
        elif key.startswith("Calls "):
            # Key looks like "Calls via WHAT[NN]"; translate NN through the
            # EVAL_CALL defines.
            name, index = key[:-1].split("[")
            label = f"{name} ({pretty(defines[int(index)][0])})"
            rows.append((label, value, format_ratio(value, total)))
    # Frame counters go after all call counters.
    for key, value in stats.items():
        if key.startswith("Frame"):
            rows.append((key, value, format_ratio(value, total)))
    return rows
457
458
def emit_call_stats(stats):
    """Emit the "Call stats" section for a single run."""
    with Section("Call stats", summary="Inlined calls and frame stats"):
        emit_table(("", "Count:", "Ratio:"), calculate_call_stats(stats))
462
463
def emit_comparative_call_stats(base_stats, head_stats):
    """Emit the "Call stats" section comparing two runs, sorted by the head
    ratio, descending."""
    with Section("Call stats", summary="Inlined calls and frame stats"):
        base_rows = calculate_call_stats(base_stats)
        head_rows = calculate_call_stats(head_stats)
        rows = join_rows(base_rows, head_rows)

        def sort_key(row):
            # The last column is the head ratio, e.g. "12.3%".  Keys present
            # only in the base table are padded with "" by join_rows (and
            # format_ratio returns "" for a zero denominator); treat those as
            # 0 instead of crashing on float("").
            ratio = row[-1]
            return -float(ratio[:-1]) if ratio else 0.0

        rows.sort(key=sort_key)
        emit_table(
            ("", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"),
            rows
        )
473
474
def calculate_object_stats(stats):
    """
    Rows for object allocation/incref/decref counters.

    Each counter's ratio is taken against its matching total; counters whose
    total is zero get an empty ratio.
    """
    # Default the denominators to 0 (format_ratio then yields "") rather than
    # None, which would make the ratio computation crash on partial stats
    # files that lack one of these keys.
    total_materializations = stats.get("Object new values", 0)
    total_allocations = (stats.get("Object allocations", 0)
                         + stats.get("Object allocations from freelist", 0))
    total_increfs = (stats.get("Object interpreter increfs", 0)
                     + stats.get("Object increfs", 0))
    total_decrefs = (stats.get("Object interpreter decrefs", 0)
                     + stats.get("Object decrefs", 0))
    rows = []
    for key, value in stats.items():
        if not key.startswith("Object"):
            continue
        if "materialize" in key:
            ratio = format_ratio(value, total_materializations)
        elif "allocations" in key:
            ratio = format_ratio(value, total_allocations)
        elif "increfs" in key:
            ratio = format_ratio(value, total_increfs)
        elif "decrefs" in key:
            ratio = format_ratio(value, total_decrefs)
        else:
            ratio = ""
        # Strip the "Object" prefix and capitalize the remainder for display.
        label = key[6:].strip()
        label = label[0].upper() + label[1:]
        rows.append((label, value, ratio))
    return rows
496
497
def emit_object_stats(stats):
    """Emit the "Object stats" section for a single run."""
    with Section("Object stats", summary="allocations, frees and dict materializatons"):
        emit_table(("", "Count:", "Ratio:"), calculate_object_stats(stats))
501
502
def emit_comparative_object_stats(base_stats, head_stats):
    """Emit the "Object stats" section comparing two runs."""
    with Section("Object stats", summary="allocations, frees and dict materializatons"):
        emit_table(
            ("", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"),
            join_rows(calculate_object_stats(base_stats),
                      calculate_object_stats(head_stats)),
        )
507
508
def get_total(opcode_stats):
    """Sum of execution counts over all opcodes."""
    return sum(
        stat["execution_count"]
        for stat in opcode_stats
        if "execution_count" in stat
    )
514
515
def emit_pair_counts(opcode_stats, total):
    """Emit the "Pair counts" and "Predecessor/Successor Pairs" sections.

    Pair counters are stored on the first opcode of a pair under keys of the
    form "pair_count[J]", where J is the second opcode.
    """
    pair_counts = []
    for i, opcode_stat in enumerate(opcode_stats):
        if i == 0:
            # Slot 0 is the "<0>" placeholder, never a real instruction.
            continue
        for key, value in opcode_stat.items():
            if key.startswith("pair_count"):
                # key is "pair_count[J]"; len("pair_count[") == 11.
                x, _, _ = key[11:].partition("]")
                if value:
                    pair_counts.append((value, (i, int(x))))
    with Section("Pair counts", summary="Pair counts for top 100 pairs"):
        pair_counts.sort(reverse=True)
        cumulative = 0
        rows = []
        for (count, pair) in itertools.islice(pair_counts, 100):
            i, j = pair
            cumulative += count
            rows.append((opname[i] + " " + opname[j], count, format_ratio(count, total),
                        format_ratio(cumulative, total)))
        emit_table(("Pair", "Count:", "Self:", "Cumulative:"),
            rows
        )
    with Section("Predecessor/Successor Pairs", summary="Top 5 predecessors and successors of each opcode"):
        predecessors = collections.defaultdict(collections.Counter)
        successors = collections.defaultdict(collections.Counter)
        total_predecessors = collections.Counter()
        total_successors = collections.Counter()
        for count, (first, second) in pair_counts:
            if count:
                predecessors[second][first] = count
                successors[first][second] = count
                total_predecessors[second] += count
                total_successors[first] += count
        # Iterate opmap (sorted alphabetically) so sections come out ordered
        # by instruction name.
        for name, i in opmap.items():
            total1 = total_predecessors[i]
            total2 = total_successors[i]
            if total1 == 0 and total2 == 0:
                continue
            pred_rows = succ_rows = ()
            if total1:
                pred_rows = [(opname[pred], count, f"{count/total1:.1%}")
                             for (pred, count) in predecessors[i].most_common(5)]
            if total2:
                succ_rows = [(opname[succ], count, f"{count/total2:.1%}")
                             for (succ, count) in successors[i].most_common(5)]
            with Section(name, 3, f"Successors and predecessors for {name}"):
                emit_table(("Predecessors", "Count:", "Percentage:"),
                    pred_rows
                )
                emit_table(("Successors", "Count:", "Percentage:"),
                    succ_rows
                )
567
568
def output_single_stats(stats):
    """Print the full Markdown report for a single stats collection."""
    opcode_stats = extract_opcode_stats(stats)
    total = get_total(opcode_stats)
    emit_execution_counts(opcode_stats, total)
    emit_pair_counts(opcode_stats, total)
    emit_specialization_stats(opcode_stats)
    emit_specialization_overview(opcode_stats, total)
    emit_call_stats(stats)
    emit_object_stats(stats)
    with Section("Meta stats", summary="Meta statistics"):
        # "__nfiles__" is recorded by gather_stats() when reading a directory.
        emit_table(("", "Count:"), [('Number of data files', stats['__nfiles__'])])
579
580
581
def output_comparative_stats(base_stats, head_stats):
    """Print the full comparative Markdown report for two stats collections."""
    base_opcode_stats = extract_opcode_stats(base_stats)
    base_total = get_total(base_opcode_stats)

    head_opcode_stats = extract_opcode_stats(head_stats)
    head_total = get_total(head_opcode_stats)

    emit_comparative_execution_counts(
        base_opcode_stats, base_total, head_opcode_stats, head_total
    )
    emit_comparative_specialization_stats(
        base_opcode_stats, head_opcode_stats
    )
    emit_comparative_specialization_overview(
        base_opcode_stats, base_total, head_opcode_stats, head_total
    )
    emit_comparative_call_stats(base_stats, head_stats)
    emit_comparative_object_stats(base_stats, head_stats)
599
600
def output_stats(inputs, json_output=None):
    """Print the report for one input, or a comparative report for two.

    With a single input, the raw stats are optionally dumped to *json_output*
    first.  A trailing footer records the date the stats were summarized.
    """
    if len(inputs) == 1:
        stats = gather_stats(inputs[0])
        if json_output is not None:
            json.dump(stats, json_output)
        output_single_stats(stats)
    elif len(inputs) == 2:
        # JSON output would be ambiguous for a comparison, so it is refused.
        if json_output is not None:
            raise ValueError(
                "Can not output to JSON when there are multiple inputs"
            )

        base_stats = gather_stats(inputs[0])
        head_stats = gather_stats(inputs[1])
        output_comparative_stats(base_stats, head_stats)

    print("---")
    print("Stats gathered on:", date.today())
618
619
def main():
    """Command-line entry point: parse arguments and print the summary."""
    parser = argparse.ArgumentParser(description="Summarize pystats results")

    parser.add_argument(
        "inputs",
        nargs="*",
        type=str,
        default=[DEFAULT_DIR],
        help=f"""
        Input source(s).
        For each entry, if a .json file, the output provided by --json-output from a previous run;
        if a directory, a directory containing raw pystats .txt files.
        If one source is provided, its stats are printed.
        If two sources are provided, comparative stats are printed.
        Default is {DEFAULT_DIR}.
        """
    )

    parser.add_argument(
        "--json-output",
        nargs="?",
        type=argparse.FileType("w"),
        help="Output complete raw results to the given JSON file."
    )

    args = parser.parse_args()

    # argparse's nargs="*" accepts any count; enforce the 0-2 limit manually.
    if len(args.inputs) > 2:
        raise ValueError("0-2 arguments may be provided.")

    output_stats(args.inputs, json_output=args.json_output)
650
651
# Only run when executed as a script, not on import.
if __name__ == "__main__":
    main()
653
654