Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
allendowney
GitHub Repository: allendowney/cpython
Path: blob/main/Tools/build/generate_levenshtein_examples.py
12 views
1
"""Generate 10,000 unique examples for the Levenshtein short-circuit tests."""
2
3
import argparse
4
from functools import lru_cache
5
import json
6
import os.path
7
from random import choices, randrange
8
9
10
# This should be in sync with Lib/traceback.py. It's not importing those values
11
# because this script is being executed by PYTHON_FOR_REGEN and not by the in-tree
12
# build of Python.
13
_MOVE_COST = 2
14
_CASE_COST = 1
15
16
17
def _substitution_cost(ch_a, ch_b):
18
if ch_a == ch_b:
19
return 0
20
if ch_a.lower() == ch_b.lower():
21
return _CASE_COST
22
return _MOVE_COST
23
24
25
@lru_cache(None)
26
def levenshtein(a, b):
27
if not a or not b:
28
return (len(a) + len(b)) * _MOVE_COST
29
option1 = levenshtein(a[:-1], b[:-1]) + _substitution_cost(a[-1], b[-1])
30
option2 = levenshtein(a[:-1], b) + _MOVE_COST
31
option3 = levenshtein(a, b[:-1]) + _MOVE_COST
32
return min(option1, option2, option3)
33
34
35
def main():
36
parser = argparse.ArgumentParser(description=__doc__)
37
parser.add_argument('output_path', metavar='FILE', type=str)
38
parser.add_argument('--overwrite', dest='overwrite', action='store_const',
39
const=True, default=False,
40
help='overwrite an existing test file')
41
42
args = parser.parse_args()
43
output_path = os.path.realpath(args.output_path)
44
if not args.overwrite and os.path.isfile(output_path):
45
print(f"{output_path} already exists, skipping regeneration.")
46
print(
47
"To force, add --overwrite to the invocation of this tool or"
48
" delete the existing file."
49
)
50
return
51
52
examples = set()
53
# Create a lot of non-empty examples, which should end up with a Gauss-like
54
# distribution for even costs (moves) and odd costs (case substitutions).
55
while len(examples) < 9990:
56
a = ''.join(choices("abcABC", k=randrange(1, 10)))
57
b = ''.join(choices("abcABC", k=randrange(1, 10)))
58
expected = levenshtein(a, b)
59
examples.add((a, b, expected))
60
# Create one empty case each for strings between 0 and 9 in length.
61
for i in range(10):
62
b = ''.join(choices("abcABC", k=i))
63
expected = levenshtein("", b)
64
examples.add(("", b, expected))
65
with open(output_path, "w") as f:
66
json.dump(sorted(examples), f, indent=2)
67
68
69
if __name__ == "__main__":
70
main()
71
72