Path: blob/main/Tools/build/generate_levenshtein_examples.py
12 views
"""Generate 10,000 unique examples for the Levenshtein short-circuit tests."""12import argparse3from functools import lru_cache4import json5import os.path6from random import choices, randrange789# This should be in sync with Lib/traceback.py. It's not importing those values10# because this script is being executed by PYTHON_FOR_REGEN and not by the in-tree11# build of Python.12_MOVE_COST = 213_CASE_COST = 1141516def _substitution_cost(ch_a, ch_b):17if ch_a == ch_b:18return 019if ch_a.lower() == ch_b.lower():20return _CASE_COST21return _MOVE_COST222324@lru_cache(None)25def levenshtein(a, b):26if not a or not b:27return (len(a) + len(b)) * _MOVE_COST28option1 = levenshtein(a[:-1], b[:-1]) + _substitution_cost(a[-1], b[-1])29option2 = levenshtein(a[:-1], b) + _MOVE_COST30option3 = levenshtein(a, b[:-1]) + _MOVE_COST31return min(option1, option2, option3)323334def main():35parser = argparse.ArgumentParser(description=__doc__)36parser.add_argument('output_path', metavar='FILE', type=str)37parser.add_argument('--overwrite', dest='overwrite', action='store_const',38const=True, default=False,39help='overwrite an existing test file')4041args = parser.parse_args()42output_path = os.path.realpath(args.output_path)43if not args.overwrite and os.path.isfile(output_path):44print(f"{output_path} already exists, skipping regeneration.")45print(46"To force, add --overwrite to the invocation of this tool or"47" delete the existing file."48)49return5051examples = set()52# Create a lot of non-empty examples, which should end up with a Gauss-like53# distribution for even costs (moves) and odd costs (case substitutions).54while len(examples) < 9990:55a = ''.join(choices("abcABC", k=randrange(1, 10)))56b = ''.join(choices("abcABC", k=randrange(1, 10)))57expected = levenshtein(a, b)58examples.add((a, b, expected))59# Create one empty case each for strings between 0 and 9 in length.60for i in range(10):61b = ''.join(choices("abcABC", k=i))62expected = levenshtein("", b)63examples.add(("", b, expected))64with open(output_path, "w") as f:65json.dump(sorted(examples), f, indent=2)666768if __name__ == "__main__":69main()707172