Path: blob/master/misc/scripts/unicode_ranges_fetch.py
20907 views
#!/usr/bin/env python312# Script used to dump char ranges from3# the Unicode Character Database to the `char_range.inc` file.4# NOTE: This script is deliberately not integrated into the build system;5# you should run it manually whenever you want to update the data.6from __future__ import annotations78import os9import sys10from typing import Final11from urllib.request import urlopen1213if __name__ == "__main__":14sys.path.insert(1, os.path.join(os.path.dirname(__file__), "../../"))1516from methods import generate_copyright_header1718URL: Final[str] = "https://www.unicode.org/Public/17.0.0/ucd/Blocks.txt"192021ranges: list[tuple[str, str, str]] = []2223exclude_blocks: set[str] = {24"High Surrogates",25"High Private Use Surrogates",26"Low Surrogates",27"Variation Selectors",28"Specials",29"Egyptian Hieroglyph Format Controls",30"Tags",31"Variation Selectors Supplement",32}333435def parse_unicode_data() -> None:36lines: list[str] = [line.decode("utf-8") for line in urlopen(URL)]3738for line in lines:39if line.startswith("#") or not line.strip():40continue4142split_line: list[str] = line.split(";")4344char_range: str = split_line[0].strip()45block: str = split_line[1].strip()4647if block in exclude_blocks:48continue4950range_start, range_end = char_range.split("..")5152ranges.append((f"0x{range_start}", f"0x{range_end}", block))535455def make_array(array_name: str, ranges: list[tuple[str, str, str]]) -> str:56result: str = f"static UniRange {array_name}[] = {{\n"5758for start, end, block in ranges:59result += f'\t{{ {start}, {end}, U"{block}" }},\n'6061result += """\t{ 0x10FFFF, 0x10FFFF, String() }62};\n\n"""6364return result656667def generate_unicode_ranges_inc() -> None:68parse_unicode_data()6970source: str = generate_copyright_header("unicode_ranges.inc")7172source += f"""73// This file was generated using the `misc/scripts/unicode_ranges_fetch.py` script.7475#ifndef UNICODE_RANGES_INC76#define UNICODE_RANGES_INC7778// Unicode Character Blocks79// Source: {URL}8081struct UniRange {{82\tint32_t start;83\tint32_t end;84\tString name;85}};\n\n"""8687source += make_array("unicode_ranges", ranges)8889source += "#endif // UNICODE_RANGES_INC\n"9091unicode_ranges_path: str = os.path.join(os.path.dirname(__file__), "../../editor/import/unicode_ranges.inc")92with open(unicode_ranges_path, "w", newline="\n") as f:93f.write(source)9495print("`unicode_ranges.inc` generated successfully.")969798if __name__ == "__main__":99generate_unicode_ranges_inc()100101102