Path: blob/master/misc/scripts/unicode_ranges_fetch.py
9896 views
#!/usr/bin/env python3

# Script used to dump char ranges from
# the Unicode Character Database to the `unicode_ranges.inc` file.
# NOTE: This script is deliberately not integrated into the build system;
# you should run it manually whenever you want to update the data.

import os
import sys
from typing import Final, List, Set, Tuple
from urllib.request import urlopen

if __name__ == "__main__":
    # Make the repository root importable so `methods` resolves when the
    # script is run directly from `misc/scripts/`.
    sys.path.insert(1, os.path.join(os.path.dirname(__file__), "../../"))

from methods import generate_copyright_header

URL: Final[str] = "https://www.unicode.org/Public/16.0.0/ucd/Blocks.txt"


# Filled by `parse_unicode_data()` with `(start, end, block_name)` tuples,
# where `start` and `end` are "0x"-prefixed hexadecimal strings.
ranges: List[Tuple[str, str, str]] = []

# Block names that are skipped while parsing and never written to the output.
exclude_blocks: Set[str] = {
    "High Surrogates",
    "High Private Use Surrogates",
    "Low Surrogates",
    "Variation Selectors",
    "Specials",
    "Egyptian Hieroglyph Format Controls",
    "Tags",
    "Variation Selectors Supplement",
}


def parse_unicode_data() -> None:
    """Download `URL` and append its block ranges to the module-level `ranges`.

    Each data line is expected to look like `START..END; Block Name`.
    Lines starting with `#` and blank lines are ignored, as are blocks
    listed in `exclude_blocks`. Every remaining line is appended to `ranges`
    as `("0x" + START, "0x" + END, block_name)`.
    """
    # Use the response as a context manager so the connection is closed
    # even if decoding fails part-way through.
    with urlopen(URL) as response:
        lines: List[str] = [raw_line.decode("utf-8") for raw_line in response]

    for line in lines:
        if line.startswith("#") or not line.strip():
            continue

        split_line: List[str] = line.split(";")

        char_range: str = split_line[0].strip()
        block: str = split_line[1].strip()

        if block in exclude_blocks:
            continue

        range_start, range_end = char_range.split("..")

        ranges.append((f"0x{range_start}", f"0x{range_end}", block))


def make_array(array_name: str, ranges: List[Tuple[str, str, str]]) -> str:
    """Return the C++ source of a `static UniRange` array named `array_name`.

    One `{ start, end, U"block" },` entry is emitted per tuple in `ranges`,
    in order, followed by a final `{ 0x10FFFF, 0x10FFFF, String() }` entry
    and two trailing newlines.
    """
    entries: str = "".join(f'\t{{ {start}, {end}, U"{block}" }},\n' for start, end, block in ranges)

    return (
        f"static UniRange {array_name}[] = {{\n"
        f"{entries}"
        "\t{ 0x10FFFF, 0x10FFFF, String() }\n"
        "};\n\n"
    )


def generate_unicode_ranges_inc() -> None:
    """Fetch the block data and write `editor/import/unicode_ranges.inc`.

    The output path is resolved relative to this script's directory. The
    file consists of the copyright header, an include guard, the `UniRange`
    struct definition and the `unicode_ranges` array.
    """
    parse_unicode_data()

    source: str = generate_copyright_header("unicode_ranges.inc")

    source += f"""
// This file was generated using the `misc/scripts/unicode_ranges_fetch.py` script.

#ifndef UNICODE_RANGES_INC
#define UNICODE_RANGES_INC

// Unicode Character Blocks
// Source: {URL}

struct UniRange {{
\tint32_t start;
\tint32_t end;
\tString name;
}};\n\n"""

    source += make_array("unicode_ranges", ranges)

    source += "#endif // UNICODE_RANGES_INC\n"

    unicode_ranges_path: str = os.path.join(os.path.dirname(__file__), "../../editor/import/unicode_ranges.inc")
    # Pin the encoding so the generated file does not depend on the locale
    # of the machine running the script.
    with open(unicode_ranges_path, "w", encoding="utf-8", newline="\n") as f:
        f.write(source)

    print("`unicode_ranges.inc` generated successfully.")


if __name__ == "__main__":
    generate_unicode_ranges_inc()