CoCalc -- unicode_ranges

GitHub Repository: godotengine/godot
Path: blob/master/misc/scripts/unicode_ranges_fetch.py
⁹⁸⁹⁶ views
1
#!/usr/bin/env python3
2

3
# Script used to dump Unicode block ranges from
# the Unicode Character Database to the `unicode_ranges.inc` file.
# NOTE: This script is deliberately not integrated into the build system;
# you should run it manually whenever you want to update the data.
7

8
import os
9
import sys
10
from typing import Final, List, Set, Tuple
11
from urllib.request import urlopen
12

13
if __name__ == "__main__":
    # When run directly as a script, add the directory two levels above this
    # file to the import search path so that `methods` can be imported below.
    sys.path.insert(1, os.path.join(os.path.dirname(__file__), "../../"))
15

16
from methods import generate_copyright_header
17

18
# Location of the Unicode Character Database `Blocks.txt` file that is downloaded.
URL: Final[str] = "https://www.unicode.org/Public/16.0.0/ucd/Blocks.txt"


# Collected blocks as (start, end, name) tuples; start and end are stored as
# "0x"-prefixed hexadecimal strings. Filled in by `parse_unicode_data()`.
ranges: List[Tuple[str, str, str]] = []

# Names of blocks that are left out of the generated table.
exclude_blocks: Set[str] = {
    "High Surrogates",
    "High Private Use Surrogates",
    "Low Surrogates",
    "Variation Selectors",
    "Specials",
    "Egyptian Hieroglyph Format Controls",
    "Tags",
    "Variation Selectors Supplement",
}
33

34

35
def parse_unicode_data() -> None:
    """Download the file at ``URL`` and rebuild the module-level ``ranges`` list.

    Every data line is expected to look like ``START..END; Block Name``.
    Blank lines and lines starting with ``#`` are ignored, and blocks whose
    name appears in ``exclude_blocks`` are skipped. For each remaining block a
    ``("0xSTART", "0xEND", "Block Name")`` tuple is appended to ``ranges``.

    Raises:
        urllib.error.URLError: If the download fails.
        IndexError: If a data line contains no ``;`` separator.
        ValueError: If the range field is not exactly ``START..END``.
    """
    # Use the response as a context manager so the connection is always closed.
    with urlopen(URL) as response:
        lines: List[str] = [raw_line.decode("utf-8") for raw_line in response]

    # Start from an empty list so that calling this function more than once
    # does not leave duplicate entries behind.
    ranges.clear()

    for line in lines:
        stripped: str = line.strip()
        if not stripped or stripped.startswith("#"):
            continue

        split_line: List[str] = stripped.split(";")

        char_range: str = split_line[0].strip()
        block: str = split_line[1].strip()

        if block in exclude_blocks:
            continue

        range_start, range_end = char_range.split("..")

        ranges.append((f"0x{range_start}", f"0x{range_end}", block))
53

54

55
def make_array(array_name: str, ranges: List[Tuple[str, str, str]]) -> str:
    """Render ``ranges`` as the source text of a ``static UniRange`` array.

    Args:
        array_name: Identifier used for the generated array.
        ranges: ``(start, end, block_name)`` tuples. ``start`` and ``end`` are
            inserted verbatim; ``block_name`` is wrapped in a ``U"..."`` literal
            without any escaping.

    Returns:
        The array definition, one tab-indented entry per line, followed by a
        closing ``{ 0x10FFFF, 0x10FFFF, String() }`` entry and two newlines.
    """
    entries: List[str] = [f'\t{{ {start}, {end}, U"{block}" }},\n' for start, end, block in ranges]

    # Assemble all pieces in one pass instead of growing a string with `+=`.
    return "".join(
        [
            f"static UniRange {array_name}[] = {{\n",
            *entries,
            # NOTE(review): presumably an end-of-table marker for consumers - confirm.
            "\t{ 0x10FFFF, 0x10FFFF, String() }\n",
            "};\n\n",
        ]
    )
65

66

67
def generate_unicode_ranges_inc() -> None:
    """Fetch the block data and write ``editor/import/unicode_ranges.inc``.

    Calls ``parse_unicode_data()`` to fill ``ranges``, then builds the file
    from the copyright header returned by ``generate_copyright_header``, an
    include guard, the ``UniRange`` struct definition and the array produced
    by ``make_array``. The output path is resolved relative to this script's
    directory, and a confirmation message is printed once the file is written.
    """
    parse_unicode_data()

    source: str = generate_copyright_header("unicode_ranges.inc")

    source += f"""
// This file was generated using the `misc/scripts/unicode_ranges_fetch.py` script.

#ifndef UNICODE_RANGES_INC
#define UNICODE_RANGES_INC

// Unicode Character Blocks
// Source: {URL}

struct UniRange {{
\tint32_t start;
\tint32_t end;
\tString name;
}};\n\n"""

    source += make_array("unicode_ranges", ranges)

    source += "#endif // UNICODE_RANGES_INC\n"

    unicode_ranges_path: str = os.path.join(os.path.dirname(__file__), "../../editor/import/unicode_ranges.inc")
    # `newline="\n"` keeps LF line endings on every platform; the explicit
    # encoding keeps the output independent of the system locale.
    with open(unicode_ranges_path, "w", encoding="utf-8", newline="\n") as f:
        f.write(source)

    print("`unicode_ranges.inc` generated successfully.")
96

97

98
# Only regenerate the file when this module is executed as a script.
if __name__ == "__main__":
    generate_unicode_ranges_inc()
100

101
Product

Resources

Company