Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/misc/scripts/char_range_fetch.py
9896 views
1
#!/usr/bin/env python3
2
3
# Script used to dump char ranges for specific properties from
4
# the Unicode Character Database to the `char_range.inc` file.
5
# NOTE: This script is deliberately not integrated into the build system;
6
# you should run it manually whenever you want to update the data.
7
8
import os
9
import sys
10
from typing import Final, List, Tuple
11
from urllib.request import urlopen
12
13
if __name__ == "__main__":
14
sys.path.insert(1, os.path.join(os.path.dirname(__file__), "../../"))
15
16
from methods import generate_copyright_header
17
18
# Location of the Unicode Character Database file this script parses.
# The Unicode version is part of the path, so bump it here to update the data.
URL: Final[str] = "https://www.unicode.org/Public/16.0.0/ucd/DerivedCoreProperties.txt"


# Inclusive (first, last) code point ranges, one list per derived property.
# They start empty and are filled in by `parse_unicode_data()`.
xid_start: List[Tuple[int, int]] = []
xid_continue: List[Tuple[int, int]] = []
uppercase_letter: List[Tuple[int, int]] = []
lowercase_letter: List[Tuple[int, int]] = []
unicode_letter: List[Tuple[int, int]] = []
26
27
28
def merge_ranges(ranges: List[Tuple[int, int]]) -> None:
    """Coalesce touching or overlapping inclusive ranges of `ranges` in place.

    Each entry is an inclusive `(start, end)` pair. A range is folded into the
    run being built when it starts inside that run or directly after its end;
    otherwise the current run is emitted and a new one begins. The list is
    expected to be ordered by start: an entry that starts before the current
    run is never merged into it and simply begins a new run.

    Args:
        ranges: List of inclusive `(start, end)` pairs; modified in place.
    """
    if len(ranges) < 2:
        return

    merged: List[Tuple[int, int]] = []
    run_start, run_end = ranges[0]

    for next_start, next_end in ranges[1:]:
        if run_start <= next_start <= run_end + 1:
            # Adjacent or overlapping: extend the run. `max` keeps the run's
            # end when the incoming range is fully contained in it.
            run_end = max(run_end, next_end)
        else:
            merged.append((run_start, run_end))
            run_start, run_end = next_start, next_end

    merged.append((run_start, run_end))

    # Slice assignment keeps the caller's list object, as the signature promises.
    ranges[:] = merged
47
48
49
def parse_unicode_data() -> None:
    """Download the file at `URL` and fill the module-level range lists.

    Every non-comment, non-blank line is parsed as
    `<code point or first..last> ; <property> [# comment]`, with code points
    written in hexadecimal. Ranges whose property is one of `XID_Start`,
    `XID_Continue`, `Uppercase`, `Lowercase` or `Alphabetic` are appended to
    the matching list; all other properties are ignored. Afterwards each list
    is compacted with `merge_ranges()`.

    The lists are appended to, not reset, so calling this more than once
    accumulates duplicate entries.
    """
    # Property name as spelled in the data file -> list that collects its ranges.
    buckets = {
        "XID_Start": xid_start,
        "XID_Continue": xid_continue,
        "Uppercase": uppercase_letter,
        "Lowercase": lowercase_letter,
        "Alphabetic": unicode_letter,
    }

    # Use the response as a context manager so the connection is closed
    # deterministically rather than whenever the object is garbage collected.
    with urlopen(URL) as response:
        lines: List[str] = [raw_line.decode("utf-8") for raw_line in response]

    for line in lines:
        if line.startswith("#") or not line.strip():
            continue

        fields: List[str] = line.split(";")

        char_range: str = fields[0].strip()
        # Drop the trailing `# ...` comment that follows the property name.
        char_property: str = fields[1].strip().split("#")[0].strip()

        # A single code point is treated as a range of length one.
        range_start: str = char_range
        range_end: str = char_range
        if ".." in char_range:
            range_start, range_end = char_range.split("..")

        # Converted before the lookup so malformed hex is reported for any property.
        range_tuple: Tuple[int, int] = (int(range_start, 16), int(range_end, 16))

        bucket = buckets.get(char_property)
        if bucket is not None:
            bucket.append(range_tuple)

    # Underscore technically isn't in XID_Start, but for our purposes it's included.
    xid_start.append((0x005F, 0x005F))
    # Re-sort because the underscore was appended at the end, out of order.
    xid_start.sort(key=lambda x: x[0])

    for range_list in buckets.values():
        merge_ranges(range_list)
88
89
90
def make_array(array_name: str, range_list: List[Tuple[int, int]]) -> str:
    """Render `range_list` as the source text of a `CharRange` array definition.

    Args:
        array_name: Identifier used for the generated array.
        range_list: `(start, end)` pairs; each becomes one `{ 0x…, 0x… },` row
            written in lowercase hexadecimal.

    Returns:
        The definition text, starting with two newlines and ending with `};`
        (no trailing newline).
    """
    opening: str = f"\n\nconstexpr inline CharRange {array_name}[] = {{\n"
    # Join the rows in one pass instead of growing the string with `+=` per row.
    rows: str = "".join(f"\t{{ 0x{start:x}, 0x{end:x} }},\n" for start, end in range_list)

    return opening + rows + "};"
99
100
101
def generate_char_range_inc() -> None:
    """Fetch the Unicode data and write `core/string/char_range.inc`.

    The output is the copyright header from `generate_copyright_header()`, a
    fixed preamble declaring `struct CharRange`, and one array per collected
    property, in the order `xid_start`, `xid_continue`, `uppercase_letter`,
    `lowercase_letter`, `unicode_letter`. The destination path is resolved
    relative to this script's directory and any existing file is overwritten.
    """
    parse_unicode_data()

    source: str = generate_copyright_header("char_range.inc")

    source += f"""
// This file was generated using the `misc/scripts/char_range_fetch.py` script.

#pragma once

#include "core/typedefs.h"

// Unicode Derived Core Properties
// Source: {URL}

struct CharRange {{
\tchar32_t start;
\tchar32_t end;
}};"""

    # (array name in the generated file, collected ranges), in output order.
    for array_name, range_list in (
        ("xid_start", xid_start),
        ("xid_continue", xid_continue),
        ("uppercase_letter", uppercase_letter),
        ("lowercase_letter", lowercase_letter),
        ("unicode_letter", unicode_letter),
    ):
        source += make_array(array_name, range_list)

    source += "\n"

    char_range_path: str = os.path.join(os.path.dirname(__file__), "../../core/string/char_range.inc")
    # Pin the encoding and line endings so the generated file does not depend
    # on the locale or platform the script is run on.
    with open(char_range_path, "w", encoding="utf-8", newline="\n") as f:
        f.write(source)

    print("`char_range.inc` generated successfully.")
134
135
136
# Only regenerate the file when run as a script, never as a side effect of an import.
if __name__ == "__main__":
    generate_char_range_inc()
138
139