Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/misc/scripts/char_range_fetch.py
20843 views
1
#!/usr/bin/env python3
2
3
# Script used to dump char ranges for specific properties from
4
# the Unicode Character Database to the `char_range.inc` file.
5
# NOTE: This script is deliberately not integrated into the build system;
6
# you should run it manually whenever you want to update the data.
7
from __future__ import annotations
8
9
import os
10
import sys
11
from typing import Final
12
from urllib.request import urlopen
13
14
if __name__ == "__main__":
15
sys.path.insert(1, os.path.join(os.path.dirname(__file__), "../../"))
16
17
from methods import generate_copyright_header
18
19
URL: Final[str] = "https://www.unicode.org/Public/17.0.0/ucd/DerivedCoreProperties.txt"
20
21
22
xid_start: list[tuple[int, int]] = []
23
xid_continue: list[tuple[int, int]] = []
24
uppercase_letter: list[tuple[int, int]] = []
25
lowercase_letter: list[tuple[int, int]] = []
26
unicode_letter: list[tuple[int, int]] = []
27
28
29
def merge_ranges(ranges: list[tuple[int, int]]) -> None:
    """Coalesce touching and overlapping ranges of ``ranges`` in place.

    Each entry is an inclusive ``(start, end)`` pair and the list is expected
    to be sorted by ``start``. A range is folded into the previous one when it
    begins inside it or directly after it (``prev_end + 1 == start``); nested
    ranges never shrink the accumulated range. An entry that starts before the
    previous one (unsorted input) is kept as a separate range.

    The list object passed in is mutated; nothing is returned.
    """
    if len(ranges) < 2:
        return

    merged: list[tuple[int, int]] = []
    last_start, last_end = ranges[0]

    for curr_start, curr_end in ranges[1:]:
        if last_start <= curr_start <= last_end + 1:
            # Adjacent, overlapping or nested: extend the accumulated range.
            # `max` keeps the end from moving backwards for nested ranges.
            last_end = max(last_end, curr_end)
        else:
            # Gap (or out-of-order entry): flush the accumulated range and restart.
            merged.append((last_start, last_end))
            last_start, last_end = curr_start, curr_end

    merged.append((last_start, last_end))

    # Slice-assign so callers holding a reference to the list see the result.
    ranges[:] = merged
48
49
50
def parse_unicode_data() -> None:
    """Download the file at ``URL`` and fill the module-level range lists.

    Every data line has the form ``<code point or range> ; <property> # ...``.
    Lines whose property is ``XID_Start``, ``XID_Continue``, ``Uppercase``,
    ``Lowercase`` or ``Alphabetic`` are appended to the matching list as an
    inclusive ``(start, end)`` tuple; all other properties are ignored.
    Afterwards the underscore is added to ``xid_start`` and every list is
    passed through ``merge_ranges``.

    The lists are appended to, not reset, so calling this function more than
    once accumulates duplicate entries.
    """
    # Property name as spelled in the data file -> list collecting its ranges.
    targets: dict[str, list[tuple[int, int]]] = {
        "XID_Start": xid_start,
        "XID_Continue": xid_continue,
        "Uppercase": uppercase_letter,
        "Lowercase": lowercase_letter,
        "Alphabetic": unicode_letter,
    }

    # The context manager guarantees the HTTP response is closed, even when
    # reading or decoding fails part-way through.
    with urlopen(URL) as response:
        lines: list[str] = [line.decode("utf-8") for line in response]

    for line in lines:
        # Skip comment lines and blank lines.
        if line.startswith("#") or not line.strip():
            continue

        split_line: list[str] = line.split(";")

        char_range: str = split_line[0].strip()
        # Drop the trailing `# ...` comment that follows the property name.
        char_property: str = split_line[1].strip().split("#")[0].strip()

        # A single code point is treated as a range of length one.
        range_start: str = char_range
        range_end: str = char_range
        if ".." in char_range:
            range_start, range_end = char_range.split("..")

        range_tuple: tuple[int, int] = (int(range_start, 16), int(range_end, 16))

        target = targets.get(char_property)
        if target is not None:
            target.append(range_tuple)

    # Underscore technically isn't in XID_Start, but for our purposes it's included.
    xid_start.append((0x005F, 0x005F))
    # Re-sort so the underscore lands in order before adjacent ranges are merged.
    xid_start.sort(key=lambda x: x[0])

    merge_ranges(xid_start)
    merge_ranges(xid_continue)
    merge_ranges(uppercase_letter)
    merge_ranges(lowercase_letter)
    merge_ranges(unicode_letter)
89
90
91
def make_array(array_name: str, range_list: list[tuple[int, int]]) -> str:
    """Return the C++ source text of a ``CharRange`` array definition.

    Args:
        array_name: Identifier of the generated ``constexpr inline`` array.
        range_list: ``(start, end)`` pairs; each becomes one tab-indented
            ``{ 0x<start>, 0x<end> },`` row in lowercase hexadecimal.

    Returns:
        The definition, preceded by two newlines and ending with ``};``
        (no trailing newline). An empty ``range_list`` yields an array
        with no rows.
    """
    header: str = f"\n\nconstexpr inline CharRange {array_name}[] = {{\n"
    # Build all rows in one join instead of repeated `+=` on a growing string.
    rows: str = "".join(f"\t{{ 0x{start:x}, 0x{end:x} }},\n" for start, end in range_list)
    return header + rows + "};"
100
101
102
def generate_char_range_inc() -> None:
    """Regenerate ``core/string/char_range.inc`` from freshly parsed Unicode data.

    Runs ``parse_unicode_data()``, then writes the copyright header, the
    ``CharRange`` struct definition and one array per range list to the
    output file (path resolved relative to this script), using ``\\n``
    line endings. Prints a confirmation message when done.
    """
    # The range lists must be populated before any array is rendered.
    parse_unicode_data()

    license_header: str = generate_copyright_header("char_range.inc")

    preamble: str = f"""
// This file was generated using the `misc/scripts/char_range_fetch.py` script.

#pragma once

#include "core/typedefs.h"

// Unicode Derived Core Properties
// Source: {URL}

struct CharRange {{
\tchar32_t start;
\tchar32_t end;
}};"""

    # (C++ array name, collected ranges), in the order they appear in the output.
    tables: tuple[tuple[str, list[tuple[int, int]]], ...] = (
        ("xid_start", xid_start),
        ("xid_continue", xid_continue),
        ("uppercase_letter", uppercase_letter),
        ("lowercase_letter", lowercase_letter),
        ("unicode_letter", unicode_letter),
    )
    arrays: str = "".join(make_array(name, ranges) for name, ranges in tables)

    script_dir: str = os.path.dirname(__file__)
    output_path: str = os.path.join(script_dir, "../../core/string/char_range.inc")
    with open(output_path, "w", newline="\n") as out_file:
        out_file.write(license_header + preamble + arrays + "\n")

    print("`char_range.inc` generated successfully.")
135
136
137
# Regenerate the file only when this script is executed directly, not on import.
if __name__ == "__main__":
    generate_char_range_inc()
139
140