Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/pcre2/src/pcre2_tables.c
21746 views
1
/*************************************************
2
* Perl-Compatible Regular Expressions *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
Written by Philip Hazel
9
Original API code Copyright (c) 1997-2012 University of Cambridge
10
New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
* Redistributions of source code must retain the above copyright notice,
17
this list of conditions and the following disclaimer.
18
19
* Redistributions in binary form must reproduce the above copyright
20
notice, this list of conditions and the following disclaimer in the
21
documentation and/or other materials provided with the distribution.
22
23
* Neither the name of the University of Cambridge nor the names of its
24
contributors may be used to endorse or promote products derived from
25
this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
42
/* This module contains some fixed tables that are used by more than one of the
43
PCRE2 code modules. The tables are also #included by the pcre2test program,
44
which uses macros to change their names from _pcre2_xxx to xxxx, thereby
45
avoiding name clashes with the library. In this case, PCRE2_PCRE2TEST is
46
defined. */
47
48
49
#if !defined(PCRE2_PCRE2TEST) && !defined(PCRE2_DFTABLES) && \
50
!defined(PCRE2_PCRE2POSIX) /* We're compiling the library */
51
#include "pcre2_internal.h"
52
#endif
53
54
55
/* Utility macros */
56
/* Number of elements in the array x. Only meaningful for a true array (not a
pointer). The argument and the whole expansion are parenthesized so that the
macro can be used safely inside a larger expression. */
#define ARR_SIZE(x) (sizeof(x)/sizeof((x)[0]))
57
58
59
#if !defined(PCRE2_PCRE2TEST) && !defined(PCRE2_DFTABLES) && \
60
!defined(PCRE2_PCRE2POSIX)
61
62
/* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
63
the definition is next to the definition of the opcodes in pcre2_internal.h.
64
This is mode-dependent, so it is skipped when this file is included by
65
pcre2test. */
66
67
/* The whole initializer is supplied by the OP_LENGTHS macro, which is not
defined in this file. */
const uint8_t PRIV(OP_lengths)[] = { OP_LENGTHS };
68
69
/* Tables of horizontal and vertical whitespace characters, suitable for
70
adding to classes. */
71
72
/* Horizontal whitespace characters; the entries are supplied by the
HSPACE_LIST macro, which is not defined in this file. */
const uint32_t PRIV(hspace_list)[] = { HSPACE_LIST };
73
/* Vertical whitespace characters; the entries are supplied by the
VSPACE_LIST macro, which is not defined in this file. */
const uint32_t PRIV(vspace_list)[] = { VSPACE_LIST };
74
75
#endif /* !PCRE2_PCRE2TEST && !PCRE2_DFTABLES && !PCRE2_PCRE2POSIX */
76
77
78
#if !defined(PCRE2_DFTABLES) && !defined(PCRE2_PCRE2POSIX)
79
80
/* These tables are the pairs of delimiters that are valid for callout string
81
arguments. For each starting delimiter there must be a matching ending
82
delimiter, which in fact is different only for bracket-like delimiters. */
83
84
const uint32_t PRIV(callout_start_delims)[] = {
85
CHAR_GRAVE_ACCENT, CHAR_APOSTROPHE, CHAR_QUOTATION_MARK,
86
CHAR_CIRCUMFLEX_ACCENT, CHAR_PERCENT_SIGN, CHAR_NUMBER_SIGN,
87
CHAR_DOLLAR_SIGN, CHAR_LEFT_CURLY_BRACKET, 0 };
88
89
const uint32_t PRIV(callout_end_delims[]) = {
90
CHAR_GRAVE_ACCENT, CHAR_APOSTROPHE, CHAR_QUOTATION_MARK,
91
CHAR_CIRCUMFLEX_ACCENT, CHAR_PERCENT_SIGN, CHAR_NUMBER_SIGN,
92
CHAR_DOLLAR_SIGN, CHAR_RIGHT_CURLY_BRACKET, 0 };
93
94
#endif /* !PCRE2_DFTABLES && !PCRE2_PCRE2POSIX */
95
96
97
/*************************************************
98
* Tables for UTF-8 support *
99
*************************************************/
100
101
/* These tables are required by pcre2test in 16- or 32-bit mode, as well
102
as for the library in 8-bit mode, because pcre2test uses UTF-8 internally for
103
handling wide characters. */
104
105
#if defined PCRE2_PCRE2TEST || \
106
(!defined(PCRE2_DFTABLES) && !defined(PCRE2_PCRE2POSIX) && \
107
defined SUPPORT_UNICODE && \
108
defined PCRE2_CODE_UNIT_WIDTH && \
109
PCRE2_CODE_UNIT_WIDTH == 8)
110
111
/* These are the breakpoints for different numbers of bytes in a UTF-8
112
character. */
113
114
const int PRIV(utf8_table1)[] =
115
{ 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff };
116
117
/* Element count of PRIV(utf8_table1), computed at compile time with
ARR_SIZE. */
const unsigned PRIV(utf8_table1_size) = ARR_SIZE(PRIV(utf8_table1));
118
119
/* These are the indicator bits and the mask for the data bits to set in the
120
first byte of a character, indexed by the number of additional bytes. */
121
122
/* Indicator bits to set in the first byte of a UTF-8 character, indexed by
the number of additional bytes that follow it. */
const int PRIV(utf8_table2)[] = {
  0,      /* 0 additional bytes */
  0xc0,   /* 1 */
  0xe0,   /* 2 */
  0xf0,   /* 3 */
  0xf8,   /* 4 */
  0xfc    /* 5 */
};
123
/* Mask for the data bits held in the first byte of a UTF-8 character, indexed
by the number of additional bytes that follow it. */
const int PRIV(utf8_table3)[] = {
  0xff,   /* 0 additional bytes */
  0x1f,   /* 1 */
  0x0f,   /* 2 */
  0x07,   /* 3 */
  0x03,   /* 4 */
  0x01    /* 5 */
};
124
125
/* Table of the number of extra bytes, indexed by the first byte masked with
126
0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */
127
128
/* Count of extra bytes that follow a UTF-8 first byte. The index is the first
byte ANDed with 0x3f; the table has 64 entries, 16 per row. */

const uint8_t PRIV(utf8_table4)[] = {
  /* 0x00 - 0x0f */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0x10 - 0x1f */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0x20 - 0x2f */
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  /* 0x30 - 0x3f */
  3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
133
134
#endif /* UTF-8 support needed */
135
136
/* Tables concerned with Unicode properties are relevant only when Unicode
137
support is enabled. See also the pcre2_ucptables_inc.h file, which is generated by
138
a Python script from Unicode data files. */
139
140
#if !defined(PCRE2_DFTABLES) && !defined(PCRE2_PCRE2POSIX) && \
141
defined(SUPPORT_UNICODE)
142
143
/* Table to translate from particular type value to the general value. */
144
145
const uint32_t PRIV(ucp_gentype)[] = {
146
ucp_C, ucp_C, ucp_C, ucp_C, ucp_C, /* Cc, Cf, Cn, Co, Cs */
147
ucp_L, ucp_L, ucp_L, ucp_L, ucp_L, /* Ll, Lu, Lm, Lo, Lt */
148
ucp_M, ucp_M, ucp_M, /* Mc, Me, Mn */
149
ucp_N, ucp_N, ucp_N, /* Nd, Nl, No */
150
ucp_P, ucp_P, ucp_P, ucp_P, ucp_P, /* Pc, Pd, Pe, Pf, Pi */
151
ucp_P, ucp_P, /* Ps, Po */
152
ucp_S, ucp_S, ucp_S, ucp_S, /* Sc, Sk, Sm, So */
153
ucp_Z, ucp_Z, ucp_Z /* Zl, Zp, Zs */
154
};
155
156
/* This table encodes the rules for finding the end of an extended grapheme
157
cluster. Every code point has a grapheme break property which is one of the
158
ucp_gbXX values defined in pcre2_ucp.h. These changed between Unicode versions
159
10 and 11. The 2-dimensional table is indexed by the properties of two adjacent
160
code points. The left property selects a word from the table, and the right
161
property selects a bit from that word like this:
162
163
PRIV(ucp_gbtable)[left-property] & (1u << right-property)
164
165
The value is non-zero if a grapheme break is NOT permitted between the relevant
166
two code points. The breaking rules are as follows:
167
168
1. Break at the start and end of text (pretty obviously).
169
170
2. Do not break between a CR and LF; otherwise, break before and after
171
controls.
172
173
3. Do not break Hangul syllable sequences, the rules for which are:
174
175
L may be followed by L, V, LV or LVT
176
LV or V may be followed by V or T
177
LVT or T may be followed by T
178
179
4. Do not break before extending characters or zero-width-joiner (ZWJ).
180
181
The following rules are only for extended grapheme clusters (but that's what we
182
are implementing).
183
184
5. Do not break before SpacingMarks.
185
186
6. Do not break after Prepend characters.
187
188
7. Do not break within emoji modifier sequences or emoji zwj sequences. That
189
is, do not break between characters with the Extended_Pictographic property
190
if a ZWJ intervenes. Extend characters are allowed between the characters;
191
this cannot be represented in this table, the code has to deal with it.
192
193
8. Do not break within emoji flag sequences. That is, do not break between
194
regional indicator (RI) symbols if there are an odd number of RI characters
195
before the break point. This table encodes "join RI characters"; the code
196
has to deal with checking for previous adjoining RIs.
197
198
9. Otherwise, break everywhere.
199
*/
200
201
/* Bits for the three right-hand properties that never allow a break before
them: Extend, SpacingMark and ZWJ. The expansion is fully parenthesized and
uses unsigned shifts, matching the other bit expressions in the table. */
#define ESZ ((1u<<ucp_gbExtend)|(1u<<ucp_gbSpacingMark)|(1u<<ucp_gbZWJ))
202
203
const uint32_t PRIV(ucp_gbtable)[] = {
204
(1u<<ucp_gbLF), /* 0 CR */
205
0, /* 1 LF */
206
0, /* 2 Control */
207
ESZ, /* 3 Extend */
208
ESZ|(1u<<ucp_gbPrepend)| /* 4 Prepend */
209
(1u<<ucp_gbL)|(1u<<ucp_gbV)|(1u<<ucp_gbT)|
210
(1u<<ucp_gbLV)|(1u<<ucp_gbLVT)|(1u<<ucp_gbOther)|
211
(1u<<ucp_gbRegional_Indicator),
212
ESZ, /* 5 SpacingMark */
213
ESZ|(1u<<ucp_gbL)|(1u<<ucp_gbV)|(1u<<ucp_gbLV)| /* 6 L */
214
(1u<<ucp_gbLVT),
215
ESZ|(1u<<ucp_gbV)|(1u<<ucp_gbT), /* 7 V */
216
ESZ|(1u<<ucp_gbT), /* 8 T */
217
ESZ|(1u<<ucp_gbV)|(1u<<ucp_gbT), /* 9 LV */
218
ESZ|(1u<<ucp_gbT), /* 10 LVT */
219
(1u<<ucp_gbRegional_Indicator), /* 11 Regional Indicator */
220
ESZ, /* 12 Other */
221
ESZ|(1u<<ucp_gbExtended_Pictographic), /* 13 ZWJ */
222
ESZ /* 14 Extended Pictographic */
223
};
224
225
#undef ESZ
226
227
#ifdef SUPPORT_JIT
228
/* This table reverses PRIV(ucp_gentype). We can save the cost
229
of a memory load. */
230
231
const int PRIV(ucp_typerange)[] = {
232
ucp_Cc, ucp_Cs,
233
ucp_Ll, ucp_Lu,
234
ucp_Mc, ucp_Mn,
235
ucp_Nd, ucp_No,
236
ucp_Pc, ucp_Ps,
237
ucp_Sc, ucp_So,
238
ucp_Zl, ucp_Zs,
239
};
240
#endif /* SUPPORT_JIT */
241
242
/* Finally, include the tables that are auto-generated from the Unicode data
243
files. */
244
245
#include "pcre2_ucptables_inc.h"
246
247
#endif /* Unicode support needed */
248
249
250
/*************************************************
251
* Tables for EBCDIC support *
252
*************************************************/
253
254
#if defined(EBCDIC) && \
255
(defined(PCRE2_PCRE2TEST) || defined(PCRE2_DFTABLES) || 'a' != 0x81)
256
257
/* 256-entry byte translation table, indexed by the source byte value; each
row below holds 16 consecutive entries. When EBCDIC_NL25 is defined, index
0x15 yields 0x85 and index 0x25 yields 0x0a; otherwise those two results are
swapped. */

const uint8_t PRIV(ebcdic_1047_to_ascii)[256] = {
  /* 0x00 */
  0x00,0x01,0x02,0x03,0x9c,0x09,0x86,0x7f,0x97,0x8d,0x8e,0x0b,0x0c,0x0d,0x0e,0x0f,
#ifdef EBCDIC_NL25
  /* 0x10 */
  0x10,0x11,0x12,0x13,0x9d,0x85,0x08,0x87,0x18,0x19,0x92,0x8f,0x1c,0x1d,0x1e,0x1f,
  /* 0x20 */
  0x80,0x81,0x82,0x83,0x84,0x0a,0x17,0x1b,0x88,0x89,0x8a,0x8b,0x8c,0x05,0x06,0x07,
#else
  /* 0x10 */
  0x10,0x11,0x12,0x13,0x9d,0x0a,0x08,0x87,0x18,0x19,0x92,0x8f,0x1c,0x1d,0x1e,0x1f,
  /* 0x20 */
  0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1b,0x88,0x89,0x8a,0x8b,0x8c,0x05,0x06,0x07,
#endif
  /* 0x30 */
  0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9a,0x9b,0x14,0x15,0x9e,0x1a,
  /* 0x40 */
  0x20,0xa0,0xe2,0xe4,0xe0,0xe1,0xe3,0xe5,0xe7,0xf1,0xa2,0x2e,0x3c,0x28,0x2b,0x7c,
  /* 0x50 */
  0x26,0xe9,0xea,0xeb,0xe8,0xed,0xee,0xef,0xec,0xdf,0x21,0x24,0x2a,0x29,0x3b,0x5e,
  /* 0x60 */
  0x2d,0x2f,0xc2,0xc4,0xc0,0xc1,0xc3,0xc5,0xc7,0xd1,0xa6,0x2c,0x25,0x5f,0x3e,0x3f,
  /* 0x70 */
  0xf8,0xc9,0xca,0xcb,0xc8,0xcd,0xce,0xcf,0xcc,0x60,0x3a,0x23,0x40,0x27,0x3d,0x22,
  /* 0x80 */
  0xd8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xab,0xbb,0xf0,0xfd,0xfe,0xb1,
  /* 0x90 */
  0xb0,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,0x70,0x71,0x72,0xaa,0xba,0xe6,0xb8,0xc6,0xa4,
  /* 0xa0 */
  0xb5,0x7e,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0xa1,0xbf,0xd0,0x5b,0xde,0xae,
  /* 0xb0 */
  0xac,0xa3,0xa5,0xb7,0xa9,0xa7,0xb6,0xbc,0xbd,0xbe,0xdd,0xa8,0xaf,0x5d,0xb4,0xd7,
  /* 0xc0 */
  0x7b,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xad,0xf4,0xf6,0xf2,0xf3,0xf5,
  /* 0xd0 */
  0x7d,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f,0x50,0x51,0x52,0xb9,0xfb,0xfc,0xf9,0xfa,0xff,
  /* 0xe0 */
  0x5c,0xf7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0xb2,0xd4,0xd6,0xd2,0xd3,0xd5,
  /* 0xf0 */
  0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xb3,0xdb,0xdc,0xd9,0xda,0x9f,
};
280
281
/* 256-entry byte translation table, indexed by the source byte value; each
row below holds 16 consecutive entries. When EBCDIC_NL25 is defined, index
0x0a yields 0x25 and index 0x85 yields 0x15; otherwise those two results are
swapped. */

const uint8_t PRIV(ascii_to_ebcdic_1047)[256] = {
#ifdef EBCDIC_NL25
  /* 0x00 */
  0x00,0x01,0x02,0x03,0x37,0x2d,0x2e,0x2f,0x16,0x05,0x25,0x0b,0x0c,0x0d,0x0e,0x0f,
#else
  /* 0x00 */
  0x00,0x01,0x02,0x03,0x37,0x2d,0x2e,0x2f,0x16,0x05,0x15,0x0b,0x0c,0x0d,0x0e,0x0f,
#endif
  /* 0x10 */
  0x10,0x11,0x12,0x13,0x3c,0x3d,0x32,0x26,0x18,0x19,0x3f,0x27,0x1c,0x1d,0x1e,0x1f,
  /* 0x20 */
  0x40,0x5a,0x7f,0x7b,0x5b,0x6c,0x50,0x7d,0x4d,0x5d,0x5c,0x4e,0x6b,0x60,0x4b,0x61,
  /* 0x30 */
  0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0x7a,0x5e,0x4c,0x7e,0x6e,0x6f,
  /* 0x40 */
  0x7c,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,
  /* 0x50 */
  0xd7,0xd8,0xd9,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xad,0xe0,0xbd,0x5f,0x6d,
  /* 0x60 */
  0x79,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x91,0x92,0x93,0x94,0x95,0x96,
  /* 0x70 */
  0x97,0x98,0x99,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xc0,0x4f,0xd0,0xa1,0x07,
#ifdef EBCDIC_NL25
  /* 0x80 */
  0x20,0x21,0x22,0x23,0x24,0x15,0x06,0x17,0x28,0x29,0x2a,0x2b,0x2c,0x09,0x0a,0x1b,
#else
  /* 0x80 */
  0x20,0x21,0x22,0x23,0x24,0x25,0x06,0x17,0x28,0x29,0x2a,0x2b,0x2c,0x09,0x0a,0x1b,
#endif
  /* 0x90 */
  0x30,0x31,0x1a,0x33,0x34,0x35,0x36,0x08,0x38,0x39,0x3a,0x3b,0x04,0x14,0x3e,0xff,
  /* 0xa0 */
  0x41,0xaa,0x4a,0xb1,0x9f,0xb2,0x6a,0xb5,0xbb,0xb4,0x9a,0x8a,0xb0,0xca,0xaf,0xbc,
  /* 0xb0 */
  0x90,0x8f,0xea,0xfa,0xbe,0xa0,0xb6,0xb3,0x9d,0xda,0x9b,0x8b,0xb7,0xb8,0xb9,0xab,
  /* 0xc0 */
  0x64,0x65,0x62,0x66,0x63,0x67,0x9e,0x68,0x74,0x71,0x72,0x73,0x78,0x75,0x76,0x77,
  /* 0xd0 */
  0xac,0x69,0xed,0xee,0xeb,0xef,0xec,0xbf,0x80,0xfd,0xfe,0xfb,0xfc,0xba,0xae,0x59,
  /* 0xe0 */
  0x44,0x45,0x42,0x46,0x43,0x47,0x9c,0x48,0x54,0x51,0x52,0x53,0x58,0x55,0x56,0x57,
  /* 0xf0 */
  0x8c,0x49,0xcd,0xce,0xcb,0xcf,0xcc,0xe1,0x70,0xdd,0xde,0xdb,0xdc,0x8d,0x8e,0xdf,
};
307
308
#endif /* EBCDIC support needed */
309
310
/* End of pcre2_tables.c */
311
312