Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Roblox
GitHub Repository: Roblox/luau
Path: blob/master/bench/other/regex.lua
2725 views
1
--[[
2
PCRE2-based RegEx implementation for Luau
3
Version 1.0.0a2
4
BSD 2-Clause Licence
5
Copyright © 2020 - Blockzez (devforum /u/Blockzez and github.com/Blockzez)
6
All rights reserved.
7
8
Redistribution and use in source and binary forms, with or without
9
modification, are permitted provided that the following conditions are met:
10
11
1. Redistributions of source code must retain the above copyright notice, this
12
list of conditions and the following disclaimer.
13
14
2. Redistributions in binary form must reproduce the above copyright notice,
15
this list of conditions and the following disclaimer in the documentation
16
and/or other materials provided with the distribution.
17
18
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
]]
29
--[[ Settings ]]--
30
-- You can change them here
31
local options = {
	-- Whether the Unicode data tables are used.
	-- When this evaluates to false, the _unicodechar_category, _scripts and _xuc modules can be
	-- removed safely, and an error is raised if:
	-- - a RegEx is compiled with the unicode flag
	-- - the \p pattern is used
	-- The documented default is true; this copy sets it to false.
	unicodeData = false,

	-- Maximum number of compiled patterns kept in the cache, so a pattern is not recompiled on reuse.
	-- Accepted values: numbers >= 0, strings that can be coerced to numbers >= 0, false and nil.
	-- Empty patterns (comment-only patterns included) are never cached regardless.
	-- The default is 256.
	cacheSize = 256,
};
45
46
--
47
-- Unicode data modules. Each variable keeps the (falsy) value of options.unicodeData when the
-- Unicode tables are disabled, otherwise it holds the required module.
local u_categories, chr_scripts, xuc_chr = options.unicodeData, options.unicodeData, options.unicodeData;
if options.unicodeData then
	u_categories = require(script:WaitForChild("_unicodechar_category"));
	chr_scripts = require(script:WaitForChild("_scripts"));
	xuc_chr = require(script:WaitForChild("_xuc"));
end;

-- Weak-keyed map from a public proxy object to its private data table.
local proxy = setmetatable({ }, { __mode = 'k' });

-- Module table, RegEx method table and Match method table.
local re = { };
local re_m = { };
local match_m = { };

-- Value stored in the __metatable field of proxy objects (assigned elsewhere in the file).
local lockmsg;
53
54
--[[ Functions ]]--
55
--- Decodes a UTF-8 string into an array of code points.
-- @param self  the string to decode
-- @param init  optional code-point index; when given, decoding starts at that character
-- @return a table whose array part holds the code points, with `n` set to the number of
--         code points and `s` set to the (possibly sliced) string that was decoded
local function to_str_arr(self, init)
	if init then
		self = string.sub(self, utf8.offset(self, init));
	end;
	local len = utf8.len(self);
	if len > 1999 then
		-- Long input is decoded in slices of 1999 code points
		-- (presumably to bound how many values a single utf8.codepoint call returns).
		local chunk_total = math.ceil(len / 1999);
		local out = table.create(len);
		local write_at = 1;
		for chunk = 1, chunk_total do
			local first_chr = (chunk - 1) * 1999 + 1;
			-- index of the first character after this slice; len + 1 for the last slice
			local next_chr = math.min(chunk * 1999, len) + 1;
			local points = table.pack(utf8.codepoint(self, utf8.offset(self, first_chr), utf8.offset(self, next_chr) - 1));
			table.move(points, 1, points.n, write_at, out);
			write_at += points.n;
		end;
		out.s, out.n = self, len;
		return out;
	end;
	return { n = len, s = self, utf8.codepoint(self, 1, #self) };
end;
74
75
--- Encodes an array of code points back into a UTF-8 string.
-- @param self  array of code points; `self.n` is used as the length when present, otherwise `#self`
-- @return the encoded string
local function from_str_arr(self)
	local len = self.n or #self;
	if len > 7997 then
		-- Long arrays are encoded in slices of 7997 code points and joined afterwards.
		local chunk_total = math.ceil(len / 7997);
		local pieces = table.create(chunk_total);
		for chunk = 1, chunk_total do
			local first = (chunk - 1) * 7997 + 1;
			local last = math.min(chunk * 7997, len);
			pieces[chunk] = utf8.char(table.unpack(self, first, last));
		end;
		return table.concat(pieces);
	end;
	return utf8.char(table.unpack(self));
end;
87
88
--- Substring by code-point positions.
-- @param self  the string
-- @param i     code-point index of the first character to include
-- @param j     code-point index of the first character to exclude
-- @return the characters in [i, j); when `j` has no byte offset the result runs to the end of the string
local function utf8_sub(self, i, j)
	local stop_byte = utf8.offset(self, j);
	if stop_byte then
		-- last byte of the character just before position j
		stop_byte = stop_byte - 1;
	end;
	local first_byte = utf8.offset(self, i);
	return string.sub(self, first_byte, stop_byte);
end;
92
93
--
94
-- Single-letter flag -> long flag name.
local flag_map = {
	a = 'anchored',
	i = 'caseless',
	m = 'multiline',
	s = 'dotall',
	u = 'unicode',
	U = 'ungreedy',
	x = 'extended',
};

-- Set of recognised POSIX class names.
local posix_class_names = { };
for _, class_name in ipairs({
	"alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
	"lower", "print", "punct", "space", "upper", "word", "xdigit",
}) do
	posix_class_names[class_name] = true;
end;

-- Escape letter (code point) -> token. Table values are { "class", name, negated } or a
-- single-element marker token; plain numbers are literal code points.
local escape_chars = {
	-- digit, space and word classes
	[0x44] = { "class", "digit", true }, -- \D
	[0x53] = { "class", "space", true }, -- \S
	[0x57] = { "class", "word", true }, -- \W
	[0x64] = { "class", "digit", false }, -- \d
	[0x73] = { "class", "space", false }, -- \s
	[0x77] = { "class", "word", false }, -- \w

	-- horizontal/vertical whitespace and newline
	[0x48] = { "class", "blank", true }, -- \H
	[0x56] = { "class", "vertical_tab", true }, -- \V
	[0x68] = { "class", "blank", false }, -- \h
	[0x76] = { "class", "vertical_tab", false }, -- \v
	[0x4E] = { 0x4E }, -- \N
	[0x52] = { 0x52 }, -- \R

	-- literal code points
	-- NOTE(review): 0x42 is 'B'; backspace is conventionally written \b (0x62) — confirm this key.
	[0x42] = 0x08,
	[0x6E] = 0x0A, -- \n
	[0x72] = 0x0D, -- \r
	[0x74] = 0x09, -- \t
};

-- Escape letters that produce zero-width tokens.
local b_escape_chars = {
	-- word boundary and not word boundary
	[0x62] = { 0x62, { "class", "word", false } }, -- \b
	[0x42] = { 0x42, { "class", "word", false } }, -- \B

	-- keep match out
	[0x4B] = { 0x4B }, -- \K

	-- start & end of string
	[0x47] = { 0x47 }, -- \G
	[0x4A] = { 0x4A }, -- \J
	[0x5A] = { 0x5A }, -- \Z
	[0x7A] = { 0x7A }, -- \z
};

-- Set of accepted category names, including the special X* names.
local valid_categories = { };
for _, category in ipairs({
	"C", "Cc", "Cf", "Cn", "Co", "Cs",
	"L", "Ll", "Lm", "Lo", "Lt", "Lu",
	"M", "Mc", "Me", "Mn",
	"N", "Nd", "Nl", "No",
	"P", "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps",
	"S", "Sc", "Sk", "Sm", "So",
	"Z", "Zl", "Zp", "Zs",
	"Xan", "Xps", "Xsp", "Xuc", "Xwd",
}) do
	valid_categories[category] = true;
end;
139
140
-- Set of ASCII punctuation code points: 0x21-0x2F, 0x3A-0x40, 0x5B-0x60 and 0x7B-0x7E.
local class_ascii_punct = { };
for _, bounds in ipairs({ { 0x21, 0x2F }, { 0x3A, 0x40 }, { 0x5B, 0x60 }, { 0x7B, 0x7E } }) do
	for code_point = bounds[1], bounds[2] do
		class_ascii_punct[code_point] = true;
	end;
end;

-- Marker tokens. Each is a distinct table because the alternation marker is compared by identity.
local end_str = { 0x24 }; -- $
local dot = { 0x2E }; -- .
local beginning_str = { 0x5E }; -- ^
local alternation = { 0x7C }; -- |
150
151
--- Wraps a method implementation with argument checking and proxy unwrapping.
-- @param re_type  "Match" for Match methods; any other value selects the RegEx wrapper
-- @param name     method name, used in error messages and to select per-method handling
-- @param func     the implementation; it receives the private data table instead of the proxy
-- @return the wrapped function
--
-- Match wrapper: calls func(match_data, arg1); arg1 defaults to 0 for "group" and "span".
-- RegEx wrapper: calls func(regex_data, arg1, arg2, arg3, arg4, arg5) where
--   * argument #1 may be a RegEx proxy, or a string/number that is compiled with re.fromstring;
--   * the subject string (argument #3 for "sub", argument #2 otherwise) may be a number, which
--     is converted to a string;
--   * for every method except "sub" and "split", argument #3 is an optional start position that
--     is rounded to the nearest integer, counted from the end when negative, and clamped to >= 1;
--   * the original first argument is forwarded as arg3 for "match"/"matchiter"/"matchall" and
--     as arg5 for "sub".
local function check_re(re_type, name, func)
	if re_type == "Match" then
		return function(...)
			if select('#', ...) < 1 then
				error("missing argument #1 (Match expected)", 2);
			end;
			local arg0, arg1 = ...;
			local match_data = proxy[arg0];
			if not (match_data and match_data.name == "Match") then
				error(string.format("invalid argument #1 to %q (Match expected, got %s)", name, typeof(arg0)), 2);
			end;
			if arg1 == nil and (name == "group" or name == "span") then
				arg1 = 0;
			end;
			return func(match_data, arg1);
		end;
	end;
	return function(...)
		local arg_n = select('#', ...);
		if arg_n < 1 then
			error("missing argument #1 (RegEx expected)", 2);
		elseif arg_n < 2 then
			error("missing argument #2 (string expected)", 2);
		end;
		local arg0, arg1, arg2, arg3, arg4, arg5 = ...;
		if not (proxy[arg0] and proxy[arg0].name == "RegEx") then
			if type(arg0) ~= "string" and type(arg0) ~= "number" then
				error(string.format("invalid argument #1 to %q (RegEx expected, got %s)", name, typeof(arg0)), 2);
			end;
			arg0 = re.fromstring(arg0);
		end;
		-- The subject is validated no matter how the pattern was supplied. (This used to be an
		-- elseif of the check above, so it was skipped for string/number patterns.)
		if name == "sub" then
			if type(arg2) == "number" then
				arg2 ..= '';
			elseif type(arg2) ~= "string" then
				error(string.format("invalid argument #3 to 'sub' (string expected, got %s)", typeof(arg2)), 2);
			end;
		elseif type(arg1) == "number" then
			arg1 ..= '';
		elseif type(arg1) ~= "string" then
			error(string.format("invalid argument #2 to %q (string expected, got %s)", name, typeof(arg1)), 2);
		end;
		if name ~= "sub" and name ~= "split" then
			local init_type = typeof(arg2);
			if init_type ~= 'nil' then
				arg2 = tonumber(arg2);
				if not arg2 then
					error(string.format("invalid argument #3 to %q (number expected, got %s)", name, init_type), 2);
				elseif arg2 < 0 then
					-- The start position is consumed as a code-point index, so a negative value
					-- is resolved against the code-point length (byte length only when the
					-- subject is not valid UTF-8) and clamped so it never falls before the start.
					arg2 = math.max((utf8.len(arg1) or #arg1) + math.floor(arg2 + 0.5) + 1, 1);
				else
					arg2 = math.max(math.floor(arg2 + 0.5), 1);
				end;
			end;
		end;
		arg0 = proxy[arg0];
		-- `(...)` is the caller's original first argument; it is passed on as the `source` value.
		-- "matchall" is included because that is the name the iterator method is registered under.
		if name == "match" or name == "matchiter" or name == "matchall" then
			arg3 = ...;
		elseif name == "sub" then
			arg5 = ...;
		end;
		return func(arg0, arg1, arg2, arg3, arg4, arg5);
	end;
end;
218
219
--[[ Matches ]]--
220
--- __tostring handler for Match proxies.
-- @param self  the Match proxy object
-- @return "Match (a..b): text" for a non-empty match, "Match (a..b, empty)" otherwise,
--         where b is the stored end position minus one
local function match_tostr(self)
	local spans = proxy[self].spans;
	local whole = spans[0];
	local first, stop = whole[1], whole[2];
	if first < stop then
		return string.format("Match (%d..%d): %s", first, stop - 1, utf8_sub(spans.input, first, stop));
	end;
	return string.format("Match (%d..%d, empty)", first, stop - 1);
end;
228
229
--- Creates a Match proxy object around a span table.
-- @param span_arr  span table; it receives the `source` and `input` fields and becomes the
--                  proxy's __index table (with match_m as its metatable)
-- @param group_id  table mapping group names to group indices
-- @param re        value stored as `source`
-- @param str       value stored as `input`
-- @return the new proxy object; its private data is registered in `proxy`
local function new_match(span_arr, group_id, re, str)
	span_arr.source = re;
	span_arr.input = str;

	local object = newproxy(true);
	local mt = getmetatable(object);
	mt.__tostring = match_tostr;
	mt.__index = setmetatable(span_arr, match_m);
	mt.__metatable = lockmsg;

	proxy[object] = { name = "Match", spans = span_arr, group_id = group_id };
	return object;
end;
240
241
-- Match:group(id): text of the group selected by number or by name (the wrapper defaults id
-- to 0, the whole match); nil when the group has no span.
match_m.group = check_re('Match', 'group', function(self, group_id)
	local key = group_id;
	if type(key) ~= "number" then
		-- a name is translated to its group index
		key = self.group_id[key];
	end;
	local span = self.spans[key];
	if span then
		return utf8_sub(self.spans.input, span[1], span[2]);
	end;
	return nil;
end);
248
249
-- Match:span(id): start position and (stored end - 1) of the group selected by number or by
-- name (the wrapper defaults id to 0, the whole match); nil when the group has no span.
match_m.span = check_re('Match', 'span', function(self, group_id)
	local key = group_id;
	if type(key) ~= "number" then
		-- a name is translated to its group index
		key = self.group_id[key];
	end;
	local span = self.spans[key];
	if span then
		return span[1], span[2] - 1;
	end;
	return nil;
end);
256
257
-- Match:groups(): the text of groups 1..n as multiple return values (nil for groups without
-- a span); when the pattern has no groups, the text of the whole match instead.
match_m.groups = check_re('Match', 'groups', function(self)
	local spans = self.spans;
	local input = spans.input;
	if spans.n <= 0 then
		return utf8_sub(input, spans[0][1], spans[0][2]);
	end;
	local captured = table.create(spans.n);
	for index = 0, spans.n do
		local span = spans[index];
		if span then
			captured[index] = utf8_sub(input, span[1], span[2]);
		end;
	end;
	return table.unpack(captured, 1, spans.n);
end);
271
272
-- Match:groupdict(): table mapping each group name to its text; names whose group has no
-- span are left out.
match_m.groupdict = check_re('Match', 'groupdict', function(self)
	local spans = self.spans;
	local by_name = { };
	for group_name, group_index in pairs(self.group_id) do
		local span = spans[group_index];
		if span then
			by_name[group_name] = utf8_sub(spans.input, span[1], span[2]);
		end;
	end;
	return by_name;
end);
283
284
-- Match:grouparr(): table holding the text of groups 0..n at the matching indices (index 0 is
-- the whole match), with `n` set to the group count; groups without a span are left as nil.
-- Registered as 'grouparr' (it used to be registered as 'groupdict') so argument errors name
-- the method that was actually called.
match_m.grouparr = check_re('Match', 'grouparr', function(self)
	local spans = self.spans;
	local ret = table.create(spans.n);
	for i = 0, spans.n do
		local v = spans[i];
		if v then
			ret[i] = utf8_sub(spans.input, v[1], v[2]);
		end;
	end;
	ret.n = spans.n;
	return ret;
end);
296
297
--
298
-- Newline convention name -> numeric id, as stored in verb_flags.newline.
local line_verbs = {
	CR = 0, LF = 1, CRLF = 2, ANYRLF = 3, ANY = 4, NUL = 5,
};

--- Tests whether the code point at position `i` counts as a newline.
-- @param str_arr     array of code points
-- @param i           position to test
-- @param verb_flags  table whose `newline` field holds one of the line_verbs ids
-- @return true/false; any id other than 0, 2, 3, 4 or 5 (LF included) means "line feed only"
local function is_newline(str_arr, i, verb_flags)
	local line_verb_n = verb_flags.newline;
	local chr = str_arr[i];
	if line_verb_n == 0 then
		-- carriage return
		return chr == 0x0D;
	elseif line_verb_n == 2 then
		-- carriage return followed by line feed: true at the line feed when the preceding
		-- code point is a carriage return (0x0D; this used to compare against 0x20, a space)
		return chr == 0x0A and str_arr[i - 1] == 0x0D;
	elseif line_verb_n == 3 then
		-- carriage return or line feed
		return chr == 0x0A or chr == 0x0D;
	elseif line_verb_n == 4 then
		-- any of Unicode newlines
		return chr == 0x0A or chr == 0x0B or chr == 0x0C or chr == 0x0D or chr == 0x85 or chr == 0x2028 or chr == 0x2029;
	elseif line_verb_n == 5 then
		-- null
		return chr == 0;
	end;
	-- linefeed
	return chr == 0x0A;
end;
323
324
325
--- Tests the code point at position `i` against a single-character token.
-- @param tkn_part    a literal code point (number) or a token table:
--                    { "charset", negated, parts }, { "range", low, high },
--                    { "class", name, negated }, { "category", negated, name },
--                    { 0x2E } (dot), { 0x4E } (\N) or { 0x52 } (\R)
-- @param str_arr     array of code points
-- @param i           position to test
-- @param flags       table read for `ignoreCase`, `unicode` and `dotAll`
-- @param verb_flags  table read for `newline` (through is_newline) and `newline_seq`
-- @return a truthy value when the character matches; false when there is no character at `i`
--         or the token kind is not recognised
local function tkn_char_match(tkn_part, str_arr, i, flags, verb_flags)
	local chr = str_arr[i];
	if not chr then
		return false;
	elseif flags.ignoreCase and chr >= 0x61 and chr <= 0x7A then
		-- caseless matching compares ASCII letters in upper case
		chr -= 0x20;
	end;
	if type(tkn_part) == "number" then
		return tkn_part == chr;
	elseif tkn_part[1] == "charset" then
		for _, v in ipairs(tkn_part[3]) do
			if tkn_char_match(v, str_arr, i, flags, verb_flags) then
				return not tkn_part[2];
			end;
		end;
		return tkn_part[2];
	elseif tkn_part[1] == "range" then
		-- chr was upper-cased above, so a caseless range is also tried against the lower-case letter
		return chr >= tkn_part[2] and chr <= tkn_part[3] or flags.ignoreCase and chr >= 0x41 and chr <= 0x5A and (chr + 0x20) >= tkn_part[2] and (chr + 0x20) <= tkn_part[3];
	elseif tkn_part[1] == "class" then
		local char_class = tkn_part[2];
		local negate = tkn_part[3];
		local match = false;
		if char_class == "xdigit" then
			match = chr >= 0x30 and chr <= 0x39 or chr >= 0x41 and chr <= 0x46 or chr >= 0x61 and chr <= 0x66;
		elseif char_class == "ascii" then
			match = chr <= 0x7F;
		-- cannot be accessed through POSIX classes
		elseif char_class == "vertical_tab" then
			-- vertical whitespace: LF, VT, FF, CR, NEL (U+0085), LS and PS.
			-- U+0085 was missing although the "any Unicode newline" lists in this file include it.
			match = chr >= 0x0A and chr <= 0x0D or chr == 0x85 or chr == 0x2028 or chr == 0x2029;
		--
		elseif flags.unicode then
			local current_category = u_categories[chr] or 'Cn';
			local first_category = current_category:sub(1, 1);
			if char_class == "alnum" then
				match = first_category == 'L' or current_category == 'Nl' or current_category == 'Nd';
			elseif char_class == "alpha" then
				match = first_category == 'L' or current_category == 'Nl';
			elseif char_class == "blank" then
				match = current_category == 'Zs' or chr == 0x09;
			elseif char_class == "cntrl" then
				match = current_category == 'Cc';
			elseif char_class == "digit" then
				match = current_category == 'Nd';
			elseif char_class == "graph" then
				-- Everything except separators (Z) and "other" (C). This used to exclude
				-- punctuation (P) instead of Z, so spaces matched and punctuation did not,
				-- unlike the ASCII branch below (0x21-0x7E).
				match = first_category ~= 'Z' and first_category ~= 'C';
			elseif char_class == "lower" then
				match = current_category == 'Ll';
			elseif char_class == "print" then
				match = first_category ~= 'C';
			elseif char_class == "punct" then
				match = first_category == 'P';
			elseif char_class == "space" then
				match = first_category == 'Z' or chr >= 0x09 and chr <= 0x0D;
			elseif char_class == "upper" then
				match = current_category == 'Lu';
			elseif char_class == "word" then
				match = first_category == 'L' or current_category == 'Nl' or current_category == 'Nd' or current_category == 'Pc';
			end;
		elseif char_class == "alnum" then
			match = chr >= 0x30 and chr <= 0x39 or chr >= 0x41 and chr <= 0x5A or chr >= 0x61 and chr <= 0x7A;
		elseif char_class == "alpha" then
			match = chr >= 0x41 and chr <= 0x5A or chr >= 0x61 and chr <= 0x7A;
		elseif char_class == "blank" then
			match = chr == 0x09 or chr == 0x20;
		elseif char_class == "cntrl" then
			match = chr <= 0x1F or chr == 0x7F;
		elseif char_class == "digit" then
			match = chr >= 0x30 and chr <= 0x39;
		elseif char_class == "graph" then
			match = chr >= 0x21 and chr <= 0x7E;
		elseif char_class == "lower" then
			match = chr >= 0x61 and chr <= 0x7A;
		elseif char_class == "print" then
			match = chr >= 0x20 and chr <= 0x7E;
		elseif char_class == "punct" then
			match = class_ascii_punct[chr];
		elseif char_class == "space" then
			match = chr >= 0x09 and chr <= 0x0D or chr == 0x20;
		elseif char_class == "upper" then
			match = chr >= 0x41 and chr <= 0x5A;
		elseif char_class == "word" then
			match = chr >= 0x30 and chr <= 0x39 or chr >= 0x41 and chr <= 0x5A or chr >= 0x61 and chr <= 0x7A or chr == 0x5F;
		end;
		if negate then
			return not match;
		end;
		return match;
	elseif tkn_part[1] == "category" then
		local chr_category = u_categories[chr] or 'Cn';
		local category_v = tkn_part[3];
		local category_len = #category_v;
		if category_len == 3 then
			-- three-letter names are the special X* categories
			local match = false;
			if category_v == "Xan" or category_v == "Xwd" then
				match = chr_category:find("^[LN]") or category_v == "Xwd" and chr == 0x5F;
			elseif category_v == "Xps" or category_v == "Xsp" then
				match = chr_category:sub(1, 1) == 'Z' or chr >= 0x09 and chr <= 0x0D;
			elseif category_v == "Xuc" then
				match = tkn_char_match(xuc_chr, str_arr, i, flags, verb_flags);
			end;
			if tkn_part[2] then
				return not match;
			end;
			return match;
		elseif chr_category:sub(1, category_len) == category_v then
			-- a one-letter name matches every category that starts with that letter
			return not tkn_part[2];
		end;
		return tkn_part[2];
	elseif tkn_part[1] == 0x2E then
		return flags.dotAll or not is_newline(str_arr, i, verb_flags);
	elseif tkn_part[1] == 0x4E then
		return not is_newline(str_arr, i, verb_flags);
	elseif tkn_part[1] == 0x52 then
		if verb_flags.newline_seq == 0 then
			-- CR, LF or CRLF
			return chr == 0x0A or chr == 0x0D;
		end;
		-- any unicode newline
		return chr == 0x0A or chr == 0x0B or chr == 0x0C or chr == 0x0D or chr == 0x85 or chr == 0x2028 or chr == 0x2029;
	end;
	return false;
end;
449
450
--- Scans forward from token index `i` for the next alternation marker at the same nesting level.
-- Groups (bare or wrapped in a quantifier) are skipped as a whole by jumping to the index
-- stored in their third slot. The scan stops at the end of the token list or at a closing
-- group token.
-- @param token  the token list
-- @param i      index to start scanning from
-- @param count  optional running total; when given it is increased by `count` of each skipped
--               group, by the third slot of each quantifier, and by 1 for every other token
-- @return the index of the alternation marker (or nil when none is found), and the running total
local function find_alternation(token, i, count)
	while true do
		local current = token[i];
		if current == alternation then
			return i, count;
		end;
		local head = type(current) == "table" and current[1];
		local group;
		if head == 0x28 then
			group = current;
		elseif head == "quantifier" and type(current[5]) == "table" and current[5][1] == 0x28 then
			group = current[5];
		end;
		if group then
			if count then
				count += group.count;
			end;
			i = group[3];
		elseif not current or head == 0x29 then
			return nil, count;
		elseif count then
			if head == "quantifier" then
				count += current[3];
			else
				count += 1;
			end;
		end;
		i += 1;
	end;
end;
478
479
--- Backtracking matcher: searches `str_arr` for the first match of the token list.
-- @param token       compiled token list; `token.group_n` is the number of groups
-- @param str_arr     subject as an array of code points with `n` (length) and `s` (string)
-- @param init        position at which the search starts
-- @param flags       flag table (read here for `multiline`; passed on to tkn_char_match)
-- @param verb_flags  verb flag table (passed on to is_newline / tkn_char_match)
-- @param as_bool     when truthy, only report whether a match exists
-- @return with as_bool: true or false. Otherwise nil when nothing matches, or a span table:
--         span[0] = { start, position after the match }, span[k] = { start, end } for group k
--         found on the state stack, span.n = token.group_n.
--
-- `states` is a stack (newest entry at index 1) of backtracking records:
--   { "alternation", index of the `|` token, str_i }
--   { "group", open index, start str_i, end str_i (set when closed), ctkn[2] (used as span key),
--     close index, kind or "quantifier", min, max, iteration, next_alt, mode, [13] = lazy flag }
--   { "quantifier", token index, str_i, limit str_i, step }
--   { "matchStart", str_i }  and  { "PRUNE" | "SKIP", str_i }
-- Token heads are ASCII codes: 0x28 "(", 0x29 ")", 0x7C "|", 0x4B \K, 0x24 "$", 0x5E "^",
-- 0x41 \A, 0x47 \G, 0x5A \Z, 0x7A \z, 0x62 \b, 0x42 \B. Group kinds 0x21 "!" and 0x3D "=" are
-- presumably negative/positive lookarounds and 0x3E ">" an atomic group — confirm against the
-- compiler.
local function re_rawfind(token, str_arr, init, flags, verb_flags, as_bool)
	local tkn_i, str_i, start_i = 0, init, init;
	local states = { };
	while tkn_i do
		if tkn_i == 0 then
			-- setup for an attempt: register the first top-level alternative, if any
			tkn_i += 1;
			local next_alt = find_alternation(token, tkn_i);
			if next_alt then
				table.insert(states, 1, { "alternation", next_alt, str_i });
			end;
			continue;
		end;
		local ctkn = token[tkn_i];
		local tkn_type = type(ctkn) == "table" and ctkn[1];
		if not ctkn then
			-- ran off the end of the token list: the attempt succeeded
			break;
		elseif ctkn == "ACCEPT" then
			-- jump to the close of the enclosing lookaround, or finish when there is none
			local not_lookaround = true;
			local close_i = tkn_i;
			repeat
				close_i += 1;
				local is_table = type(token[close_i]) == "table";
				local close_i_tkn = token[close_i];
				if is_table and (close_i_tkn[1] == 0x28 or close_i_tkn[1] == "quantifier" and type(close_i_tkn[5]) == "table" and close_i_tkn[5][1] == 0x28) then
					close_i = close_i_tkn[1] == "quantifier" and close_i_tkn[5][3] or close_i_tkn[3];
				elseif is_table and close_i_tkn[1] == 0x29 and (close_i_tkn[4] == 0x21 or close_i_tkn[4] == 0x3D) then
					not_lookaround = false;
					tkn_i = close_i;
					break;
				end;
			until not close_i_tkn;
			if not_lookaround then
				break;
			end;
		elseif ctkn == "PRUNE" or ctkn == "SKIP" then
			table.insert(states, 1, { ctkn, str_i });
			tkn_i += 1;
		elseif tkn_type == 0x28 then
			-- group open
			table.insert(states, 1, { "group", tkn_i, str_i, nil, ctkn[2], ctkn[3], ctkn[4] });
			tkn_i += 1;
			-- a count is only requested for lookarounds that carry ctkn[5]; the subject
			-- position is then moved back by the counted length of the first alternative
			local next_alt, count = find_alternation(token, tkn_i, (ctkn[4] == 0x21 or ctkn[4] == 0x3D) and ctkn[5] and 0);
			if next_alt then
				table.insert(states, 1, { "alternation", next_alt, str_i });
			end;
			if count then
				str_i -= count;
			end;
		elseif tkn_type == 0x29 and ctkn[4] ~= 0x21 then
			-- group close (kind 0x21 is handled as a failure in the final branch below)
			if ctkn[4] == 0x21 or ctkn[4] == 0x3D then
				-- Declared outside the loop so a \K marker found while popping survives until
				-- the group record is reached. It used to be re-declared on every iteration,
				-- which made the re-insert below unreachable.
				local selected_match_start;
				while true do
					local selected_state = table.remove(states, 1);
					if selected_state[1] == "group" and selected_state[2] == ctkn[3] then
						if (ctkn[4] == 0x21 or ctkn[4] == 0x3D) and not ctkn[5] then
							-- restore the position held when the group was opened
							str_i = selected_state[3];
						end;
						if selected_match_start then
							table.insert(states, 1, selected_match_start);
						end;
						break;
					elseif selected_state[1] == "matchStart" and not selected_match_start and ctkn[4] == 0x3D then
						selected_match_start = selected_state;
					end;
				end;
			elseif ctkn[4] == 0x3E then
				-- discard every record up to and including this group's own record
				repeat
					local selected_state = table.remove(states, 1);
				until not selected_state or selected_state[1] == "group" and selected_state[2] == ctkn[3];
			else
				for _, v in ipairs(states) do
					if v[1] == "group" and v[2] == ctkn[3] then
						if v.jmp then
							-- recursive match
							tkn_i = v.jmp;
						end;
						v[4] = str_i;
						if v[7] == "quantifier" and v[10] + 1 < v[9] then
							-- quantified group below its maximum: loop back for another
							-- iteration, except for a lazy group that already met its minimum
							if token[ctkn[3]][4] ~= "lazy" or v[10] + 1 < v[8] then
								tkn_i = ctkn[3];
							end;
							local ctkn1 = token[ctkn[3]];
							local new_group = { "group", v[2], str_i, nil, ctkn1[5][2], ctkn1[5][3], "quantifier", ctkn1[2], ctkn1[3], v[10] + 1, v[11], ctkn1[4] };
							table.insert(states, 1, new_group);
							if v[11] then
								table.insert(states, 1, { "alternation", v[11], str_i });
							end;
						end;
						break;
					end;
				end;
			end;
			tkn_i += 1;
		elseif tkn_type == 0x4B then
			table.insert(states, 1, { "matchStart", str_i });
			tkn_i += 1;
		elseif tkn_type == 0x7C then
			-- reached `|` after a successful alternative: skip the remaining alternatives
			local close_i = tkn_i;
			repeat
				close_i += 1;
				local is_table = type(token[close_i]) == "table";
				local close_i_tkn = token[close_i];
				if is_table and (close_i_tkn[1] == 0x28 or close_i_tkn[1] == "quantifier" and type(close_i_tkn[5]) == "table" and close_i_tkn[5][1] == 0x28) then
					close_i = close_i_tkn[1] == "quantifier" and close_i_tkn[5][3] or close_i_tkn[3];
				end;
			until is_table and close_i_tkn[1] == 0x29 or not close_i_tkn;
			if token[close_i] then
				for _, v in ipairs(states) do
					if v[1] == "group" and v[6] == close_i then
						tkn_i = v[6];
						break;
					end;
				end;
			else
				tkn_i = close_i;
			end;
		elseif tkn_type == "recurmatch" then
			-- re-enter the referenced group; `jmp` records where to continue after its close
			table.insert(states, 1, { "group", ctkn[3], str_i, nil, nil, token[ctkn[3]][3], nil, jmp = tkn_i });
			tkn_i = ctkn[3] + 1;
			local next_alt = find_alternation(token, tkn_i);
			if next_alt then
				table.insert(states, 1, { "alternation", next_alt, str_i });
			end;
		else
			local match;
			if ctkn == "FAIL" then
				match = false;
			elseif tkn_type == 0x29 then
				-- close of a kind-0x21 group: its body matched, so drop its records and leave
				-- `match` unset, which is treated as a failure below
				repeat
					local selected_state = table.remove(states, 1);
				until selected_state[1] == "group" and selected_state[2] == ctkn[3];
			elseif tkn_type == "quantifier" then
				if type(ctkn[5]) == "table" and ctkn[5][1] == 0x28 then
					-- quantified group: push its record (below its first alternation record)
					local next_alt = find_alternation(token, tkn_i + 1);
					if next_alt then
						table.insert(states, 1, { "alternation", next_alt, str_i });
					end;
					table.insert(states, next_alt and 2 or 1, { "group", tkn_i, str_i, nil, ctkn[5][2], ctkn[5][3], "quantifier", ctkn[2], ctkn[3], 0, next_alt, ctkn[4] });
					if ctkn[4] == "lazy" and ctkn[2] == 0 then
						-- lazy with minimum 0: first try skipping the group altogether
						tkn_i = ctkn[5][3];
					end;
					match = true;
				else
					-- quantified single character or back-reference
					local ref_start, ref_end;
					local pattern_count = 1;
					local is_backref = type(ctkn[5]) == "table" and ctkn[5][1] == "backref";
					if is_backref then
						pattern_count = 0;
						local group_n = ctkn[5][2];
						for _, v in ipairs(states) do
							if v[1] == "group" and v[5] == group_n then
								ref_start, ref_end = v[3], v[4];
								pattern_count = ref_end - ref_start;
								break;
							end;
						end;
					end;
					local min_max_i = str_i + ctkn[2] * pattern_count;
					local mcount = 0;
					while mcount < ctkn[3] do
						if is_backref then
							if ref_start and ref_end then
								local org_i = str_i;
								if utf8_sub(str_arr.s, ref_start, ref_end) ~= utf8_sub(str_arr.s, org_i, str_i + pattern_count) then
									break;
								end;
							else
								break;
							end;
						elseif not tkn_char_match(ctkn[5], str_arr, str_i, flags, verb_flags) then
							break;
						end;
						str_i += pattern_count;
						mcount += 1;
					end;
					match = mcount >= ctkn[2];
					if match and ctkn[4] ~= "possessive" then
						if ctkn[4] == "lazy" then
							-- lazy: continue from the minimum and grow on backtracking
							min_max_i, str_i = str_i, min_max_i;
						end;
						table.insert(states, 1, { "quantifier", tkn_i, str_i, math.min(min_max_i, str_arr.n + 1), (ctkn[4] == "lazy" and 1 or -1) * pattern_count });
					end;
				end;
			elseif tkn_type == "backref" then
				local ref_start, ref_end;
				local group_n = ctkn[2];
				for _, v in ipairs(states) do
					if v[1] == "group" and v[5] == group_n then
						ref_start, ref_end = v[3], v[4];
						break;
					end;
				end;
				if ref_start and ref_end then
					local org_i = str_i;
					str_i += ref_end - ref_start;
					match = utf8_sub(str_arr.s, ref_start, ref_end) == utf8_sub(str_arr.s, org_i, str_i);
				end;
			else
				if tkn_type == 0x24 or tkn_type == 0x5A or tkn_type == 0x7A then
					match = str_i == str_arr.n + 1 or tkn_type == 0x24 and flags.multiline and is_newline(str_arr, str_i + 1, verb_flags) or tkn_type == 0x5A and str_i == str_arr.n and is_newline(str_arr, str_i, verb_flags);
				elseif tkn_type == 0x5E or tkn_type == 0x41 or tkn_type == 0x47 then
					match = str_i == 1 or tkn_type == 0x5E and flags.multiline and is_newline(str_arr, str_i - 1, verb_flags) or tkn_type == 0x47 and str_i == init;
				elseif tkn_type == 0x42 or tkn_type == 0x62 then
					-- Word boundary. tkn_char_match takes (token, array, index, flags,
					-- verb_flags); it used to be handed a bare code point in place of the
					-- array, which raised an indexing error. A position outside the subject
					-- yields false.
					local end_m = str_i == str_arr.n + 1 or flags.multiline and is_newline(str_arr, str_i, verb_flags);
					local prev_is_word = tkn_char_match(ctkn[2], str_arr, str_i - 1, flags, verb_flags);
					local next_is_word = tkn_char_match(ctkn[2], str_arr, str_i, flags, verb_flags);
					if prev_is_word then
						match = end_m or not next_is_word;
					elseif next_is_word then
						-- the character before is not a word character, so this is a boundary
						match = true;
					end;
					if tkn_type == 0x42 then
						match = not match;
					end;
				else
					match = tkn_char_match(ctkn, str_arr, str_i, flags, verb_flags);
					str_i += 1;
				end;
			end;
			if not match then
				-- backtrack: pop records until one offers another way forward
				while true do
					local prev_type, prev_state = states[1] and states[1][1], states[1];
					if not prev_type or prev_type == "PRUNE" or prev_type == "SKIP" then
						if prev_type then
							table.clear(states);
						end;
						if start_i > str_arr.n then
							if as_bool then
								return false;
							end;
							return nil;
						end;
						-- restart at the next start position (or at the SKIP position)
						start_i = prev_type == "SKIP" and prev_state[2] or start_i + 1;
						-- NOTE(review): the `tkn_i += 1` after this loop turns this 0 into 1,
						-- so the `tkn_i == 0` setup at the top of the main loop (which registers
						-- the top-level alternation) only runs for the first start position —
						-- confirm this is intended.
						tkn_i, str_i = 0, start_i;
						break;
					elseif prev_type == "alternation" then
						tkn_i, str_i = prev_state[2], prev_state[3];
						local next_alt, count = find_alternation(token, tkn_i + 1);
						if next_alt then
							prev_state[2] = next_alt;
						else
							table.remove(states, 1);
						end;
						if count then
							str_i -= count;
						end;
						break;
					elseif prev_type == "group" then
						if prev_state[7] == "quantifier" then
							if prev_state[12] == "greedy" and prev_state[10] >= prev_state[8]
								or prev_state[12] == "lazy" and prev_state[10] < prev_state[9] and not prev_state[13] then
								tkn_i, str_i = prev_state[12] == "greedy" and prev_state[6] or prev_state[2], prev_state[3];
								if prev_state[12] == "greedy" then
									table.remove(states, 1);
									break;
								elseif prev_state[10] >= prev_state[8] then
									prev_state[13] = true;
									break;
								end;
							end;
						elseif prev_state[7] == 0x21 then
							-- the body of a kind-0x21 group failed: continue after its close
							table.remove(states, 1);
							tkn_i, str_i = prev_state[6], prev_state[3];
							break;
						end;
					elseif prev_type == "quantifier" then
						-- step the quantifier towards its limit while it has room left
						if math.sign(prev_state[4] - prev_state[3]) == math.sign(prev_state[5]) then
							prev_state[3] += prev_state[5];
							tkn_i, str_i = prev_state[2], prev_state[3];
							break;
						end;
					end;
					-- keep match out state and recursive state, can be safely removed
					-- prevents infinite loop
					table.remove(states, 1);
				end;
			end;
			tkn_i += 1;
		end;
	end;
	if as_bool then
		return true;
	end;
	local match_start_ran = false;
	local span = table.create(token.group_n);
	span[0], span.n = { start_i, str_i }, token.group_n;
	for _, v in ipairs(states) do
		if v[1] == "matchStart" and not match_start_ran then
			-- the newest \K marker overrides the start of the whole match
			span[0][1], match_start_ran = v[2], true;
		elseif v[1] == "group" and v[5] and not span[v[5]] then
			span[v[5]] = { v[3], v[4] };
		end;
	end;
	return span;
end;
774
775
--[[ Methods ]]--
776
-- RegEx:test(str, init): whether the pattern matches anywhere in `str`, starting at the
-- optional code-point position `init`.
re_m.test = check_re('RegEx', 'test', function(self, str, init)
	local subject = to_str_arr(str, init);
	return re_rawfind(self.token, subject, 1, self.flags, self.verb_flags, true);
end);
779
780
-- RegEx:match(str, init): the first match in `str` as a Match object, or nil when there is none.
-- `init` is an optional code-point position; the subject is sliced there before matching, so
-- all reported positions are relative to the sliced subject. `source` is supplied by the
-- check_re wrapper.
re_m.match = check_re('RegEx', 'match', function(self, str, init, source)
	local subject = to_str_arr(str, init);
	local span = re_rawfind(self.token, subject, 1, self.flags, self.verb_flags, false);
	if not span then
		return nil;
	end;
	-- The Match must hold the same (sliced) string the spans index into, as matchall does.
	-- Passing the unsliced `str` made group() return the wrong text whenever init > 1.
	return new_match(span, self.group_id, source, subject.s);
end);
787
788
-- RegEx:matchall(str, init): iterator function that returns successive Match objects and
-- then nil. `init` is an optional code-point position at which the subject is sliced.
re_m.matchall = check_re('RegEx', 'matchall', function(self, str, init, source)
	local subject = to_str_arr(str, init);
	local next_start = 1;
	return function()
		if next_start > subject.n + 1 then
			return nil;
		end;
		local span = re_rawfind(self.token, subject, next_start, self.flags, self.verb_flags, false);
		if not span then
			return nil;
		end;
		local whole = span[0];
		if whole[1] >= whole[2] then
			-- empty match: step one position further so the iterator always advances
			next_start = whole[2] + 1;
		else
			next_start = whole[2];
		end;
		return new_match(span, self.group_id, source, subject.s);
	end;
end);
800
801
--- Appends the expansion of a tokenized replacement to `repl_r`.
-- @param repl_r  output array of code points (appended to; its `n` field is refreshed)
-- @param str     subject as an array of code points
-- @param span    span table of the current match, indexed by group
-- @param tkn     list of replacement parts:
--                a non-table value is a group key whose matched code points are copied
--                (nothing is copied when the group has no span);
--                { "condition", group, if_set, if_unset } expands `if_set` (or the group itself
--                when `if_set` is falsy) when the group has a span, else `if_unset` when present;
--                any other table is a run of literal code points
-- @return repl_r
local function insert_tokenized_sub(repl_r, str, span, tkn)
	for _, part in ipairs(tkn) do
		if type(part) ~= "table" then
			local group_span = span[part];
			if group_span then
				table.move(str, group_span[1], group_span[2] - 1, #repl_r + 1, repl_r);
			end;
		elseif part[1] ~= "condition" then
			table.move(part, 1, #part, #repl_r + 1, repl_r);
		else
			local group_span = span[part[2]];
			if group_span then
				if part[3] then
					insert_tokenized_sub(repl_r, str, span, part[3]);
				else
					table.move(str, group_span[1], group_span[2] - 1, #repl_r + 1, repl_r);
				end;
			elseif part[4] then
				insert_tokenized_sub(repl_r, str, span, part[4]);
			end;
		end;
	end;
	repl_r.n = #repl_r;
	return repl_r;
end;
824
825
re_m.sub = check_re('RegEx', 'sub', function(self, repl, str, n, repl_flag_str, source)
826
if repl_flag_str ~= nil and type(repl_flag_str) ~= "number" and type(repl_flag_str) ~= "string" then
827
error(string.format("invalid argument #5 to 'sub' (string expected, got %s)", typeof(repl_flag_str)), 3);
828
end
829
local repl_flags = {
830
l = false, o = false, u = false,
831
};
832
for f in string.gmatch(repl_flag_str or '', utf8.charpattern) do
833
if repl_flags[f] ~= false then
834
error("invalid regular expression substitution flag " .. f, 3);
835
end;
836
repl_flags[f] = true;
837
end;
838
local repl_type = type(repl);
839
if repl_type == "number" then
840
repl ..= '';
841
elseif repl_type ~= "string" and repl_type ~= "function" and (not repl_flags.o or repl_type ~= "table") then
842
error(string.format("invalid argument #2 to 'sub' (string/function%s expected, got %s)", repl_flags.o and "/table" or '', typeof(repl)), 3);
843
end;
844
if tonumber(n) then
845
n = tonumber(n);
846
if n <= -1 or n ~= n then
847
n = math.huge;
848
end;
849
elseif n ~= nil then
850
error(string.format("invalid argument #4 to 'sub' (number expected, got %s)", typeof(n)), 3);
851
else
852
n = math.huge;
853
end;
854
if n < 1 then
855
return str, 0;
856
end;
857
local min_repl_n = 0;
858
if repl_type == "string" then
859
repl = to_str_arr(repl);
860
if not repl_flags.l then
861
local i1 = 0;
862
local repl_r = table.create(3);
863
local group_n = self.token.group_n;
864
local conditional_c = { };
865
while i1 < repl.n do
866
local i2 = i1;
867
repeat
868
i2 += 1;
869
until not repl[i2] or repl[i2] == 0x24 or repl[i2] == 0x5C or (repl[i2] == 0x3A or repl[i2] == 0x7D) and conditional_c[1];
870
min_repl_n += i2 - i1 - 1;
871
if i2 - i1 > 1 then
872
table.insert(repl_r, table.move(repl, i1 + 1, i2 - 1, 1, table.create(i2 - i1 - 1)));
873
end;
874
if repl[i2] == 0x3A then
875
local current_conditional_c = conditional_c[1];
876
if current_conditional_c[2] then
877
error("malformed substitution pattern", 3);
878
end;
879
current_conditional_c[2] = table.move(repl_r, current_conditional_c[3], #repl_r, 1, table.create(#repl_r + 1 - current_conditional_c[3]));
880
for i3 = #repl_r, current_conditional_c[3], -1 do
881
repl_r[i3] = nil;
882
end;
883
elseif repl[i2] == 0x7D then
884
local current_conditional_c = table.remove(conditional_c, 1);
885
local second_c = table.move(repl_r, current_conditional_c[3], #repl_r, 1, table.create(#repl_r + 1 - current_conditional_c[3]));
886
for i3 = #repl_r, current_conditional_c[3], -1 do
887
repl_r[i3] = nil;
888
end;
889
table.insert(repl_r, { "condition", current_conditional_c[1], current_conditional_c[2] ~= true and (current_conditional_c[2] or second_c), current_conditional_c[2] and second_c });
890
elseif repl[i2] then
891
i2 += 1;
892
local subst_c = repl[i2];
893
if not subst_c then
894
if repl[i2 - 1] == 0x5C then
895
error("replacement string must not end with a trailing backslash", 3);
896
end;
897
local prev_repl_f = repl_r[#repl_r];
898
if type(prev_repl_f) == "table" then
899
table.insert(prev_repl_f, repl[i2 - 1]);
900
else
901
table.insert(repl_r, { repl[i2 - 1] });
902
end;
903
elseif subst_c == 0x5C and repl[i2 - 1] == 0x24 then
904
local prev_repl_f = repl_r[#repl_r];
905
if type(prev_repl_f) == "table" then
906
table.insert(prev_repl_f, 0x24);
907
else
908
table.insert(repl_r, { 0x24 });
909
end;
910
i2 -= 1;
911
min_repl_n += 1;
912
elseif subst_c == 0x30 then
913
table.insert(repl_r, 0);
914
elseif subst_c > 0x30 and subst_c <= 0x39 then
915
local start_i2 = i2;
916
local group_i = subst_c - 0x30;
917
while repl[i2 + 1] and repl[i2 + 1] >= 0x30 and repl[i2 + 1] <= 0x39 do
918
group_i ..= repl[i2 + 1] - 0x30;
919
i2 += 1;
920
end;
921
group_i = tonumber(group_i);
922
if not repl_flags.u and group_i > group_n then
923
error("reference to non-existent subpattern", 3);
924
end;
925
table.insert(repl_r, group_i);
926
elseif subst_c == 0x7B and repl[i2 - 1] == 0x24 then
927
i2 += 1;
928
local start_i2 = i2;
929
while repl[i2] and
930
(repl[i2] >= 0x30 and repl[i2] <= 0x39
931
or repl[i2] >= 0x41 and repl[i2] <= 0x5A
932
or repl[i2] >= 0x61 and repl[i2] <= 0x7A
933
or repl[i2] == 0x5F) do
934
i2 += 1;
935
end;
936
if (repl[i2] == 0x7D or repl[i2] == 0x3A and (repl[i2 + 1] == 0x2B or repl[i2 + 1] == 0x2D)) and i2 ~= start_i2 then
937
local group_k = utf8_sub(repl.s, start_i2, i2);
938
if repl[start_i2] >= 0x30 and repl[start_i2] <= 0x39 then
939
group_k = tonumber(group_k);
940
if not repl_flags.u and group_k > group_n then
941
error("reference to non-existent subpattern", 3);
942
end;
943
else
944
group_k = self.group_id[group_k];
945
if not repl_flags.u and (not group_k or group_k > group_n) then
946
error("reference to non-existent subpattern", 3);
947
end;
948
end;
949
if repl[i2] == 0x3A then
950
i2 += 1;
951
table.insert(conditional_c, { group_k, repl[i2] == 0x2D, #repl_r + 1 });
952
else
953
table.insert(repl_r, group_k);
954
end;
955
else
956
error("malformed substitution pattern", 3);
957
end;
958
else
959
local c_escape_char;
960
if repl[i2 - 1] == 0x24 then
961
if subst_c ~= 0x24 then
962
local prev_repl_f = repl_r[#repl_r];
963
if type(prev_repl_f) == "table" then
964
table.insert(prev_repl_f, 0x24);
965
else
966
table.insert(repl_r, { 0x24 });
967
end;
968
end;
969
else
970
c_escape_char = escape_chars[repl[i2]];
971
if type(c_escape_char) ~= "number" then
972
c_escape_char = nil;
973
end;
974
end;
975
local prev_repl_f = repl_r[#repl_r];
976
if type(prev_repl_f) == "table" then
977
table.insert(prev_repl_f, c_escape_char or repl[i2]);
978
else
979
table.insert(repl_r, { c_escape_char or repl[i2] });
980
end;
981
min_repl_n += 1;
982
end;
983
end;
984
i1 = i2;
985
end;
986
if conditional_c[1] then
987
error("malformed substitution pattern", 3);
988
end;
989
if not repl_r[2] and type(repl_r[1]) == "table" and repl_r[1][1] ~= "condition" then
990
repl, repl.n = repl_r[1], #repl_r[1];
991
else
992
repl, repl_type = repl_r, "subst_string";
993
end;
994
end;
995
end;
996
str = to_str_arr(str);
997
local incr, i0, count = 0, 1, 0;
998
while i0 <= str.n + incr + 1 do
999
local span = re_rawfind(self.token, str, i0, self.flags, self.verb_flags, false);
1000
if not span then
1001
break;
1002
end;
1003
local repl_r;
1004
if repl_type == "string" then
1005
repl_r = repl;
1006
elseif repl_type == "subst_string" then
1007
repl_r = insert_tokenized_sub(table.create(min_repl_n), str, span, repl);
1008
else
1009
local re_match;
1010
local repl_c;
1011
if repl_type == "table" then
1012
re_match = utf8_sub(str.s, span[0][1], span[0][2]);
1013
repl_c = repl[re_match];
1014
else
1015
re_match = new_match(span, self.group_id, source, str.s);
1016
repl_c = repl(re_match);
1017
end;
1018
if repl_c == re_match or repl_flags.o and not repl_c then
1019
local repl_n = span[0][2] - span[0][1];
1020
repl_r = table.move(str, span[0][1], span[0][2] - 1, 1, table.create(repl_n));
1021
repl_r.n = repl_n;
1022
elseif type(repl_c) == "string" then
1023
repl_r = to_str_arr(repl_c);
1024
elseif type(repl_c) == "number" then
1025
repl_r = to_str_arr(repl_c .. '');
1026
elseif repl_flags.o then
1027
error(string.format("invalid replacement value (a %s)", type(repl_c)), 3);
1028
else
1029
repl_r = { n = 0 };
1030
end;
1031
end;
1032
local match_len = span[0][2] - span[0][1];
1033
local repl_len = math.min(repl_r.n, match_len);
1034
for i1 = 0, repl_len - 1 do
1035
str[span[0][1] + i1] = repl_r[i1 + 1];
1036
end;
1037
local i1 = span[0][1] + repl_len;
1038
i0 = span[0][2];
1039
if match_len > repl_r.n then
1040
for i2 = 1, match_len - repl_r.n do
1041
table.remove(str, i1);
1042
incr -= 1;
1043
i0 -= 1;
1044
end;
1045
elseif repl_r.n > match_len then
1046
for i2 = 1, repl_r.n - match_len do
1047
table.insert(str, i1 + i2 - 1, repl_r[repl_len + i2]);
1048
incr += 1;
1049
i0 += 1;
1050
end;
1051
end;
1052
if match_len <= 0 then
1053
i0 += 1;
1054
end;
1055
count += 1;
1056
if n < count + 1 then
1057
break;
1058
end;
1059
end;
1060
return from_str_arr(str), count;
1061
end);
1062
1063
--[[
	RegEx:split(str, n)
	Splits `str` around every match of this pattern and returns the pieces as an array.
	`n` caps the number of matches that are searched for; nil, values <= -1 and NaN mean
	"no limit". A non-numeric, non-nil `n` raises an argument error.
]]
re_m.split = check_re('RegEx', 'split', function(self, str, n)
	-- Normalise the limit once instead of converting it twice.
	local limit = tonumber(n);
	if limit then
		if limit <= -1 or limit ~= limit then
			limit = math.huge;
		end;
	elseif n ~= nil then
		error(string.format("invalid argument #3 to 'split' (number expected, got %s)", typeof(n)), 3);
	else
		limit = math.huge;
	end;

	local subject = to_str_arr(str);
	local pieces = { };
	local cursor, attempts = 1, 0;
	-- 1 when the previous match was empty: the search position was then bumped one
	-- past the match end, so the next piece has to start one position earlier.
	local empty_shift = 0;
	while cursor <= subject.n + 1 do
		attempts += 1;
		local span = limit >= attempts and re_rawfind(self.token, subject, cursor, self.flags, self.verb_flags, false);
		if not span then
			break;
		end;
		local match_start, match_stop = span[0][1], span[0][2];
		table.insert(pieces, utf8_sub(subject.s, cursor - empty_shift, match_start));
		empty_shift = match_start >= match_stop and 1 or 0;
		cursor = match_stop + empty_shift;
	end;
	-- Whatever follows the last match becomes the final piece.
	table.insert(pieces, string.sub(subject.s, utf8.offset(subject.s, cursor - empty_shift)));
	return pieces;
end);
1091
1092
--
1093
-- Index handler: a member of re_m wins; otherwise the key is looked up in the
-- flags table stored in the proxy record for `self`.
local function re_index(self, index)
	local member = re_m[index];
	if member then
		return member;
	end;
	return proxy[self].flags[index];
end;
1096
1097
-- __tostring handler: the stored pattern representation followed by the stored
-- flag representation.
local function re_tostr(self)
	local pattern_repr = proxy[self].pattern_repr;
	local flag_repr = proxy[self].flag_repr;
	return pattern_repr .. flag_repr;
end;
1100
--
1101
1102
-- Code points accepted right after "(?" that need no further parsing in tokenize_ptn.
local other_valid_group_char = {
	[0x3A] = true, -- ':' non-capturing group
	[0x21] = true, -- '!' lookaround
	[0x3D] = true, -- '=' lookaround
	[0x3E] = true, -- '>' atomic
	[0x7C] = true, -- '|' branch reset
};
1112
1113
--[[
	Converts a pattern into the flat token list used by the matcher.

	codes: array of code points of the pattern; `codes.n` is the count and `codes.s` the
	       original string (used for slicing names and numbers out of the pattern).
	flags: flag table; this function reads `unicode`, `ignoreCase`, `extended` and `ungreedy`.

	On failure a single string (the error message) is returned.
	On success returns `outln, group_id, verb_flags`:
	  outln      - token array; `outln.group_n` is the highest capture group number found
	  group_id   - map from capture group name to group number
	  verb_flags - settings collected from leading (*VERB)s (newline, newline_seq, not_empty)
]]
local function tokenize_ptn(codes, flags)
	if flags.unicode and not options.unicodeData then
		return "options.unicodeData cannot be turned off while having unicode flag";
	end;
	local i, len = 1, codes.n;
	local group_n = 0;
	local outln, group_id, verb_flags = { }, { }, {
		newline = 1, newline_seq = 1, not_empty = 0,
	};
	while i <= len do
		local c = codes[i];
		if c == 0x28 then
			-- Match
			local ret;
			if codes[i + 1] == 0x2A then
				i += 2;
				local start_i = i;
				while codes[i]
					and (codes[i] >= 0x30 and codes[i] <= 0x39
					or codes[i] >= 0x41 and codes[i] <= 0x5A
					or codes[i] >= 0x61 and codes[i] <= 0x7A
					or codes[i] == 0x5F or codes[i] == 0x3A) do
					i += 1;
				end;
				if codes[i] ~= 0x29 and codes[i - 1] ~= 0x3A then
					-- fallback as normal and ( can't be repeated
					return "quantifier doesn't follow a repeatable pattern";
				end;
				local selected_verb = utf8_sub(codes.s, start_i, i);
				-- "negative_lookhead:" is a historical misspelling; it is still accepted so
				-- existing patterns keep working, alongside the correct spelling.
				if selected_verb == "positive_lookahead:" or selected_verb == "negative_lookahead:" or selected_verb == "negative_lookhead:"
					or selected_verb == "positive_lookbehind:" or selected_verb == "negative_lookbehind:"
					or selected_verb:find("^[pn]l[ab]:$") then
					ret = { 0x28, nil, nil, selected_verb:find('^n') and 0x21 or 0x3D, selected_verb:find('b', 3, true) and 1 };
				elseif selected_verb == "atomic:" then
					ret = { 0x28, nil, nil, 0x3E, nil };
				elseif selected_verb == "ACCEPT" or selected_verb == "FAIL" or selected_verb == 'F' or selected_verb == "PRUNE" or selected_verb == "SKIP" then
					ret = selected_verb == 'F' and "FAIL" or selected_verb;
				else
					if line_verbs[selected_verb] then
						verb_flags.newline = selected_verb;
					elseif selected_verb == "BSR_ANYCRLF" or selected_verb == "BSR_UNICODE" then
						verb_flags.newline_seq = selected_verb == "BSR_UNICODE" and 1 or 0;
					elseif selected_verb == "NOTEMPTY" or selected_verb == "NOTEMPTY_ATSTART" then
						verb_flags.not_empty = selected_verb == "NOTEMPTY" and 1 or 2;
					else
						return "unknown or malformed verb";
					end;
					if outln[1] then
						return "this verb must be placed at the beginning of the regex";
					end;
				end;
			elseif codes[i + 1] == 0x3F then
				-- ? syntax
				i += 2;
				if codes[i] == 0x23 then
					-- comments
					i = table.find(codes, 0x29, i);
					if not i then
						return "unterminated parenthetical";
					end;
					i += 1;
					continue;
				elseif not codes[i] then
					return "unterminated parenthetical";
				end;
				ret = { 0x28, nil, nil, codes[i], nil };
				if codes[i] == 0x30 and codes[i + 1] == 0x29 then
					-- recursive match entire pattern
					ret[1], ret[2], ret[3], ret[5] = "recurmatch", 0, 0, nil;
					-- step onto the ')' so the trailing increment consumes it instead of
					-- leaving it to be parsed as a stray close parenthesis
					i += 1;
				elseif codes[i] > 0x30 and codes[i] <= 0x39 then
					-- recursive match
					local org_i = i;
					i += 1;
					-- consume every following decimal digit (nil-safe at end of pattern)
					while codes[i] and codes[i] >= 0x30 and codes[i] <= 0x39 do
						i += 1;
					end;
					if codes[i] ~= 0x29 then
						return "invalid group structure";
					end;
					ret[1], ret[2], ret[4] = "recurmatch", tonumber(utf8_sub(codes.s, org_i, i)), nil;
				elseif codes[i] == 0x3C and (codes[i + 1] == 0x21 or codes[i + 1] == 0x3D) then
					-- lookbehinds; the parentheses matter, otherwise any "(?x=" is taken as one
					i += 1;
					ret[4], ret[5] = codes[i], 1;
				elseif codes[i] == 0x7C then
					-- branch reset
					ret[5] = group_n;
				elseif codes[i] == 0x50 or codes[i] == 0x3C or codes[i] == 0x27 then
					if codes[i] == 0x50 then
						i += 1;
					end;
					if codes[i] == 0x3D then
						-- backref: skip the '=' and scan the referenced name
						i += 1;
						local start_i = i;
						while codes[i] and
							(codes[i] >= 0x30 and codes[i] <= 0x39
							or codes[i] >= 0x41 and codes[i] <= 0x5A
							or codes[i] >= 0x61 and codes[i] <= 0x7A
							or codes[i] == 0x5F) do
							i += 1;
						end;
						if not codes[i] then
							return "unterminated parenthetical";
						elseif codes[i] ~= 0x29 or i == start_i then
							return "invalid group structure";
						end;
						ret = { "backref", utf8_sub(codes.s, start_i, i) };
					elseif codes[i] == 0x3C or codes[i - 1] ~= 0x50 and codes[i] == 0x27 then
						-- named capture
						local delimiter = codes[i] == 0x27 and 0x27 or 0x3E;
						local start_i = i + 1;
						i += 1;
						if not codes[i] then
							return "unterminated parenthetical";
						elseif codes[i] == 0x29 then
							return "missing character in subpattern";
						elseif codes[i] >= 0x30 and codes[i] <= 0x39 then
							return "subpattern name must not begin with a digit";
						elseif not (codes[i] >= 0x41 and codes[i] <= 0x5A or codes[i] >= 0x61 and codes[i] <= 0x7A or codes[i] == 0x5F) then
							return "invalid character in subpattern";
						end;
						i += 1;
						while codes[i] and
							(codes[i] >= 0x30 and codes[i] <= 0x39
							or codes[i] >= 0x41 and codes[i] <= 0x5A
							or codes[i] >= 0x61 and codes[i] <= 0x7A
							or codes[i] == 0x5F) do
							i += 1;
						end;
						if not codes[i] then
							return "unterminated parenthetical";
						elseif codes[i] ~= delimiter then
							return "invalid character in subpattern";
						end;
						local name = utf8_sub(codes.s, start_i, i);
						group_n += 1;
						if (group_id[name] or group_n) ~= group_n then
							return "subpattern name already exists";
						end;
						for name1, group_n1 in pairs(group_id) do
							if name ~= name1 and group_n == group_n1 then
								return "different names for subpatterns of the same number aren't permitted";
							end;
						end;
						group_id[name] = group_n;
						ret[2], ret[4] = group_n, nil;
					else
						return "invalid group structure";
					end;
				elseif not other_valid_group_char[codes[i]] then
					return "invalid group structure";
				end;
			else
				group_n += 1;
				ret = { 0x28, group_n, nil, nil };
			end;
			if ret then
				table.insert(outln, ret);
			end;
		elseif c == 0x29 then
			-- Close parenthesis: walk back to the nearest still-open group, tracking the
			-- width of each alternative so lookbehinds can be checked for fixed width
			local i1 = #outln + 1;
			local lookbehind_c = -1;
			local current_lookbehind_c = 0;
			local max_c, group_c = 0, 0;
			repeat
				i1 -= 1;
				local v, is_table = outln[i1], type(outln[i1]) == "table";
				if is_table and v[1] == 0x28 then
					group_c += 1;
					if current_lookbehind_c and v.count then
						current_lookbehind_c += v.count;
					end;
					if not v[3] then
						if v[4] == 0x7C then
							group_n = v[5] + math.max(max_c, group_c);
						end;
						if current_lookbehind_c ~= lookbehind_c and lookbehind_c ~= -1 then
							lookbehind_c = nil;
						else
							lookbehind_c = current_lookbehind_c;
						end;
						break;
					end;
				elseif v == alternation then
					if current_lookbehind_c ~= lookbehind_c and lookbehind_c ~= -1 then
						lookbehind_c, current_lookbehind_c = nil, nil;
					else
						lookbehind_c, current_lookbehind_c = current_lookbehind_c, 0;
					end;
					max_c, group_c = math.max(max_c, group_c), 0;
				elseif current_lookbehind_c then
					if is_table and v[1] == "quantifier" then
						if v[2] == v[3] then
							current_lookbehind_c += v[2];
						else
							current_lookbehind_c = nil;
						end;
					else
						current_lookbehind_c += 1;
					end;
				end;
			until i1 < 1;
			if i1 < 1 then
				return "unmatched ) in regular expression";
			end;
			local v = outln[i1];
			local outln_len_p_1 = #outln + 1;
			local ret = { 0x29, v[2], i1, v[4], v[5], count = lookbehind_c };
			if (v[4] == 0x21 or v[4] == 0x3D) and v[5] and not lookbehind_c then
				return "lookbehind assertion is not fixed width";
			end;
			v[3] = outln_len_p_1;
			table.insert(outln, ret);
		elseif c == 0x2E then
			table.insert(outln, dot);
		elseif c == 0x5B then
			-- Character set
			local negate, char_class = false, nil;
			i += 1;
			local start_i = i;
			if codes[i] == 0x5E then
				negate = true;
				i += 1;
			elseif codes[i] == 0x2E or codes[i] == 0x3A or codes[i] == 0x3D then
				-- POSIX character classes
				char_class = codes[i];
			end;
			local ret;
			if codes[i] == 0x5B or codes[i] == 0x5C then
				ret = { };
			else
				ret = { codes[i] };
				i += 1;
			end;
			while codes[i] ~= 0x5D do
				if not codes[i] then
					return "unterminated character class";
				elseif codes[i] == 0x2D and ret[1] and type(ret[1]) == "number" then
					if codes[i + 1] == 0x5D then
						table.insert(ret, 1, 0x2D);
					else
						i += 1;
						local ret_c = codes[i];
						if not ret_c then
							-- pattern ended right after the '-' of a range
							return "unterminated character class";
						elseif ret_c == 0x5B then
							if codes[i + 1] == 0x2E or codes[i + 1] == 0x3A or codes[i + 1] == 0x3D then
								-- Check for POSIX character class, name does not matter
								-- Always resume the search one past the previous hit so an
								-- escaped ']' cannot be found again forever
								local i1 = i + 1;
								repeat
									i1 = table.find(codes, 0x5D, i1 + 1);
								until not i1 or codes[i1 - 1] ~= 0x5C;
								if not i1 then
									return "unterminated character class";
								elseif codes[i1 - 1] == codes[i + 1] and i1 - 1 ~= i + 1 then
									return "invalid range in character class";
								end;
							end;
							if ret[1] > 0x5B then
								return "invalid range in character class";
							end;
						elseif ret_c == 0x5C then
							i += 1;
							if codes[i] == 0x78 then
								local radix0, radix1;
								i += 1;
								if codes[i] and (codes[i] >= 0x30 and codes[i] <= 0x39 or codes[i] >= 0x41 and codes[i] <= 0x46 or codes[i] >= 0x61 and codes[i] <= 0x66) then
									radix0 = codes[i] - ((codes[i] >= 0x41 and codes[i] <= 0x5A) and 0x37 or (codes[i] >= 0x61 and codes[i] <= 0x7A) and 0x57 or 0x30);
									i += 1;
									if codes[i] and (codes[i] >= 0x30 and codes[i] <= 0x39 or codes[i] >= 0x41 and codes[i] <= 0x46 or codes[i] >= 0x61 and codes[i] <= 0x66) then
										radix1 = codes[i] - ((codes[i] >= 0x41 and codes[i] <= 0x5A) and 0x37 or (codes[i] >= 0x61 and codes[i] <= 0x7A) and 0x57 or 0x30);
									else
										i -= 1;
									end;
								else
									i -= 1;
								end;
								ret_c = radix0 and (radix1 and 16 * radix0 + radix1 or radix0) or 0;
							elseif codes[i] and codes[i] >= 0x30 and codes[i] <= 0x37 then
								local radix0, radix1, radix2 = codes[i] - 0x30, nil, nil;
								i += 1;
								if codes[i] and codes[i] >= 0x30 and codes[i] <= 0x37 then
									radix1 = codes[i] - 0x30;
									i += 1;
									if codes[i] and codes[i] >= 0x30 and codes[i] <= 0x37 then
										radix2 = codes[i] - 0x30;
									else
										i -= 1;
									end;
								else
									i -= 1;
								end;
								ret_c = radix1 and (radix2 and 64 * radix0 + 8 * radix1 + radix2 or 8 * radix0 + radix1) or radix0;
							else
								ret_c = escape_chars[codes[i]] or codes[i];
								if type(ret_c) ~= "number" then
									return "invalid range in character class";
								end;
							end;
						elseif ret[1] > ret_c then
							return "invalid range in character class";
						end;
						ret[1] = { "range", ret[1], ret_c };
					end;
				elseif codes[i] == 0x5B then
					if codes[i + 1] == 0x2E or codes[i + 1] == 0x3A or codes[i + 1] == 0x3D then
						-- Same forward-progress rule as above when skipping escaped ']'
						local i1 = i + 1;
						repeat
							i1 = table.find(codes, 0x5D, i1 + 1);
						until not i1 or codes[i1 - 1] ~= 0x5C;
						if not i1 then
							return "unterminated character class";
						elseif codes[i1 - 1] ~= codes[i + 1] or i1 - 1 == i + 1 then
							table.insert(ret, 1, 0x5B);
						elseif codes[i1 - 1] == 0x2E or codes[i1 - 1] == 0x3D then
							return "POSIX collating elements aren't supported";
						elseif codes[i1 - 1] == 0x3A then
							-- I have no plans to support escape codes (\) in character class names
							-- In "[:^name:]" the '^' sits two positions after the '[', and the
							-- name then starts three positions after it
							local class_negate = codes[i + 2] == 0x5E;
							local class_name = utf8_sub(codes.s, i + (class_negate and 3 or 2), i1 - 1);
							-- If not valid then throw an error
							if not posix_class_names[class_name] then
								return "unknown POSIX class name";
							end;
							table.insert(ret, 1, { "class", class_name, class_negate });
							i = i1;
						end;
					else
						table.insert(ret, 1, 0x5B);
					end;
				elseif codes[i] == 0x5C then
					i += 1;
					if not codes[i] then
						-- pattern ended on the backslash
						return "unterminated character class";
					elseif codes[i] == 0x78 then
						local radix0, radix1;
						i += 1;
						if codes[i] == 0x7B then
							i += 1;
							local org_i = i;
							while codes[i] and
								(codes[i] >= 0x30 and codes[i] <= 0x39
								or codes[i] >= 0x41 and codes[i] <= 0x46
								or codes[i] >= 0x61 and codes[i] <= 0x66) do
								i += 1;
							end;
							if codes[i] ~= 0x7D or i == org_i then
								return "malformed hexadecimal character";
							elseif i - org_i > 4 then
								return "character offset too large";
							end;
							table.insert(ret, 1, tonumber(utf8_sub(codes.s, org_i, i), 16));
						else
							if codes[i] and (codes[i] >= 0x30 and codes[i] <= 0x39 or codes[i] >= 0x41 and codes[i] <= 0x46 or codes[i] >= 0x61 and codes[i] <= 0x66) then
								radix0 = codes[i] - ((codes[i] >= 0x41 and codes[i] <= 0x5A) and 0x37 or (codes[i] >= 0x61 and codes[i] <= 0x7A) and 0x57 or 0x30);
								i += 1;
								if codes[i] and (codes[i] >= 0x30 and codes[i] <= 0x39 or codes[i] >= 0x41 and codes[i] <= 0x46 or codes[i] >= 0x61 and codes[i] <= 0x66) then
									radix1 = codes[i] - ((codes[i] >= 0x41 and codes[i] <= 0x5A) and 0x37 or (codes[i] >= 0x61 and codes[i] <= 0x7A) and 0x57 or 0x30);
								else
									i -= 1;
								end;
							else
								i -= 1;
							end;
							table.insert(ret, 1, radix0 and (radix1 and 16 * radix0 + radix1 or radix0) or 0);
						end;
					elseif codes[i] >= 0x30 and codes[i] <= 0x37 then
						local radix0, radix1, radix2 = codes[i] - 0x30, nil, nil;
						i += 1;
						if codes[i] and codes[i] >= 0x30 and codes[i] <= 0x37 then
							radix1 = codes[i] - 0x30;
							i += 1;
							if codes[i] and codes[i] >= 0x30 and codes[i] <= 0x37 then
								radix2 = codes[i] - 0x30;
							else
								i -= 1;
							end;
						else
							i -= 1;
						end;
						table.insert(ret, 1, radix1 and (radix2 and 64 * radix0 + 8 * radix1 + radix2 or 8 * radix0 + radix1) or radix0);
					elseif codes[i] == 0x45 then
						-- intentionally left blank, \E that's not preceded \Q is ignored
					elseif codes[i] == 0x51 then
						local start_i = i + 1;
						repeat
							i = table.find(codes, 0x5C, i + 1);
						until not i or codes[i + 1] == 0x45;
						if not i then
							-- without \E the quote runs to the end, so the class never closes
							return "unterminated character class";
						end;
						-- quoted code points are literal members of this class
						for i1 = start_i, i - 1 do
							table.insert(ret, 1, codes[i1]);
						end;
						i += 1;
					elseif codes[i] == 0x4E then
						if codes[i + 1] == 0x7B and codes[i + 2] == 0x55 and codes[i + 3] == 0x2B and flags.unicode then
							i += 4;
							local start_i = i;
							while codes[i] and
								(codes[i] >= 0x30 and codes[i] <= 0x39
								or codes[i] >= 0x41 and codes[i] <= 0x46
								or codes[i] >= 0x61 and codes[i] <= 0x66) do
								i += 1;
							end;
							if codes[i] ~= 0x7D or i == start_i then
								return "malformed Unicode code point";
							end;
							-- the digits scanned above are hexadecimal
							local code_point = tonumber(utf8_sub(codes.s, start_i, i), 16);
							table.insert(ret, 1, code_point);
						else
							return "invalid escape sequence";
						end;
					elseif codes[i] == 0x50 or codes[i] == 0x70 then
						if not options.unicodeData then
							return "options.unicodeData cannot be turned off when using \\p";
						end;
						-- remember \P vs \p before moving off the letter
						local prop_negate = codes[i] == 0x50;
						i += 1;
						if codes[i] ~= 0x7B then
							local c_name = utf8.char(codes[i] or 0);
							if not valid_categories[c_name] then
								return "unknown or malformed script name";
							end;
							table.insert(ret, 1, { "category", prop_negate, c_name });
						else
							local prop_negated = prop_negate;
							i += 1;
							if codes[i] == 0x5E then
								i += 1;
								prop_negated = not prop_negated;
							end;
							local start_i = i;
							while codes[i] and
								(codes[i] >= 0x30 and codes[i] <= 0x39
								or codes[i] >= 0x41 and codes[i] <= 0x5A
								or codes[i] >= 0x61 and codes[i] <= 0x7A
								or codes[i] == 0x5F) do
								i += 1;
							end;
							if codes[i] ~= 0x7D then
								return "unknown or malformed script name";
							end;
							local c_name = utf8_sub(codes.s, start_i, i);
							local script_set = chr_scripts[c_name];
							if script_set then
								table.insert(ret, 1, { "charset", prop_negated, script_set });
							elseif not valid_categories[c_name] then
								return "unknown or malformed script name";
							else
								table.insert(ret, 1, { "category", prop_negated, c_name });
							end;
						end;
					elseif codes[i] == 0x6F then
						i += 1;
						if codes[i] ~= 0x7B then
							return "malformed octal code";
						end;
						i += 1;
						local org_i = i;
						while codes[i] and codes[i] >= 0x30 and codes[i] <= 0x37 do
							i += 1;
						end;
						if codes[i] ~= 0x7D or i == org_i then
							return "malformed octal code";
						end;
						local ret_chr = tonumber(utf8_sub(codes.s, org_i, i), 8);
						if ret_chr > 0xFFFF then
							return "character offset too large";
						end;
						table.insert(ret, 1, ret_chr);
					else
						local esc_char = escape_chars[codes[i]];
						table.insert(ret, 1, type(esc_char) == "string" and { "class", esc_char, false } or esc_char or codes[i]);
					end;
				elseif flags.ignoreCase and codes[i] >= 0x61 and codes[i] <= 0x7A then
					table.insert(ret, 1, codes[i] - 0x20);
				else
					table.insert(ret, 1, codes[i]);
				end;
				i += 1;
			end;
			if codes[i - 1] == char_class and i - 1 ~= start_i then
				return char_class == 0x3A and "POSIX named classes are only support within a character set" or "POSIX collating elements aren't supported";
			end;
			if not ret[2] and not negate then
				table.insert(outln, ret[1]);
			else
				table.insert(outln, { "charset", negate, ret });
			end;
		elseif c == 0x5C then
			-- Escape char
			i += 1;
			local escape_c = codes[i];
			if not escape_c then
				return "pattern may not end with a trailing backslash";
			elseif escape_c >= 0x30 and escape_c <= 0x39 then
				local org_i = i;
				while codes[i + 1] and codes[i + 1] >= 0x30 and codes[i + 1] <= 0x39 do
					i += 1;
				end;
				local escape_d = tonumber(utf8_sub(codes.s, org_i, i + 1));
				if escape_d > group_n and i ~= org_i then
					-- multi-digit number beyond the groups seen so far: read it as octal
					i = org_i;
					local radix0, radix1, radix2;
					if codes[i] <= 0x37 then
						radix0 = codes[i] - 0x30;
						i += 1;
						if codes[i] and codes[i] >= 0x30 and codes[i] <= 0x37 then
							radix1 = codes[i] - 0x30;
							i += 1;
							if codes[i] and codes[i] >= 0x30 and codes[i] <= 0x37 then
								radix2 = codes[i] - 0x30;
							else
								i -= 1;
							end;
						else
							i -= 1;
						end;
					end;
					table.insert(outln, radix0 and (radix1 and (radix2 and 64 * radix0 + 8 * radix1 + radix2 or 8 * radix0 + radix1) or radix0) or codes[org_i]);
				else
					table.insert(outln, { "backref", escape_d });
				end;
			elseif escape_c == 0x45 then
				-- intentionally left blank, \E that's not preceded \Q is ignored
			elseif escape_c == 0x51 then
				local start_i = i + 1;
				repeat
					i = table.find(codes, 0x5C, i + 1);
				until not i or codes[i + 1] == 0x45;
				table.move(codes, start_i, i and i - 1 or #codes, #outln + 1, outln);
				if not i then
					break;
				end;
				i += 1;
			elseif escape_c == 0x4E then
				if codes[i + 1] == 0x7B and codes[i + 2] == 0x55 and codes[i + 3] == 0x2B and flags.unicode then
					i += 4;
					local start_i = i;
					while codes[i] and
						(codes[i] >= 0x30 and codes[i] <= 0x39
						or codes[i] >= 0x41 and codes[i] <= 0x46
						or codes[i] >= 0x61 and codes[i] <= 0x66) do
						i += 1;
					end;
					if codes[i] ~= 0x7D or i == start_i then
						return "malformed Unicode code point";
					end;
					-- the digits scanned above are hexadecimal
					local code_point = tonumber(utf8_sub(codes.s, start_i, i), 16);
					table.insert(outln, code_point);
				else
					table.insert(outln, escape_chars[0x4E]);
				end;
			elseif escape_c == 0x50 or escape_c == 0x70 then
				if not options.unicodeData then
					return "options.unicodeData cannot be turned off when using \\p";
				end;
				i += 1;
				if codes[i] ~= 0x7B then
					local c_name = utf8.char(codes[i] or 0);
					if not valid_categories[c_name] then
						return "unknown or malformed script name";
					end;
					-- \PX is the negated form of \pX
					table.insert(outln, { "category", escape_c == 0x50, c_name });
				else
					local negate = escape_c == 0x50;
					i += 1;
					if codes[i] == 0x5E then
						i += 1;
						negate = not negate;
					end;
					local start_i = i;
					while codes[i] and
						(codes[i] >= 0x30 and codes[i] <= 0x39
						or codes[i] >= 0x41 and codes[i] <= 0x5A
						or codes[i] >= 0x61 and codes[i] <= 0x7A
						or codes[i] == 0x5F) do
						i += 1;
					end;
					if codes[i] ~= 0x7D then
						return "unknown or malformed script name";
					end;
					local c_name = utf8_sub(codes.s, start_i, i);
					local script_set = chr_scripts[c_name];
					if script_set then
						table.insert(outln, { "charset", negate, script_set });
					elseif not valid_categories[c_name] then
						return "unknown or malformed script name";
					else
						table.insert(outln, { "category", negate, c_name });
					end;
				end;
			elseif escape_c == 0x67 and codes[i + 1] and (codes[i + 1] == 0x7B or codes[i + 1] >= 0x30 and codes[i + 1] <= 0x39) then
				local is_grouped = false;
				i += 1;
				if codes[i] == 0x7B then
					i += 1;
					is_grouped = true;
				elseif codes[i] < 0x30 or codes[i] > 0x39 then
					return "malformed reference code";
				end;
				local org_i = i;
				-- the reference is parsed as a decimal number, so only decimal digits belong to it
				while codes[i] and codes[i] >= 0x30 and codes[i] <= 0x39 do
					i += 1;
				end;
				if is_grouped and codes[i] ~= 0x7D then
					return "malformed reference code";
				end;
				-- i is one past the last digit in both forms, which is the exclusive end of the slice
				local ref_name = tonumber(utf8_sub(codes.s, org_i, i));
				table.insert(outln, { "backref", ref_name });
				if not is_grouped then
					-- step back so the trailing increment lands on the first unconsumed code point
					i -= 1;
				end;
			elseif escape_c == 0x6F then
				i += 1;
				if codes[i] ~= 0x7B then
					return "malformed octal code";
				end;
				i += 1;
				local org_i = i;
				while codes[i] and codes[i] >= 0x30 and codes[i] <= 0x37 do
					i += 1;
				end;
				if codes[i] ~= 0x7D or i == org_i then
					return "malformed octal code";
				end;
				local ret_chr = tonumber(utf8_sub(codes.s, org_i, i), 8);
				if ret_chr > 0xFFFF then
					return "character offset too large";
				end;
				table.insert(outln, ret_chr);
			elseif escape_c == 0x78 then
				local radix0, radix1;
				i += 1;
				if codes[i] == 0x7B then
					i += 1;
					local org_i = i;
					while codes[i] and
						(codes[i] >= 0x30 and codes[i] <= 0x39
						or codes[i] >= 0x41 and codes[i] <= 0x46
						or codes[i] >= 0x61 and codes[i] <= 0x66) do
						i += 1;
					end;
					if codes[i] ~= 0x7D or i == org_i then
						return "malformed hexadecimal code";
					elseif i - org_i > 4 then
						return "character offset too large";
					end;
					table.insert(outln, tonumber(utf8_sub(codes.s, org_i, i), 16));
				else
					if codes[i] and (codes[i] >= 0x30 and codes[i] <= 0x39 or codes[i] >= 0x41 and codes[i] <= 0x46 or codes[i] >= 0x61 and codes[i] <= 0x66) then
						radix0 = codes[i] - ((codes[i] >= 0x41 and codes[i] <= 0x5A) and 0x37 or (codes[i] >= 0x61 and codes[i] <= 0x7A) and 0x57 or 0x30);
						i += 1;
						if codes[i] and (codes[i] >= 0x30 and codes[i] <= 0x39 or codes[i] >= 0x41 and codes[i] <= 0x46 or codes[i] >= 0x61 and codes[i] <= 0x66) then
							radix1 = codes[i] - ((codes[i] >= 0x41 and codes[i] <= 0x5A) and 0x37 or (codes[i] >= 0x61 and codes[i] <= 0x7A) and 0x57 or 0x30);
						else
							i -= 1;
						end;
					else
						i -= 1;
					end;
					table.insert(outln, radix0 and (radix1 and 16 * radix0 + radix1 or radix0) or 0);
				end;
			else
				local esc_char = b_escape_chars[escape_c] or escape_chars[escape_c];
				table.insert(outln, esc_char or escape_c);
			end;
		elseif c == 0x2A or c == 0x2B or c == 0x3F or c == 0x7B then
			-- Quantifier
			local start_q, end_q;
			if c == 0x7B then
				local org_i = i + 1;
				local start_i;
				while codes[i + 1] and (codes[i + 1] >= 0x30 and codes[i + 1] <= 0x39 or codes[i + 1] == 0x2C and not start_i and i + 1 ~= org_i) do
					i += 1;
					if codes[i] == 0x2C then
						start_i = i;
					end;
				end;
				if codes[i + 1] == 0x7D then
					i += 1;
					if not start_i then
						start_q = tonumber(utf8_sub(codes.s, org_i, i));
						end_q = start_q;
					else
						start_q, end_q = tonumber(utf8_sub(codes.s, org_i, start_i)), start_i + 1 == i and math.huge or tonumber(utf8_sub(codes.s, start_i + 1, i));
						if end_q < start_q then
							return "numbers out of order in {} quantifier";
						end;
					end;
				else
					-- not a valid {m,n}: keep the scanned code points as literals
					table.move(codes, org_i - 1, i, #outln + 1, outln);
				end;
			else
				start_q, end_q = c == 0x2B and 1 or 0, c == 0x3F and 1 or math.huge;
			end;
			if start_q then
				local quantifier_type = flags.ungreedy and "lazy" or "greedy";
				if codes[i + 1] == 0x2B or codes[i + 1] == 0x3F then
					i += 1;
					quantifier_type = codes[i] == 0x2B and "possessive" or flags.ungreedy and "greedy" or "lazy";
				end;
				local outln_len = #outln;
				local last_outln_value = outln[outln_len];
				if not last_outln_value or type(last_outln_value) == "table" and (last_outln_value[1] == "quantifier" or last_outln_value[1] == 0x28 or b_escape_chars[last_outln_value[1]])
					or last_outln_value == alternation or type(last_outln_value) == "string" then
					return "quantifier doesn't follow a repeatable pattern";
				end;
				if end_q == 0 then
					table.remove(outln);
				elseif start_q ~= 1 or end_q ~= 1 then
					if type(last_outln_value) == "table" and last_outln_value[1] == 0x29 then
						-- a quantified group is wrapped at its opening token
						outln_len = last_outln_value[3];
					end;
					outln[outln_len] = { "quantifier", start_q, end_q, quantifier_type, outln[outln_len] };
				end;
			end;
		elseif c == 0x7C then
			-- Alternation
			table.insert(outln, alternation);
			local i1 = #outln;
			repeat
				i1 -= 1;
				local v1, is_table = outln[i1], type(outln[i1]) == "table";
				if is_table and v1[1] == 0x29 then
					i1 = outln[i1][3];
				elseif is_table and v1[1] == 0x28 then
					if v1[4] == 0x7C then
						group_n = v1[5];
					end;
					break;
				end;
			until not v1;
		elseif c == 0x24 or c == 0x5E then
			table.insert(outln, c == 0x5E and beginning_str or end_str);
		elseif flags.ignoreCase and c >= 0x61 and c <= 0x7A then
			table.insert(outln, c - 0x20);
		elseif flags.extended and (c >= 0x09 and c <= 0x0D or c == 0x20 or c == 0x23) then
			if c == 0x23 then
				repeat
					i += 1;
				until not codes[i] or codes[i] == 0x0A or codes[i] == 0x0D;
			end;
		else
			table.insert(outln, c);
		end;
		i += 1;
	end;
	-- Final pass: check every group was closed, find the highest group number and
	-- resolve back-references / recursion targets
	local max_group_n = 0;
	for _, v in ipairs(outln) do
		if type(v) == "table" and (v[1] == 0x28 or v[1] == "quantifier" and type(v[5]) == "table" and v[5][1] == 0x28) then
			if v[1] == "quantifier" then
				v = v[5];
			end;
			if not v[3] then
				return "unterminated parenthetical";
			elseif v[2] then
				max_group_n = math.max(max_group_n, v[2]);
			end;
		elseif type(v) == "table" and (v[1] == "backref" or v[1] == "recurmatch") then
			if not group_id[v[2]] and (type(v[2]) ~= "number" or v[2] > group_n) then
				return "reference to a non-existent or invalid subpattern";
			elseif v[1] == "recurmatch" and v[2] ~= 0 then
				for i1, v1 in ipairs(outln) do
					if type(v1) == "table" and v1[1] == 0x28 and v1[2] == v[2] then
						v[3] = i1;
						break;
					end;
				end;
			elseif type(v[2]) == "string" then
				v[2] = group_id[v[2]];
			end;
		end;
	end;
	outln.group_n = max_group_n;
	return outln, group_id, verb_flags;
end;
1886
1887
-- Validate options.cacheSize. nil and false are accepted and mean "no cache"; any
-- other value has to be convertible to a number.
if options.cacheSize and not tonumber(options.cacheSize) then
	error(string.format("expected number for options.cacheSize, got %s", typeof(options.cacheSize)), 2);
end;
-- cacheSize is false when caching is disabled (nil, false, or a size that floors to 0),
-- otherwise it is the requested size as a number.
local requestedCacheSize = tonumber(options.cacheSize) or 0;
local cacheSize = math.floor(requestedCacheSize) ~= 0 and requestedCacheSize;
-- Parallel arrays: cache_pattern_names[k] is the key of the entry stored in cache_pattern[k].
local cache_pattern, cache_pattern_names;
if not cacheSize then
	-- caching disabled, nothing to allocate
elseif cacheSize < 0 or cacheSize ~= cacheSize then
	error("cache size cannot be a negative number or a NaN", 2);
elseif cacheSize == math.huge then
	cache_pattern, cache_pattern_names = { nil }, { nil };
elseif cacheSize >= 2 ^ 32 then
	error("cache size too large", 2);
else
	cache_pattern, cache_pattern_names = table.create(cacheSize), table.create(cacheSize);
end;
if cacheSize then
	-- Empties the pattern cache. The name is a historical misspelling of "purge" and is
	-- kept for backward compatibility.
	function re.pruge()
		table.clear(cache_pattern_names);
		table.clear(cache_pattern);
	end;
	-- Correctly spelled alias of re.pruge.
	re.purge = re.pruge;
end;
1908
1909
-- Builds a RegEx proxy object from an already prepared pattern.
--   str_arr       pattern array handed to tokenize_ptn; its `s` field is part of the cache key
--   flags         flag table; it becomes the object's __index table (with re_m as its metatable)
--   flag_repr     flag string stored on the object and used in the cache key
--   pattern_repr  pattern text stored on the object
-- When caching is enabled the tokenizer output is reused for an identical pattern/flag pair.
-- A string returned by tokenize_ptn is treated as an error message and raised.
local function new_re(str_arr, flags, flag_repr, pattern_repr)
	local tokens, names, verbs;
	local cache_key, cache_entry;
	if cacheSize then
		cache_key = string.format("%s|%s", str_arr.s, flag_repr);
		cache_entry = cache_pattern[table.find(cache_pattern_names, cache_key)];
	end;

	if cache_entry then
		tokens, names, verbs = cache_entry[1], cache_entry[2], cache_entry[3];
	else
		tokens, names, verbs = tokenize_ptn(str_arr, flags);
		if type(tokens) == "string" then
			error(tokens, 2);
		end;
		-- patterns whose token list is empty are never cached
		if cacheSize and tokens[1] then
			-- newest entry goes to the front of both lists
			table.insert(cache_pattern_names, 1, cache_key);
			table.insert(cache_pattern, 1, { tokens, names, verbs });
			if cacheSize ~= math.huge then
				-- bounded cache: drop whatever was pushed past the capacity
				table.remove(cache_pattern_names, cacheSize + 1);
				table.remove(cache_pattern, cacheSize + 1);
			end;
		end;
	end;

	local object = newproxy(true);
	proxy[object] = {
		name = "RegEx",
		flags = flags,
		flag_repr = flag_repr,
		pattern_repr = pattern_repr,
		token = tokens,
		group_id = names,
		verb_flags = verbs,
	};

	local meta = getmetatable(object);
	meta.__index = setmetatable(flags, re_m);
	meta.__tostring = re_tostr;
	meta.__metatable = lockmsg;

	return object;
end;
1939
1940
-- gsub callback for the pattern "(\\*)/" used when building a /.../ representation.
-- pre: the run of backslashes found directly in front of a forward slash.
-- Returns the replacement for that run plus the slash: the slash itself is kept, and an
-- extra backslash is added when the run has even length (i.e. the slash was not escaped yet).
local function escape_fslash(pre)
	return (#pre % 2 == 0 and '\\' or '') .. pre .. '/';
end;
1943
1944
-- table.sort comparator: orders two flag characters by their lowercase form.
local function sort_flag_chr(a, b)
	local lhs, rhs = string.lower(a), string.lower(b);
	return lhs < rhs;
end;
1947
1948
-- Creates a RegEx object from a pattern and an optional string of flag characters.
--   ptn        string (a number is converted to its string form)
--   flags_str  string, number or nil; every character is looked up in flag_map
-- Raises (blaming the caller) when an argument has the wrong type, or when a flag
-- character is unknown or given more than once.
function re.new(...)
	if select('#', ...) == 0 then
		error("missing argument #1 (string expected)", 2);
	end;
	local ptn, flags_str = ...;
	if type(ptn) == "number" then
		ptn ..= '';
	elseif type(ptn) ~= "string" then
		error(string.format("invalid argument #1 (string expected, got %s)", typeof(ptn)), 2);
	end;
	if type(flags_str) ~= "string" and type(flags_str) ~= "number" and flags_str ~= nil then
		error(string.format("invalid argument #2 (string expected, got %s)", typeof(flags_str)), 2);
	end;

	local flags = {
		anchored = false, caseless = false, multiline = false, dotall = false, unicode = false, ungreedy = false, extended = false,
	};
	local flag_repr = { };
	for f in string.gmatch(flags_str or '', utf8.charpattern) do
		-- the lookup yields false only for a known flag that has not been set yet;
		-- nil means an unknown flag character and true means a repeated one
		if flags[flag_map[f]] ~= false then
			-- level 2, like every other argument error here, so the caller of re.new is blamed
			error("invalid regular expression flag " .. f, 2);
		end;
		flags[flag_map[f]] = true;
		table.insert(flag_repr, f);
	end;
	table.sort(flag_repr, sort_flag_chr);
	flag_repr = table.concat(flag_repr);
	-- the extra parentheses drop gsub's replacement count so only the string reaches string.format
	return new_re(to_str_arr(ptn), flags, flag_repr, string.format("/%s/", (ptn:gsub("(\\*)/", escape_fslash))));
end;
1977
1978
-- Creates a RegEx object from a delimited literal such as "/pattern/flags".
--   ptn  string (a number is converted to its string form)
-- The first element of the pattern array is the delimiter; the pattern ends at the next
-- occurrence of it that is not preceded by an odd number of backslashes, and everything
-- after that is read as flag characters.
-- Raises (blaming the caller) on a wrong argument type, an empty string, an alphanumeric
-- or backslash delimiter, a missing closing delimiter, or an unknown/repeated flag.
function re.fromstring(...)
	if select('#', ...) == 0 then
		error("missing argument #1 (string expected)", 2);
	end;
	local ptn = ...;
	if type(ptn) == "number" then
		ptn ..= '';
	elseif type(ptn) ~= "string" then
		-- the level belongs to error(), not to string.format()
		error(string.format("invalid argument #1 (string expected, got %s)", typeof(ptn)), 2);
	end;
	local str_arr = to_str_arr(ptn);
	local delimiter = str_arr[1];
	if not delimiter then
		error("empty regex", 2);
	elseif delimiter == 0x5C or (delimiter >= 0x30 and delimiter <= 0x39) or (delimiter >= 0x41 and delimiter <= 0x5A) or (delimiter >= 0x61 and delimiter <= 0x7A) then
		error("delimiter must not be alphanumeric or a backslash", 2);
	end;

	-- find the closing delimiter: the first one preceded by an even number of backslashes
	local i0 = 1;
	repeat
		i0 = table.find(str_arr, delimiter, i0 + 1);
		if not i0 then
			error(string.format("no ending delimiter ('%s') found", utf8.char(delimiter)), 2);
		end;
		-- escape_count ends up as 1 + the number of backslashes directly before position i0
		local escape_count = 1;
		while str_arr[i0 - escape_count] == 0x5C do
			escape_count += 1;
		end;
	until escape_count % 2 == 1;

	local flags = {
		anchored = false, caseless = false, multiline = false, dotall = false, unicode = false, ungreedy = false, extended = false,
	};
	local flag_repr = { };
	-- everything after the closing delimiter is a flag; pop them off the end of the array
	while str_arr.n > i0 do
		local f = utf8.char(table.remove(str_arr));
		str_arr.n -= 1;
		-- false only for a known flag that has not been set yet
		if flags[flag_map[f]] ~= false then
			-- level 2, like every other argument error here, so the caller of re.fromstring is blamed
			error("invalid regular expression flag " .. f, 2);
		end;
		flags[flag_map[f]] = true;
		table.insert(flag_repr, f);
	end;
	table.sort(flag_repr, sort_flag_chr);
	flag_repr = table.concat(flag_repr);
	-- strip the opening and closing delimiters from the array
	table.remove(str_arr, 1);
	table.remove(str_arr);
	str_arr.n -= 2;
	-- NOTE(review): string.sub indexes bytes while str_arr.n counts array elements; if to_str_arr
	-- produces one element per code point these differ for non-ASCII input - confirm.
	str_arr.s = string.sub(str_arr.s, 2, 1 + str_arr.n);
	return new_re(str_arr, flags, flag_repr, string.sub(ptn, 1, 2 + str_arr.n));
end;
2029
2030
-- Maps control characters to the escape sequence that re.escape writes in their place.
local re_escape_line_chrs = {
	['\0'] = '\\x00',
	['\t'] = '\\t',
	['\n'] = '\\n',
	['\f'] = '\\f',
	['\r'] = '\\r',
};
2033
2034
-- Escapes a string so that it can be embedded in a regex pattern as literal text.
--   str        string to escape (a number is converted to its string form)
--   extended   when truthy, whitespace is escaped too
--   delimiter  optional single character that is escaped as well; it must not be
--              alphanumeric or a backslash (the same rule re.fromstring applies)
-- NUL, form feed, newline, carriage return and tab are written as the sequences listed in
-- re_escape_line_chrs; every other special character gets a backslash in front of it.
-- Raises (blaming the caller) on wrong argument types or an unusable delimiter.
function re.escape(...)
	if select('#', ...) == 0 then
		error("missing argument #1 (string expected)", 2);
	end;
	local str, extended, delimiter = ...;
	if type(str) == "number" then
		str ..= '';
	elseif type(str) ~= "string" then
		error(string.format("invalid argument #1 to 'escape' (string expected, got %s)", typeof(str)), 2);
	end;
	if delimiter == nil then
		delimiter = '';
	elseif type(delimiter) == "number" then
		delimiter ..= '';
	elseif type(delimiter) ~= "string" then
		error(string.format("invalid argument #3 to 'escape' (string expected, got %s)", typeof(delimiter)), 2);
	end;
	-- utf8.len yields nil for invalid UTF-8, which must be rejected instead of compared;
	-- digits are rejected too: a backslash in front of a digit would read as a backreference
	local delimiter_len = utf8.len(delimiter);
	if not delimiter_len or delimiter_len > 1 or delimiter:match("^[%w\\]$") then
		error("delimiter must be a single character that is not alphanumeric or a backslash", 2);
	end;

	-- A single-byte delimiter joins the character class; the '%' prefix keeps any
	-- non-alphanumeric byte literal inside a Lua pattern class.
	local delimiter_class = '';
	if #delimiter == 1 then
		delimiter_class = '%' .. delimiter;
	end;
	local special = "[\0\f\n\r\t\\#$()%%*+.?[%]^{|" .. (extended and '%s' or '') .. delimiter_class .. "]";
	-- One pass over the input, so a backslash written for a control character is never escaped again.
	local escaped = string.gsub(str, special, function(chr)
		return re_escape_line_chrs[chr] or ('\\' .. chr);
	end);
	if #delimiter > 1 then
		-- A multi-byte delimiter is matched as a whole sequence so other characters sharing
		-- one of its bytes stay intact; its bytes are all >= 0x80 and carry no pattern magic.
		escaped = string.gsub(escaped, delimiter, "\\%0");
	end;
	return escaped;
end;
2056
2057
-- Returns the `name` stored in the proxy registry for the given value,
-- or the falsy registry lookup result when the value is not registered.
-- Raises when called without any argument.
function re.type(...)
	if select('#', ...) == 0 then
		error("missing argument #1", 2);
	end;
	local value = ...;
	local entry = proxy[value];
	if entry then
		return entry.name;
	end;
	return entry;
end;
2063
2064
-- TODO: table.foreach is currently used as top-level loops needlessly increase native code size for this module
-- Expose every method defined on re_m as a field of re as well.
table.foreach(re_m, function(method_name, method)
	re[method_name] = method;
end);

-- From here on re_m is a metatable whose __index is the original method table.
re_m = { __index = re_m };

-- The value reported by getmetatable() for protected objects is itself a RegEx object.
-- It is created first and only then locked with itself, so the order of these two lines matters.
lockmsg = re.fromstring([[/The\s*metatable\s*is\s*(?:locked|inaccessible)(?#Nice try :])/i]]);
getmetatable(lockmsg).__metatable = lockmsg;

-- __newindex handler shared by the read-only tables below.
local function readonly_table()
	error("Attempt to modify a readonly table", 2);
end;

-- match_m likewise becomes a locked, read-only metatable over the original match methods.
match_m = {
	__newindex = readonly_table,
	__metatable = lockmsg,
	__index = match_m,
};

re.Match = setmetatable({ }, match_m);

-- The module value is an empty, read-only table that forwards lookups to re.
return setmetatable({ }, {
	__newindex = readonly_table,
	__metatable = lockmsg,
	__index = re,
});
2089
2090