-- Module-wide configuration.
local options = {
-- Not read anywhere in the visible part of this file; presumably the size limit of a compiled-pattern cache — TODO confirm.
cacheSize = 256,
-- When false, none of the Unicode data modules below are loaded and tokenize_ptn rejects the unicode flag.
unicodeData = false,
};
-- Unicode lookup data. Each of these is `false` unless options.unicodeData is truthy.
-- u_categories is indexed by code point and yields a general-category string (see tkn_char_match).
local u_categories = options.unicodeData and require(script:WaitForChild("_unicodechar_category"));
local chr_scripts = options.unicodeData and require(script:WaitForChild("_scripts"));
-- xuc_chr is passed to tkn_char_match as the token for the "Xuc" category.
local xuc_chr = options.unicodeData and require(script:WaitForChild("_xuc"));
-- Weak-keyed map from the public userdata objects (Match / RegEx) to their private data tables.
local proxy = setmetatable({ }, { __mode = 'k' });
-- re: module table (check_re calls re.fromstring); re_m: RegEx methods; match_m: Match methods.
local re, re_m, match_m = { }, { }, { };
-- Assigned to __metatable of Match objects in new_match; it is not given a value in the visible part of the file.
local lockmsg;
-- Decode a UTF-8 string into an array of code points.
-- self: the string to decode.
-- init: optional 1-based code point index; when given, decoding starts there and the
--       returned `s` is the sliced string.
-- Returns a table holding the code points at 1..n, plus `n` (code point count) and
-- `s` (the string that was decoded).
local function to_str_arr(self, init)
	-- utf8.codepoint returns every code point as a separate value, so long strings
	-- are decoded in slices of at most this many code points.
	local CHUNK = 1999;
	local text = self;
	if init then
		text = string.sub(text, utf8.offset(text, init));
	end;
	local total = utf8.len(text);
	if total <= CHUNK then
		return { n = total, s = text, utf8.codepoint(text, 1, #text) };
	end;
	local codes = table.create(total);
	local write_at = 1;
	local first_cp = 1;
	while first_cp <= total do
		-- Code point index just past this slice (total + 1 for the final slice).
		local next_cp = math.min(first_cp + CHUNK, total + 1);
		local chunk = table.pack(utf8.codepoint(text, utf8.offset(text, first_cp), utf8.offset(text, next_cp) - 1));
		table.move(chunk, 1, chunk.n, write_at, codes);
		write_at += chunk.n;
		first_cp = next_cp;
	end;
	codes.s, codes.n = text, total;
	return codes;
end;
-- Encode an array of code points back into a UTF-8 string.
-- self: sequence of code points at indices 1..#self.
-- Returns the encoded string ("" for an empty array).
--
-- The length is always taken from the sequence itself rather than from `self.n`:
-- callers such as re_m.sub insert into / remove from the array without updating
-- `n`, and a stale `n` made the chunked path unpack past the end of the data (or
-- drop the tail). The short path already ignored `n`, so both paths now agree.
local function from_str_arr(self)
	-- table.unpack pushes every element as a separate value, so long arrays are
	-- encoded in slices of at most this many code points.
	local CHUNK = 7997;
	local len = #self;
	if len <= CHUNK then
		return utf8.char(table.unpack(self, 1, len));
	end;
	local pieces = { };
	for first = 1, len, CHUNK do
		pieces[#pieces + 1] = utf8.char(table.unpack(self, first, math.min(first + CHUNK - 1, len)));
	end;
	return table.concat(pieces);
end;
-- Substring by code point positions: returns the text from code point `i` up to,
-- but not including, code point `j`. When `j` lies beyond the string (utf8.offset
-- returns nil) the result runs to the end of the string.
local function utf8_sub(self, i, j)
	local stop_byte = utf8.offset(self, j);
	local first_byte = utf8.offset(self, i);
	if stop_byte then
		return string.sub(self, first_byte, stop_byte - 1);
	end;
	return string.sub(self, first_byte, nil);
end;
-- Single-letter flag -> long flag name.
local flag_map = {
a = 'anchored', i = 'caseless', m = 'multiline', s = 'dotall', u = 'unicode', U = 'ungreedy', x ='extended',
};
-- Set of recognised POSIX class names (the same names tkn_char_match handles for "class" tokens).
local posix_class_names = {
alnum = true, alpha = true, ascii = true, blank = true, cntrl = true, digit = true, graph = true, lower = true, print = true, punct = true, space = true, upper = true, word = true, xdigit = true,
};
-- Escape table keyed by the code point that follows a backslash.
-- A table value is a token: { "class", name, negated } or a one-element marker token
-- (0x4E 'N', 0x52 'R'); a number value is the literal code point the escape stands for.
-- re_m.sub only honours the numeric entries when expanding a replacement string.
local escape_chars = {
-- \D \S \W (negated) and \d \s \w
[0x44] = { "class", "digit", true }, [0x53] = { "class", "space", true }, [0x57] = { "class", "word", true },
[0x64] = { "class", "digit", false }, [0x73] = { "class", "space", false }, [0x77] = { "class", "word", false },
-- \H \V (negated) and \h \v
[0x48] = { "class", "blank", true }, [0x56] = { "class", "vertical_tab", true },
[0x68] = { "class", "blank", false }, [0x76] = { "class", "vertical_tab", false },
-- \N and \R, matched by tkn_char_match through their first element
[0x4E] = { 0x4E }, [0x52] = { 0x52 },
-- NOTE(review): key 0x42 is 'B' and maps to backspace (0x08); the usual backspace escape is \b (0x62) — confirm this key is intended.
[0x42] = 0x08,
-- \n \r \t
[0x6E] = 0x0A, [0x72] = 0x0D, [0x74] = 0x09,
};
-- Escapes that become zero-width / control tokens: \b and \B carry the "word" class they
-- test against; re_rawfind handles 0x4B (\K), 0x47 (\G), 0x5A (\Z) and 0x7A (\z).
-- 0x4A is not handled by the matcher code visible in this part of the file.
local b_escape_chars = {
[0x62] = { 0x62, { "class", "word", false } }, [0x42] = { 0x42, { "class", "word", false } },
[0x4B] = { 0x4B },
[0x47] = { 0x47 }, [0x4A] = { 0x4A }, [0x5A] = { 0x5A }, [0x7A] = { 0x7A },
};
-- Set of accepted category names: Unicode general categories plus the X* extensions
-- that tkn_char_match treats specially (Xan, Xps, Xsp, Xuc, Xwd).
local valid_categories = {
C = true, Cc = true, Cf = true, Cn = true, Co = true, Cs = true,
L = true, Ll = true, Lm = true, Lo = true, Lt = true, Lu = true,
M = true, Mc = true, Me = true, Mn = true,
N = true, Nd = true, Nl = true, No = true,
P = true, Pc = true, Pd = true, Pe = true, Pf = true, Pi = true, Po = true, Ps = true,
S = true, Sc = true, Sk = true, Sm = true, So = true,
Z = true, Zl = true, Zp = true, Zs = true,
Xan = true, Xps = true, Xsp = true, Xuc = true, Xwd = true,
};
-- ASCII punctuation code points; used by tkn_char_match for the non-Unicode "punct" class.
local class_ascii_punct = {
[0x21] = true, [0x22] = true, [0x23] = true, [0x24] = true, [0x25] = true, [0x26] = true, [0x27] = true, [0x28] = true, [0x29] = true, [0x2A] = true, [0x2B] = true, [0x2C] = true, [0x2D] = true, [0x2E] = true, [0x2F] = true,
[0x3A] = true, [0x3B] = true, [0x3C] = true, [0x3D] = true, [0x3E] = true, [0x3F] = true, [0x40] = true, [0x5B] = true, [0x5C] = true, [0x5D] = true, [0x5E] = true, [0x5F] = true, [0x60] = true, [0x7B] = true, [0x7C] = true,
[0x7D] = true, [0x7E] = true,
};
-- Shared singleton tokens for `$`, `.`, `^` and `|`.
-- `alternation` is compared by identity in find_alternation, so it must stay a single shared table.
local end_str = { 0x24 };
local dot = { 0x2E };
local beginning_str = { 0x5E };
local alternation = { 0x7C };
-- Wrap a method implementation with argument checking and proxy unwrapping.
-- re_type: "Match" for Match methods; anything else is treated as a RegEx method.
-- name:    public method name, used in error messages and to select per-method handling.
-- func:    the implementation. It receives the private data table stored in `proxy`
--          in place of the public object.
-- Returns the wrapped function. Argument errors are raised at level 2 (the caller).
--
-- Fixes relative to the previous version:
--  * the subject string is validated/stringified even when the pattern was given as a
--    string (that check used to sit in an `elseif` of the pattern conversion and was skipped);
--  * a negative `init` is resolved against the code point length, because to_str_arr
--    treats `init` as a code point index (byte length is the fallback for invalid UTF-8);
--  * the `source` argument is also supplied for methods registered as "matchall".
local function check_re(re_type, name, func)
	if re_type == "Match" then
		return function(...)
			if select('#', ...) < 1 then
				error("missing argument #1 (Match expected)", 2);
			end;
			local arg0, arg1 = ...;
			local match_data = proxy[arg0];
			if not (match_data and match_data.name == "Match") then
				error(string.format("invalid argument #1 to %q (Match expected, got %s)", name, typeof(arg0)), 2);
			end;
			-- group() and span() default to group 0, the whole match.
			if arg1 == nil and (name == "group" or name == "span") then
				arg1 = 0;
			end;
			return func(match_data, arg1);
		end;
	end;
	return function(...)
		local arg_n = select('#', ...);
		if arg_n < 1 then
			error("missing argument #1 (RegEx expected)", 2);
		elseif arg_n < 2 then
			error("missing argument #2 (string expected)", 2);
		end;
		local arg0, arg1, arg2, arg3, arg4, arg5 = ...;
		local re_data = proxy[arg0];
		if not (re_data and re_data.name == "RegEx") then
			-- A plain pattern (string or number) is compiled on the fly.
			if type(arg0) ~= "string" and type(arg0) ~= "number" then
				error(string.format("invalid argument #1 to %q (RegEx expected, got %s)", name, typeof(arg0)), 2);
			end;
			arg0 = re.fromstring(arg0);
		end;
		-- Validate the subject string: argument #3 for sub, argument #2 for everything else.
		if name == "sub" then
			if type(arg2) == "number" then
				arg2 ..= '';
			elseif type(arg2) ~= "string" then
				error(string.format("invalid argument #3 to 'sub' (string expected, got %s)", typeof(arg2)), 2);
			end;
		elseif type(arg1) == "number" then
			arg1 ..= '';
		elseif type(arg1) ~= "string" then
			error(string.format("invalid argument #2 to %q (string expected, got %s)", name, typeof(arg1)), 2);
		end;
		-- Normalise the optional `init` position (not applicable to sub / split).
		if name ~= "sub" and name ~= "split" then
			local init_type = typeof(arg2);
			if init_type ~= 'nil' then
				arg2 = tonumber(arg2);
				if not arg2 then
					error(string.format("invalid argument #3 to %q (number expected, got %s)", name, init_type), 2);
				elseif arg2 < 0 then
					-- Negative positions count back from the end, in code points.
					local subject_len = utf8.len(arg1) or #arg1;
					arg2 = math.max(subject_len + math.floor(arg2 + 0.5) + 1, 1);
				else
					arg2 = math.max(math.floor(arg2 + 0.5), 1);
				end;
			end;
		end;
		arg0 = proxy[arg0];
		-- Methods that build Match objects receive the first original argument as `source`.
		if name == "match" or name == "matchiter" or name == "matchall" then
			arg3 = (...);
		elseif name == "sub" then
			arg5 = (...);
		end;
		return func(arg0, arg1, arg2, arg3, arg4, arg5);
	end;
end;
-- __tostring handler for Match objects.
-- Shows the inclusive code point range of the whole match and, unless the match is
-- empty, the matched text.
local function match_tostr(self)
	local data = proxy[self].spans;
	local whole = data[0];
	local first, stop = whole[1], whole[2];
	if first < stop then
		return string.format("Match (%d..%d): %s", first, stop - 1, utf8_sub(data.input, first, stop));
	end;
	return string.format("Match (%d..%d, empty)", first, stop - 1);
end;
-- Build the public Match userdata for a successful search.
-- span_arr: span table produced by re_rawfind ([0] = whole match, [k] = group k, n = group count);
--           it is annotated in place with `source` and `input` and becomes the object's __index.
-- group_id: group name -> group number map of the pattern.
-- re:       value exposed as Match.source.
-- str:      string the spans refer to, exposed as Match.input.
-- Returns the new userdata; its private data is registered in `proxy`.
local function new_match(span_arr, group_id, re, str)
	span_arr.source = re;
	span_arr.input = str;
	local handle = newproxy(true);
	local meta = getmetatable(handle);
	meta.__tostring = match_tostr;
	meta.__index = setmetatable(span_arr, match_m);
	meta.__metatable = lockmsg;
	proxy[handle] = { name = "Match", spans = span_arr, group_id = group_id };
	return handle;
end;
-- Match:group(group_id): text captured by a group.
-- group_id is a group number, or any other key that is looked up as a group name.
-- Returns nil when the group does not exist or did not take part in the match.
match_m.group = check_re('Match', 'group', function(self, group_id)
	local key = group_id;
	if type(key) ~= "number" then
		key = self.group_id[key];
	end;
	local bounds = self.spans[key];
	if bounds then
		return utf8_sub(self.spans.input, bounds[1], bounds[2]);
	end;
	return nil;
end);
-- Match:span(group_id): inclusive start and end code point positions of a group.
-- group_id is a group number, or any other key that is looked up as a group name.
-- Returns nil when the group does not exist or did not take part in the match.
match_m.span = check_re('Match', 'span', function(self, group_id)
	local key = group_id;
	if type(key) ~= "number" then
		key = self.group_id[key];
	end;
	local bounds = self.spans[key];
	if bounds then
		-- Spans are stored half-open; the public end position is inclusive.
		return bounds[1], bounds[2] - 1;
	end;
	return nil;
end);
-- Match:groups(): the text of every numbered group as multiple return values
-- (nil for groups that did not participate). A pattern without groups returns the
-- whole match instead.
match_m.groups = check_re('Match', 'groups', function(self)
	local spans = self.spans;
	local input = spans.input;
	if spans.n <= 0 then
		return utf8_sub(input, spans[0][1], spans[0][2]);
	end;
	local texts = table.create(spans.n);
	for idx = 0, spans.n do
		local bounds = spans[idx];
		if bounds then
			texts[idx] = utf8_sub(input, bounds[1], bounds[2]);
		end;
	end;
	return table.unpack(texts, 1, spans.n);
end);
-- Match:groupdict(): table mapping each group name to its captured text.
-- Named groups that did not participate in the match are omitted.
match_m.groupdict = check_re('Match', 'groupdict', function(self)
	local spans = self.spans;
	local input = spans.input;
	local by_name = { };
	for group_name, group_number in pairs(self.group_id) do
		local bounds = spans[group_number];
		if bounds then
			by_name[group_name] = utf8_sub(input, bounds[1], bounds[2]);
		end;
	end;
	return by_name;
end);
-- Match:grouparr(): table holding the captured text of every group, indexed by group
-- number ([0] is the whole match), with `n` set to the number of groups. Groups that
-- did not participate are left nil.
-- Registered under its own name so argument errors report "grouparr" (it was
-- previously registered as 'groupdict').
match_m.grouparr = check_re('Match', 'grouparr', function(self)
	local spans = self.spans;
	local ret = table.create(spans.n);
	for i = 0, spans.n do
		local v = spans[i];
		if v then
			ret[i] = utf8_sub(spans.input, v[1], v[2]);
		end;
	end;
	ret.n = spans.n;
	return ret;
end);
-- Newline-convention verb names and their numeric codes. is_newline compares
-- verb_flags.newline against these numbers (1, LF, is its fall-through default).
-- NOTE(review): tokenize_ptn appears to store the verb name itself, not this number, in verb_flags.newline — confirm.
local line_verbs = {
CR = 0, LF = 1, CRLF = 2, ANYRLF = 3, ANY = 4, NUL = 5,
};
-- Report whether the code point at str_arr[i] ends a line under the active newline convention.
-- str_arr:    array of code points.
-- i:          index to test.
-- verb_flags: table whose `newline` field is one of the line_verbs codes
--             (0 CR, 1 LF, 2 CRLF, 3 ANYCRLF, 4 ANY, 5 NUL); any other value behaves like LF.
-- Returns a boolean.
local function is_newline(str_arr, i, verb_flags)
	local mode = verb_flags.newline;
	local chr = str_arr[i];
	if mode == 0 then
		return chr == 0x0D;
	elseif mode == 2 then
		-- CRLF: a line feed immediately preceded by a carriage return.
		-- (The previous code compared the preceding character with 0x20, a space.)
		return chr == 0x0A and str_arr[i - 1] == 0x0D;
	elseif mode == 3 then
		return chr == 0x0A or chr == 0x0D;
	elseif mode == 4 then
		return chr == 0x0A or chr == 0x0B or chr == 0x0C or chr == 0x0D or chr == 0x85 or chr == 0x2028 or chr == 0x2029;
	elseif mode == 5 then
		return chr == 0;
	end;
	return chr == 0x0A;
end;
-- Test whether the code point at str_arr[i] satisfies a single-character pattern token.
-- tkn_part:   a literal code point (number) or a token table whose first element is
--             "charset" { _, negated, members }, "range" { _, low, high },
--             "class" { _, name, negated }, "category" { _, negated, name },
--             0x2E (dot), 0x4E (\N) or 0x52 (\R).
-- str_arr:    array of code points.
-- i:          index of the code point to test; an out-of-range index never matches.
-- flags:      pattern flags; ignoreCase, unicode and dotAll are read here.
-- verb_flags: newline / newline_seq settings.
-- Returns a truthy value on match and a falsy value otherwise.
--
-- Fixes relative to the previous version:
--  * Unicode "graph" excluded punctuation (P) instead of separators (Z), so it rejected
--    punctuation and accepted spaces; it is now "print" minus separators;
--  * "vertical_tab" (\v) now includes U+0085, as the \R branch below already does.
local function tkn_char_match(tkn_part, str_arr, i, flags, verb_flags)
	local chr = str_arr[i];
	if not chr then
		return false;
	elseif flags.ignoreCase and chr >= 0x61 and chr <= 0x7A then
		-- Case-insensitive matching compares against the upper-case ASCII form.
		chr = chr - 0x20;
	end;
	if type(tkn_part) == "number" then
		return tkn_part == chr;
	elseif tkn_part[1] == "charset" then
		-- Any member matching decides the result; tkn_part[2] is the negation flag.
		for _, v in ipairs(tkn_part[3]) do
			if tkn_char_match(v, str_arr, i, flags, verb_flags) then
				return not tkn_part[2];
			end;
		end;
		return tkn_part[2];
	elseif tkn_part[1] == "range" then
		-- chr may have been upper-cased above, so its lower-case form is also tried.
		return chr >= tkn_part[2] and chr <= tkn_part[3] or flags.ignoreCase and chr >= 0x41 and chr <= 0x5A and (chr + 0x20) >= tkn_part[2] and (chr + 0x20) <= tkn_part[3];
	elseif tkn_part[1] == "class" then
		local char_class = tkn_part[2];
		local negate = tkn_part[3];
		local match = false;
		if char_class == "xdigit" then
			match = chr >= 0x30 and chr <= 0x39 or chr >= 0x41 and chr <= 0x46 or chr >= 0x61 and chr <= 0x66;
		elseif char_class == "ascii" then
			match = chr <= 0x7F;
		elseif char_class == "vertical_tab" then
			match = chr >= 0x0A and chr <= 0x0D or chr == 0x85 or chr == 0x2028 or chr == 0x2029;
		elseif flags.unicode then
			-- Unicode-aware classes are decided by general category.
			local current_category = u_categories[chr] or 'Cn';
			local first_category = current_category:sub(1, 1);
			if char_class == "alnum" then
				match = first_category == 'L' or current_category == 'Nl' or current_category == 'Nd';
			elseif char_class == "alpha" then
				match = first_category == 'L' or current_category == 'Nl';
			elseif char_class == "blank" then
				match = current_category == 'Zs' or chr == 0x09;
			elseif char_class == "cntrl" then
				match = current_category == 'Cc';
			elseif char_class == "digit" then
				match = current_category == 'Nd';
			elseif char_class == "graph" then
				match = first_category ~= 'Z' and first_category ~= 'C';
			elseif char_class == "lower" then
				match = current_category == 'Ll';
			elseif char_class == "print" then
				match = first_category ~= 'C';
			elseif char_class == "punct" then
				match = first_category == 'P';
			elseif char_class == "space" then
				match = first_category == 'Z' or chr >= 0x09 and chr <= 0x0D;
			elseif char_class == "upper" then
				match = current_category == 'Lu';
			elseif char_class == "word" then
				match = first_category == 'L' or current_category == 'Nl' or current_category == 'Nd' or current_category == 'Pc';
			end;
		elseif char_class == "alnum" then
			match = chr >= 0x30 and chr <= 0x39 or chr >= 0x41 and chr <= 0x5A or chr >= 0x61 and chr <= 0x7A;
		elseif char_class == "alpha" then
			match = chr >= 0x41 and chr <= 0x5A or chr >= 0x61 and chr <= 0x7A;
		elseif char_class == "blank" then
			match = chr == 0x09 or chr == 0x20;
		elseif char_class == "cntrl" then
			match = chr <= 0x1F or chr == 0x7F;
		elseif char_class == "digit" then
			match = chr >= 0x30 and chr <= 0x39;
		elseif char_class == "graph" then
			match = chr >= 0x21 and chr <= 0x7E;
		elseif char_class == "lower" then
			match = chr >= 0x61 and chr <= 0x7A;
		elseif char_class == "print" then
			match = chr >= 0x20 and chr <= 0x7E;
		elseif char_class == "punct" then
			match = class_ascii_punct[chr];
		elseif char_class == "space" then
			match = chr >= 0x09 and chr <= 0x0D or chr == 0x20;
		elseif char_class == "upper" then
			match = chr >= 0x41 and chr <= 0x5A;
		elseif char_class == "word" then
			match = chr >= 0x30 and chr <= 0x39 or chr >= 0x41 and chr <= 0x5A or chr >= 0x61 and chr <= 0x7A or chr == 0x5F;
		end;
		if negate then
			return not match;
		end;
		return match;
	elseif tkn_part[1] == "category" then
		local chr_category = u_categories[chr] or 'Cn';
		local category_v = tkn_part[3];
		local category_len = #category_v;
		if category_len == 3 then
			-- Three-letter names are the X* extension categories.
			local match = false;
			if category_v == "Xan" or category_v == "Xwd" then
				match = chr_category:find("^[LN]") or category_v == "Xwd" and chr == 0x5F;
			elseif category_v == "Xps" or category_v == "Xsp" then
				match = chr_category:sub(1, 1) == 'Z' or chr >= 0x09 and chr <= 0x0D;
			elseif category_v == "Xuc" then
				match = tkn_char_match(xuc_chr, str_arr, i, flags, verb_flags);
			end;
			if tkn_part[2] then
				return not match;
			end;
			return match;
		elseif chr_category:sub(1, category_len) == category_v then
			-- One-letter names match every category that starts with that letter.
			return not tkn_part[2];
		end;
		return tkn_part[2];
	elseif tkn_part[1] == 0x2E then
		return flags.dotAll or not is_newline(str_arr, i, verb_flags);
	elseif tkn_part[1] == 0x4E then
		return not is_newline(str_arr, i, verb_flags);
	elseif tkn_part[1] == 0x52 then
		if verb_flags.newline_seq == 0 then
			return chr == 0x0A or chr == 0x0D;
		end;
		return chr == 0x0A or chr == 0x0B or chr == 0x0C or chr == 0x0D or chr == 0x85 or chr == 0x2028 or chr == 0x2029;
	end;
	return false;
end;
-- Scan forward from token index `i` for the next `|` that belongs to the current
-- nesting level; nested groups are skipped as a whole.
-- token: token list of the pattern.
-- i:     index to start scanning at.
-- count: optional running width. When given, the width of every token passed is added:
--        a group's `count` field, a quantifier's third element, or 1 for anything else.
-- Returns (index of the alternation token, count), or (nil, count) when the end of
-- the list or a closing parenthesis is reached first.
local function find_alternation(token, i, count)
	local pos = i;
	local total = count;
	while true do
		local item = token[pos];
		if item == alternation then
			return pos, total;
		elseif not item then
			return nil, total;
		end;
		local head = type(item) == "table" and item[1];
		-- A group is either a bare `(` token or the operand of a quantifier.
		local group;
		if head == 0x28 then
			group = item;
		elseif head == "quantifier" and type(item[5]) == "table" and item[5][1] == 0x28 then
			group = item[5];
		end;
		if group then
			if total then
				total += group.count;
			end;
			-- Jump to the group's closing token; the increment below steps past it.
			pos = group[3];
		elseif head == 0x29 then
			return nil, total;
		elseif total then
			if head == "quantifier" then
				total += item[3];
			else
				total += 1;
			end;
		end;
		pos += 1;
	end;
end;
-- Core backtracking matcher: search for the first match of `token` at or after `init`.
-- token:      compiled token list (token.group_n is the number of capture groups).
-- str_arr:    subject as produced by to_str_arr (code points at 1..n, `n`, `s`).
-- init:       code point index at which the search starts.
-- flags:      pattern flags (multiline is read here; others by tkn_char_match).
-- verb_flags: newline settings, passed to is_newline / tkn_char_match.
-- as_bool:    when truthy only success is reported (true / false).
-- Returns, when as_bool is falsy, a span table ([0] = { start, end_exclusive } of the
-- whole match, [k] = span of group k, n = group count) or nil when nothing matches.
--
-- `states` is a stack with its top at index 1. Entries are:
--   { "alternation", next_alt_token_index, str_i }
--   { "group", token_index, str_start, str_end, group_number, close_token_index, kind, ... }
--       quantified groups add [8] min, [9] max, [10] iteration, [11] next alternation, [12] mode
--   { "quantifier", token_index, str_i, limit_str_i, step }
--   { "matchStart", str_i }                       (\K)
--   { "PRUNE" | "SKIP", str_i }
--
-- Fixes relative to the previous version:
--  * \b / \B called tkn_char_match with a code point and `flags` in the (str_arr, i)
--    positions, which raised an index error; the array and index are passed now;
--  * restarting at the next start position left tkn_i at 1, bypassing the tkn_i == 0
--    branch that registers the top-level alternation, so later alternatives were only
--    tried at the first start position.
-- NOTE(review): the multiline `$` test looks at str_i + 1 while \Z and the \b end test
-- look at str_i — confirm which index is intended.
local function re_rawfind(token, str_arr, init, flags, verb_flags, as_bool)
	local tkn_i, str_i, start_i = 0, init, init;
	local states = { };
	while tkn_i do
		if tkn_i == 0 then
			-- (Re)start of an attempt: register the first top-level alternative, if any.
			tkn_i += 1;
			local next_alt = find_alternation(token, tkn_i);
			if next_alt then
				table.insert(states, 1, { "alternation", next_alt, str_i });
			end;
			continue;
		end;
		local ctkn = token[tkn_i];
		local tkn_type = type(ctkn) == "table" and ctkn[1];
		if not ctkn then
			-- Ran off the end of the token list: the attempt succeeded.
			break;
		elseif ctkn == "ACCEPT" then
			-- Succeed immediately, unless inside a lookaround: then continue at its closing token.
			local not_lookaround = true;
			local close_i = tkn_i;
			repeat
				close_i += 1;
				local is_table = type(token[close_i]) == "table";
				local close_i_tkn = token[close_i];
				if is_table and (close_i_tkn[1] == 0x28 or close_i_tkn[1] == "quantifier" and type(close_i_tkn[5]) == "table" and close_i_tkn[5][1] == 0x28) then
					close_i = close_i_tkn[1] == "quantifier" and close_i_tkn[5][3] or close_i_tkn[3];
				elseif is_table and close_i_tkn[1] == 0x29 and (close_i_tkn[4] == 0x21 or close_i_tkn[4] == 0x3D) then
					not_lookaround = false;
					tkn_i = close_i;
					break;
				end;
			until not close_i_tkn;
			if not_lookaround then
				break;
			end;
		elseif ctkn == "PRUNE" or ctkn == "SKIP" then
			table.insert(states, 1, { ctkn, str_i });
			tkn_i += 1;
		elseif tkn_type == 0x28 then
			-- Group open.
			table.insert(states, 1, { "group", tkn_i, str_i, nil, ctkn[2], ctkn[3], ctkn[4] });
			tkn_i += 1;
			-- A width is only counted for lookarounds flagged with ctkn[5] (lookbehind).
			local next_alt, count = find_alternation(token, tkn_i, (ctkn[4] == 0x21 or ctkn[4] == 0x3D) and ctkn[5] and 0);
			if next_alt then
				table.insert(states, 1, { "alternation", next_alt, str_i });
			end;
			if count then
				-- Step back by the counted width so the lookbehind body is matched before str_i.
				str_i -= count;
			end;
		elseif tkn_type == 0x29 and ctkn[4] ~= 0x21 then
			-- Group close (closing a negative lookaround is handled as a failure further down).
			if ctkn[4] == 0x21 or ctkn[4] == 0x3D then
				-- Lookaround succeeded: unwind to its opening state.
				while true do
					local selected_match_start;
					local selected_state = table.remove(states, 1);
					if selected_state[1] == "group" and selected_state[2] == ctkn[3] then
						if (ctkn[4] == 0x21 or ctkn[4] == 0x3D) and not ctkn[5] then
							-- Lookahead consumes nothing: restore the position.
							str_i = selected_state[3];
						end;
						if selected_match_start then
							table.insert(states, 1, selected_match_start);
						end;
						break;
					elseif selected_state[1] == "matchStart" and not selected_match_start and ctkn[4] == 0x3D then
						selected_match_start = selected_state;
					end;
				end;
			elseif ctkn[4] == 0x3E then
				-- Atomic group: discard every state created inside it.
				repeat
					local selected_state = table.remove(states, 1);
				until not selected_state or selected_state[1] == "group" and selected_state[2] == ctkn[3];
			else
				for i, v in ipairs(states) do
					if v[1] == "group" and v[2] == ctkn[3] then
						if v.jmp then
							-- Group was entered through a recursion token: resume after that token.
							tkn_i = v.jmp;
						end;
						v[4] = str_i;
						if v[7] == "quantifier" and v[10] + 1 < v[9] then
							-- More iterations allowed: loop back unless lazy and the minimum is met.
							if token[ctkn[3]][4] ~= "lazy" or v[10] + 1 < v[8] then
								tkn_i = ctkn[3];
							end;
							local ctkn1 = token[ctkn[3]];
							local new_group = { "group", v[2], str_i, nil, ctkn1[5][2], ctkn1[5][3], "quantifier", ctkn1[2], ctkn1[3], v[10] + 1, v[11], ctkn1[4] };
							table.insert(states, 1, new_group);
							if v[11] then
								table.insert(states, 1, { "alternation", v[11], str_i });
							end;
						end;
						break;
					end;
				end;
			end;
			tkn_i += 1;
		elseif tkn_type == 0x4B then
			-- \K: remember the position the reported match should start at.
			table.insert(states, 1, { "matchStart", str_i });
			tkn_i += 1;
		elseif tkn_type == 0x7C then
			-- Reached `|` after a successful alternative: skip to the end of the enclosing group.
			local close_i = tkn_i;
			repeat
				close_i += 1;
				local is_table = type(token[close_i]) == "table";
				local close_i_tkn = token[close_i];
				if is_table and (close_i_tkn[1] == 0x28 or close_i_tkn[1] == "quantifier" and type(close_i_tkn[5]) == "table" and close_i_tkn[5][1] == 0x28) then
					close_i = close_i_tkn[1] == "quantifier" and close_i_tkn[5][3] or close_i_tkn[3];
				end;
			until is_table and close_i_tkn[1] == 0x29 or not close_i_tkn;
			if token[close_i] then
				for _, v in ipairs(states) do
					if v[1] == "group" and v[6] == close_i then
						tkn_i = v[6];
						break;
					end;
				end;
			else
				tkn_i = close_i;
			end;
		elseif tkn_type == "recurmatch" then
			-- Recursion into a group: `jmp` records where to resume once the group closes.
			table.insert(states, 1, { "group", ctkn[3], str_i, nil, nil, token[ctkn[3]][3], nil, jmp = tkn_i });
			tkn_i = ctkn[3] + 1;
			local next_alt, count = find_alternation(token, tkn_i);
			if next_alt then
				table.insert(states, 1, { "alternation", next_alt, str_i });
			end;
		else
			local match;
			if ctkn == "FAIL" then
				match = false;
			elseif tkn_type == 0x29 then
				-- Only negative lookarounds reach here: their body matched, so the lookaround
				-- fails (`match` stays nil) after unwinding to the opening state.
				repeat
					local selected_state = table.remove(states, 1);
				until selected_state[1] == "group" and selected_state[2] == ctkn[3];
			elseif tkn_type == "quantifier" then
				if type(ctkn[5]) == "table" and ctkn[5][1] == 0x28 then
					-- Quantified group: open iteration 0.
					local next_alt = find_alternation(token, tkn_i + 1);
					if next_alt then
						table.insert(states, 1, { "alternation", next_alt, str_i });
					end;
					table.insert(states, next_alt and 2 or 1, { "group", tkn_i, str_i, nil, ctkn[5][2], ctkn[5][3], "quantifier", ctkn[2], ctkn[3], 0, next_alt, ctkn[4] });
					if ctkn[4] == "lazy" and ctkn[2] == 0 then
						-- Lazy with a zero minimum: try skipping the group first.
						tkn_i = ctkn[5][3];
					end;
					match = true;
				else
					-- Quantified single item (character token or back-reference).
					local start_i, end_i;
					local pattern_count = 1;
					local is_backref = type(ctkn[5]) == "table" and ctkn[5][1] == "backref";
					if is_backref then
						pattern_count = 0;
						local group_n = ctkn[5][2];
						for _, v in ipairs(states) do
							if v[1] == "group" and v[5] == group_n then
								start_i, end_i = v[3], v[4];
								pattern_count = end_i - start_i;
								break;
							end;
						end;
					end;
					local min_max_i = str_i + ctkn[2] * pattern_count;
					local mcount = 0;
					while mcount < ctkn[3] do
						if is_backref then
							if start_i and end_i then
								local org_i = str_i;
								if utf8_sub(str_arr.s, start_i, end_i) ~= utf8_sub(str_arr.s, org_i, str_i + pattern_count) then
									break;
								end;
							else
								break;
							end;
						elseif not tkn_char_match(ctkn[5], str_arr, str_i, flags, verb_flags) then
							break;
						end;
						str_i += pattern_count;
						mcount += 1;
					end;
					match = mcount >= ctkn[2];
					if match and ctkn[4] ~= "possessive" then
						if ctkn[4] == "lazy" then
							-- Lazy: start at the minimum and grow when backtracking.
							min_max_i, str_i = str_i, min_max_i;
						end;
						table.insert(states, 1, { "quantifier", tkn_i, str_i, math.min(min_max_i, str_arr.n + 1), (ctkn[4] == "lazy" and 1 or -1) * pattern_count });
					end;
				end;
			elseif tkn_type == "backref" then
				local start_i, end_i;
				local group_n = ctkn[2];
				for _, v in ipairs(states) do
					if v[1] == "group" and v[5] == group_n then
						start_i, end_i = v[3], v[4];
						break;
					end;
				end;
				if start_i and end_i then
					local org_i = str_i;
					str_i += end_i - start_i;
					match = utf8_sub(str_arr.s, start_i, end_i) == utf8_sub(str_arr.s, org_i, str_i);
				end;
			else
				if tkn_type == 0x24 or tkn_type == 0x5A or tkn_type == 0x7A then
					-- `$`, \Z, \z
					match = str_i == str_arr.n + 1 or tkn_type == 0x24 and flags.multiline and is_newline(str_arr, str_i + 1, verb_flags) or tkn_type == 0x5A and str_i == str_arr.n and is_newline(str_arr, str_i, verb_flags);
				elseif tkn_type == 0x5E or tkn_type == 0x41 or tkn_type == 0x47 then
					-- `^`, \A, \G
					match = str_i == 1 or tkn_type == 0x5E and flags.multiline and is_newline(str_arr, str_i - 1, verb_flags) or tkn_type == 0x47 and str_i == init;
				elseif tkn_type == 0x42 or tkn_type == 0x62 then
					-- \b / \B: ctkn[2] is the "word" class token.
					local start_m = str_i == 1 or flags.multiline and is_newline(str_arr, str_i - 1, verb_flags);
					local end_m = str_i == str_arr.n + 1 or flags.multiline and is_newline(str_arr, str_i, verb_flags);
					local prev_is_word = tkn_char_match(ctkn[2], str_arr, str_i - 1, flags, verb_flags);
					local cur_is_word = tkn_char_match(ctkn[2], str_arr, str_i, flags, verb_flags);
					-- 0: a word character precedes; 1: only the current character is a word character.
					local w_m = prev_is_word and 0 or cur_is_word and 1;
					if w_m == 0 then
						match = end_m or not cur_is_word;
					elseif w_m then
						match = start_m or not prev_is_word;
					end;
					if tkn_type == 0x42 then
						match = not match;
					end;
				else
					-- Plain single-character token.
					match = tkn_char_match(ctkn, str_arr, str_i, flags, verb_flags);
					str_i += 1;
				end;
			end;
			if not match then
				-- Backtrack: pop states until one offers another path.
				while true do
					local prev_type, prev_state = states[1] and states[1][1], states[1];
					if not prev_type or prev_type == "PRUNE" or prev_type == "SKIP" then
						-- Nothing left for this start position.
						if prev_type then
							table.clear(states);
						end;
						if start_i > str_arr.n then
							if as_bool then
								return false;
							end;
							return nil;
						end;
						start_i = prev_type == "SKIP" and prev_state[2] or start_i + 1;
						-- -1 so that the increment after this loop yields 0 and the restart
						-- branch at the top re-registers the top-level alternation.
						tkn_i, str_i = -1, start_i;
						break;
					elseif prev_type == "alternation" then
						tkn_i, str_i = prev_state[2], prev_state[3];
						local next_alt, count = find_alternation(token, tkn_i + 1);
						if next_alt then
							prev_state[2] = next_alt;
						else
							table.remove(states, 1);
						end;
						if count then
							str_i -= count;
						end;
						break;
					elseif prev_type == "group" then
						if prev_state[7] == "quantifier" then
							if prev_state[12] == "greedy" and prev_state[10] >= prev_state[8]
								or prev_state[12] == "lazy" and prev_state[10] < prev_state[9] and not prev_state[13] then
								tkn_i, str_i = prev_state[12] == "greedy" and prev_state[6] or prev_state[2], prev_state[3];
								if prev_state[12] == "greedy" then
									table.remove(states, 1);
									break;
								elseif prev_state[10] >= prev_state[8] then
									prev_state[13] = true;
									break;
								end;
							end;
						elseif prev_state[7] == 0x21 then
							-- The body of a negative lookaround failed, so the lookaround succeeds.
							table.remove(states, 1);
							tkn_i, str_i = prev_state[6], prev_state[3];
							break;
						end;
					elseif prev_type == "quantifier" then
						-- Give back (greedy) or take (lazy) one more repetition while within the limit.
						if math.sign(prev_state[4] - prev_state[3]) == math.sign(prev_state[5]) then
							prev_state[3] += prev_state[5];
							tkn_i, str_i = prev_state[2], prev_state[3];
							break;
						end;
					end;
					table.remove(states, 1);
				end;
			end;
			tkn_i += 1;
		end;
	end;
	if as_bool then
		return true;
	end;
	-- Collect the spans: the most recent state for each group wins.
	local match_start_ran = false;
	local span = table.create(token.group_n);
	span[0], span.n = { start_i, str_i }, token.group_n;
	for _, v in ipairs(states) do
		if v[1] == "matchStart" and not match_start_ran then
			span[0][1], match_start_ran = v[2], true;
		elseif v[1] == "group" and v[5] and not span[v[5]] then
			span[v[5]] = { v[3], v[4] };
		end;
	end;
	return span;
end;
-- RegEx:test(str, init): true when the pattern matches anywhere in `str`
-- (starting from code point `init` when given), false otherwise.
re_m.test = check_re('RegEx', 'test', function(self, str, init)
	local subject = to_str_arr(str, init);
	return re_rawfind(self.token, subject, 1, self.flags, self.verb_flags, true);
end);
-- RegEx:match(str, init): first match of the pattern in `str`, or nil.
-- When `init` is given the search runs on the string sliced at that code point, and
-- the reported positions are relative to that slice.
-- The Match is created over the sliced string (as matchall does); it used to receive
-- the unsliced string, so group() returned the wrong text whenever init > 1.
-- `source` is supplied by check_re and becomes Match.source.
re_m.match = check_re('RegEx', 'match', function(self, str, init, source)
	local subject = to_str_arr(str, init);
	local span = re_rawfind(self.token, subject, 1, self.flags, self.verb_flags, false);
	if not span then
		return nil;
	end;
	return new_match(span, self.group_id, source, subject.s);
end);
-- RegEx:matchall(str, init): iterator over successive non-overlapping matches.
-- Each call of the returned function yields the next Match, or nil when exhausted.
-- Positions are relative to the string sliced at `init`.
re_m.matchall = check_re('RegEx', 'matchall', function(self, str, init, source)
	local subject = to_str_arr(str, init);
	local cursor = 1;
	local function next_match()
		if cursor > subject.n + 1 then
			return nil;
		end;
		local span = re_rawfind(self.token, subject, cursor, self.flags, self.verb_flags, false);
		if not span then
			return nil;
		end;
		local m_start, m_end = span[0][1], span[0][2];
		cursor = m_end;
		if m_start >= m_end then
			-- An empty match must still advance, otherwise it would be found again.
			cursor += 1;
		end;
		return new_match(span, self.group_id, source, subject.s);
	end;
	return next_match;
end);
-- Expand a tokenized replacement into code points, appending to `repl_r`.
-- repl_r: output array (appended to in place; `n` is refreshed before returning).
-- str:    subject code point array the spans refer to.
-- span:   span table of the current match ([k] = { start, end_exclusive }).
-- tkn:    replacement tokens: a group number, a literal chunk (array of code points), or
--         { "condition", group, when_matched, when_unmatched } whose branches are token
--         lists (a falsy when_matched inserts the group text itself).
-- Returns repl_r.
local function insert_tokenized_sub(repl_r, str, span, tkn)
	for _, part in ipairs(tkn) do
		if type(part) ~= "table" then
			-- Group reference: copy the captured text when the group participated.
			local bounds = span[part];
			if bounds then
				table.move(str, bounds[1], bounds[2] - 1, #repl_r + 1, repl_r);
			end;
		elseif part[1] ~= "condition" then
			-- Literal chunk.
			table.move(part, 1, #part, #repl_r + 1, repl_r);
		else
			local bounds = span[part[2]];
			if bounds then
				if part[3] then
					insert_tokenized_sub(repl_r, str, span, part[3]);
				else
					table.move(str, bounds[1], bounds[2] - 1, #repl_r + 1, repl_r);
				end;
			elseif part[4] then
				insert_tokenized_sub(repl_r, str, span, part[4]);
			end;
		end;
	end;
	repl_r.n = #repl_r;
	return repl_r;
end;
-- RegEx:sub(repl, str, n, repl_flag_str): replace matches of the pattern in `str`.
-- repl:          replacement: a string (with $n / \n / ${name} / ${n:+a:b} / ${n:-d}
--                substitutions unless flag 'l' is set), a function called with the Match,
--                or, with flag 'o', a table indexed by the matched text.
-- str:           subject string.
-- n:             maximum number of replacements; nil, NaN or <= -1 mean unlimited.
-- repl_flag_str: optional flag letters: 'l' literal replacement, 'o' enables table
--                replacements and keeps the match when the callback/table yields nothing,
--                'u' permits references to groups the pattern does not define.
-- source:        supplied by check_re; becomes Match.source for callback replacements.
-- Returns the resulting string and the number of replacements made.
-- Raises (level 3, i.e. at the caller of the public method) on invalid arguments or a
-- malformed replacement pattern.
--
-- Fixes relative to the previous version:
--  * a plain literal replacement set `n` via `repl, repl.n = ...`, whose outcome depends
--    on the unspecified order of a multiple assignment; it is now two statements;
--  * the conditional stack was pushed at the back but read/popped at the front, so
--    nested ${n:+...} closed the outer conditional first; it is a proper stack now;
--  * the result is built in a separate array instead of editing the subject in place,
--    which left `n` and `s` stale for later matches, back-references and callback
--    Matches, and made every replacement O(length);
--  * removed an unused local.
re_m.sub = check_re('RegEx', 'sub', function(self, repl, str, n, repl_flag_str, source)
	if repl_flag_str ~= nil and type(repl_flag_str) ~= "number" and type(repl_flag_str) ~= "string" then
		error(string.format("invalid argument #5 to 'sub' (string expected, got %s)", typeof(repl_flag_str)), 3);
	end;
	local repl_flags = {
		l = false, o = false, u = false,
	};
	for f in string.gmatch(repl_flag_str or '', utf8.charpattern) do
		-- Unknown letters are nil and repeated letters are already true: both are rejected.
		if repl_flags[f] ~= false then
			error("invalid regular expression substitution flag " .. f, 3);
		end;
		repl_flags[f] = true;
	end;
	local repl_type = type(repl);
	if repl_type == "number" then
		repl ..= '';
	elseif repl_type ~= "string" and repl_type ~= "function" and (not repl_flags.o or repl_type ~= "table") then
		error(string.format("invalid argument #2 to 'sub' (string/function%s expected, got %s)", repl_flags.o and "/table" or '', typeof(repl)), 3);
	end;
	if tonumber(n) then
		n = tonumber(n);
		if n <= -1 or n ~= n then
			n = math.huge;
		end;
	elseif n ~= nil then
		error(string.format("invalid argument #4 to 'sub' (number expected, got %s)", typeof(n)), 3);
	else
		n = math.huge;
	end;
	if n < 1 then
		return str, 0;
	end;
	-- Number of literal code points in the replacement; used only to pre-size output tables.
	local min_repl_n = 0;
	if repl_type == "string" then
		repl = to_str_arr(repl);
		if not repl_flags.l then
			-- Tokenize the replacement into literal chunks, group references and conditions.
			local i1 = 0;
			local repl_r = table.create(3);
			local group_n = self.token.group_n;
			-- Stack of open conditionals (innermost at index 1):
			-- { group, true for the ":-" form / false for ":+" / first branch once ":" was seen, start index in repl_r }
			local conditional_c = { };
			while i1 < repl.n do
				-- Scan a literal run up to the next `$`, `\`, or, inside a conditional, `:` / `}`.
				local i2 = i1;
				repeat
					i2 += 1;
				until not repl[i2] or repl[i2] == 0x24 or repl[i2] == 0x5C or (repl[i2] == 0x3A or repl[i2] == 0x7D) and conditional_c[1];
				min_repl_n += i2 - i1 - 1;
				if i2 - i1 > 1 then
					table.insert(repl_r, table.move(repl, i1 + 1, i2 - 1, 1, table.create(i2 - i1 - 1)));
				end;
				if repl[i2] == 0x3A then
					-- `:` separates the two branches of ${n:+yes:no}.
					local current_conditional_c = conditional_c[1];
					if current_conditional_c[2] then
						error("malformed substitution pattern", 3);
					end;
					current_conditional_c[2] = table.move(repl_r, current_conditional_c[3], #repl_r, 1, table.create(#repl_r + 1 - current_conditional_c[3]));
					for i3 = #repl_r, current_conditional_c[3], -1 do
						repl_r[i3] = nil;
					end;
				elseif repl[i2] == 0x7D then
					-- `}` closes the innermost conditional.
					local current_conditional_c = table.remove(conditional_c, 1);
					local second_c = table.move(repl_r, current_conditional_c[3], #repl_r, 1, table.create(#repl_r + 1 - current_conditional_c[3]));
					for i3 = #repl_r, current_conditional_c[3], -1 do
						repl_r[i3] = nil;
					end;
					table.insert(repl_r, { "condition", current_conditional_c[1], current_conditional_c[2] ~= true and (current_conditional_c[2] or second_c), current_conditional_c[2] and second_c });
				elseif repl[i2] then
					-- `$` or `\`: look at the character that follows.
					i2 += 1;
					local subst_c = repl[i2];
					if not subst_c then
						if repl[i2 - 1] == 0x5C then
							error("replacement string must not end with a trailing backslash", 3);
						end;
						-- A trailing `$` is kept literally.
						local prev_repl_f = repl_r[#repl_r];
						if type(prev_repl_f) == "table" then
							table.insert(prev_repl_f, repl[i2 - 1]);
						else
							table.insert(repl_r, { repl[i2 - 1] });
						end;
					elseif subst_c == 0x5C and repl[i2 - 1] == 0x24 then
						-- `$` followed by a backslash: literal `$`; the backslash is rescanned.
						local prev_repl_f = repl_r[#repl_r];
						if type(prev_repl_f) == "table" then
							table.insert(prev_repl_f, 0x24);
						else
							table.insert(repl_r, { 0x24 });
						end;
						i2 -= 1;
						min_repl_n += 1;
					elseif subst_c == 0x30 then
						-- $0 / \0: the whole match.
						table.insert(repl_r, 0);
					elseif subst_c > 0x30 and subst_c <= 0x39 then
						-- $n / \n with a multi-digit group number.
						local group_i = subst_c - 0x30;
						while repl[i2 + 1] and repl[i2 + 1] >= 0x30 and repl[i2 + 1] <= 0x39 do
							group_i ..= repl[i2 + 1] - 0x30;
							i2 += 1;
						end;
						group_i = tonumber(group_i);
						if not repl_flags.u and group_i > group_n then
							error("reference to non-existent subpattern", 3);
						end;
						table.insert(repl_r, group_i);
					elseif subst_c == 0x7B and repl[i2 - 1] == 0x24 then
						-- ${name}, ${n}, ${n:+...} or ${n:-...}
						i2 += 1;
						local start_i2 = i2;
						while repl[i2] and
							(repl[i2] >= 0x30 and repl[i2] <= 0x39
							or repl[i2] >= 0x41 and repl[i2] <= 0x5A
							or repl[i2] >= 0x61 and repl[i2] <= 0x7A
							or repl[i2] == 0x5F) do
							i2 += 1;
						end;
						if (repl[i2] == 0x7D or repl[i2] == 0x3A and (repl[i2 + 1] == 0x2B or repl[i2 + 1] == 0x2D)) and i2 ~= start_i2 then
							local group_k = utf8_sub(repl.s, start_i2, i2);
							if repl[start_i2] >= 0x30 and repl[start_i2] <= 0x39 then
								group_k = tonumber(group_k);
								if not repl_flags.u and group_k > group_n then
									error("reference to non-existent subpattern", 3);
								end;
							else
								group_k = self.group_id[group_k];
								if not repl_flags.u and (not group_k or group_k > group_n) then
									error("reference to non-existent subpattern", 3);
								end;
							end;
							if repl[i2] == 0x3A then
								i2 += 1;
								-- Push on top so nested conditionals close innermost-first.
								table.insert(conditional_c, 1, { group_k, repl[i2] == 0x2D, #repl_r + 1 });
							else
								table.insert(repl_r, group_k);
							end;
						else
							error("malformed substitution pattern", 3);
						end;
					else
						local c_escape_char;
						if repl[i2 - 1] == 0x24 then
							-- `$x` stays `$x`; `$$` collapses to a single `$`.
							if subst_c ~= 0x24 then
								local prev_repl_f = repl_r[#repl_r];
								if type(prev_repl_f) == "table" then
									table.insert(prev_repl_f, 0x24);
								else
									table.insert(repl_r, { 0x24 });
								end;
							end;
						else
							-- Backslash escape: only escapes that stand for a literal code point apply.
							c_escape_char = escape_chars[repl[i2]];
							if type(c_escape_char) ~= "number" then
								c_escape_char = nil;
							end;
						end;
						local prev_repl_f = repl_r[#repl_r];
						if type(prev_repl_f) == "table" then
							table.insert(prev_repl_f, c_escape_char or repl[i2]);
						else
							table.insert(repl_r, { c_escape_char or repl[i2] });
						end;
						min_repl_n += 1;
					end;
				end;
				i1 = i2;
			end;
			if conditional_c[1] then
				error("malformed substitution pattern", 3);
			end;
			if not repl_r[2] and type(repl_r[1]) == "table" and repl_r[1][1] ~= "condition" then
				-- A single literal chunk: treat it as a plain string replacement.
				repl = repl_r[1];
				repl.n = #repl;
			else
				repl, repl_type = repl_r, "subst_string";
			end;
		end;
	end;
	local src = to_str_arr(str);
	-- Output buffer; `copied_to` is the first subject index not yet written to it.
	local out, out_n = { }, 0;
	local copied_to = 1;
	local i0, count = 1, 0;
	while i0 <= src.n + 1 do
		local span = re_rawfind(self.token, src, i0, self.flags, self.verb_flags, false);
		if not span then
			break;
		end;
		local m_start, m_end = span[0][1], span[0][2];
		local repl_r;
		if repl_type == "string" then
			repl_r = repl;
		elseif repl_type == "subst_string" then
			repl_r = insert_tokenized_sub(table.create(min_repl_n), src, span, repl);
		else
			local re_match;
			local repl_c;
			if repl_type == "table" then
				re_match = utf8_sub(src.s, m_start, m_end);
				repl_c = repl[re_match];
			else
				re_match = new_match(span, self.group_id, source, src.s);
				repl_c = repl(re_match);
			end;
			if repl_c == re_match or repl_flags.o and not repl_c then
				-- Keep the matched text unchanged.
				local keep_n = math.max(m_end - m_start, 0);
				repl_r = table.move(src, m_start, m_end - 1, 1, table.create(keep_n));
				repl_r.n = keep_n;
			elseif type(repl_c) == "string" then
				repl_r = to_str_arr(repl_c);
			elseif type(repl_c) == "number" then
				repl_r = to_str_arr(repl_c .. '');
			elseif repl_flags.o then
				error(string.format("invalid replacement value (a %s)", type(repl_c)), 3);
			else
				repl_r = { n = 0 };
			end;
		end;
		-- Copy the untouched text before the match, then the replacement.
		if m_start > copied_to then
			table.move(src, copied_to, m_start - 1, out_n + 1, out);
			out_n += m_start - copied_to;
		end;
		table.move(repl_r, 1, repl_r.n, out_n + 1, out);
		out_n += repl_r.n;
		copied_to = math.max(copied_to, m_start, m_end);
		i0 = m_end;
		if m_end <= m_start then
			-- An empty match must still advance the search position.
			i0 += 1;
		end;
		count += 1;
		if n < count + 1 then
			break;
		end;
	end;
	-- Copy whatever follows the last match.
	if copied_to <= src.n then
		table.move(src, copied_to, src.n, out_n + 1, out);
		out_n += src.n - copied_to + 1;
	end;
	out.n = out_n;
	return from_str_arr(out), count;
end);
-- RegEx:split(str, n): split `str` around matches of the pattern.
-- n: maximum number of splits; nil, NaN or <= -1 mean unlimited.
-- Returns an array of the pieces between matches; the last element is the remainder
-- of the string.
re_m.split = check_re('RegEx', 'split', function(self, str, n)
	local limit = tonumber(n);
	if limit then
		if limit <= -1 or limit ~= limit then
			limit = math.huge;
		end;
	elseif n ~= nil then
		error(string.format("invalid argument #3 to 'split' (number expected, got %s)", typeof(n)), 3);
	else
		limit = math.huge;
	end;
	local subject = to_str_arr(str);
	local pieces = { };
	-- prev_empty is 1 after an empty match: the cursor was pushed one code point
	-- further, and that code point still belongs to the next piece.
	local cursor, attempts, prev_empty = 1, 0, 0;
	while cursor <= subject.n + 1 do
		attempts += 1;
		if limit < attempts then
			break;
		end;
		local span = re_rawfind(self.token, subject, cursor, self.flags, self.verb_flags, false);
		if not span then
			break;
		end;
		local m_start, m_end = span[0][1], span[0][2];
		table.insert(pieces, utf8_sub(subject.s, cursor - prev_empty, m_start));
		prev_empty = m_start >= m_end and 1 or 0;
		cursor = m_end + prev_empty;
	end;
	local tail = string.sub(subject.s, utf8.offset(subject.s, cursor - prev_empty));
	table.insert(pieces, tail);
	return pieces;
end);
-- __index handler for RegEx objects: methods come first, then the pattern's flags.
local function re_index(self, index)
	local method = re_m[index];
	if method then
		return method;
	end;
	return proxy[self].flags[index];
end;
-- __tostring handler for RegEx objects: the stored pattern representation followed
-- by the stored flag representation.
local function re_tostr(self)
	local data = proxy[self];
	return data.pattern_repr .. data.flag_repr;
end;
-- Set of code points: ':' (0x3A), '!' (0x21), '=' (0x3D), '>' (0x3E) and '|' (0x7C).
-- Presumably the characters accepted directly after "(?" besides the cases the tokenizer
-- handles explicitly — TODO confirm against tokenize_ptn.
local other_valid_group_char = {
[0x3A] = true,
[0x21] = true, [0x3D] = true,
[0x3E] = true,
[0x7C] = true,
};
-- Converts a pattern into the flat token list stored on a RegEx object.
-- @param codes  code point array; `codes.n` is its length and `codes.s` the source string
-- @param flags  flag table; `unicode`, `ungreedy` and `extended` are read here
-- @return on success: outln (token array, with `outln.group_n` set), group_id
--         (group name -> group number) and verb_flags
-- @return on failure: a single string holding the error message
local function tokenize_ptn(codes, flags)
	if flags.unicode and not options.unicodeData then
		return "options.unicodeData cannot be turned off while having unicode flag";
	end;
	local i, len = 1, codes.n;
	local group_n = 0;
	local outln, group_id, verb_flags = { }, { }, {
		newline = 1, newline_seq = 1, not_empty = 0,
	};

	-- Code point predicates. All of them accept nil so that running off the end
	-- of the pattern yields an error message instead of a nil comparison error.
	local function is_digit(cp)
		return cp ~= nil and cp >= 0x30 and cp <= 0x39;
	end;
	local function is_octal(cp)
		return cp ~= nil and cp >= 0x30 and cp <= 0x37;
	end;
	local function is_hex(cp)
		return cp ~= nil and (cp >= 0x30 and cp <= 0x39 or cp >= 0x41 and cp <= 0x46 or cp >= 0x61 and cp <= 0x66);
	end;
	local function is_word(cp)
		return cp ~= nil and (cp >= 0x30 and cp <= 0x39 or cp >= 0x41 and cp <= 0x5A or cp >= 0x61 and cp <= 0x7A or cp == 0x5F);
	end;
	-- Numeric value of a code point already known to be a hexadecimal digit.
	local function hex_value(cp)
		return cp - (cp >= 0x61 and 0x57 or cp >= 0x41 and 0x37 or 0x30);
	end;
	-- Reads up to two hexadecimal digits starting at `first`.
	-- Returns the value (0 when there is no digit) and the index of the last code consumed.
	local function read_hex_pair(first)
		local hi = codes[first];
		if not is_hex(hi) then
			return 0, first - 1;
		end;
		local lo = codes[first + 1];
		if is_hex(lo) then
			return 16 * hex_value(hi) + hex_value(lo), first + 1;
		end;
		return hex_value(hi), first;
	end;
	-- Reads one to three octal digits; codes[first] must already be an octal digit.
	-- Returns the value and the index of the last digit consumed.
	local function read_octal(first)
		local value, j = codes[first] - 0x30, first;
		for _ = 1, 2 do
			if not is_octal(codes[j + 1]) then
				break;
			end;
			j += 1;
			value = 8 * value + (codes[j] - 0x30);
		end;
		return value, j;
	end;
	-- Scans a non-empty run of codes accepted by `accepts` that must be followed by '}'.
	-- Returns the index of that '}', or nil when the sequence is malformed.
	local function scan_braced(first, accepts)
		local j = first;
		while accepts(codes[j]) do
			j += 1;
		end;
		if codes[j] ~= 0x7D or j == first then
			return nil;
		end;
		return j;
	end;
	-- Parses \p / \P where `at` is the index of the 'p' or 'P'.
	-- Returns token, nil, index of the last code consumed; or nil, error message.
	local function read_property(at)
		if not options.unicodeData then
			return nil, "options.unicodeData cannot be turned off when using \\p";
		end;
		-- Upper-case \P negates, in the single-letter form as well as the braced one.
		local negate = codes[at] == 0x50;
		local j = at + 1;
		if codes[j] ~= 0x7B then
			local c_name = utf8.char(codes[j] or 0);
			if not valid_categories[c_name] then
				return nil, "unknown or malformed script name";
			end;
			return { "category", negate, c_name }, nil, j;
		end;
		j += 1;
		if codes[j] == 0x5E then
			j += 1;
			negate = not negate;
		end;
		local start_j = j;
		while is_word(codes[j]) do
			j += 1;
		end;
		if codes[j] ~= 0x7D then
			return nil, "unknown or malformed script name";
		end;
		local c_name = utf8_sub(codes.s, start_j, j);
		local script_set = chr_scripts[c_name];
		if script_set then
			return { "charset", negate, script_set }, nil, j;
		elseif not valid_categories[c_name] then
			return nil, "unknown or malformed script name";
		end;
		return { "category", negate, c_name }, nil, j;
	end;

	while i <= len do
		local c = codes[i];
		if c == 0x28 then
			-- '(' : group opener, verb or inline construct
			local ret;
			if codes[i + 1] == 0x2A then
				-- "(*" : verb, or a named group opener such as "(*atomic:"
				i += 2;
				local start_i = i;
				while is_word(codes[i]) do
					i += 1;
				end;
				-- The name stops at the first ':' so the group body is not
				-- swallowed into the verb name.
				local has_colon = codes[i] == 0x3A;
				if has_colon then
					i += 1;
				end;
				if codes[i] ~= 0x29 and not has_colon then
					return "quantifier doesn't follow a repeatable pattern";
				end;
				local selected_verb = utf8_sub(codes.s, start_i, i);
				-- "negative_lookhead:" is a historical misspelling kept for compatibility.
				if selected_verb == "positive_lookahead:" or selected_verb == "negative_lookahead:" or selected_verb == "negative_lookhead:"
					or selected_verb == "positive_lookbehind:" or selected_verb == "negative_lookbehind:"
					or selected_verb:find("^[pn]l[ab]:$") then
					ret = { 0x28, nil, nil, selected_verb:find('^n') and 0x21 or 0x3D, selected_verb:find('b', 3, true) and 1 };
					-- Step back onto the ':' so the increment at the end of the loop
					-- lands on the first code of the group body.
					i -= 1;
				elseif selected_verb == "atomic:" then
					ret = { 0x28, nil, nil, 0x3E, nil };
					i -= 1;
				elseif selected_verb == "ACCEPT" or selected_verb == "FAIL" or selected_verb == 'F' or selected_verb == "PRUNE" or selected_verb == "SKIP" then
					ret = selected_verb == 'F' and "FAIL" or selected_verb;
				else
					if line_verbs[selected_verb] then
						verb_flags.newline = selected_verb;
					elseif selected_verb == "BSR_ANYCRLF" or selected_verb == "BSR_UNICODE" then
						verb_flags.newline_seq = selected_verb == "BSR_UNICODE" and 1 or 0;
					elseif selected_verb == "NOTEMPTY" or selected_verb == "NOTEMPTY_ATSTART" then
						verb_flags.not_empty = selected_verb == "NOTEMPTY" and 1 or 2;
					else
						return "unknown or malformed verb";
					end;
					if outln[1] then
						return "this verb must be placed at the beginning of the regex";
					end;
				end;
			elseif codes[i + 1] == 0x3F then
				-- "(?" : extended group syntax
				i += 2;
				if codes[i] == 0x23 then
					-- "(?#...)" comment: skip to the closing parenthesis
					i = table.find(codes, 0x29, i);
					if not i then
						return "unterminated parenthetical";
					end;
					i += 1;
					continue;
				elseif not codes[i] then
					return "unterminated parenthetical";
				end;
				ret = { 0x28, nil, nil, codes[i], nil };
				if codes[i] == 0x30 and codes[i + 1] == 0x29 then
					-- "(?0)" : recurse into the whole pattern
					ret[1], ret[2], ret[3], ret[5] = "recurmatch", 0, 0, nil;
					-- Consume the ')' here; otherwise it is tokenised as a group close.
					i += 1;
				elseif codes[i] > 0x30 and codes[i] <= 0x39 then
					-- "(?N)" : recurse into group N
					local org_i = i;
					i += 1;
					while is_digit(codes[i]) do
						i += 1;
					end;
					if codes[i] ~= 0x29 then
						return "invalid group structure";
					end;
					ret[1], ret[2], ret[4] = "recurmatch", tonumber(utf8_sub(codes.s, org_i, i)), nil;
				elseif codes[i] == 0x3C and (codes[i + 1] == 0x21 or codes[i + 1] == 0x3D) then
					-- "(?<=" / "(?<!" : lookbehind
					i += 1;
					ret[4], ret[5] = codes[i], 1;
				elseif codes[i] == 0x7C then
					-- "(?|" : remember the group counter at entry
					ret[5] = group_n;
				elseif codes[i] == 0x50 or codes[i] == 0x3C or codes[i] == 0x27 then
					if codes[i] == 0x50 then
						i += 1;
					end;
					if codes[i] == 0x3D then
						-- "(?P=name)" : back-reference by name
						i += 1;
						local start_i = i;
						while is_word(codes[i]) do
							i += 1;
						end;
						if not codes[i] then
							return "unterminated parenthetical";
						elseif codes[i] ~= 0x29 or i == start_i then
							return "invalid group structure";
						end;
						ret = { "backref", utf8_sub(codes.s, start_i, i) };
					elseif codes[i] == 0x3C or codes[i - 1] ~= 0x50 and codes[i] == 0x27 then
						-- "(?<name>", "(?P<name>" or "(?'name'" : named capturing group
						local delimiter = codes[i] == 0x27 and 0x27 or 0x3E;
						local start_i = i + 1;
						i += 1;
						if not codes[i] then
							return "unterminated parenthetical";
						elseif codes[i] == 0x29 then
							return "missing character in subpattern";
						elseif codes[i] >= 0x30 and codes[i] <= 0x39 then
							return "subpattern name must not begin with a digit";
						elseif not (codes[i] >= 0x41 and codes[i] <= 0x5A or codes[i] >= 0x61 and codes[i] <= 0x7A or codes[i] == 0x5F) then
							return "invalid character in subpattern";
						end;
						i += 1;
						while is_word(codes[i]) do
							i += 1;
						end;
						if not codes[i] then
							return "unterminated parenthetical";
						elseif codes[i] ~= delimiter then
							return "invalid character in subpattern";
						end;
						local name = utf8_sub(codes.s, start_i, i);
						group_n += 1;
						if (group_id[name] or group_n) ~= group_n then
							return "subpattern name already exists";
						end;
						for name1, group_n1 in pairs(group_id) do
							if name ~= name1 and group_n == group_n1 then
								return "different names for subpatterns of the same number aren't permitted";
							end;
						end;
						group_id[name] = group_n;
						ret[2], ret[4] = group_n, nil;
					else
						return "invalid group structure";
					end;
				elseif not other_valid_group_char[codes[i]] then
					return "invalid group structure";
				end;
			else
				-- plain capturing group
				group_n += 1;
				ret = { 0x28, group_n, nil, nil };
			end;
			if ret then
				table.insert(outln, ret);
			end;
		elseif c == 0x29 then
			-- ')' : walk back to the nearest unclosed opener, tracking the width
			-- of each alternative so lookbehinds can be checked for fixed width.
			local i1 = #outln + 1;
			local lookbehind_c = -1;
			local current_lookbehind_c = 0;
			local max_c, group_c = 0, 0;
			repeat
				i1 -= 1;
				local v, is_table = outln[i1], type(outln[i1]) == "table";
				if is_table and v[1] == 0x28 then
					group_c += 1;
					if current_lookbehind_c and v.count then
						current_lookbehind_c += v.count;
					end;
					if not v[3] then
						if v[4] == 0x7C then
							group_n = v[5] + math.max(max_c, group_c);
						end;
						if current_lookbehind_c ~= lookbehind_c and lookbehind_c ~= -1 then
							lookbehind_c = nil;
						else
							lookbehind_c = current_lookbehind_c;
						end;
						break;
					end;
				elseif v == alternation then
					if current_lookbehind_c ~= lookbehind_c and lookbehind_c ~= -1 then
						lookbehind_c, current_lookbehind_c = nil, nil;
					else
						lookbehind_c, current_lookbehind_c = current_lookbehind_c, 0;
					end;
					max_c, group_c = math.max(max_c, group_c), 0;
				elseif current_lookbehind_c then
					if is_table and v[1] == "quantifier" then
						if v[2] == v[3] then
							current_lookbehind_c += v[2];
						else
							current_lookbehind_c = nil;
						end;
					else
						current_lookbehind_c += 1;
					end;
				end;
			until i1 < 1;
			if i1 < 1 then
				return "unmatched ) in regular expression";
			end;
			local v = outln[i1];
			local outln_len_p_1 = #outln + 1;
			local ret = { 0x29, v[2], i1, v[4], v[5], count = lookbehind_c };
			if (v[4] == 0x21 or v[4] == 0x3D) and v[5] and not lookbehind_c then
				return "lookbehind assertion is not fixed width";
			end;
			-- Link the opener to the index its closer is about to occupy.
			v[3] = outln_len_p_1;
			table.insert(outln, ret);
		elseif c == 0x2E then
			table.insert(outln, dot);
		elseif c == 0x5B then
			-- '[' : character class; members are prepended to `ret`, so ret[1] is
			-- always the most recently read member.
			local negate, char_class = false, nil;
			i += 1;
			local start_i = i;
			if codes[i] == 0x5E then
				negate = true;
				i += 1;
			elseif codes[i] == 0x2E or codes[i] == 0x3A or codes[i] == 0x3D then
				char_class = codes[i];
			end;
			local ret;
			if codes[i] == 0x5B or codes[i] == 0x5C then
				ret = { };
			else
				ret = { codes[i] };
				i += 1;
			end;
			while codes[i] ~= 0x5D do
				if not codes[i] then
					return "unterminated character class";
				elseif codes[i] == 0x2D and ret[1] and type(ret[1]) == "number" then
					-- '-' after a literal: range, unless it is the last member
					if codes[i + 1] == 0x5D then
						table.insert(ret, 1, 0x2D);
					else
						i += 1;
						local ret_c = codes[i];
						if not ret_c then
							return "unterminated character class";
						end;
						if ret_c == 0x5B then
							if codes[i + 1] == 0x2E or codes[i + 1] == 0x3A or codes[i + 1] == 0x3D then
								-- Search strictly after the previous hit; restarting at the
								-- same index would loop forever on an escaped ']'.
								local i1 = i + 1;
								repeat
									i1 = table.find(codes, 0x5D, i1 + 1);
								until not i1 or codes[i1 - 1] ~= 0x5C;
								if not i1 then
									return "unterminated character class";
								elseif codes[i1 - 1] == codes[i + 1] and i1 - 1 ~= i + 1 then
									return "invalid range in character class";
								end;
							end;
							if ret[1] > 0x5B then
								return "invalid range in character class";
							end;
						elseif ret_c == 0x5C then
							i += 1;
							if codes[i] == 0x78 then
								ret_c, i = read_hex_pair(i + 1);
							elseif is_octal(codes[i]) then
								ret_c, i = read_octal(i);
							else
								ret_c = escape_chars[codes[i]] or codes[i];
								if type(ret_c) ~= "number" then
									return "invalid range in character class";
								end;
							end;
						elseif ret[1] > ret_c then
							return "invalid range in character class";
						end;
						ret[1] = { "range", ret[1], ret_c };
					end;
				elseif codes[i] == 0x5B then
					if codes[i + 1] == 0x2E or codes[i + 1] == 0x3A or codes[i + 1] == 0x3D then
						local i1 = i + 1;
						repeat
							i1 = table.find(codes, 0x5D, i1 + 1);
						until not i1 or codes[i1 - 1] ~= 0x5C;
						if not i1 then
							return "unterminated character class";
						elseif codes[i1 - 1] ~= codes[i + 1] or i1 - 1 == i + 1 then
							table.insert(ret, 1, 0x5B);
						elseif codes[i1 - 1] == 0x2E or codes[i1 - 1] == 0x3D then
							return "POSIX collating elements aren't supported";
						elseif codes[i1 - 1] == 0x3A then
							-- "[:name:]" or "[:^name:]"; the '^' sits right after "[:"
							local class_negate = codes[i + 2] == 0x5E;
							local class_name = utf8_sub(codes.s, i + (class_negate and 3 or 2), i1 - 1);
							if not posix_class_names[class_name] then
								return "unknown POSIX class name";
							end;
							table.insert(ret, 1, { "class", class_name, class_negate });
							i = i1;
						end;
					else
						table.insert(ret, 1, 0x5B);
					end;
				elseif codes[i] == 0x5C then
					i += 1;
					if not codes[i] then
						return "unterminated character class";
					elseif codes[i] == 0x78 then
						i += 1;
						if codes[i] == 0x7B then
							-- \x{h...}
							local close_i = scan_braced(i + 1, is_hex);
							if not close_i then
								return "malformed hexadecimal character";
							elseif close_i - (i + 1) > 4 then
								return "character offset too large";
							end;
							table.insert(ret, 1, tonumber(utf8_sub(codes.s, i + 1, close_i), 16));
							i = close_i;
						else
							-- \xhh
							local value;
							value, i = read_hex_pair(i);
							table.insert(ret, 1, value);
						end;
					elseif is_octal(codes[i]) then
						local value;
						value, i = read_octal(i);
						table.insert(ret, 1, value);
					elseif codes[i] == 0x45 then
						-- stray \E is ignored
					elseif codes[i] == 0x51 then
						-- \Q...\E : the quoted codes become literal members of this class
						local start_i = i + 1;
						repeat
							i = table.find(codes, 0x5C, i + 1);
						until not i or codes[i + 1] == 0x45;
						if not i then
							return "unterminated character class";
						end;
						for quoted_i = start_i, i - 1 do
							table.insert(ret, 1, codes[quoted_i]);
						end;
						i += 1;
					elseif codes[i] == 0x4E then
						if codes[i + 1] == 0x7B and codes[i + 2] == 0x55 and codes[i + 3] == 0x2B and flags.unicode then
							-- \N{U+hhhh} : the digits are hexadecimal
							local close_i = scan_braced(i + 4, is_hex);
							if not close_i then
								return "malformed Unicode code point";
							end;
							table.insert(ret, 1, tonumber(utf8_sub(codes.s, i + 4, close_i), 16));
							i = close_i;
						else
							return "invalid escape sequence";
						end;
					elseif codes[i] == 0x50 or codes[i] == 0x70 then
						local token, err, last_i = read_property(i);
						if not token then
							return err;
						end;
						table.insert(ret, 1, token);
						i = last_i;
					elseif codes[i] == 0x6F then
						-- \o{ddd}
						i += 1;
						if codes[i] ~= 0x7B then
							return "malformed octal code";
						end;
						local close_i = scan_braced(i + 1, is_octal);
						if not close_i then
							return "malformed octal code";
						end;
						local ret_chr = tonumber(utf8_sub(codes.s, i + 1, close_i), 8);
						if ret_chr > 0xFFFF then
							return "character offset too large";
						end;
						table.insert(ret, 1, ret_chr);
						i = close_i;
					else
						local esc_char = escape_chars[codes[i]];
						table.insert(ret, 1, type(esc_char) == "string" and { "class", esc_char, false } or esc_char or codes[i]);
					end;
				-- NOTE(review): no visible code sets flags.ignoreCase (the flag table
				-- uses `caseless`), so this branch looks unreachable -- confirm intent.
				elseif flags.ignoreCase and codes[i] >= 0x61 and codes[i] <= 0x7A then
					table.insert(ret, 1, codes[i] - 0x20);
				else
					table.insert(ret, 1, codes[i]);
				end;
				i += 1;
			end;
			if codes[i - 1] == char_class and i - 1 ~= start_i then
				return char_class == 0x3A and "POSIX named classes are only support within a character set" or "POSIX collating elements aren't supported";
			end;
			if not ret[2] and not negate then
				-- a single, non-negated member is emitted as a plain token
				table.insert(outln, ret[1]);
			else
				table.insert(outln, { "charset", negate, ret });
			end;
		elseif c == 0x5C then
			-- '\' : escape sequence outside a character class
			i += 1;
			local escape_c = codes[i];
			if not escape_c then
				return "pattern may not end with a trailing backslash";
			elseif escape_c >= 0x30 and escape_c <= 0x39 then
				local org_i = i;
				while codes[i + 1] and codes[i + 1] >= 0x30 and codes[i + 1] <= 0x39 do
					i += 1;
				end;
				local escape_d = tonumber(utf8_sub(codes.s, org_i, i + 1));
				if escape_d > group_n and i ~= org_i then
					-- A multi-digit number larger than the groups seen so far is
					-- read as an octal escape (or as a literal digit for 8 and 9).
					i = org_i;
					if codes[i] <= 0x37 then
						local value;
						value, i = read_octal(i);
						table.insert(outln, value);
					else
						table.insert(outln, codes[org_i]);
					end;
				else
					table.insert(outln, { "backref", escape_d });
				end;
			elseif escape_c == 0x45 then
				-- stray \E is ignored
			elseif escape_c == 0x51 then
				-- \Q...\E : copy the quoted codes through as literals
				local start_i = i + 1;
				repeat
					i = table.find(codes, 0x5C, i + 1);
				until not i or codes[i + 1] == 0x45;
				table.move(codes, start_i, i and i - 1 or #codes, #outln + 1, outln);
				if not i then
					break;
				end;
				i += 1;
			elseif escape_c == 0x4E then
				if codes[i + 1] == 0x7B and codes[i + 2] == 0x55 and codes[i + 3] == 0x2B and flags.unicode then
					-- \N{U+hhhh} : the digits are hexadecimal
					local close_i = scan_braced(i + 4, is_hex);
					if not close_i then
						return "malformed Unicode code point";
					end;
					table.insert(outln, tonumber(utf8_sub(codes.s, i + 4, close_i), 16));
					i = close_i;
				else
					table.insert(outln, escape_chars[0x4E]);
				end;
			elseif escape_c == 0x50 or escape_c == 0x70 then
				local token, err, last_i = read_property(i);
				if not token then
					return err;
				end;
				table.insert(outln, token);
				i = last_i;
			elseif escape_c == 0x67 and (codes[i + 1] == 0x7B or is_digit(codes[i + 1])) then
				-- \gN or \g{N} : numbered back-reference
				local is_grouped = false;
				i += 1;
				if codes[i] == 0x7B then
					i += 1;
					is_grouped = true;
				end;
				local org_i = i;
				while is_digit(codes[i]) do
					i += 1;
				end;
				if is_grouped and (codes[i] ~= 0x7D or i == org_i) then
					return "malformed reference code";
				end;
				-- `i` is one past the last digit, so the digits are org_i .. i - 1.
				local ref_name = tonumber(utf8_sub(codes.s, org_i, i));
				table.insert(outln, { "backref", ref_name });
				if not is_grouped then
					i -= 1;
				end;
			elseif escape_c == 0x6F then
				-- \o{ddd}
				i += 1;
				if codes[i] ~= 0x7B then
					return "malformed octal code";
				end;
				local close_i = scan_braced(i + 1, is_octal);
				if not close_i then
					return "malformed octal code";
				end;
				local ret_chr = tonumber(utf8_sub(codes.s, i + 1, close_i), 8);
				if ret_chr > 0xFFFF then
					return "character offset too large";
				end;
				table.insert(outln, ret_chr);
				i = close_i;
			elseif escape_c == 0x78 then
				i += 1;
				if codes[i] == 0x7B then
					-- \x{h...}
					local close_i = scan_braced(i + 1, is_hex);
					if not close_i then
						return "malformed hexadecimal code";
					elseif close_i - (i + 1) > 4 then
						return "character offset too large";
					end;
					table.insert(outln, tonumber(utf8_sub(codes.s, i + 1, close_i), 16));
					i = close_i;
				else
					-- \xhh
					local value;
					value, i = read_hex_pair(i);
					table.insert(outln, value);
				end;
			else
				local esc_char = b_escape_chars[escape_c] or escape_chars[escape_c];
				table.insert(outln, esc_char or escape_c);
			end;
		elseif c == 0x2A or c == 0x2B or c == 0x3F or c == 0x7B then
			-- quantifier: '*', '+', '?' or '{m,n}'
			local start_q, end_q;
			if c == 0x7B then
				local org_i = i + 1;
				local start_i;
				while codes[i + 1] and (codes[i + 1] >= 0x30 and codes[i + 1] <= 0x39 or codes[i + 1] == 0x2C and not start_i and i + 1 ~= org_i) do
					i += 1;
					if codes[i] == 0x2C then
						start_i = i;
					end;
				end;
				if codes[i + 1] == 0x7D then
					i += 1;
					if not start_i then
						start_q = tonumber(utf8_sub(codes.s, org_i, i));
						end_q = start_q;
					else
						start_q, end_q = tonumber(utf8_sub(codes.s, org_i, start_i)), start_i + 1 == i and math.huge or tonumber(utf8_sub(codes.s, start_i + 1, i));
						if end_q < start_q then
							return "numbers out of order in {} quantifier";
						end;
					end;
				else
					-- not a well-formed {m,n}: keep the codes read so far as literals
					table.move(codes, org_i - 1, i, #outln + 1, outln);
				end;
			else
				start_q, end_q = c == 0x2B and 1 or 0, c == 0x3F and 1 or math.huge;
			end;
			if start_q then
				local quantifier_type = flags.ungreedy and "lazy" or "greedy";
				if codes[i + 1] == 0x2B or codes[i + 1] == 0x3F then
					i += 1;
					quantifier_type = codes[i] == 0x2B and "possessive" or flags.ungreedy and "greedy" or "lazy";
				end;
				local outln_len = #outln;
				local last_outln_value = outln[outln_len];
				if not last_outln_value or type(last_outln_value) == "table" and (last_outln_value[1] == "quantifier" or last_outln_value[1] == 0x28 or b_escape_chars[last_outln_value[1]])
					or last_outln_value == alternation or type(last_outln_value) == "string" then
					return "quantifier doesn't follow a repeatable pattern";
				end;
				if end_q == 0 then
					table.remove(outln);
				elseif start_q ~= 1 or end_q ~= 1 then
					if type(last_outln_value) == "table" and last_outln_value[1] == 0x29 then
						-- quantifying a group: wrap the opener, found through the closer
						outln_len = last_outln_value[3];
					end;
					outln[outln_len] = { "quantifier", start_q, end_q, quantifier_type, outln[outln_len] };
				end;
			end;
		elseif c == 0x7C then
			-- '|' : alternation; inside "(?|" the group counter restarts per branch
			table.insert(outln, alternation);
			local i1 = #outln;
			repeat
				i1 -= 1;
				local v1, is_table = outln[i1], type(outln[i1]) == "table";
				if is_table and v1[1] == 0x29 then
					i1 = outln[i1][3];
				elseif is_table and v1[1] == 0x28 then
					if v1[4] == 0x7C then
						group_n = v1[5];
					end;
					break;
				end;
			until not v1;
		elseif c == 0x24 or c == 0x5E then
			table.insert(outln, c == 0x5E and beginning_str or end_str);
		-- NOTE(review): see the note on flags.ignoreCase in the character class branch.
		elseif flags.ignoreCase and c >= 0x61 and c <= 0x7A then
			table.insert(outln, c - 0x20);
		elseif flags.extended and (c >= 0x09 and c <= 0x0D or c == 0x20 or c == 0x23) then
			-- extended mode: whitespace is dropped and '#' starts a comment
			if c == 0x23 then
				repeat
					i += 1;
				until not codes[i] or codes[i] == 0x0A or codes[i] == 0x0D;
			end;
		else
			table.insert(outln, c);
		end;
		i += 1;
	end;

	-- Final pass: check that every group was closed and resolve references.
	local max_group_n = 0;
	for i, v in ipairs(outln) do
		if type(v) == "table" and (v[1] == 0x28 or v[1] == "quantifier" and type(v[5]) == "table" and v[5][1] == 0x28) then
			if v[1] == "quantifier" then
				v = v[5];
			end;
			if not v[3] then
				return "unterminated parenthetical";
			elseif v[2] then
				max_group_n = math.max(max_group_n, v[2]);
			end;
		elseif type(v) == "table" and (v[1] == "backref" or v[1] == "recurmatch") then
			if not group_id[v[2]] and (type(v[2]) ~= "number" or v[2] > group_n) then
				return "reference to a non-existent or invalid subpattern";
			elseif v[1] == "recurmatch" and v[2] ~= 0 then
				for i1, v1 in ipairs(outln) do
					if type(v1) == "table" and v1[1] == 0x28 and v1[2] == v[2] then
						v[3] = i1;
						break;
					end;
				end;
			elseif type(v[2]) == "string" then
				v[2] = group_id[v[2]];
			end;
		end;
	end;
	outln.group_n = max_group_n;
	return outln, group_id, verb_flags;
end;
-- Validate options.cacheSize and set up the two parallel cache arrays:
-- cache_pattern_names holds the keys, cache_pattern the tokenised results.
if not tonumber(options.cacheSize) then
	error(string.format("expected number for options.cacheSize, got %s", typeof(options.cacheSize)), 2);
end;
-- `cacheSize` is false when the size floors to 0 (caching disabled), otherwise the number.
local cacheSize = math.floor(options.cacheSize or 0) ~= 0 and tonumber(options.cacheSize);
local cache_pattern, cache_pattern_names;
if not cacheSize then
	-- caching disabled: both cache tables stay nil
elseif cacheSize < 0 or cacheSize ~= cacheSize then
	error("cache size cannot be a negative number or a NaN", 2);
elseif cacheSize == math.huge then
	-- unbounded cache: nothing to pre-size
	cache_pattern, cache_pattern_names = { nil }, { nil };
elseif cacheSize >= 2 ^ 32 then
	error("cache size too large", 2);
else
	cache_pattern, cache_pattern_names = table.create(options.cacheSize), table.create(options.cacheSize);
end;
if cacheSize then
	-- Empties the pattern cache. Only defined when caching is enabled.
	function re.pruge()
		table.clear(cache_pattern_names);
		table.clear(cache_pattern);
	end;
	-- "pruge" is a misspelling that is kept for existing callers;
	-- `purge` is the correctly spelled alias.
	re.purge = re.pruge;
end;
-- Builds a RegEx object for an already decoded pattern.
-- The tokenised form is taken from the cache when caching is enabled and the
-- pattern/flag pair is present; otherwise it is produced by tokenize_ptn, whose
-- error message (a string result) is raised as an error.
local function new_re(str_arr, flags, flag_repr, pattern_repr)
	local tokenized_ptn, group_id, verb_flags;
	local cache_key, cache_entry;
	if cacheSize then
		cache_key = string.format("%s|%s", str_arr.s, flag_repr);
		cache_entry = cache_pattern[table.find(cache_pattern_names, cache_key)];
	end;

	if cache_entry then
		tokenized_ptn, group_id, verb_flags = cache_entry[1], cache_entry[2], cache_entry[3];
	else
		tokenized_ptn, group_id, verb_flags = tokenize_ptn(str_arr, flags);
		if type(tokenized_ptn) == "string" then
			error(tokenized_ptn, 2);
		end;
		-- Only token lists with at least one token are cached.
		if cacheSize and tokenized_ptn[1] then
			-- Newest entry goes to the front of both arrays ...
			table.insert(cache_pattern_names, 1, cache_key);
			table.insert(cache_pattern, 1, { tokenized_ptn, group_id, verb_flags });
			-- ... and, for a bounded cache, the entry pushed past the limit is dropped.
			if cacheSize ~= math.huge then
				table.remove(cache_pattern_names, cacheSize + 1);
				table.remove(cache_pattern, cacheSize + 1);
			end;
		end;
	end;

	local object = newproxy(true);
	proxy[object] = {
		name = "RegEx",
		flags = flags,
		flag_repr = flag_repr,
		pattern_repr = pattern_repr,
		token = tokenized_ptn,
		group_id = group_id,
		verb_flags = verb_flags,
	};
	local object_mt = getmetatable(object);
	-- Flag names are looked up first; anything else goes through re_m.
	object_mt.__index = setmetatable(flags, re_m);
	object_mt.__tostring = re_tostr;
	object_mt.__metatable = lockmsg;
	return object;
end;
-- gsub callback for the pattern "(\\*)/": `pre` is the run of backslashes found
-- directly before a forward slash. Returns that run followed by the slash, with
-- one extra backslash when the run has even length (the slash was unescaped),
-- so the slash is always escaped in a /pattern/ representation.
local function escape_fslash(pre)
	return (#pre % 2 == 0 and '\\' or '') .. pre .. '/';
end;
-- Comparator for flag letters: case-insensitive alphabetical order.
-- Letters that differ only by case (such as 'u' and 'U') are ordered by their
-- raw value, so the sorted flag string is the same for every input order.
local function sort_flag_chr(a, b)
	local lower_a, lower_b = a:lower(), b:lower();
	if lower_a ~= lower_b then
		return lower_a < lower_b;
	end;
	return a < b;
end;
-- re.new(pattern [, flags]): creates a RegEx object.
-- `pattern` is a string (a number is converted to one). `flags` is an optional
-- string of flag letters taken from flag_map; each letter may appear once.
-- Raises on a missing or mistyped argument and on an unknown or repeated flag.
function re.new(...)
	if select('#', ...) == 0 then
		error("missing argument #1 (string expected)", 2);
	end;
	local ptn, flags_str = ...;
	if type(ptn) == "number" then
		ptn ..= '';
	elseif type(ptn) ~= "string" then
		error(string.format("invalid argument #1 (string expected, got %s)", typeof(ptn)), 2);
	end;
	if type(flags_str) ~= "string" and type(flags_str) ~= "number" and flags_str ~= nil then
		error(string.format("invalid argument #2 (string expected, got %s)", typeof(flags_str)), 2);
	end;
	local flags = {
		anchored = false, caseless = false, multiline = false, dotall = false, unicode = false, ungreedy = false, extended = false,
	};
	local flag_repr = { };
	for f in string.gmatch(flags_str or '', utf8.charpattern) do
		-- An unknown letter maps to nil and a repeated one is already true;
		-- only a known, not yet used flag is still `false` here.
		if flags[flag_map[f]] ~= false then
			-- Level 2, like every other error raised directly by this function.
			error("invalid regular expression flag " .. f, 2);
		end;
		flags[flag_map[f]] = true;
		table.insert(flag_repr, f);
	end;
	table.sort(flag_repr, sort_flag_chr);
	flag_repr = table.concat(flag_repr);
	-- Representation: the pattern between slashes, with each '/' passed through escape_fslash.
	local escaped_ptn = ptn:gsub("(\\*)/", escape_fslash);
	return new_re(to_str_arr(ptn), flags, flag_repr, string.format("/%s/", escaped_ptn));
end;
-- re.fromstring(str): creates a RegEx object from a delimited literal such as
-- "/pattern/flags". The first character is the delimiter; it must not be
-- alphanumeric or a backslash. The pattern ends at the next delimiter that is
-- not escaped, and everything after it is read as flag letters.
-- Raises on a missing or mistyped argument, an empty string, a bad or missing
-- delimiter, and on an unknown or repeated flag.
function re.fromstring(...)
	if select('#', ...) == 0 then
		error("missing argument #1 (string expected)", 2);
	end;
	local ptn = ...;
	if type(ptn) == "number" then
		ptn ..= '';
	elseif type(ptn) ~= "string" then
		-- The level belongs to error(), not to string.format().
		error(string.format("invalid argument #1 (string expected, got %s)", typeof(ptn)), 2);
	end;
	local str_arr = to_str_arr(ptn);
	local delimiter = str_arr[1];
	if not delimiter then
		error("empty regex", 2);
	elseif delimiter == 0x5C or (delimiter >= 0x30 and delimiter <= 0x39) or (delimiter >= 0x41 and delimiter <= 0x5A) or (delimiter >= 0x61 and delimiter <= 0x7A) then
		error("delimiter must not be alphanumeric or a backslash", 2);
	end;
	-- Find the closing delimiter: the first one preceded by an even number of backslashes.
	local i0 = 1;
	repeat
		i0 = table.find(str_arr, delimiter, i0 + 1);
		if not i0 then
			error(string.format("no ending delimiter ('%s') found", utf8.char(delimiter)), 2);
		end;
		local escape_count = 1;
		while str_arr[i0 - escape_count] == 0x5C do
			escape_count += 1;
		end;
	until escape_count % 2 == 1;
	local flags = {
		anchored = false, caseless = false, multiline = false, dotall = false, unicode = false, ungreedy = false, extended = false,
	};
	local flag_repr = { };
	-- Pop the flag letters off the end until only "<delim>pattern<delim>" is left.
	while str_arr.n > i0 do
		local f = utf8.char(table.remove(str_arr));
		str_arr.n -= 1;
		if flags[flag_map[f]] ~= false then
			-- Level 2, like every other error raised directly by this function.
			error("invalid regular expression flag " .. f, 2);
		end;
		flags[flag_map[f]] = true;
		table.insert(flag_repr, f);
	end;
	table.sort(flag_repr, sort_flag_chr);
	flag_repr = table.concat(flag_repr);
	-- Strip both delimiters from the code point array.
	table.remove(str_arr, 1);
	table.remove(str_arr);
	str_arr.n -= 2;
	-- str_arr.n counts characters, not bytes, so slice by character position:
	-- the pattern is characters 2 .. n + 1 of the original string and the
	-- representation (delimiters included) is characters 1 .. n + 2.
	str_arr.s = utf8_sub(str_arr.s, 2, str_arr.n + 2);
	return new_re(str_arr, flags, flag_repr, utf8_sub(ptn, 1, str_arr.n + 3));
end;
-- Control characters that are written as an escape sequence instead of being
-- prefixed with a backslash.
local re_escape_line_chrs = {
	['\0'] = '\\x00', ['\n'] = '\\n', ['\t'] = '\\t', ['\r'] = '\\r', ['\f'] = '\\f',
};
-- re.escape(str [, extended [, delimiter]]): returns `str` with pattern
-- metacharacters escaped so that it can be used as a literal.
--   extended  - when truthy, whitespace is escaped too (for the `x` flag)
--   delimiter - optional single character that is escaped as well; it must not
--               be alphanumeric or a backslash
-- Raises on a missing or mistyped argument and on an invalid delimiter.
function re.escape(...)
	if select('#', ...) == 0 then
		error("missing argument #1 (string expected)", 2);
	end;
	local str, extended, delimiter = ...;
	if type(str) == "number" then
		str ..= '';
	elseif type(str) ~= "string" then
		error(string.format("invalid argument #1 to 'escape' (string expected, got %s)", typeof(str)), 2);
	end;
	if delimiter == nil then
		delimiter = '';
	elseif type(delimiter) == "number" then
		delimiter ..= '';
	elseif type(delimiter) ~= "string" then
		error(string.format("invalid argument #3 to 'escape' (string expected, got %s)", typeof(delimiter)), 2);
	end;
	-- utf8.len returns nil for invalid UTF-8, which is rejected as well.
	local delimiter_len = utf8.len(delimiter);
	if not delimiter_len or delimiter_len > 1 or delimiter:match("^[%w\\]$") then
		error("delimiter must be a single character and must not be alphanumeric or a backslash", 2);
	end;
	-- '%' and ']' need a '%' prefix to be literal inside a Lua pattern set.
	local delimiter_class = (delimiter:find('^[%%%]]$') and '%' or '') .. delimiter;
	local special = "[\0\f\n\r\t\\" .. (extended and "%s" or "") .. "#$()%%*+.?[%]^{|" .. delimiter_class .. "]";
	-- Single pass: a second pass would escape the backslashes that the first
	-- pass had just produced for the control characters.
	return (string.gsub(str, special, function(chr)
		return re_escape_line_chrs[chr] or ('\\' .. chr);
	end));
end;
-- re.type(value): returns the `name` stored in `proxy` for `value`, or a falsy
-- result when `value` has no entry there. Raises when called without arguments.
function re.type(...)
	if select('#', ...) == 0 then
		error("missing argument #1", 2);
	end;
	local entry = proxy[(...)];
	return entry and entry.name;
end;
-- Expose every RegEx method on the module table as well.
-- (A plain pairs loop replaces the deprecated table.foreach.)
for k, f in pairs(re_m) do
	re[k] = f;
end;
-- From here on re_m is the metatable form used for flag tables in new_re.
re_m = { __index = re_m };
-- The value reported for locked metatables is itself a RegEx object.
lockmsg = re.fromstring([[/The\s*metatable\s*is\s*(?:locked|inaccessible)(?#Nice try :])/i]]);
-- lockmsg was still nil while that object was being built, so lock it now.
getmetatable(lockmsg).__metatable = lockmsg;
-- __newindex handler shared by the read-only tables below.
local function readonly_table()
	error("Attempt to modify a readonly table", 2);
end;
match_m = {
	__index = match_m,
	__metatable = lockmsg,
	__newindex = readonly_table,
};
re.Match = setmetatable({ }, match_m);
-- The module value is an empty read-only proxy that forwards lookups to `re`.
return setmetatable({ }, {
	__index = re,
	__metatable = lockmsg,
	__newindex = readonly_table,
});