Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Roblox
GitHub Repository: Roblox/luau
Path: blob/master/VM/src/lutf8lib.cpp
2725 views
1
// This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
2
// This code is based on Lua 5.x implementation licensed under MIT License; see lua_LICENSE.txt for details
3
#include "lualib.h"
4
5
#include "lcommon.h"
6
7
#define MAXUNICODE 0x10FFFF
8
9
#define iscont(p) ((*(p) & 0xC0) == 0x80)
10
11
// Borrowed from the string library.
// Normalizes a relative string position against a string of length 'len':
//   - a non-negative 'pos' is returned unchanged;
//   - a negative 'pos' counts back from the end (-1 maps to 'len');
//   - a negative 'pos' that reaches before the start of the string maps to 0.
static int u_posrelat(int pos, size_t len)
{
    if (pos >= 0)
        return pos;

    // distance back from the end, computed in unsigned arithmetic so that
    // negating the most negative int cannot overflow
    size_t back = 0u - (size_t)pos;
    return (back > len) ? 0 : (int)len + pos + 1;
}
22
23
/*
24
** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid.
25
*/
26
static const char* utf8_decode(const char* o, int* val)
27
{
28
static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFF};
29
const unsigned char* s = (const unsigned char*)o;
30
unsigned int c = s[0];
31
unsigned int res = 0; // final result
32
if (c < 0x80) // ascii?
33
res = c;
34
else
35
{
36
int count = 0; // to count number of continuation bytes
37
while (c & 0x40)
38
{ // still have continuation bytes?
39
int cc = s[++count]; // read next byte
40
if ((cc & 0xC0) != 0x80) // not a continuation byte?
41
return NULL; // invalid byte sequence
42
res = (res << 6) | (cc & 0x3F); // add lower 6 bits from cont. byte
43
c <<= 1; // to test next bit
44
}
45
res |= ((c & 0x7F) << (count * 5)); // add first byte
46
if (count > 3 || res > MAXUNICODE || res <= limits[count])
47
return NULL; // invalid byte sequence
48
if (unsigned(res - 0xD800) < 0x800)
49
return NULL; // surrogate
50
s += count; // skip continuation bytes read
51
}
52
if (val)
53
*val = res;
54
return (const char*)s + 1; // +1 to include first byte
55
}
56
57
/*
58
** utf8len(s [, i [, j]]) --> number of characters that start in the
59
** range [i,j], or nil + current position if 's' is not well formed in
60
** that interval
61
*/
62
static int utflen(lua_State* L)
63
{
64
int n = 0;
65
size_t len;
66
const char* s = luaL_checklstring(L, 1, &len);
67
int posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
68
int posj = u_posrelat(luaL_optinteger(L, 3, -1), len);
69
luaL_argcheck(L, 1 <= posi && --posi <= (int)len, 2, "initial position out of string");
70
luaL_argcheck(L, --posj < (int)len, 3, "final position out of string");
71
while (posi <= posj)
72
{
73
const char* s1 = utf8_decode(s + posi, NULL);
74
if (s1 == NULL)
75
{ // conversion error?
76
lua_pushnil(L); // return nil ...
77
lua_pushinteger(L, posi + 1); // ... and current position
78
return 2;
79
}
80
posi = (int)(s1 - s);
81
n++;
82
}
83
lua_pushinteger(L, n);
84
return 1;
85
}
86
87
/*
88
** codepoint(s, [i, [j]]) -> returns codepoints for all characters
89
** that start in the range [i,j]
90
*/
91
static int codepoint(lua_State* L)
92
{
93
size_t len;
94
const char* s = luaL_checklstring(L, 1, &len);
95
int posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
96
int pose = u_posrelat(luaL_optinteger(L, 3, posi), len);
97
int n;
98
const char* se;
99
luaL_argcheck(L, posi >= 1, 2, "out of range");
100
luaL_argcheck(L, pose <= (int)len, 3, "out of range");
101
if (posi > pose)
102
return 0; // empty interval; return no values
103
if (pose - posi >= INT_MAX) // (int -> int) overflow?
104
luaL_error(L, "string slice too long");
105
n = (int)(pose - posi) + 1;
106
luaL_checkstack(L, n, "string slice too long");
107
n = 0;
108
se = s + pose;
109
for (s += posi - 1; s < se;)
110
{
111
int code;
112
s = utf8_decode(s, &code);
113
if (s == NULL)
114
luaL_error(L, "invalid UTF-8 code");
115
lua_pushinteger(L, code);
116
n++;
117
}
118
return n;
119
}
120
121
// from Lua 5.3 lobject.h
122
#define UTF8BUFFSZ 8
123
124
// from Lua 5.3 lobject.c, copied verbatim + static
125
static int luaO_utf8esc(char* buff, unsigned long x)
126
{
127
int n = 1; // number of bytes put in buffer (backwards)
128
LUAU_ASSERT(x <= 0x10FFFF);
129
if (x < 0x80) // ascii?
130
buff[UTF8BUFFSZ - 1] = cast_to(char, x);
131
else
132
{ // need continuation bytes
133
unsigned int mfb = 0x3f; // maximum that fits in first byte
134
do
135
{ // add continuation bytes
136
buff[UTF8BUFFSZ - (n++)] = cast_to(char, 0x80 | (x & 0x3f));
137
x >>= 6; // remove added bits
138
mfb >>= 1; // now there is one less bit available in first byte
139
} while (x > mfb); // still needs continuation byte?
140
buff[UTF8BUFFSZ - n] = cast_to(char, (~mfb << 1) | x); // add first byte
141
}
142
return n;
143
}
144
145
// lighter replacement for pushutfchar; doesn't push any string onto the stack
146
static int buffutfchar(lua_State* L, int arg, char* buff, const char** charstr)
147
{
148
int code = luaL_checkinteger(L, arg);
149
luaL_argcheck(L, 0 <= code && code <= MAXUNICODE, arg, "value out of range");
150
int l = luaO_utf8esc(buff, cast_to(long, code));
151
*charstr = buff + UTF8BUFFSZ - l;
152
return l;
153
}
154
155
/*
156
** utfchar(n1, n2, ...) -> char(n1)..char(n2)...
157
**
158
** This version avoids the need to make more invasive upgrades elsewhere (like
159
** implementing the %U escape in lua_pushfstring) and avoids pushing string
160
** objects for each codepoint in the multi-argument case. -Jovanni
161
*/
162
static int utfchar(lua_State* L)
163
{
164
char buff[UTF8BUFFSZ];
165
const char* charstr;
166
167
int n = lua_gettop(L); // number of arguments
168
if (n == 1)
169
{ // optimize common case of single char
170
int l = buffutfchar(L, 1, buff, &charstr);
171
lua_pushlstring(L, charstr, l);
172
}
173
else
174
{
175
luaL_Strbuf b;
176
luaL_buffinit(L, &b);
177
for (int i = 1; i <= n; i++)
178
{
179
int l = buffutfchar(L, i, buff, &charstr);
180
luaL_addlstring(&b, charstr, l);
181
}
182
luaL_pushresult(&b);
183
}
184
return 1;
185
}
186
187
/*
188
** offset(s, n, [i]) -> index where n-th character counting from
189
** position 'i' starts; 0 means character at 'i'.
190
*/
191
static int byteoffset(lua_State* L)
192
{
193
size_t len;
194
const char* s = luaL_checklstring(L, 1, &len);
195
int n = luaL_checkinteger(L, 2);
196
int posi = (n >= 0) ? 1 : (int)len + 1;
197
posi = u_posrelat(luaL_optinteger(L, 3, posi), len);
198
luaL_argcheck(L, 1 <= posi && --posi <= (int)len, 3, "position out of range");
199
if (n == 0)
200
{
201
// find beginning of current byte sequence
202
while (posi > 0 && iscont(s + posi))
203
posi--;
204
}
205
else
206
{
207
if (iscont(s + posi))
208
luaL_error(L, "initial position is a continuation byte");
209
if (n < 0)
210
{
211
while (n < 0 && posi > 0)
212
{ // move back
213
do
214
{ // find beginning of previous character
215
posi--;
216
} while (posi > 0 && iscont(s + posi));
217
n++;
218
}
219
}
220
else
221
{
222
n--; // do not move for 1st character
223
while (n > 0 && posi < (int)len)
224
{
225
do
226
{ // find beginning of next character
227
posi++;
228
} while (iscont(s + posi)); // (cannot pass final '\0')
229
n--;
230
}
231
}
232
}
233
if (n == 0) // did it find given character?
234
lua_pushinteger(L, posi + 1);
235
else // no such character
236
lua_pushnil(L);
237
return 1;
238
}
239
240
// Iteration step used by iter_codes.
// Argument 1 is the string; argument 2 is the 1-based position returned by the
// previous step (iter_codes seeds it with 0). Returns the position and code
// point of the next character, or no values once the end of the string is
// reached. Raises an error if the next sequence is not valid UTF-8 or is
// immediately followed by a stray continuation byte.
static int iter_aux(lua_State* L)
{
    size_t len;
    const char* s = luaL_checklstring(L, 1, &len);
    int at = lua_tointeger(L, 2) - 1; // 0-based offset of the previous character

    if (at < 0)
    {
        // first iteration: start from the beginning
        at = 0;
    }
    else if (at < (int)len)
    {
        // skip current byte and its continuations
        at++;
        while (iscont(s + at))
            at++;
    }

    if (at >= (int)len)
        return 0; // no more codepoints

    int code;
    const char* after = utf8_decode(s + at, &code);
    if (after == NULL || iscont(after))
        luaL_error(L, "invalid UTF-8 code");

    lua_pushinteger(L, at + 1);
    lua_pushinteger(L, code);
    return 2;
}
266
267
// Sets up iteration over the characters of a string: checks that argument 1
// is a string, then returns three values - the step function iter_aux, the
// string itself, and an initial position of 0.
static int iter_codes(lua_State* L)
{
    const int strarg = 1; // stack index of the string being iterated

    luaL_checkstring(L, strarg);
    lua_pushcfunction(L, iter_aux, NULL); // step function
    lua_pushvalue(L, strarg);             // the string
    lua_pushinteger(L, 0);                // position before the first character
    return 3;
}
275
276
// pattern to match a single UTF-8 character
277
#define UTF8PATT "[\0-\x7F\xC2-\xF4][\x80-\xBF]*"
278
279
// Registration table passed to luaL_register by luaopen_utf8: each entry pairs
// a library function name with the C function implementing it; the
// {NULL, NULL} entry ends the list.
static const luaL_Reg funcs[] = {
    {"offset", byteoffset},   // byte index where the n-th character starts
    {"codepoint", codepoint}, // code points of the characters in a byte range
    {"char", utfchar},        // UTF-8 string built from code points
    {"len", utflen},          // number of characters in a byte range
    {"codes", iter_codes},    // iterator over (position, code point) pairs
    {NULL, NULL},
};
287
288
// Opens the utf8 library: registers the functions listed in 'funcs' under
// LUA_UTF8LIBNAME and stores the single-character pattern UTF8PATT in the
// "charpattern" field of the value left at the top of the stack by
// luaL_register. Returns one value.
int luaopen_utf8(lua_State* L)
{
    luaL_register(L, LUA_UTF8LIBNAME, funcs);

    // UTF8PATT contains an embedded '\0', so pass its length explicitly
    // (sizeof counts the terminating NUL, hence the - 1)
    const size_t pattlen = sizeof(UTF8PATT) - 1;
    lua_pushlstring(L, UTF8PATT, pattlen);
    lua_setfield(L, -2, "charpattern");

    return 1;
}
297
298