Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/bc/src/bc_lex.c
39507 views
1
/*
2
* *****************************************************************************
3
*
4
* SPDX-License-Identifier: BSD-2-Clause
5
*
6
* Copyright (c) 2018-2025 Gavin D. Howard and contributors.
7
*
8
* Redistribution and use in source and binary forms, with or without
9
* modification, are permitted provided that the following conditions are met:
10
*
11
* * Redistributions of source code must retain the above copyright notice, this
12
* list of conditions and the following disclaimer.
13
*
14
* * Redistributions in binary form must reproduce the above copyright notice,
15
* this list of conditions and the following disclaimer in the documentation
16
* and/or other materials provided with the distribution.
17
*
18
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28
* POSSIBILITY OF SUCH DAMAGE.
29
*
30
* *****************************************************************************
31
*
32
* The lexer for bc.
33
*
34
*/
35
36
#if BC_ENABLED
37
38
#include <assert.h>
39
#include <ctype.h>
40
#include <string.h>
41
42
#include <bc.h>
43
#include <vm.h>
44
45
/**
46
* Lexes an identifier, which may be a keyword.
47
* @param l The lexer.
48
*/
49
static void
50
bc_lex_identifier(BcLex* l)
51
{
52
// We already passed the first character, so we need to be sure to include
53
// it.
54
const char* buf = l->buf + l->i - 1;
55
size_t i;
56
57
// This loop is simply checking for keywords.
58
for (i = 0; i < bc_lex_kws_len; ++i)
59
{
60
const BcLexKeyword* kw = bc_lex_kws + i;
61
size_t n = BC_LEX_KW_LEN(kw);
62
63
if (!strncmp(buf, kw->name, n) && !isalnum(buf[n]) && buf[n] != '_')
64
{
65
// If the keyword has been redefined, and redefinition is allowed
66
// (it is not allowed for builtin libraries), break out of the loop
67
// and use it as a name. This depends on the argument parser to
68
// ensure that only non-POSIX keywords get redefined.
69
if (!vm->no_redefine && vm->redefined_kws[i]) break;
70
71
l->t = BC_LEX_KW_AUTO + (BcLexType) i;
72
73
// Warn or error, as appropriate for the mode, if the keyword is not
74
// in the POSIX standard.
75
if (!BC_LEX_KW_POSIX(kw)) bc_lex_verr(l, BC_ERR_POSIX_KW, kw->name);
76
77
// We minus 1 because the index has already been incremented.
78
l->i += n - 1;
79
80
// Already have the token; bail.
81
return;
82
}
83
}
84
85
// If not a keyword, parse the name.
86
bc_lex_name(l);
87
88
// POSIX doesn't allow identifiers that are more than one character, so we
89
// might have to warn or error here too.
90
if (BC_ERR(l->str.len - 1 > 1))
91
{
92
bc_lex_verr(l, BC_ERR_POSIX_NAME_LEN, l->str.v);
93
}
94
}
95
96
/**
97
* Parses a bc string. This is separate from dc strings because dc strings need
98
* to be balanced.
99
* @param l The lexer.
100
*/
101
static void
102
bc_lex_string(BcLex* l)
103
{
104
// We need to keep track of newlines to increment them properly.
105
size_t len, nlines, i;
106
const char* buf;
107
char c;
108
bool got_more;
109
110
l->t = BC_LEX_STR;
111
112
do
113
{
114
nlines = 0;
115
buf = l->buf;
116
got_more = false;
117
118
#if !BC_ENABLE_OSSFUZZ
119
assert(vm->mode != BC_MODE_STDIN || buf == vm->buffer.v);
120
#endif // !BC_ENABLE_OSSFUZZ
121
122
// Fortunately for us, bc doesn't escape quotes. Instead, the equivalent
123
// is '\q', which makes this loop simpler.
124
for (i = l->i; (c = buf[i]) && c != '"'; ++i)
125
{
126
nlines += (c == '\n');
127
}
128
129
if (BC_ERR(c == '\0') && !vm->eof && l->mode != BC_MODE_FILE)
130
{
131
got_more = bc_lex_readLine(l);
132
}
133
}
134
while (got_more && c != '"');
135
136
// If the string did not end properly, barf.
137
if (c != '"')
138
{
139
l->i = i;
140
bc_lex_err(l, BC_ERR_PARSE_STRING);
141
}
142
143
// Set the temp string to the parsed string.
144
len = i - l->i;
145
bc_vec_string(&l->str, len, l->buf + l->i);
146
147
l->i = i + 1;
148
l->line += nlines;
149
}
150
151
/**
152
* This function takes a lexed operator and checks to see if it's the assignment
153
* version, setting the token appropriately.
154
* @param l The lexer.
155
* @param with The token to assign if it is an assignment operator.
156
* @param without The token to assign if it is not an assignment operator.
157
*/
158
static void
159
bc_lex_assign(BcLex* l, BcLexType with, BcLexType without)
160
{
161
if (l->buf[l->i] == '=')
162
{
163
l->i += 1;
164
l->t = with;
165
}
166
else l->t = without;
167
}
168
169
void
170
bc_lex_token(BcLex* l)
171
{
172
// We increment here. This means that all lexing needs to take that into
173
// account, such as when parsing an identifier. If we don't, the first
174
// character of every identifier would be missing.
175
char c = l->buf[l->i++], c2;
176
177
BC_SIG_ASSERT_LOCKED;
178
179
// This is the workhorse of the lexer.
180
switch (c)
181
{
182
case '\0':
183
case '\n':
184
case '\t':
185
case '\v':
186
case '\f':
187
case '\r':
188
case ' ':
189
{
190
bc_lex_commonTokens(l, c);
191
break;
192
}
193
194
case '!':
195
{
196
// Even though it's not an assignment, we can use this.
197
bc_lex_assign(l, BC_LEX_OP_REL_NE, BC_LEX_OP_BOOL_NOT);
198
199
// POSIX doesn't allow boolean not.
200
if (l->t == BC_LEX_OP_BOOL_NOT)
201
{
202
bc_lex_verr(l, BC_ERR_POSIX_BOOL, "!");
203
}
204
205
break;
206
}
207
208
case '"':
209
{
210
bc_lex_string(l);
211
break;
212
}
213
214
case '#':
215
{
216
// POSIX does not allow line comments.
217
bc_lex_err(l, BC_ERR_POSIX_COMMENT);
218
bc_lex_lineComment(l);
219
break;
220
}
221
222
case '%':
223
{
224
bc_lex_assign(l, BC_LEX_OP_ASSIGN_MODULUS, BC_LEX_OP_MODULUS);
225
break;
226
}
227
228
case '&':
229
{
230
c2 = l->buf[l->i];
231
232
// Either we have boolean and or an error. And boolean and is not
233
// allowed by POSIX.
234
if (BC_NO_ERR(c2 == '&'))
235
{
236
bc_lex_verr(l, BC_ERR_POSIX_BOOL, "&&");
237
238
l->i += 1;
239
l->t = BC_LEX_OP_BOOL_AND;
240
}
241
else bc_lex_invalidChar(l, c);
242
243
break;
244
}
245
#if BC_ENABLE_EXTRA_MATH
246
case '$':
247
{
248
l->t = BC_LEX_OP_TRUNC;
249
break;
250
}
251
252
case '@':
253
{
254
bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLACES, BC_LEX_OP_PLACES);
255
break;
256
}
257
#endif // BC_ENABLE_EXTRA_MATH
258
case '(':
259
case ')':
260
{
261
l->t = (BcLexType) (c - '(' + BC_LEX_LPAREN);
262
break;
263
}
264
265
case '*':
266
{
267
bc_lex_assign(l, BC_LEX_OP_ASSIGN_MULTIPLY, BC_LEX_OP_MULTIPLY);
268
break;
269
}
270
271
case '+':
272
{
273
c2 = l->buf[l->i];
274
275
// Have to check for increment first.
276
if (c2 == '+')
277
{
278
l->i += 1;
279
l->t = BC_LEX_OP_INC;
280
}
281
else bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLUS, BC_LEX_OP_PLUS);
282
break;
283
}
284
285
case ',':
286
{
287
l->t = BC_LEX_COMMA;
288
break;
289
}
290
291
case '-':
292
{
293
c2 = l->buf[l->i];
294
295
// Have to check for decrement first.
296
if (c2 == '-')
297
{
298
l->i += 1;
299
l->t = BC_LEX_OP_DEC;
300
}
301
else bc_lex_assign(l, BC_LEX_OP_ASSIGN_MINUS, BC_LEX_OP_MINUS);
302
break;
303
}
304
305
case '.':
306
{
307
c2 = l->buf[l->i];
308
309
// If it's alone, it's an alias for last.
310
if (BC_LEX_NUM_CHAR(c2, true, false)) bc_lex_number(l, c);
311
else
312
{
313
l->t = BC_LEX_KW_LAST;
314
bc_lex_err(l, BC_ERR_POSIX_DOT);
315
}
316
317
break;
318
}
319
320
case '/':
321
{
322
c2 = l->buf[l->i];
323
if (c2 == '*') bc_lex_comment(l);
324
else bc_lex_assign(l, BC_LEX_OP_ASSIGN_DIVIDE, BC_LEX_OP_DIVIDE);
325
break;
326
}
327
328
case '0':
329
case '1':
330
case '2':
331
case '3':
332
case '4':
333
case '5':
334
case '6':
335
case '7':
336
case '8':
337
case '9':
338
case 'A':
339
case 'B':
340
case 'C':
341
case 'D':
342
case 'E':
343
case 'F':
344
// Apparently, GNU bc (and maybe others) allows any uppercase letter as
345
// a number. When single digits, they act like the ones above. When
346
// multi-digit, any letter above the input base is automatically set to
347
// the biggest allowable digit in the input base.
348
case 'G':
349
case 'H':
350
case 'I':
351
case 'J':
352
case 'K':
353
case 'L':
354
case 'M':
355
case 'N':
356
case 'O':
357
case 'P':
358
case 'Q':
359
case 'R':
360
case 'S':
361
case 'T':
362
case 'U':
363
case 'V':
364
case 'W':
365
case 'X':
366
case 'Y':
367
case 'Z':
368
{
369
bc_lex_number(l, c);
370
break;
371
}
372
373
case ';':
374
{
375
l->t = BC_LEX_SCOLON;
376
break;
377
}
378
379
case '<':
380
{
381
#if BC_ENABLE_EXTRA_MATH
382
c2 = l->buf[l->i];
383
384
// Check for shift.
385
if (c2 == '<')
386
{
387
l->i += 1;
388
bc_lex_assign(l, BC_LEX_OP_ASSIGN_LSHIFT, BC_LEX_OP_LSHIFT);
389
break;
390
}
391
#endif // BC_ENABLE_EXTRA_MATH
392
bc_lex_assign(l, BC_LEX_OP_REL_LE, BC_LEX_OP_REL_LT);
393
break;
394
}
395
396
case '=':
397
{
398
bc_lex_assign(l, BC_LEX_OP_REL_EQ, BC_LEX_OP_ASSIGN);
399
break;
400
}
401
402
case '>':
403
{
404
#if BC_ENABLE_EXTRA_MATH
405
c2 = l->buf[l->i];
406
407
// Check for shift.
408
if (c2 == '>')
409
{
410
l->i += 1;
411
bc_lex_assign(l, BC_LEX_OP_ASSIGN_RSHIFT, BC_LEX_OP_RSHIFT);
412
break;
413
}
414
#endif // BC_ENABLE_EXTRA_MATH
415
bc_lex_assign(l, BC_LEX_OP_REL_GE, BC_LEX_OP_REL_GT);
416
break;
417
}
418
419
case '[':
420
case ']':
421
{
422
l->t = (BcLexType) (c - '[' + BC_LEX_LBRACKET);
423
break;
424
}
425
426
case '\\':
427
{
428
// In bc, a backslash+newline is whitespace.
429
if (BC_NO_ERR(l->buf[l->i] == '\n'))
430
{
431
l->i += 1;
432
l->t = BC_LEX_WHITESPACE;
433
}
434
else bc_lex_invalidChar(l, c);
435
break;
436
}
437
438
case '^':
439
{
440
bc_lex_assign(l, BC_LEX_OP_ASSIGN_POWER, BC_LEX_OP_POWER);
441
break;
442
}
443
444
case 'a':
445
case 'b':
446
case 'c':
447
case 'd':
448
case 'e':
449
case 'f':
450
case 'g':
451
case 'h':
452
case 'i':
453
case 'j':
454
case 'k':
455
case 'l':
456
case 'm':
457
case 'n':
458
case 'o':
459
case 'p':
460
case 'q':
461
case 'r':
462
case 's':
463
case 't':
464
case 'u':
465
case 'v':
466
case 'w':
467
case 'x':
468
case 'y':
469
case 'z':
470
{
471
bc_lex_identifier(l);
472
break;
473
}
474
475
case '{':
476
case '}':
477
{
478
l->t = (BcLexType) (c - '{' + BC_LEX_LBRACE);
479
break;
480
}
481
482
case '|':
483
{
484
c2 = l->buf[l->i];
485
486
// Once again, boolean or is not allowed by POSIX.
487
if (BC_NO_ERR(c2 == '|'))
488
{
489
bc_lex_verr(l, BC_ERR_POSIX_BOOL, "||");
490
491
l->i += 1;
492
l->t = BC_LEX_OP_BOOL_OR;
493
}
494
else bc_lex_invalidChar(l, c);
495
496
break;
497
}
498
499
default:
500
{
501
bc_lex_invalidChar(l, c);
502
}
503
}
504
}
505
#endif // BC_ENABLED
506
507