CoCalc -- bc

GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/bc/src/bc_lex.c
39507 views
1
/*
 * *****************************************************************************
 *
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2018-2025 Gavin D. Howard and contributors.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * *****************************************************************************
 *
 * The lexer for bc.
 *
 */
35

36
#if BC_ENABLED
37

38
#include <assert.h>
39
#include <ctype.h>
40
#include <string.h>
41

42
#include <bc.h>
43
#include <vm.h>
44

45
/**
46
 * Lexes an identifier, which may be a keyword.
47
 * @param l  The lexer.
48
 */
49
static void
50
bc_lex_identifier(BcLex* l)
51
{
52
	// We already passed the first character, so we need to be sure to include
53
	// it.
54
	const char* buf = l->buf + l->i - 1;
55
	size_t i;
56

57
	// This loop is simply checking for keywords.
58
	for (i = 0; i < bc_lex_kws_len; ++i)
59
	{
60
		const BcLexKeyword* kw = bc_lex_kws + i;
61
		size_t n = BC_LEX_KW_LEN(kw);
62

63
		if (!strncmp(buf, kw->name, n) && !isalnum(buf[n]) && buf[n] != '_')
64
		{
65
			// If the keyword has been redefined, and redefinition is allowed
66
			// (it is not allowed for builtin libraries), break out of the loop
67
			// and use it as a name. This depends on the argument parser to
68
			// ensure that only non-POSIX keywords get redefined.
69
			if (!vm->no_redefine && vm->redefined_kws[i]) break;
70

71
			l->t = BC_LEX_KW_AUTO + (BcLexType) i;
72

73
			// Warn or error, as appropriate for the mode, if the keyword is not
74
			// in the POSIX standard.
75
			if (!BC_LEX_KW_POSIX(kw)) bc_lex_verr(l, BC_ERR_POSIX_KW, kw->name);
76

77
			// We minus 1 because the index has already been incremented.
78
			l->i += n - 1;
79

80
			// Already have the token; bail.
81
			return;
82
		}
83
	}
84

85
	// If not a keyword, parse the name.
86
	bc_lex_name(l);
87

88
	// POSIX doesn't allow identifiers that are more than one character, so we
89
	// might have to warn or error here too.
90
	if (BC_ERR(l->str.len - 1 > 1))
91
	{
92
		bc_lex_verr(l, BC_ERR_POSIX_NAME_LEN, l->str.v);
93
	}
94
}
95

96
/**
97
 * Parses a bc string. This is separate from dc strings because dc strings need
98
 * to be balanced.
99
 * @param l  The lexer.
100
 */
101
static void
102
bc_lex_string(BcLex* l)
103
{
104
	// We need to keep track of newlines to increment them properly.
105
	size_t len, nlines, i;
106
	const char* buf;
107
	char c;
108
	bool got_more;
109

110
	l->t = BC_LEX_STR;
111

112
	do
113
	{
114
		nlines = 0;
115
		buf = l->buf;
116
		got_more = false;
117

118
#if !BC_ENABLE_OSSFUZZ
119
		assert(vm->mode != BC_MODE_STDIN || buf == vm->buffer.v);
120
#endif // !BC_ENABLE_OSSFUZZ
121

122
		// Fortunately for us, bc doesn't escape quotes. Instead, the equivalent
123
		// is '\q', which makes this loop simpler.
124
		for (i = l->i; (c = buf[i]) && c != '"'; ++i)
125
		{
126
			nlines += (c == '\n');
127
		}
128

129
		if (BC_ERR(c == '\0') && !vm->eof && l->mode != BC_MODE_FILE)
130
		{
131
			got_more = bc_lex_readLine(l);
132
		}
133
	}
134
	while (got_more && c != '"');
135

136
	// If the string did not end properly, barf.
137
	if (c != '"')
138
	{
139
		l->i = i;
140
		bc_lex_err(l, BC_ERR_PARSE_STRING);
141
	}
142

143
	// Set the temp string to the parsed string.
144
	len = i - l->i;
145
	bc_vec_string(&l->str, len, l->buf + l->i);
146

147
	l->i = i + 1;
148
	l->line += nlines;
149
}
150

151
/**
152
 * This function takes a lexed operator and checks to see if it's the assignment
153
 * version, setting the token appropriately.
154
 * @param l        The lexer.
155
 * @param with     The token to assign if it is an assignment operator.
156
 * @param without  The token to assign if it is not an assignment operator.
157
 */
158
static void
159
bc_lex_assign(BcLex* l, BcLexType with, BcLexType without)
160
{
161
	if (l->buf[l->i] == '=')
162
	{
163
		l->i += 1;
164
		l->t = with;
165
	}
166
	else l->t = without;
167
}
168

169
void
170
bc_lex_token(BcLex* l)
171
{
172
	// We increment here. This means that all lexing needs to take that into
173
	// account, such as when parsing an identifier. If we don't, the first
174
	// character of every identifier would be missing.
175
	char c = l->buf[l->i++], c2;
176

177
	BC_SIG_ASSERT_LOCKED;
178

179
	// This is the workhorse of the lexer.
180
	switch (c)
181
	{
182
		case '\0':
183
		case '\n':
184
		case '\t':
185
		case '\v':
186
		case '\f':
187
		case '\r':
188
		case ' ':
189
		{
190
			bc_lex_commonTokens(l, c);
191
			break;
192
		}
193

194
		case '!':
195
		{
196
			// Even though it's not an assignment, we can use this.
197
			bc_lex_assign(l, BC_LEX_OP_REL_NE, BC_LEX_OP_BOOL_NOT);
198

199
			// POSIX doesn't allow boolean not.
200
			if (l->t == BC_LEX_OP_BOOL_NOT)
201
			{
202
				bc_lex_verr(l, BC_ERR_POSIX_BOOL, "!");
203
			}
204

205
			break;
206
		}
207

208
		case '"':
209
		{
210
			bc_lex_string(l);
211
			break;
212
		}
213

214
		case '#':
215
		{
216
			// POSIX does not allow line comments.
217
			bc_lex_err(l, BC_ERR_POSIX_COMMENT);
218
			bc_lex_lineComment(l);
219
			break;
220
		}
221

222
		case '%':
223
		{
224
			bc_lex_assign(l, BC_LEX_OP_ASSIGN_MODULUS, BC_LEX_OP_MODULUS);
225
			break;
226
		}
227

228
		case '&':
229
		{
230
			c2 = l->buf[l->i];
231

232
			// Either we have boolean and or an error. And boolean and is not
233
			// allowed by POSIX.
234
			if (BC_NO_ERR(c2 == '&'))
235
			{
236
				bc_lex_verr(l, BC_ERR_POSIX_BOOL, "&&");
237

238
				l->i += 1;
239
				l->t = BC_LEX_OP_BOOL_AND;
240
			}
241
			else bc_lex_invalidChar(l, c);
242

243
			break;
244
		}
245
#if BC_ENABLE_EXTRA_MATH
246
		case '$':
247
		{
248
			l->t = BC_LEX_OP_TRUNC;
249
			break;
250
		}
251

252
		case '@':
253
		{
254
			bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLACES, BC_LEX_OP_PLACES);
255
			break;
256
		}
257
#endif // BC_ENABLE_EXTRA_MATH
258
		case '(':
259
		case ')':
260
		{
261
			l->t = (BcLexType) (c - '(' + BC_LEX_LPAREN);
262
			break;
263
		}
264

265
		case '*':
266
		{
267
			bc_lex_assign(l, BC_LEX_OP_ASSIGN_MULTIPLY, BC_LEX_OP_MULTIPLY);
268
			break;
269
		}
270

271
		case '+':
272
		{
273
			c2 = l->buf[l->i];
274

275
			// Have to check for increment first.
276
			if (c2 == '+')
277
			{
278
				l->i += 1;
279
				l->t = BC_LEX_OP_INC;
280
			}
281
			else bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLUS, BC_LEX_OP_PLUS);
282
			break;
283
		}
284

285
		case ',':
286
		{
287
			l->t = BC_LEX_COMMA;
288
			break;
289
		}
290

291
		case '-':
292
		{
293
			c2 = l->buf[l->i];
294

295
			// Have to check for decrement first.
296
			if (c2 == '-')
297
			{
298
				l->i += 1;
299
				l->t = BC_LEX_OP_DEC;
300
			}
301
			else bc_lex_assign(l, BC_LEX_OP_ASSIGN_MINUS, BC_LEX_OP_MINUS);
302
			break;
303
		}
304

305
		case '.':
306
		{
307
			c2 = l->buf[l->i];
308

309
			// If it's alone, it's an alias for last.
310
			if (BC_LEX_NUM_CHAR(c2, true, false)) bc_lex_number(l, c);
311
			else
312
			{
313
				l->t = BC_LEX_KW_LAST;
314
				bc_lex_err(l, BC_ERR_POSIX_DOT);
315
			}
316

317
			break;
318
		}
319

320
		case '/':
321
		{
322
			c2 = l->buf[l->i];
323
			if (c2 == '*') bc_lex_comment(l);
324
			else bc_lex_assign(l, BC_LEX_OP_ASSIGN_DIVIDE, BC_LEX_OP_DIVIDE);
325
			break;
326
		}
327

328
		case '0':
329
		case '1':
330
		case '2':
331
		case '3':
332
		case '4':
333
		case '5':
334
		case '6':
335
		case '7':
336
		case '8':
337
		case '9':
338
		case 'A':
339
		case 'B':
340
		case 'C':
341
		case 'D':
342
		case 'E':
343
		case 'F':
344
		// Apparently, GNU bc (and maybe others) allows any uppercase letter as
345
		// a number. When single digits, they act like the ones above. When
346
		// multi-digit, any letter above the input base is automatically set to
347
		// the biggest allowable digit in the input base.
348
		case 'G':
349
		case 'H':
350
		case 'I':
351
		case 'J':
352
		case 'K':
353
		case 'L':
354
		case 'M':
355
		case 'N':
356
		case 'O':
357
		case 'P':
358
		case 'Q':
359
		case 'R':
360
		case 'S':
361
		case 'T':
362
		case 'U':
363
		case 'V':
364
		case 'W':
365
		case 'X':
366
		case 'Y':
367
		case 'Z':
368
		{
369
			bc_lex_number(l, c);
370
			break;
371
		}
372

373
		case ';':
374
		{
375
			l->t = BC_LEX_SCOLON;
376
			break;
377
		}
378

379
		case '<':
380
		{
381
#if BC_ENABLE_EXTRA_MATH
382
			c2 = l->buf[l->i];
383

384
			// Check for shift.
385
			if (c2 == '<')
386
			{
387
				l->i += 1;
388
				bc_lex_assign(l, BC_LEX_OP_ASSIGN_LSHIFT, BC_LEX_OP_LSHIFT);
389
				break;
390
			}
391
#endif // BC_ENABLE_EXTRA_MATH
392
			bc_lex_assign(l, BC_LEX_OP_REL_LE, BC_LEX_OP_REL_LT);
393
			break;
394
		}
395

396
		case '=':
397
		{
398
			bc_lex_assign(l, BC_LEX_OP_REL_EQ, BC_LEX_OP_ASSIGN);
399
			break;
400
		}
401

402
		case '>':
403
		{
404
#if BC_ENABLE_EXTRA_MATH
405
			c2 = l->buf[l->i];
406

407
			// Check for shift.
408
			if (c2 == '>')
409
			{
410
				l->i += 1;
411
				bc_lex_assign(l, BC_LEX_OP_ASSIGN_RSHIFT, BC_LEX_OP_RSHIFT);
412
				break;
413
			}
414
#endif // BC_ENABLE_EXTRA_MATH
415
			bc_lex_assign(l, BC_LEX_OP_REL_GE, BC_LEX_OP_REL_GT);
416
			break;
417
		}
418

419
		case '[':
420
		case ']':
421
		{
422
			l->t = (BcLexType) (c - '[' + BC_LEX_LBRACKET);
423
			break;
424
		}
425

426
		case '\\':
427
		{
428
			// In bc, a backslash+newline is whitespace.
429
			if (BC_NO_ERR(l->buf[l->i] == '\n'))
430
			{
431
				l->i += 1;
432
				l->t = BC_LEX_WHITESPACE;
433
			}
434
			else bc_lex_invalidChar(l, c);
435
			break;
436
		}
437

438
		case '^':
439
		{
440
			bc_lex_assign(l, BC_LEX_OP_ASSIGN_POWER, BC_LEX_OP_POWER);
441
			break;
442
		}
443

444
		case 'a':
445
		case 'b':
446
		case 'c':
447
		case 'd':
448
		case 'e':
449
		case 'f':
450
		case 'g':
451
		case 'h':
452
		case 'i':
453
		case 'j':
454
		case 'k':
455
		case 'l':
456
		case 'm':
457
		case 'n':
458
		case 'o':
459
		case 'p':
460
		case 'q':
461
		case 'r':
462
		case 's':
463
		case 't':
464
		case 'u':
465
		case 'v':
466
		case 'w':
467
		case 'x':
468
		case 'y':
469
		case 'z':
470
		{
471
			bc_lex_identifier(l);
472
			break;
473
		}
474

475
		case '{':
476
		case '}':
477
		{
478
			l->t = (BcLexType) (c - '{' + BC_LEX_LBRACE);
479
			break;
480
		}
481

482
		case '|':
483
		{
484
			c2 = l->buf[l->i];
485

486
			// Once again, boolean or is not allowed by POSIX.
487
			if (BC_NO_ERR(c2 == '|'))
488
			{
489
				bc_lex_verr(l, BC_ERR_POSIX_BOOL, "||");
490

491
				l->i += 1;
492
				l->t = BC_LEX_OP_BOOL_OR;
493
			}
494
			else bc_lex_invalidChar(l, c);
495

496
			break;
497
		}
498

499
		default:
500
		{
501
			bc_lex_invalidChar(l, c);
502
		}
503
	}
504
}
505
#endif // BC_ENABLED
506

507
Product

Resources

Company