/*1* *****************************************************************************2*3* SPDX-License-Identifier: BSD-2-Clause4*5* Copyright (c) 2018-2025 Gavin D. Howard and contributors.6*7* Redistribution and use in source and binary forms, with or without8* modification, are permitted provided that the following conditions are met:9*10* * Redistributions of source code must retain the above copyright notice, this11* list of conditions and the following disclaimer.12*13* * Redistributions in binary form must reproduce the above copyright notice,14* this list of conditions and the following disclaimer in the documentation15* and/or other materials provided with the distribution.16*17* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"18* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE19* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE20* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE21* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR22* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF23* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS24* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN25* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)26* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE27* POSSIBILITY OF SUCH DAMAGE.28*29* *****************************************************************************30*31* Common code for the lexers.32*33*/3435#include <assert.h>36#include <ctype.h>37#include <stdbool.h>38#include <string.h>3940#include <lex.h>41#include <vm.h>42#include <bc.h>4344void45bc_lex_invalidChar(BcLex* l, char c)46{47l->t = BC_LEX_INVALID;48bc_lex_verr(l, BC_ERR_PARSE_CHAR, c);49}5051void52bc_lex_lineComment(BcLex* l)53{54l->t = BC_LEX_WHITESPACE;55while (l->i < l->len && l->buf[l->i] != '\n')56{57l->i += 1;58}59}6061void62bc_lex_comment(BcLex* l)63{64size_t i, nlines = 0;65const char* buf;66bool end = false, got_more;67char c;6869l->i += 1;70l->t = BC_LEX_WHITESPACE;7172// This loop is complex because it might need to request more data from73// stdin if the comment is not ended. This loop is taken until the comment74// is finished or we have EOF.75do76{77buf = l->buf;78got_more = false;7980// If we are in stdin mode, the buffer must be the one used for stdin.81#if !BC_ENABLE_OSSFUZZ82assert(vm->mode != BC_MODE_STDIN || buf == vm->buffer.v);83#endif // !BC_ENABLE_OSSFUZZ8485// Find the end of the comment.86for (i = l->i; !end; i += !end)87{88// While we don't have an asterisk, eat, but increment nlines.89for (; (c = buf[i]) && c != '*'; ++i)90{91nlines += (c == '\n');92}9394// If this is true, we need to request more data.95if (BC_ERR(!c || buf[i + 1] == '\0'))96{97#if !BC_ENABLE_OSSFUZZ98// Read more, if possible.99if (!vm->eof && l->mode != BC_MODE_FILE)100{101got_more = bc_lex_readLine(l);102}103#endif // !BC_ENABLE_OSSFUZZ104105break;106}107108// If this turns true, we found the end. Yay!109end = (buf[i + 1] == '/');110}111}112while (got_more && !end);113114// If we didn't find the end, barf.115if (!end)116{117l->i = i;118bc_lex_err(l, BC_ERR_PARSE_COMMENT);119}120121l->i = i + 2;122l->line += nlines;123}124125void126bc_lex_whitespace(BcLex* l)127{128char c;129130l->t = BC_LEX_WHITESPACE;131132// Eat. We don't eat newlines because they can be special.133for (c = l->buf[l->i]; c != '\n' && isspace(c); c = l->buf[++l->i])134{135continue;136}137}138139void140bc_lex_commonTokens(BcLex* l, char c)141{142if (!c) l->t = BC_LEX_EOF;143else if (c == '\n') l->t = BC_LEX_NLINE;144else bc_lex_whitespace(l);145}146147/**148* Parses a number.149* @param l The lexer.150* @param start The start character.151* @param int_only Whether this function should only look for an integer. This152* is used to implement the exponent of scientific notation.153*/154static size_t155bc_lex_num(BcLex* l, char start, bool int_only)156{157const char* buf = l->buf + l->i;158size_t i;159char c;160bool last_pt, pt = (start == '.');161162// This loop looks complex. It is not. It is asking if the character is not163// a nul byte and it if it a valid num character based on what we have found164// thus far, or whether it is a backslash followed by a newline. I can do165// i+1 on the buffer because the buffer must have a nul byte.166for (i = 0; (c = buf[i]) && (BC_LEX_NUM_CHAR(c, pt, int_only) ||167(c == '\\' && buf[i + 1] == '\n'));168++i)169{170// I don't need to test that the next character is a newline because171// the loop condition above ensures that.172if (c == '\\')173{174i += 2;175176// Make sure to eat whitespace at the beginning of the line.177while (isspace(buf[i]) && buf[i] != '\n')178{179i += 1;180}181182c = buf[i];183184// If the next character is not a number character, bail.185if (!BC_LEX_NUM_CHAR(c, pt, int_only)) break;186}187188// Did we find the radix point?189last_pt = (c == '.');190191// If we did, and we already have one, then break because it's not part192// of this number.193if (pt && last_pt) break;194195// Set whether we have found a radix point.196pt = pt || last_pt;197198bc_vec_push(&l->str, &c);199}200201return i;202}203204void205bc_lex_number(BcLex* l, char start)206{207l->t = BC_LEX_NUMBER;208209// Make sure the string is clear.210bc_vec_popAll(&l->str);211bc_vec_push(&l->str, &start);212213// Parse the number.214l->i += bc_lex_num(l, start, false);215216#if BC_ENABLE_EXTRA_MATH217{218char c = l->buf[l->i];219220// Do we have a number in scientific notation?221if (c == 'e')222{223#if BC_ENABLED224// Barf for POSIX.225if (BC_IS_POSIX) bc_lex_err(l, BC_ERR_POSIX_EXP_NUM);226#endif // BC_ENABLED227228// Push the e.229bc_vec_push(&l->str, &c);230l->i += 1;231c = l->buf[l->i];232233// Check for negative specifically because bc_lex_num() does not.234if (c == BC_LEX_NEG_CHAR)235{236bc_vec_push(&l->str, &c);237l->i += 1;238c = l->buf[l->i];239}240241// We must have a number character, so barf if not.242if (BC_ERR(!BC_LEX_NUM_CHAR(c, false, true)))243{244bc_lex_verr(l, BC_ERR_PARSE_CHAR, c);245}246247// Parse the exponent.248l->i += bc_lex_num(l, 0, true);249}250}251#endif // BC_ENABLE_EXTRA_MATH252253bc_vec_pushByte(&l->str, '\0');254}255256void257bc_lex_name(BcLex* l)258{259size_t i = 0;260const char* buf = l->buf + l->i - 1;261char c = buf[i];262263l->t = BC_LEX_NAME;264265// Should be obvious. It's looking for valid characters.266while ((c >= 'a' && c <= 'z') || isdigit(c) || c == '_')267{268c = buf[++i];269}270271// Set the string to the identifier.272bc_vec_string(&l->str, i, buf);273274// Increment the index. We minus 1 because it has already been incremented.275l->i += i - 1;276}277278void279bc_lex_init(BcLex* l)280{281BC_SIG_ASSERT_LOCKED;282assert(l != NULL);283bc_vec_init(&l->str, sizeof(char), BC_DTOR_NONE);284}285286void287bc_lex_free(BcLex* l)288{289BC_SIG_ASSERT_LOCKED;290assert(l != NULL);291bc_vec_free(&l->str);292}293294void295bc_lex_file(BcLex* l, const char* file)296{297assert(l != NULL && file != NULL);298l->line = 1;299vm->file = file;300}301302void303bc_lex_next(BcLex* l)304{305BC_SIG_ASSERT_LOCKED;306307assert(l != NULL);308309l->last = l->t;310311// If this wasn't here, the line number would be off.312l->line += (l->i != 0 && l->buf[l->i - 1] == '\n');313314// If the last token was EOF, someone called this one too many times.315if (BC_ERR(l->last == BC_LEX_EOF)) bc_lex_err(l, BC_ERR_PARSE_EOF);316317l->t = BC_LEX_EOF;318319// We are done if this is true.320if (l->i == l->len) return;321322// Loop until failure or we don't have whitespace. This323// is so the parser doesn't get inundated with whitespace.324do325{326vm->next(l);327}328while (l->t == BC_LEX_WHITESPACE);329}330331/**332* Updates the buffer and len so that they are not invalidated when the stdin333* buffer grows.334* @param l The lexer.335* @param text The text.336* @param len The length of the text.337*/338static void339bc_lex_fixText(BcLex* l, const char* text, size_t len)340{341l->buf = text;342l->len = len;343}344345bool346bc_lex_readLine(BcLex* l)347{348bool good;349350// These are reversed because they should be already locked, but351// bc_vm_readLine() needs them to be unlocked.352BC_SIG_UNLOCK;353354// Make sure we read from the appropriate place.355switch (l->mode)356{357case BC_MODE_EXPRS:358{359good = bc_vm_readBuf(false);360break;361}362363case BC_MODE_FILE:364{365good = false;366break;367}368369#if !BC_ENABLE_OSSFUZZ370371case BC_MODE_STDIN:372{373good = bc_vm_readLine(false);374break;375}376377#endif // !BC_ENABLE_OSSFUZZ378379#ifdef __GNUC__380#ifndef __clang__381default:382{383// We should never get here.384abort();385}386#endif // __clang__387#endif // __GNUC__388}389390BC_SIG_LOCK;391392bc_lex_fixText(l, vm->buffer.v, vm->buffer.len - 1);393394return good;395}396397void398bc_lex_text(BcLex* l, const char* text, BcMode mode)399{400BC_SIG_ASSERT_LOCKED;401402assert(l != NULL && text != NULL);403404bc_lex_fixText(l, text, strlen(text));405l->i = 0;406l->t = l->last = BC_LEX_INVALID;407l->mode = mode;408409bc_lex_next(l);410}411412413