/*
 * *****************************************************************************
 *
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2018-2025 Gavin D. Howard and contributors.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * *****************************************************************************
 *
 * The lexer for bc.
 *
 */

#if BC_ENABLED

#include <assert.h>
#include <ctype.h>
#include <string.h>

#include <bc.h>
#include <vm.h>

/**
 * Lexes an identifier, which may be a keyword. If the text matches an entry in
 * the keyword table (and that keyword has not been redefined), the token type
 * is set to the keyword's token; otherwise, the text is lexed as a name.
 * @param l  The lexer.
 */
static void
bc_lex_identifier(BcLex* l)
{
	// We already passed the first character, so we need to be sure to include
	// it.
	const char* buf = l->buf + l->i - 1;
	size_t i;

	// This loop is simply checking for keywords.
	for (i = 0; i < bc_lex_kws_len; ++i)
	{
		const BcLexKeyword* kw = bc_lex_kws + i;
		size_t n = BC_LEX_KW_LEN(kw);

		// The keyword only matches if it is not merely a prefix of a longer
		// identifier, so the character right after it must not be alphanumeric
		// or an underscore. The cast to unsigned char is required because
		// passing a negative value (other than EOF) to a <ctype.h> function is
		// undefined behavior, and plain char may be signed. The short-circuit
		// order matters: buf[n] is only read after the first n characters
		// matched.
		if (!strncmp(buf, kw->name, n) && !isalnum((unsigned char) buf[n]) &&
		    buf[n] != '_')
		{
			// If the keyword has been redefined, and redefinition is allowed
			// (it is not allowed for builtin libraries), break out of the loop
			// and use it as a name. This depends on the argument parser to
			// ensure that only non-POSIX keywords get redefined.
			if (!vm->no_redefine && vm->redefined_kws[i]) break;

			// The keyword token is computed as an offset from BC_LEX_KW_AUTO
			// by the keyword's index in the table.
			l->t = BC_LEX_KW_AUTO + (BcLexType) i;

			// Warn or error, as appropriate for the mode, if the keyword is not
			// in the POSIX standard.
			if (!BC_LEX_KW_POSIX(kw)) bc_lex_verr(l, BC_ERR_POSIX_KW, kw->name);

			// We minus 1 because the index has already been incremented.
			l->i += n - 1;

			// Already have the token; bail.
			return;
		}
	}

	// If not a keyword, parse the name.
	bc_lex_name(l);

	// POSIX doesn't allow identifiers that are more than one character, so we
	// might have to warn or error here too.
	if (BC_ERR(l->str.len - 1 > 1))
	{
		bc_lex_verr(l, BC_ERR_POSIX_NAME_LEN, l->str.v);
	}
}

/**
 * Parses a bc string. This is separate from dc strings because dc strings need
 * to be balanced. On entry, l->i is the index just past the opening quote; on
 * success, the string contents are stored in l->str and l->i is moved past the
 * closing quote.
 * @param l  The lexer.
 */
static void
bc_lex_string(BcLex* l)
{
	// We need to keep track of newlines to increment them properly.
	size_t len, nlines, i;
	const char* buf;
	char c;
	bool got_more;

	l->t = BC_LEX_STR;

	do
	{
		// Every pass rescans from l->i, so the newline count starts over, and
		// the buffer pointer is reloaded from the lexer.
		// NOTE(review): presumably bc_lex_readLine() can change l->buf; confirm.
		nlines = 0;
		buf = l->buf;
		got_more = false;

#if !BC_ENABLE_OSSFUZZ
		assert(vm->mode != BC_MODE_STDIN || buf == vm->buffer.v);
#endif // !BC_ENABLE_OSSFUZZ

		// Fortunately for us, bc doesn't escape quotes. Instead, the equivalent
		// is '\q', which makes this loop simpler.
		for (i = l->i; (c = buf[i]) && c != '"'; ++i)
		{
			nlines += (c == '\n');
		}

		// If we hit the end of the buffer without a closing quote, try to get
		// more data, unless input is at EOF or the lexer is in file mode.
		if (BC_ERR(c == '\0') && !vm->eof && l->mode != BC_MODE_FILE)
		{
			got_more = bc_lex_readLine(l);
		}
	}
	while (got_more && c != '"');

	// If the string did not end properly, barf.
	if (c != '"')
	{
		l->i = i;
		bc_lex_err(l, BC_ERR_PARSE_STRING);
	}

	// Set the temp string to the parsed string.
	len = i - l->i;
	bc_vec_string(&l->str, len, l->buf + l->i);

	// Skip the closing quote and account for the lines the string spanned.
	l->i = i + 1;
	l->line += nlines;
}

/**
 * This function takes a lexed operator and checks to see if it's the assignment
 * version, setting the token appropriately. If the next character is '=', it is
 * consumed.
 * @param l        The lexer.
 * @param with     The token to assign if it is an assignment operator.
 * @param without  The token to assign if it is not an assignment operator.
 */
static void
bc_lex_assign(BcLex* l, BcLexType with, BcLexType without)
{
	if (l->buf[l->i] == '=')
	{
		l->i += 1;
		l->t = with;
	}
	else l->t = without;
}

/**
 * Lexes the next token for bc. The character at l->i is consumed and dispatched
 * on; each case either sets l->t directly or hands off to a helper that lexes
 * the rest of the token.
 * @param l  The lexer.
 */
void
bc_lex_token(BcLex* l)
{
	// We increment here. This means that all lexing needs to take that into
	// account, such as when parsing an identifier. If we don't, the first
	// character of every identifier would be missing.
	char c = l->buf[l->i++], c2;

	BC_SIG_ASSERT_LOCKED;

	// This is the workhorse of the lexer.
	switch (c)
	{
		case '\0':
		case '\n':
		case '\t':
		case '\v':
		case '\f':
		case '\r':
		case ' ':
		{
			bc_lex_commonTokens(l, c);
			break;
		}

		case '!':
		{
			// Even though it's not an assignment, we can use this.
			bc_lex_assign(l, BC_LEX_OP_REL_NE, BC_LEX_OP_BOOL_NOT);

			// POSIX doesn't allow boolean not.
			if (l->t == BC_LEX_OP_BOOL_NOT)
			{
				bc_lex_verr(l, BC_ERR_POSIX_BOOL, "!");
			}

			break;
		}

		case '"':
		{
			bc_lex_string(l);
			break;
		}

		case '#':
		{
			// POSIX does not allow line comments.
			bc_lex_err(l, BC_ERR_POSIX_COMMENT);
			bc_lex_lineComment(l);
			break;
		}

		case '%':
		{
			bc_lex_assign(l, BC_LEX_OP_ASSIGN_MODULUS, BC_LEX_OP_MODULUS);
			break;
		}

		case '&':
		{
			c2 = l->buf[l->i];

			// Either we have boolean and or an error. And boolean and is not
			// allowed by POSIX.
			if (BC_NO_ERR(c2 == '&'))
			{
				bc_lex_verr(l, BC_ERR_POSIX_BOOL, "&&");

				l->i += 1;
				l->t = BC_LEX_OP_BOOL_AND;
			}
			else bc_lex_invalidChar(l, c);

			break;
		}
#if BC_ENABLE_EXTRA_MATH
		case '$':
		{
			l->t = BC_LEX_OP_TRUNC;
			break;
		}

		case '@':
		{
			bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLACES, BC_LEX_OP_PLACES);
			break;
		}
#endif // BC_ENABLE_EXTRA_MATH
		case '(':
		case ')':
		{
			// The token is computed from the character's offset from '('.
			// NOTE(review): this assumes the right paren token directly follows
			// BC_LEX_LPAREN in BcLexType; confirm against the enum.
			l->t = (BcLexType) (c - '(' + BC_LEX_LPAREN);
			break;
		}

		case '*':
		{
			bc_lex_assign(l, BC_LEX_OP_ASSIGN_MULTIPLY, BC_LEX_OP_MULTIPLY);
			break;
		}

		case '+':
		{
			c2 = l->buf[l->i];

			// Have to check for increment first.
			if (c2 == '+')
			{
				l->i += 1;
				l->t = BC_LEX_OP_INC;
			}
			else bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLUS, BC_LEX_OP_PLUS);
			break;
		}

		case ',':
		{
			l->t = BC_LEX_COMMA;
			break;
		}

		case '-':
		{
			c2 = l->buf[l->i];

			// Have to check for decrement first.
			if (c2 == '-')
			{
				l->i += 1;
				l->t = BC_LEX_OP_DEC;
			}
			else bc_lex_assign(l, BC_LEX_OP_ASSIGN_MINUS, BC_LEX_OP_MINUS);
			break;
		}

		case '.':
		{
			c2 = l->buf[l->i];

			// If it's alone, it's an alias for last.
			if (BC_LEX_NUM_CHAR(c2, true, false)) bc_lex_number(l, c);
			else
			{
				l->t = BC_LEX_KW_LAST;
				bc_lex_err(l, BC_ERR_POSIX_DOT);
			}

			break;
		}

		case '/':
		{
			c2 = l->buf[l->i];
			if (c2 == '*') bc_lex_comment(l);
			else bc_lex_assign(l, BC_LEX_OP_ASSIGN_DIVIDE, BC_LEX_OP_DIVIDE);
			break;
		}

		case '0':
		case '1':
		case '2':
		case '3':
		case '4':
		case '5':
		case '6':
		case '7':
		case '8':
		case '9':
		case 'A':
		case 'B':
		case 'C':
		case 'D':
		case 'E':
		case 'F':
		// Apparently, GNU bc (and maybe others) allows any uppercase letter as
		// a number. When single digits, they act like the ones above. When
		// multi-digit, any letter above the input base is automatically set to
		// the biggest allowable digit in the input base.
		case 'G':
		case 'H':
		case 'I':
		case 'J':
		case 'K':
		case 'L':
		case 'M':
		case 'N':
		case 'O':
		case 'P':
		case 'Q':
		case 'R':
		case 'S':
		case 'T':
		case 'U':
		case 'V':
		case 'W':
		case 'X':
		case 'Y':
		case 'Z':
		{
			bc_lex_number(l, c);
			break;
		}

		case ';':
		{
			l->t = BC_LEX_SCOLON;
			break;
		}

		case '<':
		{
#if BC_ENABLE_EXTRA_MATH
			c2 = l->buf[l->i];

			// Check for shift.
			if (c2 == '<')
			{
				l->i += 1;
				bc_lex_assign(l, BC_LEX_OP_ASSIGN_LSHIFT, BC_LEX_OP_LSHIFT);
				break;
			}
#endif // BC_ENABLE_EXTRA_MATH
			bc_lex_assign(l, BC_LEX_OP_REL_LE, BC_LEX_OP_REL_LT);
			break;
		}

		case '=':
		{
			bc_lex_assign(l, BC_LEX_OP_REL_EQ, BC_LEX_OP_ASSIGN);
			break;
		}

		case '>':
		{
#if BC_ENABLE_EXTRA_MATH
			c2 = l->buf[l->i];

			// Check for shift.
			if (c2 == '>')
			{
				l->i += 1;
				bc_lex_assign(l, BC_LEX_OP_ASSIGN_RSHIFT, BC_LEX_OP_RSHIFT);
				break;
			}
#endif // BC_ENABLE_EXTRA_MATH
			bc_lex_assign(l, BC_LEX_OP_REL_GE, BC_LEX_OP_REL_GT);
			break;
		}

		case '[':
		case ']':
		{
			// In ASCII, ']' is '[' + 2, so the right bracket maps to
			// BC_LEX_LBRACKET + 2.
			// NOTE(review): presumably BcLexType is laid out so that this lands
			// on the right bracket token; confirm against the enum.
			l->t = (BcLexType) (c - '[' + BC_LEX_LBRACKET);
			break;
		}

		case '\\':
		{
			// In bc, a backslash+newline is whitespace.
			if (BC_NO_ERR(l->buf[l->i] == '\n'))
			{
				l->i += 1;
				l->t = BC_LEX_WHITESPACE;
			}
			else bc_lex_invalidChar(l, c);
			break;
		}

		case '^':
		{
			bc_lex_assign(l, BC_LEX_OP_ASSIGN_POWER, BC_LEX_OP_POWER);
			break;
		}

		case 'a':
		case 'b':
		case 'c':
		case 'd':
		case 'e':
		case 'f':
		case 'g':
		case 'h':
		case 'i':
		case 'j':
		case 'k':
		case 'l':
		case 'm':
		case 'n':
		case 'o':
		case 'p':
		case 'q':
		case 'r':
		case 's':
		case 't':
		case 'u':
		case 'v':
		case 'w':
		case 'x':
		case 'y':
		case 'z':
		{
			bc_lex_identifier(l);
			break;
		}

		case '{':
		case '}':
		{
			// In ASCII, '}' is '{' + 2, so the right brace maps to
			// BC_LEX_LBRACE + 2.
			// NOTE(review): presumably BcLexType is laid out so that this lands
			// on the right brace token; confirm against the enum.
			l->t = (BcLexType) (c - '{' + BC_LEX_LBRACE);
			break;
		}

		case '|':
		{
			c2 = l->buf[l->i];

			// Once again, boolean or is not allowed by POSIX.
			if (BC_NO_ERR(c2 == '|'))
			{
				bc_lex_verr(l, BC_ERR_POSIX_BOOL, "||");

				l->i += 1;
				l->t = BC_LEX_OP_BOOL_OR;
			}
			else bc_lex_invalidChar(l, c);

			break;
		}

		default:
		{
			bc_lex_invalidChar(l, c);
		}
	}
}
#endif // BC_ENABLED