Path: blob/main/contrib/llvm-project/lld/ELF/ScriptLexer.cpp
34878 views
//===- ScriptLexer.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a lexer for the linker script.
//
// The linker script's grammar is not complex but ambiguous due to the
// lack of the formal specification of the language. What we are trying to
// do in this and other files in LLD is to make a "reasonable" linker
// script processor.
//
// Among simplicity, compatibility and efficiency, we put the most
// emphasis on simplicity when we wrote this lexer. Compatibility with the
// GNU linkers is important, but we did not try to clone every tiny corner
// case of their lexers, as even ld.bfd and ld.gold are subtly different
// in various corner cases. We do not care much about efficiency because
// the time spent in parsing linker scripts is usually negligible.
//
// Overall, this lexer works fine for most linker scripts.
There might23// be room for improving compatibility, but that's probably not at the24// top of our todo list.25//26//===----------------------------------------------------------------------===//2728#include "ScriptLexer.h"29#include "lld/Common/ErrorHandler.h"30#include "llvm/ADT/Twine.h"31#include "llvm/Support/ErrorHandling.h"32#include <algorithm>3334using namespace llvm;35using namespace lld;36using namespace lld::elf;3738// Returns a whole line containing the current token.39StringRef ScriptLexer::getLine() {40StringRef s = getCurrentMB().getBuffer();41StringRef tok = tokens[pos - 1];4243size_t pos = s.rfind('\n', tok.data() - s.data());44if (pos != StringRef::npos)45s = s.substr(pos + 1);46return s.substr(0, s.find_first_of("\r\n"));47}4849// Returns 1-based line number of the current token.50size_t ScriptLexer::getLineNumber() {51if (pos == 0)52return 1;53StringRef s = getCurrentMB().getBuffer();54StringRef tok = tokens[pos - 1];55const size_t tokOffset = tok.data() - s.data();5657// For the first token, or when going backwards, start from the beginning of58// the buffer. 
If this token is after the previous token, start from the59// previous token.60size_t line = 1;61size_t start = 0;62if (lastLineNumberOffset > 0 && tokOffset >= lastLineNumberOffset) {63start = lastLineNumberOffset;64line = lastLineNumber;65}6667line += s.substr(start, tokOffset - start).count('\n');6869// Store the line number of this token for reuse.70lastLineNumberOffset = tokOffset;71lastLineNumber = line;7273return line;74}7576// Returns 0-based column number of the current token.77size_t ScriptLexer::getColumnNumber() {78StringRef tok = tokens[pos - 1];79return tok.data() - getLine().data();80}8182std::string ScriptLexer::getCurrentLocation() {83std::string filename = std::string(getCurrentMB().getBufferIdentifier());84return (filename + ":" + Twine(getLineNumber())).str();85}8687ScriptLexer::ScriptLexer(MemoryBufferRef mb) { tokenize(mb); }8889// We don't want to record cascading errors. Keep only the first one.90void ScriptLexer::setError(const Twine &msg) {91if (errorCount())92return;9394std::string s = (getCurrentLocation() + ": " + msg).str();95if (pos)96s += "\n>>> " + getLine().str() + "\n>>> " +97std::string(getColumnNumber(), ' ') + "^";98error(s);99}100101// Split S into linker script tokens.102void ScriptLexer::tokenize(MemoryBufferRef mb) {103std::vector<StringRef> vec;104mbs.push_back(mb);105StringRef s = mb.getBuffer();106StringRef begin = s;107108for (;;) {109s = skipSpace(s);110if (s.empty())111break;112113// Quoted token. Note that double-quote characters are parts of a token114// because, in a glob match context, only unquoted tokens are interpreted115// as glob patterns. 
Double-quoted tokens are literal patterns in that116// context.117if (s.starts_with("\"")) {118size_t e = s.find("\"", 1);119if (e == StringRef::npos) {120StringRef filename = mb.getBufferIdentifier();121size_t lineno = begin.substr(0, s.data() - begin.data()).count('\n');122error(filename + ":" + Twine(lineno + 1) + ": unclosed quote");123return;124}125126vec.push_back(s.take_front(e + 1));127s = s.substr(e + 1);128continue;129}130131// Some operators form separate tokens.132if (s.starts_with("<<=") || s.starts_with(">>=")) {133vec.push_back(s.substr(0, 3));134s = s.substr(3);135continue;136}137if (s.size() > 1 && ((s[1] == '=' && strchr("*/+-<>&^|", s[0])) ||138(s[0] == s[1] && strchr("<>&|", s[0])))) {139vec.push_back(s.substr(0, 2));140s = s.substr(2);141continue;142}143144// Unquoted token. This is more relaxed than tokens in C-like language,145// so that you can write "file-name.cpp" as one bare token, for example.146size_t pos = s.find_first_not_of(147"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"148"0123456789_.$/\\~=+[]*?-!^:");149150// A character that cannot start a word (which is usually a151// punctuation) forms a single character token.152if (pos == 0)153pos = 1;154vec.push_back(s.substr(0, pos));155s = s.substr(pos);156}157158tokens.insert(tokens.begin() + pos, vec.begin(), vec.end());159}160161// Skip leading whitespace characters or comments.162StringRef ScriptLexer::skipSpace(StringRef s) {163for (;;) {164if (s.starts_with("/*")) {165size_t e = s.find("*/", 2);166if (e == StringRef::npos) {167setError("unclosed comment in a linker script");168return "";169}170s = s.substr(e + 2);171continue;172}173if (s.starts_with("#")) {174size_t e = s.find('\n', 1);175if (e == StringRef::npos)176e = s.size() - 1;177s = s.substr(e + 1);178continue;179}180size_t size = s.size();181s = s.ltrim();182if (s.size() == size)183return s;184}185}186187// An erroneous token is handled as if it were the last token before EOF.188bool ScriptLexer::atEOF() { return 
errorCount() || tokens.size() == pos; }189190// Split a given string as an expression.191// This function returns "3", "*" and "5" for "3*5" for example.192static std::vector<StringRef> tokenizeExpr(StringRef s) {193StringRef ops = "!~*/+-<>?^:="; // List of operators194195// Quoted strings are literal strings, so we don't want to split it.196if (s.starts_with("\""))197return {s};198199// Split S with operators as separators.200std::vector<StringRef> ret;201while (!s.empty()) {202size_t e = s.find_first_of(ops);203204// No need to split if there is no operator.205if (e == StringRef::npos) {206ret.push_back(s);207break;208}209210// Get a token before the operator.211if (e != 0)212ret.push_back(s.substr(0, e));213214// Get the operator as a token.215// Keep !=, ==, >=, <=, << and >> operators as a single tokens.216if (s.substr(e).starts_with("!=") || s.substr(e).starts_with("==") ||217s.substr(e).starts_with(">=") || s.substr(e).starts_with("<=") ||218s.substr(e).starts_with("<<") || s.substr(e).starts_with(">>")) {219ret.push_back(s.substr(e, 2));220s = s.substr(e + 2);221} else {222ret.push_back(s.substr(e, 1));223s = s.substr(e + 1);224}225}226return ret;227}228229// In contexts where expressions are expected, the lexer should apply230// different tokenization rules than the default one. 
By default,231// arithmetic operator characters are regular characters, but in the232// expression context, they should be independent tokens.233//234// For example, "foo*3" should be tokenized to "foo", "*" and "3" only235// in the expression context.236//237// This function may split the current token into multiple tokens.238void ScriptLexer::maybeSplitExpr() {239if (!inExpr || errorCount() || atEOF())240return;241242std::vector<StringRef> v = tokenizeExpr(tokens[pos]);243if (v.size() == 1)244return;245tokens.erase(tokens.begin() + pos);246tokens.insert(tokens.begin() + pos, v.begin(), v.end());247}248249StringRef ScriptLexer::next() {250maybeSplitExpr();251252if (errorCount())253return "";254if (atEOF()) {255setError("unexpected EOF");256return "";257}258return tokens[pos++];259}260261StringRef ScriptLexer::peek() {262StringRef tok = next();263if (errorCount())264return "";265pos = pos - 1;266return tok;267}268269bool ScriptLexer::consume(StringRef tok) {270if (next() == tok)271return true;272--pos;273return false;274}275276// Consumes Tok followed by ":". 
Space is allowed between Tok and ":".277bool ScriptLexer::consumeLabel(StringRef tok) {278if (consume((tok + ":").str()))279return true;280if (tokens.size() >= pos + 2 && tokens[pos] == tok &&281tokens[pos + 1] == ":") {282pos += 2;283return true;284}285return false;286}287288void ScriptLexer::skip() { (void)next(); }289290void ScriptLexer::expect(StringRef expect) {291if (errorCount())292return;293StringRef tok = next();294if (tok != expect)295setError(expect + " expected, but got " + tok);296}297298// Returns true if S encloses T.299static bool encloses(StringRef s, StringRef t) {300return s.bytes_begin() <= t.bytes_begin() && t.bytes_end() <= s.bytes_end();301}302303MemoryBufferRef ScriptLexer::getCurrentMB() {304// Find input buffer containing the current token.305assert(!mbs.empty());306if (pos == 0)307return mbs.back();308for (MemoryBufferRef mb : mbs)309if (encloses(mb.getBuffer(), tokens[pos - 1]))310return mb;311llvm_unreachable("getCurrentMB: failed to find a token");312}313314315