/*1* TclRegComp and TclRegExec -- TclRegSub is elsewhere2*3* Copyright (c) 1986 by University of Toronto.4* Written by Henry Spencer. Not derived from licensed software.5*6* Permission is granted to anyone to use this software for any7* purpose on any computer system, and to redistribute it freely,8* subject to the following restrictions:9*10* 1. The author is not responsible for the consequences of use of11* this software, no matter how awful, even if they arise12* from defects in it.13*14* 2. The origin of this software must not be misrepresented, either15* by explicit claim or by omission.16*17* 3. Altered versions must be plainly marked as such, and must not18* be misrepresented as being the original software.19*20* Beware that some of this code is subtly aware of the way operator21* precedence is structured in regular expressions. Serious changes in22* regular-expression syntax might require a total rethink.23*24* *** NOTE: this code has been altered slightly for use in Tcl: ***25* *** 1. Use ckalloc and ckfree instead of malloc and free. ***26* *** 2. Add extra argument to regexp to specify the real ***27* *** start of the string separately from the start of the ***28* *** current search. This is needed to search for multiple ***29* *** matches within a string. ***30* *** 3. Names have been changed, e.g. from regcomp to ***31* *** TclRegComp, to avoid clashes with other ***32* *** regexp implementations used by applications. ***33* *** 4. Added errMsg declaration and TclRegError procedure ***34* *** 5. Various lint-like things, such as casting arguments ***35* *** in procedure calls. ***36*37* *** NOTE: This code has been altered for use in MT-Sturdy Tcl ***38* *** 1. All use of static variables has been changed to access ***39* *** fields of a structure. ***40* *** 2. This in addition to changes to TclRegError makes the ***41* *** code multi-thread safe. ***42*43* SCCS: @(#) regexp.c 1.12 96/04/02 13:54:5744*/4546#include "tclInt.h"47#include "tclPort.h"4849/*50* The variable below is set to NULL before invoking regexp functions51* and checked after those functions. If an error occurred then TclRegError52* will set the variable to point to a (static) error message. This53* mechanism unfortunately does not support multi-threading, but the54* procedures TclRegError and TclGetRegError can be modified to use55* thread-specific storage for the variable and thereby make the code56* thread-safe.57*/5859static char *errMsg = NULL;6061/*62* The "internal use only" fields in regexp.h are present to pass info from63* compile to execute that permits the execute phase to run lots faster on64* simple cases. They are:65*66* regstart char that must begin a match; '\0' if none obvious67* reganch is the match anchored (at beginning-of-line only)?68* regmust string (pointer into program) that match must include, or NULL69* regmlen length of regmust string70*71* Regstart and reganch permit very fast decisions on suitable starting points72* for a match, cutting down the work a lot. Regmust permits fast rejection73* of lines that cannot possibly match. The regmust tests are costly enough74* that TclRegComp() supplies a regmust only if the r.e. contains something75* potentially expensive (at present, the only such thing detected is * or +76* at the start of the r.e., which can involve a lot of backup). Regmlen is77* supplied because the test in TclRegExec() needs it and TclRegComp() is78* computing it anyway.79*/8081/*82* Structure for regexp "program". This is essentially a linear encoding83* of a nondeterministic finite-state machine (aka syntax charts or84* "railroad normal form" in parsing technology). Each node is an opcode85* plus a "next" pointer, possibly plus an operand. "Next" pointers of86* all nodes except BRANCH implement concatenation; a "next" pointer with87* a BRANCH on both ends of it is connecting two alternatives. (Here we88* have one of the subtle syntax dependencies: an individual BRANCH (as89* opposed to a collection of them) is never concatenated with anything90* because of operator precedence.) The operand of some types of node is91* a literal string; for others, it is a node leading into a sub-FSM. In92* particular, the operand of a BRANCH node is the first node of the branch.93* (NB this is *not* a tree structure: the tail of the branch connects94* to the thing following the set of BRANCHes.) The opcodes are:95*/9697/* definition number opnd? meaning */98#define END 0 /* no End of program. */99#define BOL 1 /* no Match "" at beginning of line. */100#define EOL 2 /* no Match "" at end of line. */101#define ANY 3 /* no Match any one character. */102#define ANYOF 4 /* str Match any character in this string. */103#define ANYBUT 5 /* str Match any character not in this string. */104#define BRANCH 6 /* node Match this alternative, or the next... */105#define BACK 7 /* no Match "", "next" ptr points backward. */106#define EXACTLY 8 /* str Match this string. */107#define NOTHING 9 /* no Match empty string. */108#define STAR 10 /* node Match this (simple) thing 0 or more times. */109#define PLUS 11 /* node Match this (simple) thing 1 or more times. */110#define OPEN 20 /* no Mark this point in input as start of #n. */111/* OPEN+1 is number 1, etc. */112#define CLOSE (OPEN+NSUBEXP) /* no Analogous to OPEN. */113114/*115* Opcode notes:116*117* BRANCH The set of branches constituting a single choice are hooked118* together with their "next" pointers, since precedence prevents119* anything being concatenated to any individual branch. The120* "next" pointer of the last BRANCH in a choice points to the121* thing following the whole choice. This is also where the122* final "next" pointer of each individual branch points; each123* branch starts with the operand node of a BRANCH node.124*125* BACK Normal "next" pointers all implicitly point forward; BACK126* exists to make loop structures possible.127*128* STAR,PLUS '?', and complex '*' and '+', are implemented as circular129* BRANCH structures using BACK. Simple cases (one character130* per match) are implemented with STAR and PLUS for speed131* and to minimize recursive plunges.132*133* OPEN,CLOSE ...are numbered at compile time.134*/135136/*137* A node is one char of opcode followed by two chars of "next" pointer.138* "Next" pointers are stored as two 8-bit pieces, high order first. The139* value is a positive offset from the opcode of the node containing it.140* An operand, if any, simply follows the node. (Note that much of the141* code generation knows about this implicit relationship.)142*143* Using two bytes for the "next" pointer is vast overkill for most things,144* but allows patterns to get big without disasters.145*/146#define OP(p) (*(p))147#define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))148#define OPERAND(p) ((p) + 3)149150/*151* See regmagic.h for one further detail of program structure.152*/153154155/*156* Utility definitions.157*/158#ifndef CHARBITS159#define UCHARAT(p) ((int)*(unsigned char *)(p))160#else161#define UCHARAT(p) ((int)*(p)&CHARBITS)162#endif163164#define FAIL(m) { TclRegError(m); return(NULL); }165#define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')166#define META "^$.[()|?+*\\"167168/*169* Flags to be passed up and down.170*/171#define HASWIDTH 01 /* Known never to match null string. */172#define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */173#define SPSTART 04 /* Starts with * or +. */174#define WORST 0 /* Worst case. */175176/*177* Global work variables for TclRegComp().178*/179struct regcomp_state {180char *regparse; /* Input-scan pointer. */181int regnpar; /* () count. */182char *regcode; /* Code-emit pointer; ®dummy = don't. */183long regsize; /* Code size. */184};185186static char regdummy;187188/*189* The first byte of the regexp internal "program" is actually this magic190* number; the start node begins in the second byte.191*/192#define MAGIC 0234193194195/*196* Forward declarations for TclRegComp()'s friends.197*/198199static char * reg _ANSI_ARGS_((int paren, int *flagp,200struct regcomp_state *rcstate));201static char * regatom _ANSI_ARGS_((int *flagp,202struct regcomp_state *rcstate));203static char * regbranch _ANSI_ARGS_((int *flagp,204struct regcomp_state *rcstate));205static void regc _ANSI_ARGS_((int b,206struct regcomp_state *rcstate));207static void reginsert _ANSI_ARGS_((int op, char *opnd,208struct regcomp_state *rcstate));209static char * regnext _ANSI_ARGS_((char *p));210static char * regnode _ANSI_ARGS_((int op,211struct regcomp_state *rcstate));212static void regoptail _ANSI_ARGS_((char *p, char *val));213static char * regpiece _ANSI_ARGS_((int *flagp,214struct regcomp_state *rcstate));215static void regtail _ANSI_ARGS_((char *p, char *val));216217#ifdef STRCSPN218static int strcspn _ANSI_ARGS_((char *s1, char *s2));219#endif220221/*222- TclRegComp - compile a regular expression into internal code223*224* We can't allocate space until we know how big the compiled form will be,225* but we can't compile it (and thus know how big it is) until we've got a226* place to put the code. So we cheat: we compile it twice, once with code227* generation turned off and size counting turned on, and once "for real".228* This also means that we don't allocate space until we are sure that the229* thing really will compile successfully, and we never have to move the230* code and thus invalidate pointers into it. (Note that it has to be in231* one piece because free() must be able to free it all.)232*233* Beware that the optimization-preparation code in here knows about some234* of the structure of the compiled regexp.235*/236regexp *237TclRegComp(exp)238char *exp;239{240register regexp *r;241register char *scan;242register char *longest;243register int len;244int flags;245struct regcomp_state state;246struct regcomp_state *rcstate= &state;247248if (exp == NULL)249FAIL("NULL argument");250251/* First pass: determine size, legality. */252rcstate->regparse = exp;253rcstate->regnpar = 1;254rcstate->regsize = 0L;255rcstate->regcode = ®dummy;256regc(MAGIC, rcstate);257if (reg(0, &flags, rcstate) == NULL)258return(NULL);259260/* Small enough for pointer-storage convention? */261if (rcstate->regsize >= 32767L) /* Probably could be 65535L. */262FAIL("regexp too big");263264/* Allocate space. */265r = (regexp *)ckalloc(sizeof(regexp) + (unsigned)rcstate->regsize);266if (r == NULL)267FAIL("out of space");268269/* Second pass: emit code. */270rcstate->regparse = exp;271rcstate->regnpar = 1;272rcstate->regcode = r->program;273regc(MAGIC, rcstate);274if (reg(0, &flags, rcstate) == NULL)275return(NULL);276277/* Dig out information for optimizations. */278r->regstart = '\0'; /* Worst-case defaults. */279r->reganch = 0;280r->regmust = NULL;281r->regmlen = 0;282scan = r->program+1; /* First BRANCH. */283if (OP(regnext(scan)) == END) { /* Only one top-level choice. */284scan = OPERAND(scan);285286/* Starting-point info. */287if (OP(scan) == EXACTLY)288r->regstart = *OPERAND(scan);289else if (OP(scan) == BOL)290r->reganch++;291292/*293* If there's something expensive in the r.e., find the294* longest literal string that must appear and make it the295* regmust. Resolve ties in favor of later strings, since296* the regstart check works with the beginning of the r.e.297* and avoiding duplication strengthens checking. Not a298* strong reason, but sufficient in the absence of others.299*/300if (flags&SPSTART) {301longest = NULL;302len = 0;303for (; scan != NULL; scan = regnext(scan))304if (OP(scan) == EXACTLY && ((int) strlen(OPERAND(scan))) >= len) {305longest = OPERAND(scan);306len = strlen(OPERAND(scan));307}308r->regmust = longest;309r->regmlen = len;310}311}312313return(r);314}315316/*317- reg - regular expression, i.e. main body or parenthesized thing318*319* Caller must absorb opening parenthesis.320*321* Combining parenthesis handling with the base level of regular expression322* is a trifle forced, but the need to tie the tails of the branches to what323* follows makes it hard to avoid.324*/325static char *326reg(paren, flagp, rcstate)327int paren; /* Parenthesized? */328int *flagp;329struct regcomp_state *rcstate;330{331register char *ret;332register char *br;333register char *ender;334register int parno = 0;335int flags;336337*flagp = HASWIDTH; /* Tentatively. */338339/* Make an OPEN node, if parenthesized. */340if (paren) {341if (rcstate->regnpar >= NSUBEXP)342FAIL("too many ()");343parno = rcstate->regnpar;344rcstate->regnpar++;345ret = regnode(OPEN+parno,rcstate);346} else347ret = NULL;348349/* Pick up the branches, linking them together. */350br = regbranch(&flags,rcstate);351if (br == NULL)352return(NULL);353if (ret != NULL)354regtail(ret, br); /* OPEN -> first. */355else356ret = br;357if (!(flags&HASWIDTH))358*flagp &= ~HASWIDTH;359*flagp |= flags&SPSTART;360while (*rcstate->regparse == '|') {361rcstate->regparse++;362br = regbranch(&flags,rcstate);363if (br == NULL)364return(NULL);365regtail(ret, br); /* BRANCH -> BRANCH. */366if (!(flags&HASWIDTH))367*flagp &= ~HASWIDTH;368*flagp |= flags&SPSTART;369}370371/* Make a closing node, and hook it on the end. */372ender = regnode((paren) ? CLOSE+parno : END,rcstate);373regtail(ret, ender);374375/* Hook the tails of the branches to the closing node. */376for (br = ret; br != NULL; br = regnext(br))377regoptail(br, ender);378379/* Check for proper termination. */380if (paren && *rcstate->regparse++ != ')') {381FAIL("unmatched ()");382} else if (!paren && *rcstate->regparse != '\0') {383if (*rcstate->regparse == ')') {384FAIL("unmatched ()");385} else386FAIL("junk on end"); /* "Can't happen". */387/* NOTREACHED */388}389390return(ret);391}392393/*394- regbranch - one alternative of an | operator395*396* Implements the concatenation operator.397*/398static char *399regbranch(flagp, rcstate)400int *flagp;401struct regcomp_state *rcstate;402{403register char *ret;404register char *chain;405register char *latest;406int flags;407408*flagp = WORST; /* Tentatively. */409410ret = regnode(BRANCH,rcstate);411chain = NULL;412while (*rcstate->regparse != '\0' && *rcstate->regparse != '|' &&413*rcstate->regparse != ')') {414latest = regpiece(&flags, rcstate);415if (latest == NULL)416return(NULL);417*flagp |= flags&HASWIDTH;418if (chain == NULL) /* First piece. */419*flagp |= flags&SPSTART;420else421regtail(chain, latest);422chain = latest;423}424if (chain == NULL) /* Loop ran zero times. */425(void) regnode(NOTHING,rcstate);426427return(ret);428}429430/*431- regpiece - something followed by possible [*+?]432*433* Note that the branching code sequences used for ? and the general cases434* of * and + are somewhat optimized: they use the same NOTHING node as435* both the endmarker for their branch list and the body of the last branch.436* It might seem that this node could be dispensed with entirely, but the437* endmarker role is not redundant.438*/439static char *440regpiece(flagp, rcstate)441int *flagp;442struct regcomp_state *rcstate;443{444register char *ret;445register char op;446register char *next;447int flags;448449ret = regatom(&flags,rcstate);450if (ret == NULL)451return(NULL);452453op = *rcstate->regparse;454if (!ISMULT(op)) {455*flagp = flags;456return(ret);457}458459if (!(flags&HASWIDTH) && op != '?')460FAIL("*+ operand could be empty");461*flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);462463if (op == '*' && (flags&SIMPLE))464reginsert(STAR, ret, rcstate);465else if (op == '*') {466/* Emit x* as (x&|), where & means "self". */467reginsert(BRANCH, ret, rcstate); /* Either x */468regoptail(ret, regnode(BACK,rcstate)); /* and loop */469regoptail(ret, ret); /* back */470regtail(ret, regnode(BRANCH,rcstate)); /* or */471regtail(ret, regnode(NOTHING,rcstate)); /* null. */472} else if (op == '+' && (flags&SIMPLE))473reginsert(PLUS, ret, rcstate);474else if (op == '+') {475/* Emit x+ as x(&|), where & means "self". */476next = regnode(BRANCH,rcstate); /* Either */477regtail(ret, next);478regtail(regnode(BACK,rcstate), ret); /* loop back */479regtail(next, regnode(BRANCH,rcstate)); /* or */480regtail(ret, regnode(NOTHING,rcstate)); /* null. */481} else if (op == '?') {482/* Emit x? as (x|) */483reginsert(BRANCH, ret, rcstate); /* Either x */484regtail(ret, regnode(BRANCH,rcstate)); /* or */485next = regnode(NOTHING,rcstate); /* null. */486regtail(ret, next);487regoptail(ret, next);488}489rcstate->regparse++;490if (ISMULT(*rcstate->regparse))491FAIL("nested *?+");492493return(ret);494}495496/*497- regatom - the lowest level498*499* Optimization: gobbles an entire sequence of ordinary characters so that500* it can turn them into a single node, which is smaller to store and501* faster to run. Backslashed characters are exceptions, each becoming a502* separate node; the code is simpler that way and it's not worth fixing.503*/504static char *505regatom(flagp, rcstate)506int *flagp;507struct regcomp_state *rcstate;508{509register char *ret;510int flags;511512*flagp = WORST; /* Tentatively. */513514switch (*rcstate->regparse++) {515case '^':516ret = regnode(BOL,rcstate);517break;518case '$':519ret = regnode(EOL,rcstate);520break;521case '.':522ret = regnode(ANY,rcstate);523*flagp |= HASWIDTH|SIMPLE;524break;525case '[': {526register int clss;527register int classend;528529if (*rcstate->regparse == '^') { /* Complement of range. */530ret = regnode(ANYBUT,rcstate);531rcstate->regparse++;532} else533ret = regnode(ANYOF,rcstate);534if (*rcstate->regparse == ']' || *rcstate->regparse == '-')535regc(*rcstate->regparse++,rcstate);536while (*rcstate->regparse != '\0' && *rcstate->regparse != ']') {537if (*rcstate->regparse == '-') {538rcstate->regparse++;539if (*rcstate->regparse == ']' || *rcstate->regparse == '\0')540regc('-',rcstate);541else {542clss = UCHARAT(rcstate->regparse-2)+1;543classend = UCHARAT(rcstate->regparse);544if (clss > classend+1)545FAIL("invalid [] range");546for (; clss <= classend; clss++)547regc((char)clss,rcstate);548rcstate->regparse++;549}550} else551regc(*rcstate->regparse++,rcstate);552}553regc('\0',rcstate);554if (*rcstate->regparse != ']')555FAIL("unmatched []");556rcstate->regparse++;557*flagp |= HASWIDTH|SIMPLE;558}559break;560case '(':561ret = reg(1, &flags, rcstate);562if (ret == NULL)563return(NULL);564*flagp |= flags&(HASWIDTH|SPSTART);565break;566case '\0':567case '|':568case ')':569FAIL("internal urp"); /* Supposed to be caught earlier. */570/* NOTREACHED */571break;572case '?':573case '+':574case '*':575FAIL("?+* follows nothing");576/* NOTREACHED */577break;578case '\\':579if (*rcstate->regparse == '\0')580FAIL("trailing \\");581ret = regnode(EXACTLY,rcstate);582regc(*rcstate->regparse++,rcstate);583regc('\0',rcstate);584*flagp |= HASWIDTH|SIMPLE;585break;586default: {587register int len;588register char ender;589590rcstate->regparse--;591len = strcspn(rcstate->regparse, META);592if (len <= 0)593FAIL("internal disaster");594ender = *(rcstate->regparse+len);595if (len > 1 && ISMULT(ender))596len--; /* Back off clear of ?+* operand. */597*flagp |= HASWIDTH;598if (len == 1)599*flagp |= SIMPLE;600ret = regnode(EXACTLY,rcstate);601while (len > 0) {602regc(*rcstate->regparse++,rcstate);603len--;604}605regc('\0',rcstate);606}607break;608}609610return(ret);611}612613/*614- regnode - emit a node615*/616static char * /* Location. */617regnode(op, rcstate)618int op;619struct regcomp_state *rcstate;620{621register char *ret;622register char *ptr;623624ret = rcstate->regcode;625if (ret == ®dummy) {626rcstate->regsize += 3;627return(ret);628}629630ptr = ret;631*ptr++ = (char)op;632*ptr++ = '\0'; /* Null "next" pointer. */633*ptr++ = '\0';634rcstate->regcode = ptr;635636return(ret);637}638639/*640- regc - emit (if appropriate) a byte of code641*/642static void643regc(b, rcstate)644int b;645struct regcomp_state *rcstate;646{647if (rcstate->regcode != ®dummy)648*rcstate->regcode++ = (char)b;649else650rcstate->regsize++;651}652653/*654- reginsert - insert an operator in front of already-emitted operand655*656* Means relocating the operand.657*/658static void659reginsert(op, opnd, rcstate)660int op;661char *opnd;662struct regcomp_state *rcstate;663{664register char *src;665register char *dst;666register char *place;667668if (rcstate->regcode == ®dummy) {669rcstate->regsize += 3;670return;671}672673src = rcstate->regcode;674rcstate->regcode += 3;675dst = rcstate->regcode;676while (src > opnd)677*--dst = *--src;678679place = opnd; /* Op node, where operand used to be. */680*place++ = (char)op;681*place++ = '\0';682*place = '\0';683}684685/*686- regtail - set the next-pointer at the end of a node chain687*/688static void689regtail(p, val)690char *p;691char *val;692{693register char *scan;694register char *temp;695register int offset;696697if (p == ®dummy)698return;699700/* Find last node. */701scan = p;702for (;;) {703temp = regnext(scan);704if (temp == NULL)705break;706scan = temp;707}708709if (OP(scan) == BACK)710offset = scan - val;711else712offset = val - scan;713*(scan+1) = (char)((offset>>8)&0377);714*(scan+2) = (char)(offset&0377);715}716717/*718- regoptail - regtail on operand of first argument; nop if operandless719*/720static void721regoptail(p, val)722char *p;723char *val;724{725/* "Operandless" and "op != BRANCH" are synonymous in practice. */726if (p == NULL || p == ®dummy || OP(p) != BRANCH)727return;728regtail(OPERAND(p), val);729}730731/*732* TclRegExec and friends733*/734735/*736* Global work variables for TclRegExec().737*/738struct regexec_state {739char *reginput; /* String-input pointer. */740char *regbol; /* Beginning of input, for ^ check. */741char **regstartp; /* Pointer to startp array. */742char **regendp; /* Ditto for endp. */743};744745/*746* Forwards.747*/748static int regtry _ANSI_ARGS_((regexp *prog, char *string,749struct regexec_state *restate));750static int regmatch _ANSI_ARGS_((char *prog,751struct regexec_state *restate));752static int regrepeat _ANSI_ARGS_((char *p,753struct regexec_state *restate));754755#ifdef DEBUG756int regnarrate = 0;757void regdump _ANSI_ARGS_((regexp *r));758static char *regprop _ANSI_ARGS_((char *op));759#endif760761/*762- TclRegExec - match a regexp against a string763*/764int765TclRegExec(prog, string, start)766register regexp *prog;767register char *string;768char *start;769{770register char *s;771struct regexec_state state;772struct regexec_state *restate= &state;773774/* Be paranoid... */775if (prog == NULL || string == NULL) {776TclRegError("NULL parameter");777return(0);778}779780/* Check validity of program. */781if (UCHARAT(prog->program) != MAGIC) {782TclRegError("corrupted program");783return(0);784}785786/* If there is a "must appear" string, look for it. */787if (prog->regmust != NULL) {788s = string;789while ((s = strchr(s, prog->regmust[0])) != NULL) {790if (strncmp(s, prog->regmust, (size_t) prog->regmlen)791== 0)792break; /* Found it. */793s++;794}795if (s == NULL) /* Not present. */796return(0);797}798799/* Mark beginning of line for ^ . */800restate->regbol = start;801802/* Simplest case: anchored match need be tried only once. */803if (prog->reganch)804return(regtry(prog, string, restate));805806/* Messy cases: unanchored match. */807s = string;808if (prog->regstart != '\0')809/* We know what char it must start with. */810while ((s = strchr(s, prog->regstart)) != NULL) {811if (regtry(prog, s, restate))812return(1);813s++;814}815else816/* We don't -- general case. */817do {818if (regtry(prog, s, restate))819return(1);820} while (*s++ != '\0');821822/* Failure. */823return(0);824}825826/*827- regtry - try match at specific point828*/829static int /* 0 failure, 1 success */830regtry(prog, string, restate)831regexp *prog;832char *string;833struct regexec_state *restate;834{835register int i;836register char **sp;837register char **ep;838839restate->reginput = string;840restate->regstartp = prog->startp;841restate->regendp = prog->endp;842843sp = prog->startp;844ep = prog->endp;845for (i = NSUBEXP; i > 0; i--) {846*sp++ = NULL;847*ep++ = NULL;848}849if (regmatch(prog->program + 1,restate)) {850prog->startp[0] = string;851prog->endp[0] = restate->reginput;852return(1);853} else854return(0);855}856857/*858- regmatch - main matching routine859*860* Conceptually the strategy is simple: check to see whether the current861* node matches, call self recursively to see whether the rest matches,862* and then act accordingly. In practice we make some effort to avoid863* recursion, in particular by going through "ordinary" nodes (that don't864* need to know whether the rest of the match failed) by a loop instead of865* by recursion.866*/867static int /* 0 failure, 1 success */868regmatch(prog, restate)869char *prog;870struct regexec_state *restate;871{872register char *scan; /* Current node. */873char *next; /* Next node. */874875scan = prog;876#ifdef DEBUG877if (scan != NULL && regnarrate)878fprintf(stderr, "%s(\n", regprop(scan));879#endif880while (scan != NULL) {881#ifdef DEBUG882if (regnarrate)883fprintf(stderr, "%s...\n", regprop(scan));884#endif885next = regnext(scan);886887switch (OP(scan)) {888case BOL:889if (restate->reginput != restate->regbol) {890return 0;891}892break;893case EOL:894if (*restate->reginput != '\0') {895return 0;896}897break;898case ANY:899if (*restate->reginput == '\0') {900return 0;901}902restate->reginput++;903break;904case EXACTLY: {905register int len;906register char *opnd;907908opnd = OPERAND(scan);909/* Inline the first character, for speed. */910if (*opnd != *restate->reginput) {911return 0 ;912}913len = strlen(opnd);914if (len > 1 && strncmp(opnd, restate->reginput, (size_t) len)915!= 0) {916return 0;917}918restate->reginput += len;919break;920}921case ANYOF:922if (*restate->reginput == '\0'923|| strchr(OPERAND(scan), *restate->reginput) == NULL) {924return 0;925}926restate->reginput++;927break;928case ANYBUT:929if (*restate->reginput == '\0'930|| strchr(OPERAND(scan), *restate->reginput) != NULL) {931return 0;932}933restate->reginput++;934break;935case NOTHING:936break;937case BACK:938break;939case OPEN+1:940case OPEN+2:941case OPEN+3:942case OPEN+4:943case OPEN+5:944case OPEN+6:945case OPEN+7:946case OPEN+8:947case OPEN+9: {948register int no;949register char *save;950951doOpen:952no = OP(scan) - OPEN;953save = restate->reginput;954955if (regmatch(next,restate)) {956/*957* Don't set startp if some later invocation of the958* same parentheses already has.959*/960if (restate->regstartp[no] == NULL) {961restate->regstartp[no] = save;962}963return 1;964} else {965return 0;966}967}968case CLOSE+1:969case CLOSE+2:970case CLOSE+3:971case CLOSE+4:972case CLOSE+5:973case CLOSE+6:974case CLOSE+7:975case CLOSE+8:976case CLOSE+9: {977register int no;978register char *save;979980doClose:981no = OP(scan) - CLOSE;982save = restate->reginput;983984if (regmatch(next,restate)) {985/*986* Don't set endp if some later987* invocation of the same parentheses988* already has.989*/990if (restate->regendp[no] == NULL)991restate->regendp[no] = save;992return 1;993} else {994return 0;995}996}997case BRANCH: {998register char *save;9991000if (OP(next) != BRANCH) { /* No choice. */1001next = OPERAND(scan); /* Avoid recursion. */1002} else {1003do {1004save = restate->reginput;1005if (regmatch(OPERAND(scan),restate))1006return(1);1007restate->reginput = save;1008scan = regnext(scan);1009} while (scan != NULL && OP(scan) == BRANCH);1010return 0;1011}1012break;1013}1014case STAR:1015case PLUS: {1016register char nextch;1017register int no;1018register char *save;1019register int min;10201021/*1022* Lookahead to avoid useless match attempts1023* when we know what character comes next.1024*/1025nextch = '\0';1026if (OP(next) == EXACTLY)1027nextch = *OPERAND(next);1028min = (OP(scan) == STAR) ? 0 : 1;1029save = restate->reginput;1030no = regrepeat(OPERAND(scan),restate);1031while (no >= min) {1032/* If it could work, try it. */1033if (nextch == '\0' || *restate->reginput == nextch)1034if (regmatch(next,restate))1035return(1);1036/* Couldn't or didn't -- back up. */1037no--;1038restate->reginput = save + no;1039}1040return(0);1041}1042case END:1043return(1); /* Success! */1044default:1045if (OP(scan) > OPEN && OP(scan) < OPEN+NSUBEXP) {1046goto doOpen;1047} else if (OP(scan) > CLOSE && OP(scan) < CLOSE+NSUBEXP) {1048goto doClose;1049}1050TclRegError("memory corruption");1051return 0;1052}10531054scan = next;1055}10561057/*1058* We get here only if there's trouble -- normally "case END" is1059* the terminating point.1060*/1061TclRegError("corrupted pointers");1062return(0);1063}10641065/*1066- regrepeat - repeatedly match something simple, report how many1067*/1068static int1069regrepeat(p, restate)1070char *p;1071struct regexec_state *restate;1072{1073register int count = 0;1074register char *scan;1075register char *opnd;10761077scan = restate->reginput;1078opnd = OPERAND(p);1079switch (OP(p)) {1080case ANY:1081count = strlen(scan);1082scan += count;1083break;1084case EXACTLY:1085while (*opnd == *scan) {1086count++;1087scan++;1088}1089break;1090case ANYOF:1091while (*scan != '\0' && strchr(opnd, *scan) != NULL) {1092count++;1093scan++;1094}1095break;1096case ANYBUT:1097while (*scan != '\0' && strchr(opnd, *scan) == NULL) {1098count++;1099scan++;1100}1101break;1102default: /* Oh dear. Called inappropriately. */1103TclRegError("internal foulup");1104count = 0; /* Best compromise. */1105break;1106}1107restate->reginput = scan;11081109return(count);1110}11111112/*1113- regnext - dig the "next" pointer out of a node1114*/1115static char *1116regnext(p)1117register char *p;1118{1119register int offset;11201121if (p == ®dummy)1122return(NULL);11231124offset = NEXT(p);1125if (offset == 0)1126return(NULL);11271128if (OP(p) == BACK)1129return(p-offset);1130else1131return(p+offset);1132}11331134#ifdef DEBUG11351136static char *regprop();11371138/*1139- regdump - dump a regexp onto stdout in vaguely comprehensible form1140*/1141void1142regdump(r)1143regexp *r;1144{1145register char *s;1146register char op = EXACTLY; /* Arbitrary non-END op. */1147register char *next;114811491150s = r->program + 1;1151while (op != END) { /* While that wasn't END last time... */1152op = OP(s);1153printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */1154next = regnext(s);1155if (next == NULL) /* Next ptr. */1156printf("(0)");1157else1158printf("(%d)", (s-r->program)+(next-s));1159s += 3;1160if (op == ANYOF || op == ANYBUT || op == EXACTLY) {1161/* Literal string, where present. */1162while (*s != '\0') {1163putchar(*s);1164s++;1165}1166s++;1167}1168putchar('\n');1169}11701171/* Header fields of interest. */1172if (r->regstart != '\0')1173printf("start `%c' ", r->regstart);1174if (r->reganch)1175printf("anchored ");1176if (r->regmust != NULL)1177printf("must have \"%s\"", r->regmust);1178printf("\n");1179}11801181/*1182- regprop - printable representation of opcode1183*/1184static char *1185regprop(op)1186char *op;1187{1188register char *p;1189static char buf[50];11901191(void) strcpy(buf, ":");11921193switch (OP(op)) {1194case BOL:1195p = "BOL";1196break;1197case EOL:1198p = "EOL";1199break;1200case ANY:1201p = "ANY";1202break;1203case ANYOF:1204p = "ANYOF";1205break;1206case ANYBUT:1207p = "ANYBUT";1208break;1209case BRANCH:1210p = "BRANCH";1211break;1212case EXACTLY:1213p = "EXACTLY";1214break;1215case NOTHING:1216p = "NOTHING";1217break;1218case BACK:1219p = "BACK";1220break;1221case END:1222p = "END";1223break;1224case OPEN+1:1225case OPEN+2:1226case OPEN+3:1227case OPEN+4:1228case OPEN+5:1229case OPEN+6:1230case OPEN+7:1231case OPEN+8:1232case OPEN+9:1233sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);1234p = NULL;1235break;1236case CLOSE+1:1237case CLOSE+2:1238case CLOSE+3:1239case CLOSE+4:1240case CLOSE+5:1241case CLOSE+6:1242case CLOSE+7:1243case CLOSE+8:1244case CLOSE+9:1245sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);1246p = NULL;1247break;1248case STAR:1249p = "STAR";1250break;1251case PLUS:1252p = "PLUS";1253break;1254default:1255if (OP(op) > OPEN && OP(op) < OPEN+NSUBEXP) {1256sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);1257p = NULL;1258break;1259} else if (OP(op) > CLOSE && OP(op) < CLOSE+NSUBEXP) {1260sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);1261p = NULL;1262} else {1263TclRegError("corrupted opcode");1264}1265break;1266}1267if (p != NULL)1268(void) strcat(buf, p);1269return(buf);1270}1271#endif12721273/*1274* The following is provided for those people who do not have strcspn() in1275* their C libraries. They should get off their butts and do something1276* about it; at least one public-domain implementation of those (highly1277* useful) string routines has been published on Usenet.1278*/1279#ifdef STRCSPN1280/*1281* strcspn - find length of initial segment of s1 consisting entirely1282* of characters not from s21283*/12841285static int1286strcspn(s1, s2)1287char *s1;1288char *s2;1289{1290register char *scan1;1291register char *scan2;1292register int count;12931294count = 0;1295for (scan1 = s1; *scan1 != '\0'; scan1++) {1296for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */1297if (*scan1 == *scan2++)1298return(count);1299count++;1300}1301return(count);1302}1303#endif13041305/*1306*----------------------------------------------------------------------1307*1308* TclRegError --1309*1310* This procedure is invoked by the regexp code when an error1311* occurs. It saves the error message so it can be seen by the1312* code that called Spencer's code.1313*1314* Results:1315* None.1316*1317* Side effects:1318* The value of "string" is saved in "errMsg".1319*1320*----------------------------------------------------------------------1321*/13221323void1324TclRegError(string)1325char *string; /* Error message. */1326{1327errMsg = string;1328}13291330char *1331TclGetRegError()1332{1333return errMsg;1334}133513361337