/*-1* SPDX-License-Identifier: BSD-3-Clause2*3* Copyright (c) 1992, 1993, 1994 Henry Spencer.4* Copyright (c) 1992, 1993, 19945* The Regents of the University of California. All rights reserved.6*7* This code is derived from software contributed to Berkeley by8* Henry Spencer.9*10* Redistribution and use in source and binary forms, with or without11* modification, are permitted provided that the following conditions12* are met:13* 1. Redistributions of source code must retain the above copyright14* notice, this list of conditions and the following disclaimer.15* 2. Redistributions in binary form must reproduce the above copyright16* notice, this list of conditions and the following disclaimer in the17* documentation and/or other materials provided with the distribution.18* 3. Neither the name of the University nor the names of its contributors19* may be used to endorse or promote products derived from this software20* without specific prior written permission.21*22* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND23* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE24* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE25* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE26* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL27* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS28* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)29* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT30* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY31* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF32* SUCH DAMAGE.33*/3435/*36* First, the stuff that ends up in the outside-world include file37= typedef off_t regoff_t;38= typedef struct {39= int re_magic;40= size_t re_nsub; // number of parenthesized subexpressions41= const char *re_endp; // end pointer for REG_PEND42= struct re_guts *re_g; // none of your business :-)43= } regex_t;44= typedef struct {45= regoff_t rm_so; // start of match46= regoff_t rm_eo; // end of match47= } regmatch_t;48*/49/*50* internals of regex_t51*/52#define MAGIC1 ((('r'^0200)<<8) | 'e')5354/*55* The internal representation is a *strip*, a sequence of56* operators ending with an endmarker. (Some terminology etc. is a57* historical relic of earlier versions which used multiple strips.)58* Certain oddities in the representation are there to permit running59* the machinery backwards; in particular, any deviation from sequential60* flow must be marked at both its source and its destination. Some61* fine points:62*63* - OPLUS_ and O_PLUS are *inside* the loop they create.64* - OQUEST_ and O_QUEST are *outside* the bypass they create.65* - OCH_ and O_CH are *outside* the multi-way branch they create, while66* OOR1 and OOR2 are respectively the end and the beginning of one of67* the branches. Note that there is an implicit OOR2 following OCH_68* and an implicit OOR1 preceding O_CH.69*70* In state representations, an operator's bit is on to signify a state71* immediately *preceding* "execution" of that operator.72*/73typedef unsigned long sop; /* strip operator */74typedef unsigned long sopno;75#define OPRMASK 0xf8000000L76#define OPDMASK 0x07ffffffL77#define OPSHIFT ((unsigned)27)78#define OP(n) ((n)&OPRMASK)79#define OPND(n) ((n)&OPDMASK)80#define SOP(op, opnd) ((op)|(opnd))81/* operators meaning operand */82/* (back, fwd are offsets) */83#define OEND (1L<<OPSHIFT) /* endmarker - */84#define OCHAR (2L<<OPSHIFT) /* character wide character */85#define OBOL (3L<<OPSHIFT) /* left anchor - */86#define OEOL (4L<<OPSHIFT) /* right anchor - */87#define OANY (5L<<OPSHIFT) /* . - */88#define OANYOF (6L<<OPSHIFT) /* [...] set number */89#define OBACK_ (7L<<OPSHIFT) /* begin \d paren number */90#define O_BACK (8L<<OPSHIFT) /* end \d paren number */91#define OPLUS_ (9L<<OPSHIFT) /* + prefix fwd to suffix */92#define O_PLUS (10L<<OPSHIFT) /* + suffix back to prefix */93#define OQUEST_ (11L<<OPSHIFT) /* ? prefix fwd to suffix */94#define O_QUEST (12L<<OPSHIFT) /* ? suffix back to prefix */95#define OLPAREN (13L<<OPSHIFT) /* ( fwd to ) */96#define ORPAREN (14L<<OPSHIFT) /* ) back to ( */97#define OCH_ (15L<<OPSHIFT) /* begin choice fwd to OOR2 */98#define OOR1 (16L<<OPSHIFT) /* | pt. 1 back to OOR1 or OCH_ */99#define OOR2 (17L<<OPSHIFT) /* | pt. 2 fwd to OOR2 or O_CH */100#define O_CH (18L<<OPSHIFT) /* end choice back to OOR1 */101#define OBOW (19L<<OPSHIFT) /* begin word - */102#define OEOW (20L<<OPSHIFT) /* end word - */103#define OBOS (21L<<OPSHIFT) /* begin subj. - */104#define OEOS (22L<<OPSHIFT) /* end subj. - */105#define OWBND (23L<<OPSHIFT) /* word bound - */106#define ONWBND (24L<<OPSHIFT) /* not bound - */107108/*109* Structures for [] character-set representation.110*/111typedef struct {112wint_t min;113wint_t max;114} crange;115typedef struct {116unsigned char bmp[NC_MAX / 8];117wctype_t *types;118unsigned int ntypes;119wint_t *wides;120unsigned int nwides;121crange *ranges;122unsigned int nranges;123int invert;124int icase;125} cset;126127static int128CHIN1(cset *cs, wint_t ch)129{130unsigned int i;131132assert(ch >= 0);133if (ch < NC)134return (((cs->bmp[ch >> 3] & (1 << (ch & 7))) != 0) ^135cs->invert);136for (i = 0; i < cs->nwides; i++) {137if (cs->icase) {138if (ch == towlower(cs->wides[i]) ||139ch == towupper(cs->wides[i]))140return (!cs->invert);141} else if (ch == cs->wides[i])142return (!cs->invert);143}144for (i = 0; i < cs->nranges; i++)145if (cs->ranges[i].min <= ch && ch <= cs->ranges[i].max)146return (!cs->invert);147for (i = 0; i < cs->ntypes; i++)148if (iswctype(ch, cs->types[i]))149return (!cs->invert);150return (cs->invert);151}152153static __inline int154CHIN(cset *cs, wint_t ch)155{156157assert(ch >= 0);158if (ch < NC)159return (((cs->bmp[ch >> 3] & (1 << (ch & 7))) != 0) ^160cs->invert);161else if (cs->icase)162return (CHIN1(cs, ch) || CHIN1(cs, towlower(ch)) ||163CHIN1(cs, towupper(ch)));164else165return (CHIN1(cs, ch));166}167168/*169* main compiled-expression structure170*/171struct re_guts {172int magic;173# define MAGIC2 ((('R'^0200)<<8)|'E')174sop *strip; /* malloced area for strip */175unsigned int ncsets; /* number of csets in use */176cset *sets; /* -> cset [ncsets] */177int cflags; /* copy of regcomp() cflags argument */178sopno nstates; /* = number of sops */179sopno firststate; /* the initial OEND (normally 0) */180sopno laststate; /* the final OEND */181int iflags; /* internal flags */182# define USEBOL 01 /* used ^ */183# define USEEOL 02 /* used $ */184# define BAD 04 /* something wrong */185int nbol; /* number of ^ used */186int neol; /* number of $ used */187char *must; /* match must contain this string */188int moffset; /* latest point at which must may be located */189int *charjump; /* Boyer-Moore char jump table */190int *matchjump; /* Boyer-Moore match jump table */191int mlen; /* length of must */192size_t nsub; /* copy of re_nsub */193int backrefs; /* does it use back references? */194sopno nplus; /* how deep does it nest +s? */195};196197/* misc utilities */198#define OUT (CHAR_MIN - 1) /* a non-character value */199#define IGN (CHAR_MIN - 2)200#define ISWORD(c) (iswalnum((uch)(c)) || (c) == '_')201202203