/* Path: blob/devel/elmergrid/src/metis-5.1.0/GKlib/gkregex.c (3206 views) */
/* Extended regular expression matching and search library.1Copyright (C) 2002, 2003, 2005 Free Software Foundation, Inc.2This file is part of the GNU C Library.3Contributed by Isamu Hasegawa <[email protected]>.45The GNU C Library is free software; you can redistribute it and/or6modify it under the terms of the GNU Lesser General Public7License as published by the Free Software Foundation; either8version 2.1 of the License, or (at your option) any later version.910The GNU C Library is distributed in the hope that it will be useful,11but WITHOUT ANY WARRANTY; without even the implied warranty of12MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU13Lesser General Public License for more details.1415You should have received a copy of the GNU Lesser General Public16License along with the GNU C Library; if not, write to the Free17Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA1802111-1307 USA. */1920/* this is for removing a compiler warning */21void gkfooo() { return; }2223#ifdef USE_GKREGEX2425#ifdef HAVE_CONFIG_H26#include "config.h"27#endif2829#ifdef _LIBC30/* We have to keep the namespace clean. 
*/31# define regfree(preg) __regfree (preg)32# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)33# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)34# define regerror(errcode, preg, errbuf, errbuf_size) \35__regerror(errcode, preg, errbuf, errbuf_size)36# define re_set_registers(bu, re, nu, st, en) \37__re_set_registers (bu, re, nu, st, en)38# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \39__re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)40# define re_match(bufp, string, size, pos, regs) \41__re_match (bufp, string, size, pos, regs)42# define re_search(bufp, string, size, startpos, range, regs) \43__re_search (bufp, string, size, startpos, range, regs)44# define re_compile_pattern(pattern, length, bufp) \45__re_compile_pattern (pattern, length, bufp)46# define re_set_syntax(syntax) __re_set_syntax (syntax)47# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \48__re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)49# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)5051# include "../locale/localeinfo.h"52#endif5354#include "GKlib.h"555657/******************************************************************************/58/******************************************************************************/59/******************************************************************************/60/* GKINCLUDE #include "regex_internal.h" */61/******************************************************************************/62/******************************************************************************/63/******************************************************************************/64/* Extended regular expression matching and search library.65Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.66This file is part of the GNU C Library.67Contributed by Isamu Hasegawa <[email protected]>.6869The GNU C Library is free software; you can 
redistribute it and/or70modify it under the terms of the GNU Lesser General Public71License as published by the Free Software Foundation; either72version 2.1 of the License, or (at your option) any later version.7374The GNU C Library is distributed in the hope that it will be useful,75but WITHOUT ANY WARRANTY; without even the implied warranty of76MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU77Lesser General Public License for more details.7879You should have received a copy of the GNU Lesser General Public80License along with the GNU C Library; if not, write to the Free81Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA8202111-1307 USA. */8384#ifndef _REGEX_INTERNAL_H85#define _REGEX_INTERNAL_H 18687#include <assert.h>88#include <ctype.h>89#include <stdio.h>90#include <stdlib.h>91#include <string.h>9293#if defined(__MINGW32_VERSION) || defined(_MSC_VER)94#define strcasecmp stricmp95#endif9697#if defined HAVE_LANGINFO_H || defined HAVE_LANGINFO_CODESET || defined _LIBC98# include <langinfo.h>99#endif100#if defined HAVE_LOCALE_H || defined _LIBC101# include <locale.h>102#endif103#if defined HAVE_WCHAR_H || defined _LIBC104# include <wchar.h>105#endif /* HAVE_WCHAR_H || _LIBC */106#if defined HAVE_WCTYPE_H || defined _LIBC107# include <wctype.h>108#endif /* HAVE_WCTYPE_H || _LIBC */109#if defined HAVE_STDBOOL_H || defined _LIBC110# include <stdbool.h>111#else112typedef enum { false, true } bool;113#endif /* HAVE_STDBOOL_H || _LIBC */114#if defined HAVE_STDINT_H || defined _LIBC115# include <stdint.h>116#endif /* HAVE_STDINT_H || _LIBC */117#if defined _LIBC118# include <bits/libc-lock.h>119#else120# define __libc_lock_define(CLASS,NAME)121# define __libc_lock_init(NAME) do { } while (0)122# define __libc_lock_lock(NAME) do { } while (0)123# define __libc_lock_unlock(NAME) do { } while (0)124#endif125126/* In case that the system doesn't have isblank(). 
*/127#if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank128# define isblank(ch) ((ch) == ' ' || (ch) == '\t')129#endif130131#ifdef _LIBC132# ifndef _RE_DEFINE_LOCALE_FUNCTIONS133# define _RE_DEFINE_LOCALE_FUNCTIONS 1134# include <locale/localeinfo.h>135# include <locale/elem-hash.h>136# include <locale/coll-lookup.h>137# endif138#endif139140/* This is for other GNU distributions with internationalized messages. */141#if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC142# include <libintl.h>143# ifdef _LIBC144# undef gettext145# define gettext(msgid) \146INTUSE(__dcgettext) (_libc_intl_domainname, msgid, LC_MESSAGES)147# endif148#else149# define gettext(msgid) (msgid)150#endif151152#ifndef gettext_noop153/* This define is so xgettext can find the internationalizable154strings. */155# define gettext_noop(String) String156#endif157158/* For loser systems without the definition. */159#ifndef SIZE_MAX160# define SIZE_MAX ((size_t) -1)161#endif162163#if (defined MB_CUR_MAX && HAVE_LOCALE_H && HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_WCRTOMB && HAVE_MBRTOWC && HAVE_WCSCOLL) || _LIBC164# define RE_ENABLE_I18N165#endif166167#if __GNUC__ >= 3168# define BE(expr, val) __builtin_expect (expr, val)169#else170# define BE(expr, val) (expr)171# define inline172#endif173174/* Number of single byte character. */175#define SBC_MAX 256176177#define COLL_ELEM_LEN_MAX 8178179/* The character which represents newline. */180#define NEWLINE_CHAR '\n'181#define WIDE_NEWLINE_CHAR L'\n'182183/* Rename to standard API for using out of glibc. 
*/184#ifndef _LIBC185# define __wctype wctype186# define __iswctype iswctype187# define __btowc btowc188# define __mempcpy mempcpy189# define __wcrtomb wcrtomb190# define __regfree regfree191# define attribute_hidden192#endif /* not _LIBC */193194#ifdef __GNUC__195# define __attribute(arg) __attribute__ (arg)196#else197# define __attribute(arg)198#endif199200extern const char __re_error_msgid[] attribute_hidden;201extern const size_t __re_error_msgid_idx[] attribute_hidden;202203/* An integer used to represent a set of bits. It must be unsigned,204and must be at least as wide as unsigned int. */205typedef unsigned long int bitset_word_t;206/* All bits set in a bitset_word_t. */207#define BITSET_WORD_MAX ULONG_MAX208/* Number of bits in a bitset_word_t. */209#define BITSET_WORD_BITS (sizeof (bitset_word_t) * CHAR_BIT)210/* Number of bitset_word_t in a bit_set. */211#define BITSET_WORDS (SBC_MAX / BITSET_WORD_BITS)212typedef bitset_word_t bitset_t[BITSET_WORDS];213typedef bitset_word_t *re_bitset_ptr_t;214typedef const bitset_word_t *re_const_bitset_ptr_t;215216#define bitset_set(set,i) \217(set[i / BITSET_WORD_BITS] |= (bitset_word_t) 1 << i % BITSET_WORD_BITS)218#define bitset_clear(set,i) \219(set[i / BITSET_WORD_BITS] &= ~((bitset_word_t) 1 << i % BITSET_WORD_BITS))220#define bitset_contain(set,i) \221(set[i / BITSET_WORD_BITS] & ((bitset_word_t) 1 << i % BITSET_WORD_BITS))222#define bitset_empty(set) memset (set, '\0', sizeof (bitset_t))223#define bitset_set_all(set) memset (set, '\xff', sizeof (bitset_t))224#define bitset_copy(dest,src) memcpy (dest, src, sizeof (bitset_t))225226#define PREV_WORD_CONSTRAINT 0x0001227#define PREV_NOTWORD_CONSTRAINT 0x0002228#define NEXT_WORD_CONSTRAINT 0x0004229#define NEXT_NOTWORD_CONSTRAINT 0x0008230#define PREV_NEWLINE_CONSTRAINT 0x0010231#define NEXT_NEWLINE_CONSTRAINT 0x0020232#define PREV_BEGBUF_CONSTRAINT 0x0040233#define NEXT_ENDBUF_CONSTRAINT 0x0080234#define WORD_DELIM_CONSTRAINT 0x0100235#define 
NOT_WORD_DELIM_CONSTRAINT 0x0200236237typedef enum238{239INSIDE_WORD = PREV_WORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,240WORD_FIRST = PREV_NOTWORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,241WORD_LAST = PREV_WORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT,242INSIDE_NOTWORD = PREV_NOTWORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT,243LINE_FIRST = PREV_NEWLINE_CONSTRAINT,244LINE_LAST = NEXT_NEWLINE_CONSTRAINT,245BUF_FIRST = PREV_BEGBUF_CONSTRAINT,246BUF_LAST = NEXT_ENDBUF_CONSTRAINT,247WORD_DELIM = WORD_DELIM_CONSTRAINT,248NOT_WORD_DELIM = NOT_WORD_DELIM_CONSTRAINT249} re_context_type;250251typedef struct252{253int alloc;254int nelem;255int *elems;256} re_node_set;257258typedef enum259{260NON_TYPE = 0,261262/* Node type, These are used by token, node, tree. */263CHARACTER = 1,264END_OF_RE = 2,265SIMPLE_BRACKET = 3,266OP_BACK_REF = 4,267OP_PERIOD = 5,268#ifdef RE_ENABLE_I18N269COMPLEX_BRACKET = 6,270OP_UTF8_PERIOD = 7,271#endif /* RE_ENABLE_I18N */272273/* We define EPSILON_BIT as a macro so that OP_OPEN_SUBEXP is used274when the debugger shows values of this enum type. */275#define EPSILON_BIT 8276OP_OPEN_SUBEXP = EPSILON_BIT | 0,277OP_CLOSE_SUBEXP = EPSILON_BIT | 1,278OP_ALT = EPSILON_BIT | 2,279OP_DUP_ASTERISK = EPSILON_BIT | 3,280ANCHOR = EPSILON_BIT | 4,281282/* Tree type, these are used only by tree. */283CONCAT = 16,284SUBEXP = 17,285286/* Token type, these are used only by token. */287OP_DUP_PLUS = 18,288OP_DUP_QUESTION,289OP_OPEN_BRACKET,290OP_CLOSE_BRACKET,291OP_CHARSET_RANGE,292OP_OPEN_DUP_NUM,293OP_CLOSE_DUP_NUM,294OP_NON_MATCH_LIST,295OP_OPEN_COLL_ELEM,296OP_CLOSE_COLL_ELEM,297OP_OPEN_EQUIV_CLASS,298OP_CLOSE_EQUIV_CLASS,299OP_OPEN_CHAR_CLASS,300OP_CLOSE_CHAR_CLASS,301OP_WORD,302OP_NOTWORD,303OP_SPACE,304OP_NOTSPACE,305BACK_SLASH306307} re_token_type_t;308309#ifdef RE_ENABLE_I18N310typedef struct311{312/* Multibyte characters. */313wchar_t *mbchars;314315/* Collating symbols. */316# ifdef _LIBC317int32_t *coll_syms;318# endif319320/* Equivalence classes. 
*/321# ifdef _LIBC322int32_t *equiv_classes;323# endif324325/* Range expressions. */326# ifdef _LIBC327uint32_t *range_starts;328uint32_t *range_ends;329# else /* not _LIBC */330wchar_t *range_starts;331wchar_t *range_ends;332# endif /* not _LIBC */333334/* Character classes. */335wctype_t *char_classes;336337/* If this character set is the non-matching list. */338unsigned int non_match : 1;339340/* # of multibyte characters. */341int nmbchars;342343/* # of collating symbols. */344int ncoll_syms;345346/* # of equivalence classes. */347int nequiv_classes;348349/* # of range expressions. */350int nranges;351352/* # of character classes. */353int nchar_classes;354} re_charset_t;355#endif /* RE_ENABLE_I18N */356357typedef struct358{359union360{361unsigned char c; /* for CHARACTER */362re_bitset_ptr_t sbcset; /* for SIMPLE_BRACKET */363#ifdef RE_ENABLE_I18N364re_charset_t *mbcset; /* for COMPLEX_BRACKET */365#endif /* RE_ENABLE_I18N */366int idx; /* for BACK_REF */367re_context_type ctx_type; /* for ANCHOR */368} opr;369#if __GNUC__ >= 2370re_token_type_t type : 8;371#else372re_token_type_t type;373#endif374unsigned int constraint : 10; /* context constraint */375unsigned int duplicated : 1;376unsigned int opt_subexp : 1;377#ifdef RE_ENABLE_I18N378unsigned int accept_mb : 1;379/* These 2 bits can be moved into the union if needed (e.g. if running out380of bits; move opr.c to opr.c.c and move the flags to opr.c.flags). */381unsigned int mb_partial : 1;382#endif383unsigned int word_char : 1;384} re_token_t;385386#define IS_EPSILON_NODE(type) ((type) & EPSILON_BIT)387388struct re_string_t389{390/* Indicate the raw buffer which is the original string passed as an391argument of regexec(), re_search(), etc.. */392const unsigned char *raw_mbs;393/* Store the multibyte string. In case of "case insensitive mode" like394REG_ICASE, upper cases of the string are stored, otherwise MBS points395the same address that RAW_MBS points. 
*/396unsigned char *mbs;397#ifdef RE_ENABLE_I18N398/* Store the wide character string which is corresponding to MBS. */399wint_t *wcs;400int *offsets;401mbstate_t cur_state;402#endif403/* Index in RAW_MBS. Each character mbs[i] corresponds to404raw_mbs[raw_mbs_idx + i]. */405int raw_mbs_idx;406/* The length of the valid characters in the buffers. */407int valid_len;408/* The corresponding number of bytes in raw_mbs array. */409int valid_raw_len;410/* The length of the buffers MBS and WCS. */411int bufs_len;412/* The index in MBS, which is updated by re_string_fetch_byte. */413int cur_idx;414/* length of RAW_MBS array. */415int raw_len;416/* This is RAW_LEN - RAW_MBS_IDX + VALID_LEN - VALID_RAW_LEN. */417int len;418/* End of the buffer may be shorter than its length in the cases such419as re_match_2, re_search_2. Then, we use STOP for end of the buffer420instead of LEN. */421int raw_stop;422/* This is RAW_STOP - RAW_MBS_IDX adjusted through OFFSETS. */423int stop;424425/* The context of mbs[0]. We store the context independently, since426the context of mbs[0] may be different from raw_mbs[0], which is427the beginning of the input string. */428unsigned int tip_context;429/* The translation passed as a part of an argument of re_compile_pattern. */430RE_TRANSLATE_TYPE trans;431/* Copy of re_dfa_t's word_char. */432re_const_bitset_ptr_t word_char;433/* 1 if REG_ICASE. 
*/434unsigned char icase;435unsigned char is_utf8;436unsigned char map_notascii;437unsigned char mbs_allocated;438unsigned char offsets_needed;439unsigned char newline_anchor;440unsigned char word_ops_used;441int mb_cur_max;442};443typedef struct re_string_t re_string_t;444445446struct re_dfa_t;447typedef struct re_dfa_t re_dfa_t;448449#ifndef _LIBC450# ifdef __i386__451# define internal_function __attribute ((regparm (3), stdcall))452# else453# define internal_function454# endif455#endif456457static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,458int new_buf_len)459internal_function;460#ifdef RE_ENABLE_I18N461static void build_wcs_buffer (re_string_t *pstr) internal_function;462static int build_wcs_upper_buffer (re_string_t *pstr) internal_function;463#endif /* RE_ENABLE_I18N */464static void build_upper_buffer (re_string_t *pstr) internal_function;465static void re_string_translate_buffer (re_string_t *pstr) internal_function;466static unsigned int re_string_context_at (const re_string_t *input, int idx,467int eflags)468internal_function __attribute ((pure));469#define re_string_peek_byte(pstr, offset) \470((pstr)->mbs[(pstr)->cur_idx + offset])471#define re_string_fetch_byte(pstr) \472((pstr)->mbs[(pstr)->cur_idx++])473#define re_string_first_byte(pstr, idx) \474((idx) == (pstr)->valid_len || (pstr)->wcs[idx] != WEOF)475#define re_string_is_single_byte_char(pstr, idx) \476((pstr)->wcs[idx] != WEOF && ((pstr)->valid_len == (idx) + 1 \477|| (pstr)->wcs[(idx) + 1] != WEOF))478#define re_string_eoi(pstr) ((pstr)->stop <= (pstr)->cur_idx)479#define re_string_cur_idx(pstr) ((pstr)->cur_idx)480#define re_string_get_buffer(pstr) ((pstr)->mbs)481#define re_string_length(pstr) ((pstr)->len)482#define re_string_byte_at(pstr,idx) ((pstr)->mbs[idx])483#define re_string_skip_bytes(pstr,idx) ((pstr)->cur_idx += (idx))484#define re_string_set_index(pstr,idx) ((pstr)->cur_idx = (idx))485486#ifdef __GNUC__487# define alloca(size) __builtin_alloca (size)488# define 
HAVE_ALLOCA 1489#elif defined(_MSC_VER)490# include <malloc.h>491# define alloca _alloca492# define HAVE_ALLOCA 1493#else494# error No alloca()495#endif496497#ifndef _LIBC498# if HAVE_ALLOCA499/* The OS usually guarantees only one guard page at the bottom of the stack,500and a page size can be as small as 4096 bytes. So we cannot safely501allocate anything larger than 4096 bytes. Also care for the possibility502of a few compiler-allocated temporary stack slots. */503# define __libc_use_alloca(n) ((n) < 4032)504# else505/* alloca is implemented with malloc, so just use malloc. */506# define __libc_use_alloca(n) 0507# endif508#endif509510#define re_malloc(t,n) ((t *) malloc ((n) * sizeof (t)))511#define re_realloc(p,t,n) ((t *) realloc (p, (n) * sizeof (t)))512#define re_free(p) free (p)513514struct bin_tree_t515{516struct bin_tree_t *parent;517struct bin_tree_t *left;518struct bin_tree_t *right;519struct bin_tree_t *first;520struct bin_tree_t *next;521522re_token_t token;523524/* `node_idx' is the index in dfa->nodes, if `type' == 0.525Otherwise `type' indicate the type of this node. 
*/526int node_idx;527};528typedef struct bin_tree_t bin_tree_t;529530#define BIN_TREE_STORAGE_SIZE \531((1024 - sizeof (void *)) / sizeof (bin_tree_t))532533struct bin_tree_storage_t534{535struct bin_tree_storage_t *next;536bin_tree_t data[BIN_TREE_STORAGE_SIZE];537};538typedef struct bin_tree_storage_t bin_tree_storage_t;539540#define CONTEXT_WORD 1541#define CONTEXT_NEWLINE (CONTEXT_WORD << 1)542#define CONTEXT_BEGBUF (CONTEXT_NEWLINE << 1)543#define CONTEXT_ENDBUF (CONTEXT_BEGBUF << 1)544545#define IS_WORD_CONTEXT(c) ((c) & CONTEXT_WORD)546#define IS_NEWLINE_CONTEXT(c) ((c) & CONTEXT_NEWLINE)547#define IS_BEGBUF_CONTEXT(c) ((c) & CONTEXT_BEGBUF)548#define IS_ENDBUF_CONTEXT(c) ((c) & CONTEXT_ENDBUF)549#define IS_ORDINARY_CONTEXT(c) ((c) == 0)550551#define IS_WORD_CHAR(ch) (isalnum (ch) || (ch) == '_')552#define IS_NEWLINE(ch) ((ch) == NEWLINE_CHAR)553#define IS_WIDE_WORD_CHAR(ch) (iswalnum (ch) || (ch) == L'_')554#define IS_WIDE_NEWLINE(ch) ((ch) == WIDE_NEWLINE_CHAR)555556#define NOT_SATISFY_PREV_CONSTRAINT(constraint,context) \557((((constraint) & PREV_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \558|| ((constraint & PREV_NOTWORD_CONSTRAINT) && IS_WORD_CONTEXT (context)) \559|| ((constraint & PREV_NEWLINE_CONSTRAINT) && !IS_NEWLINE_CONTEXT (context))\560|| ((constraint & PREV_BEGBUF_CONSTRAINT) && !IS_BEGBUF_CONTEXT (context)))561562#define NOT_SATISFY_NEXT_CONSTRAINT(constraint,context) \563((((constraint) & NEXT_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \564|| (((constraint) & NEXT_NOTWORD_CONSTRAINT) && IS_WORD_CONTEXT (context)) \565|| (((constraint) & NEXT_NEWLINE_CONSTRAINT) && !IS_NEWLINE_CONTEXT (context)) \566|| (((constraint) & NEXT_ENDBUF_CONSTRAINT) && !IS_ENDBUF_CONTEXT (context)))567568struct re_dfastate_t569{570unsigned int hash;571re_node_set nodes;572re_node_set non_eps_nodes;573re_node_set inveclosure;574re_node_set *entrance_nodes;575struct re_dfastate_t **trtable, **word_trtable;576unsigned int context : 4;577unsigned int halt : 
1;578/* If this state can accept `multi byte'.579Note that we refer to multibyte characters, and multi character580collating elements as `multi byte'. */581unsigned int accept_mb : 1;582/* If this state has backreference node(s). */583unsigned int has_backref : 1;584unsigned int has_constraint : 1;585};586typedef struct re_dfastate_t re_dfastate_t;587588struct re_state_table_entry589{590int num;591int alloc;592re_dfastate_t **array;593};594595/* Array type used in re_sub_match_last_t and re_sub_match_top_t. */596597typedef struct598{599int next_idx;600int alloc;601re_dfastate_t **array;602} state_array_t;603604/* Store information about the node NODE whose type is OP_CLOSE_SUBEXP. */605606typedef struct607{608int node;609int str_idx; /* The position NODE match at. */610state_array_t path;611} re_sub_match_last_t;612613/* Store information about the node NODE whose type is OP_OPEN_SUBEXP.614And information about the node, whose type is OP_CLOSE_SUBEXP,615corresponding to NODE is stored in LASTS. */616617typedef struct618{619int str_idx;620int node;621state_array_t *path;622int alasts; /* Allocation size of LASTS. */623int nlasts; /* The number of LASTS. */624re_sub_match_last_t **lasts;625} re_sub_match_top_t;626627struct re_backref_cache_entry628{629int node;630int str_idx;631int subexp_from;632int subexp_to;633char more;634char unused;635unsigned short int eps_reachable_subexps_map;636};637638typedef struct639{640/* The string object corresponding to the input string. */641re_string_t input;642#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)643const re_dfa_t *const dfa;644#else645const re_dfa_t *dfa;646#endif647/* EFLAGS of the argument of regexec. */648int eflags;649/* Where the matching ends. */650int match_last;651int last_node;652/* The state log used by the matcher. */653re_dfastate_t **state_log;654int state_log_top;655/* Back reference cache. 
*/656int nbkref_ents;657int abkref_ents;658struct re_backref_cache_entry *bkref_ents;659int max_mb_elem_len;660int nsub_tops;661int asub_tops;662re_sub_match_top_t **sub_tops;663} re_match_context_t;664665typedef struct666{667re_dfastate_t **sifted_states;668re_dfastate_t **limited_states;669int last_node;670int last_str_idx;671re_node_set limits;672} re_sift_context_t;673674struct re_fail_stack_ent_t675{676int idx;677int node;678regmatch_t *regs;679re_node_set eps_via_nodes;680};681682struct re_fail_stack_t683{684int num;685int alloc;686struct re_fail_stack_ent_t *stack;687};688689struct re_dfa_t690{691re_token_t *nodes;692size_t nodes_alloc;693size_t nodes_len;694int *nexts;695int *org_indices;696re_node_set *edests;697re_node_set *eclosures;698re_node_set *inveclosures;699struct re_state_table_entry *state_table;700re_dfastate_t *init_state;701re_dfastate_t *init_state_word;702re_dfastate_t *init_state_nl;703re_dfastate_t *init_state_begbuf;704bin_tree_t *str_tree;705bin_tree_storage_t *str_tree_storage;706re_bitset_ptr_t sb_char;707int str_tree_storage_idx;708709/* number of subexpressions `re_nsub' is in regex_t. */710unsigned int state_hash_mask;711int init_node;712int nbackref; /* The number of backreference in this dfa. */713714/* Bitmap expressing which backreference is used. */715bitset_word_t used_bkref_map;716bitset_word_t completed_bkref_map;717718unsigned int has_plural_match : 1;719/* If this dfa has "multibyte node", which is a backreference or720a node which can accept multibyte character or multi character721collating element. 
*/722unsigned int has_mb_node : 1;723unsigned int is_utf8 : 1;724unsigned int map_notascii : 1;725unsigned int word_ops_used : 1;726int mb_cur_max;727bitset_t word_char;728reg_syntax_t syntax;729int *subexp_map;730#ifdef DEBUG731char* re_str;732#endif733__libc_lock_define (, lock)734};735736#define re_node_set_init_empty(set) memset (set, '\0', sizeof (re_node_set))737#define re_node_set_remove(set,id) \738(re_node_set_remove_at (set, re_node_set_contains (set, id) - 1))739#define re_node_set_empty(p) ((p)->nelem = 0)740#define re_node_set_free(set) re_free ((set)->elems)741742743typedef enum744{745SB_CHAR,746MB_CHAR,747EQUIV_CLASS,748COLL_SYM,749CHAR_CLASS750} bracket_elem_type;751752typedef struct753{754bracket_elem_type type;755union756{757unsigned char ch;758unsigned char *name;759wchar_t wch;760} opr;761} bracket_elem_t;762763764/* Inline functions for bitset operation. */765static inline void766bitset_not (bitset_t set)767{768int bitset_i;769for (bitset_i = 0; bitset_i < BITSET_WORDS; ++bitset_i)770set[bitset_i] = ~set[bitset_i];771}772773static inline void774bitset_merge (bitset_t dest, const bitset_t src)775{776int bitset_i;777for (bitset_i = 0; bitset_i < BITSET_WORDS; ++bitset_i)778dest[bitset_i] |= src[bitset_i];779}780781static inline void782bitset_mask (bitset_t dest, const bitset_t src)783{784int bitset_i;785for (bitset_i = 0; bitset_i < BITSET_WORDS; ++bitset_i)786dest[bitset_i] &= src[bitset_i];787}788789#ifdef RE_ENABLE_I18N790/* Inline functions for re_string. 
*/791static inline int792internal_function __attribute ((pure))793re_string_char_size_at (const re_string_t *pstr, int idx)794{795int byte_idx;796if (pstr->mb_cur_max == 1)797return 1;798for (byte_idx = 1; idx + byte_idx < pstr->valid_len; ++byte_idx)799if (pstr->wcs[idx + byte_idx] != WEOF)800break;801return byte_idx;802}803804static inline wint_t805internal_function __attribute ((pure))806re_string_wchar_at (const re_string_t *pstr, int idx)807{808if (pstr->mb_cur_max == 1)809return (wint_t) pstr->mbs[idx];810return (wint_t) pstr->wcs[idx];811}812813static int814internal_function __attribute ((pure))815re_string_elem_size_at (const re_string_t *pstr, int idx)816{817# ifdef _LIBC818const unsigned char *p, *extra;819const int32_t *table, *indirect;820int32_t tmp;821# include <locale/weight.h>822uint_fast32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);823824if (nrules != 0)825{826table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);827extra = (const unsigned char *)828_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);829indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,830_NL_COLLATE_INDIRECTMB);831p = pstr->mbs + idx;832tmp = findidx (&p);833return p - pstr->mbs - idx;834}835else836# endif /* _LIBC */837return 1;838}839#endif /* RE_ENABLE_I18N */840841#endif /* _REGEX_INTERNAL_H */842843/******************************************************************************/844/******************************************************************************/845/******************************************************************************/846/* GKINCLUDE #include "regex_internal.c" */847/******************************************************************************/848/******************************************************************************/849/******************************************************************************/850/* Extended regular expression matching and search library.851Copyright (C) 2002, 2003, 2004, 2005, 2006 Free 
Software Foundation, Inc.852This file is part of the GNU C Library.853Contributed by Isamu Hasegawa <[email protected]>.854855The GNU C Library is free software; you can redistribute it and/or856modify it under the terms of the GNU Lesser General Public857License as published by the Free Software Foundation; either858version 2.1 of the License, or (at your option) any later version.859860The GNU C Library is distributed in the hope that it will be useful,861but WITHOUT ANY WARRANTY; without even the implied warranty of862MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU863Lesser General Public License for more details.864865You should have received a copy of the GNU Lesser General Public866License along with the GNU C Library; if not, write to the Free867Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA86802111-1307 USA. */869870static void re_string_construct_common (const char *str, int len,871re_string_t *pstr,872RE_TRANSLATE_TYPE trans, int icase,873const re_dfa_t *dfa) internal_function;874static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,875const re_node_set *nodes,876unsigned int hash) internal_function;877static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,878const re_node_set *nodes,879unsigned int context,880unsigned int hash) internal_function;881882/* Functions for string operation. */883884/* This function allocate the buffers. It is necessary to call885re_string_reconstruct before using the object. */886887static reg_errcode_t888internal_function889re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,890RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)891{892reg_errcode_t ret;893int init_buf_len;894895/* Ensure at least one character fits into the buffers. */896if (init_len < dfa->mb_cur_max)897init_len = dfa->mb_cur_max;898init_buf_len = (len + 1 < init_len) ? 
len + 1: init_len;899re_string_construct_common (str, len, pstr, trans, icase, dfa);900901ret = re_string_realloc_buffers (pstr, init_buf_len);902if (BE (ret != REG_NOERROR, 0))903return ret;904905pstr->word_char = dfa->word_char;906pstr->word_ops_used = dfa->word_ops_used;907pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;908pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;909pstr->valid_raw_len = pstr->valid_len;910return REG_NOERROR;911}912913/* This function allocate the buffers, and initialize them. */914915static reg_errcode_t916internal_function917re_string_construct (re_string_t *pstr, const char *str, int len,918RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)919{920reg_errcode_t ret;921memset (pstr, '\0', sizeof (re_string_t));922re_string_construct_common (str, len, pstr, trans, icase, dfa);923924if (len > 0)925{926ret = re_string_realloc_buffers (pstr, len + 1);927if (BE (ret != REG_NOERROR, 0))928return ret;929}930pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;931932if (icase)933{934#ifdef RE_ENABLE_I18N935if (dfa->mb_cur_max > 1)936{937while (1)938{939ret = build_wcs_upper_buffer (pstr);940if (BE (ret != REG_NOERROR, 0))941return ret;942if (pstr->valid_raw_len >= len)943break;944if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)945break;946ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);947if (BE (ret != REG_NOERROR, 0))948return ret;949}950}951else952#endif /* RE_ENABLE_I18N */953build_upper_buffer (pstr);954}955else956{957#ifdef RE_ENABLE_I18N958if (dfa->mb_cur_max > 1)959build_wcs_buffer (pstr);960else961#endif /* RE_ENABLE_I18N */962{963if (trans != NULL)964re_string_translate_buffer (pstr);965else966{967pstr->valid_len = pstr->bufs_len;968pstr->valid_raw_len = pstr->bufs_len;969}970}971}972973return REG_NOERROR;974}975976/* Helper functions for re_string_allocate, and re_string_construct. 
*/977978static reg_errcode_t979internal_function980re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)981{982#ifdef RE_ENABLE_I18N983if (pstr->mb_cur_max > 1)984{985wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);986if (BE (new_wcs == NULL, 0))987return REG_ESPACE;988pstr->wcs = new_wcs;989if (pstr->offsets != NULL)990{991int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);992if (BE (new_offsets == NULL, 0))993return REG_ESPACE;994pstr->offsets = new_offsets;995}996}997#endif /* RE_ENABLE_I18N */998if (pstr->mbs_allocated)999{1000unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,1001new_buf_len);1002if (BE (new_mbs == NULL, 0))1003return REG_ESPACE;1004pstr->mbs = new_mbs;1005}1006pstr->bufs_len = new_buf_len;1007return REG_NOERROR;1008}100910101011static void1012internal_function1013re_string_construct_common (const char *str, int len, re_string_t *pstr,1014RE_TRANSLATE_TYPE trans, int icase,1015const re_dfa_t *dfa)1016{1017pstr->raw_mbs = (const unsigned char *) str;1018pstr->len = len;1019pstr->raw_len = len;1020pstr->trans = trans;1021pstr->icase = icase ? 1 : 0;1022pstr->mbs_allocated = (trans != NULL || icase);1023pstr->mb_cur_max = dfa->mb_cur_max;1024pstr->is_utf8 = dfa->is_utf8;1025pstr->map_notascii = dfa->map_notascii;1026pstr->stop = pstr->len;1027pstr->raw_stop = pstr->stop;1028}10291030#ifdef RE_ENABLE_I18N10311032/* Build wide character buffer PSTR->WCS.1033If the byte sequence of the string are:1034<mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>1035Then wide character buffer will be:1036<wc1> , WEOF , <wc2> , WEOF , <wc3>1037We use WEOF for padding, they indicate that the position isn't1038a first byte of a multibyte character.10391040Note that this function assumes PSTR->VALID_LEN elements are already1041built and starts from PSTR->VALID_LEN. 
*/10421043static void1044internal_function1045build_wcs_buffer (re_string_t *pstr)1046{1047#ifdef _LIBC1048unsigned char buf[MB_LEN_MAX];1049assert (MB_LEN_MAX >= pstr->mb_cur_max);1050#else1051unsigned char buf[64];1052#endif1053mbstate_t prev_st;1054int byte_idx, end_idx, remain_len;1055size_t mbclen;10561057/* Build the buffers from pstr->valid_len to either pstr->len or1058pstr->bufs_len. */1059end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;1060for (byte_idx = pstr->valid_len; byte_idx < end_idx;)1061{1062wchar_t wc;1063const char *p;10641065remain_len = end_idx - byte_idx;1066prev_st = pstr->cur_state;1067/* Apply the translation if we need. */1068if (BE (pstr->trans != NULL, 0))1069{1070int i, ch;10711072for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)1073{1074ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];1075buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];1076}1077p = (const char *) buf;1078}1079else1080p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;1081mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);1082if (BE (mbclen == (size_t) -2, 0))1083{1084/* The buffer doesn't have enough space, finish to build. */1085pstr->cur_state = prev_st;1086break;1087}1088else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))1089{1090/* We treat these cases as a singlebyte character. */1091mbclen = 1;1092wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];1093if (BE (pstr->trans != NULL, 0))1094wc = pstr->trans[wc];1095pstr->cur_state = prev_st;1096}10971098/* Write wide character and padding. */1099pstr->wcs[byte_idx++] = wc;1100/* Write paddings. */1101for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)1102pstr->wcs[byte_idx++] = WEOF;1103}1104pstr->valid_len = byte_idx;1105pstr->valid_raw_len = byte_idx;1106}11071108/* Build wide character buffer PSTR->WCS like build_wcs_buffer,1109but for REG_ICASE. 
*/11101111static reg_errcode_t1112internal_function1113build_wcs_upper_buffer (re_string_t *pstr)1114{1115mbstate_t prev_st;1116int src_idx, byte_idx, end_idx, remain_len;1117size_t mbclen;1118#ifdef _LIBC1119char buf[MB_LEN_MAX];1120assert (MB_LEN_MAX >= pstr->mb_cur_max);1121#else1122char buf[64];1123#endif11241125byte_idx = pstr->valid_len;1126end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;11271128/* The following optimization assumes that ASCII characters can be1129mapped to wide characters with a simple cast. */1130if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)1131{1132while (byte_idx < end_idx)1133{1134wchar_t wc;11351136if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])1137&& mbsinit (&pstr->cur_state))1138{1139/* In case of a singlebyte character. */1140pstr->mbs[byte_idx]1141= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);1142/* The next step uses the assumption that wchar_t is encoded1143ASCII-safe: all ASCII values can be converted like this. */1144pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];1145++byte_idx;1146continue;1147}11481149remain_len = end_idx - byte_idx;1150prev_st = pstr->cur_state;1151mbclen = mbrtowc (&wc,1152((const char *) pstr->raw_mbs + pstr->raw_mbs_idx1153+ byte_idx), remain_len, &pstr->cur_state);1154if (BE (mbclen + 2 > 2, 1))1155{1156wchar_t wcu = wc;1157if (iswlower (wc))1158{1159size_t mbcdlen;11601161wcu = towupper (wc);1162mbcdlen = wcrtomb (buf, wcu, &prev_st);1163if (BE (mbclen == mbcdlen, 1))1164memcpy (pstr->mbs + byte_idx, buf, mbclen);1165else1166{1167src_idx = byte_idx;1168goto offsets_needed;1169}1170}1171else1172memcpy (pstr->mbs + byte_idx,1173pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);1174pstr->wcs[byte_idx++] = wcu;1175/* Write paddings. 
*/1176for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)1177pstr->wcs[byte_idx++] = WEOF;1178}1179else if (mbclen == (size_t) -1 || mbclen == 0)1180{1181/* It is an invalid character or '\0'. Just use the byte. */1182int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];1183pstr->mbs[byte_idx] = ch;1184/* And also cast it to wide char. */1185pstr->wcs[byte_idx++] = (wchar_t) ch;1186if (BE (mbclen == (size_t) -1, 0))1187pstr->cur_state = prev_st;1188}1189else1190{1191/* The buffer doesn't have enough space, finish to build. */1192pstr->cur_state = prev_st;1193break;1194}1195}1196pstr->valid_len = byte_idx;1197pstr->valid_raw_len = byte_idx;1198return REG_NOERROR;1199}1200else1201for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)1202{1203wchar_t wc;1204const char *p;1205offsets_needed:1206remain_len = end_idx - byte_idx;1207prev_st = pstr->cur_state;1208if (BE (pstr->trans != NULL, 0))1209{1210int i, ch;12111212for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)1213{1214ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];1215buf[i] = pstr->trans[ch];1216}1217p = (const char *) buf;1218}1219else1220p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;1221mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);1222if (BE (mbclen + 2 > 2, 1))1223{1224wchar_t wcu = wc;1225if (iswlower (wc))1226{1227size_t mbcdlen;12281229wcu = towupper (wc);1230mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);1231if (BE (mbclen == mbcdlen, 1))1232memcpy (pstr->mbs + byte_idx, buf, mbclen);1233else if (mbcdlen != (size_t) -1)1234{1235size_t i;12361237if (byte_idx + mbcdlen > pstr->bufs_len)1238{1239pstr->cur_state = prev_st;1240break;1241}12421243if (pstr->offsets == NULL)1244{1245pstr->offsets = re_malloc (int, pstr->bufs_len);12461247if (pstr->offsets == NULL)1248return REG_ESPACE;1249}1250if (!pstr->offsets_needed)1251{1252for (i = 0; i < (size_t) byte_idx; ++i)1253pstr->offsets[i] = i;1254pstr->offsets_needed = 1;1255}12561257memcpy (pstr->mbs + 
byte_idx, buf, mbcdlen);1258pstr->wcs[byte_idx] = wcu;1259pstr->offsets[byte_idx] = src_idx;1260for (i = 1; i < mbcdlen; ++i)1261{1262pstr->offsets[byte_idx + i]1263= src_idx + (i < mbclen ? i : mbclen - 1);1264pstr->wcs[byte_idx + i] = WEOF;1265}1266pstr->len += mbcdlen - mbclen;1267if (pstr->raw_stop > src_idx)1268pstr->stop += mbcdlen - mbclen;1269end_idx = (pstr->bufs_len > pstr->len)1270? pstr->len : pstr->bufs_len;1271byte_idx += mbcdlen;1272src_idx += mbclen;1273continue;1274}1275else1276memcpy (pstr->mbs + byte_idx, p, mbclen);1277}1278else1279memcpy (pstr->mbs + byte_idx, p, mbclen);12801281if (BE (pstr->offsets_needed != 0, 0))1282{1283size_t i;1284for (i = 0; i < mbclen; ++i)1285pstr->offsets[byte_idx + i] = src_idx + i;1286}1287src_idx += mbclen;12881289pstr->wcs[byte_idx++] = wcu;1290/* Write paddings. */1291for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)1292pstr->wcs[byte_idx++] = WEOF;1293}1294else if (mbclen == (size_t) -1 || mbclen == 0)1295{1296/* It is an invalid character or '\0'. Just use the byte. */1297int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];12981299if (BE (pstr->trans != NULL, 0))1300ch = pstr->trans [ch];1301pstr->mbs[byte_idx] = ch;13021303if (BE (pstr->offsets_needed != 0, 0))1304pstr->offsets[byte_idx] = src_idx;1305++src_idx;13061307/* And also cast it to wide char. */1308pstr->wcs[byte_idx++] = (wchar_t) ch;1309if (BE (mbclen == (size_t) -1, 0))1310pstr->cur_state = prev_st;1311}1312else1313{1314/* The buffer doesn't have enough space, finish to build. */1315pstr->cur_state = prev_st;1316break;1317}1318}1319pstr->valid_len = byte_idx;1320pstr->valid_raw_len = src_idx;1321return REG_NOERROR;1322}13231324/* Skip characters until the index becomes greater than NEW_RAW_IDX.1325Return the index. 
*/13261327static int1328internal_function1329re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)1330{1331mbstate_t prev_st;1332int rawbuf_idx;1333size_t mbclen;1334wchar_t wc = WEOF;13351336/* Skip the characters which are not necessary to check. */1337for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;1338rawbuf_idx < new_raw_idx;)1339{1340int remain_len;1341remain_len = pstr->len - rawbuf_idx;1342prev_st = pstr->cur_state;1343mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,1344remain_len, &pstr->cur_state);1345if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))1346{1347/* We treat these cases as a single byte character. */1348if (mbclen == 0 || remain_len == 0)1349wc = L'\0';1350else1351wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);1352mbclen = 1;1353pstr->cur_state = prev_st;1354}1355/* Then proceed the next character. */1356rawbuf_idx += mbclen;1357}1358*last_wc = (wint_t) wc;1359return rawbuf_idx;1360}1361#endif /* RE_ENABLE_I18N */13621363/* Build the buffer PSTR->MBS, and apply the translation if we need.1364This function is used in case of REG_ICASE. */13651366static void1367internal_function1368build_upper_buffer (re_string_t *pstr)1369{1370int char_idx, end_idx;1371end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;13721373for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)1374{1375int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];1376if (BE (pstr->trans != NULL, 0))1377ch = pstr->trans[ch];1378if (islower (ch))1379pstr->mbs[char_idx] = toupper (ch);1380else1381pstr->mbs[char_idx] = ch;1382}1383pstr->valid_len = char_idx;1384pstr->valid_raw_len = char_idx;1385}13861387/* Apply TRANS to the buffer in PSTR. */13881389static void1390internal_function1391re_string_translate_buffer (re_string_t *pstr)1392{1393int buf_idx, end_idx;1394end_idx = (pstr->bufs_len > pstr->len) ? 
pstr->len : pstr->bufs_len;13951396for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)1397{1398int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];1399pstr->mbs[buf_idx] = pstr->trans[ch];1400}14011402pstr->valid_len = buf_idx;1403pstr->valid_raw_len = buf_idx;1404}14051406/* This function re-construct the buffers.1407Concretely, convert to wide character in case of pstr->mb_cur_max > 1,1408convert to upper case in case of REG_ICASE, apply translation. */14091410static reg_errcode_t1411internal_function1412re_string_reconstruct (re_string_t *pstr, int idx, int eflags)1413{1414int offset = idx - pstr->raw_mbs_idx;1415if (BE (offset < 0, 0))1416{1417/* Reset buffer. */1418#ifdef RE_ENABLE_I18N1419if (pstr->mb_cur_max > 1)1420memset (&pstr->cur_state, '\0', sizeof (mbstate_t));1421#endif /* RE_ENABLE_I18N */1422pstr->len = pstr->raw_len;1423pstr->stop = pstr->raw_stop;1424pstr->valid_len = 0;1425pstr->raw_mbs_idx = 0;1426pstr->valid_raw_len = 0;1427pstr->offsets_needed = 0;1428pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF1429: CONTEXT_NEWLINE | CONTEXT_BEGBUF);1430if (!pstr->mbs_allocated)1431pstr->mbs = (unsigned char *) pstr->raw_mbs;1432offset = idx;1433}14341435if (BE (offset != 0, 1))1436{1437/* Should the already checked characters be kept? */1438if (BE (offset < pstr->valid_raw_len, 1))1439{1440/* Yes, move them to the front of the buffer. 
*/1441#ifdef RE_ENABLE_I18N1442if (BE (pstr->offsets_needed, 0))1443{1444int low = 0, high = pstr->valid_len, mid;1445do1446{1447mid = (high + low) / 2;1448if (pstr->offsets[mid] > offset)1449high = mid;1450else if (pstr->offsets[mid] < offset)1451low = mid + 1;1452else1453break;1454}1455while (low < high);1456if (pstr->offsets[mid] < offset)1457++mid;1458pstr->tip_context = re_string_context_at (pstr, mid - 1,1459eflags);1460/* This can be quite complicated, so handle specially1461only the common and easy case where the character with1462different length representation of lower and upper1463case is present at or after offset. */1464if (pstr->valid_len > offset1465&& mid == offset && pstr->offsets[mid] == offset)1466{1467memmove (pstr->wcs, pstr->wcs + offset,1468(pstr->valid_len - offset) * sizeof (wint_t));1469memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);1470pstr->valid_len -= offset;1471pstr->valid_raw_len -= offset;1472for (low = 0; low < pstr->valid_len; low++)1473pstr->offsets[low] = pstr->offsets[low + offset] - offset;1474}1475else1476{1477/* Otherwise, just find out how long the partial multibyte1478character at offset is and fill it with WEOF/255. 
*/1479pstr->len = pstr->raw_len - idx + offset;1480pstr->stop = pstr->raw_stop - idx + offset;1481pstr->offsets_needed = 0;1482while (mid > 0 && pstr->offsets[mid - 1] == offset)1483--mid;1484while (mid < pstr->valid_len)1485if (pstr->wcs[mid] != WEOF)1486break;1487else1488++mid;1489if (mid == pstr->valid_len)1490pstr->valid_len = 0;1491else1492{1493pstr->valid_len = pstr->offsets[mid] - offset;1494if (pstr->valid_len)1495{1496for (low = 0; low < pstr->valid_len; ++low)1497pstr->wcs[low] = WEOF;1498memset (pstr->mbs, 255, pstr->valid_len);1499}1500}1501pstr->valid_raw_len = pstr->valid_len;1502}1503}1504else1505#endif1506{1507pstr->tip_context = re_string_context_at (pstr, offset - 1,1508eflags);1509#ifdef RE_ENABLE_I18N1510if (pstr->mb_cur_max > 1)1511memmove (pstr->wcs, pstr->wcs + offset,1512(pstr->valid_len - offset) * sizeof (wint_t));1513#endif /* RE_ENABLE_I18N */1514if (BE (pstr->mbs_allocated, 0))1515memmove (pstr->mbs, pstr->mbs + offset,1516pstr->valid_len - offset);1517pstr->valid_len -= offset;1518pstr->valid_raw_len -= offset;1519#if DEBUG1520assert (pstr->valid_len > 0);1521#endif1522}1523}1524else1525{1526/* No, skip all characters until IDX. */1527int prev_valid_len = pstr->valid_len;15281529#ifdef RE_ENABLE_I18N1530if (BE (pstr->offsets_needed, 0))1531{1532pstr->len = pstr->raw_len - idx + offset;1533pstr->stop = pstr->raw_stop - idx + offset;1534pstr->offsets_needed = 0;1535}1536#endif1537pstr->valid_len = 0;1538#ifdef RE_ENABLE_I18N1539if (pstr->mb_cur_max > 1)1540{1541int wcs_idx;1542wint_t wc = WEOF;15431544if (pstr->is_utf8)1545{1546const unsigned char *raw, *p, *q, *end;15471548/* Special case UTF-8. Multi-byte chars start with any1549byte other than 0x80 - 0xbf. 
*/1550raw = pstr->raw_mbs + pstr->raw_mbs_idx;1551end = raw + (offset - pstr->mb_cur_max);1552if (end < pstr->raw_mbs)1553end = pstr->raw_mbs;1554p = raw + offset - 1;1555#ifdef _LIBC1556/* We know the wchar_t encoding is UCS4, so for the simple1557case, ASCII characters, skip the conversion step. */1558if (isascii (*p) && BE (pstr->trans == NULL, 1))1559{1560memset (&pstr->cur_state, '\0', sizeof (mbstate_t));1561/* pstr->valid_len = 0; */1562wc = (wchar_t) *p;1563}1564else1565#endif1566for (; p >= end; --p)1567if ((*p & 0xc0) != 0x80)1568{1569mbstate_t cur_state;1570wchar_t wc2;1571int mlen = raw + pstr->len - p;1572unsigned char buf[6];1573size_t mbclen;15741575q = p;1576if (BE (pstr->trans != NULL, 0))1577{1578int i = mlen < 6 ? mlen : 6;1579while (--i >= 0)1580buf[i] = pstr->trans[p[i]];1581q = buf;1582}1583/* XXX Don't use mbrtowc, we know which conversion1584to use (UTF-8 -> UCS4). */1585memset (&cur_state, 0, sizeof (cur_state));1586mbclen = mbrtowc (&wc2, (const char *) p, mlen,1587&cur_state);1588if (raw + offset - p <= mbclen1589&& mbclen < (size_t) -2)1590{1591memset (&pstr->cur_state, '\0',1592sizeof (mbstate_t));1593pstr->valid_len = mbclen - (raw + offset - p);1594wc = wc2;1595}1596break;1597}1598}15991600if (wc == WEOF)1601pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;1602if (wc == WEOF)1603pstr->tip_context1604= re_string_context_at (pstr, prev_valid_len - 1, eflags);1605else1606pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)1607&& IS_WIDE_WORD_CHAR (wc))1608? CONTEXT_WORD1609: ((IS_WIDE_NEWLINE (wc)1610&& pstr->newline_anchor)1611? 
CONTEXT_NEWLINE : 0));1612if (BE (pstr->valid_len, 0))1613{1614for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)1615pstr->wcs[wcs_idx] = WEOF;1616if (pstr->mbs_allocated)1617memset (pstr->mbs, 255, pstr->valid_len);1618}1619pstr->valid_raw_len = pstr->valid_len;1620}1621else1622#endif /* RE_ENABLE_I18N */1623{1624int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];1625pstr->valid_raw_len = 0;1626if (pstr->trans)1627c = pstr->trans[c];1628pstr->tip_context = (bitset_contain (pstr->word_char, c)1629? CONTEXT_WORD1630: ((IS_NEWLINE (c) && pstr->newline_anchor)1631? CONTEXT_NEWLINE : 0));1632}1633}1634if (!BE (pstr->mbs_allocated, 0))1635pstr->mbs += offset;1636}1637pstr->raw_mbs_idx = idx;1638pstr->len -= offset;1639pstr->stop -= offset;16401641/* Then build the buffers. */1642#ifdef RE_ENABLE_I18N1643if (pstr->mb_cur_max > 1)1644{1645if (pstr->icase)1646{1647reg_errcode_t ret = build_wcs_upper_buffer (pstr);1648if (BE (ret != REG_NOERROR, 0))1649return ret;1650}1651else1652build_wcs_buffer (pstr);1653}1654else1655#endif /* RE_ENABLE_I18N */1656if (BE (pstr->mbs_allocated, 0))1657{1658if (pstr->icase)1659build_upper_buffer (pstr);1660else if (pstr->trans != NULL)1661re_string_translate_buffer (pstr);1662}1663else1664pstr->valid_len = pstr->len;16651666pstr->cur_idx = 0;1667return REG_NOERROR;1668}16691670static unsigned char1671internal_function __attribute ((pure))1672re_string_peek_byte_case (const re_string_t *pstr, int idx)1673{1674int ch, off;16751676/* Handle the common (easiest) cases first. */1677if (BE (!pstr->mbs_allocated, 1))1678return re_string_peek_byte (pstr, idx);16791680#ifdef RE_ENABLE_I18N1681if (pstr->mb_cur_max > 11682&& ! 
re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))1683return re_string_peek_byte (pstr, idx);1684#endif16851686off = pstr->cur_idx + idx;1687#ifdef RE_ENABLE_I18N1688if (pstr->offsets_needed)1689off = pstr->offsets[off];1690#endif16911692ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];16931694#ifdef RE_ENABLE_I18N1695/* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I1696this function returns CAPITAL LETTER I instead of first byte of1697DOTLESS SMALL LETTER I. The latter would confuse the parser,1698since peek_byte_case doesn't advance cur_idx in any way. */1699if (pstr->offsets_needed && !isascii (ch))1700return re_string_peek_byte (pstr, idx);1701#endif17021703return ch;1704}17051706static unsigned char1707internal_function __attribute ((pure))1708re_string_fetch_byte_case (re_string_t *pstr)1709{1710if (BE (!pstr->mbs_allocated, 1))1711return re_string_fetch_byte (pstr);17121713#ifdef RE_ENABLE_I18N1714if (pstr->offsets_needed)1715{1716int off, ch;17171718/* For tr_TR.UTF-8 [[:islower:]] there is1719[[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip1720in that case the whole multi-byte character and return1721the original letter. On the other side, with1722[[: DOTLESS SMALL LETTER I return [[:I, as doing1723anything else would complicate things too much. */17241725if (!re_string_first_byte (pstr, pstr->cur_idx))1726return re_string_fetch_byte (pstr);17271728off = pstr->offsets[pstr->cur_idx];1729ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];17301731if (! 
isascii (ch))1732return re_string_fetch_byte (pstr);17331734re_string_skip_bytes (pstr,1735re_string_char_size_at (pstr, pstr->cur_idx));1736return ch;1737}1738#endif17391740return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];1741}17421743static void1744internal_function1745re_string_destruct (re_string_t *pstr)1746{1747#ifdef RE_ENABLE_I18N1748re_free (pstr->wcs);1749re_free (pstr->offsets);1750#endif /* RE_ENABLE_I18N */1751if (pstr->mbs_allocated)1752re_free (pstr->mbs);1753}17541755/* Return the context at IDX in INPUT. */17561757static unsigned int1758internal_function1759re_string_context_at (const re_string_t *input, int idx, int eflags)1760{1761int c;1762if (BE (idx < 0, 0))1763/* In this case, we use the value stored in input->tip_context,1764since we can't know the character in input->mbs[-1] here. */1765return input->tip_context;1766if (BE (idx == input->len, 0))1767return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF1768: CONTEXT_NEWLINE | CONTEXT_ENDBUF);1769#ifdef RE_ENABLE_I18N1770if (input->mb_cur_max > 1)1771{1772wint_t wc;1773int wc_idx = idx;1774while(input->wcs[wc_idx] == WEOF)1775{1776#ifdef DEBUG1777/* It must not happen. */1778assert (wc_idx >= 0);1779#endif1780--wc_idx;1781if (wc_idx < 0)1782return input->tip_context;1783}1784wc = input->wcs[wc_idx];1785if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))1786return CONTEXT_WORD;1787return (IS_WIDE_NEWLINE (wc) && input->newline_anchor1788? CONTEXT_NEWLINE : 0);1789}1790else1791#endif1792{1793c = re_string_byte_at (input, idx);1794if (bitset_contain (input->word_char, c))1795return CONTEXT_WORD;1796return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;1797}1798}17991800/* Functions for set operation. 
*/18011802static reg_errcode_t1803internal_function1804re_node_set_alloc (re_node_set *set, int size)1805{1806set->alloc = size;1807set->nelem = 0;1808set->elems = re_malloc (int, size);1809if (BE (set->elems == NULL, 0))1810return REG_ESPACE;1811return REG_NOERROR;1812}18131814static reg_errcode_t1815internal_function1816re_node_set_init_1 (re_node_set *set, int elem)1817{1818set->alloc = 1;1819set->nelem = 1;1820set->elems = re_malloc (int, 1);1821if (BE (set->elems == NULL, 0))1822{1823set->alloc = set->nelem = 0;1824return REG_ESPACE;1825}1826set->elems[0] = elem;1827return REG_NOERROR;1828}18291830static reg_errcode_t1831internal_function1832re_node_set_init_2 (re_node_set *set, int elem1, int elem2)1833{1834set->alloc = 2;1835set->elems = re_malloc (int, 2);1836if (BE (set->elems == NULL, 0))1837return REG_ESPACE;1838if (elem1 == elem2)1839{1840set->nelem = 1;1841set->elems[0] = elem1;1842}1843else1844{1845set->nelem = 2;1846if (elem1 < elem2)1847{1848set->elems[0] = elem1;1849set->elems[1] = elem2;1850}1851else1852{1853set->elems[0] = elem2;1854set->elems[1] = elem1;1855}1856}1857return REG_NOERROR;1858}18591860static reg_errcode_t1861internal_function1862re_node_set_init_copy (re_node_set *dest, const re_node_set *src)1863{1864dest->nelem = src->nelem;1865if (src->nelem > 0)1866{1867dest->alloc = dest->nelem;1868dest->elems = re_malloc (int, dest->alloc);1869if (BE (dest->elems == NULL, 0))1870{1871dest->alloc = dest->nelem = 0;1872return REG_ESPACE;1873}1874memcpy (dest->elems, src->elems, src->nelem * sizeof (int));1875}1876else1877re_node_set_init_empty (dest);1878return REG_NOERROR;1879}18801881/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to1882DEST. Return value indicate the error code or REG_NOERROR if succeeded.1883Note: We assume dest->elems is NULL, when dest->alloc is 0. 
*/18841885static reg_errcode_t1886internal_function1887re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,1888const re_node_set *src2)1889{1890int i1, i2, is, id, delta, sbase;1891if (src1->nelem == 0 || src2->nelem == 0)1892return REG_NOERROR;18931894/* We need dest->nelem + 2 * elems_in_intersection; this is a1895conservative estimate. */1896if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)1897{1898int new_alloc = src1->nelem + src2->nelem + dest->alloc;1899int *new_elems = re_realloc (dest->elems, int, new_alloc);1900if (BE (new_elems == NULL, 0))1901return REG_ESPACE;1902dest->elems = new_elems;1903dest->alloc = new_alloc;1904}19051906/* Find the items in the intersection of SRC1 and SRC2, and copy1907into the top of DEST those that are not already in DEST itself. */1908sbase = dest->nelem + src1->nelem + src2->nelem;1909i1 = src1->nelem - 1;1910i2 = src2->nelem - 1;1911id = dest->nelem - 1;1912for (;;)1913{1914if (src1->elems[i1] == src2->elems[i2])1915{1916/* Try to find the item in DEST. Maybe we could binary search? */1917while (id >= 0 && dest->elems[id] > src1->elems[i1])1918--id;19191920if (id < 0 || dest->elems[id] != src1->elems[i1])1921dest->elems[--sbase] = src1->elems[i1];19221923if (--i1 < 0 || --i2 < 0)1924break;1925}19261927/* Lower the highest of the two items. */1928else if (src1->elems[i1] < src2->elems[i2])1929{1930if (--i2 < 0)1931break;1932}1933else1934{1935if (--i1 < 0)1936break;1937}1938}19391940id = dest->nelem - 1;1941is = dest->nelem + src1->nelem + src2->nelem - 1;1942delta = is - sbase + 1;19431944/* Now copy. When DELTA becomes zero, the remaining1945DEST elements are already in place; this is more or1946less the same loop that is in re_node_set_merge. */1947dest->nelem += delta;1948if (delta > 0 && id >= 0)1949for (;;)1950{1951if (dest->elems[is] > dest->elems[id])1952{1953/* Copy from the top. 
*/1954dest->elems[id + delta--] = dest->elems[is--];1955if (delta == 0)1956break;1957}1958else1959{1960/* Slide from the bottom. */1961dest->elems[id + delta] = dest->elems[id];1962if (--id < 0)1963break;1964}1965}19661967/* Copy remaining SRC elements. */1968memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));19691970return REG_NOERROR;1971}19721973/* Calculate the union set of the sets SRC1 and SRC2. And store it to1974DEST. Return value indicate the error code or REG_NOERROR if succeeded. */19751976static reg_errcode_t1977internal_function1978re_node_set_init_union (re_node_set *dest, const re_node_set *src1,1979const re_node_set *src2)1980{1981int i1, i2, id;1982if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)1983{1984dest->alloc = src1->nelem + src2->nelem;1985dest->elems = re_malloc (int, dest->alloc);1986if (BE (dest->elems == NULL, 0))1987return REG_ESPACE;1988}1989else1990{1991if (src1 != NULL && src1->nelem > 0)1992return re_node_set_init_copy (dest, src1);1993else if (src2 != NULL && src2->nelem > 0)1994return re_node_set_init_copy (dest, src2);1995else1996re_node_set_init_empty (dest);1997return REG_NOERROR;1998}1999for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)2000{2001if (src1->elems[i1] > src2->elems[i2])2002{2003dest->elems[id++] = src2->elems[i2++];2004continue;2005}2006if (src1->elems[i1] == src2->elems[i2])2007++i2;2008dest->elems[id++] = src1->elems[i1++];2009}2010if (i1 < src1->nelem)2011{2012memcpy (dest->elems + id, src1->elems + i1,2013(src1->nelem - i1) * sizeof (int));2014id += src1->nelem - i1;2015}2016else if (i2 < src2->nelem)2017{2018memcpy (dest->elems + id, src2->elems + i2,2019(src2->nelem - i2) * sizeof (int));2020id += src2->nelem - i2;2021}2022dest->nelem = id;2023return REG_NOERROR;2024}20252026/* Calculate the union set of the sets DEST and SRC. And store it to2027DEST. Return value indicate the error code or REG_NOERROR if succeeded. 
*/20282029static reg_errcode_t2030internal_function2031re_node_set_merge (re_node_set *dest, const re_node_set *src)2032{2033int is, id, sbase, delta;2034if (src == NULL || src->nelem == 0)2035return REG_NOERROR;2036if (dest->alloc < 2 * src->nelem + dest->nelem)2037{2038int new_alloc = 2 * (src->nelem + dest->alloc);2039int *new_buffer = re_realloc (dest->elems, int, new_alloc);2040if (BE (new_buffer == NULL, 0))2041return REG_ESPACE;2042dest->elems = new_buffer;2043dest->alloc = new_alloc;2044}20452046if (BE (dest->nelem == 0, 0))2047{2048dest->nelem = src->nelem;2049memcpy (dest->elems, src->elems, src->nelem * sizeof (int));2050return REG_NOERROR;2051}20522053/* Copy into the top of DEST the items of SRC that are not2054found in DEST. Maybe we could binary search in DEST? */2055for (sbase = dest->nelem + 2 * src->nelem,2056is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )2057{2058if (dest->elems[id] == src->elems[is])2059is--, id--;2060else if (dest->elems[id] < src->elems[is])2061dest->elems[--sbase] = src->elems[is--];2062else /* if (dest->elems[id] > src->elems[is]) */2063--id;2064}20652066if (is >= 0)2067{2068/* If DEST is exhausted, the remaining items of SRC must be unique. */2069sbase -= is + 1;2070memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));2071}20722073id = dest->nelem - 1;2074is = dest->nelem + 2 * src->nelem - 1;2075delta = is - sbase + 1;2076if (delta == 0)2077return REG_NOERROR;20782079/* Now copy. When DELTA becomes zero, the remaining2080DEST elements are already in place. */2081dest->nelem += delta;2082for (;;)2083{2084if (dest->elems[is] > dest->elems[id])2085{2086/* Copy from the top. */2087dest->elems[id + delta--] = dest->elems[is--];2088if (delta == 0)2089break;2090}2091else2092{2093/* Slide from the bottom. */2094dest->elems[id + delta] = dest->elems[id];2095if (--id < 0)2096{2097/* Copy remaining SRC elements. 
*/2098memcpy (dest->elems, dest->elems + sbase,2099delta * sizeof (int));2100break;2101}2102}2103}21042105return REG_NOERROR;2106}21072108/* Insert the new element ELEM to the re_node_set* SET.2109SET should not already have ELEM.2110return -1 if an error is occured, return 1 otherwise. */21112112static int2113internal_function2114re_node_set_insert (re_node_set *set, int elem)2115{2116int idx;2117/* In case the set is empty. */2118if (set->alloc == 0)2119{2120if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))2121return 1;2122else2123return -1;2124}21252126if (BE (set->nelem, 0) == 0)2127{2128/* We already guaranteed above that set->alloc != 0. */2129set->elems[0] = elem;2130++set->nelem;2131return 1;2132}21332134/* Realloc if we need. */2135if (set->alloc == set->nelem)2136{2137int *new_elems;2138set->alloc = set->alloc * 2;2139new_elems = re_realloc (set->elems, int, set->alloc);2140if (BE (new_elems == NULL, 0))2141return -1;2142set->elems = new_elems;2143}21442145/* Move the elements which follows the new element. Test the2146first element separately to skip a check in the inner loop. */2147if (elem < set->elems[0])2148{2149idx = 0;2150for (idx = set->nelem; idx > 0; idx--)2151set->elems[idx] = set->elems[idx - 1];2152}2153else2154{2155for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)2156set->elems[idx] = set->elems[idx - 1];2157}21582159/* Insert the new element. */2160set->elems[idx] = elem;2161++set->nelem;2162return 1;2163}21642165/* Insert the new element ELEM to the re_node_set* SET.2166SET should not already have any element greater than or equal to ELEM.2167Return -1 if an error is occured, return 1 otherwise. */21682169static int2170internal_function2171re_node_set_insert_last (re_node_set *set, int elem)2172{2173/* Realloc if we need. 
*/2174if (set->alloc == set->nelem)2175{2176int *new_elems;2177set->alloc = (set->alloc + 1) * 2;2178new_elems = re_realloc (set->elems, int, set->alloc);2179if (BE (new_elems == NULL, 0))2180return -1;2181set->elems = new_elems;2182}21832184/* Insert the new element. */2185set->elems[set->nelem++] = elem;2186return 1;2187}21882189/* Compare two node sets SET1 and SET2.2190return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */21912192static int2193internal_function __attribute ((pure))2194re_node_set_compare (const re_node_set *set1, const re_node_set *set2)2195{2196int i;2197if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)2198return 0;2199for (i = set1->nelem ; --i >= 0 ; )2200if (set1->elems[i] != set2->elems[i])2201return 0;2202return 1;2203}22042205/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */22062207static int2208internal_function __attribute ((pure))2209re_node_set_contains (const re_node_set *set, int elem)2210{2211unsigned int idx, right, mid;2212if (set->nelem <= 0)2213return 0;22142215/* Binary search the element. */2216idx = 0;2217right = set->nelem - 1;2218while (idx < right)2219{2220mid = (idx + right) / 2;2221if (set->elems[mid] < elem)2222idx = mid + 1;2223else2224right = mid;2225}2226return set->elems[idx] == elem ? idx + 1 : 0;2227}22282229static void2230internal_function2231re_node_set_remove_at (re_node_set *set, int idx)2232{2233if (idx < 0 || idx >= set->nelem)2234return;2235--set->nelem;2236for (; idx < set->nelem; idx++)2237set->elems[idx] = set->elems[idx + 1];2238}223922402241/* Add the token TOKEN to dfa->nodes, and return the index of the token.2242Or return -1, if an error will be occured. 
*/22432244static int2245internal_function2246re_dfa_add_node (re_dfa_t *dfa, re_token_t token)2247{2248int type = token.type;2249if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))2250{2251size_t new_nodes_alloc = dfa->nodes_alloc * 2;2252int *new_nexts, *new_indices;2253re_node_set *new_edests, *new_eclosures;2254re_token_t *new_nodes;22552256/* Avoid overflows. */2257if (BE (new_nodes_alloc < dfa->nodes_alloc, 0))2258return -1;22592260new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);2261if (BE (new_nodes == NULL, 0))2262return -1;2263dfa->nodes = new_nodes;2264new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);2265new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);2266new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);2267new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);2268if (BE (new_nexts == NULL || new_indices == NULL2269|| new_edests == NULL || new_eclosures == NULL, 0))2270return -1;2271dfa->nexts = new_nexts;2272dfa->org_indices = new_indices;2273dfa->edests = new_edests;2274dfa->eclosures = new_eclosures;2275dfa->nodes_alloc = new_nodes_alloc;2276}2277dfa->nodes[dfa->nodes_len] = token;2278dfa->nodes[dfa->nodes_len].constraint = 0;2279#ifdef RE_ENABLE_I18N2280dfa->nodes[dfa->nodes_len].accept_mb =2281(type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;2282#endif2283dfa->nexts[dfa->nodes_len] = -1;2284re_node_set_init_empty (dfa->edests + dfa->nodes_len);2285re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);2286return dfa->nodes_len++;2287}22882289static inline unsigned int2290internal_function2291calc_state_hash (const re_node_set *nodes, unsigned int context)2292{2293unsigned int hash = nodes->nelem + context;2294int i;2295for (i = 0 ; i < nodes->nelem ; i++)2296hash += nodes->elems[i];2297return hash;2298}22992300/* Search for the state whose node_set is equivalent to NODES.2301Return the pointer to the state, if we found it in the DFA.2302Otherwise 
create the new one and return it. In case of an error2303return NULL and set the error code in ERR.2304Note: - We assume NULL as the invalid state, then it is possible that2305return value is NULL and ERR is REG_NOERROR.2306- We never return non-NULL value in case of any errors, it is for2307optimization. */23082309static re_dfastate_t *2310internal_function2311re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,2312const re_node_set *nodes)2313{2314unsigned int hash;2315re_dfastate_t *new_state;2316struct re_state_table_entry *spot;2317int i;2318if (BE (nodes->nelem == 0, 0))2319{2320*err = REG_NOERROR;2321return NULL;2322}2323hash = calc_state_hash (nodes, 0);2324spot = dfa->state_table + (hash & dfa->state_hash_mask);23252326for (i = 0 ; i < spot->num ; i++)2327{2328re_dfastate_t *state = spot->array[i];2329if (hash != state->hash)2330continue;2331if (re_node_set_compare (&state->nodes, nodes))2332return state;2333}23342335/* There are no appropriate state in the dfa, create the new one. */2336new_state = create_ci_newstate (dfa, nodes, hash);2337if (BE (new_state == NULL, 0))2338*err = REG_ESPACE;23392340return new_state;2341}23422343/* Search for the state whose node_set is equivalent to NODES and2344whose context is equivalent to CONTEXT.2345Return the pointer to the state, if we found it in the DFA.2346Otherwise create the new one and return it. In case of an error2347return NULL and set the error code in ERR.2348Note: - We assume NULL as the invalid state, then it is possible that2349return value is NULL and ERR is REG_NOERROR.2350- We never return non-NULL value in case of any errors, it is for2351optimization. 
*/23522353static re_dfastate_t *2354internal_function2355re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,2356const re_node_set *nodes, unsigned int context)2357{2358unsigned int hash;2359re_dfastate_t *new_state;2360struct re_state_table_entry *spot;2361int i;2362if (nodes->nelem == 0)2363{2364*err = REG_NOERROR;2365return NULL;2366}2367hash = calc_state_hash (nodes, context);2368spot = dfa->state_table + (hash & dfa->state_hash_mask);23692370for (i = 0 ; i < spot->num ; i++)2371{2372re_dfastate_t *state = spot->array[i];2373if (state->hash == hash2374&& state->context == context2375&& re_node_set_compare (state->entrance_nodes, nodes))2376return state;2377}2378/* There are no appropriate state in `dfa', create the new one. */2379new_state = create_cd_newstate (dfa, nodes, context, hash);2380if (BE (new_state == NULL, 0))2381*err = REG_ESPACE;23822383return new_state;2384}23852386/* Finish initialization of the new state NEWSTATE, and using its hash value2387HASH put in the appropriate bucket of DFA's state table. Return value2388indicates the error code if failed. 
*/23892390static reg_errcode_t2391register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,2392unsigned int hash)2393{2394struct re_state_table_entry *spot;2395reg_errcode_t err;2396int i;23972398newstate->hash = hash;2399err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);2400if (BE (err != REG_NOERROR, 0))2401return REG_ESPACE;2402for (i = 0; i < newstate->nodes.nelem; i++)2403{2404int elem = newstate->nodes.elems[i];2405if (!IS_EPSILON_NODE (dfa->nodes[elem].type))2406re_node_set_insert_last (&newstate->non_eps_nodes, elem);2407}24082409spot = dfa->state_table + (hash & dfa->state_hash_mask);2410if (BE (spot->alloc <= spot->num, 0))2411{2412int new_alloc = 2 * spot->num + 2;2413re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,2414new_alloc);2415if (BE (new_array == NULL, 0))2416return REG_ESPACE;2417spot->array = new_array;2418spot->alloc = new_alloc;2419}2420spot->array[spot->num++] = newstate;2421return REG_NOERROR;2422}24232424static void2425free_state (re_dfastate_t *state)2426{2427re_node_set_free (&state->non_eps_nodes);2428re_node_set_free (&state->inveclosure);2429if (state->entrance_nodes != &state->nodes)2430{2431re_node_set_free (state->entrance_nodes);2432re_free (state->entrance_nodes);2433}2434re_node_set_free (&state->nodes);2435re_free (state->word_trtable);2436re_free (state->trtable);2437re_free (state);2438}24392440/* Create the new state which is independ of contexts.2441Return the new state if succeeded, otherwise return NULL. 
*/24422443static re_dfastate_t *2444internal_function2445create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,2446unsigned int hash)2447{2448int i;2449reg_errcode_t err;2450re_dfastate_t *newstate;24512452newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);2453if (BE (newstate == NULL, 0))2454return NULL;2455err = re_node_set_init_copy (&newstate->nodes, nodes);2456if (BE (err != REG_NOERROR, 0))2457{2458re_free (newstate);2459return NULL;2460}24612462newstate->entrance_nodes = &newstate->nodes;2463for (i = 0 ; i < nodes->nelem ; i++)2464{2465re_token_t *node = dfa->nodes + nodes->elems[i];2466re_token_type_t type = node->type;2467if (type == CHARACTER && !node->constraint)2468continue;2469#ifdef RE_ENABLE_I18N2470newstate->accept_mb |= node->accept_mb;2471#endif /* RE_ENABLE_I18N */24722473/* If the state has the halt node, the state is a halt state. */2474if (type == END_OF_RE)2475newstate->halt = 1;2476else if (type == OP_BACK_REF)2477newstate->has_backref = 1;2478else if (type == ANCHOR || node->constraint)2479newstate->has_constraint = 1;2480}2481err = register_state (dfa, newstate, hash);2482if (BE (err != REG_NOERROR, 0))2483{2484free_state (newstate);2485newstate = NULL;2486}2487return newstate;2488}24892490/* Create the new state which is depend on the context CONTEXT.2491Return the new state if succeeded, otherwise return NULL. 
*/24922493static re_dfastate_t *2494internal_function2495create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,2496unsigned int context, unsigned int hash)2497{2498int i, nctx_nodes = 0;2499reg_errcode_t err;2500re_dfastate_t *newstate;25012502newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);2503if (BE (newstate == NULL, 0))2504return NULL;2505err = re_node_set_init_copy (&newstate->nodes, nodes);2506if (BE (err != REG_NOERROR, 0))2507{2508re_free (newstate);2509return NULL;2510}25112512newstate->context = context;2513newstate->entrance_nodes = &newstate->nodes;25142515for (i = 0 ; i < nodes->nelem ; i++)2516{2517unsigned int constraint = 0;2518re_token_t *node = dfa->nodes + nodes->elems[i];2519re_token_type_t type = node->type;2520if (node->constraint)2521constraint = node->constraint;25222523if (type == CHARACTER && !constraint)2524continue;2525#ifdef RE_ENABLE_I18N2526newstate->accept_mb |= node->accept_mb;2527#endif /* RE_ENABLE_I18N */25282529/* If the state has the halt node, the state is a halt state. 
*/2530if (type == END_OF_RE)2531newstate->halt = 1;2532else if (type == OP_BACK_REF)2533newstate->has_backref = 1;2534else if (type == ANCHOR)2535constraint = node->opr.ctx_type;25362537if (constraint)2538{2539if (newstate->entrance_nodes == &newstate->nodes)2540{2541newstate->entrance_nodes = re_malloc (re_node_set, 1);2542if (BE (newstate->entrance_nodes == NULL, 0))2543{2544free_state (newstate);2545return NULL;2546}2547re_node_set_init_copy (newstate->entrance_nodes, nodes);2548nctx_nodes = 0;2549newstate->has_constraint = 1;2550}25512552if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))2553{2554re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);2555++nctx_nodes;2556}2557}2558}2559err = register_state (dfa, newstate, hash);2560if (BE (err != REG_NOERROR, 0))2561{2562free_state (newstate);2563newstate = NULL;2564}2565return newstate;2566}25672568/******************************************************************************/2569/******************************************************************************/2570/******************************************************************************/2571/* GKINCLUDE #include "regcomp.c" */2572/******************************************************************************/2573/******************************************************************************/2574/******************************************************************************/2575/* Extended regular expression matching and search library.2576Copyright (C) 2002,2003,2004,2005,2006 Free Software Foundation, Inc.2577This file is part of the GNU C Library.2578Contributed by Isamu Hasegawa <[email protected]>.25792580The GNU C Library is free software; you can redistribute it and/or2581modify it under the terms of the GNU Lesser General Public2582License as published by the Free Software Foundation; either2583version 2.1 of the License, or (at your option) any later version.25842585The GNU C Library is distributed in the hope that it will be useful,2586but 
WITHOUT ANY WARRANTY; without even the implied warranty of2587MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU2588Lesser General Public License for more details.25892590You should have received a copy of the GNU Lesser General Public2591License along with the GNU C Library; if not, write to the Free2592Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA259302111-1307 USA. */25942595static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,2596size_t length, reg_syntax_t syntax);2597static void re_compile_fastmap_iter (regex_t *bufp,2598const re_dfastate_t *init_state,2599char *fastmap);2600static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);2601#ifdef RE_ENABLE_I18N2602static void free_charset (re_charset_t *cset);2603#endif /* RE_ENABLE_I18N */2604static void free_workarea_compile (regex_t *preg);2605static reg_errcode_t create_initial_state (re_dfa_t *dfa);2606#ifdef RE_ENABLE_I18N2607static void optimize_utf8 (re_dfa_t *dfa);2608#endif2609static reg_errcode_t analyze (regex_t *preg);2610static reg_errcode_t preorder (bin_tree_t *root,2611reg_errcode_t (fn (void *, bin_tree_t *)),2612void *extra);2613static reg_errcode_t postorder (bin_tree_t *root,2614reg_errcode_t (fn (void *, bin_tree_t *)),2615void *extra);2616static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);2617static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);2618static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,2619bin_tree_t *node);2620static reg_errcode_t calc_first (void *extra, bin_tree_t *node);2621static reg_errcode_t calc_next (void *extra, bin_tree_t *node);2622static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);2623static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);2624static int search_duplicated_node (const re_dfa_t *dfa, int org_node,2625unsigned int constraint);2626static reg_errcode_t calc_eclosure (re_dfa_t *dfa);2627static 
reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,2628int node, int root);2629static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);2630static int fetch_number (re_string_t *input, re_token_t *token,2631reg_syntax_t syntax);2632static int peek_token (re_token_t *token, re_string_t *input,2633reg_syntax_t syntax) internal_function;2634static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,2635reg_syntax_t syntax, reg_errcode_t *err);2636static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,2637re_token_t *token, reg_syntax_t syntax,2638int nest, reg_errcode_t *err);2639static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,2640re_token_t *token, reg_syntax_t syntax,2641int nest, reg_errcode_t *err);2642static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,2643re_token_t *token, reg_syntax_t syntax,2644int nest, reg_errcode_t *err);2645static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,2646re_token_t *token, reg_syntax_t syntax,2647int nest, reg_errcode_t *err);2648static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,2649re_dfa_t *dfa, re_token_t *token,2650reg_syntax_t syntax, reg_errcode_t *err);2651static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,2652re_token_t *token, reg_syntax_t syntax,2653reg_errcode_t *err);2654static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,2655re_string_t *regexp,2656re_token_t *token, int token_len,2657re_dfa_t *dfa,2658reg_syntax_t syntax,2659int accept_hyphen);2660static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,2661re_string_t *regexp,2662re_token_t *token);2663#ifdef RE_ENABLE_I18N2664static reg_errcode_t build_equiv_class (bitset_t sbcset,2665re_charset_t *mbcset,2666int *equiv_class_alloc,2667const unsigned char *name);2668static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,2669bitset_t sbcset,2670re_charset_t *mbcset,2671int *char_class_alloc,2672const 
unsigned char *class_name,2673reg_syntax_t syntax);2674#else /* not RE_ENABLE_I18N */2675static reg_errcode_t build_equiv_class (bitset_t sbcset,2676const unsigned char *name);2677static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,2678bitset_t sbcset,2679const unsigned char *class_name,2680reg_syntax_t syntax);2681#endif /* not RE_ENABLE_I18N */2682static bin_tree_t *build_charclass_op (re_dfa_t *dfa,2683RE_TRANSLATE_TYPE trans,2684const unsigned char *class_name,2685const unsigned char *extra,2686int non_match, reg_errcode_t *err);2687static bin_tree_t *create_tree (re_dfa_t *dfa,2688bin_tree_t *left, bin_tree_t *right,2689re_token_type_t type);2690static bin_tree_t *create_token_tree (re_dfa_t *dfa,2691bin_tree_t *left, bin_tree_t *right,2692const re_token_t *token);2693static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);2694static void free_token (re_token_t *node);2695static reg_errcode_t free_tree (void *extra, bin_tree_t *node);2696static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);26972698/* This table gives an error message for each of the error codes listed2699in regex.h. Obviously the order here has to be same as there.2700POSIX doesn't require that we do anything for REG_NOERROR,2701but why not be nice? 
*/27022703const char __re_error_msgid[] attribute_hidden =2704{2705#define REG_NOERROR_IDX 02706gettext_noop ("Success") /* REG_NOERROR */2707"\0"2708#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")2709gettext_noop ("No match") /* REG_NOMATCH */2710"\0"2711#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")2712gettext_noop ("Invalid regular expression") /* REG_BADPAT */2713"\0"2714#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")2715gettext_noop ("Invalid collation character") /* REG_ECOLLATE */2716"\0"2717#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")2718gettext_noop ("Invalid character class name") /* REG_ECTYPE */2719"\0"2720#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")2721gettext_noop ("Trailing backslash") /* REG_EESCAPE */2722"\0"2723#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")2724gettext_noop ("Invalid back reference") /* REG_ESUBREG */2725"\0"2726#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")2727gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */2728"\0"2729#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")2730gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */2731"\0"2732#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")2733gettext_noop ("Unmatched \\{") /* REG_EBRACE */2734"\0"2735#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")2736gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */2737"\0"2738#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")2739gettext_noop ("Invalid range end") /* REG_ERANGE */2740"\0"2741#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")2742gettext_noop ("Memory exhausted") /* REG_ESPACE */2743"\0"2744#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")2745gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */2746"\0"2747#define 
REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")2748gettext_noop ("Premature end of regular expression") /* REG_EEND */2749"\0"2750#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")2751gettext_noop ("Regular expression too big") /* REG_ESIZE */2752"\0"2753#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")2754gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */2755};27562757const size_t __re_error_msgid_idx[] attribute_hidden =2758{2759REG_NOERROR_IDX,2760REG_NOMATCH_IDX,2761REG_BADPAT_IDX,2762REG_ECOLLATE_IDX,2763REG_ECTYPE_IDX,2764REG_EESCAPE_IDX,2765REG_ESUBREG_IDX,2766REG_EBRACK_IDX,2767REG_EPAREN_IDX,2768REG_EBRACE_IDX,2769REG_BADBR_IDX,2770REG_ERANGE_IDX,2771REG_ESPACE_IDX,2772REG_BADRPT_IDX,2773REG_EEND_IDX,2774REG_ESIZE_IDX,2775REG_ERPAREN_IDX2776};27772778/* Entry points for GNU code. */27792780/* re_compile_pattern is the GNU regular expression compiler: it2781compiles PATTERN (of length LENGTH) and puts the result in BUFP.2782Returns 0 if the pattern was valid, otherwise an error string.27832784Assumes the `allocated' (and perhaps `buffer') and `translate' fields2785are set in BUFP on entry. */27862787const char *2788re_compile_pattern (pattern, length, bufp)2789const char *pattern;2790size_t length;2791struct re_pattern_buffer *bufp;2792{2793reg_errcode_t ret;27942795/* And GNU code determines whether or not to get register information2796by passing null for the REGS argument to re_match, etc., not by2797setting no_sub, unless RE_NO_SUB is set. */2798bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);27992800/* Match anchors at newline. 
*/2801bufp->newline_anchor = 1;28022803ret = re_compile_internal (bufp, pattern, length, re_syntax_options);28042805if (!ret)2806return NULL;2807return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);2808}2809#ifdef _LIBC2810weak_alias (__re_compile_pattern, re_compile_pattern)2811#endif28122813/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can2814also be assigned to arbitrarily: each pattern buffer stores its own2815syntax, so it can be changed between regex compilations. */2816/* This has no initializer because initialized variables in Emacs2817become read-only after dumping. */2818reg_syntax_t re_syntax_options;281928202821/* Specify the precise syntax of regexps for compilation. This provides2822for compatibility for various utilities which historically have2823different, incompatible syntaxes.28242825The argument SYNTAX is a bit mask comprised of the various bits2826defined in regex.h. We return the old syntax. */28272828reg_syntax_t2829re_set_syntax (syntax)2830reg_syntax_t syntax;2831{2832reg_syntax_t ret = re_syntax_options;28332834re_syntax_options = syntax;2835return ret;2836}2837#ifdef _LIBC2838weak_alias (__re_set_syntax, re_set_syntax)2839#endif28402841int2842re_compile_fastmap (bufp)2843struct re_pattern_buffer *bufp;2844{2845re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;2846char *fastmap = bufp->fastmap;28472848memset (fastmap, '\0', sizeof (char) * SBC_MAX);2849re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);2850if (dfa->init_state != dfa->init_state_word)2851re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);2852if (dfa->init_state != dfa->init_state_nl)2853re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);2854if (dfa->init_state != dfa->init_state_begbuf)2855re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);2856bufp->fastmap_accurate = 1;2857return 0;2858}2859#ifdef _LIBC2860weak_alias (__re_compile_fastmap, re_compile_fastmap)2861#endif28622863static inline 
void2864__attribute ((always_inline))2865re_set_fastmap (char *fastmap, int icase, int ch)2866{2867fastmap[ch] = 1;2868if (icase)2869fastmap[tolower (ch)] = 1;2870}28712872/* Helper function for re_compile_fastmap.2873Compile fastmap for the initial_state INIT_STATE. */28742875static void2876re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,2877char *fastmap)2878{2879re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;2880int node_cnt;2881int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));2882for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)2883{2884int node = init_state->nodes.elems[node_cnt];2885re_token_type_t type = dfa->nodes[node].type;28862887if (type == CHARACTER)2888{2889re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);2890#ifdef RE_ENABLE_I18N2891if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)2892{2893unsigned char *buf = alloca (dfa->mb_cur_max), *p;2894wchar_t wc;2895mbstate_t state;28962897p = buf;2898*p++ = dfa->nodes[node].opr.c;2899while (++node < dfa->nodes_len2900&& dfa->nodes[node].type == CHARACTER2901&& dfa->nodes[node].mb_partial)2902*p++ = dfa->nodes[node].opr.c;2903memset (&state, '\0', sizeof (state));2904if (mbrtowc (&wc, (const char *) buf, p - buf,2905&state) == p - buf2906&& (__wcrtomb ((char *) buf, towlower (wc), &state)2907!= (size_t) -1))2908re_set_fastmap (fastmap, 0, buf[0]);2909}2910#endif2911}2912else if (type == SIMPLE_BRACKET)2913{2914int i, ch;2915for (i = 0, ch = 0; i < BITSET_WORDS; ++i)2916{2917int j;2918bitset_word_t w = dfa->nodes[node].opr.sbcset[i];2919for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)2920if (w & ((bitset_word_t) 1 << j))2921re_set_fastmap (fastmap, icase, ch);2922}2923}2924#ifdef RE_ENABLE_I18N2925else if (type == COMPLEX_BRACKET)2926{2927int i;2928re_charset_t *cset = dfa->nodes[node].opr.mbcset;2929if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes2930|| cset->nranges || cset->nchar_classes)2931{2932# ifdef _LIBC2933if (_NL_CURRENT_WORD 
(LC_COLLATE, _NL_COLLATE_NRULES) != 0)2934{2935/* In this case we want to catch the bytes which are2936the first byte of any collation elements.2937e.g. In da_DK, we want to catch 'a' since "aa"2938is a valid collation element, and don't catch2939'b' since 'b' is the only collation element2940which starts from 'b'. */2941const int32_t *table = (const int32_t *)2942_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);2943for (i = 0; i < SBC_MAX; ++i)2944if (table[i] < 0)2945re_set_fastmap (fastmap, icase, i);2946}2947# else2948if (dfa->mb_cur_max > 1)2949for (i = 0; i < SBC_MAX; ++i)2950if (__btowc (i) == WEOF)2951re_set_fastmap (fastmap, icase, i);2952# endif /* not _LIBC */2953}2954for (i = 0; i < cset->nmbchars; ++i)2955{2956char buf[256];2957mbstate_t state;2958memset (&state, '\0', sizeof (state));2959if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)2960re_set_fastmap (fastmap, icase, *(unsigned char *) buf);2961if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)2962{2963if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)2964!= (size_t) -1)2965re_set_fastmap (fastmap, 0, *(unsigned char *) buf);2966}2967}2968}2969#endif /* RE_ENABLE_I18N */2970else if (type == OP_PERIOD2971#ifdef RE_ENABLE_I18N2972|| type == OP_UTF8_PERIOD2973#endif /* RE_ENABLE_I18N */2974|| type == END_OF_RE)2975{2976memset (fastmap, '\1', sizeof (char) * SBC_MAX);2977if (type == END_OF_RE)2978bufp->can_be_null = 1;2979return;2980}2981}2982}29832984/* Entry point for POSIX code. */2985/* regcomp takes a regular expression as a string and compiles it.29862987PREG is a regex_t *. We do not expect any fields to be initialized,2988since POSIX says we shouldn't. 
Thus, we set29892990`buffer' to the compiled pattern;2991`used' to the length of the compiled pattern;2992`syntax' to RE_SYNTAX_POSIX_EXTENDED if the2993REG_EXTENDED bit in CFLAGS is set; otherwise, to2994RE_SYNTAX_POSIX_BASIC;2995`newline_anchor' to REG_NEWLINE being set in CFLAGS;2996`fastmap' to an allocated space for the fastmap;2997`fastmap_accurate' to zero;2998`re_nsub' to the number of subexpressions in PATTERN.29993000PATTERN is the address of the pattern string.30013002CFLAGS is a series of bits which affect compilation.30033004If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we3005use POSIX basic syntax.30063007If REG_NEWLINE is set, then . and [^...] don't match newline.3008Also, regexec will try a match beginning after every newline.30093010If REG_ICASE is set, then we considers upper- and lowercase3011versions of letters to be equivalent when matching.30123013If REG_NOSUB is set, then when PREG is passed to regexec, that3014routine will report only success or failure, and nothing about the3015registers.30163017It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for3018the return codes and their meanings.) */30193020int3021regcomp (preg, pattern, cflags)3022regex_t *__restrict preg;3023const char *__restrict pattern;3024int cflags;3025{3026reg_errcode_t ret;3027reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED3028: RE_SYNTAX_POSIX_BASIC);30293030preg->buffer = NULL;3031preg->allocated = 0;3032preg->used = 0;30333034/* Try to allocate space for the fastmap. */3035preg->fastmap = re_malloc (char, SBC_MAX);3036if (BE (preg->fastmap == NULL, 0))3037return REG_ESPACE;30383039syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;30403041/* If REG_NEWLINE is set, newlines are treated differently. */3042if (cflags & REG_NEWLINE)3043{ /* REG_NEWLINE implies neither . nor [^...] match newline. */3044syntax &= ~RE_DOT_NEWLINE;3045syntax |= RE_HAT_LISTS_NOT_NEWLINE;3046/* It also changes the matching behavior. 
*/3047preg->newline_anchor = 1;3048}3049else3050preg->newline_anchor = 0;3051preg->no_sub = !!(cflags & REG_NOSUB);3052preg->translate = NULL;30533054ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);30553056/* POSIX doesn't distinguish between an unmatched open-group and an3057unmatched close-group: both are REG_EPAREN. */3058if (ret == REG_ERPAREN)3059ret = REG_EPAREN;30603061/* We have already checked preg->fastmap != NULL. */3062if (BE (ret == REG_NOERROR, 1))3063/* Compute the fastmap now, since regexec cannot modify the pattern3064buffer. This function never fails in this implementation. */3065(void) re_compile_fastmap (preg);3066else3067{3068/* Some error occurred while compiling the expression. */3069re_free (preg->fastmap);3070preg->fastmap = NULL;3071}30723073return (int) ret;3074}3075#ifdef _LIBC3076weak_alias (__regcomp, regcomp)3077#endif30783079/* Returns a message corresponding to an error code, ERRCODE, returned3080from either regcomp or regexec. We don't use PREG here. */30813082/* regerror ( int errcode, preg, errbuf, errbuf_size) */3083size_t3084regerror (3085int errcode,3086const regex_t *__restrict preg,3087char *__restrict errbuf,3088size_t errbuf_size)3089{3090const char *msg;3091size_t msg_size;30923093if (BE (errcode < 03094|| errcode >= (int) (sizeof (__re_error_msgid_idx)3095/ sizeof (__re_error_msgid_idx[0])), 0))3096/* Only error codes returned by the rest of the code should be passed3097to this routine. If we are given anything else, or if other regex3098code generates an invalid error code, then the program has a bug.3099Dump core so we can fix it. */3100abort ();31013102msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);31033104msg_size = strlen (msg) + 1; /* Includes the null. 
*/31053106if (BE (errbuf_size != 0, 1))3107{3108if (BE (msg_size > errbuf_size, 0))3109{3110#if defined HAVE_MEMPCPY || defined _LIBC3111*((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';3112#else3113memcpy (errbuf, msg, errbuf_size - 1);3114errbuf[errbuf_size - 1] = 0;3115#endif3116}3117else3118memcpy (errbuf, msg, msg_size);3119}31203121return msg_size;3122}3123#ifdef _LIBC3124weak_alias (__regerror, regerror)3125#endif312631273128#ifdef RE_ENABLE_I18N3129/* This static array is used for the map to single-byte characters when3130UTF-8 is used. Otherwise we would allocate memory just to initialize3131it the same all the time. UTF-8 is the preferred encoding so this is3132a worthwhile optimization. */3133static const bitset_t utf8_sb_map =3134{3135/* Set the first 128 bits. */3136[0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX3137};3138#endif313931403141static void3142free_dfa_content (re_dfa_t *dfa)3143{3144int i, j;31453146if (dfa->nodes)3147for (i = 0; i < dfa->nodes_len; ++i)3148free_token (dfa->nodes + i);3149re_free (dfa->nexts);3150for (i = 0; i < dfa->nodes_len; ++i)3151{3152if (dfa->eclosures != NULL)3153re_node_set_free (dfa->eclosures + i);3154if (dfa->inveclosures != NULL)3155re_node_set_free (dfa->inveclosures + i);3156if (dfa->edests != NULL)3157re_node_set_free (dfa->edests + i);3158}3159re_free (dfa->edests);3160re_free (dfa->eclosures);3161re_free (dfa->inveclosures);3162re_free (dfa->nodes);31633164if (dfa->state_table)3165for (i = 0; i <= dfa->state_hash_mask; ++i)3166{3167struct re_state_table_entry *entry = dfa->state_table + i;3168for (j = 0; j < entry->num; ++j)3169{3170re_dfastate_t *state = entry->array[j];3171free_state (state);3172}3173re_free (entry->array);3174}3175re_free (dfa->state_table);3176#ifdef RE_ENABLE_I18N3177if (dfa->sb_char != utf8_sb_map)3178re_free (dfa->sb_char);3179#endif3180re_free (dfa->subexp_map);3181#ifdef DEBUG3182re_free (dfa->re_str);3183#endif31843185re_free (dfa);3186}318731883189/* Free 
dynamically allocated space used by PREG. */31903191void3192regfree (preg)3193regex_t *preg;3194{3195re_dfa_t *dfa = (re_dfa_t *) preg->buffer;3196if (BE (dfa != NULL, 1))3197free_dfa_content (dfa);3198preg->buffer = NULL;3199preg->allocated = 0;32003201re_free (preg->fastmap);3202preg->fastmap = NULL;32033204re_free (preg->translate);3205preg->translate = NULL;3206}3207#ifdef _LIBC3208weak_alias (__regfree, regfree)3209#endif32103211/* Entry points compatible with 4.2 BSD regex library. We don't define3212them unless specifically requested. */32133214#if defined _REGEX_RE_COMP || defined _LIBC32153216/* BSD has one and only one pattern buffer. */3217static struct re_pattern_buffer re_comp_buf;32183219char *3220# ifdef _LIBC3221/* Make these definitions weak in libc, so POSIX programs can redefine3222these names if they don't use our functions, and still use3223regcomp/regexec above without link errors. */3224weak_function3225# endif3226re_comp (s)3227const char *s;3228{3229reg_errcode_t ret;3230char *fastmap;32313232if (!s)3233{3234if (!re_comp_buf.buffer)3235return gettext ("No previous regular expression");3236return 0;3237}32383239if (re_comp_buf.buffer)3240{3241fastmap = re_comp_buf.fastmap;3242re_comp_buf.fastmap = NULL;3243__regfree (&re_comp_buf);3244memset (&re_comp_buf, '\0', sizeof (re_comp_buf));3245re_comp_buf.fastmap = fastmap;3246}32473248if (re_comp_buf.fastmap == NULL)3249{3250re_comp_buf.fastmap = (char *) malloc (SBC_MAX);3251if (re_comp_buf.fastmap == NULL)3252return (char *) gettext (__re_error_msgid3253+ __re_error_msgid_idx[(int) REG_ESPACE]);3254}32553256/* Since `re_exec' always passes NULL for the `regs' argument, we3257don't need to initialize the pattern buffer fields which affect it. */32583259/* Match anchors at newlines. */3260re_comp_buf.newline_anchor = 1;32613262ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);32633264if (!ret)3265return NULL;32663267/* Yes, we're discarding `const' here if !HAVE_LIBINTL. 
*/3268return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);3269}32703271#ifdef _LIBC3272libc_freeres_fn (free_mem)3273{3274__regfree (&re_comp_buf);3275}3276#endif32773278#endif /* _REGEX_RE_COMP */32793280/* Internal entry point.3281Compile the regular expression PATTERN, whose length is LENGTH.3282SYNTAX indicate regular expression's syntax. */32833284static reg_errcode_t3285re_compile_internal (regex_t *preg, const char * pattern, size_t length,3286reg_syntax_t syntax)3287{3288reg_errcode_t err = REG_NOERROR;3289re_dfa_t *dfa;3290re_string_t regexp;32913292/* Initialize the pattern buffer. */3293preg->fastmap_accurate = 0;3294preg->syntax = syntax;3295preg->not_bol = preg->not_eol = 0;3296preg->used = 0;3297preg->re_nsub = 0;3298preg->can_be_null = 0;3299preg->regs_allocated = REGS_UNALLOCATED;33003301/* Initialize the dfa. */3302dfa = (re_dfa_t *) preg->buffer;3303if (BE (preg->allocated < sizeof (re_dfa_t), 0))3304{3305/* If zero allocated, but buffer is non-null, try to realloc3306enough space. This loses if buffer's address is bogus, but3307that is the user's responsibility. If ->buffer is NULL this3308is a simple allocation. */3309dfa = re_realloc (preg->buffer, re_dfa_t, 1);3310if (dfa == NULL)3311return REG_ESPACE;3312preg->allocated = sizeof (re_dfa_t);3313preg->buffer = (unsigned char *) dfa;3314}3315preg->used = sizeof (re_dfa_t);33163317err = init_dfa (dfa, length);3318if (BE (err != REG_NOERROR, 0))3319{3320free_dfa_content (dfa);3321preg->buffer = NULL;3322preg->allocated = 0;3323return err;3324}3325#ifdef DEBUG3326/* Note: length+1 will not overflow since it is checked in init_dfa. 
*/3327dfa->re_str = re_malloc (char, length + 1);3328strncpy (dfa->re_str, pattern, length + 1);3329#endif33303331__libc_lock_init (dfa->lock);33323333err = re_string_construct (®exp, pattern, length, preg->translate,3334syntax & RE_ICASE, dfa);3335if (BE (err != REG_NOERROR, 0))3336{3337re_compile_internal_free_return:3338free_workarea_compile (preg);3339re_string_destruct (®exp);3340free_dfa_content (dfa);3341preg->buffer = NULL;3342preg->allocated = 0;3343return err;3344}33453346/* Parse the regular expression, and build a structure tree. */3347preg->re_nsub = 0;3348dfa->str_tree = parse (®exp, preg, syntax, &err);3349if (BE (dfa->str_tree == NULL, 0))3350goto re_compile_internal_free_return;33513352/* Analyze the tree and create the nfa. */3353err = analyze (preg);3354if (BE (err != REG_NOERROR, 0))3355goto re_compile_internal_free_return;33563357#ifdef RE_ENABLE_I18N3358/* If possible, do searching in single byte encoding to speed things up. */3359if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)3360optimize_utf8 (dfa);3361#endif33623363/* Then create the initial state of the dfa. */3364err = create_initial_state (dfa);33653366/* Release work areas. */3367free_workarea_compile (preg);3368re_string_destruct (®exp);33693370if (BE (err != REG_NOERROR, 0))3371{3372free_dfa_content (dfa);3373preg->buffer = NULL;3374preg->allocated = 0;3375}33763377return err;3378}33793380/* Initialize DFA. We use the length of the regular expression PAT_LEN3381as the initial length of some arrays. */33823383static reg_errcode_t3384init_dfa (re_dfa_t *dfa, size_t pat_len)3385{3386unsigned int table_size;3387#ifndef _LIBC3388char *codeset_name;3389#endif33903391memset (dfa, '\0', sizeof (re_dfa_t));33923393/* Force allocation of str_tree_storage the first time. */3394dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;33953396/* Avoid overflows. 
*/3397if (pat_len == SIZE_MAX)3398return REG_ESPACE;33993400dfa->nodes_alloc = pat_len + 1;3401dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);34023403/* table_size = 2 ^ ceil(log pat_len) */3404for (table_size = 1; ; table_size <<= 1)3405if (table_size > pat_len)3406break;34073408dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);3409dfa->state_hash_mask = table_size - 1;34103411dfa->mb_cur_max = MB_CUR_MAX;3412#ifdef _LIBC3413if (dfa->mb_cur_max == 63414&& strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)3415dfa->is_utf8 = 1;3416dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)3417!= 0);3418#else3419# ifdef HAVE_LANGINFO_CODESET3420codeset_name = nl_langinfo (CODESET);3421# else3422codeset_name = getenv ("LC_ALL");3423if (codeset_name == NULL || codeset_name[0] == '\0')3424codeset_name = getenv ("LC_CTYPE");3425if (codeset_name == NULL || codeset_name[0] == '\0')3426codeset_name = getenv ("LANG");3427if (codeset_name == NULL)3428codeset_name = "";3429else if (strchr (codeset_name, '.') != NULL)3430codeset_name = strchr (codeset_name, '.') + 1;3431# endif34323433if (strcasecmp (codeset_name, "UTF-8") == 03434|| strcasecmp (codeset_name, "UTF8") == 0)3435dfa->is_utf8 = 1;34363437/* We check exhaustively in the loop below if this charset is a3438superset of ASCII. */3439dfa->map_notascii = 0;3440#endif34413442#ifdef RE_ENABLE_I18N3443if (dfa->mb_cur_max > 1)3444{3445if (dfa->is_utf8)3446dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;3447else3448{3449int i, j, ch;34503451dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);3452if (BE (dfa->sb_char == NULL, 0))3453return REG_ESPACE;34543455/* Set the bits corresponding to single byte chars. 
*/3456for (i = 0, ch = 0; i < BITSET_WORDS; ++i)3457for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)3458{3459wint_t wch = __btowc (ch);3460if (wch != WEOF)3461dfa->sb_char[i] |= (bitset_word_t) 1 << j;3462# ifndef _LIBC3463if (isascii (ch) && wch != ch)3464dfa->map_notascii = 1;3465# endif3466}3467}3468}3469#endif34703471if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))3472return REG_ESPACE;3473return REG_NOERROR;3474}34753476/* Initialize WORD_CHAR table, which indicate which character is3477"word". In this case "word" means that it is the word construction3478character used by some operators like "\<", "\>", etc. */34793480static void3481internal_function3482init_word_char (re_dfa_t *dfa)3483{3484int i, j, ch;3485dfa->word_ops_used = 1;3486for (i = 0, ch = 0; i < BITSET_WORDS; ++i)3487for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)3488if (isalnum (ch) || ch == '_')3489dfa->word_char[i] |= (bitset_word_t) 1 << j;3490}34913492/* Free the work area which are only used while compiling. */34933494static void3495free_workarea_compile (regex_t *preg)3496{3497re_dfa_t *dfa = (re_dfa_t *) preg->buffer;3498bin_tree_storage_t *storage, *next;3499for (storage = dfa->str_tree_storage; storage; storage = next)3500{3501next = storage->next;3502re_free (storage);3503}3504dfa->str_tree_storage = NULL;3505dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;3506dfa->str_tree = NULL;3507re_free (dfa->org_indices);3508dfa->org_indices = NULL;3509}35103511/* Create initial states for all contexts. */35123513static reg_errcode_t3514create_initial_state (re_dfa_t *dfa)3515{3516int first, i;3517reg_errcode_t err;3518re_node_set init_nodes;35193520/* Initial states have the epsilon closure of the node which is3521the first node of the regular expression. 
*/3522first = dfa->str_tree->first->node_idx;3523dfa->init_node = first;3524err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);3525if (BE (err != REG_NOERROR, 0))3526return err;35273528/* The back-references which are in initial states can epsilon transit,3529since in this case all of the subexpressions can be null.3530Then we add epsilon closures of the nodes which are the next nodes of3531the back-references. */3532if (dfa->nbackref > 0)3533for (i = 0; i < init_nodes.nelem; ++i)3534{3535int node_idx = init_nodes.elems[i];3536re_token_type_t type = dfa->nodes[node_idx].type;35373538int clexp_idx;3539if (type != OP_BACK_REF)3540continue;3541for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)3542{3543re_token_t *clexp_node;3544clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];3545if (clexp_node->type == OP_CLOSE_SUBEXP3546&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)3547break;3548}3549if (clexp_idx == init_nodes.nelem)3550continue;35513552if (type == OP_BACK_REF)3553{3554int dest_idx = dfa->edests[node_idx].elems[0];3555if (!re_node_set_contains (&init_nodes, dest_idx))3556{3557re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);3558i = 0;3559}3560}3561}35623563/* It must be the first time to invoke acquire_state. */3564dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);3565/* We don't check ERR here, since the initial state must not be NULL. 
*/3566if (BE (dfa->init_state == NULL, 0))3567return err;3568if (dfa->init_state->has_constraint)3569{3570dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,3571CONTEXT_WORD);3572dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,3573CONTEXT_NEWLINE);3574dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,3575&init_nodes,3576CONTEXT_NEWLINE3577| CONTEXT_BEGBUF);3578if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL3579|| dfa->init_state_begbuf == NULL, 0))3580return err;3581}3582else3583dfa->init_state_word = dfa->init_state_nl3584= dfa->init_state_begbuf = dfa->init_state;35853586re_node_set_free (&init_nodes);3587return REG_NOERROR;3588}35893590#ifdef RE_ENABLE_I18N3591/* If it is possible to do searching in single byte encoding instead of UTF-83592to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change3593DFA nodes where needed. */35943595static void3596optimize_utf8 (re_dfa_t *dfa)3597{3598int node, i, mb_chars = 0, has_period = 0;35993600for (node = 0; node < dfa->nodes_len; ++node)3601switch (dfa->nodes[node].type)3602{3603case CHARACTER:3604if (dfa->nodes[node].opr.c >= 0x80)3605mb_chars = 1;3606break;3607case ANCHOR:3608switch (dfa->nodes[node].opr.idx)3609{3610case LINE_FIRST:3611case LINE_LAST:3612case BUF_FIRST:3613case BUF_LAST:3614break;3615default:3616/* Word anchors etc. cannot be handled. */3617return;3618}3619break;3620case OP_PERIOD:3621has_period = 1;3622break;3623case OP_BACK_REF:3624case OP_ALT:3625case END_OF_RE:3626case OP_DUP_ASTERISK:3627case OP_OPEN_SUBEXP:3628case OP_CLOSE_SUBEXP:3629break;3630case COMPLEX_BRACKET:3631return;3632case SIMPLE_BRACKET:3633/* Just double check. The non-ASCII range starts at 0x80. 
*/3634assert (0x80 % BITSET_WORD_BITS == 0);3635for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)3636if (dfa->nodes[node].opr.sbcset[i])3637return;3638break;3639default:3640abort ();3641}36423643if (mb_chars || has_period)3644for (node = 0; node < dfa->nodes_len; ++node)3645{3646if (dfa->nodes[node].type == CHARACTER3647&& dfa->nodes[node].opr.c >= 0x80)3648dfa->nodes[node].mb_partial = 0;3649else if (dfa->nodes[node].type == OP_PERIOD)3650dfa->nodes[node].type = OP_UTF8_PERIOD;3651}36523653/* The search can be in single byte locale. */3654dfa->mb_cur_max = 1;3655dfa->is_utf8 = 0;3656dfa->has_mb_node = dfa->nbackref > 0 || has_period;3657}3658#endif36593660/* Analyze the structure tree, and calculate "first", "next", "edest",3661"eclosure", and "inveclosure". */36623663static reg_errcode_t3664analyze (regex_t *preg)3665{3666re_dfa_t *dfa = (re_dfa_t *) preg->buffer;3667reg_errcode_t ret;36683669/* Allocate arrays. */3670dfa->nexts = re_malloc (int, dfa->nodes_alloc);3671dfa->org_indices = re_malloc (int, dfa->nodes_alloc);3672dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);3673dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);3674if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL3675|| dfa->eclosures == NULL, 0))3676return REG_ESPACE;36773678dfa->subexp_map = re_malloc (int, preg->re_nsub);3679if (dfa->subexp_map != NULL)3680{3681int i;3682for (i = 0; i < preg->re_nsub; i++)3683dfa->subexp_map[i] = i;3684preorder (dfa->str_tree, optimize_subexps, dfa);3685for (i = 0; i < preg->re_nsub; i++)3686if (dfa->subexp_map[i] != i)3687break;3688if (i == preg->re_nsub)3689{3690free (dfa->subexp_map);3691dfa->subexp_map = NULL;3692}3693}36943695ret = postorder (dfa->str_tree, lower_subexps, preg);3696if (BE (ret != REG_NOERROR, 0))3697return ret;3698ret = postorder (dfa->str_tree, calc_first, dfa);3699if (BE (ret != REG_NOERROR, 0))3700return ret;3701preorder (dfa->str_tree, calc_next, dfa);3702ret = preorder (dfa->str_tree, 
link_nfa_nodes, dfa);3703if (BE (ret != REG_NOERROR, 0))3704return ret;3705ret = calc_eclosure (dfa);3706if (BE (ret != REG_NOERROR, 0))3707return ret;37083709/* We only need this during the prune_impossible_nodes pass in regexec.c;3710skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */3711if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)3712|| dfa->nbackref)3713{3714dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);3715if (BE (dfa->inveclosures == NULL, 0))3716return REG_ESPACE;3717ret = calc_inveclosure (dfa);3718}37193720return ret;3721}37223723/* Our parse trees are very unbalanced, so we cannot use a stack to3724implement parse tree visits. Instead, we use parent pointers and3725some hairy code in these two functions. */3726static reg_errcode_t3727postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),3728void *extra)3729{3730bin_tree_t *node, *prev;37313732for (node = root; ; )3733{3734/* Descend down the tree, preferably to the left (or to the right3735if that's the only child). */3736while (node->left || node->right)3737if (node->left)3738node = node->left;3739else3740node = node->right;37413742do3743{3744reg_errcode_t err = fn (extra, node);3745if (BE (err != REG_NOERROR, 0))3746return err;3747if (node->parent == NULL)3748return REG_NOERROR;3749prev = node;3750node = node->parent;3751}3752/* Go up while we have a node that is reached from the right. */3753while (node->right == prev || node->right == NULL);3754node = node->right;3755}3756}37573758static reg_errcode_t3759preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),3760void *extra)3761{3762bin_tree_t *node;37633764for (node = root; ; )3765{3766reg_errcode_t err = fn (extra, node);3767if (BE (err != REG_NOERROR, 0))3768return err;37693770/* Go to the left node, or up and to the right. 
*/3771if (node->left)3772node = node->left;3773else3774{3775bin_tree_t *prev = NULL;3776while (node->right == prev || node->right == NULL)3777{3778prev = node;3779node = node->parent;3780if (!node)3781return REG_NOERROR;3782}3783node = node->right;3784}3785}3786}37873788/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell3789re_search_internal to map the inner one's opr.idx to this one's. Adjust3790backreferences as well. Requires a preorder visit. */3791static reg_errcode_t3792optimize_subexps (void *extra, bin_tree_t *node)3793{3794re_dfa_t *dfa = (re_dfa_t *) extra;37953796if (node->token.type == OP_BACK_REF && dfa->subexp_map)3797{3798int idx = node->token.opr.idx;3799node->token.opr.idx = dfa->subexp_map[idx];3800dfa->used_bkref_map |= 1 << node->token.opr.idx;3801}38023803else if (node->token.type == SUBEXP3804&& node->left && node->left->token.type == SUBEXP)3805{3806int other_idx = node->left->token.opr.idx;38073808node->left = node->left->left;3809if (node->left)3810node->left->parent = node;38113812dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];3813if (other_idx < BITSET_WORD_BITS)3814dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);3815}38163817return REG_NOERROR;3818}38193820/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation3821of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. 
*/3822static reg_errcode_t3823lower_subexps (void *extra, bin_tree_t *node)3824{3825regex_t *preg = (regex_t *) extra;3826reg_errcode_t err = REG_NOERROR;38273828if (node->left && node->left->token.type == SUBEXP)3829{3830node->left = lower_subexp (&err, preg, node->left);3831if (node->left)3832node->left->parent = node;3833}3834if (node->right && node->right->token.type == SUBEXP)3835{3836node->right = lower_subexp (&err, preg, node->right);3837if (node->right)3838node->right->parent = node;3839}38403841return err;3842}38433844static bin_tree_t *3845lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)3846{3847re_dfa_t *dfa = (re_dfa_t *) preg->buffer;3848bin_tree_t *body = node->left;3849bin_tree_t *op, *cls, *tree1, *tree;38503851if (preg->no_sub3852/* We do not optimize empty subexpressions, because otherwise we may3853have bad CONCAT nodes with NULL children. This is obviously not3854very common, so we do not lose much. An example that triggers3855this case is the sed "script" /\(\)/x. */3856&& node->left != NULL3857&& (node->token.opr.idx >= BITSET_WORD_BITS3858|| !(dfa->used_bkref_map3859& ((bitset_word_t) 1 << node->token.opr.idx))))3860return node->left;38613862/* Convert the SUBEXP node to the concatenation of an3863OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */3864op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);3865cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);3866tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;3867tree = create_tree (dfa, op, tree1, CONCAT);3868if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))3869{3870*err = REG_ESPACE;3871return NULL;3872}38733874op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;3875op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;3876return tree;3877}38783879/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton3880nodes. Requires a postorder visit. 
*/3881static reg_errcode_t3882calc_first (void *extra, bin_tree_t *node)3883{3884re_dfa_t *dfa = (re_dfa_t *) extra;3885if (node->token.type == CONCAT)3886{3887node->first = node->left->first;3888node->node_idx = node->left->node_idx;3889}3890else3891{3892node->first = node;3893node->node_idx = re_dfa_add_node (dfa, node->token);3894if (BE (node->node_idx == -1, 0))3895return REG_ESPACE;3896}3897return REG_NOERROR;3898}38993900/* Pass 2: compute NEXT on the tree. Preorder visit. */3901static reg_errcode_t3902calc_next (void *extra, bin_tree_t *node)3903{3904switch (node->token.type)3905{3906case OP_DUP_ASTERISK:3907node->left->next = node;3908break;3909case CONCAT:3910node->left->next = node->right->first;3911node->right->next = node->next;3912break;3913default:3914if (node->left)3915node->left->next = node->next;3916if (node->right)3917node->right->next = node->next;3918break;3919}3920return REG_NOERROR;3921}39223923/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */3924static reg_errcode_t3925link_nfa_nodes (void *extra, bin_tree_t *node)3926{3927re_dfa_t *dfa = (re_dfa_t *) extra;3928int idx = node->node_idx;3929reg_errcode_t err = REG_NOERROR;39303931switch (node->token.type)3932{3933case CONCAT:3934break;39353936case END_OF_RE:3937assert (node->next == NULL);3938break;39393940case OP_DUP_ASTERISK:3941case OP_ALT:3942{3943int left, right;3944dfa->has_plural_match = 1;3945if (node->left != NULL)3946left = node->left->first->node_idx;3947else3948left = node->next->node_idx;3949if (node->right != NULL)3950right = node->right->first->node_idx;3951else3952right = node->next->node_idx;3953assert (left > -1);3954assert (right > -1);3955err = re_node_set_init_2 (dfa->edests + idx, left, right);3956}3957break;39583959case ANCHOR:3960case OP_OPEN_SUBEXP:3961case OP_CLOSE_SUBEXP:3962err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);3963break;39643965case OP_BACK_REF:3966dfa->nexts[idx] = node->next->node_idx;3967if (node->token.type 
== OP_BACK_REF)3968re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);3969break;39703971default:3972assert (!IS_EPSILON_NODE (node->token.type));3973dfa->nexts[idx] = node->next->node_idx;3974break;3975}39763977return err;3978}39793980/* Duplicate the epsilon closure of the node ROOT_NODE.3981Note that duplicated nodes have constraint INIT_CONSTRAINT in addition3982to their own constraint. */39833984static reg_errcode_t3985internal_function3986duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,3987int root_node, unsigned int init_constraint)3988{3989int org_node, clone_node, ret;3990unsigned int constraint = init_constraint;3991for (org_node = top_org_node, clone_node = top_clone_node;;)3992{3993int org_dest, clone_dest;3994if (dfa->nodes[org_node].type == OP_BACK_REF)3995{3996/* If the back reference epsilon-transit, its destination must3997also have the constraint. Then duplicate the epsilon closure3998of the destination of the back reference, and store it in3999edests of the back reference. */4000org_dest = dfa->nexts[org_node];4001re_node_set_empty (dfa->edests + clone_node);4002clone_dest = duplicate_node (dfa, org_dest, constraint);4003if (BE (clone_dest == -1, 0))4004return REG_ESPACE;4005dfa->nexts[clone_node] = dfa->nexts[org_node];4006ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);4007if (BE (ret < 0, 0))4008return REG_ESPACE;4009}4010else if (dfa->edests[org_node].nelem == 0)4011{4012/* In case of the node can't epsilon-transit, don't duplicate the4013destination and store the original destination as the4014destination of the node. */4015dfa->nexts[clone_node] = dfa->nexts[org_node];4016break;4017}4018else if (dfa->edests[org_node].nelem == 1)4019{4020/* In case of the node can epsilon-transit, and it has only one4021destination. 
*/4022org_dest = dfa->edests[org_node].elems[0];4023re_node_set_empty (dfa->edests + clone_node);4024if (dfa->nodes[org_node].type == ANCHOR)4025{4026/* In case of the node has another constraint, append it. */4027if (org_node == root_node && clone_node != org_node)4028{4029/* ...but if the node is root_node itself, it means the4030epsilon closure have a loop, then tie it to the4031destination of the root_node. */4032ret = re_node_set_insert (dfa->edests + clone_node,4033org_dest);4034if (BE (ret < 0, 0))4035return REG_ESPACE;4036break;4037}4038constraint |= dfa->nodes[org_node].opr.ctx_type;4039}4040clone_dest = duplicate_node (dfa, org_dest, constraint);4041if (BE (clone_dest == -1, 0))4042return REG_ESPACE;4043ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);4044if (BE (ret < 0, 0))4045return REG_ESPACE;4046}4047else /* dfa->edests[org_node].nelem == 2 */4048{4049/* In case of the node can epsilon-transit, and it has two4050destinations. In the bin_tree_t and DFA, that's '|' and '*'. */4051org_dest = dfa->edests[org_node].elems[0];4052re_node_set_empty (dfa->edests + clone_node);4053/* Search for a duplicated node which satisfies the constraint. */4054clone_dest = search_duplicated_node (dfa, org_dest, constraint);4055if (clone_dest == -1)4056{4057/* There are no such a duplicated node, create a new one. */4058reg_errcode_t err;4059clone_dest = duplicate_node (dfa, org_dest, constraint);4060if (BE (clone_dest == -1, 0))4061return REG_ESPACE;4062ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);4063if (BE (ret < 0, 0))4064return REG_ESPACE;4065err = duplicate_node_closure (dfa, org_dest, clone_dest,4066root_node, constraint);4067if (BE (err != REG_NOERROR, 0))4068return err;4069}4070else4071{4072/* There are a duplicated node which satisfy the constraint,4073use it to avoid infinite loop. 
*/4074ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);4075if (BE (ret < 0, 0))4076return REG_ESPACE;4077}40784079org_dest = dfa->edests[org_node].elems[1];4080clone_dest = duplicate_node (dfa, org_dest, constraint);4081if (BE (clone_dest == -1, 0))4082return REG_ESPACE;4083ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);4084if (BE (ret < 0, 0))4085return REG_ESPACE;4086}4087org_node = org_dest;4088clone_node = clone_dest;4089}4090return REG_NOERROR;4091}40924093/* Search for a node which is duplicated from the node ORG_NODE, and4094satisfies the constraint CONSTRAINT. */40954096static int4097search_duplicated_node (const re_dfa_t *dfa, int org_node,4098unsigned int constraint)4099{4100int idx;4101for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)4102{4103if (org_node == dfa->org_indices[idx]4104&& constraint == dfa->nodes[idx].constraint)4105return idx; /* Found. */4106}4107return -1; /* Not found. */4108}41094110/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.4111Return the index of the new node, or -1 if insufficient storage is4112available. */41134114static int4115duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)4116{4117int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);4118if (BE (dup_idx != -1, 1))4119{4120dfa->nodes[dup_idx].constraint = constraint;4121if (dfa->nodes[org_idx].type == ANCHOR)4122dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;4123dfa->nodes[dup_idx].duplicated = 1;41244125/* Store the index of the original node. 
*/4126dfa->org_indices[dup_idx] = org_idx;4127}4128return dup_idx;4129}41304131static reg_errcode_t4132calc_inveclosure (re_dfa_t *dfa)4133{4134int src, idx, ret;4135for (idx = 0; idx < dfa->nodes_len; ++idx)4136re_node_set_init_empty (dfa->inveclosures + idx);41374138for (src = 0; src < dfa->nodes_len; ++src)4139{4140int *elems = dfa->eclosures[src].elems;4141for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)4142{4143ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);4144if (BE (ret == -1, 0))4145return REG_ESPACE;4146}4147}41484149return REG_NOERROR;4150}41514152/* Calculate "eclosure" for all the node in DFA. */41534154static reg_errcode_t4155calc_eclosure (re_dfa_t *dfa)4156{4157int node_idx, incomplete;4158#ifdef DEBUG4159assert (dfa->nodes_len > 0);4160#endif4161incomplete = 0;4162/* For each nodes, calculate epsilon closure. */4163for (node_idx = 0; ; ++node_idx)4164{4165reg_errcode_t err;4166re_node_set eclosure_elem;4167if (node_idx == dfa->nodes_len)4168{4169if (!incomplete)4170break;4171incomplete = 0;4172node_idx = 0;4173}41744175#ifdef DEBUG4176assert (dfa->eclosures[node_idx].nelem != -1);4177#endif41784179/* If we have already calculated, skip it. */4180if (dfa->eclosures[node_idx].nelem != 0)4181continue;4182/* Calculate epsilon closure of `node_idx'. */4183err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);4184if (BE (err != REG_NOERROR, 0))4185return err;41864187if (dfa->eclosures[node_idx].nelem == 0)4188{4189incomplete = 1;4190re_node_set_free (&eclosure_elem);4191}4192}4193return REG_NOERROR;4194}41954196/* Calculate epsilon closure of NODE. 
*/41974198static reg_errcode_t4199calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)4200{4201reg_errcode_t err;4202unsigned int constraint;4203int i, incomplete;4204re_node_set eclosure;4205incomplete = 0;4206err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);4207if (BE (err != REG_NOERROR, 0))4208return err;42094210/* This indicates that we are calculating this node now.4211We reference this value to avoid infinite loop. */4212dfa->eclosures[node].nelem = -1;42134214constraint = ((dfa->nodes[node].type == ANCHOR)4215? dfa->nodes[node].opr.ctx_type : 0);4216/* If the current node has constraints, duplicate all nodes.4217Since they must inherit the constraints. */4218if (constraint4219&& dfa->edests[node].nelem4220&& !dfa->nodes[dfa->edests[node].elems[0]].duplicated)4221{4222err = duplicate_node_closure (dfa, node, node, node, constraint);4223if (BE (err != REG_NOERROR, 0))4224return err;4225}42264227/* Expand each epsilon destination nodes. */4228if (IS_EPSILON_NODE(dfa->nodes[node].type))4229for (i = 0; i < dfa->edests[node].nelem; ++i)4230{4231re_node_set eclosure_elem;4232int edest = dfa->edests[node].elems[i];4233/* If calculating the epsilon closure of `edest' is in progress,4234return intermediate result. */4235if (dfa->eclosures[edest].nelem == -1)4236{4237incomplete = 1;4238continue;4239}4240/* If we haven't calculated the epsilon closure of `edest' yet,4241calculate now. Otherwise use calculated epsilon closure. */4242if (dfa->eclosures[edest].nelem == 0)4243{4244err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);4245if (BE (err != REG_NOERROR, 0))4246return err;4247}4248else4249eclosure_elem = dfa->eclosures[edest];4250/* Merge the epsilon closure of `edest'. */4251re_node_set_merge (&eclosure, &eclosure_elem);4252/* If the epsilon closure of `edest' is incomplete,4253the epsilon closure of this node is also incomplete. 
*/4254if (dfa->eclosures[edest].nelem == 0)4255{4256incomplete = 1;4257re_node_set_free (&eclosure_elem);4258}4259}42604261/* Epsilon closures include itself. */4262re_node_set_insert (&eclosure, node);4263if (incomplete && !root)4264dfa->eclosures[node].nelem = 0;4265else4266dfa->eclosures[node] = eclosure;4267*new_set = eclosure;4268return REG_NOERROR;4269}42704271/* Functions for token which are used in the parser. */42724273/* Fetch a token from INPUT.4274We must not use this function inside bracket expressions. */42754276static void4277internal_function4278fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)4279{4280re_string_skip_bytes (input, peek_token (result, input, syntax));4281}42824283/* Peek a token from INPUT, and return the length of the token.4284We must not use this function inside bracket expressions. */42854286static int4287internal_function4288peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)4289{4290unsigned char c;42914292if (re_string_eoi (input))4293{4294token->type = END_OF_RE;4295return 0;4296}42974298c = re_string_peek_byte (input, 0);4299token->opr.c = c;43004301token->word_char = 0;4302#ifdef RE_ENABLE_I18N4303token->mb_partial = 0;4304if (input->mb_cur_max > 1 &&4305!re_string_first_byte (input, re_string_cur_idx (input)))4306{4307token->type = CHARACTER;4308token->mb_partial = 1;4309return 1;4310}4311#endif4312if (c == '\\')4313{4314unsigned char c2;4315if (re_string_cur_idx (input) + 1 >= re_string_length (input))4316{4317token->type = BACK_SLASH;4318return 1;4319}43204321c2 = re_string_peek_byte_case (input, 1);4322token->opr.c = c2;4323token->type = CHARACTER;4324#ifdef RE_ENABLE_I18N4325if (input->mb_cur_max > 1)4326{4327wint_t wc = re_string_wchar_at (input,4328re_string_cur_idx (input) + 1);4329token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;4330}4331else4332#endif4333token->word_char = IS_WORD_CHAR (c2) != 0;43344335switch (c2)4336{4337case '|':4338if (!(syntax & RE_LIMITED_OPS) && 
!(syntax & RE_NO_BK_VBAR))4339token->type = OP_ALT;4340break;4341case '1': case '2': case '3': case '4': case '5':4342case '6': case '7': case '8': case '9':4343if (!(syntax & RE_NO_BK_REFS))4344{4345token->type = OP_BACK_REF;4346token->opr.idx = c2 - '1';4347}4348break;4349case '<':4350if (!(syntax & RE_NO_GNU_OPS))4351{4352token->type = ANCHOR;4353token->opr.ctx_type = WORD_FIRST;4354}4355break;4356case '>':4357if (!(syntax & RE_NO_GNU_OPS))4358{4359token->type = ANCHOR;4360token->opr.ctx_type = WORD_LAST;4361}4362break;4363case 'b':4364if (!(syntax & RE_NO_GNU_OPS))4365{4366token->type = ANCHOR;4367token->opr.ctx_type = WORD_DELIM;4368}4369break;4370case 'B':4371if (!(syntax & RE_NO_GNU_OPS))4372{4373token->type = ANCHOR;4374token->opr.ctx_type = NOT_WORD_DELIM;4375}4376break;4377case 'w':4378if (!(syntax & RE_NO_GNU_OPS))4379token->type = OP_WORD;4380break;4381case 'W':4382if (!(syntax & RE_NO_GNU_OPS))4383token->type = OP_NOTWORD;4384break;4385case 's':4386if (!(syntax & RE_NO_GNU_OPS))4387token->type = OP_SPACE;4388break;4389case 'S':4390if (!(syntax & RE_NO_GNU_OPS))4391token->type = OP_NOTSPACE;4392break;4393case '`':4394if (!(syntax & RE_NO_GNU_OPS))4395{4396token->type = ANCHOR;4397token->opr.ctx_type = BUF_FIRST;4398}4399break;4400case '\'':4401if (!(syntax & RE_NO_GNU_OPS))4402{4403token->type = ANCHOR;4404token->opr.ctx_type = BUF_LAST;4405}4406break;4407case '(':4408if (!(syntax & RE_NO_BK_PARENS))4409token->type = OP_OPEN_SUBEXP;4410break;4411case ')':4412if (!(syntax & RE_NO_BK_PARENS))4413token->type = OP_CLOSE_SUBEXP;4414break;4415case '+':4416if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))4417token->type = OP_DUP_PLUS;4418break;4419case '?':4420if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))4421token->type = OP_DUP_QUESTION;4422break;4423case '{':4424if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))4425token->type = OP_OPEN_DUP_NUM;4426break;4427case '}':4428if ((syntax & RE_INTERVALS) && (!(syntax & 
RE_NO_BK_BRACES)))4429token->type = OP_CLOSE_DUP_NUM;4430break;4431default:4432break;4433}4434return 2;4435}44364437token->type = CHARACTER;4438#ifdef RE_ENABLE_I18N4439if (input->mb_cur_max > 1)4440{4441wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));4442token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;4443}4444else4445#endif4446token->word_char = IS_WORD_CHAR (token->opr.c);44474448switch (c)4449{4450case '\n':4451if (syntax & RE_NEWLINE_ALT)4452token->type = OP_ALT;4453break;4454case '|':4455if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))4456token->type = OP_ALT;4457break;4458case '*':4459token->type = OP_DUP_ASTERISK;4460break;4461case '+':4462if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))4463token->type = OP_DUP_PLUS;4464break;4465case '?':4466if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))4467token->type = OP_DUP_QUESTION;4468break;4469case '{':4470if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))4471token->type = OP_OPEN_DUP_NUM;4472break;4473case '}':4474if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))4475token->type = OP_CLOSE_DUP_NUM;4476break;4477case '(':4478if (syntax & RE_NO_BK_PARENS)4479token->type = OP_OPEN_SUBEXP;4480break;4481case ')':4482if (syntax & RE_NO_BK_PARENS)4483token->type = OP_CLOSE_SUBEXP;4484break;4485case '[':4486token->type = OP_OPEN_BRACKET;4487break;4488case '.':4489token->type = OP_PERIOD;4490break;4491case '^':4492if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&4493re_string_cur_idx (input) != 0)4494{4495char prev = re_string_peek_byte (input, -1);4496if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')4497break;4498}4499token->type = ANCHOR;4500token->opr.ctx_type = LINE_FIRST;4501break;4502case '$':4503if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&4504re_string_cur_idx (input) + 1 != re_string_length (input))4505{4506re_token_t next;4507re_string_skip_bytes (input, 1);4508peek_token (&next, input, syntax);4509re_string_skip_bytes (input, 
-1);4510if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)4511break;4512}4513token->type = ANCHOR;4514token->opr.ctx_type = LINE_LAST;4515break;4516default:4517break;4518}4519return 1;4520}45214522/* Peek a token from INPUT, and return the length of the token.4523We must not use this function out of bracket expressions. */45244525static int4526internal_function4527peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)4528{4529unsigned char c;4530if (re_string_eoi (input))4531{4532token->type = END_OF_RE;4533return 0;4534}4535c = re_string_peek_byte (input, 0);4536token->opr.c = c;45374538#ifdef RE_ENABLE_I18N4539if (input->mb_cur_max > 1 &&4540!re_string_first_byte (input, re_string_cur_idx (input)))4541{4542token->type = CHARACTER;4543return 1;4544}4545#endif /* RE_ENABLE_I18N */45464547if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)4548&& re_string_cur_idx (input) + 1 < re_string_length (input))4549{4550/* In this case, '\' escape a character. */4551unsigned char c2;4552re_string_skip_bytes (input, 1);4553c2 = re_string_peek_byte (input, 0);4554token->opr.c = c2;4555token->type = CHARACTER;4556return 1;4557}4558if (c == '[') /* '[' is a special char in a bracket exps. */4559{4560unsigned char c2;4561int token_len;4562if (re_string_cur_idx (input) + 1 < re_string_length (input))4563c2 = re_string_peek_byte (input, 1);4564else4565c2 = 0;4566token->opr.c = c2;4567token_len = 2;4568switch (c2)4569{4570case '.':4571token->type = OP_OPEN_COLL_ELEM;4572break;4573case '=':4574token->type = OP_OPEN_EQUIV_CLASS;4575break;4576case ':':4577if (syntax & RE_CHAR_CLASSES)4578{4579token->type = OP_OPEN_CHAR_CLASS;4580break;4581}4582/* else fall through. 
*/4583default:4584token->type = CHARACTER;4585token->opr.c = c;4586token_len = 1;4587break;4588}4589return token_len;4590}4591switch (c)4592{4593case '-':4594token->type = OP_CHARSET_RANGE;4595break;4596case ']':4597token->type = OP_CLOSE_BRACKET;4598break;4599case '^':4600token->type = OP_NON_MATCH_LIST;4601break;4602default:4603token->type = CHARACTER;4604}4605return 1;4606}46074608/* Functions for parser. */46094610/* Entry point of the parser.4611Parse the regular expression REGEXP and return the structure tree.4612If an error is occured, ERR is set by error code, and return NULL.4613This function build the following tree, from regular expression <reg_exp>:4614CAT4615/ \4616/ \4617<reg_exp> EOR46184619CAT means concatenation.4620EOR means end of regular expression. */46214622static bin_tree_t *4623parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,4624reg_errcode_t *err)4625{4626re_dfa_t *dfa = (re_dfa_t *) preg->buffer;4627bin_tree_t *tree, *eor, *root;4628re_token_t current_token;4629dfa->syntax = syntax;4630fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);4631tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);4632if (BE (*err != REG_NOERROR && tree == NULL, 0))4633return NULL;4634eor = create_tree (dfa, NULL, NULL, END_OF_RE);4635if (tree != NULL)4636root = create_tree (dfa, tree, eor, CONCAT);4637else4638root = eor;4639if (BE (eor == NULL || root == NULL, 0))4640{4641*err = REG_ESPACE;4642return NULL;4643}4644return root;4645}46464647/* This function build the following tree, from regular expression4648<branch1>|<branch2>:4649ALT4650/ \4651/ \4652<branch1> <branch2>46534654ALT means alternative, which represents the operator `|'. 
*/46554656static bin_tree_t *4657parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,4658reg_syntax_t syntax, int nest, reg_errcode_t *err)4659{4660re_dfa_t *dfa = (re_dfa_t *) preg->buffer;4661bin_tree_t *tree, *branch = NULL;4662tree = parse_branch (regexp, preg, token, syntax, nest, err);4663if (BE (*err != REG_NOERROR && tree == NULL, 0))4664return NULL;46654666while (token->type == OP_ALT)4667{4668fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);4669if (token->type != OP_ALT && token->type != END_OF_RE4670&& (nest == 0 || token->type != OP_CLOSE_SUBEXP))4671{4672branch = parse_branch (regexp, preg, token, syntax, nest, err);4673if (BE (*err != REG_NOERROR && branch == NULL, 0))4674return NULL;4675}4676else4677branch = NULL;4678tree = create_tree (dfa, tree, branch, OP_ALT);4679if (BE (tree == NULL, 0))4680{4681*err = REG_ESPACE;4682return NULL;4683}4684}4685return tree;4686}46874688/* This function build the following tree, from regular expression4689<exp1><exp2>:4690CAT4691/ \4692/ \4693<exp1> <exp2>46944695CAT means concatenation. */46964697static bin_tree_t *4698parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,4699reg_syntax_t syntax, int nest, reg_errcode_t *err)4700{4701bin_tree_t *tree, *exp;4702re_dfa_t *dfa = (re_dfa_t *) preg->buffer;4703tree = parse_expression (regexp, preg, token, syntax, nest, err);4704if (BE (*err != REG_NOERROR && tree == NULL, 0))4705return NULL;47064707while (token->type != OP_ALT && token->type != END_OF_RE4708&& (nest == 0 || token->type != OP_CLOSE_SUBEXP))4709{4710exp = parse_expression (regexp, preg, token, syntax, nest, err);4711if (BE (*err != REG_NOERROR && exp == NULL, 0))4712{4713return NULL;4714}4715if (tree != NULL && exp != NULL)4716{4717tree = create_tree (dfa, tree, exp, CONCAT);4718if (tree == NULL)4719{4720*err = REG_ESPACE;4721return NULL;4722}4723}4724else if (tree == NULL)4725tree = exp;4726/* Otherwise exp == NULL, we don't need to create new tree. 
*/4727}4728return tree;4729}47304731/* This function build the following tree, from regular expression a*:4732*4733|4734a4735*/47364737static bin_tree_t *4738parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,4739reg_syntax_t syntax, int nest, reg_errcode_t *err)4740{4741re_dfa_t *dfa = (re_dfa_t *) preg->buffer;4742bin_tree_t *tree;4743switch (token->type)4744{4745case CHARACTER:4746tree = create_token_tree (dfa, NULL, NULL, token);4747if (BE (tree == NULL, 0))4748{4749*err = REG_ESPACE;4750return NULL;4751}4752#ifdef RE_ENABLE_I18N4753if (dfa->mb_cur_max > 1)4754{4755while (!re_string_eoi (regexp)4756&& !re_string_first_byte (regexp, re_string_cur_idx (regexp)))4757{4758bin_tree_t *mbc_remain;4759fetch_token (token, regexp, syntax);4760mbc_remain = create_token_tree (dfa, NULL, NULL, token);4761tree = create_tree (dfa, tree, mbc_remain, CONCAT);4762if (BE (mbc_remain == NULL || tree == NULL, 0))4763{4764*err = REG_ESPACE;4765return NULL;4766}4767}4768}4769#endif4770break;4771case OP_OPEN_SUBEXP:4772tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);4773if (BE (*err != REG_NOERROR && tree == NULL, 0))4774return NULL;4775break;4776case OP_OPEN_BRACKET:4777tree = parse_bracket_exp (regexp, dfa, token, syntax, err);4778if (BE (*err != REG_NOERROR && tree == NULL, 0))4779return NULL;4780break;4781case OP_BACK_REF:4782if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))4783{4784*err = REG_ESUBREG;4785return NULL;4786}4787dfa->used_bkref_map |= 1 << token->opr.idx;4788tree = create_token_tree (dfa, NULL, NULL, token);4789if (BE (tree == NULL, 0))4790{4791*err = REG_ESPACE;4792return NULL;4793}4794++dfa->nbackref;4795dfa->has_mb_node = 1;4796break;4797case OP_OPEN_DUP_NUM:4798if (syntax & RE_CONTEXT_INVALID_DUP)4799{4800*err = REG_BADRPT;4801return NULL;4802}4803/* FALLTHROUGH */4804case OP_DUP_ASTERISK:4805case OP_DUP_PLUS:4806case OP_DUP_QUESTION:4807if (syntax & RE_CONTEXT_INVALID_OPS)4808{4809*err = REG_BADRPT;4810return 
NULL;4811}4812else if (syntax & RE_CONTEXT_INDEP_OPS)4813{4814fetch_token (token, regexp, syntax);4815return parse_expression (regexp, preg, token, syntax, nest, err);4816}4817/* else fall through */4818case OP_CLOSE_SUBEXP:4819if ((token->type == OP_CLOSE_SUBEXP) &&4820!(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))4821{4822*err = REG_ERPAREN;4823return NULL;4824}4825/* else fall through */4826case OP_CLOSE_DUP_NUM:4827/* We treat it as a normal character. */48284829/* Then we can these characters as normal characters. */4830token->type = CHARACTER;4831/* mb_partial and word_char bits should be initialized already4832by peek_token. */4833tree = create_token_tree (dfa, NULL, NULL, token);4834if (BE (tree == NULL, 0))4835{4836*err = REG_ESPACE;4837return NULL;4838}4839break;4840case ANCHOR:4841if ((token->opr.ctx_type4842& (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))4843&& dfa->word_ops_used == 0)4844init_word_char (dfa);4845if (token->opr.ctx_type == WORD_DELIM4846|| token->opr.ctx_type == NOT_WORD_DELIM)4847{4848bin_tree_t *tree_first, *tree_last;4849if (token->opr.ctx_type == WORD_DELIM)4850{4851token->opr.ctx_type = WORD_FIRST;4852tree_first = create_token_tree (dfa, NULL, NULL, token);4853token->opr.ctx_type = WORD_LAST;4854}4855else4856{4857token->opr.ctx_type = INSIDE_WORD;4858tree_first = create_token_tree (dfa, NULL, NULL, token);4859token->opr.ctx_type = INSIDE_NOTWORD;4860}4861tree_last = create_token_tree (dfa, NULL, NULL, token);4862tree = create_tree (dfa, tree_first, tree_last, OP_ALT);4863if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))4864{4865*err = REG_ESPACE;4866return NULL;4867}4868}4869else4870{4871tree = create_token_tree (dfa, NULL, NULL, token);4872if (BE (tree == NULL, 0))4873{4874*err = REG_ESPACE;4875return NULL;4876}4877}4878/* We must return here, since ANCHORs can't be followed4879by repetition operators.4880eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",4881it must not be "<ANCHOR(^)><REPEAT(*)>". 
*/4882fetch_token (token, regexp, syntax);4883return tree;4884case OP_PERIOD:4885tree = create_token_tree (dfa, NULL, NULL, token);4886if (BE (tree == NULL, 0))4887{4888*err = REG_ESPACE;4889return NULL;4890}4891if (dfa->mb_cur_max > 1)4892dfa->has_mb_node = 1;4893break;4894case OP_WORD:4895case OP_NOTWORD:4896tree = build_charclass_op (dfa, regexp->trans,4897(const unsigned char *) "alnum",4898(const unsigned char *) "_",4899token->type == OP_NOTWORD, err);4900if (BE (*err != REG_NOERROR && tree == NULL, 0))4901return NULL;4902break;4903case OP_SPACE:4904case OP_NOTSPACE:4905tree = build_charclass_op (dfa, regexp->trans,4906(const unsigned char *) "space",4907(const unsigned char *) "",4908token->type == OP_NOTSPACE, err);4909if (BE (*err != REG_NOERROR && tree == NULL, 0))4910return NULL;4911break;4912case OP_ALT:4913case END_OF_RE:4914return NULL;4915case BACK_SLASH:4916*err = REG_EESCAPE;4917return NULL;4918default:4919/* Must not happen? */4920#ifdef DEBUG4921assert (0);4922#endif4923return NULL;4924}4925fetch_token (token, regexp, syntax);49264927while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS4928|| token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)4929{4930tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);4931if (BE (*err != REG_NOERROR && tree == NULL, 0))4932return NULL;4933/* In BRE consecutive duplications are not allowed. 
*/4934if ((syntax & RE_CONTEXT_INVALID_DUP)4935&& (token->type == OP_DUP_ASTERISK4936|| token->type == OP_OPEN_DUP_NUM))4937{4938*err = REG_BADRPT;4939return NULL;4940}4941}49424943return tree;4944}49454946/* This function build the following tree, from regular expression4947(<reg_exp>):4948SUBEXP4949|4950<reg_exp>4951*/49524953static bin_tree_t *4954parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,4955reg_syntax_t syntax, int nest, reg_errcode_t *err)4956{4957re_dfa_t *dfa = (re_dfa_t *) preg->buffer;4958bin_tree_t *tree;4959size_t cur_nsub;4960cur_nsub = preg->re_nsub++;49614962fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);49634964/* The subexpression may be a null string. */4965if (token->type == OP_CLOSE_SUBEXP)4966tree = NULL;4967else4968{4969tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);4970if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))4971*err = REG_EPAREN;4972if (BE (*err != REG_NOERROR, 0))4973return NULL;4974}49754976if (cur_nsub <= '9' - '1')4977dfa->completed_bkref_map |= 1 << cur_nsub;49784979tree = create_tree (dfa, tree, NULL, SUBEXP);4980if (BE (tree == NULL, 0))4981{4982*err = REG_ESPACE;4983return NULL;4984}4985tree->token.opr.idx = cur_nsub;4986return tree;4987}49884989/* This function parse repetition operators like "*", "+", "{1,3}" etc. */49904991static bin_tree_t *4992parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,4993re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)4994{4995bin_tree_t *tree = NULL, *old_tree = NULL;4996int i, start, end, start_idx = re_string_cur_idx (regexp);4997re_token_t start_token = *token;49984999if (token->type == OP_OPEN_DUP_NUM)5000{5001end = 0;5002start = fetch_number (regexp, token, syntax);5003if (start == -1)5004{5005if (token->type == CHARACTER && token->opr.c == ',')5006start = 0; /* We treat "{,m}" as "{0,m}". */5007else5008{5009*err = REG_BADBR; /* <re>{} is invalid. 
*/5010return NULL;5011}5012}5013if (BE (start != -2, 1))5014{5015/* We treat "{n}" as "{n,n}". */5016end = ((token->type == OP_CLOSE_DUP_NUM) ? start5017: ((token->type == CHARACTER && token->opr.c == ',')5018? fetch_number (regexp, token, syntax) : -2));5019}5020if (BE (start == -2 || end == -2, 0))5021{5022/* Invalid sequence. */5023if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))5024{5025if (token->type == END_OF_RE)5026*err = REG_EBRACE;5027else5028*err = REG_BADBR;50295030return NULL;5031}50325033/* If the syntax bit is set, rollback. */5034re_string_set_index (regexp, start_idx);5035*token = start_token;5036token->type = CHARACTER;5037/* mb_partial and word_char bits should be already initialized by5038peek_token. */5039return elem;5040}50415042if (BE (end != -1 && start > end, 0))5043{5044/* First number greater than second. */5045*err = REG_BADBR;5046return NULL;5047}5048}5049else5050{5051start = (token->type == OP_DUP_PLUS) ? 1 : 0;5052end = (token->type == OP_DUP_QUESTION) ? 1 : -1;5053}50545055fetch_token (token, regexp, syntax);50565057if (BE (elem == NULL, 0))5058return NULL;5059if (BE (start == 0 && end == 0, 0))5060{5061postorder (elem, free_tree, NULL);5062return NULL;5063}50645065/* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */5066if (BE (start > 0, 0))5067{5068tree = elem;5069for (i = 2; i <= start; ++i)5070{5071elem = duplicate_tree (elem, dfa);5072tree = create_tree (dfa, tree, elem, CONCAT);5073if (BE (elem == NULL || tree == NULL, 0))5074goto parse_dup_op_espace;5075}50765077if (start == end)5078return tree;50795080/* Duplicate ELEM before it is marked optional. */5081elem = duplicate_tree (elem, dfa);5082old_tree = tree;5083}5084else5085old_tree = NULL;50865087if (elem->token.type == SUBEXP)5088postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);50895090tree = create_tree (dfa, elem, NULL, (end == -1 ? 
OP_DUP_ASTERISK : OP_ALT));5091if (BE (tree == NULL, 0))5092goto parse_dup_op_espace;50935094/* This loop is actually executed only when end != -1,5095to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have5096already created the start+1-th copy. */5097for (i = start + 2; i <= end; ++i)5098{5099elem = duplicate_tree (elem, dfa);5100tree = create_tree (dfa, tree, elem, CONCAT);5101if (BE (elem == NULL || tree == NULL, 0))5102goto parse_dup_op_espace;51035104tree = create_tree (dfa, tree, NULL, OP_ALT);5105if (BE (tree == NULL, 0))5106goto parse_dup_op_espace;5107}51085109if (old_tree)5110tree = create_tree (dfa, old_tree, tree, CONCAT);51115112return tree;51135114parse_dup_op_espace:5115*err = REG_ESPACE;5116return NULL;5117}51185119/* Size of the names for collating symbol/equivalence_class/character_class.5120I'm not sure, but maybe enough. */5121#define BRACKET_NAME_BUF_SIZE 3251225123#ifndef _LIBC5124/* Local function for parse_bracket_exp only used in case of NOT _LIBC.5125Build the range expression which starts from START_ELEM, and ends5126at END_ELEM. The result are written to MBCSET and SBCSET.5127RANGE_ALLOC is the allocated size of mbcset->range_starts, and5128mbcset->range_ends, is a pointer argument sinse we may5129update it. */51305131static reg_errcode_t5132internal_function5133# ifdef RE_ENABLE_I18N5134build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,5135bracket_elem_t *start_elem, bracket_elem_t *end_elem)5136# else /* not RE_ENABLE_I18N */5137build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,5138bracket_elem_t *end_elem)5139# endif /* not RE_ENABLE_I18N */5140{5141unsigned int start_ch, end_ch;5142/* Equivalence Classes and Character Classes can't be a range start/end. 
*/5143if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS5144|| end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,51450))5146return REG_ERANGE;51475148/* We can handle no multi character collating elements without libc5149support. */5150if (BE ((start_elem->type == COLL_SYM5151&& strlen ((char *) start_elem->opr.name) > 1)5152|| (end_elem->type == COLL_SYM5153&& strlen ((char *) end_elem->opr.name) > 1), 0))5154return REG_ECOLLATE;51555156# ifdef RE_ENABLE_I18N5157{5158wchar_t wc;5159wint_t start_wc;5160wint_t end_wc;5161wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};51625163start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch5164: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]5165: 0));5166end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch5167: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]5168: 0));5169start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)5170? __btowc (start_ch) : start_elem->opr.wch);5171end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)5172? __btowc (end_ch) : end_elem->opr.wch);5173if (start_wc == WEOF || end_wc == WEOF)5174return REG_ECOLLATE;5175cmp_buf[0] = start_wc;5176cmp_buf[4] = end_wc;5177if (wcscoll (cmp_buf, cmp_buf + 4) > 0)5178return REG_ERANGE;51795180/* Got valid collation sequence values, add them as a new entry.5181However, for !_LIBC we have no collation elements: if the5182character set is single byte, the single byte character set5183that we build below suffices. parse_bracket_exp passes5184no MBCSET if dfa->mb_cur_max == 1. */5185if (mbcset)5186{5187/* Check the space of the arrays. */5188if (BE (*range_alloc == mbcset->nranges, 0))5189{5190/* There is not enough space, need realloc. */5191wchar_t *new_array_start, *new_array_end;5192int new_nranges;51935194/* +1 in case of mbcset->nranges is 0. 
*/5195new_nranges = 2 * mbcset->nranges + 1;5196/* Use realloc since mbcset->range_starts and mbcset->range_ends5197are NULL if *range_alloc == 0. */5198new_array_start = re_realloc (mbcset->range_starts, wchar_t,5199new_nranges);5200new_array_end = re_realloc (mbcset->range_ends, wchar_t,5201new_nranges);52025203if (BE (new_array_start == NULL || new_array_end == NULL, 0))5204return REG_ESPACE;52055206mbcset->range_starts = new_array_start;5207mbcset->range_ends = new_array_end;5208*range_alloc = new_nranges;5209}52105211mbcset->range_starts[mbcset->nranges] = start_wc;5212mbcset->range_ends[mbcset->nranges++] = end_wc;5213}52145215/* Build the table for single byte characters. */5216for (wc = 0; wc < SBC_MAX; ++wc)5217{5218cmp_buf[2] = wc;5219if (wcscoll (cmp_buf, cmp_buf + 2) <= 05220&& wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)5221bitset_set (sbcset, wc);5222}5223}5224# else /* not RE_ENABLE_I18N */5225{5226unsigned int ch;5227start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch5228: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]5229: 0));5230end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch5231: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]5232: 0));5233if (start_ch > end_ch)5234return REG_ERANGE;5235/* Build the table for single byte characters. */5236for (ch = 0; ch < SBC_MAX; ++ch)5237if (start_ch <= ch && ch <= end_ch)5238bitset_set (sbcset, ch);5239}5240# endif /* not RE_ENABLE_I18N */5241return REG_NOERROR;5242}5243#endif /* not _LIBC */52445245#ifndef _LIBC5246/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..5247Build the collating element which is represented by NAME.5248The result are written to MBCSET and SBCSET.5249COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a5250pointer argument since we may update it. 
*/52515252static reg_errcode_t5253internal_function5254# ifdef RE_ENABLE_I18N5255build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,5256int *coll_sym_alloc, const unsigned char *name)5257# else /* not RE_ENABLE_I18N */5258build_collating_symbol (bitset_t sbcset, const unsigned char *name)5259# endif /* not RE_ENABLE_I18N */5260{5261size_t name_len = strlen ((const char *) name);5262if (BE (name_len != 1, 0))5263return REG_ECOLLATE;5264else5265{5266bitset_set (sbcset, name[0]);5267return REG_NOERROR;5268}5269}5270#endif /* not _LIBC */52715272/* This function parse bracket expression like "[abc]", "[a-c]",5273"[[.a-a.]]" etc. */52745275static bin_tree_t *5276parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,5277reg_syntax_t syntax, reg_errcode_t *err)5278{5279#ifdef _LIBC5280const unsigned char *collseqmb;5281const char *collseqwc;5282uint32_t nrules;5283int32_t table_size;5284const int32_t *symb_table;5285const unsigned char *extra;52865287/* Local function for parse_bracket_exp used in _LIBC environement.5288Seek the collating symbol entry correspondings to NAME.5289Return the index of the symbol in the SYMB_TABLE. */52905291auto inline int32_t5292__attribute ((always_inline))5293seek_collating_symbol_entry (name, name_len)5294const unsigned char *name;5295size_t name_len;5296{5297int32_t hash = elem_hash ((const char *) name, name_len);5298int32_t elem = hash % table_size;5299if (symb_table[2 * elem] != 0)5300{5301int32_t second = hash % (table_size - 2) + 1;53025303do5304{5305/* First compare the hashing value. */5306if (symb_table[2 * elem] == hash5307/* Compare the length of the name. */5308&& name_len == extra[symb_table[2 * elem + 1]]5309/* Compare the name. */5310&& memcmp (name, &extra[symb_table[2 * elem + 1] + 1],5311name_len) == 0)5312{5313/* Yep, this is the entry. */5314break;5315}53165317/* Next entry. 
*/5318elem += second;5319}5320while (symb_table[2 * elem] != 0);5321}5322return elem;5323}53245325/* Local function for parse_bracket_exp used in _LIBC environement.5326Look up the collation sequence value of BR_ELEM.5327Return the value if succeeded, UINT_MAX otherwise. */53285329auto inline unsigned int5330__attribute ((always_inline))5331lookup_collation_sequence_value (br_elem)5332bracket_elem_t *br_elem;5333{5334if (br_elem->type == SB_CHAR)5335{5336/*5337if (MB_CUR_MAX == 1)5338*/5339if (nrules == 0)5340return collseqmb[br_elem->opr.ch];5341else5342{5343wint_t wc = __btowc (br_elem->opr.ch);5344return __collseq_table_lookup (collseqwc, wc);5345}5346}5347else if (br_elem->type == MB_CHAR)5348{5349return __collseq_table_lookup (collseqwc, br_elem->opr.wch);5350}5351else if (br_elem->type == COLL_SYM)5352{5353size_t sym_name_len = strlen ((char *) br_elem->opr.name);5354if (nrules != 0)5355{5356int32_t elem, idx;5357elem = seek_collating_symbol_entry (br_elem->opr.name,5358sym_name_len);5359if (symb_table[2 * elem] != 0)5360{5361/* We found the entry. */5362idx = symb_table[2 * elem + 1];5363/* Skip the name of collating element name. */5364idx += 1 + extra[idx];5365/* Skip the byte sequence of the collating element. */5366idx += 1 + extra[idx];5367/* Adjust for the alignment. */5368idx = (idx + 3) & ~3;5369/* Skip the multibyte collation sequence value. */5370idx += sizeof (unsigned int);5371/* Skip the wide char sequence of the collating element. */5372idx += sizeof (unsigned int) *5373(1 + *(unsigned int *) (extra + idx));5374/* Return the collation sequence value. */5375return *(unsigned int *) (extra + idx);5376}5377else if (symb_table[2 * elem] == 0 && sym_name_len == 1)5378{5379/* No valid character. Match it as a single byte5380character. 
*/5381return collseqmb[br_elem->opr.name[0]];5382}5383}5384else if (sym_name_len == 1)5385return collseqmb[br_elem->opr.name[0]];5386}5387return UINT_MAX;5388}53895390/* Local function for parse_bracket_exp used in _LIBC environement.5391Build the range expression which starts from START_ELEM, and ends5392at END_ELEM. The result are written to MBCSET and SBCSET.5393RANGE_ALLOC is the allocated size of mbcset->range_starts, and5394mbcset->range_ends, is a pointer argument sinse we may5395update it. */53965397auto inline reg_errcode_t5398__attribute ((always_inline))5399build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)5400re_charset_t *mbcset;5401int *range_alloc;5402bitset_t sbcset;5403bracket_elem_t *start_elem, *end_elem;5404{5405unsigned int ch;5406uint32_t start_collseq;5407uint32_t end_collseq;54085409/* Equivalence Classes and Character Classes can't be a range5410start/end. */5411if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS5412|| end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,54130))5414return REG_ERANGE;54155416start_collseq = lookup_collation_sequence_value (start_elem);5417end_collseq = lookup_collation_sequence_value (end_elem);5418/* Check start/end collation sequence values. */5419if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))5420return REG_ECOLLATE;5421if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))5422return REG_ERANGE;54235424/* Got valid collation sequence values, add them as a new entry.5425However, if we have no collation elements, and the character set5426is single byte, the single byte character set that we5427build below suffices. */5428if (nrules > 0 || dfa->mb_cur_max > 1)5429{5430/* Check the space of the arrays. */5431if (BE (*range_alloc == mbcset->nranges, 0))5432{5433/* There is not enough space, need realloc. 
*/5434uint32_t *new_array_start;5435uint32_t *new_array_end;5436int new_nranges;54375438/* +1 in case of mbcset->nranges is 0. */5439new_nranges = 2 * mbcset->nranges + 1;5440new_array_start = re_realloc (mbcset->range_starts, uint32_t,5441new_nranges);5442new_array_end = re_realloc (mbcset->range_ends, uint32_t,5443new_nranges);54445445if (BE (new_array_start == NULL || new_array_end == NULL, 0))5446return REG_ESPACE;54475448mbcset->range_starts = new_array_start;5449mbcset->range_ends = new_array_end;5450*range_alloc = new_nranges;5451}54525453mbcset->range_starts[mbcset->nranges] = start_collseq;5454mbcset->range_ends[mbcset->nranges++] = end_collseq;5455}54565457/* Build the table for single byte characters. */5458for (ch = 0; ch < SBC_MAX; ch++)5459{5460uint32_t ch_collseq;5461/*5462if (MB_CUR_MAX == 1)5463*/5464if (nrules == 0)5465ch_collseq = collseqmb[ch];5466else5467ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));5468if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)5469bitset_set (sbcset, ch);5470}5471return REG_NOERROR;5472}54735474/* Local function for parse_bracket_exp used in _LIBC environement.5475Build the collating element which is represented by NAME.5476The result are written to MBCSET and SBCSET.5477COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a5478pointer argument sinse we may update it. */54795480auto inline reg_errcode_t5481__attribute ((always_inline))5482build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)5483re_charset_t *mbcset;5484int *coll_sym_alloc;5485bitset_t sbcset;5486const unsigned char *name;5487{5488int32_t elem, idx;5489size_t name_len = strlen ((const char *) name);5490if (nrules != 0)5491{5492elem = seek_collating_symbol_entry (name, name_len);5493if (symb_table[2 * elem] != 0)5494{5495/* We found the entry. */5496idx = symb_table[2 * elem + 1];5497/* Skip the name of collating element name. 
*/5498idx += 1 + extra[idx];5499}5500else if (symb_table[2 * elem] == 0 && name_len == 1)5501{5502/* No valid character, treat it as a normal5503character. */5504bitset_set (sbcset, name[0]);5505return REG_NOERROR;5506}5507else5508return REG_ECOLLATE;55095510/* Got valid collation sequence, add it as a new entry. */5511/* Check the space of the arrays. */5512if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))5513{5514/* Not enough, realloc it. */5515/* +1 in case of mbcset->ncoll_syms is 0. */5516int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;5517/* Use realloc since mbcset->coll_syms is NULL5518if *alloc == 0. */5519int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,5520new_coll_sym_alloc);5521if (BE (new_coll_syms == NULL, 0))5522return REG_ESPACE;5523mbcset->coll_syms = new_coll_syms;5524*coll_sym_alloc = new_coll_sym_alloc;5525}5526mbcset->coll_syms[mbcset->ncoll_syms++] = idx;5527return REG_NOERROR;5528}5529else5530{5531if (BE (name_len != 1, 0))5532return REG_ECOLLATE;5533else5534{5535bitset_set (sbcset, name[0]);5536return REG_NOERROR;5537}5538}5539}5540#endif55415542re_token_t br_token;5543re_bitset_ptr_t sbcset;5544#ifdef RE_ENABLE_I18N5545re_charset_t *mbcset;5546int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;5547int equiv_class_alloc = 0, char_class_alloc = 0;5548#endif /* not RE_ENABLE_I18N */5549int non_match = 0;5550bin_tree_t *work_tree;5551int token_len;5552int first_round = 1;5553#ifdef _LIBC5554collseqmb = (const unsigned char *)5555_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);5556nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);5557if (nrules)5558{5559/*5560if (MB_CUR_MAX > 1)5561*/5562collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);5563table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);5564symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,5565_NL_COLLATE_SYMB_TABLEMB);5566extra = (const unsigned char *) _NL_CURRENT 
(LC_COLLATE,5567_NL_COLLATE_SYMB_EXTRAMB);5568}5569#endif5570sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);5571#ifdef RE_ENABLE_I18N5572mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);5573#endif /* RE_ENABLE_I18N */5574#ifdef RE_ENABLE_I18N5575if (BE (sbcset == NULL || mbcset == NULL, 0))5576#else5577if (BE (sbcset == NULL, 0))5578#endif /* RE_ENABLE_I18N */5579{5580*err = REG_ESPACE;5581return NULL;5582}55835584token_len = peek_token_bracket (token, regexp, syntax);5585if (BE (token->type == END_OF_RE, 0))5586{5587*err = REG_BADPAT;5588goto parse_bracket_exp_free_return;5589}5590if (token->type == OP_NON_MATCH_LIST)5591{5592#ifdef RE_ENABLE_I18N5593mbcset->non_match = 1;5594#endif /* not RE_ENABLE_I18N */5595non_match = 1;5596if (syntax & RE_HAT_LISTS_NOT_NEWLINE)5597bitset_set (sbcset, '\0');5598re_string_skip_bytes (regexp, token_len); /* Skip a token. */5599token_len = peek_token_bracket (token, regexp, syntax);5600if (BE (token->type == END_OF_RE, 0))5601{5602*err = REG_BADPAT;5603goto parse_bracket_exp_free_return;5604}5605}56065607/* We treat the first ']' as a normal character. */5608if (token->type == OP_CLOSE_BRACKET)5609token->type = CHARACTER;56105611while (1)5612{5613bracket_elem_t start_elem, end_elem;5614unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];5615unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];5616reg_errcode_t ret;5617int token_len2 = 0, is_range_exp = 0;5618re_token_t token2;56195620start_elem.opr.name = start_name_buf;5621ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,5622syntax, first_round);5623if (BE (ret != REG_NOERROR, 0))5624{5625*err = ret;5626goto parse_bracket_exp_free_return;5627}5628first_round = 0;56295630/* Get information about the next token. We need it in any case. */5631token_len = peek_token_bracket (token, regexp, syntax);56325633/* Do not check for ranges if we know they are not allowed. 
*/5634if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)5635{5636if (BE (token->type == END_OF_RE, 0))5637{5638*err = REG_EBRACK;5639goto parse_bracket_exp_free_return;5640}5641if (token->type == OP_CHARSET_RANGE)5642{5643re_string_skip_bytes (regexp, token_len); /* Skip '-'. */5644token_len2 = peek_token_bracket (&token2, regexp, syntax);5645if (BE (token2.type == END_OF_RE, 0))5646{5647*err = REG_EBRACK;5648goto parse_bracket_exp_free_return;5649}5650if (token2.type == OP_CLOSE_BRACKET)5651{5652/* We treat the last '-' as a normal character. */5653re_string_skip_bytes (regexp, -token_len);5654token->type = CHARACTER;5655}5656else5657is_range_exp = 1;5658}5659}56605661if (is_range_exp == 1)5662{5663end_elem.opr.name = end_name_buf;5664ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,5665dfa, syntax, 1);5666if (BE (ret != REG_NOERROR, 0))5667{5668*err = ret;5669goto parse_bracket_exp_free_return;5670}56715672token_len = peek_token_bracket (token, regexp, syntax);56735674#ifdef _LIBC5675*err = build_range_exp (sbcset, mbcset, &range_alloc,5676&start_elem, &end_elem);5677#else5678# ifdef RE_ENABLE_I18N5679*err = build_range_exp (sbcset,5680dfa->mb_cur_max > 1 ? mbcset : NULL,5681&range_alloc, &start_elem, &end_elem);5682# else5683*err = build_range_exp (sbcset, &start_elem, &end_elem);5684# endif5685#endif /* RE_ENABLE_I18N */5686if (BE (*err != REG_NOERROR, 0))5687goto parse_bracket_exp_free_return;5688}5689else5690{5691switch (start_elem.type)5692{5693case SB_CHAR:5694bitset_set (sbcset, start_elem.opr.ch);5695break;5696#ifdef RE_ENABLE_I18N5697case MB_CHAR:5698/* Check whether the array has enough space. */5699if (BE (mbchar_alloc == mbcset->nmbchars, 0))5700{5701wchar_t *new_mbchars;5702/* Not enough, realloc it. */5703/* +1 in case of mbcset->nmbchars is 0. */5704mbchar_alloc = 2 * mbcset->nmbchars + 1;5705/* Use realloc since array is NULL if *alloc == 0. 
*/5706new_mbchars = re_realloc (mbcset->mbchars, wchar_t,5707mbchar_alloc);5708if (BE (new_mbchars == NULL, 0))5709goto parse_bracket_exp_espace;5710mbcset->mbchars = new_mbchars;5711}5712mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;5713break;5714#endif /* RE_ENABLE_I18N */5715case EQUIV_CLASS:5716*err = build_equiv_class (sbcset,5717#ifdef RE_ENABLE_I18N5718mbcset, &equiv_class_alloc,5719#endif /* RE_ENABLE_I18N */5720start_elem.opr.name);5721if (BE (*err != REG_NOERROR, 0))5722goto parse_bracket_exp_free_return;5723break;5724case COLL_SYM:5725*err = build_collating_symbol (sbcset,5726#ifdef RE_ENABLE_I18N5727mbcset, &coll_sym_alloc,5728#endif /* RE_ENABLE_I18N */5729start_elem.opr.name);5730if (BE (*err != REG_NOERROR, 0))5731goto parse_bracket_exp_free_return;5732break;5733case CHAR_CLASS:5734*err = build_charclass (regexp->trans, sbcset,5735#ifdef RE_ENABLE_I18N5736mbcset, &char_class_alloc,5737#endif /* RE_ENABLE_I18N */5738start_elem.opr.name, syntax);5739if (BE (*err != REG_NOERROR, 0))5740goto parse_bracket_exp_free_return;5741break;5742default:5743assert (0);5744break;5745}5746}5747if (BE (token->type == END_OF_RE, 0))5748{5749*err = REG_EBRACK;5750goto parse_bracket_exp_free_return;5751}5752if (token->type == OP_CLOSE_BRACKET)5753break;5754}57555756re_string_skip_bytes (regexp, token_len); /* Skip a token. */57575758/* If it is non-matching list. */5759if (non_match)5760bitset_not (sbcset);57615762#ifdef RE_ENABLE_I18N5763/* Ensure only single byte characters are set. */5764if (dfa->mb_cur_max > 1)5765bitset_mask (sbcset, dfa->sb_char);57665767if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes5768|| mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes5769|| mbcset->non_match)))5770{5771bin_tree_t *mbc_tree;5772int sbc_idx;5773/* Build a tree for complex bracket. 
*/5774dfa->has_mb_node = 1;5775br_token.type = COMPLEX_BRACKET;5776br_token.opr.mbcset = mbcset;5777mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);5778if (BE (mbc_tree == NULL, 0))5779goto parse_bracket_exp_espace;5780for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)5781if (sbcset[sbc_idx])5782break;5783/* If there are no bits set in sbcset, there is no point5784of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */5785if (sbc_idx < BITSET_WORDS)5786{5787/* Build a tree for simple bracket. */5788br_token.type = SIMPLE_BRACKET;5789br_token.opr.sbcset = sbcset;5790work_tree = create_token_tree (dfa, NULL, NULL, &br_token);5791if (BE (work_tree == NULL, 0))5792goto parse_bracket_exp_espace;57935794/* Then join them by ALT node. */5795work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);5796if (BE (work_tree == NULL, 0))5797goto parse_bracket_exp_espace;5798}5799else5800{5801re_free (sbcset);5802work_tree = mbc_tree;5803}5804}5805else5806#endif /* not RE_ENABLE_I18N */5807{5808#ifdef RE_ENABLE_I18N5809free_charset (mbcset);5810#endif5811/* Build a tree for simple bracket. */5812br_token.type = SIMPLE_BRACKET;5813br_token.opr.sbcset = sbcset;5814work_tree = create_token_tree (dfa, NULL, NULL, &br_token);5815if (BE (work_tree == NULL, 0))5816goto parse_bracket_exp_espace;5817}5818return work_tree;58195820parse_bracket_exp_espace:5821*err = REG_ESPACE;5822parse_bracket_exp_free_return:5823re_free (sbcset);5824#ifdef RE_ENABLE_I18N5825free_charset (mbcset);5826#endif /* RE_ENABLE_I18N */5827return NULL;5828}58295830/* Parse an element in the bracket expression. 
*/58315832static reg_errcode_t5833parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,5834re_token_t *token, int token_len, re_dfa_t *dfa,5835reg_syntax_t syntax, int accept_hyphen)5836{5837#ifdef RE_ENABLE_I18N5838int cur_char_size;5839cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));5840if (cur_char_size > 1)5841{5842elem->type = MB_CHAR;5843elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));5844re_string_skip_bytes (regexp, cur_char_size);5845return REG_NOERROR;5846}5847#endif /* RE_ENABLE_I18N */5848re_string_skip_bytes (regexp, token_len); /* Skip a token. */5849if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS5850|| token->type == OP_OPEN_EQUIV_CLASS)5851return parse_bracket_symbol (elem, regexp, token);5852if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)5853{5854/* A '-' must only appear as anything but a range indicator before5855the closing bracket. Everything else is an error. */5856re_token_t token2;5857(void) peek_token_bracket (&token2, regexp, syntax);5858if (token2.type != OP_CLOSE_BRACKET)5859/* The actual error value is not standardized since this whole5860case is undefined. But ERANGE makes good sense. */5861return REG_ERANGE;5862}5863elem->type = SB_CHAR;5864elem->opr.ch = token->opr.c;5865return REG_NOERROR;5866}58675868/* Parse a bracket symbol in the bracket expression. Bracket symbols are5869such as [:<character_class>:], [.<collating_element>.], and5870[=<equivalent_class>=]. 
*/58715872static reg_errcode_t5873parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,5874re_token_t *token)5875{5876unsigned char ch, delim = token->opr.c;5877int i = 0;5878if (re_string_eoi(regexp))5879return REG_EBRACK;5880for (;; ++i)5881{5882if (i >= BRACKET_NAME_BUF_SIZE)5883return REG_EBRACK;5884if (token->type == OP_OPEN_CHAR_CLASS)5885ch = re_string_fetch_byte_case (regexp);5886else5887ch = re_string_fetch_byte (regexp);5888if (re_string_eoi(regexp))5889return REG_EBRACK;5890if (ch == delim && re_string_peek_byte (regexp, 0) == ']')5891break;5892elem->opr.name[i] = ch;5893}5894re_string_skip_bytes (regexp, 1);5895elem->opr.name[i] = '\0';5896switch (token->type)5897{5898case OP_OPEN_COLL_ELEM:5899elem->type = COLL_SYM;5900break;5901case OP_OPEN_EQUIV_CLASS:5902elem->type = EQUIV_CLASS;5903break;5904case OP_OPEN_CHAR_CLASS:5905elem->type = CHAR_CLASS;5906break;5907default:5908break;5909}5910return REG_NOERROR;5911}59125913/* Helper function for parse_bracket_exp.5914Build the equivalence class which is represented by NAME.5915The result are written to MBCSET and SBCSET.5916EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,5917is a pointer argument sinse we may update it. */59185919static reg_errcode_t5920#ifdef RE_ENABLE_I18N5921build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,5922int *equiv_class_alloc, const unsigned char *name)5923#else /* not RE_ENABLE_I18N */5924build_equiv_class (bitset_t sbcset, const unsigned char *name)5925#endif /* not RE_ENABLE_I18N */5926{5927#ifdef _LIBC5928uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);5929if (nrules != 0)5930{5931const int32_t *table, *indirect;5932const unsigned char *weights, *extra, *cp;5933unsigned char char_buf[2];5934int32_t idx1, idx2;5935unsigned int ch;5936size_t len;5937/* This #include defines a local function! */5938# include <locale/weight.h>5939/* Calculate the index for equivalence class. 
*/5940cp = name;5941table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);5942weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,5943_NL_COLLATE_WEIGHTMB);5944extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,5945_NL_COLLATE_EXTRAMB);5946indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,5947_NL_COLLATE_INDIRECTMB);5948idx1 = findidx (&cp);5949if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))5950/* This isn't a valid character. */5951return REG_ECOLLATE;59525953/* Build single byte matcing table for this equivalence class. */5954char_buf[1] = (unsigned char) '\0';5955len = weights[idx1];5956for (ch = 0; ch < SBC_MAX; ++ch)5957{5958char_buf[0] = ch;5959cp = char_buf;5960idx2 = findidx (&cp);5961/*5962idx2 = table[ch];5963*/5964if (idx2 == 0)5965/* This isn't a valid character. */5966continue;5967if (len == weights[idx2])5968{5969int cnt = 0;5970while (cnt <= len &&5971weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])5972++cnt;59735974if (cnt > len)5975bitset_set (sbcset, ch);5976}5977}5978/* Check whether the array has enough space. */5979if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))5980{5981/* Not enough, realloc it. */5982/* +1 in case of mbcset->nequiv_classes is 0. */5983int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;5984/* Use realloc since the array is NULL if *alloc == 0. 
*/5985int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,5986int32_t,5987new_equiv_class_alloc);5988if (BE (new_equiv_classes == NULL, 0))5989return REG_ESPACE;5990mbcset->equiv_classes = new_equiv_classes;5991*equiv_class_alloc = new_equiv_class_alloc;5992}5993mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;5994}5995else5996#endif /* _LIBC */5997{5998if (BE (strlen ((const char *) name) != 1, 0))5999return REG_ECOLLATE;6000bitset_set (sbcset, *name);6001}6002return REG_NOERROR;6003}60046005/* Helper function for parse_bracket_exp.6006Build the character class which is represented by NAME.6007The result are written to MBCSET and SBCSET.6008CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,6009is a pointer argument sinse we may update it. */60106011static reg_errcode_t6012#ifdef RE_ENABLE_I18N6013build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,6014re_charset_t *mbcset, int *char_class_alloc,6015const unsigned char *class_name, reg_syntax_t syntax)6016#else /* not RE_ENABLE_I18N */6017build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,6018const unsigned char *class_name, reg_syntax_t syntax)6019#endif /* not RE_ENABLE_I18N */6020{6021int i;6022const char *name = (const char *) class_name;60236024/* In case of REG_ICASE "upper" and "lower" match the both of6025upper and lower cases. */6026if ((syntax & RE_ICASE)6027&& (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))6028name = "alpha";60296030#ifdef RE_ENABLE_I18N6031/* Check the space of the arrays. */6032if (BE (*char_class_alloc == mbcset->nchar_classes, 0))6033{6034/* Not enough, realloc it. */6035/* +1 in case of mbcset->nchar_classes is 0. */6036int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;6037/* Use realloc since array is NULL if *alloc == 0. 
*/6038wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,6039new_char_class_alloc);6040if (BE (new_char_classes == NULL, 0))6041return REG_ESPACE;6042mbcset->char_classes = new_char_classes;6043*char_class_alloc = new_char_class_alloc;6044}6045mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);6046#endif /* RE_ENABLE_I18N */60476048#define BUILD_CHARCLASS_LOOP(ctype_func) \6049do { \6050if (BE (trans != NULL, 0)) \6051{ \6052for (i = 0; i < SBC_MAX; ++i) \6053if (ctype_func (i)) \6054bitset_set (sbcset, trans[i]); \6055} \6056else \6057{ \6058for (i = 0; i < SBC_MAX; ++i) \6059if (ctype_func (i)) \6060bitset_set (sbcset, i); \6061} \6062} while (0)60636064if (strcmp (name, "alnum") == 0)6065BUILD_CHARCLASS_LOOP (isalnum);6066else if (strcmp (name, "cntrl") == 0)6067BUILD_CHARCLASS_LOOP (iscntrl);6068else if (strcmp (name, "lower") == 0)6069BUILD_CHARCLASS_LOOP (islower);6070else if (strcmp (name, "space") == 0)6071BUILD_CHARCLASS_LOOP (isspace);6072else if (strcmp (name, "alpha") == 0)6073BUILD_CHARCLASS_LOOP (isalpha);6074else if (strcmp (name, "digit") == 0)6075BUILD_CHARCLASS_LOOP (isdigit);6076else if (strcmp (name, "print") == 0)6077BUILD_CHARCLASS_LOOP (isprint);6078else if (strcmp (name, "upper") == 0)6079BUILD_CHARCLASS_LOOP (isupper);6080else if (strcmp (name, "blank") == 0)6081BUILD_CHARCLASS_LOOP (isblank);6082else if (strcmp (name, "graph") == 0)6083BUILD_CHARCLASS_LOOP (isgraph);6084else if (strcmp (name, "punct") == 0)6085BUILD_CHARCLASS_LOOP (ispunct);6086else if (strcmp (name, "xdigit") == 0)6087BUILD_CHARCLASS_LOOP (isxdigit);6088else6089return REG_ECTYPE;60906091return REG_NOERROR;6092}60936094static bin_tree_t *6095build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,6096const unsigned char *class_name,6097const unsigned char *extra, int non_match,6098reg_errcode_t *err)6099{6100re_bitset_ptr_t sbcset;6101#ifdef RE_ENABLE_I18N6102re_charset_t *mbcset;6103int alloc = 0;6104#endif /* not RE_ENABLE_I18N 
*/6105reg_errcode_t ret;6106re_token_t br_token;6107bin_tree_t *tree;61086109sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);6110#ifdef RE_ENABLE_I18N6111mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);6112#endif /* RE_ENABLE_I18N */61136114#ifdef RE_ENABLE_I18N6115if (BE (sbcset == NULL || mbcset == NULL, 0))6116#else /* not RE_ENABLE_I18N */6117if (BE (sbcset == NULL, 0))6118#endif /* not RE_ENABLE_I18N */6119{6120*err = REG_ESPACE;6121return NULL;6122}61236124if (non_match)6125{6126#ifdef RE_ENABLE_I18N6127/*6128if (syntax & RE_HAT_LISTS_NOT_NEWLINE)6129bitset_set(cset->sbcset, '\0');6130*/6131mbcset->non_match = 1;6132#endif /* not RE_ENABLE_I18N */6133}61346135/* We don't care the syntax in this case. */6136ret = build_charclass (trans, sbcset,6137#ifdef RE_ENABLE_I18N6138mbcset, &alloc,6139#endif /* RE_ENABLE_I18N */6140class_name, 0);61416142if (BE (ret != REG_NOERROR, 0))6143{6144re_free (sbcset);6145#ifdef RE_ENABLE_I18N6146free_charset (mbcset);6147#endif /* RE_ENABLE_I18N */6148*err = ret;6149return NULL;6150}6151/* \w match '_' also. */6152for (; *extra; extra++)6153bitset_set (sbcset, *extra);61546155/* If it is non-matching list. */6156if (non_match)6157bitset_not (sbcset);61586159#ifdef RE_ENABLE_I18N6160/* Ensure only single byte characters are set. */6161if (dfa->mb_cur_max > 1)6162bitset_mask (sbcset, dfa->sb_char);6163#endif61646165/* Build a tree for simple bracket. */6166br_token.type = SIMPLE_BRACKET;6167br_token.opr.sbcset = sbcset;6168tree = create_token_tree (dfa, NULL, NULL, &br_token);6169if (BE (tree == NULL, 0))6170goto build_word_op_espace;61716172#ifdef RE_ENABLE_I18N6173if (dfa->mb_cur_max > 1)6174{6175bin_tree_t *mbc_tree;6176/* Build a tree for complex bracket. 
*/6177br_token.type = COMPLEX_BRACKET;6178br_token.opr.mbcset = mbcset;6179dfa->has_mb_node = 1;6180mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);6181if (BE (mbc_tree == NULL, 0))6182goto build_word_op_espace;6183/* Then join them by ALT node. */6184tree = create_tree (dfa, tree, mbc_tree, OP_ALT);6185if (BE (mbc_tree != NULL, 1))6186return tree;6187}6188else6189{6190free_charset (mbcset);6191return tree;6192}6193#else /* not RE_ENABLE_I18N */6194return tree;6195#endif /* not RE_ENABLE_I18N */61966197build_word_op_espace:6198re_free (sbcset);6199#ifdef RE_ENABLE_I18N6200free_charset (mbcset);6201#endif /* RE_ENABLE_I18N */6202*err = REG_ESPACE;6203return NULL;6204}62056206/* This is intended for the expressions like "a{1,3}".6207Fetch a number from `input', and return the number.6208Return -1, if the number field is empty like "{,1}".6209Return -2, If an error is occured. */62106211static int6212fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)6213{6214int num = -1;6215unsigned char c;6216while (1)6217{6218fetch_token (token, input, syntax);6219c = token->opr.c;6220if (BE (token->type == END_OF_RE, 0))6221return -2;6222if (token->type == OP_CLOSE_DUP_NUM || c == ',')6223break;6224num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)6225? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));6226num = (num > RE_DUP_MAX) ? -2 : num;6227}6228return num;6229}62306231#ifdef RE_ENABLE_I18N6232static void6233free_charset (re_charset_t *cset)6234{6235re_free (cset->mbchars);6236# ifdef _LIBC6237re_free (cset->coll_syms);6238re_free (cset->equiv_classes);6239re_free (cset->range_starts);6240re_free (cset->range_ends);6241# endif6242re_free (cset->char_classes);6243re_free (cset);6244}6245#endif /* RE_ENABLE_I18N */62466247/* Functions for binary tree operation. */62486249/* Create a tree node. 
*/62506251static bin_tree_t *6252create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,6253re_token_type_t type)6254{6255re_token_t t;6256t.type = type;6257return create_token_tree (dfa, left, right, &t);6258}62596260static bin_tree_t *6261create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,6262const re_token_t *token)6263{6264bin_tree_t *tree;6265if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))6266{6267bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);62686269if (storage == NULL)6270return NULL;6271storage->next = dfa->str_tree_storage;6272dfa->str_tree_storage = storage;6273dfa->str_tree_storage_idx = 0;6274}6275tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];62766277tree->parent = NULL;6278tree->left = left;6279tree->right = right;6280tree->token = *token;6281tree->token.duplicated = 0;6282tree->token.opt_subexp = 0;6283tree->first = NULL;6284tree->next = NULL;6285tree->node_idx = -1;62866287if (left != NULL)6288left->parent = tree;6289if (right != NULL)6290right->parent = tree;6291return tree;6292}62936294/* Mark the tree SRC as an optional subexpression.6295To be called from preorder or postorder. */62966297static reg_errcode_t6298mark_opt_subexp (void *extra, bin_tree_t *node)6299{6300int idx = (int) (long) extra;6301if (node->token.type == SUBEXP && node->token.opr.idx == idx)6302node->token.opt_subexp = 1;63036304return REG_NOERROR;6305}63066307/* Free the allocated memory inside NODE. */63086309static void6310free_token (re_token_t *node)6311{6312#ifdef RE_ENABLE_I18N6313if (node->type == COMPLEX_BRACKET && node->duplicated == 0)6314free_charset (node->opr.mbcset);6315else6316#endif /* RE_ENABLE_I18N */6317if (node->type == SIMPLE_BRACKET && node->duplicated == 0)6318re_free (node->opr.sbcset);6319}63206321/* Worker function for tree walking. Free the allocated memory inside NODE6322and its children. 
*/63236324static reg_errcode_t6325free_tree (void *extra, bin_tree_t *node)6326{6327free_token (&node->token);6328return REG_NOERROR;6329}633063316332/* Duplicate the node SRC, and return new node. This is a preorder6333visit similar to the one implemented by the generic visitor, but6334we need more infrastructure to maintain two parallel trees --- so,6335it's easier to duplicate. */63366337static bin_tree_t *6338duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)6339{6340const bin_tree_t *node;6341bin_tree_t *dup_root;6342bin_tree_t **p_new = &dup_root, *dup_node = root->parent;63436344for (node = root; ; )6345{6346/* Create a new tree and link it back to the current parent. */6347*p_new = create_token_tree (dfa, NULL, NULL, &node->token);6348if (*p_new == NULL)6349return NULL;6350(*p_new)->parent = dup_node;6351(*p_new)->token.duplicated = 1;6352dup_node = *p_new;63536354/* Go to the left node, or up and to the right. */6355if (node->left)6356{6357node = node->left;6358p_new = &dup_node->left;6359}6360else6361{6362const bin_tree_t *prev = NULL;6363while (node->right == prev || node->right == NULL)6364{6365prev = node;6366node = node->parent;6367dup_node = dup_node->parent;6368if (!node)6369return dup_root;6370}6371node = node->right;6372p_new = &dup_node->right;6373}6374}6375}63766377/******************************************************************************/6378/******************************************************************************/6379/******************************************************************************/6380/* GKINCLUDE #include "regexec.c" */6381/******************************************************************************/6382/******************************************************************************/6383/******************************************************************************/6384static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,6385int n) internal_function;6386static void match_ctx_clean 
(re_match_context_t *mctx) internal_function;6387static void match_ctx_free (re_match_context_t *cache) internal_function;6388static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,6389int str_idx, int from, int to)6390internal_function;6391static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)6392internal_function;6393static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,6394int str_idx) internal_function;6395static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,6396int node, int str_idx)6397internal_function;6398static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,6399re_dfastate_t **limited_sts, int last_node,6400int last_str_idx)6401internal_function;6402static reg_errcode_t re_search_internal (const regex_t *preg,6403const char *string, int length,6404int start, int range, int stop,6405size_t nmatch, regmatch_t pmatch[],6406int eflags) internal_function;6407static int re_search_2_stub (struct re_pattern_buffer *bufp,6408const char *string1, int length1,6409const char *string2, int length2,6410int start, int range, struct re_registers *regs,6411int stop, int ret_len) internal_function;6412static int re_search_stub (struct re_pattern_buffer *bufp,6413const char *string, int length, int start,6414int range, int stop, struct re_registers *regs,6415int ret_len) internal_function;6416static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,6417int nregs, int regs_allocated) internal_function;6418static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)6419internal_function;6420static int check_matching (re_match_context_t *mctx, int fl_longest_match,6421int *p_match_first) internal_function;6422static int check_halt_state_context (const re_match_context_t *mctx,6423const re_dfastate_t *state, int idx)6424internal_function;6425static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,6426regmatch_t 
*prev_idx_match, int cur_node,6427int cur_idx, int nmatch) internal_function;6428static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,6429int str_idx, int dest_node, int nregs,6430regmatch_t *regs,6431re_node_set *eps_via_nodes)6432internal_function;6433static reg_errcode_t set_regs (const regex_t *preg,6434const re_match_context_t *mctx,6435size_t nmatch, regmatch_t *pmatch,6436int fl_backtrack) internal_function;6437static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)6438internal_function;64396440#ifdef RE_ENABLE_I18N6441static int sift_states_iter_mb (const re_match_context_t *mctx,6442re_sift_context_t *sctx,6443int node_idx, int str_idx, int max_str_idx)6444internal_function;6445#endif /* RE_ENABLE_I18N */6446static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,6447re_sift_context_t *sctx)6448internal_function;6449static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,6450re_sift_context_t *sctx, int str_idx,6451re_node_set *cur_dest)6452internal_function;6453static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,6454re_sift_context_t *sctx,6455int str_idx,6456re_node_set *dest_nodes)6457internal_function;6458static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,6459re_node_set *dest_nodes,6460const re_node_set *candidates)6461internal_function;6462static int check_dst_limits (const re_match_context_t *mctx,6463re_node_set *limits,6464int dst_node, int dst_idx, int src_node,6465int src_idx) internal_function;6466static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,6467int boundaries, int subexp_idx,6468int from_node, int bkref_idx)6469internal_function;6470static int check_dst_limits_calc_pos (const re_match_context_t *mctx,6471int limit, int subexp_idx,6472int node, int str_idx,6473int bkref_idx) internal_function;6474static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,6475re_node_set *dest_nodes,6476const re_node_set 
*candidates,6477re_node_set *limits,6478struct re_backref_cache_entry *bkref_ents,6479int str_idx) internal_function;6480static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,6481re_sift_context_t *sctx,6482int str_idx, const re_node_set *candidates)6483internal_function;6484static reg_errcode_t merge_state_array (const re_dfa_t *dfa,6485re_dfastate_t **dst,6486re_dfastate_t **src, int num)6487internal_function;6488static re_dfastate_t *find_recover_state (reg_errcode_t *err,6489re_match_context_t *mctx) internal_function;6490static re_dfastate_t *transit_state (reg_errcode_t *err,6491re_match_context_t *mctx,6492re_dfastate_t *state) internal_function;6493static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,6494re_match_context_t *mctx,6495re_dfastate_t *next_state)6496internal_function;6497static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,6498re_node_set *cur_nodes,6499int str_idx) internal_function;6500#if 06501static re_dfastate_t *transit_state_sb (reg_errcode_t *err,6502re_match_context_t *mctx,6503re_dfastate_t *pstate)6504internal_function;6505#endif6506#ifdef RE_ENABLE_I18N6507static reg_errcode_t transit_state_mb (re_match_context_t *mctx,6508re_dfastate_t *pstate)6509internal_function;6510#endif /* RE_ENABLE_I18N */6511static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,6512const re_node_set *nodes)6513internal_function;6514static reg_errcode_t get_subexp (re_match_context_t *mctx,6515int bkref_node, int bkref_str_idx)6516internal_function;6517static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,6518const re_sub_match_top_t *sub_top,6519re_sub_match_last_t *sub_last,6520int bkref_node, int bkref_str)6521internal_function;6522static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,6523int subexp_idx, int type) internal_function;6524static reg_errcode_t check_arrival (re_match_context_t *mctx,6525state_array_t *path, int top_node,6526int top_str, int last_node, int 
last_str,6527int type) internal_function;6528static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,6529int str_idx,6530re_node_set *cur_nodes,6531re_node_set *next_nodes)6532internal_function;6533static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,6534re_node_set *cur_nodes,6535int ex_subexp, int type)6536internal_function;6537static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,6538re_node_set *dst_nodes,6539int target, int ex_subexp,6540int type) internal_function;6541static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,6542re_node_set *cur_nodes, int cur_str,6543int subexp_num, int type)6544internal_function;6545static int build_trtable (const re_dfa_t *dfa,6546re_dfastate_t *state) internal_function;6547#ifdef RE_ENABLE_I18N6548static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,6549const re_string_t *input, int idx)6550internal_function;6551# ifdef _LIBC6552static unsigned int find_collation_sequence_value (const unsigned char *mbs,6553size_t name_len)6554internal_function;6555# endif /* _LIBC */6556#endif /* RE_ENABLE_I18N */6557static int group_nodes_into_DFAstates (const re_dfa_t *dfa,6558const re_dfastate_t *state,6559re_node_set *states_node,6560bitset_t *states_ch) internal_function;6561static int check_node_accept (const re_match_context_t *mctx,6562const re_token_t *node, int idx)6563internal_function;6564static reg_errcode_t extend_buffers (re_match_context_t *mctx)6565internal_function;65666567/* Entry point for POSIX code. */65686569/* regexec searches for a given pattern, specified by PREG, in the6570string STRING.65716572If NMATCH is zero or REG_NOSUB was set in the cflags argument to6573`regcomp', we ignore PMATCH. 
Otherwise, we assume PMATCH has at6574least NMATCH elements, and we set them to the offsets of the6575corresponding matched substrings.65766577EFLAGS specifies `execution flags' which affect matching: if6578REG_NOTBOL is set, then ^ does not match at the beginning of the6579string; if REG_NOTEOL is set, then $ does not match at the end.65806581We return 0 if we find a match and REG_NOMATCH if not. */65826583int6584regexec (preg, string, nmatch, pmatch, eflags)6585const regex_t *__restrict preg;6586const char *__restrict string;6587size_t nmatch;6588regmatch_t pmatch[];6589int eflags;6590{6591reg_errcode_t err;6592int start, length;6593re_dfa_t *dfa = (re_dfa_t *) preg->buffer;65946595if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))6596return REG_BADPAT;65976598if (eflags & REG_STARTEND)6599{6600start = pmatch[0].rm_so;6601length = pmatch[0].rm_eo;6602}6603else6604{6605start = 0;6606length = strlen (string);6607}66086609__libc_lock_lock (dfa->lock);6610if (preg->no_sub)6611err = re_search_internal (preg, string, length, start, length - start,6612length, 0, NULL, eflags);6613else6614err = re_search_internal (preg, string, length, start, length - start,6615length, nmatch, pmatch, eflags);6616__libc_lock_unlock (dfa->lock);6617return err != REG_NOERROR;6618}66196620#ifdef _LIBC6621# include <shlib-compat.h>6622versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);66236624# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)6625__typeof__ (__regexec) __compat_regexec;66266627int6628attribute_compat_text_section6629__compat_regexec (const regex_t *__restrict preg,6630const char *__restrict string, size_t nmatch,6631regmatch_t pmatch[], int eflags)6632{6633return regexec (preg, string, nmatch, pmatch,6634eflags & (REG_NOTBOL | REG_NOTEOL));6635}6636compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);6637# endif6638#endif66396640/* Entry points for GNU code. 
*/

/* re_match, re_search, re_match_2, re_search_2

   The former two functions operate on STRING with length LENGTH,
   while the latter two operate on the concatenation of STRING1 and STRING2
   with lengths LENGTH1 and LENGTH2, respectively.

   re_match() matches the compiled pattern in BUFP against the string,
   starting at index START.

   re_search() first tries matching at index START, then it tries to match
   starting from index START + 1, and so on.  The last start position tried
   is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
   way as re_match().)

   The parameter STOP of re_{match,search}_2 specifies that no match exceeding
   the first STOP characters of the concatenation of the strings should be
   considered.

   If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
   and all groups are stored in REGS.  (For the "_2" variants, the offsets are
   computed relative to the concatenation, not relative to the individual
   strings.)

   On success, re_match* functions return the length of the match, re_search*
   return the position of the start of the match.  Return value -1 means no
   match was found and -2 indicates an internal error.  
*/66686669int6670re_match (bufp, string, length, start, regs)6671struct re_pattern_buffer *bufp;6672const char *string;6673int length, start;6674struct re_registers *regs;6675{6676return re_search_stub (bufp, string, length, start, 0, length, regs, 1);6677}6678#ifdef _LIBC6679weak_alias (__re_match, re_match)6680#endif66816682int6683re_search (bufp, string, length, start, range, regs)6684struct re_pattern_buffer *bufp;6685const char *string;6686int length, start, range;6687struct re_registers *regs;6688{6689return re_search_stub (bufp, string, length, start, range, length, regs, 0);6690}6691#ifdef _LIBC6692weak_alias (__re_search, re_search)6693#endif66946695int6696re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)6697struct re_pattern_buffer *bufp;6698const char *string1, *string2;6699int length1, length2, start, stop;6700struct re_registers *regs;6701{6702return re_search_2_stub (bufp, string1, length1, string2, length2,6703start, 0, regs, stop, 1);6704}6705#ifdef _LIBC6706weak_alias (__re_match_2, re_match_2)6707#endif67086709int6710re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)6711struct re_pattern_buffer *bufp;6712const char *string1, *string2;6713int length1, length2, start, range, stop;6714struct re_registers *regs;6715{6716return re_search_2_stub (bufp, string1, length1, string2, length2,6717start, range, regs, stop, 0);6718}6719#ifdef _LIBC6720weak_alias (__re_search_2, re_search_2)6721#endif67226723static int6724re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,6725stop, ret_len)6726struct re_pattern_buffer *bufp;6727const char *string1, *string2;6728int length1, length2, start, range, stop, ret_len;6729struct re_registers *regs;6730{6731const char *str;6732int rval;6733int len = length1 + length2;6734int free_str = 0;67356736if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))6737return -2;67386739/* Concatenate the strings. 
*/6740if (length2 > 0)6741if (length1 > 0)6742{6743char *s = re_malloc (char, len);67446745if (BE (s == NULL, 0))6746return -2;6747#ifdef _LIBC6748memcpy (__mempcpy (s, string1, length1), string2, length2);6749#else6750memcpy (s, string1, length1);6751memcpy (s + length1, string2, length2);6752#endif6753str = s;6754free_str = 1;6755}6756else6757str = string2;6758else6759str = string1;67606761rval = re_search_stub (bufp, str, len, start, range, stop, regs,6762ret_len);6763if (free_str)6764re_free ((char *) str);6765return rval;6766}67676768/* The parameters have the same meaning as those of re_search.6769Additional parameters:6770If RET_LEN is nonzero the length of the match is returned (re_match style);6771otherwise the position of the match is returned. */67726773static int6774re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)6775struct re_pattern_buffer *bufp;6776const char *string;6777int length, start, range, stop, ret_len;6778struct re_registers *regs;6779{6780reg_errcode_t result;6781regmatch_t *pmatch;6782int nregs, rval;6783int eflags = 0;6784re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;67856786/* Check for out-of-range. */6787if (BE (start < 0 || start > length, 0))6788return -1;6789if (BE (start + range > length, 0))6790range = length - start;6791else if (BE (start + range < 0, 0))6792range = -start;67936794__libc_lock_lock (dfa->lock);67956796eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;6797eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;67986799/* Compile fastmap if we haven't yet. */6800if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)6801re_compile_fastmap (bufp);68026803if (BE (bufp->no_sub, 0))6804regs = NULL;68056806/* We need at least 1 register. */6807if (regs == NULL)6808nregs = 1;6809else if (BE (bufp->regs_allocated == REGS_FIXED &&6810regs->num_regs < bufp->re_nsub + 1, 0))6811{6812nregs = regs->num_regs;6813if (BE (nregs < 1, 0))6814{6815/* Nothing can be copied to regs. 
*/6816regs = NULL;6817nregs = 1;6818}6819}6820else6821nregs = bufp->re_nsub + 1;6822pmatch = re_malloc (regmatch_t, nregs);6823if (BE (pmatch == NULL, 0))6824{6825rval = -2;6826goto out;6827}68286829result = re_search_internal (bufp, string, length, start, range, stop,6830nregs, pmatch, eflags);68316832rval = 0;68336834/* I hope we needn't fill ther regs with -1's when no match was found. */6835if (result != REG_NOERROR)6836rval = -1;6837else if (regs != NULL)6838{6839/* If caller wants register contents data back, copy them. */6840bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,6841bufp->regs_allocated);6842if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))6843rval = -2;6844}68456846if (BE (rval == 0, 1))6847{6848if (ret_len)6849{6850assert (pmatch[0].rm_so == start);6851rval = pmatch[0].rm_eo - start;6852}6853else6854rval = pmatch[0].rm_so;6855}6856re_free (pmatch);6857out:6858__libc_lock_unlock (dfa->lock);6859return rval;6860}68616862static unsigned6863re_copy_regs (regs, pmatch, nregs, regs_allocated)6864struct re_registers *regs;6865regmatch_t *pmatch;6866int nregs, regs_allocated;6867{6868int rval = REGS_REALLOCATE;6869int i;6870int need_regs = nregs + 1;6871/* We need one extra element beyond `num_regs' for the `-1' marker GNU code6872uses. */68736874/* Have the register data arrays been allocated? */6875if (regs_allocated == REGS_UNALLOCATED)6876{ /* No. So allocate them with malloc. */6877regs->start = re_malloc (regoff_t, need_regs);6878regs->end = re_malloc (regoff_t, need_regs);6879if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))6880return REGS_UNALLOCATED;6881regs->num_regs = need_regs;6882}6883else if (regs_allocated == REGS_REALLOCATE)6884{ /* Yes. If we need more elements than were already6885allocated, reallocate them. If we need fewer, just6886leave it alone. 
*/6887if (BE (need_regs > regs->num_regs, 0))6888{6889regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);6890regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);6891if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))6892return REGS_UNALLOCATED;6893regs->start = new_start;6894regs->end = new_end;6895regs->num_regs = need_regs;6896}6897}6898else6899{6900assert (regs_allocated == REGS_FIXED);6901/* This function may not be called with REGS_FIXED and nregs too big. */6902assert (regs->num_regs >= nregs);6903rval = REGS_FIXED;6904}69056906/* Copy the regs. */6907for (i = 0; i < nregs; ++i)6908{6909regs->start[i] = pmatch[i].rm_so;6910regs->end[i] = pmatch[i].rm_eo;6911}6912for ( ; i < regs->num_regs; ++i)6913regs->start[i] = regs->end[i] = -1;69146915return rval;6916}69176918/* Set REGS to hold NUM_REGS registers, storing them in STARTS and6919ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use6920this memory for recording register information. STARTS and ENDS6921must be allocated using the malloc library routine, and must each6922be at least NUM_REGS * sizeof (regoff_t) bytes long.69236924If NUM_REGS == 0, then subsequent matches should allocate their own6925register data.69266927Unless this function is called, the first search or match using6928PATTERN_BUFFER will allocate its own register data, without6929freeing the old data. 
*/69306931void6932re_set_registers (bufp, regs, num_regs, starts, ends)6933struct re_pattern_buffer *bufp;6934struct re_registers *regs;6935unsigned num_regs;6936regoff_t *starts, *ends;6937{6938if (num_regs)6939{6940bufp->regs_allocated = REGS_REALLOCATE;6941regs->num_regs = num_regs;6942regs->start = starts;6943regs->end = ends;6944}6945else6946{6947bufp->regs_allocated = REGS_UNALLOCATED;6948regs->num_regs = 0;6949regs->start = regs->end = (regoff_t *) 0;6950}6951}6952#ifdef _LIBC6953weak_alias (__re_set_registers, re_set_registers)6954#endif69556956/* Entry points compatible with 4.2 BSD regex library. We don't define6957them unless specifically requested. */69586959#if defined _REGEX_RE_COMP || defined _LIBC6960int6961# ifdef _LIBC6962weak_function6963# endif6964re_exec (s)6965const char *s;6966{6967return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);6968}6969#endif /* _REGEX_RE_COMP */69706971/* Internal entry point. */69726973/* Searches for a compiled pattern PREG in the string STRING, whose6974length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same6975mingings with regexec. 
START, and RANGE have the same meanings6976with re_search.6977Return REG_NOERROR if we find a match, and REG_NOMATCH if not,6978otherwise return the error code.6979Note: We assume front end functions already check ranges.6980(START + RANGE >= 0 && START + RANGE <= LENGTH) */69816982static reg_errcode_t6983re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,6984eflags)6985const regex_t *preg;6986const char *string;6987int length, start, range, stop, eflags;6988size_t nmatch;6989regmatch_t pmatch[];6990{6991reg_errcode_t err;6992const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;6993int left_lim, right_lim, incr;6994int fl_longest_match, match_first, match_kind, match_last = -1;6995int extra_nmatch;6996int sb, ch;6997#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)6998re_match_context_t mctx = { .dfa = dfa };6999#else7000re_match_context_t mctx;7001#endif7002char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate7003&& range && !preg->can_be_null) ? preg->fastmap : NULL;7004RE_TRANSLATE_TYPE t = preg->translate;70057006#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))7007memset (&mctx, '\0', sizeof (re_match_context_t));7008mctx.dfa = dfa;7009#endif70107011extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;7012nmatch -= extra_nmatch;70137014/* Check if the DFA haven't been compiled. */7015if (BE (preg->used == 0 || dfa->init_state == NULL7016|| dfa->init_state_word == NULL || dfa->init_state_nl == NULL7017|| dfa->init_state_begbuf == NULL, 0))7018return REG_NOMATCH;70197020#ifdef DEBUG7021/* We assume front-end functions already check them. */7022assert (start + range >= 0 && start + range <= length);7023#endif70247025/* If initial states with non-begbuf contexts have no elements,7026the regex must be anchored. If preg->newline_anchor is set,7027we'll never use init_state_nl, so do not check it. 
*/7028if (dfa->init_state->nodes.nelem == 07029&& dfa->init_state_word->nodes.nelem == 07030&& (dfa->init_state_nl->nodes.nelem == 07031|| !preg->newline_anchor))7032{7033if (start != 0 && start + range != 0)7034return REG_NOMATCH;7035start = range = 0;7036}70377038/* We must check the longest matching, if nmatch > 0. */7039fl_longest_match = (nmatch != 0 || dfa->nbackref);70407041err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,7042preg->translate, preg->syntax & RE_ICASE, dfa);7043if (BE (err != REG_NOERROR, 0))7044goto free_return;7045mctx.input.stop = stop;7046mctx.input.raw_stop = stop;7047mctx.input.newline_anchor = preg->newline_anchor;70487049err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);7050if (BE (err != REG_NOERROR, 0))7051goto free_return;70527053/* We will log all the DFA states through which the dfa pass,7054if nmatch > 1, or this dfa has "multibyte node", which is a7055back-reference or a node which can accept multibyte character or7056multi character collating element. */7057if (nmatch > 1 || dfa->has_mb_node)7058{7059mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);7060if (BE (mctx.state_log == NULL, 0))7061{7062err = REG_ESPACE;7063goto free_return;7064}7065}7066else7067mctx.state_log = NULL;70687069match_first = start;7070mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF7071: CONTEXT_NEWLINE | CONTEXT_BEGBUF;70727073/* Check incrementally whether of not the input string match. */7074incr = (range < 0) ? -1 : 1;7075left_lim = (range < 0) ? start + range : start;7076right_lim = (range < 0) ? start : start + range;7077sb = dfa->mb_cur_max == 1;7078match_kind =7079(fastmap7080? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)7081| (range >= 0 ? 2 : 0)7082| (t != NULL ? 
1 : 0))7083: 8);70847085for (;; match_first += incr)7086{7087err = REG_NOMATCH;7088if (match_first < left_lim || right_lim < match_first)7089goto free_return;70907091/* Advance as rapidly as possible through the string, until we7092find a plausible place to start matching. This may be done7093with varying efficiency, so there are various possibilities:7094only the most common of them are specialized, in order to7095save on code size. We use a switch statement for speed. */7096switch (match_kind)7097{7098case 8:7099/* No fastmap. */7100break;71017102case 7:7103/* Fastmap with single-byte translation, match forward. */7104while (BE (match_first < right_lim, 1)7105&& !fastmap[t[(unsigned char) string[match_first]]])7106++match_first;7107goto forward_match_found_start_or_reached_end;71087109case 6:7110/* Fastmap without translation, match forward. */7111while (BE (match_first < right_lim, 1)7112&& !fastmap[(unsigned char) string[match_first]])7113++match_first;71147115forward_match_found_start_or_reached_end:7116if (BE (match_first == right_lim, 0))7117{7118ch = match_first >= length7119? 0 : (unsigned char) string[match_first];7120if (!fastmap[t ? t[ch] : ch])7121goto free_return;7122}7123break;71247125case 4:7126case 5:7127/* Fastmap without multi-byte translation, match backwards. */7128while (match_first >= left_lim)7129{7130ch = match_first >= length7131? 0 : (unsigned char) string[match_first];7132if (fastmap[t ? t[ch] : ch])7133break;7134--match_first;7135}7136if (match_first < left_lim)7137goto free_return;7138break;71397140default:7141/* In this case, we can't determine easily the current byte,7142since it might be a component byte of a multibyte7143character. Then we use the constructed buffer instead. */7144for (;;)7145{7146/* If MATCH_FIRST is out of the valid range, reconstruct the7147buffers. 
*/7148unsigned int offset = match_first - mctx.input.raw_mbs_idx;7149if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))7150{7151err = re_string_reconstruct (&mctx.input, match_first,7152eflags);7153if (BE (err != REG_NOERROR, 0))7154goto free_return;71557156offset = match_first - mctx.input.raw_mbs_idx;7157}7158/* If MATCH_FIRST is out of the buffer, leave it as '\0'.7159Note that MATCH_FIRST must not be smaller than 0. */7160ch = (match_first >= length7161? 0 : re_string_byte_at (&mctx.input, offset));7162if (fastmap[ch])7163break;7164match_first += incr;7165if (match_first < left_lim || match_first > right_lim)7166{7167err = REG_NOMATCH;7168goto free_return;7169}7170}7171break;7172}71737174/* Reconstruct the buffers so that the matcher can assume that7175the matching starts from the beginning of the buffer. */7176err = re_string_reconstruct (&mctx.input, match_first, eflags);7177if (BE (err != REG_NOERROR, 0))7178goto free_return;71797180#ifdef RE_ENABLE_I18N7181/* Don't consider this char as a possible match start if it part,7182yet isn't the head, of a multibyte character. */7183if (!sb && !re_string_first_byte (&mctx.input, 0))7184continue;7185#endif71867187/* It seems to be appropriate one, then use the matcher. */7188/* We assume that the matching starts from 0. */7189mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;7190match_last = check_matching (&mctx, fl_longest_match,7191range >= 0 ? 
&match_first : NULL);7192if (match_last != -1)7193{7194if (BE (match_last == -2, 0))7195{7196err = REG_ESPACE;7197goto free_return;7198}7199else7200{7201mctx.match_last = match_last;7202if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)7203{7204re_dfastate_t *pstate = mctx.state_log[match_last];7205mctx.last_node = check_halt_state_context (&mctx, pstate,7206match_last);7207}7208if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)7209|| dfa->nbackref)7210{7211err = prune_impossible_nodes (&mctx);7212if (err == REG_NOERROR)7213break;7214if (BE (err != REG_NOMATCH, 0))7215goto free_return;7216match_last = -1;7217}7218else7219break; /* We found a match. */7220}7221}72227223match_ctx_clean (&mctx);7224}72257226#ifdef DEBUG7227assert (match_last != -1);7228assert (err == REG_NOERROR);7229#endif72307231/* Set pmatch[] if we need. */7232if (nmatch > 0)7233{7234int reg_idx;72357236/* Initialize registers. */7237for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)7238pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;72397240/* Set the points where matching start/end. */7241pmatch[0].rm_so = 0;7242pmatch[0].rm_eo = mctx.match_last;72437244if (!preg->no_sub && nmatch > 1)7245{7246err = set_regs (preg, &mctx, nmatch, pmatch,7247dfa->has_plural_match && dfa->nbackref > 0);7248if (BE (err != REG_NOERROR, 0))7249goto free_return;7250}72517252/* At last, add the offset to the each registers, since we slided7253the buffers so that we could assume that the matching starts7254from 0. */7255for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)7256if (pmatch[reg_idx].rm_so != -1)7257{7258#ifdef RE_ENABLE_I18N7259if (BE (mctx.input.offsets_needed != 0, 0))7260{7261pmatch[reg_idx].rm_so =7262(pmatch[reg_idx].rm_so == mctx.input.valid_len7263? mctx.input.valid_raw_len7264: mctx.input.offsets[pmatch[reg_idx].rm_so]);7265pmatch[reg_idx].rm_eo =7266(pmatch[reg_idx].rm_eo == mctx.input.valid_len7267? 
mctx.input.valid_raw_len7268: mctx.input.offsets[pmatch[reg_idx].rm_eo]);7269}7270#else7271assert (mctx.input.offsets_needed == 0);7272#endif7273pmatch[reg_idx].rm_so += match_first;7274pmatch[reg_idx].rm_eo += match_first;7275}7276for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)7277{7278pmatch[nmatch + reg_idx].rm_so = -1;7279pmatch[nmatch + reg_idx].rm_eo = -1;7280}72817282if (dfa->subexp_map)7283for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)7284if (dfa->subexp_map[reg_idx] != reg_idx)7285{7286pmatch[reg_idx + 1].rm_so7287= pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;7288pmatch[reg_idx + 1].rm_eo7289= pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;7290}7291}72927293free_return:7294re_free (mctx.state_log);7295if (dfa->nbackref)7296match_ctx_free (&mctx);7297re_string_destruct (&mctx.input);7298return err;7299}73007301static reg_errcode_t7302prune_impossible_nodes (mctx)7303re_match_context_t *mctx;7304{7305const re_dfa_t *const dfa = mctx->dfa;7306int halt_node, match_last;7307reg_errcode_t ret;7308re_dfastate_t **sifted_states;7309re_dfastate_t **lim_states = NULL;7310re_sift_context_t sctx;7311#ifdef DEBUG7312assert (mctx->state_log != NULL);7313#endif7314match_last = mctx->match_last;7315halt_node = mctx->last_node;7316sifted_states = re_malloc (re_dfastate_t *, match_last + 1);7317if (BE (sifted_states == NULL, 0))7318{7319ret = REG_ESPACE;7320goto free_return;7321}7322if (dfa->nbackref)7323{7324lim_states = re_malloc (re_dfastate_t *, match_last + 1);7325if (BE (lim_states == NULL, 0))7326{7327ret = REG_ESPACE;7328goto free_return;7329}7330while (1)7331{7332memset (lim_states, '\0',7333sizeof (re_dfastate_t *) * (match_last + 1));7334sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,7335match_last);7336ret = sift_states_backward (mctx, &sctx);7337re_node_set_free (&sctx.limits);7338if (BE (ret != REG_NOERROR, 0))7339goto free_return;7340if (sifted_states[0] != NULL || lim_states[0] != NULL)7341break;7342do7343{7344--match_last;7345if (match_last 
< 0)7346{7347ret = REG_NOMATCH;7348goto free_return;7349}7350} while (mctx->state_log[match_last] == NULL7351|| !mctx->state_log[match_last]->halt);7352halt_node = check_halt_state_context (mctx,7353mctx->state_log[match_last],7354match_last);7355}7356ret = merge_state_array (dfa, sifted_states, lim_states,7357match_last + 1);7358re_free (lim_states);7359lim_states = NULL;7360if (BE (ret != REG_NOERROR, 0))7361goto free_return;7362}7363else7364{7365sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);7366ret = sift_states_backward (mctx, &sctx);7367re_node_set_free (&sctx.limits);7368if (BE (ret != REG_NOERROR, 0))7369goto free_return;7370}7371re_free (mctx->state_log);7372mctx->state_log = sifted_states;7373sifted_states = NULL;7374mctx->last_node = halt_node;7375mctx->match_last = match_last;7376ret = REG_NOERROR;7377free_return:7378re_free (sifted_states);7379re_free (lim_states);7380return ret;7381}73827383/* Acquire an initial state and return it.7384We must select appropriate initial state depending on the context,7385since initial states may have constraints like "\<", "^", etc.. */73867387static inline re_dfastate_t *7388__attribute ((always_inline)) internal_function7389acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,7390int idx)7391{7392const re_dfa_t *const dfa = mctx->dfa;7393if (dfa->init_state->has_constraint)7394{7395unsigned int context;7396context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);7397if (IS_WORD_CONTEXT (context))7398return dfa->init_state_word;7399else if (IS_ORDINARY_CONTEXT (context))7400return dfa->init_state;7401else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))7402return dfa->init_state_begbuf;7403else if (IS_NEWLINE_CONTEXT (context))7404return dfa->init_state_nl;7405else if (IS_BEGBUF_CONTEXT (context))7406{7407/* It is relatively rare case, then calculate on demand. 
*/7408return re_acquire_state_context (err, dfa,7409dfa->init_state->entrance_nodes,7410context);7411}7412else7413/* Must not happen? */7414return dfa->init_state;7415}7416else7417return dfa->init_state;7418}74197420/* Check whether the regular expression match input string INPUT or not,7421and return the index where the matching end, return -1 if not match,7422or return -2 in case of an error.7423FL_LONGEST_MATCH means we want the POSIX longest matching.7424If P_MATCH_FIRST is not NULL, and the match fails, it is set to the7425next place where we may want to try matching.7426Note that the matcher assume that the maching starts from the current7427index of the buffer. */74287429static int7430internal_function7431check_matching (re_match_context_t *mctx, int fl_longest_match,7432int *p_match_first)7433{7434const re_dfa_t *const dfa = mctx->dfa;7435reg_errcode_t err;7436int match = 0;7437int match_last = -1;7438int cur_str_idx = re_string_cur_idx (&mctx->input);7439re_dfastate_t *cur_state;7440int at_init_state = p_match_first != NULL;7441int next_start_idx = cur_str_idx;74427443err = REG_NOERROR;7444cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);7445/* An initial state must not be NULL (invalid). */7446if (BE (cur_state == NULL, 0))7447{7448assert (err == REG_ESPACE);7449return -2;7450}74517452if (mctx->state_log != NULL)7453{7454mctx->state_log[cur_str_idx] = cur_state;74557456/* Check OP_OPEN_SUBEXP in the initial state in case that we use them7457later. E.g. Processing back references. */7458if (BE (dfa->nbackref, 0))7459{7460at_init_state = 0;7461err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);7462if (BE (err != REG_NOERROR, 0))7463return err;74647465if (cur_state->has_backref)7466{7467err = transit_state_bkref (mctx, &cur_state->nodes);7468if (BE (err != REG_NOERROR, 0))7469return err;7470}7471}7472}74737474/* If the RE accepts NULL string. 
*/7475if (BE (cur_state->halt, 0))7476{7477if (!cur_state->has_constraint7478|| check_halt_state_context (mctx, cur_state, cur_str_idx))7479{7480if (!fl_longest_match)7481return cur_str_idx;7482else7483{7484match_last = cur_str_idx;7485match = 1;7486}7487}7488}74897490while (!re_string_eoi (&mctx->input))7491{7492re_dfastate_t *old_state = cur_state;7493int next_char_idx = re_string_cur_idx (&mctx->input) + 1;74947495if (BE (next_char_idx >= mctx->input.bufs_len, 0)7496|| (BE (next_char_idx >= mctx->input.valid_len, 0)7497&& mctx->input.valid_len < mctx->input.len))7498{7499err = extend_buffers (mctx);7500if (BE (err != REG_NOERROR, 0))7501{7502assert (err == REG_ESPACE);7503return -2;7504}7505}75067507cur_state = transit_state (&err, mctx, cur_state);7508if (mctx->state_log != NULL)7509cur_state = merge_state_with_log (&err, mctx, cur_state);75107511if (cur_state == NULL)7512{7513/* Reached the invalid state or an error. Try to recover a valid7514state using the state log, if available and if we have not7515already found a valid (even if not the longest) match. */7516if (BE (err != REG_NOERROR, 0))7517return -2;75187519if (mctx->state_log == NULL7520|| (match && !fl_longest_match)7521|| (cur_state = find_recover_state (&err, mctx)) == NULL)7522break;7523}75247525if (BE (at_init_state, 0))7526{7527if (old_state == cur_state)7528next_start_idx = next_char_idx;7529else7530at_init_state = 0;7531}75327533if (cur_state->halt)7534{7535/* Reached a halt state.7536Check the halt state can satisfy the current context. */7537if (!cur_state->has_constraint7538|| check_halt_state_context (mctx, cur_state,7539re_string_cur_idx (&mctx->input)))7540{7541/* We found an appropriate halt state. */7542match_last = re_string_cur_idx (&mctx->input);7543match = 1;75447545/* We found a match, do not modify match_first below. 
*/7546p_match_first = NULL;7547if (!fl_longest_match)7548break;7549}7550}7551}75527553if (p_match_first)7554*p_match_first += next_start_idx;75557556return match_last;7557}75587559/* Check NODE match the current context. */75607561static int7562internal_function7563check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)7564{7565re_token_type_t type = dfa->nodes[node].type;7566unsigned int constraint = dfa->nodes[node].constraint;7567if (type != END_OF_RE)7568return 0;7569if (!constraint)7570return 1;7571if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))7572return 0;7573return 1;7574}75757576/* Check the halt state STATE match the current context.7577Return 0 if not match, if the node, STATE has, is a halt node and7578match the context, return the node. */75797580static int7581internal_function7582check_halt_state_context (const re_match_context_t *mctx,7583const re_dfastate_t *state, int idx)7584{7585int i;7586unsigned int context;7587#ifdef DEBUG7588assert (state->halt);7589#endif7590context = re_string_context_at (&mctx->input, idx, mctx->eflags);7591for (i = 0; i < state->nodes.nelem; ++i)7592if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))7593return state->nodes.elems[i];7594return 0;7595}75967597/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA7598corresponding to the DFA).7599Return the destination node, and update EPS_VIA_NODES, return -1 in case7600of errors. 
*/76017602static int7603internal_function7604proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,7605int *pidx, int node, re_node_set *eps_via_nodes,7606struct re_fail_stack_t *fs)7607{7608const re_dfa_t *const dfa = mctx->dfa;7609int i, err;7610if (IS_EPSILON_NODE (dfa->nodes[node].type))7611{7612re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;7613re_node_set *edests = &dfa->edests[node];7614int dest_node;7615err = re_node_set_insert (eps_via_nodes, node);7616if (BE (err < 0, 0))7617return -2;7618/* Pick up a valid destination, or return -1 if none is found. */7619for (dest_node = -1, i = 0; i < edests->nelem; ++i)7620{7621int candidate = edests->elems[i];7622if (!re_node_set_contains (cur_nodes, candidate))7623continue;7624if (dest_node == -1)7625dest_node = candidate;76267627else7628{7629/* In order to avoid infinite loop like "(a*)*", return the second7630epsilon-transition if the first was already considered. */7631if (re_node_set_contains (eps_via_nodes, dest_node))7632return candidate;76337634/* Otherwise, push the second epsilon-transition on the fail stack. */7635else if (fs != NULL7636&& push_fail_stack (fs, *pidx, candidate, nregs, regs,7637eps_via_nodes))7638return -2;76397640/* We know we are going to exit. 
*/7641break;7642}7643}7644return dest_node;7645}7646else7647{7648int naccepted = 0;7649re_token_type_t type = dfa->nodes[node].type;76507651#ifdef RE_ENABLE_I18N7652if (dfa->nodes[node].accept_mb)7653naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);7654else7655#endif /* RE_ENABLE_I18N */7656if (type == OP_BACK_REF)7657{7658int subexp_idx = dfa->nodes[node].opr.idx + 1;7659naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;7660if (fs != NULL)7661{7662if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)7663return -1;7664else if (naccepted)7665{7666char *buf = (char *) re_string_get_buffer (&mctx->input);7667if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,7668naccepted) != 0)7669return -1;7670}7671}76727673if (naccepted == 0)7674{7675int dest_node;7676err = re_node_set_insert (eps_via_nodes, node);7677if (BE (err < 0, 0))7678return -2;7679dest_node = dfa->edests[node].elems[0];7680if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,7681dest_node))7682return dest_node;7683}7684}76857686if (naccepted != 07687|| check_node_accept (mctx, dfa->nodes + node, *pidx))7688{7689int dest_node = dfa->nexts[node];7690*pidx = (naccepted == 0) ? 
*pidx + 1 : *pidx + naccepted;7691if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL7692|| !re_node_set_contains (&mctx->state_log[*pidx]->nodes,7693dest_node)))7694return -1;7695re_node_set_empty (eps_via_nodes);7696return dest_node;7697}7698}7699return -1;7700}77017702static reg_errcode_t7703internal_function7704push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,7705int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)7706{7707reg_errcode_t err;7708int num = fs->num++;7709if (fs->num == fs->alloc)7710{7711struct re_fail_stack_ent_t *new_array;7712new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)7713* fs->alloc * 2));7714if (new_array == NULL)7715return REG_ESPACE;7716fs->alloc *= 2;7717fs->stack = new_array;7718}7719fs->stack[num].idx = str_idx;7720fs->stack[num].node = dest_node;7721fs->stack[num].regs = re_malloc (regmatch_t, nregs);7722if (fs->stack[num].regs == NULL)7723return REG_ESPACE;7724memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);7725err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);7726return err;7727}77287729static int7730internal_function7731pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,7732regmatch_t *regs, re_node_set *eps_via_nodes)7733{7734int num = --fs->num;7735assert (num >= 0);7736*pidx = fs->stack[num].idx;7737memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);7738re_node_set_free (eps_via_nodes);7739re_free (fs->stack[num].regs);7740*eps_via_nodes = fs->stack[num].eps_via_nodes;7741return fs->stack[num].node;7742}77437744/* Set the positions where the subexpressions are starts/ends to registers7745PMATCH.7746Note: We assume that pmatch[0] is already set, and7747pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. 
*/77487749static reg_errcode_t7750internal_function7751set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,7752regmatch_t *pmatch, int fl_backtrack)7753{7754const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;7755int idx, cur_node;7756re_node_set eps_via_nodes;7757struct re_fail_stack_t *fs;7758struct re_fail_stack_t fs_body = { 0, 2, NULL };7759regmatch_t *prev_idx_match;7760int prev_idx_match_malloced = 0;77617762#ifdef DEBUG7763assert (nmatch > 1);7764assert (mctx->state_log != NULL);7765#endif7766if (fl_backtrack)7767{7768fs = &fs_body;7769fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);7770if (fs->stack == NULL)7771return REG_ESPACE;7772}7773else7774fs = NULL;77757776cur_node = dfa->init_node;7777re_node_set_init_empty (&eps_via_nodes);77787779if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))7780prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));7781else7782{7783prev_idx_match = re_malloc (regmatch_t, nmatch);7784if (prev_idx_match == NULL)7785{7786free_fail_stack_return (fs);7787return REG_ESPACE;7788}7789prev_idx_match_malloced = 1;7790}7791memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);77927793for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)7794{7795update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);77967797if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)7798{7799int reg_idx;7800if (fs)7801{7802for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)7803if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)7804break;7805if (reg_idx == nmatch)7806{7807re_node_set_free (&eps_via_nodes);7808if (prev_idx_match_malloced)7809re_free (prev_idx_match);7810return free_fail_stack_return (fs);7811}7812cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,7813&eps_via_nodes);7814}7815else7816{7817re_node_set_free (&eps_via_nodes);7818if (prev_idx_match_malloced)7819re_free (prev_idx_match);7820return REG_NOERROR;7821}7822}78237824/* Proceed to next node. 
*/7825cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,7826&eps_via_nodes, fs);78277828if (BE (cur_node < 0, 0))7829{7830if (BE (cur_node == -2, 0))7831{7832re_node_set_free (&eps_via_nodes);7833if (prev_idx_match_malloced)7834re_free (prev_idx_match);7835free_fail_stack_return (fs);7836return REG_ESPACE;7837}7838if (fs)7839cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,7840&eps_via_nodes);7841else7842{7843re_node_set_free (&eps_via_nodes);7844if (prev_idx_match_malloced)7845re_free (prev_idx_match);7846return REG_NOMATCH;7847}7848}7849}7850re_node_set_free (&eps_via_nodes);7851if (prev_idx_match_malloced)7852re_free (prev_idx_match);7853return free_fail_stack_return (fs);7854}78557856static reg_errcode_t7857internal_function7858free_fail_stack_return (struct re_fail_stack_t *fs)7859{7860if (fs)7861{7862int fs_idx;7863for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)7864{7865re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);7866re_free (fs->stack[fs_idx].regs);7867}7868re_free (fs->stack);7869}7870return REG_NOERROR;7871}78727873static void7874internal_function7875update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,7876regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)7877{7878int type = dfa->nodes[cur_node].type;7879if (type == OP_OPEN_SUBEXP)7880{7881int reg_num = dfa->nodes[cur_node].opr.idx + 1;78827883/* We are at the first node of this sub expression. */7884if (reg_num < nmatch)7885{7886pmatch[reg_num].rm_so = cur_idx;7887pmatch[reg_num].rm_eo = -1;7888}7889}7890else if (type == OP_CLOSE_SUBEXP)7891{7892int reg_num = dfa->nodes[cur_node].opr.idx + 1;7893if (reg_num < nmatch)7894{7895/* We are at the last node of this sub expression. */7896if (pmatch[reg_num].rm_so < cur_idx)7897{7898pmatch[reg_num].rm_eo = cur_idx;7899/* This is a non-empty match or we are not inside an optional7900subexpression. Accept this right away. 
*/7901memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);7902}7903else7904{7905if (dfa->nodes[cur_node].opt_subexp7906&& prev_idx_match[reg_num].rm_so != -1)7907/* We transited through an empty match for an optional7908subexpression, like (a?)*, and this is not the subexp's7909first match. Copy back the old content of the registers7910so that matches of an inner subexpression are undone as7911well, like in ((a?))*. */7912memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);7913else7914/* We completed a subexpression, but it may be part of7915an optional one, so do not update PREV_IDX_MATCH. */7916pmatch[reg_num].rm_eo = cur_idx;7917}7918}7919}7920}79217922/* This function checks the STATE_LOG from the SCTX->last_str_idx to 07923and sift the nodes in each states according to the following rules.7924Updated state_log will be wrote to STATE_LOG.79257926Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...79271. When STR_IDX == MATCH_LAST(the last index in the state_log):7928If `a' isn't the LAST_NODE and `a' can't epsilon transit to7929the LAST_NODE, we throw away the node `a'.79302. When 0 <= STR_IDX < MATCH_LAST and `a' accepts7931string `s' and transit to `b':7932i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw7933away the node `a'.7934ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is7935thrown away, we throw away the node `a'.79363. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':7937i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the7938node `a'.7939ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,7940we throw away the node `a'. 
*/79417942#define STATE_NODE_CONTAINS(state,node) \7943((state) != NULL && re_node_set_contains (&(state)->nodes, node))79447945static reg_errcode_t7946internal_function7947sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)7948{7949reg_errcode_t err;7950int null_cnt = 0;7951int str_idx = sctx->last_str_idx;7952re_node_set cur_dest;79537954#ifdef DEBUG7955assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);7956#endif79577958/* Build sifted state_log[str_idx]. It has the nodes which can epsilon7959transit to the last_node and the last_node itself. */7960err = re_node_set_init_1 (&cur_dest, sctx->last_node);7961if (BE (err != REG_NOERROR, 0))7962return err;7963err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);7964if (BE (err != REG_NOERROR, 0))7965goto free_return;79667967/* Then check each states in the state_log. */7968while (str_idx > 0)7969{7970/* Update counters. */7971null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;7972if (null_cnt > mctx->max_mb_elem_len)7973{7974memset (sctx->sifted_states, '\0',7975sizeof (re_dfastate_t *) * str_idx);7976re_node_set_free (&cur_dest);7977return REG_NOERROR;7978}7979re_node_set_empty (&cur_dest);7980--str_idx;79817982if (mctx->state_log[str_idx])7983{7984err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);7985if (BE (err != REG_NOERROR, 0))7986goto free_return;7987}79887989/* Add all the nodes which satisfy the following conditions:7990- It can epsilon transit to a node in CUR_DEST.7991- It is in CUR_SRC.7992And update state_log. 
*/7993err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);7994if (BE (err != REG_NOERROR, 0))7995goto free_return;7996}7997err = REG_NOERROR;7998free_return:7999re_node_set_free (&cur_dest);8000return err;8001}80028003static reg_errcode_t8004internal_function8005build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,8006int str_idx, re_node_set *cur_dest)8007{8008const re_dfa_t *const dfa = mctx->dfa;8009const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;8010int i;80118012/* Then build the next sifted state.8013We build the next sifted state on `cur_dest', and update8014`sifted_states[str_idx]' with `cur_dest'.8015Note:8016`cur_dest' is the sifted state from `state_log[str_idx + 1]'.8017`cur_src' points the node_set of the old `state_log[str_idx]'8018(with the epsilon nodes pre-filtered out). */8019for (i = 0; i < cur_src->nelem; i++)8020{8021int prev_node = cur_src->elems[i];8022int naccepted = 0;8023int ret;80248025#ifdef DEBUG8026re_token_type_t type = dfa->nodes[prev_node].type;8027assert (!IS_EPSILON_NODE (type));8028#endif8029#ifdef RE_ENABLE_I18N8030/* If the node may accept `multi byte'. */8031if (dfa->nodes[prev_node].accept_mb)8032naccepted = sift_states_iter_mb (mctx, sctx, prev_node,8033str_idx, sctx->last_str_idx);8034#endif /* RE_ENABLE_I18N */80358036/* We don't check backreferences here.8037See update_cur_sifted_state(). 
*/8038if (!naccepted8039&& check_node_accept (mctx, dfa->nodes + prev_node, str_idx)8040&& STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],8041dfa->nexts[prev_node]))8042naccepted = 1;80438044if (naccepted == 0)8045continue;80468047if (sctx->limits.nelem)8048{8049int to_idx = str_idx + naccepted;8050if (check_dst_limits (mctx, &sctx->limits,8051dfa->nexts[prev_node], to_idx,8052prev_node, str_idx))8053continue;8054}8055ret = re_node_set_insert (cur_dest, prev_node);8056if (BE (ret == -1, 0))8057return REG_ESPACE;8058}80598060return REG_NOERROR;8061}80628063/* Helper functions. */80648065static reg_errcode_t8066internal_function8067clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)8068{8069int top = mctx->state_log_top;80708071if (next_state_log_idx >= mctx->input.bufs_len8072|| (next_state_log_idx >= mctx->input.valid_len8073&& mctx->input.valid_len < mctx->input.len))8074{8075reg_errcode_t err;8076err = extend_buffers (mctx);8077if (BE (err != REG_NOERROR, 0))8078return err;8079}80808081if (top < next_state_log_idx)8082{8083memset (mctx->state_log + top + 1, '\0',8084sizeof (re_dfastate_t *) * (next_state_log_idx - top));8085mctx->state_log_top = next_state_log_idx;8086}8087return REG_NOERROR;8088}80898090static reg_errcode_t8091internal_function8092merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,8093re_dfastate_t **src, int num)8094{8095int st_idx;8096reg_errcode_t err;8097for (st_idx = 0; st_idx < num; ++st_idx)8098{8099if (dst[st_idx] == NULL)8100dst[st_idx] = src[st_idx];8101else if (src[st_idx] != NULL)8102{8103re_node_set merged_set;8104err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,8105&src[st_idx]->nodes);8106if (BE (err != REG_NOERROR, 0))8107return err;8108dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);8109re_node_set_free (&merged_set);8110if (BE (err != REG_NOERROR, 0))8111return err;8112}8113}8114return REG_NOERROR;8115}81168117static 
reg_errcode_t8118internal_function8119update_cur_sifted_state (const re_match_context_t *mctx,8120re_sift_context_t *sctx, int str_idx,8121re_node_set *dest_nodes)8122{8123const re_dfa_t *const dfa = mctx->dfa;8124reg_errcode_t err = REG_NOERROR;8125const re_node_set *candidates;8126candidates = ((mctx->state_log[str_idx] == NULL) ? NULL8127: &mctx->state_log[str_idx]->nodes);81288129if (dest_nodes->nelem == 0)8130sctx->sifted_states[str_idx] = NULL;8131else8132{8133if (candidates)8134{8135/* At first, add the nodes which can epsilon transit to a node in8136DEST_NODE. */8137err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);8138if (BE (err != REG_NOERROR, 0))8139return err;81408141/* Then, check the limitations in the current sift_context. */8142if (sctx->limits.nelem)8143{8144err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,8145mctx->bkref_ents, str_idx);8146if (BE (err != REG_NOERROR, 0))8147return err;8148}8149}81508151sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);8152if (BE (err != REG_NOERROR, 0))8153return err;8154}81558156if (candidates && mctx->state_log[str_idx]->has_backref)8157{8158err = sift_states_bkref (mctx, sctx, str_idx, candidates);8159if (BE (err != REG_NOERROR, 0))8160return err;8161}8162return REG_NOERROR;8163}81648165static reg_errcode_t8166internal_function8167add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,8168const re_node_set *candidates)8169{8170reg_errcode_t err = REG_NOERROR;8171int i;81728173re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);8174if (BE (err != REG_NOERROR, 0))8175return err;81768177if (!state->inveclosure.alloc)8178{8179err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);8180if (BE (err != REG_NOERROR, 0))8181return REG_ESPACE;8182for (i = 0; i < dest_nodes->nelem; i++)8183re_node_set_merge (&state->inveclosure,8184dfa->inveclosures + dest_nodes->elems[i]);8185}8186return re_node_set_add_intersect (dest_nodes, 
candidates,8187&state->inveclosure);8188}81898190static reg_errcode_t8191internal_function8192sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,8193const re_node_set *candidates)8194{8195int ecl_idx;8196reg_errcode_t err;8197re_node_set *inv_eclosure = dfa->inveclosures + node;8198re_node_set except_nodes;8199re_node_set_init_empty (&except_nodes);8200for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)8201{8202int cur_node = inv_eclosure->elems[ecl_idx];8203if (cur_node == node)8204continue;8205if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))8206{8207int edst1 = dfa->edests[cur_node].elems[0];8208int edst2 = ((dfa->edests[cur_node].nelem > 1)8209? dfa->edests[cur_node].elems[1] : -1);8210if ((!re_node_set_contains (inv_eclosure, edst1)8211&& re_node_set_contains (dest_nodes, edst1))8212|| (edst2 > 08213&& !re_node_set_contains (inv_eclosure, edst2)8214&& re_node_set_contains (dest_nodes, edst2)))8215{8216err = re_node_set_add_intersect (&except_nodes, candidates,8217dfa->inveclosures + cur_node);8218if (BE (err != REG_NOERROR, 0))8219{8220re_node_set_free (&except_nodes);8221return err;8222}8223}8224}8225}8226for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)8227{8228int cur_node = inv_eclosure->elems[ecl_idx];8229if (!re_node_set_contains (&except_nodes, cur_node))8230{8231int idx = re_node_set_contains (dest_nodes, cur_node) - 1;8232re_node_set_remove_at (dest_nodes, idx);8233}8234}8235re_node_set_free (&except_nodes);8236return REG_NOERROR;8237}82388239static int8240internal_function8241check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,8242int dst_node, int dst_idx, int src_node, int src_idx)8243{8244const re_dfa_t *const dfa = mctx->dfa;8245int lim_idx, src_pos, dst_pos;82468247int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);8248int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);8249for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)8250{8251int subexp_idx;8252struct 
re_backref_cache_entry *ent;8253ent = mctx->bkref_ents + limits->elems[lim_idx];8254subexp_idx = dfa->nodes[ent->node].opr.idx;82558256dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],8257subexp_idx, dst_node, dst_idx,8258dst_bkref_idx);8259src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],8260subexp_idx, src_node, src_idx,8261src_bkref_idx);82628263/* In case of:8264<src> <dst> ( <subexp> )8265( <subexp> ) <src> <dst>8266( <subexp1> <src> <subexp2> <dst> <subexp3> ) */8267if (src_pos == dst_pos)8268continue; /* This is unrelated limitation. */8269else8270return 1;8271}8272return 0;8273}82748275static int8276internal_function8277check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,8278int subexp_idx, int from_node, int bkref_idx)8279{8280const re_dfa_t *const dfa = mctx->dfa;8281const re_node_set *eclosures = dfa->eclosures + from_node;8282int node_idx;82838284/* Else, we are on the boundary: examine the nodes on the epsilon8285closure. */8286for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)8287{8288int node = eclosures->elems[node_idx];8289switch (dfa->nodes[node].type)8290{8291case OP_BACK_REF:8292if (bkref_idx != -1)8293{8294struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;8295do8296{8297int dst, cpos;82988299if (ent->node != node)8300continue;83018302if (subexp_idx < BITSET_WORD_BITS8303&& !(ent->eps_reachable_subexps_map8304& ((bitset_word_t) 1 << subexp_idx)))8305continue;83068307/* Recurse trying to reach the OP_OPEN_SUBEXP and8308OP_CLOSE_SUBEXP cases below. 
But, if the8309destination node is the same node as the source8310node, don't recurse because it would cause an8311infinite loop: a regex that exhibits this behavior8312is ()\1*\1* */8313dst = dfa->edests[node].elems[0];8314if (dst == from_node)8315{8316if (boundaries & 1)8317return -1;8318else /* if (boundaries & 2) */8319return 0;8320}83218322cpos =8323check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,8324dst, bkref_idx);8325if (cpos == -1 /* && (boundaries & 1) */)8326return -1;8327if (cpos == 0 && (boundaries & 2))8328return 0;83298330if (subexp_idx < BITSET_WORD_BITS)8331ent->eps_reachable_subexps_map8332&= ~((bitset_word_t) 1 << subexp_idx);8333}8334while (ent++->more);8335}8336break;83378338case OP_OPEN_SUBEXP:8339if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)8340return -1;8341break;83428343case OP_CLOSE_SUBEXP:8344if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)8345return 0;8346break;83478348default:8349break;8350}8351}83528353return (boundaries & 2) ? 1 : 0;8354}83558356static int8357internal_function8358check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,8359int subexp_idx, int from_node, int str_idx,8360int bkref_idx)8361{8362struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;8363int boundaries;83648365/* If we are outside the range of the subexpression, return -1 or 1. */8366if (str_idx < lim->subexp_from)8367return -1;83688369if (lim->subexp_to < str_idx)8370return 1;83718372/* If we are within the subexpression, return 0. */8373boundaries = (str_idx == lim->subexp_from);8374boundaries |= (str_idx == lim->subexp_to) << 1;8375if (boundaries == 0)8376return 0;83778378/* Else, examine epsilon closure. */8379return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,8380from_node, bkref_idx);8381}83828383/* Check the limitations of sub expressions LIMITS, and remove the nodes8384which are against limitations from DEST_NODES. 
*/83858386static reg_errcode_t8387internal_function8388check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,8389const re_node_set *candidates, re_node_set *limits,8390struct re_backref_cache_entry *bkref_ents, int str_idx)8391{8392reg_errcode_t err;8393int node_idx, lim_idx;83948395for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)8396{8397int subexp_idx;8398struct re_backref_cache_entry *ent;8399ent = bkref_ents + limits->elems[lim_idx];84008401if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)8402continue; /* This is unrelated limitation. */84038404subexp_idx = dfa->nodes[ent->node].opr.idx;8405if (ent->subexp_to == str_idx)8406{8407int ops_node = -1;8408int cls_node = -1;8409for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)8410{8411int node = dest_nodes->elems[node_idx];8412re_token_type_t type = dfa->nodes[node].type;8413if (type == OP_OPEN_SUBEXP8414&& subexp_idx == dfa->nodes[node].opr.idx)8415ops_node = node;8416else if (type == OP_CLOSE_SUBEXP8417&& subexp_idx == dfa->nodes[node].opr.idx)8418cls_node = node;8419}84208421/* Check the limitation of the open subexpression. */8422/* Note that (ent->subexp_to = str_idx != ent->subexp_from). */8423if (ops_node >= 0)8424{8425err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,8426candidates);8427if (BE (err != REG_NOERROR, 0))8428return err;8429}84308431/* Check the limitation of the close subexpression. */8432if (cls_node >= 0)8433for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)8434{8435int node = dest_nodes->elems[node_idx];8436if (!re_node_set_contains (dfa->inveclosures + node,8437cls_node)8438&& !re_node_set_contains (dfa->eclosures + node,8439cls_node))8440{8441/* It is against this limitation.8442Remove it form the current sifted state. 
*/8443err = sub_epsilon_src_nodes (dfa, node, dest_nodes,8444candidates);8445if (BE (err != REG_NOERROR, 0))8446return err;8447--node_idx;8448}8449}8450}8451else /* (ent->subexp_to != str_idx) */8452{8453for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)8454{8455int node = dest_nodes->elems[node_idx];8456re_token_type_t type = dfa->nodes[node].type;8457if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)8458{8459if (subexp_idx != dfa->nodes[node].opr.idx)8460continue;8461/* It is against this limitation.8462Remove it form the current sifted state. */8463err = sub_epsilon_src_nodes (dfa, node, dest_nodes,8464candidates);8465if (BE (err != REG_NOERROR, 0))8466return err;8467}8468}8469}8470}8471return REG_NOERROR;8472}84738474static reg_errcode_t8475internal_function8476sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,8477int str_idx, const re_node_set *candidates)8478{8479const re_dfa_t *const dfa = mctx->dfa;8480reg_errcode_t err;8481int node_idx, node;8482re_sift_context_t local_sctx;8483int first_idx = search_cur_bkref_entry (mctx, str_idx);84848485if (first_idx == -1)8486return REG_NOERROR;84878488local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */84898490for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)8491{8492int enabled_idx;8493re_token_type_t type;8494struct re_backref_cache_entry *entry;8495node = candidates->elems[node_idx];8496type = dfa->nodes[node].type;8497/* Avoid infinite loop for the REs like "()\1+". */8498if (node == sctx->last_node && str_idx == sctx->last_str_idx)8499continue;8500if (type != OP_BACK_REF)8501continue;85028503entry = mctx->bkref_ents + first_idx;8504enabled_idx = first_idx;8505do8506{8507int subexp_len;8508int to_idx;8509int dst_node;8510int ret;8511re_dfastate_t *cur_state;85128513if (entry->node != node)8514continue;8515subexp_len = entry->subexp_to - entry->subexp_from;8516to_idx = str_idx + subexp_len;8517dst_node = (subexp_len ? 
dfa->nexts[node]8518: dfa->edests[node].elems[0]);85198520if (to_idx > sctx->last_str_idx8521|| sctx->sifted_states[to_idx] == NULL8522|| !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)8523|| check_dst_limits (mctx, &sctx->limits, node,8524str_idx, dst_node, to_idx))8525continue;85268527if (local_sctx.sifted_states == NULL)8528{8529local_sctx = *sctx;8530err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);8531if (BE (err != REG_NOERROR, 0))8532goto free_return;8533}8534local_sctx.last_node = node;8535local_sctx.last_str_idx = str_idx;8536ret = re_node_set_insert (&local_sctx.limits, enabled_idx);8537if (BE (ret < 0, 0))8538{8539err = REG_ESPACE;8540goto free_return;8541}8542cur_state = local_sctx.sifted_states[str_idx];8543err = sift_states_backward (mctx, &local_sctx);8544if (BE (err != REG_NOERROR, 0))8545goto free_return;8546if (sctx->limited_states != NULL)8547{8548err = merge_state_array (dfa, sctx->limited_states,8549local_sctx.sifted_states,8550str_idx + 1);8551if (BE (err != REG_NOERROR, 0))8552goto free_return;8553}8554local_sctx.sifted_states[str_idx] = cur_state;8555re_node_set_remove (&local_sctx.limits, enabled_idx);85568557/* mctx->bkref_ents may have changed, reload the pointer. */8558entry = mctx->bkref_ents + enabled_idx;8559}8560while (enabled_idx++, entry++->more);8561}8562err = REG_NOERROR;8563free_return:8564if (local_sctx.sifted_states != NULL)8565{8566re_node_set_free (&local_sctx.limits);8567}85688569return err;8570}857185728573#ifdef RE_ENABLE_I18N8574static int8575internal_function8576sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,8577int node_idx, int str_idx, int max_str_idx)8578{8579const re_dfa_t *const dfa = mctx->dfa;8580int naccepted;8581/* Check the node can accept `multi byte'. 
*/8582naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);8583if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&8584!STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],8585dfa->nexts[node_idx]))8586/* The node can't accept the `multi byte', or the8587destination was already thrown away, then the node8588could't accept the current input `multi byte'. */8589naccepted = 0;8590/* Otherwise, it is sure that the node could accept8591`naccepted' bytes input. */8592return naccepted;8593}8594#endif /* RE_ENABLE_I18N */859585968597/* Functions for state transition. */85988599/* Return the next state to which the current state STATE will transit by8600accepting the current input byte, and update STATE_LOG if necessary.8601If STATE can accept a multibyte char/collating element/back reference8602update the destination of STATE_LOG. */86038604static re_dfastate_t *8605internal_function8606transit_state (reg_errcode_t *err, re_match_context_t *mctx,8607re_dfastate_t *state)8608{8609re_dfastate_t **trtable;8610unsigned char ch;86118612#ifdef RE_ENABLE_I18N8613/* If the current state can accept multibyte. */8614if (BE (state->accept_mb, 0))8615{8616*err = transit_state_mb (mctx, state);8617if (BE (*err != REG_NOERROR, 0))8618return NULL;8619}8620#endif /* RE_ENABLE_I18N */86218622/* Then decide the next state with the single byte. 
*/8623#if 08624if (0)8625/* don't use transition table */8626return transit_state_sb (err, mctx, state);8627#endif86288629/* Use transition table */8630ch = re_string_fetch_byte (&mctx->input);8631for (;;)8632{8633trtable = state->trtable;8634if (BE (trtable != NULL, 1))8635return trtable[ch];86368637trtable = state->word_trtable;8638if (BE (trtable != NULL, 1))8639{8640unsigned int context;8641context8642= re_string_context_at (&mctx->input,8643re_string_cur_idx (&mctx->input) - 1,8644mctx->eflags);8645if (IS_WORD_CONTEXT (context))8646return trtable[ch + SBC_MAX];8647else8648return trtable[ch];8649}86508651if (!build_trtable (mctx->dfa, state))8652{8653*err = REG_ESPACE;8654return NULL;8655}86568657/* Retry, we now have a transition table. */8658}8659}86608661/* Update the state_log if we need */8662re_dfastate_t *8663internal_function8664merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,8665re_dfastate_t *next_state)8666{8667const re_dfa_t *const dfa = mctx->dfa;8668int cur_idx = re_string_cur_idx (&mctx->input);86698670if (cur_idx > mctx->state_log_top)8671{8672mctx->state_log[cur_idx] = next_state;8673mctx->state_log_top = cur_idx;8674}8675else if (mctx->state_log[cur_idx] == 0)8676{8677mctx->state_log[cur_idx] = next_state;8678}8679else8680{8681re_dfastate_t *pstate;8682unsigned int context;8683re_node_set next_nodes, *log_nodes, *table_nodes = NULL;8684/* If (state_log[cur_idx] != 0), it implies that cur_idx is8685the destination of a multibyte char/collating element/8686back reference. Then the next state is the union set of8687these destinations and the results of the transition table. 
*/8688pstate = mctx->state_log[cur_idx];8689log_nodes = pstate->entrance_nodes;8690if (next_state != NULL)8691{8692table_nodes = next_state->entrance_nodes;8693*err = re_node_set_init_union (&next_nodes, table_nodes,8694log_nodes);8695if (BE (*err != REG_NOERROR, 0))8696return NULL;8697}8698else8699next_nodes = *log_nodes;8700/* Note: We already add the nodes of the initial state,8701then we don't need to add them here. */87028703context = re_string_context_at (&mctx->input,8704re_string_cur_idx (&mctx->input) - 1,8705mctx->eflags);8706next_state = mctx->state_log[cur_idx]8707= re_acquire_state_context (err, dfa, &next_nodes, context);8708/* We don't need to check errors here, since the return value of8709this function is next_state and ERR is already set. */87108711if (table_nodes != NULL)8712re_node_set_free (&next_nodes);8713}87148715if (BE (dfa->nbackref, 0) && next_state != NULL)8716{8717/* Check OP_OPEN_SUBEXP in the current state in case that we use them8718later. We must check them here, since the back references in the8719next state might use them. */8720*err = check_subexp_matching_top (mctx, &next_state->nodes,8721cur_idx);8722if (BE (*err != REG_NOERROR, 0))8723return NULL;87248725/* If the next state has back references. */8726if (next_state->has_backref)8727{8728*err = transit_state_bkref (mctx, &next_state->nodes);8729if (BE (*err != REG_NOERROR, 0))8730return NULL;8731next_state = mctx->state_log[cur_idx];8732}8733}87348735return next_state;8736}87378738/* Skip bytes in the input that correspond to part of a8739multi-byte match, then look in the log for a state8740from which to restart matching. 
*/8741re_dfastate_t *8742internal_function8743find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)8744{8745re_dfastate_t *cur_state;8746do8747{8748int max = mctx->state_log_top;8749int cur_str_idx = re_string_cur_idx (&mctx->input);87508751do8752{8753if (++cur_str_idx > max)8754return NULL;8755re_string_skip_bytes (&mctx->input, 1);8756}8757while (mctx->state_log[cur_str_idx] == NULL);87588759cur_state = merge_state_with_log (err, mctx, NULL);8760}8761while (*err == REG_NOERROR && cur_state == NULL);8762return cur_state;8763}87648765/* Helper functions for transit_state. */87668767/* From the node set CUR_NODES, pick up the nodes whose types are8768OP_OPEN_SUBEXP and which have corresponding back references in the regular8769expression. And register them to use them later for evaluating the8770correspoding back references. */87718772static reg_errcode_t8773internal_function8774check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,8775int str_idx)8776{8777const re_dfa_t *const dfa = mctx->dfa;8778int node_idx;8779reg_errcode_t err;87808781/* TODO: This isn't efficient.8782Because there might be more than one nodes whose types are8783OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all8784nodes.8785E.g. RE: (a){2} */8786for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)8787{8788int node = cur_nodes->elems[node_idx];8789if (dfa->nodes[node].type == OP_OPEN_SUBEXP8790&& dfa->nodes[node].opr.idx < BITSET_WORD_BITS8791&& (dfa->used_bkref_map8792& ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))8793{8794err = match_ctx_add_subtop (mctx, node, str_idx);8795if (BE (err != REG_NOERROR, 0))8796return err;8797}8798}8799return REG_NOERROR;8800}88018802#if 08803/* Return the next state to which the current state STATE will transit by8804accepting the current input byte. 
*/88058806static re_dfastate_t *8807transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,8808re_dfastate_t *state)8809{8810const re_dfa_t *const dfa = mctx->dfa;8811re_node_set next_nodes;8812re_dfastate_t *next_state;8813int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);8814unsigned int context;88158816*err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);8817if (BE (*err != REG_NOERROR, 0))8818return NULL;8819for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)8820{8821int cur_node = state->nodes.elems[node_cnt];8822if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))8823{8824*err = re_node_set_merge (&next_nodes,8825dfa->eclosures + dfa->nexts[cur_node]);8826if (BE (*err != REG_NOERROR, 0))8827{8828re_node_set_free (&next_nodes);8829return NULL;8830}8831}8832}8833context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);8834next_state = re_acquire_state_context (err, dfa, &next_nodes, context);8835/* We don't need to check errors here, since the return value of8836this function is next_state and ERR is already set. 
*/88378838re_node_set_free (&next_nodes);8839re_string_skip_bytes (&mctx->input, 1);8840return next_state;8841}8842#endif88438844#ifdef RE_ENABLE_I18N8845static reg_errcode_t8846internal_function8847transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)8848{8849const re_dfa_t *const dfa = mctx->dfa;8850reg_errcode_t err;8851int i;88528853for (i = 0; i < pstate->nodes.nelem; ++i)8854{8855re_node_set dest_nodes, *new_nodes;8856int cur_node_idx = pstate->nodes.elems[i];8857int naccepted, dest_idx;8858unsigned int context;8859re_dfastate_t *dest_state;88608861if (!dfa->nodes[cur_node_idx].accept_mb)8862continue;88638864if (dfa->nodes[cur_node_idx].constraint)8865{8866context = re_string_context_at (&mctx->input,8867re_string_cur_idx (&mctx->input),8868mctx->eflags);8869if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,8870context))8871continue;8872}88738874/* How many bytes the node can accept? */8875naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,8876re_string_cur_idx (&mctx->input));8877if (naccepted == 0)8878continue;88798880/* The node can accepts `naccepted' bytes. */8881dest_idx = re_string_cur_idx (&mctx->input) + naccepted;8882mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? 
naccepted8883: mctx->max_mb_elem_len);8884err = clean_state_log_if_needed (mctx, dest_idx);8885if (BE (err != REG_NOERROR, 0))8886return err;8887#ifdef DEBUG8888assert (dfa->nexts[cur_node_idx] != -1);8889#endif8890new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];88918892dest_state = mctx->state_log[dest_idx];8893if (dest_state == NULL)8894dest_nodes = *new_nodes;8895else8896{8897err = re_node_set_init_union (&dest_nodes,8898dest_state->entrance_nodes, new_nodes);8899if (BE (err != REG_NOERROR, 0))8900return err;8901}8902context = re_string_context_at (&mctx->input, dest_idx - 1,8903mctx->eflags);8904mctx->state_log[dest_idx]8905= re_acquire_state_context (&err, dfa, &dest_nodes, context);8906if (dest_state != NULL)8907re_node_set_free (&dest_nodes);8908if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))8909return err;8910}8911return REG_NOERROR;8912}8913#endif /* RE_ENABLE_I18N */89148915static reg_errcode_t8916internal_function8917transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)8918{8919const re_dfa_t *const dfa = mctx->dfa;8920reg_errcode_t err;8921int i;8922int cur_str_idx = re_string_cur_idx (&mctx->input);89238924for (i = 0; i < nodes->nelem; ++i)8925{8926int dest_str_idx, prev_nelem, bkc_idx;8927int node_idx = nodes->elems[i];8928unsigned int context;8929const re_token_t *node = dfa->nodes + node_idx;8930re_node_set *new_dest_nodes;89318932/* Check whether `node' is a backreference or not. */8933if (node->type != OP_BACK_REF)8934continue;89358936if (node->constraint)8937{8938context = re_string_context_at (&mctx->input, cur_str_idx,8939mctx->eflags);8940if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))8941continue;8942}89438944/* `node' is a backreference.8945Check the substring which the substring matched. 
*/8946bkc_idx = mctx->nbkref_ents;8947err = get_subexp (mctx, node_idx, cur_str_idx);8948if (BE (err != REG_NOERROR, 0))8949goto free_return;89508951/* And add the epsilon closures (which is `new_dest_nodes') of8952the backreference to appropriate state_log. */8953#ifdef DEBUG8954assert (dfa->nexts[node_idx] != -1);8955#endif8956for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)8957{8958int subexp_len;8959re_dfastate_t *dest_state;8960struct re_backref_cache_entry *bkref_ent;8961bkref_ent = mctx->bkref_ents + bkc_idx;8962if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)8963continue;8964subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;8965new_dest_nodes = (subexp_len == 08966? dfa->eclosures + dfa->edests[node_idx].elems[0]8967: dfa->eclosures + dfa->nexts[node_idx]);8968dest_str_idx = (cur_str_idx + bkref_ent->subexp_to8969- bkref_ent->subexp_from);8970context = re_string_context_at (&mctx->input, dest_str_idx - 1,8971mctx->eflags);8972dest_state = mctx->state_log[dest_str_idx];8973prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 08974: mctx->state_log[cur_str_idx]->nodes.nelem);8975/* Add `new_dest_node' to state_log. */8976if (dest_state == NULL)8977{8978mctx->state_log[dest_str_idx]8979= re_acquire_state_context (&err, dfa, new_dest_nodes,8980context);8981if (BE (mctx->state_log[dest_str_idx] == NULL8982&& err != REG_NOERROR, 0))8983goto free_return;8984}8985else8986{8987re_node_set dest_nodes;8988err = re_node_set_init_union (&dest_nodes,8989dest_state->entrance_nodes,8990new_dest_nodes);8991if (BE (err != REG_NOERROR, 0))8992{8993re_node_set_free (&dest_nodes);8994goto free_return;8995}8996mctx->state_log[dest_str_idx]8997= re_acquire_state_context (&err, dfa, &dest_nodes, context);8998re_node_set_free (&dest_nodes);8999if (BE (mctx->state_log[dest_str_idx] == NULL9000&& err != REG_NOERROR, 0))9001goto free_return;9002}9003/* We need to check recursively if the backreference can epsilon9004transit. 
*/9005if (subexp_len == 09006&& mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)9007{9008err = check_subexp_matching_top (mctx, new_dest_nodes,9009cur_str_idx);9010if (BE (err != REG_NOERROR, 0))9011goto free_return;9012err = transit_state_bkref (mctx, new_dest_nodes);9013if (BE (err != REG_NOERROR, 0))9014goto free_return;9015}9016}9017}9018err = REG_NOERROR;9019free_return:9020return err;9021}90229023/* Enumerate all the candidates which the backreference BKREF_NODE can match9024at BKREF_STR_IDX, and register them by match_ctx_add_entry().9025Note that we might collect inappropriate candidates here.9026However, the cost of checking them strictly here is too high, then we9027delay these checking for prune_impossible_nodes(). */90289029static reg_errcode_t9030internal_function9031get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)9032{9033const re_dfa_t *const dfa = mctx->dfa;9034int subexp_num, sub_top_idx;9035const char *buf = (const char *) re_string_get_buffer (&mctx->input);9036/* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */9037int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);9038if (cache_idx != -1)9039{9040const struct re_backref_cache_entry *entry9041= mctx->bkref_ents + cache_idx;9042do9043if (entry->node == bkref_node)9044return REG_NOERROR; /* We already checked it. */9045while (entry++->more);9046}90479048subexp_num = dfa->nodes[bkref_node].opr.idx;90499050/* For each sub expression */9051for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)9052{9053reg_errcode_t err;9054re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];9055re_sub_match_last_t *sub_last;9056int sub_last_idx, sl_str, bkref_str_off;90579058if (dfa->nodes[sub_top->node].opr.idx != subexp_num)9059continue; /* It isn't related. */90609061sl_str = sub_top->str_idx;9062bkref_str_off = bkref_str_idx;9063/* At first, check the last node of sub expressions we already9064evaluated. 
*/9065for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)9066{9067int sl_str_diff;9068sub_last = sub_top->lasts[sub_last_idx];9069sl_str_diff = sub_last->str_idx - sl_str;9070/* The matched string by the sub expression match with the substring9071at the back reference? */9072if (sl_str_diff > 0)9073{9074if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))9075{9076/* Not enough chars for a successful match. */9077if (bkref_str_off + sl_str_diff > mctx->input.len)9078break;90799080err = clean_state_log_if_needed (mctx,9081bkref_str_off9082+ sl_str_diff);9083if (BE (err != REG_NOERROR, 0))9084return err;9085buf = (const char *) re_string_get_buffer (&mctx->input);9086}9087if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)9088/* We don't need to search this sub expression any more. */9089break;9090}9091bkref_str_off += sl_str_diff;9092sl_str += sl_str_diff;9093err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,9094bkref_str_idx);90959096/* Reload buf, since the preceding call might have reallocated9097the buffer. */9098buf = (const char *) re_string_get_buffer (&mctx->input);90999100if (err == REG_NOMATCH)9101continue;9102if (BE (err != REG_NOERROR, 0))9103return err;9104}91059106if (sub_last_idx < sub_top->nlasts)9107continue;9108if (sub_last_idx > 0)9109++sl_str;9110/* Then, search for the other last nodes of the sub expression. */9111for (; sl_str <= bkref_str_idx; ++sl_str)9112{9113int cls_node, sl_str_off;9114const re_node_set *nodes;9115sl_str_off = sl_str - sub_top->str_idx;9116/* The matched string by the sub expression match with the substring9117at the back reference? */9118if (sl_str_off > 0)9119{9120if (BE (bkref_str_off >= mctx->input.valid_len, 0))9121{9122/* If we are at the end of the input, we cannot match. 
*/9123if (bkref_str_off >= mctx->input.len)9124break;91259126err = extend_buffers (mctx);9127if (BE (err != REG_NOERROR, 0))9128return err;91299130buf = (const char *) re_string_get_buffer (&mctx->input);9131}9132if (buf [bkref_str_off++] != buf[sl_str - 1])9133break; /* We don't need to search this sub expression9134any more. */9135}9136if (mctx->state_log[sl_str] == NULL)9137continue;9138/* Does this state have a ')' of the sub expression? */9139nodes = &mctx->state_log[sl_str]->nodes;9140cls_node = find_subexp_node (dfa, nodes, subexp_num,9141OP_CLOSE_SUBEXP);9142if (cls_node == -1)9143continue; /* No. */9144if (sub_top->path == NULL)9145{9146sub_top->path = calloc (sizeof (state_array_t),9147sl_str - sub_top->str_idx + 1);9148if (sub_top->path == NULL)9149return REG_ESPACE;9150}9151/* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node9152in the current context? */9153err = check_arrival (mctx, sub_top->path, sub_top->node,9154sub_top->str_idx, cls_node, sl_str,9155OP_CLOSE_SUBEXP);9156if (err == REG_NOMATCH)9157continue;9158if (BE (err != REG_NOERROR, 0))9159return err;9160sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);9161if (BE (sub_last == NULL, 0))9162return REG_ESPACE;9163err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,9164bkref_str_idx);9165if (err == REG_NOMATCH)9166continue;9167}9168}9169return REG_NOERROR;9170}91719172/* Helper functions for get_subexp(). */91739174/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.9175If it can arrive, register the sub expression expressed with SUB_TOP9176and SUB_LAST. */91779178static reg_errcode_t9179internal_function9180get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,9181re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)9182{9183reg_errcode_t err;9184int to_idx;9185/* Can the subexpression arrive the back reference? 
*/9186err = check_arrival (mctx, &sub_last->path, sub_last->node,9187sub_last->str_idx, bkref_node, bkref_str,9188OP_OPEN_SUBEXP);9189if (err != REG_NOERROR)9190return err;9191err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,9192sub_last->str_idx);9193if (BE (err != REG_NOERROR, 0))9194return err;9195to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;9196return clean_state_log_if_needed (mctx, to_idx);9197}91989199/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.9200Search '(' if FL_OPEN, or search ')' otherwise.9201TODO: This function isn't efficient...9202Because there might be more than one nodes whose types are9203OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all9204nodes.9205E.g. RE: (a){2} */92069207static int9208internal_function9209find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,9210int subexp_idx, int type)9211{9212int cls_idx;9213for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)9214{9215int cls_node = nodes->elems[cls_idx];9216const re_token_t *node = dfa->nodes + cls_node;9217if (node->type == type9218&& node->opr.idx == subexp_idx)9219return cls_node;9220}9221return -1;9222}92239224/* Check whether the node TOP_NODE at TOP_STR can arrive to the node9225LAST_NODE at LAST_STR. We record the path onto PATH since it will be9226heavily reused.9227Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */92289229static reg_errcode_t9230internal_function9231check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,9232int top_str, int last_node, int last_str, int type)9233{9234const re_dfa_t *const dfa = mctx->dfa;9235reg_errcode_t err = REG_NOERROR;9236int subexp_num, backup_cur_idx, str_idx, null_cnt;9237re_dfastate_t *cur_state = NULL;9238re_node_set *cur_nodes, next_nodes;9239re_dfastate_t **backup_state_log;9240unsigned int context;92419242subexp_num = dfa->nodes[top_node].opr.idx;9243/* Extend the buffer if we need. 
*/9244if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))9245{9246re_dfastate_t **new_array;9247int old_alloc = path->alloc;9248path->alloc += last_str + mctx->max_mb_elem_len + 1;9249new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);9250if (BE (new_array == NULL, 0))9251{9252path->alloc = old_alloc;9253return REG_ESPACE;9254}9255path->array = new_array;9256memset (new_array + old_alloc, '\0',9257sizeof (re_dfastate_t *) * (path->alloc - old_alloc));9258}92599260str_idx = path->next_idx ? path->next_idx : top_str;92619262/* Temporary modify MCTX. */9263backup_state_log = mctx->state_log;9264backup_cur_idx = mctx->input.cur_idx;9265mctx->state_log = path->array;9266mctx->input.cur_idx = str_idx;92679268/* Setup initial node set. */9269context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);9270if (str_idx == top_str)9271{9272err = re_node_set_init_1 (&next_nodes, top_node);9273if (BE (err != REG_NOERROR, 0))9274return err;9275err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);9276if (BE (err != REG_NOERROR, 0))9277{9278re_node_set_free (&next_nodes);9279return err;9280}9281}9282else9283{9284cur_state = mctx->state_log[str_idx];9285if (cur_state && cur_state->has_backref)9286{9287err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);9288if (BE (err != REG_NOERROR, 0))9289return err;9290}9291else9292re_node_set_init_empty (&next_nodes);9293}9294if (str_idx == top_str || (cur_state && cur_state->has_backref))9295{9296if (next_nodes.nelem)9297{9298err = expand_bkref_cache (mctx, &next_nodes, str_idx,9299subexp_num, type);9300if (BE (err != REG_NOERROR, 0))9301{9302re_node_set_free (&next_nodes);9303return err;9304}9305}9306cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);9307if (BE (cur_state == NULL && err != REG_NOERROR, 0))9308{9309re_node_set_free (&next_nodes);9310return err;9311}9312mctx->state_log[str_idx] = cur_state;9313}93149315for (null_cnt = 0; str_idx < last_str && 
null_cnt <= mctx->max_mb_elem_len;)9316{9317re_node_set_empty (&next_nodes);9318if (mctx->state_log[str_idx + 1])9319{9320err = re_node_set_merge (&next_nodes,9321&mctx->state_log[str_idx + 1]->nodes);9322if (BE (err != REG_NOERROR, 0))9323{9324re_node_set_free (&next_nodes);9325return err;9326}9327}9328if (cur_state)9329{9330err = check_arrival_add_next_nodes (mctx, str_idx,9331&cur_state->non_eps_nodes,9332&next_nodes);9333if (BE (err != REG_NOERROR, 0))9334{9335re_node_set_free (&next_nodes);9336return err;9337}9338}9339++str_idx;9340if (next_nodes.nelem)9341{9342err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);9343if (BE (err != REG_NOERROR, 0))9344{9345re_node_set_free (&next_nodes);9346return err;9347}9348err = expand_bkref_cache (mctx, &next_nodes, str_idx,9349subexp_num, type);9350if (BE (err != REG_NOERROR, 0))9351{9352re_node_set_free (&next_nodes);9353return err;9354}9355}9356context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);9357cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);9358if (BE (cur_state == NULL && err != REG_NOERROR, 0))9359{9360re_node_set_free (&next_nodes);9361return err;9362}9363mctx->state_log[str_idx] = cur_state;9364null_cnt = cur_state == NULL ? null_cnt + 1 : 0;9365}9366re_node_set_free (&next_nodes);9367cur_nodes = (mctx->state_log[last_str] == NULL ? NULL9368: &mctx->state_log[last_str]->nodes);9369path->next_idx = str_idx;93709371/* Fix MCTX. */9372mctx->state_log = backup_state_log;9373mctx->input.cur_idx = backup_cur_idx;93749375/* Then check the current node set has the node LAST_NODE. */9376if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))9377return REG_NOERROR;93789379return REG_NOMATCH;9380}93819382/* Helper functions for check_arrival. 
*/93839384/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them9385to NEXT_NODES.9386TODO: This function is similar to the functions transit_state*(),9387however this function has many additional works.9388Can't we unify them? */93899390static reg_errcode_t9391internal_function9392check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,9393re_node_set *cur_nodes, re_node_set *next_nodes)9394{9395const re_dfa_t *const dfa = mctx->dfa;9396int result;9397int cur_idx;9398reg_errcode_t err = REG_NOERROR;9399re_node_set union_set;9400re_node_set_init_empty (&union_set);9401for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)9402{9403int naccepted = 0;9404int cur_node = cur_nodes->elems[cur_idx];9405#ifdef DEBUG9406re_token_type_t type = dfa->nodes[cur_node].type;9407assert (!IS_EPSILON_NODE (type));9408#endif9409#ifdef RE_ENABLE_I18N9410/* If the node may accept `multi byte'. */9411if (dfa->nodes[cur_node].accept_mb)9412{9413naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,9414str_idx);9415if (naccepted > 1)9416{9417re_dfastate_t *dest_state;9418int next_node = dfa->nexts[cur_node];9419int next_idx = str_idx + naccepted;9420dest_state = mctx->state_log[next_idx];9421re_node_set_empty (&union_set);9422if (dest_state)9423{9424err = re_node_set_merge (&union_set, &dest_state->nodes);9425if (BE (err != REG_NOERROR, 0))9426{9427re_node_set_free (&union_set);9428return err;9429}9430}9431result = re_node_set_insert (&union_set, next_node);9432if (BE (result < 0, 0))9433{9434re_node_set_free (&union_set);9435return REG_ESPACE;9436}9437mctx->state_log[next_idx] = re_acquire_state (&err, dfa,9438&union_set);9439if (BE (mctx->state_log[next_idx] == NULL9440&& err != REG_NOERROR, 0))9441{9442re_node_set_free (&union_set);9443return err;9444}9445}9446}9447#endif /* RE_ENABLE_I18N */9448if (naccepted9449|| check_node_accept (mctx, dfa->nodes + cur_node, str_idx))9450{9451result = re_node_set_insert (next_nodes, 
dfa->nexts[cur_node]);9452if (BE (result < 0, 0))9453{9454re_node_set_free (&union_set);9455return REG_ESPACE;9456}9457}9458}9459re_node_set_free (&union_set);9460return REG_NOERROR;9461}94629463/* For all the nodes in CUR_NODES, add the epsilon closures of them to9464CUR_NODES, however exclude the nodes which are:9465- inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.9466- out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.9467*/94689469static reg_errcode_t9470internal_function9471check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,9472int ex_subexp, int type)9473{9474reg_errcode_t err;9475int idx, outside_node;9476re_node_set new_nodes;9477#ifdef DEBUG9478assert (cur_nodes->nelem);9479#endif9480err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);9481if (BE (err != REG_NOERROR, 0))9482return err;9483/* Create a new node set NEW_NODES with the nodes which are epsilon9484closures of the node in CUR_NODES. */94859486for (idx = 0; idx < cur_nodes->nelem; ++idx)9487{9488int cur_node = cur_nodes->elems[idx];9489const re_node_set *eclosure = dfa->eclosures + cur_node;9490outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);9491if (outside_node == -1)9492{9493/* There are no problematic nodes, just merge them. */9494err = re_node_set_merge (&new_nodes, eclosure);9495if (BE (err != REG_NOERROR, 0))9496{9497re_node_set_free (&new_nodes);9498return err;9499}9500}9501else9502{9503/* There are problematic nodes, re-calculate incrementally. */9504err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,9505ex_subexp, type);9506if (BE (err != REG_NOERROR, 0))9507{9508re_node_set_free (&new_nodes);9509return err;9510}9511}9512}9513re_node_set_free (cur_nodes);9514*cur_nodes = new_nodes;9515return REG_NOERROR;9516}95179518/* Helper function for check_arrival_expand_ecl.9519Check incrementally the epsilon closure of TARGET, and if it isn't9520problematic append it to DST_NODES. 
*/95219522static reg_errcode_t9523internal_function9524check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,9525int target, int ex_subexp, int type)9526{9527int cur_node;9528for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)9529{9530int err;95319532if (dfa->nodes[cur_node].type == type9533&& dfa->nodes[cur_node].opr.idx == ex_subexp)9534{9535if (type == OP_CLOSE_SUBEXP)9536{9537err = re_node_set_insert (dst_nodes, cur_node);9538if (BE (err == -1, 0))9539return REG_ESPACE;9540}9541break;9542}9543err = re_node_set_insert (dst_nodes, cur_node);9544if (BE (err == -1, 0))9545return REG_ESPACE;9546if (dfa->edests[cur_node].nelem == 0)9547break;9548if (dfa->edests[cur_node].nelem == 2)9549{9550err = check_arrival_expand_ecl_sub (dfa, dst_nodes,9551dfa->edests[cur_node].elems[1],9552ex_subexp, type);9553if (BE (err != REG_NOERROR, 0))9554return err;9555}9556cur_node = dfa->edests[cur_node].elems[0];9557}9558return REG_NOERROR;9559}956095619562/* For all the back references in the current state, calculate the9563destination of the back references by the appropriate entry9564in MCTX->BKREF_ENTS. */95659566static reg_errcode_t9567internal_function9568expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,9569int cur_str, int subexp_num, int type)9570{9571const re_dfa_t *const dfa = mctx->dfa;9572reg_errcode_t err;9573int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);9574struct re_backref_cache_entry *ent;95759576if (cache_idx_start == -1)9577return REG_NOERROR;95789579restart:9580ent = mctx->bkref_ents + cache_idx_start;9581do9582{9583int to_idx, next_node;95849585/* Is this entry ENT is appropriate? */9586if (!re_node_set_contains (cur_nodes, ent->node))9587continue; /* No. */95889589to_idx = cur_str + ent->subexp_to - ent->subexp_from;9590/* Calculate the destination of the back reference, and append it9591to MCTX->STATE_LOG. 
*/9592if (to_idx == cur_str)9593{9594/* The backreference did epsilon transit, we must re-check all the9595node in the current state. */9596re_node_set new_dests;9597reg_errcode_t err2, err3;9598next_node = dfa->edests[ent->node].elems[0];9599if (re_node_set_contains (cur_nodes, next_node))9600continue;9601err = re_node_set_init_1 (&new_dests, next_node);9602err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);9603err3 = re_node_set_merge (cur_nodes, &new_dests);9604re_node_set_free (&new_dests);9605if (BE (err != REG_NOERROR || err2 != REG_NOERROR9606|| err3 != REG_NOERROR, 0))9607{9608err = (err != REG_NOERROR ? err9609: (err2 != REG_NOERROR ? err2 : err3));9610return err;9611}9612/* TODO: It is still inefficient... */9613goto restart;9614}9615else9616{9617re_node_set union_set;9618next_node = dfa->nexts[ent->node];9619if (mctx->state_log[to_idx])9620{9621int ret;9622if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,9623next_node))9624continue;9625err = re_node_set_init_copy (&union_set,9626&mctx->state_log[to_idx]->nodes);9627ret = re_node_set_insert (&union_set, next_node);9628if (BE (err != REG_NOERROR || ret < 0, 0))9629{9630re_node_set_free (&union_set);9631err = err != REG_NOERROR ? err : REG_ESPACE;9632return err;9633}9634}9635else9636{9637err = re_node_set_init_1 (&union_set, next_node);9638if (BE (err != REG_NOERROR, 0))9639return err;9640}9641mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);9642re_node_set_free (&union_set);9643if (BE (mctx->state_log[to_idx] == NULL9644&& err != REG_NOERROR, 0))9645return err;9646}9647}9648while (ent++->more);9649return REG_NOERROR;9650}96519652/* Build transition table for the state.9653Return 1 if succeeded, otherwise return NULL. 
*/96549655static int9656internal_function9657build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)9658{9659reg_errcode_t err;9660int i, j, ch, need_word_trtable = 0;9661bitset_word_t elem, mask;9662bool dests_node_malloced = false;9663bool dest_states_malloced = false;9664int ndests; /* Number of the destination states from `state'. */9665re_dfastate_t **trtable;9666re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;9667re_node_set follows, *dests_node;9668bitset_t *dests_ch;9669bitset_t acceptable;96709671struct dests_alloc9672{9673re_node_set dests_node[SBC_MAX];9674bitset_t dests_ch[SBC_MAX];9675} *dests_alloc;96769677/* We build DFA states which corresponds to the destination nodes9678from `state'. `dests_node[i]' represents the nodes which i-th9679destination state contains, and `dests_ch[i]' represents the9680characters which i-th destination state accepts. */9681if (__libc_use_alloca (sizeof (struct dests_alloc)))9682dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));9683else9684{9685dests_alloc = re_malloc (struct dests_alloc, 1);9686if (BE (dests_alloc == NULL, 0))9687return 0;9688dests_node_malloced = true;9689}9690dests_node = dests_alloc->dests_node;9691dests_ch = dests_alloc->dests_ch;96929693/* Initialize transiton table. */9694state->word_trtable = state->trtable = NULL;96959696/* At first, group all nodes belonging to `state' into several9697destinations. */9698ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);9699if (BE (ndests <= 0, 0))9700{9701if (dests_node_malloced)9702free (dests_alloc);9703/* Return 0 in case of an error, 1 otherwise. 
*/9704if (ndests == 0)9705{9706state->trtable = (re_dfastate_t **)9707calloc (sizeof (re_dfastate_t *), SBC_MAX);9708return 1;9709}9710return 0;9711}97129713err = re_node_set_alloc (&follows, ndests + 1);9714if (BE (err != REG_NOERROR, 0))9715goto out_free;97169717if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX9718+ ndests * 3 * sizeof (re_dfastate_t *)))9719dest_states = (re_dfastate_t **)9720alloca (ndests * 3 * sizeof (re_dfastate_t *));9721else9722{9723dest_states = (re_dfastate_t **)9724malloc (ndests * 3 * sizeof (re_dfastate_t *));9725if (BE (dest_states == NULL, 0))9726{9727out_free:9728if (dest_states_malloced)9729free (dest_states);9730re_node_set_free (&follows);9731for (i = 0; i < ndests; ++i)9732re_node_set_free (dests_node + i);9733if (dests_node_malloced)9734free (dests_alloc);9735return 0;9736}9737dest_states_malloced = true;9738}9739dest_states_word = dest_states + ndests;9740dest_states_nl = dest_states_word + ndests;9741bitset_empty (acceptable);97429743/* Then build the states for all destinations. */9744for (i = 0; i < ndests; ++i)9745{9746int next_node;9747re_node_set_empty (&follows);9748/* Merge the follows of this destination states. */9749for (j = 0; j < dests_node[i].nelem; ++j)9750{9751next_node = dfa->nexts[dests_node[i].elems[j]];9752if (next_node != -1)9753{9754err = re_node_set_merge (&follows, dfa->eclosures + next_node);9755if (BE (err != REG_NOERROR, 0))9756goto out_free;9757}9758}9759dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);9760if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))9761goto out_free;9762/* If the new state has context constraint,9763build appropriate states for these contexts. 
*/9764if (dest_states[i]->has_constraint)9765{9766dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,9767CONTEXT_WORD);9768if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))9769goto out_free;97709771if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)9772need_word_trtable = 1;97739774dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,9775CONTEXT_NEWLINE);9776if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))9777goto out_free;9778}9779else9780{9781dest_states_word[i] = dest_states[i];9782dest_states_nl[i] = dest_states[i];9783}9784bitset_merge (acceptable, dests_ch[i]);9785}97869787if (!BE (need_word_trtable, 0))9788{9789/* We don't care about whether the following character is a word9790character, or we are in a single-byte character set so we can9791discern by looking at the character code: allocate a9792256-entry transition table. */9793trtable = state->trtable =9794(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);9795if (BE (trtable == NULL, 0))9796goto out_free;97979798/* For all characters ch...: */9799for (i = 0; i < BITSET_WORDS; ++i)9800for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;9801elem;9802mask <<= 1, elem >>= 1, ++ch)9803if (BE (elem & 1, 0))9804{9805/* There must be exactly one destination which accepts9806character ch. See group_nodes_into_DFAstates. */9807for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)9808;98099810/* j-th destination accepts the word character ch. */9811if (dfa->word_char[i] & mask)9812trtable[ch] = dest_states_word[j];9813else9814trtable[ch] = dest_states[j];9815}9816}9817else9818{9819/* We care about whether the following character is a word9820character, and we are in a multi-byte character set: discern9821by looking at the character code: build two 256-entry9822transition tables, one starting at trtable[0] and one9823starting at trtable[SBC_MAX]. 
*/9824trtable = state->word_trtable =9825(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);9826if (BE (trtable == NULL, 0))9827goto out_free;98289829/* For all characters ch...: */9830for (i = 0; i < BITSET_WORDS; ++i)9831for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;9832elem;9833mask <<= 1, elem >>= 1, ++ch)9834if (BE (elem & 1, 0))9835{9836/* There must be exactly one destination which accepts9837character ch. See group_nodes_into_DFAstates. */9838for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)9839;98409841/* j-th destination accepts the word character ch. */9842trtable[ch] = dest_states[j];9843trtable[ch + SBC_MAX] = dest_states_word[j];9844}9845}98469847/* new line */9848if (bitset_contain (acceptable, NEWLINE_CHAR))9849{9850/* The current state accepts newline character. */9851for (j = 0; j < ndests; ++j)9852if (bitset_contain (dests_ch[j], NEWLINE_CHAR))9853{9854/* k-th destination accepts newline character. */9855trtable[NEWLINE_CHAR] = dest_states_nl[j];9856if (need_word_trtable)9857trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];9858/* There must be only one destination which accepts9859newline. See group_nodes_into_DFAstates. */9860break;9861}9862}98639864if (dest_states_malloced)9865free (dest_states);98669867re_node_set_free (&follows);9868for (i = 0; i < ndests; ++i)9869re_node_set_free (dests_node + i);98709871if (dests_node_malloced)9872free (dests_alloc);98739874return 1;9875}98769877/* Group all nodes belonging to STATE into several destinations.9878Then for all destinations, set the nodes belonging to the destination9879to DESTS_NODE[i] and set the characters accepted by the destination9880to DEST_CH[i]. This function return the number of destinations. 
*/98819882static int9883internal_function9884group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,9885re_node_set *dests_node, bitset_t *dests_ch)9886{9887reg_errcode_t err;9888int result;9889int i, j, k;9890int ndests; /* Number of the destinations from `state'. */9891bitset_t accepts; /* Characters a node can accept. */9892const re_node_set *cur_nodes = &state->nodes;9893bitset_empty (accepts);9894ndests = 0;98959896/* For all the nodes belonging to `state', */9897for (i = 0; i < cur_nodes->nelem; ++i)9898{9899re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];9900re_token_type_t type = node->type;9901unsigned int constraint = node->constraint;99029903/* Enumerate all single byte character this node can accept. */9904if (type == CHARACTER)9905bitset_set (accepts, node->opr.c);9906else if (type == SIMPLE_BRACKET)9907{9908bitset_merge (accepts, node->opr.sbcset);9909}9910else if (type == OP_PERIOD)9911{9912#ifdef RE_ENABLE_I18N9913if (dfa->mb_cur_max > 1)9914bitset_merge (accepts, dfa->sb_char);9915else9916#endif9917bitset_set_all (accepts);9918if (!(dfa->syntax & RE_DOT_NEWLINE))9919bitset_clear (accepts, '\n');9920if (dfa->syntax & RE_DOT_NOT_NULL)9921bitset_clear (accepts, '\0');9922}9923#ifdef RE_ENABLE_I18N9924else if (type == OP_UTF8_PERIOD)9925{9926memset (accepts, '\xff', sizeof (bitset_t) / 2);9927if (!(dfa->syntax & RE_DOT_NEWLINE))9928bitset_clear (accepts, '\n');9929if (dfa->syntax & RE_DOT_NOT_NULL)9930bitset_clear (accepts, '\0');9931}9932#endif9933else9934continue;99359936/* Check the `accepts' and sift the characters which are not9937match it the context. 
*/9938if (constraint)9939{9940if (constraint & NEXT_NEWLINE_CONSTRAINT)9941{9942bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);9943bitset_empty (accepts);9944if (accepts_newline)9945bitset_set (accepts, NEWLINE_CHAR);9946else9947continue;9948}9949if (constraint & NEXT_ENDBUF_CONSTRAINT)9950{9951bitset_empty (accepts);9952continue;9953}99549955if (constraint & NEXT_WORD_CONSTRAINT)9956{9957bitset_word_t any_set = 0;9958if (type == CHARACTER && !node->word_char)9959{9960bitset_empty (accepts);9961continue;9962}9963#ifdef RE_ENABLE_I18N9964if (dfa->mb_cur_max > 1)9965for (j = 0; j < BITSET_WORDS; ++j)9966any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));9967else9968#endif9969for (j = 0; j < BITSET_WORDS; ++j)9970any_set |= (accepts[j] &= dfa->word_char[j]);9971if (!any_set)9972continue;9973}9974if (constraint & NEXT_NOTWORD_CONSTRAINT)9975{9976bitset_word_t any_set = 0;9977if (type == CHARACTER && node->word_char)9978{9979bitset_empty (accepts);9980continue;9981}9982#ifdef RE_ENABLE_I18N9983if (dfa->mb_cur_max > 1)9984for (j = 0; j < BITSET_WORDS; ++j)9985any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));9986else9987#endif9988for (j = 0; j < BITSET_WORDS; ++j)9989any_set |= (accepts[j] &= ~dfa->word_char[j]);9990if (!any_set)9991continue;9992}9993}99949995/* Then divide `accepts' into DFA states, or create a new9996state. Above, we make sure that accepts is not empty. */9997for (j = 0; j < ndests; ++j)9998{9999bitset_t intersec; /* Intersection sets, see below. */10000bitset_t remains;10001/* Flags, see below. */10002bitset_word_t has_intersec, not_subset, not_consumed;1000310004/* Optimization, skip if this state doesn't accept the character. */10005if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))10006continue;1000710008/* Enumerate the intersection set of this state and `accepts'. 
*/10009has_intersec = 0;10010for (k = 0; k < BITSET_WORDS; ++k)10011has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];10012/* And skip if the intersection set is empty. */10013if (!has_intersec)10014continue;1001510016/* Then check if this state is a subset of `accepts'. */10017not_subset = not_consumed = 0;10018for (k = 0; k < BITSET_WORDS; ++k)10019{10020not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];10021not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];10022}1002310024/* If this state isn't a subset of `accepts', create a10025new group state, which has the `remains'. */10026if (not_subset)10027{10028bitset_copy (dests_ch[ndests], remains);10029bitset_copy (dests_ch[j], intersec);10030err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);10031if (BE (err != REG_NOERROR, 0))10032goto error_return;10033++ndests;10034}1003510036/* Put the position in the current group. */10037result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);10038if (BE (result < 0, 0))10039goto error_return;1004010041/* If all characters are consumed, go to next node. */10042if (!not_consumed)10043break;10044}10045/* Some characters remain, create a new group. */10046if (j == ndests)10047{10048bitset_copy (dests_ch[ndests], accepts);10049err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);10050if (BE (err != REG_NOERROR, 0))10051goto error_return;10052++ndests;10053bitset_empty (accepts);10054}10055}10056return ndests;10057error_return:10058for (j = 0; j < ndests; ++j)10059re_node_set_free (dests_node + j);10060return -1;10061}1006210063#ifdef RE_ENABLE_I18N10064/* Check how many bytes the node `dfa->nodes[node_idx]' accepts.10065Return the number of the bytes the node accepts.10066STR_IDX is the current index of the input string.1006710068This function handles the nodes which can accept one character, or10069one collating element like '.', '[a-z]', opposite to the other nodes10070can only accept one byte. 
*/

/* A return value of 0 means the node does not accept the input at STR_IDX
   through this multi-byte path.  */

static int
internal_function
check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
                         const re_string_t *input, int str_idx)
{
  const re_token_t *node = dfa->nodes + node_idx;
  int char_len, elem_len;
  int i;

  /* UTF-8 '.': decode the sequence length from the lead byte by hand.  */
  if (BE (node->type == OP_UTF8_PERIOD, 0))
    {
      unsigned char c = re_string_byte_at (input, str_idx), d;
      /* Lead bytes below 0xc2 are not accepted here.  */
      if (BE (c < 0xc2, 1))
        return 0;

      /* At least one byte must follow the lead byte.  */
      if (str_idx + 2 > input->len)
        return 0;

      d = re_string_byte_at (input, str_idx + 1);
      if (c < 0xe0)
        /* Two-byte sequence: the second byte must be 0x80..0xbf.  */
        return (d < 0x80 || d > 0xbf) ? 0 : 2;
      else if (c < 0xf0)
        {
          char_len = 3;
          /* After 0xe0 the second byte must be at least 0xa0.  */
          if (c == 0xe0 && d < 0xa0)
            return 0;
        }
      else if (c < 0xf8)
        {
          char_len = 4;
          /* After 0xf0 the second byte must be at least 0x90.  */
          if (c == 0xf0 && d < 0x90)
            return 0;
        }
      else if (c < 0xfc)
        {
          char_len = 5;
          /* After 0xf8 the second byte must be at least 0x88.  */
          if (c == 0xf8 && d < 0x88)
            return 0;
        }
      else if (c < 0xfe)
        {
          char_len = 6;
          /* After 0xfc the second byte must be at least 0x84.  */
          if (c == 0xfc && d < 0x84)
            return 0;
        }
      else
        return 0;

      /* The whole sequence must lie inside the input.  */
      if (str_idx + char_len > input->len)
        return 0;

      /* Every byte after the lead byte must be 0x80..0xbf.  */
      for (i = 1; i < char_len; ++i)
        {
          d = re_string_byte_at (input, str_idx + i);
          if (d < 0x80 || d > 0xbf)
            return 0;
        }
      return char_len;
    }

  char_len = re_string_char_size_at (input, str_idx);
  if (node->type == OP_PERIOD)
    {
      /* Single-byte characters are not handled by this function.  */
      if (char_len <= 1)
        return 0;
      /* FIXME: I don't think this if is needed, as both '\n'
         and '\0' are char_len == 1.  */
      /* '.' accepts any one character except the following two cases.  */
      if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
           re_string_byte_at (input, str_idx) == '\n') ||
          ((dfa->syntax & RE_DOT_NOT_NULL) &&
           re_string_byte_at (input, str_idx) == '\0'))
        return 0;
      return char_len;
    }

  elem_len = re_string_elem_size_at (input, str_idx);
  /* Nothing multi-byte here (neither character nor collating element),
     or no character at all.  */
  if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
    return 0;

  if (node->type == COMPLEX_BRACKET)
    {
      const re_charset_t *cset = node->opr.mbcset;
# ifdef _LIBC
      const unsigned char *pin
        = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
      int j;
      uint32_t nrules;
# endif /* _LIBC */
      int match_len = 0;
      /* The wide character is only fetched when some test below needs it.  */
      wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
                    ? re_string_wchar_at (input, str_idx) : 0);

      /* match with multibyte character?  */
      for (i = 0; i < cset->nmbchars; ++i)
        if (wc == cset->mbchars[i])
          {
            match_len = char_len;
            goto check_node_accept_bytes_match;
          }
      /* match with character_class?  */
      for (i = 0; i < cset->nchar_classes; ++i)
        {
          wctype_t wt = cset->char_classes[i];
          if (__iswctype (wc, wt))
            {
              match_len = char_len;
              goto check_node_accept_bytes_match;
            }
        }

# ifdef _LIBC
      nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
      if (nrules != 0)
        {
          unsigned int in_collseq = 0;
          const int32_t *table, *indirect;
          const unsigned char *weights, *extra;
          const char *collseqwc;
          int32_t idx;
          /* This #include defines a local function!  */
          /* NOTE(review): the included code presumably refers to the
             locals declared just above by name, so they should not be
             renamed -- confirm against locale/weight.h.  */
# include <locale/weight.h>

          /* match with collating_symbol?  */
          if (cset->ncoll_syms)
            extra = (const unsigned char *)
              _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
          for (i = 0; i < cset->ncoll_syms; ++i)
            {
              const unsigned char *coll_sym = extra + cset->coll_syms[i];
              /* Compare the length of input collating element and
                 the length of current collating element.  */
              if (*coll_sym != elem_len)
                continue;
              /* Compare each bytes.  */
              for (j = 0; j < *coll_sym; j++)
                if (pin[j] != coll_sym[1 + j])
                  break;
              if (j == *coll_sym)
                {
                  /* Match if every bytes is equal.  */
                  match_len = j;
                  goto check_node_accept_bytes_match;
                }
            }

          /* Look up the collation sequence value of the input: by wide
             character when the element is no longer than the character,
             otherwise by its byte sequence.  */
          if (cset->nranges)
            {
              if (elem_len <= char_len)
                {
                  collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
                  in_collseq = __collseq_table_lookup (collseqwc, wc);
                }
              else
                in_collseq = find_collation_sequence_value (pin, elem_len);
            }
          /* match with range expression?  */
          for (i = 0; i < cset->nranges; ++i)
            if (cset->range_starts[i] <= in_collseq
                && in_collseq <= cset->range_ends[i])
              {
                match_len = elem_len;
                goto check_node_accept_bytes_match;
              }

          /* match with equivalence_class?  */
          if (cset->nequiv_classes)
            {
              const unsigned char *cp = pin;
              table = (const int32_t *)
                _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
              weights = (const unsigned char *)
                _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
              extra = (const unsigned char *)
                _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
              indirect = (const int32_t *)
                _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
              idx = findidx (&cp);
              if (idx > 0)
                for (i = 0; i < cset->nequiv_classes; ++i)
                  {
                    int32_t equiv_class_idx = cset->equiv_classes[i];
                    size_t weight_len = weights[idx];
                    /* Same weight length: compare the weight bytes.  */
                    if (weight_len == weights[equiv_class_idx])
                      {
                        int cnt = 0;
                        while (cnt <= weight_len
                               && (weights[equiv_class_idx + 1 + cnt]
                                   == weights[idx + 1 + cnt]))
                          ++cnt;
                        /* The loop ran past the end without a mismatch.  */
                        if (cnt > weight_len)
                          {
                            match_len = elem_len;
                            goto check_node_accept_bytes_match;
                          }
                      }
                  }
            }
        }
      else
# endif /* _LIBC */
        {
          /* match with range expression?  */
          /* cmp_buf holds three one-character wide strings at offsets 0,
             2 and 4: range start, input character, range end.  */
#if __GNUC__ >= 2
          wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
#else
          wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
          cmp_buf[2] = wc;
#endif
          for (i = 0; i < cset->nranges; ++i)
            {
              cmp_buf[0] = cset->range_starts[i];
              cmp_buf[4] = cset->range_ends[i];
              /* start <= wc <= end in the collation order of wcscoll.  */
              if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
                  && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
                {
                  match_len = char_len;
                  goto check_node_accept_bytes_match;
                }
            }
        }
    check_node_accept_bytes_match:
      if (!cset->non_match)
        return match_len;
      else
        {
          /* Negated set: a match means rejection; otherwise accept the
             longer of the collating element and the character.  */
          if (match_len > 0)
            return 0;
          else
            return (elem_len > char_len) ? elem_len : char_len;
        }
    }
  return 0;
}

# ifdef _LIBC
static unsigned int
internal_function
find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
{
  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
  if (nrules == 0)
    {
      if (mbs_len == 1)
        {
          /* No valid character.  Match it as a single byte character.  */
          const unsigned char *collseq = (const unsigned char *)
            _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
          return collseq[mbs[0]];
        }
      return UINT_MAX;
    }
  else
    {
      int32_t idx;
      const unsigned char *extra = (const unsigned char *)
        _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
      int32_t extrasize = (const unsigned char *)
        _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;

      for (idx = 0; idx < extrasize;)
        {
          int mbs_cnt, found = 0;
          int32_t elem_mbs_len;
          /* Skip the name of collating element name.  */
          idx = idx + extra[idx] + 1;
          elem_mbs_len = extra[idx++];
          if (mbs_len == elem_mbs_len)
            {
              for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
                if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
                  break;
              if (mbs_cnt == elem_mbs_len)
                /* Found the entry.  
*/10346found = 1;10347}10348/* Skip the byte sequence of the collating element. */10349idx += elem_mbs_len;10350/* Adjust for the alignment. */10351idx = (idx + 3) & ~3;10352/* Skip the collation sequence value. */10353idx += sizeof (uint32_t);10354/* Skip the wide char sequence of the collating element. */10355idx = idx + sizeof (uint32_t) * (extra[idx] + 1);10356/* If we found the entry, return the sequence value. */10357if (found)10358return *(uint32_t *) (extra + idx);10359/* Skip the collation sequence value. */10360idx += sizeof (uint32_t);10361}10362return UINT_MAX;10363}10364}10365# endif /* _LIBC */10366#endif /* RE_ENABLE_I18N */1036710368/* Check whether the node accepts the byte which is IDX-th10369byte of the INPUT. */1037010371static int10372internal_function10373check_node_accept (const re_match_context_t *mctx, const re_token_t *node,10374int idx)10375{10376unsigned char ch;10377ch = re_string_byte_at (&mctx->input, idx);10378switch (node->type)10379{10380case CHARACTER:10381if (node->opr.c != ch)10382return 0;10383break;1038410385case SIMPLE_BRACKET:10386if (!bitset_contain (node->opr.sbcset, ch))10387return 0;10388break;1038910390#ifdef RE_ENABLE_I18N10391case OP_UTF8_PERIOD:10392if (ch >= 0x80)10393return 0;10394/* FALLTHROUGH */10395#endif10396case OP_PERIOD:10397if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))10398|| (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))10399return 0;10400break;1040110402default:10403return 0;10404}1040510406if (node->constraint)10407{10408/* The node has constraints. Check whether the current context10409satisfies the constraints. */10410unsigned int context = re_string_context_at (&mctx->input, idx,10411mctx->eflags);10412if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))10413return 0;10414}1041510416return 1;10417}1041810419/* Extend the buffers, if the buffers have run out. 
*/1042010421static reg_errcode_t10422internal_function10423extend_buffers (re_match_context_t *mctx)10424{10425reg_errcode_t ret;10426re_string_t *pstr = &mctx->input;1042710428/* Double the lengthes of the buffers. */10429ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);10430if (BE (ret != REG_NOERROR, 0))10431return ret;1043210433if (mctx->state_log != NULL)10434{10435/* And double the length of state_log. */10436/* XXX We have no indication of the size of this buffer. If this10437allocation fail we have no indication that the state_log array10438does not have the right size. */10439re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,10440pstr->bufs_len + 1);10441if (BE (new_array == NULL, 0))10442return REG_ESPACE;10443mctx->state_log = new_array;10444}1044510446/* Then reconstruct the buffers. */10447if (pstr->icase)10448{10449#ifdef RE_ENABLE_I18N10450if (pstr->mb_cur_max > 1)10451{10452ret = build_wcs_upper_buffer (pstr);10453if (BE (ret != REG_NOERROR, 0))10454return ret;10455}10456else10457#endif /* RE_ENABLE_I18N */10458build_upper_buffer (pstr);10459}10460else10461{10462#ifdef RE_ENABLE_I18N10463if (pstr->mb_cur_max > 1)10464build_wcs_buffer (pstr);10465else10466#endif /* RE_ENABLE_I18N */10467{10468if (pstr->trans != NULL)10469re_string_translate_buffer (pstr);10470}10471}10472return REG_NOERROR;10473}104741047510476/* Functions for matching context. */1047710478/* Initialize MCTX. 
*/1047910480static reg_errcode_t10481internal_function10482match_ctx_init (re_match_context_t *mctx, int eflags, int n)10483{10484mctx->eflags = eflags;10485mctx->match_last = -1;10486if (n > 0)10487{10488mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);10489mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);10490if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))10491return REG_ESPACE;10492}10493/* Already zero-ed by the caller.10494else10495mctx->bkref_ents = NULL;10496mctx->nbkref_ents = 0;10497mctx->nsub_tops = 0; */10498mctx->abkref_ents = n;10499mctx->max_mb_elem_len = 1;10500mctx->asub_tops = n;10501return REG_NOERROR;10502}1050310504/* Clean the entries which depend on the current input in MCTX.10505This function must be invoked when the matcher changes the start index10506of the input, or changes the input string. */1050710508static void10509internal_function10510match_ctx_clean (re_match_context_t *mctx)10511{10512int st_idx;10513for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)10514{10515int sl_idx;10516re_sub_match_top_t *top = mctx->sub_tops[st_idx];10517for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)10518{10519re_sub_match_last_t *last = top->lasts[sl_idx];10520re_free (last->path.array);10521re_free (last);10522}10523re_free (top->lasts);10524if (top->path)10525{10526re_free (top->path->array);10527re_free (top->path);10528}10529free (top);10530}1053110532mctx->nsub_tops = 0;10533mctx->nbkref_ents = 0;10534}1053510536/* Free all the memory associated with MCTX. */1053710538static void10539internal_function10540match_ctx_free (re_match_context_t *mctx)10541{10542/* First, free all the memory associated with MCTX->SUB_TOPS. 
*/10543match_ctx_clean (mctx);10544re_free (mctx->sub_tops);10545re_free (mctx->bkref_ents);10546}1054710548/* Add a new backreference entry to MCTX.10549Note that we assume that caller never call this function with duplicate10550entry, and call with STR_IDX which isn't smaller than any existing entry.10551*/1055210553static reg_errcode_t10554internal_function10555match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,10556int to)10557{10558if (mctx->nbkref_ents >= mctx->abkref_ents)10559{10560struct re_backref_cache_entry* new_entry;10561new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,10562mctx->abkref_ents * 2);10563if (BE (new_entry == NULL, 0))10564{10565re_free (mctx->bkref_ents);10566return REG_ESPACE;10567}10568mctx->bkref_ents = new_entry;10569memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',10570sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);10571mctx->abkref_ents *= 2;10572}10573if (mctx->nbkref_ents > 010574&& mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)10575mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;1057610577mctx->bkref_ents[mctx->nbkref_ents].node = node;10578mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;10579mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;10580mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;1058110582/* This is a cache that saves negative results of check_dst_limits_calc_pos.10583If bit N is clear, means that this entry won't epsilon-transition to10584an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If10585it is set, check_dst_limits_calc_pos_1 will recurse and try to find one10586such node.1058710588A backreference does not epsilon-transition unless it is empty, so set10589to all zeros if FROM != TO. */10590mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map10591= (from == to ? 
~0 : 0);1059210593mctx->bkref_ents[mctx->nbkref_ents++].more = 0;10594if (mctx->max_mb_elem_len < to - from)10595mctx->max_mb_elem_len = to - from;10596return REG_NOERROR;10597}1059810599/* Search for the first entry which has the same str_idx, or -1 if none is10600found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */1060110602static int10603internal_function10604search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)10605{10606int left, right, mid, last;10607last = right = mctx->nbkref_ents;10608for (left = 0; left < right;)10609{10610mid = (left + right) / 2;10611if (mctx->bkref_ents[mid].str_idx < str_idx)10612left = mid + 1;10613else10614right = mid;10615}10616if (left < last && mctx->bkref_ents[left].str_idx == str_idx)10617return left;10618else10619return -1;10620}1062110622/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches10623at STR_IDX. */1062410625static reg_errcode_t10626internal_function10627match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)10628{10629#ifdef DEBUG10630assert (mctx->sub_tops != NULL);10631assert (mctx->asub_tops > 0);10632#endif10633if (BE (mctx->nsub_tops == mctx->asub_tops, 0))10634{10635int new_asub_tops = mctx->asub_tops * 2;10636re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,10637re_sub_match_top_t *,10638new_asub_tops);10639if (BE (new_array == NULL, 0))10640return REG_ESPACE;10641mctx->sub_tops = new_array;10642mctx->asub_tops = new_asub_tops;10643}10644mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));10645if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))10646return REG_ESPACE;10647mctx->sub_tops[mctx->nsub_tops]->node = node;10648mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;10649return REG_NOERROR;10650}1065110652/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches10653at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. 
*/1065410655static re_sub_match_last_t *10656internal_function10657match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)10658{10659re_sub_match_last_t *new_entry;10660if (BE (subtop->nlasts == subtop->alasts, 0))10661{10662int new_alasts = 2 * subtop->alasts + 1;10663re_sub_match_last_t **new_array = re_realloc (subtop->lasts,10664re_sub_match_last_t *,10665new_alasts);10666if (BE (new_array == NULL, 0))10667return NULL;10668subtop->lasts = new_array;10669subtop->alasts = new_alasts;10670}10671new_entry = calloc (1, sizeof (re_sub_match_last_t));10672if (BE (new_entry != NULL, 1))10673{10674subtop->lasts[subtop->nlasts] = new_entry;10675new_entry->node = node;10676new_entry->str_idx = str_idx;10677++subtop->nlasts;10678}10679return new_entry;10680}1068110682static void10683internal_function10684sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,10685re_dfastate_t **limited_sts, int last_node, int last_str_idx)10686{10687sctx->sifted_states = sifted_sts;10688sctx->limited_states = limited_sts;10689sctx->last_node = last_node;10690sctx->last_str_idx = last_str_idx;10691re_node_set_init_empty (&sctx->limits);10692}106931069410695/* Binary backward compatibility. */10696#if _LIBC10697# include <shlib-compat.h>10698# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3)10699link_warning (re_max_failures, "the 're_max_failures' variable is obsolete and will go away.")10700int re_max_failures = 2000;10701# endif10702#endif10703#endif107041070510706