/* gkregex.c

   GitHub Repository: ElmerCSC/elmerfem
   Path: blob/devel/elmergrid/src/metis-5.1.0/GKlib/gkregex.c  */
1
/* Extended regular expression matching and search library.
2
   Copyright (C) 2002, 2003, 2005 Free Software Foundation, Inc.
3
   This file is part of the GNU C Library.
4
   Contributed by Isamu Hasegawa <[email protected]>.
5

6
   The GNU C Library is free software; you can redistribute it and/or
7
   modify it under the terms of the GNU Lesser General Public
8
   License as published by the Free Software Foundation; either
9
   version 2.1 of the License, or (at your option) any later version.
10

11
   The GNU C Library is distributed in the hope that it will be useful,
12
   but WITHOUT ANY WARRANTY; without even the implied warranty of
13
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
   Lesser General Public License for more details.
15

16
   You should have received a copy of the GNU Lesser General Public
17
   License along with the GNU C Library; if not, write to the Free
18
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19
   02111-1307 USA.  */
20

21
/* Dummy external definition.  Everything else in this file is wrapped in
   `#ifdef USE_GKREGEX'; this function exists to silence a compiler warning
   (presumably the "empty translation unit" diagnostic issued when
   USE_GKREGEX is not defined -- confirm against the build).  It takes no
   arguments and does nothing.  */
void gkfooo(void) { return; }
23

24
#ifdef USE_GKREGEX
25

26
#ifdef HAVE_CONFIG_H
27
#include "config.h"
28
#endif
29

30
#ifdef _LIBC
31
/* We have to keep the namespace clean.  */
32
# define regfree(preg) __regfree (preg)
33
# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
34
# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
35
# define regerror(errcode, preg, errbuf, errbuf_size) \
36
	__regerror(errcode, preg, errbuf, errbuf_size)
37
# define re_set_registers(bu, re, nu, st, en) \
38
	__re_set_registers (bu, re, nu, st, en)
39
# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
40
	__re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
41
# define re_match(bufp, string, size, pos, regs) \
42
	__re_match (bufp, string, size, pos, regs)
43
# define re_search(bufp, string, size, startpos, range, regs) \
44
	__re_search (bufp, string, size, startpos, range, regs)
45
# define re_compile_pattern(pattern, length, bufp) \
46
	__re_compile_pattern (pattern, length, bufp)
47
# define re_set_syntax(syntax) __re_set_syntax (syntax)
48
# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
49
	__re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
50
# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
51

52
# include "../locale/localeinfo.h"
53
#endif
54

55
#include "GKlib.h"
56

57

58
/******************************************************************************/
59
/******************************************************************************/
60
/******************************************************************************/
61
/* GKINCLUDE #include "regex_internal.h" */
62
/******************************************************************************/
63
/******************************************************************************/
64
/******************************************************************************/
65
/* Extended regular expression matching and search library.
66
   Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
67
   This file is part of the GNU C Library.
68
   Contributed by Isamu Hasegawa <[email protected]>.
69

70
   The GNU C Library is free software; you can redistribute it and/or
71
   modify it under the terms of the GNU Lesser General Public
72
   License as published by the Free Software Foundation; either
73
   version 2.1 of the License, or (at your option) any later version.
74

75
   The GNU C Library is distributed in the hope that it will be useful,
76
   but WITHOUT ANY WARRANTY; without even the implied warranty of
77
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
78
   Lesser General Public License for more details.
79

80
   You should have received a copy of the GNU Lesser General Public
81
   License along with the GNU C Library; if not, write to the Free
82
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
83
   02111-1307 USA.  */
84

85
#ifndef _REGEX_INTERNAL_H
86
#define _REGEX_INTERNAL_H 1
87

88
#include <assert.h>
89
#include <ctype.h>
90
#include <stdio.h>
91
#include <stdlib.h>
92
#include <string.h>
93

94
#if defined(__MINGW32_VERSION) || defined(_MSC_VER)
95
#define strcasecmp stricmp
96
#endif
97

98
#if defined HAVE_LANGINFO_H || defined HAVE_LANGINFO_CODESET || defined _LIBC
99
# include <langinfo.h>
100
#endif
101
#if defined HAVE_LOCALE_H || defined _LIBC
102
# include <locale.h>
103
#endif
104
#if defined HAVE_WCHAR_H || defined _LIBC
105
# include <wchar.h>
106
#endif /* HAVE_WCHAR_H || _LIBC */
107
#if defined HAVE_WCTYPE_H || defined _LIBC
108
# include <wctype.h>
109
#endif /* HAVE_WCTYPE_H || _LIBC */
110
#if defined HAVE_STDBOOL_H || defined _LIBC
111
# include <stdbool.h>
112
#else
113
typedef enum { false, true } bool;
114
#endif /* HAVE_STDBOOL_H || _LIBC */
115
#if defined HAVE_STDINT_H || defined _LIBC
116
# include <stdint.h>
117
#endif /* HAVE_STDINT_H || _LIBC */
118
#if defined _LIBC
119
# include <bits/libc-lock.h>
120
#else
121
# define __libc_lock_define(CLASS,NAME)
122
# define __libc_lock_init(NAME) do { } while (0)
123
# define __libc_lock_lock(NAME) do { } while (0)
124
# define __libc_lock_unlock(NAME) do { } while (0)
125
#endif
126

127
/* In case that the system doesn't have isblank().  */
128
#if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
129
# define isblank(ch) ((ch) == ' ' || (ch) == '\t')
130
#endif
131

132
#ifdef _LIBC
133
# ifndef _RE_DEFINE_LOCALE_FUNCTIONS
134
#  define _RE_DEFINE_LOCALE_FUNCTIONS 1
135
#   include <locale/localeinfo.h>
136
#   include <locale/elem-hash.h>
137
#   include <locale/coll-lookup.h>
138
# endif
139
#endif
140

141
/* This is for other GNU distributions with internationalized messages.  */
142
#if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC
143
# include <libintl.h>
144
# ifdef _LIBC
145
#  undef gettext
146
#  define gettext(msgid) \
147
  INTUSE(__dcgettext) (_libc_intl_domainname, msgid, LC_MESSAGES)
148
# endif
149
#else
150
# define gettext(msgid) (msgid)
151
#endif
152

153
#ifndef gettext_noop
154
/* This define is so xgettext can find the internationalizable
155
   strings.  */
156
# define gettext_noop(String) String
157
#endif
158

159
/* For systems that lack a definition of SIZE_MAX.  */
160
#ifndef SIZE_MAX
161
# define SIZE_MAX ((size_t) -1)
162
#endif
163

164
#if (defined MB_CUR_MAX && HAVE_LOCALE_H && HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_WCRTOMB && HAVE_MBRTOWC && HAVE_WCSCOLL) || _LIBC
165
# define RE_ENABLE_I18N
166
#endif
167

168
#if __GNUC__ >= 3
169
# define BE(expr, val) __builtin_expect (expr, val)
170
#else
171
# define BE(expr, val) (expr)
172
# define inline
173
#endif
174

175
/* Number of single-byte characters.  */
176
#define SBC_MAX 256
177

178
#define COLL_ELEM_LEN_MAX 8
179

180
/* The character which represents newline.  */
181
#define NEWLINE_CHAR '\n'
182
#define WIDE_NEWLINE_CHAR L'\n'
183

184
/* Rename to standard API for using out of glibc.  */
185
#ifndef _LIBC
186
# define __wctype wctype
187
# define __iswctype iswctype
188
# define __btowc btowc
189
# define __mempcpy mempcpy
190
# define __wcrtomb wcrtomb
191
# define __regfree regfree
192
# define attribute_hidden
193
#endif /* not _LIBC */
194

195
#ifdef __GNUC__
196
# define __attribute(arg) __attribute__ (arg)
197
#else
198
# define __attribute(arg)
199
#endif
200

201
extern const char __re_error_msgid[] attribute_hidden;
202
extern const size_t __re_error_msgid_idx[] attribute_hidden;
203

204
/* An integer used to represent a set of bits.  It must be unsigned,
205
   and must be at least as wide as unsigned int.  */
206
typedef unsigned long int bitset_word_t;
207
/* All bits set in a bitset_word_t.  */
208
#define BITSET_WORD_MAX ULONG_MAX
209
/* Number of bits in a bitset_word_t.  */
210
#define BITSET_WORD_BITS (sizeof (bitset_word_t) * CHAR_BIT)
211
/* Number of bitset_word_t in a bit_set.  */
212
#define BITSET_WORDS (SBC_MAX / BITSET_WORD_BITS)
213
typedef bitset_word_t bitset_t[BITSET_WORDS];
214
typedef bitset_word_t *re_bitset_ptr_t;
215
typedef const bitset_word_t *re_const_bitset_ptr_t;
216

217
/* Bit-level operations on a bitset (an array of bitset_word_t).
   SET may be any expression yielding a pointer to the first word and I
   any integer expression; both are fully parenthesized in the expansion
   so that arguments such as `a + b' are handled correctly.
   Note that I is evaluated twice by bitset_set, bitset_clear and
   bitset_contain, so it must not have side effects.  */

/* Turn bit I of SET on.  */
#define bitset_set(set,i) \
  ((set)[(i) / BITSET_WORD_BITS] \
   |= (bitset_word_t) 1 << ((i) % BITSET_WORD_BITS))
/* Turn bit I of SET off.  */
#define bitset_clear(set,i) \
  ((set)[(i) / BITSET_WORD_BITS] \
   &= ~((bitset_word_t) 1 << ((i) % BITSET_WORD_BITS)))
/* Nonzero (the masked word, not necessarily 1) iff bit I of SET is on.  */
#define bitset_contain(set,i) \
  ((set)[(i) / BITSET_WORD_BITS] \
   & ((bitset_word_t) 1 << ((i) % BITSET_WORD_BITS)))
/* Whole-set operations; each touches exactly sizeof (bitset_t) bytes.  */
#define bitset_empty(set) memset (set, '\0', sizeof (bitset_t))
#define bitset_set_all(set) memset (set, '\xff', sizeof (bitset_t))
#define bitset_copy(dest,src) memcpy (dest, src, sizeof (bitset_t))
226

227
#define PREV_WORD_CONSTRAINT 0x0001
228
#define PREV_NOTWORD_CONSTRAINT 0x0002
229
#define NEXT_WORD_CONSTRAINT 0x0004
230
#define NEXT_NOTWORD_CONSTRAINT 0x0008
231
#define PREV_NEWLINE_CONSTRAINT 0x0010
232
#define NEXT_NEWLINE_CONSTRAINT 0x0020
233
#define PREV_BEGBUF_CONSTRAINT 0x0040
234
#define NEXT_ENDBUF_CONSTRAINT 0x0080
235
#define WORD_DELIM_CONSTRAINT 0x0100
236
#define NOT_WORD_DELIM_CONSTRAINT 0x0200
237

238
typedef enum
239
{
240
  INSIDE_WORD = PREV_WORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
241
  WORD_FIRST = PREV_NOTWORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
242
  WORD_LAST = PREV_WORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT,
243
  INSIDE_NOTWORD = PREV_NOTWORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT,
244
  LINE_FIRST = PREV_NEWLINE_CONSTRAINT,
245
  LINE_LAST = NEXT_NEWLINE_CONSTRAINT,
246
  BUF_FIRST = PREV_BEGBUF_CONSTRAINT,
247
  BUF_LAST = NEXT_ENDBUF_CONSTRAINT,
248
  WORD_DELIM = WORD_DELIM_CONSTRAINT,
249
  NOT_WORD_DELIM = NOT_WORD_DELIM_CONSTRAINT
250
} re_context_type;
251

252
typedef struct
253
{
254
  int alloc;
255
  int nelem;
256
  int *elems;
257
} re_node_set;
258

259
typedef enum
260
{
261
  NON_TYPE = 0,
262

263
  /* Node type, These are used by token, node, tree.  */
264
  CHARACTER = 1,
265
  END_OF_RE = 2,
266
  SIMPLE_BRACKET = 3,
267
  OP_BACK_REF = 4,
268
  OP_PERIOD = 5,
269
#ifdef RE_ENABLE_I18N
270
  COMPLEX_BRACKET = 6,
271
  OP_UTF8_PERIOD = 7,
272
#endif /* RE_ENABLE_I18N */
273

274
  /* We define EPSILON_BIT as a macro so that OP_OPEN_SUBEXP is used
275
     when the debugger shows values of this enum type.  */
276
#define EPSILON_BIT 8
277
  OP_OPEN_SUBEXP = EPSILON_BIT | 0,
278
  OP_CLOSE_SUBEXP = EPSILON_BIT | 1,
279
  OP_ALT = EPSILON_BIT | 2,
280
  OP_DUP_ASTERISK = EPSILON_BIT | 3,
281
  ANCHOR = EPSILON_BIT | 4,
282

283
  /* Tree type, these are used only by tree. */
284
  CONCAT = 16,
285
  SUBEXP = 17,
286

287
  /* Token type, these are used only by token.  */
288
  OP_DUP_PLUS = 18,
289
  OP_DUP_QUESTION,
290
  OP_OPEN_BRACKET,
291
  OP_CLOSE_BRACKET,
292
  OP_CHARSET_RANGE,
293
  OP_OPEN_DUP_NUM,
294
  OP_CLOSE_DUP_NUM,
295
  OP_NON_MATCH_LIST,
296
  OP_OPEN_COLL_ELEM,
297
  OP_CLOSE_COLL_ELEM,
298
  OP_OPEN_EQUIV_CLASS,
299
  OP_CLOSE_EQUIV_CLASS,
300
  OP_OPEN_CHAR_CLASS,
301
  OP_CLOSE_CHAR_CLASS,
302
  OP_WORD,
303
  OP_NOTWORD,
304
  OP_SPACE,
305
  OP_NOTSPACE,
306
  BACK_SLASH
307

308
} re_token_type_t;
309

310
#ifdef RE_ENABLE_I18N
311
typedef struct
312
{
313
  /* Multibyte characters.  */
314
  wchar_t *mbchars;
315

316
  /* Collating symbols.  */
317
# ifdef _LIBC
318
  int32_t *coll_syms;
319
# endif
320

321
  /* Equivalence classes. */
322
# ifdef _LIBC
323
  int32_t *equiv_classes;
324
# endif
325

326
  /* Range expressions. */
327
# ifdef _LIBC
328
  uint32_t *range_starts;
329
  uint32_t *range_ends;
330
# else /* not _LIBC */
331
  wchar_t *range_starts;
332
  wchar_t *range_ends;
333
# endif /* not _LIBC */
334

335
  /* Character classes. */
336
  wctype_t *char_classes;
337

338
  /* If this character set is the non-matching list.  */
339
  unsigned int non_match : 1;
340

341
  /* # of multibyte characters.  */
342
  int nmbchars;
343

344
  /* # of collating symbols.  */
345
  int ncoll_syms;
346

347
  /* # of equivalence classes. */
348
  int nequiv_classes;
349

350
  /* # of range expressions. */
351
  int nranges;
352

353
  /* # of character classes. */
354
  int nchar_classes;
355
} re_charset_t;
356
#endif /* RE_ENABLE_I18N */
357

358
typedef struct
359
{
360
  union
361
  {
362
    unsigned char c;		/* for CHARACTER */
363
    re_bitset_ptr_t sbcset;	/* for SIMPLE_BRACKET */
364
#ifdef RE_ENABLE_I18N
365
    re_charset_t *mbcset;	/* for COMPLEX_BRACKET */
366
#endif /* RE_ENABLE_I18N */
367
    int idx;			/* for BACK_REF */
368
    re_context_type ctx_type;	/* for ANCHOR */
369
  } opr;
370
#if __GNUC__ >= 2
371
  re_token_type_t type : 8;
372
#else
373
  re_token_type_t type;
374
#endif
375
  unsigned int constraint : 10;	/* context constraint */
376
  unsigned int duplicated : 1;
377
  unsigned int opt_subexp : 1;
378
#ifdef RE_ENABLE_I18N
379
  unsigned int accept_mb : 1;
380
  /* These 2 bits can be moved into the union if needed (e.g. if running out
381
     of bits; move opr.c to opr.c.c and move the flags to opr.c.flags).  */
382
  unsigned int mb_partial : 1;
383
#endif
384
  unsigned int word_char : 1;
385
} re_token_t;
386

387
#define IS_EPSILON_NODE(type) ((type) & EPSILON_BIT)
388

389
struct re_string_t
390
{
391
  /* Indicate the raw buffer which is the original string passed as an
392
     argument of regexec(), re_search(), etc..  */
393
  const unsigned char *raw_mbs;
394
  /* Store the multibyte string.  In case of "case insensitive mode" like
395
     REG_ICASE, upper cases of the string are stored, otherwise MBS points
396
     the same address that RAW_MBS points.  */
397
  unsigned char *mbs;
398
#ifdef RE_ENABLE_I18N
399
  /* Store the wide character string which is corresponding to MBS.  */
400
  wint_t *wcs;
401
  int *offsets;
402
  mbstate_t cur_state;
403
#endif
404
  /* Index in RAW_MBS.  Each character mbs[i] corresponds to
405
     raw_mbs[raw_mbs_idx + i].  */
406
  int raw_mbs_idx;
407
  /* The length of the valid characters in the buffers.  */
408
  int valid_len;
409
  /* The corresponding number of bytes in raw_mbs array.  */
410
  int valid_raw_len;
411
  /* The length of the buffers MBS and WCS.  */
412
  int bufs_len;
413
  /* The index in MBS, which is updated by re_string_fetch_byte.  */
414
  int cur_idx;
415
  /* length of RAW_MBS array.  */
416
  int raw_len;
417
  /* This is RAW_LEN - RAW_MBS_IDX + VALID_LEN - VALID_RAW_LEN.  */
418
  int len;
419
  /* End of the buffer may be shorter than its length in the cases such
420
     as re_match_2, re_search_2.  Then, we use STOP for end of the buffer
421
     instead of LEN.  */
422
  int raw_stop;
423
  /* This is RAW_STOP - RAW_MBS_IDX adjusted through OFFSETS.  */
424
  int stop;
425

426
  /* The context of mbs[0].  We store the context independently, since
427
     the context of mbs[0] may be different from raw_mbs[0], which is
428
     the beginning of the input string.  */
429
  unsigned int tip_context;
430
  /* The translation passed as a part of an argument of re_compile_pattern.  */
431
  RE_TRANSLATE_TYPE trans;
432
  /* Copy of re_dfa_t's word_char.  */
433
  re_const_bitset_ptr_t word_char;
434
  /* 1 if REG_ICASE.  */
435
  unsigned char icase;
436
  unsigned char is_utf8;
437
  unsigned char map_notascii;
438
  unsigned char mbs_allocated;
439
  unsigned char offsets_needed;
440
  unsigned char newline_anchor;
441
  unsigned char word_ops_used;
442
  int mb_cur_max;
443
};
444
typedef struct re_string_t re_string_t;
445

446

447
struct re_dfa_t;
448
typedef struct re_dfa_t re_dfa_t;
449

450
#ifndef _LIBC
451
# ifdef __i386__
452
#  define internal_function   __attribute ((regparm (3), stdcall))
453
# else
454
#  define internal_function
455
# endif
456
#endif
457

458
static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
459
						int new_buf_len)
460
     internal_function;
461
#ifdef RE_ENABLE_I18N
462
static void build_wcs_buffer (re_string_t *pstr) internal_function;
463
static int build_wcs_upper_buffer (re_string_t *pstr) internal_function;
464
#endif /* RE_ENABLE_I18N */
465
static void build_upper_buffer (re_string_t *pstr) internal_function;
466
static void re_string_translate_buffer (re_string_t *pstr) internal_function;
467
static unsigned int re_string_context_at (const re_string_t *input, int idx,
468
					  int eflags)
469
     internal_function __attribute ((pure));
470
/* Accessor macros for a `re_string_t *' PSTR.  Every macro parameter is
   parenthesized in the expansion so that arbitrary expressions may be
   passed.  PSTR and IDX may be evaluated more than once, so they must
   not have side effects.  */

/* Byte of MBS located OFFSET bytes after the current index.  */
#define re_string_peek_byte(pstr, offset) \
  ((pstr)->mbs[(pstr)->cur_idx + (offset)])
/* Byte of MBS at the current index; advances the current index by one.  */
#define re_string_fetch_byte(pstr) \
  ((pstr)->mbs[(pstr)->cur_idx++])
/* True if IDX equals VALID_LEN or WCS[IDX] is not WEOF.  */
#define re_string_first_byte(pstr, idx) \
  ((idx) == (pstr)->valid_len || (pstr)->wcs[(idx)] != WEOF)
/* True if WCS[IDX] is not WEOF and either IDX is the last valid index or
   WCS[IDX + 1] is not WEOF either.  */
#define re_string_is_single_byte_char(pstr, idx) \
  ((pstr)->wcs[(idx)] != WEOF && ((pstr)->valid_len == (idx) + 1 \
				  || (pstr)->wcs[(idx) + 1] != WEOF))
/* True once the current index has reached (or passed) STOP.  */
#define re_string_eoi(pstr) ((pstr)->stop <= (pstr)->cur_idx)
#define re_string_cur_idx(pstr) ((pstr)->cur_idx)
#define re_string_get_buffer(pstr) ((pstr)->mbs)
#define re_string_length(pstr) ((pstr)->len)
#define re_string_byte_at(pstr,idx) ((pstr)->mbs[(idx)])
/* Move the current index forward by IDX bytes / set it to IDX.  */
#define re_string_skip_bytes(pstr,idx) ((pstr)->cur_idx += (idx))
#define re_string_set_index(pstr,idx) ((pstr)->cur_idx = (idx))
486

487
#ifdef __GNUC__
488
# define alloca(size)   __builtin_alloca (size)
489
# define HAVE_ALLOCA 1
490
#elif defined(_MSC_VER)
491
# include <malloc.h>
492
# define alloca _alloca
493
# define HAVE_ALLOCA 1
494
#else
495
# error No alloca()
496
#endif
497

498
#ifndef _LIBC
499
# if HAVE_ALLOCA
500
/* The OS usually guarantees only one guard page at the bottom of the stack,
501
   and a page size can be as small as 4096 bytes.  So we cannot safely
502
   allocate anything larger than 4096 bytes.  Also care for the possibility
503
   of a few compiler-allocated temporary stack slots.  */
504
#  define __libc_use_alloca(n) ((n) < 4032)
505
# else
506
/* alloca is implemented with malloc, so just use malloc.  */
507
#  define __libc_use_alloca(n) 0
508
# endif
509
#endif
510

511
#define re_malloc(t,n) ((t *) malloc ((n) * sizeof (t)))
512
#define re_realloc(p,t,n) ((t *) realloc (p, (n) * sizeof (t)))
513
#define re_free(p) free (p)
514

515
struct bin_tree_t
516
{
517
  struct bin_tree_t *parent;
518
  struct bin_tree_t *left;
519
  struct bin_tree_t *right;
520
  struct bin_tree_t *first;
521
  struct bin_tree_t *next;
522

523
  re_token_t token;
524

525
  /* `node_idx' is the index in dfa->nodes, if `type' == 0.
     Otherwise `type' indicates the type of this node.  */
527
  int node_idx;
528
};
529
typedef struct bin_tree_t bin_tree_t;
530

531
#define BIN_TREE_STORAGE_SIZE \
532
  ((1024 - sizeof (void *)) / sizeof (bin_tree_t))
533

534
struct bin_tree_storage_t
535
{
536
  struct bin_tree_storage_t *next;
537
  bin_tree_t data[BIN_TREE_STORAGE_SIZE];
538
};
539
typedef struct bin_tree_storage_t bin_tree_storage_t;
540

541
#define CONTEXT_WORD 1
542
#define CONTEXT_NEWLINE (CONTEXT_WORD << 1)
543
#define CONTEXT_BEGBUF (CONTEXT_NEWLINE << 1)
544
#define CONTEXT_ENDBUF (CONTEXT_BEGBUF << 1)
545

546
#define IS_WORD_CONTEXT(c) ((c) & CONTEXT_WORD)
547
#define IS_NEWLINE_CONTEXT(c) ((c) & CONTEXT_NEWLINE)
548
#define IS_BEGBUF_CONTEXT(c) ((c) & CONTEXT_BEGBUF)
549
#define IS_ENDBUF_CONTEXT(c) ((c) & CONTEXT_ENDBUF)
550
#define IS_ORDINARY_CONTEXT(c) ((c) == 0)
551

552
#define IS_WORD_CHAR(ch) (isalnum (ch) || (ch) == '_')
553
#define IS_NEWLINE(ch) ((ch) == NEWLINE_CHAR)
554
#define IS_WIDE_WORD_CHAR(ch) (iswalnum (ch) || (ch) == L'_')
555
#define IS_WIDE_NEWLINE(ch) ((ch) == WIDE_NEWLINE_CHAR)
556

557
/* Nonzero iff CONTEXT (a mask of CONTEXT_* bits describing the position
   before the current one) violates at least one of the PREV_* bits set in
   CONSTRAINT.  Both parameters are parenthesized at every use so that an
   expression such as `a | b' may be passed as CONSTRAINT.  */
#define NOT_SATISFY_PREV_CONSTRAINT(constraint,context) \
 ((((constraint) & PREV_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \
  || (((constraint) & PREV_NOTWORD_CONSTRAINT) && IS_WORD_CONTEXT (context)) \
  || (((constraint) & PREV_NEWLINE_CONSTRAINT) && !IS_NEWLINE_CONTEXT (context)) \
  || (((constraint) & PREV_BEGBUF_CONSTRAINT) && !IS_BEGBUF_CONTEXT (context)))

/* Same test for the NEXT_* bits of CONSTRAINT against the context that
   follows the current position.  */
#define NOT_SATISFY_NEXT_CONSTRAINT(constraint,context) \
 ((((constraint) & NEXT_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \
  || (((constraint) & NEXT_NOTWORD_CONSTRAINT) && IS_WORD_CONTEXT (context)) \
  || (((constraint) & NEXT_NEWLINE_CONSTRAINT) && !IS_NEWLINE_CONTEXT (context)) \
  || (((constraint) & NEXT_ENDBUF_CONSTRAINT) && !IS_ENDBUF_CONTEXT (context)))
568

569
struct re_dfastate_t
570
{
571
  unsigned int hash;
572
  re_node_set nodes;
573
  re_node_set non_eps_nodes;
574
  re_node_set inveclosure;
575
  re_node_set *entrance_nodes;
576
  struct re_dfastate_t **trtable, **word_trtable;
577
  unsigned int context : 4;
578
  unsigned int halt : 1;
579
  /* If this state can accept `multi byte'.
580
     Note that we refer to multibyte characters, and multi character
581
     collating elements as `multi byte'.  */
582
  unsigned int accept_mb : 1;
583
  /* If this state has backreference node(s).  */
584
  unsigned int has_backref : 1;
585
  unsigned int has_constraint : 1;
586
};
587
typedef struct re_dfastate_t re_dfastate_t;
588

589
struct re_state_table_entry
590
{
591
  int num;
592
  int alloc;
593
  re_dfastate_t **array;
594
};
595

596
/* Array type used in re_sub_match_last_t and re_sub_match_top_t.  */
597

598
typedef struct
599
{
600
  int next_idx;
601
  int alloc;
602
  re_dfastate_t **array;
603
} state_array_t;
604

605
/* Store information about the node NODE whose type is OP_CLOSE_SUBEXP.  */
606

607
typedef struct
608
{
609
  int node;
610
  int str_idx; /* The position NODE match at.  */
611
  state_array_t path;
612
} re_sub_match_last_t;
613

614
/* Store information about the node NODE whose type is OP_OPEN_SUBEXP.
615
   And information about the node, whose type is OP_CLOSE_SUBEXP,
616
   corresponding to NODE is stored in LASTS.  */
617

618
typedef struct
619
{
620
  int str_idx;
621
  int node;
622
  state_array_t *path;
623
  int alasts; /* Allocation size of LASTS.  */
624
  int nlasts; /* The number of LASTS.  */
625
  re_sub_match_last_t **lasts;
626
} re_sub_match_top_t;
627

628
struct re_backref_cache_entry
629
{
630
  int node;
631
  int str_idx;
632
  int subexp_from;
633
  int subexp_to;
634
  char more;
635
  char unused;
636
  unsigned short int eps_reachable_subexps_map;
637
};
638

639
typedef struct
640
{
641
  /* The string object corresponding to the input string.  */
642
  re_string_t input;
643
#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
644
  const re_dfa_t *const dfa;
645
#else
646
  const re_dfa_t *dfa;
647
#endif
648
  /* EFLAGS of the argument of regexec.  */
649
  int eflags;
650
  /* Where the matching ends.  */
651
  int match_last;
652
  int last_node;
653
  /* The state log used by the matcher.  */
654
  re_dfastate_t **state_log;
655
  int state_log_top;
656
  /* Back reference cache.  */
657
  int nbkref_ents;
658
  int abkref_ents;
659
  struct re_backref_cache_entry *bkref_ents;
660
  int max_mb_elem_len;
661
  int nsub_tops;
662
  int asub_tops;
663
  re_sub_match_top_t **sub_tops;
664
} re_match_context_t;
665

666
typedef struct
667
{
668
  re_dfastate_t **sifted_states;
669
  re_dfastate_t **limited_states;
670
  int last_node;
671
  int last_str_idx;
672
  re_node_set limits;
673
} re_sift_context_t;
674

675
struct re_fail_stack_ent_t
676
{
677
  int idx;
678
  int node;
679
  regmatch_t *regs;
680
  re_node_set eps_via_nodes;
681
};
682

683
struct re_fail_stack_t
684
{
685
  int num;
686
  int alloc;
687
  struct re_fail_stack_ent_t *stack;
688
};
689

690
struct re_dfa_t
691
{
692
  re_token_t *nodes;
693
  size_t nodes_alloc;
694
  size_t nodes_len;
695
  int *nexts;
696
  int *org_indices;
697
  re_node_set *edests;
698
  re_node_set *eclosures;
699
  re_node_set *inveclosures;
700
  struct re_state_table_entry *state_table;
701
  re_dfastate_t *init_state;
702
  re_dfastate_t *init_state_word;
703
  re_dfastate_t *init_state_nl;
704
  re_dfastate_t *init_state_begbuf;
705
  bin_tree_t *str_tree;
706
  bin_tree_storage_t *str_tree_storage;
707
  re_bitset_ptr_t sb_char;
708
  int str_tree_storage_idx;
709

710
  /* number of subexpressions `re_nsub' is in regex_t.  */
711
  unsigned int state_hash_mask;
712
  int init_node;
713
  int nbackref; /* The number of backreference in this dfa.  */
714

715
  /* Bitmap expressing which backreference is used.  */
716
  bitset_word_t used_bkref_map;
717
  bitset_word_t completed_bkref_map;
718

719
  unsigned int has_plural_match : 1;
720
  /* If this dfa has "multibyte node", which is a backreference or
721
     a node which can accept multibyte character or multi character
722
     collating element.  */
723
  unsigned int has_mb_node : 1;
724
  unsigned int is_utf8 : 1;
725
  unsigned int map_notascii : 1;
726
  unsigned int word_ops_used : 1;
727
  int mb_cur_max;
728
  bitset_t word_char;
729
  reg_syntax_t syntax;
730
  int *subexp_map;
731
#ifdef DEBUG
732
  char* re_str;
733
#endif
734
  __libc_lock_define (, lock)
735
};
736

737
#define re_node_set_init_empty(set) memset (set, '\0', sizeof (re_node_set))
738
#define re_node_set_remove(set,id) \
739
  (re_node_set_remove_at (set, re_node_set_contains (set, id) - 1))
740
#define re_node_set_empty(p) ((p)->nelem = 0)
741
#define re_node_set_free(set) re_free ((set)->elems)
742

743

744
typedef enum
745
{
746
  SB_CHAR,
747
  MB_CHAR,
748
  EQUIV_CLASS,
749
  COLL_SYM,
750
  CHAR_CLASS
751
} bracket_elem_type;
752

753
typedef struct
754
{
755
  bracket_elem_type type;
756
  union
757
  {
758
    unsigned char ch;
759
    unsigned char *name;
760
    wchar_t wch;
761
  } opr;
762
} bracket_elem_t;
763

764

765
/* Inline functions for bitset operation.  */
766
static inline void
767
bitset_not (bitset_t set)
768
{
769
  int bitset_i;
770
  for (bitset_i = 0; bitset_i < BITSET_WORDS; ++bitset_i)
771
    set[bitset_i] = ~set[bitset_i];
772
}
773

774
static inline void
775
bitset_merge (bitset_t dest, const bitset_t src)
776
{
777
  int bitset_i;
778
  for (bitset_i = 0; bitset_i < BITSET_WORDS; ++bitset_i)
779
    dest[bitset_i] |= src[bitset_i];
780
}
781

782
static inline void
783
bitset_mask (bitset_t dest, const bitset_t src)
784
{
785
  int bitset_i;
786
  for (bitset_i = 0; bitset_i < BITSET_WORDS; ++bitset_i)
787
    dest[bitset_i] &= src[bitset_i];
788
}
789

790
#ifdef RE_ENABLE_I18N
791
/* Inline functions for re_string.  */
792
static inline int
793
internal_function __attribute ((pure))
794
re_string_char_size_at (const re_string_t *pstr, int idx)
795
{
796
  int byte_idx;
797
  if (pstr->mb_cur_max == 1)
798
    return 1;
799
  for (byte_idx = 1; idx + byte_idx < pstr->valid_len; ++byte_idx)
800
    if (pstr->wcs[idx + byte_idx] != WEOF)
801
      break;
802
  return byte_idx;
803
}
804

805
static inline wint_t
806
internal_function __attribute ((pure))
807
re_string_wchar_at (const re_string_t *pstr, int idx)
808
{
809
  if (pstr->mb_cur_max == 1)
810
    return (wint_t) pstr->mbs[idx];
811
  return (wint_t) pstr->wcs[idx];
812
}
813

814
static int
815
internal_function __attribute ((pure))
816
re_string_elem_size_at (const re_string_t *pstr, int idx)
817
{
818
# ifdef _LIBC
819
  const unsigned char *p, *extra;
820
  const int32_t *table, *indirect;
821
  int32_t tmp;
822
#  include <locale/weight.h>
823
  uint_fast32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
824

825
  if (nrules != 0)
826
    {
827
      table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
828
      extra = (const unsigned char *)
829
	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
830
      indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
831
						_NL_COLLATE_INDIRECTMB);
832
      p = pstr->mbs + idx;
833
      tmp = findidx (&p);
834
      return p - pstr->mbs - idx;
835
    }
836
  else
837
# endif /* _LIBC */
838
    return 1;
839
}
840
#endif /* RE_ENABLE_I18N */
841

842
#endif /*  _REGEX_INTERNAL_H */
843

844
/******************************************************************************/
845
/******************************************************************************/
846
/******************************************************************************/
847
/* GKINCLUDE #include "regex_internal.c" */
848
/******************************************************************************/
849
/******************************************************************************/
850
/******************************************************************************/
851
/* Extended regular expression matching and search library.
852
   Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
853
   This file is part of the GNU C Library.
854
   Contributed by Isamu Hasegawa <[email protected]>.
855

856
   The GNU C Library is free software; you can redistribute it and/or
857
   modify it under the terms of the GNU Lesser General Public
858
   License as published by the Free Software Foundation; either
859
   version 2.1 of the License, or (at your option) any later version.
860

861
   The GNU C Library is distributed in the hope that it will be useful,
862
   but WITHOUT ANY WARRANTY; without even the implied warranty of
863
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
864
   Lesser General Public License for more details.
865

866
   You should have received a copy of the GNU Lesser General Public
867
   License along with the GNU C Library; if not, write to the Free
868
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
869
   02111-1307 USA.  */
870

871
static void re_string_construct_common (const char *str, int len,
872
					re_string_t *pstr,
873
					RE_TRANSLATE_TYPE trans, int icase,
874
					const re_dfa_t *dfa) internal_function;
875
static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
876
					  const re_node_set *nodes,
877
					  unsigned int hash) internal_function;
878
static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
879
					  const re_node_set *nodes,
880
					  unsigned int context,
881
					  unsigned int hash) internal_function;
882

883
/* Functions for string operation.  */
884

885
/* This function allocate the buffers.  It is necessary to call
886
   re_string_reconstruct before using the object.  */
887

888
static reg_errcode_t
889
internal_function
890
re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
891
		    RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
892
{
893
  reg_errcode_t ret;
894
  int init_buf_len;
895

896
  /* Ensure at least one character fits into the buffers.  */
897
  if (init_len < dfa->mb_cur_max)
898
    init_len = dfa->mb_cur_max;
899
  init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
900
  re_string_construct_common (str, len, pstr, trans, icase, dfa);
901

902
  ret = re_string_realloc_buffers (pstr, init_buf_len);
903
  if (BE (ret != REG_NOERROR, 0))
904
    return ret;
905

906
  pstr->word_char = dfa->word_char;
907
  pstr->word_ops_used = dfa->word_ops_used;
908
  pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
909
  pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
910
  pstr->valid_raw_len = pstr->valid_len;
911
  return REG_NOERROR;
912
}
913

914
/* This function allocate the buffers, and initialize them.  */
915

916
static reg_errcode_t
917
internal_function
918
re_string_construct (re_string_t *pstr, const char *str, int len,
919
		     RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
920
{
921
  reg_errcode_t ret;
922
  memset (pstr, '\0', sizeof (re_string_t));
923
  re_string_construct_common (str, len, pstr, trans, icase, dfa);
924

925
  if (len > 0)
926
    {
927
      ret = re_string_realloc_buffers (pstr, len + 1);
928
      if (BE (ret != REG_NOERROR, 0))
929
	return ret;
930
    }
931
  pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
932

933
  if (icase)
934
    {
935
#ifdef RE_ENABLE_I18N
936
      if (dfa->mb_cur_max > 1)
937
	{
938
	  while (1)
939
	    {
940
	      ret = build_wcs_upper_buffer (pstr);
941
	      if (BE (ret != REG_NOERROR, 0))
942
		return ret;
943
	      if (pstr->valid_raw_len >= len)
944
		break;
945
	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
946
		break;
947
	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
948
	      if (BE (ret != REG_NOERROR, 0))
949
		return ret;
950
	    }
951
	}
952
      else
953
#endif /* RE_ENABLE_I18N  */
954
	build_upper_buffer (pstr);
955
    }
956
  else
957
    {
958
#ifdef RE_ENABLE_I18N
959
      if (dfa->mb_cur_max > 1)
960
	build_wcs_buffer (pstr);
961
      else
962
#endif /* RE_ENABLE_I18N  */
963
	{
964
	  if (trans != NULL)
965
	    re_string_translate_buffer (pstr);
966
	  else
967
	    {
968
	      pstr->valid_len = pstr->bufs_len;
969
	      pstr->valid_raw_len = pstr->bufs_len;
970
	    }
971
	}
972
    }
973

974
  return REG_NOERROR;
975
}
976

977
/* Helper functions for re_string_allocate, and re_string_construct.  */
978

979
static reg_errcode_t
980
internal_function
981
re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
982
{
983
#ifdef RE_ENABLE_I18N
984
  if (pstr->mb_cur_max > 1)
985
    {
986
      wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
987
      if (BE (new_wcs == NULL, 0))
988
	return REG_ESPACE;
989
      pstr->wcs = new_wcs;
990
      if (pstr->offsets != NULL)
991
	{
992
	  int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
993
	  if (BE (new_offsets == NULL, 0))
994
	    return REG_ESPACE;
995
	  pstr->offsets = new_offsets;
996
	}
997
    }
998
#endif /* RE_ENABLE_I18N  */
999
  if (pstr->mbs_allocated)
1000
    {
1001
      unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
1002
					   new_buf_len);
1003
      if (BE (new_mbs == NULL, 0))
1004
	return REG_ESPACE;
1005
      pstr->mbs = new_mbs;
1006
    }
1007
  pstr->bufs_len = new_buf_len;
1008
  return REG_NOERROR;
1009
}
1010

1011

1012
static void
1013
internal_function
1014
re_string_construct_common (const char *str, int len, re_string_t *pstr,
1015
			    RE_TRANSLATE_TYPE trans, int icase,
1016
			    const re_dfa_t *dfa)
1017
{
1018
  pstr->raw_mbs = (const unsigned char *) str;
1019
  pstr->len = len;
1020
  pstr->raw_len = len;
1021
  pstr->trans = trans;
1022
  pstr->icase = icase ? 1 : 0;
1023
  pstr->mbs_allocated = (trans != NULL || icase);
1024
  pstr->mb_cur_max = dfa->mb_cur_max;
1025
  pstr->is_utf8 = dfa->is_utf8;
1026
  pstr->map_notascii = dfa->map_notascii;
1027
  pstr->stop = pstr->len;
1028
  pstr->raw_stop = pstr->stop;
1029
}
1030

1031
#ifdef RE_ENABLE_I18N
1032

1033
/* Build wide character buffer PSTR->WCS.
1034
   If the byte sequence of the string are:
1035
     <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
1036
   Then wide character buffer will be:
1037
     <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
1038
   We use WEOF for padding, they indicate that the position isn't
1039
   a first byte of a multibyte character.
1040

1041
   Note that this function assumes PSTR->VALID_LEN elements are already
1042
   built and starts from PSTR->VALID_LEN.  */
1043

1044
static void
1045
internal_function
1046
build_wcs_buffer (re_string_t *pstr)
1047
{
1048
#ifdef _LIBC
1049
  unsigned char buf[MB_LEN_MAX];
1050
  assert (MB_LEN_MAX >= pstr->mb_cur_max);
1051
#else
1052
  unsigned char buf[64];
1053
#endif
1054
  mbstate_t prev_st;
1055
  int byte_idx, end_idx, remain_len;
1056
  size_t mbclen;
1057

1058
  /* Build the buffers from pstr->valid_len to either pstr->len or
1059
     pstr->bufs_len.  */
1060
  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
1061
  for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
1062
    {
1063
      wchar_t wc;
1064
      const char *p;
1065

1066
      remain_len = end_idx - byte_idx;
1067
      prev_st = pstr->cur_state;
1068
      /* Apply the translation if we need.  */
1069
      if (BE (pstr->trans != NULL, 0))
1070
	{
1071
	  int i, ch;
1072

1073
	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
1074
	    {
1075
	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
1076
	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
1077
	    }
1078
	  p = (const char *) buf;
1079
	}
1080
      else
1081
	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
1082
      mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
1083
      if (BE (mbclen == (size_t) -2, 0))
1084
	{
1085
	  /* The buffer doesn't have enough space, finish to build.  */
1086
	  pstr->cur_state = prev_st;
1087
	  break;
1088
	}
1089
      else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
1090
	{
1091
	  /* We treat these cases as a singlebyte character.  */
1092
	  mbclen = 1;
1093
	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
1094
	  if (BE (pstr->trans != NULL, 0))
1095
	    wc = pstr->trans[wc];
1096
	  pstr->cur_state = prev_st;
1097
	}
1098

1099
      /* Write wide character and padding.  */
1100
      pstr->wcs[byte_idx++] = wc;
1101
      /* Write paddings.  */
1102
      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
1103
	pstr->wcs[byte_idx++] = WEOF;
1104
    }
1105
  pstr->valid_len = byte_idx;
1106
  pstr->valid_raw_len = byte_idx;
1107
}
1108

1109
/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
1110
   but for REG_ICASE.  */
1111

1112
static reg_errcode_t
1113
internal_function
1114
build_wcs_upper_buffer (re_string_t *pstr)
1115
{
1116
  mbstate_t prev_st;
1117
  int src_idx, byte_idx, end_idx, remain_len;
1118
  size_t mbclen;
1119
#ifdef _LIBC
1120
  char buf[MB_LEN_MAX];
1121
  assert (MB_LEN_MAX >= pstr->mb_cur_max);
1122
#else
1123
  char buf[64];
1124
#endif
1125

1126
  byte_idx = pstr->valid_len;
1127
  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
1128

1129
  /* The following optimization assumes that ASCII characters can be
1130
     mapped to wide characters with a simple cast.  */
1131
  if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
1132
    {
1133
      while (byte_idx < end_idx)
1134
	{
1135
	  wchar_t wc;
1136

1137
	  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
1138
	      && mbsinit (&pstr->cur_state))
1139
	    {
1140
	      /* In case of a singlebyte character.  */
1141
	      pstr->mbs[byte_idx]
1142
		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
1143
	      /* The next step uses the assumption that wchar_t is encoded
1144
		 ASCII-safe: all ASCII values can be converted like this.  */
1145
	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
1146
	      ++byte_idx;
1147
	      continue;
1148
	    }
1149

1150
	  remain_len = end_idx - byte_idx;
1151
	  prev_st = pstr->cur_state;
1152
	  mbclen = mbrtowc (&wc,
1153
			    ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
1154
			     + byte_idx), remain_len, &pstr->cur_state);
1155
	  if (BE (mbclen + 2 > 2, 1))
1156
	    {
1157
	      wchar_t wcu = wc;
1158
	      if (iswlower (wc))
1159
		{
1160
		  size_t mbcdlen;
1161

1162
		  wcu = towupper (wc);
1163
		  mbcdlen = wcrtomb (buf, wcu, &prev_st);
1164
		  if (BE (mbclen == mbcdlen, 1))
1165
		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
1166
		  else
1167
		    {
1168
		      src_idx = byte_idx;
1169
		      goto offsets_needed;
1170
		    }
1171
		}
1172
	      else
1173
		memcpy (pstr->mbs + byte_idx,
1174
			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
1175
	      pstr->wcs[byte_idx++] = wcu;
1176
	      /* Write paddings.  */
1177
	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
1178
		pstr->wcs[byte_idx++] = WEOF;
1179
	    }
1180
	  else if (mbclen == (size_t) -1 || mbclen == 0)
1181
	    {
1182
	      /* It is an invalid character or '\0'.  Just use the byte.  */
1183
	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
1184
	      pstr->mbs[byte_idx] = ch;
1185
	      /* And also cast it to wide char.  */
1186
	      pstr->wcs[byte_idx++] = (wchar_t) ch;
1187
	      if (BE (mbclen == (size_t) -1, 0))
1188
		pstr->cur_state = prev_st;
1189
	    }
1190
	  else
1191
	    {
1192
	      /* The buffer doesn't have enough space, finish to build.  */
1193
	      pstr->cur_state = prev_st;
1194
	      break;
1195
	    }
1196
	}
1197
      pstr->valid_len = byte_idx;
1198
      pstr->valid_raw_len = byte_idx;
1199
      return REG_NOERROR;
1200
    }
1201
  else
1202
    for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
1203
      {
1204
	wchar_t wc;
1205
	const char *p;
1206
      offsets_needed:
1207
	remain_len = end_idx - byte_idx;
1208
	prev_st = pstr->cur_state;
1209
	if (BE (pstr->trans != NULL, 0))
1210
	  {
1211
	    int i, ch;
1212

1213
	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
1214
	      {
1215
		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
1216
		buf[i] = pstr->trans[ch];
1217
	      }
1218
	    p = (const char *) buf;
1219
	  }
1220
	else
1221
	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
1222
	mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
1223
	if (BE (mbclen + 2 > 2, 1))
1224
	  {
1225
	    wchar_t wcu = wc;
1226
	    if (iswlower (wc))
1227
	      {
1228
		size_t mbcdlen;
1229

1230
		wcu = towupper (wc);
1231
		mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
1232
		if (BE (mbclen == mbcdlen, 1))
1233
		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
1234
		else if (mbcdlen != (size_t) -1)
1235
		  {
1236
		    size_t i;
1237

1238
		    if (byte_idx + mbcdlen > pstr->bufs_len)
1239
		      {
1240
			pstr->cur_state = prev_st;
1241
			break;
1242
		      }
1243

1244
		    if (pstr->offsets == NULL)
1245
		      {
1246
			pstr->offsets = re_malloc (int, pstr->bufs_len);
1247

1248
			if (pstr->offsets == NULL)
1249
			  return REG_ESPACE;
1250
		      }
1251
		    if (!pstr->offsets_needed)
1252
		      {
1253
			for (i = 0; i < (size_t) byte_idx; ++i)
1254
			  pstr->offsets[i] = i;
1255
			pstr->offsets_needed = 1;
1256
		      }
1257

1258
		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
1259
		    pstr->wcs[byte_idx] = wcu;
1260
		    pstr->offsets[byte_idx] = src_idx;
1261
		    for (i = 1; i < mbcdlen; ++i)
1262
		      {
1263
			pstr->offsets[byte_idx + i]
1264
			  = src_idx + (i < mbclen ? i : mbclen - 1);
1265
			pstr->wcs[byte_idx + i] = WEOF;
1266
		      }
1267
		    pstr->len += mbcdlen - mbclen;
1268
		    if (pstr->raw_stop > src_idx)
1269
		      pstr->stop += mbcdlen - mbclen;
1270
		    end_idx = (pstr->bufs_len > pstr->len)
1271
			      ? pstr->len : pstr->bufs_len;
1272
		    byte_idx += mbcdlen;
1273
		    src_idx += mbclen;
1274
		    continue;
1275
		  }
1276
                else
1277
                  memcpy (pstr->mbs + byte_idx, p, mbclen);
1278
	      }
1279
	    else
1280
	      memcpy (pstr->mbs + byte_idx, p, mbclen);
1281

1282
	    if (BE (pstr->offsets_needed != 0, 0))
1283
	      {
1284
		size_t i;
1285
		for (i = 0; i < mbclen; ++i)
1286
		  pstr->offsets[byte_idx + i] = src_idx + i;
1287
	      }
1288
	    src_idx += mbclen;
1289

1290
	    pstr->wcs[byte_idx++] = wcu;
1291
	    /* Write paddings.  */
1292
	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
1293
	      pstr->wcs[byte_idx++] = WEOF;
1294
	  }
1295
	else if (mbclen == (size_t) -1 || mbclen == 0)
1296
	  {
1297
	    /* It is an invalid character or '\0'.  Just use the byte.  */
1298
	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
1299

1300
	    if (BE (pstr->trans != NULL, 0))
1301
	      ch = pstr->trans [ch];
1302
	    pstr->mbs[byte_idx] = ch;
1303

1304
	    if (BE (pstr->offsets_needed != 0, 0))
1305
	      pstr->offsets[byte_idx] = src_idx;
1306
	    ++src_idx;
1307

1308
	    /* And also cast it to wide char.  */
1309
	    pstr->wcs[byte_idx++] = (wchar_t) ch;
1310
	    if (BE (mbclen == (size_t) -1, 0))
1311
	      pstr->cur_state = prev_st;
1312
	  }
1313
	else
1314
	  {
1315
	    /* The buffer doesn't have enough space, finish to build.  */
1316
	    pstr->cur_state = prev_st;
1317
	    break;
1318
	  }
1319
      }
1320
  pstr->valid_len = byte_idx;
1321
  pstr->valid_raw_len = src_idx;
1322
  return REG_NOERROR;
1323
}
1324

1325
/* Skip characters until the index becomes greater than NEW_RAW_IDX.
1326
   Return the index.  */
1327

1328
static int
1329
internal_function
1330
re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
1331
{
1332
  mbstate_t prev_st;
1333
  int rawbuf_idx;
1334
  size_t mbclen;
1335
  wchar_t wc = WEOF;
1336

1337
  /* Skip the characters which are not necessary to check.  */
1338
  for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
1339
       rawbuf_idx < new_raw_idx;)
1340
    {
1341
      int remain_len;
1342
      remain_len = pstr->len - rawbuf_idx;
1343
      prev_st = pstr->cur_state;
1344
      mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
1345
			remain_len, &pstr->cur_state);
1346
      if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
1347
	{
1348
	  /* We treat these cases as a single byte character.  */
1349
	  if (mbclen == 0 || remain_len == 0)
1350
	    wc = L'\0';
1351
	  else
1352
	    wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
1353
	  mbclen = 1;
1354
	  pstr->cur_state = prev_st;
1355
	}
1356
      /* Then proceed the next character.  */
1357
      rawbuf_idx += mbclen;
1358
    }
1359
  *last_wc = (wint_t) wc;
1360
  return rawbuf_idx;
1361
}
1362
#endif /* RE_ENABLE_I18N  */
1363

1364
/* Build the buffer PSTR->MBS, and apply the translation if we need.
1365
   This function is used in case of REG_ICASE.  */
1366

1367
static void
1368
internal_function
1369
build_upper_buffer (re_string_t *pstr)
1370
{
1371
  int char_idx, end_idx;
1372
  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
1373

1374
  for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
1375
    {
1376
      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
1377
      if (BE (pstr->trans != NULL, 0))
1378
	ch = pstr->trans[ch];
1379
      if (islower (ch))
1380
	pstr->mbs[char_idx] = toupper (ch);
1381
      else
1382
	pstr->mbs[char_idx] = ch;
1383
    }
1384
  pstr->valid_len = char_idx;
1385
  pstr->valid_raw_len = char_idx;
1386
}
1387

1388
/* Apply TRANS to the buffer in PSTR.  */
1389

1390
static void
1391
internal_function
1392
re_string_translate_buffer (re_string_t *pstr)
1393
{
1394
  int buf_idx, end_idx;
1395
  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
1396

1397
  for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
1398
    {
1399
      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
1400
      pstr->mbs[buf_idx] = pstr->trans[ch];
1401
    }
1402

1403
  pstr->valid_len = buf_idx;
1404
  pstr->valid_raw_len = buf_idx;
1405
}
1406

1407
/* This function re-construct the buffers.
1408
   Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
1409
   convert to upper case in case of REG_ICASE, apply translation.  */
1410

1411
static reg_errcode_t
1412
internal_function
1413
re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
1414
{
1415
  int offset = idx - pstr->raw_mbs_idx;
1416
  if (BE (offset < 0, 0))
1417
    {
1418
      /* Reset buffer.  */
1419
#ifdef RE_ENABLE_I18N
1420
      if (pstr->mb_cur_max > 1)
1421
	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
1422
#endif /* RE_ENABLE_I18N */
1423
      pstr->len = pstr->raw_len;
1424
      pstr->stop = pstr->raw_stop;
1425
      pstr->valid_len = 0;
1426
      pstr->raw_mbs_idx = 0;
1427
      pstr->valid_raw_len = 0;
1428
      pstr->offsets_needed = 0;
1429
      pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
1430
			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
1431
      if (!pstr->mbs_allocated)
1432
	pstr->mbs = (unsigned char *) pstr->raw_mbs;
1433
      offset = idx;
1434
    }
1435

1436
  if (BE (offset != 0, 1))
1437
    {
1438
      /* Should the already checked characters be kept?  */
1439
      if (BE (offset < pstr->valid_raw_len, 1))
1440
	{
1441
	  /* Yes, move them to the front of the buffer.  */
1442
#ifdef RE_ENABLE_I18N
1443
	  if (BE (pstr->offsets_needed, 0))
1444
	    {
1445
	      int low = 0, high = pstr->valid_len, mid;
1446
	      do
1447
		{
1448
		  mid = (high + low) / 2;
1449
		  if (pstr->offsets[mid] > offset)
1450
		    high = mid;
1451
		  else if (pstr->offsets[mid] < offset)
1452
		    low = mid + 1;
1453
		  else
1454
		    break;
1455
		}
1456
	      while (low < high);
1457
	      if (pstr->offsets[mid] < offset)
1458
		++mid;
1459
	      pstr->tip_context = re_string_context_at (pstr, mid - 1,
1460
							eflags);
1461
	      /* This can be quite complicated, so handle specially
1462
		 only the common and easy case where the character with
1463
		 different length representation of lower and upper
1464
		 case is present at or after offset.  */
1465
	      if (pstr->valid_len > offset
1466
		  && mid == offset && pstr->offsets[mid] == offset)
1467
		{
1468
		  memmove (pstr->wcs, pstr->wcs + offset,
1469
			   (pstr->valid_len - offset) * sizeof (wint_t));
1470
		  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
1471
		  pstr->valid_len -= offset;
1472
		  pstr->valid_raw_len -= offset;
1473
		  for (low = 0; low < pstr->valid_len; low++)
1474
		    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
1475
		}
1476
	      else
1477
		{
1478
		  /* Otherwise, just find out how long the partial multibyte
1479
		     character at offset is and fill it with WEOF/255.  */
1480
		  pstr->len = pstr->raw_len - idx + offset;
1481
		  pstr->stop = pstr->raw_stop - idx + offset;
1482
		  pstr->offsets_needed = 0;
1483
		  while (mid > 0 && pstr->offsets[mid - 1] == offset)
1484
		    --mid;
1485
		  while (mid < pstr->valid_len)
1486
		    if (pstr->wcs[mid] != WEOF)
1487
		      break;
1488
		    else
1489
		      ++mid;
1490
		  if (mid == pstr->valid_len)
1491
		    pstr->valid_len = 0;
1492
		  else
1493
		    {
1494
		      pstr->valid_len = pstr->offsets[mid] - offset;
1495
		      if (pstr->valid_len)
1496
			{
1497
			  for (low = 0; low < pstr->valid_len; ++low)
1498
			    pstr->wcs[low] = WEOF;
1499
			  memset (pstr->mbs, 255, pstr->valid_len);
1500
			}
1501
		    }
1502
		  pstr->valid_raw_len = pstr->valid_len;
1503
		}
1504
	    }
1505
	  else
1506
#endif
1507
	    {
1508
	      pstr->tip_context = re_string_context_at (pstr, offset - 1,
1509
							eflags);
1510
#ifdef RE_ENABLE_I18N
1511
	      if (pstr->mb_cur_max > 1)
1512
		memmove (pstr->wcs, pstr->wcs + offset,
1513
			 (pstr->valid_len - offset) * sizeof (wint_t));
1514
#endif /* RE_ENABLE_I18N */
1515
	      if (BE (pstr->mbs_allocated, 0))
1516
		memmove (pstr->mbs, pstr->mbs + offset,
1517
			 pstr->valid_len - offset);
1518
	      pstr->valid_len -= offset;
1519
	      pstr->valid_raw_len -= offset;
1520
#if DEBUG
1521
	      assert (pstr->valid_len > 0);
1522
#endif
1523
	    }
1524
	}
1525
      else
1526
	{
1527
	  /* No, skip all characters until IDX.  */
1528
	  int prev_valid_len = pstr->valid_len;
1529

1530
#ifdef RE_ENABLE_I18N
1531
	  if (BE (pstr->offsets_needed, 0))
1532
	    {
1533
	      pstr->len = pstr->raw_len - idx + offset;
1534
	      pstr->stop = pstr->raw_stop - idx + offset;
1535
	      pstr->offsets_needed = 0;
1536
	    }
1537
#endif
1538
	  pstr->valid_len = 0;
1539
#ifdef RE_ENABLE_I18N
1540
	  if (pstr->mb_cur_max > 1)
1541
	    {
1542
	      int wcs_idx;
1543
	      wint_t wc = WEOF;
1544

1545
	      if (pstr->is_utf8)
1546
		{
1547
		  const unsigned char *raw, *p, *q, *end;
1548

1549
		  /* Special case UTF-8.  Multi-byte chars start with any
1550
		     byte other than 0x80 - 0xbf.  */
1551
		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
1552
		  end = raw + (offset - pstr->mb_cur_max);
1553
		  if (end < pstr->raw_mbs)
1554
		    end = pstr->raw_mbs;
1555
		  p = raw + offset - 1;
1556
#ifdef _LIBC
1557
		  /* We know the wchar_t encoding is UCS4, so for the simple
1558
		     case, ASCII characters, skip the conversion step.  */
1559
		  if (isascii (*p) && BE (pstr->trans == NULL, 1))
1560
		    {
1561
		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
1562
		      /* pstr->valid_len = 0; */
1563
		      wc = (wchar_t) *p;
1564
		    }
1565
		  else
1566
#endif
1567
		    for (; p >= end; --p)
1568
		      if ((*p & 0xc0) != 0x80)
1569
			{
1570
			  mbstate_t cur_state;
1571
			  wchar_t wc2;
1572
			  int mlen = raw + pstr->len - p;
1573
			  unsigned char buf[6];
1574
			  size_t mbclen;
1575

1576
			  q = p;
1577
			  if (BE (pstr->trans != NULL, 0))
1578
			    {
1579
			      int i = mlen < 6 ? mlen : 6;
1580
			      while (--i >= 0)
1581
				buf[i] = pstr->trans[p[i]];
1582
			      q = buf;
1583
			    }
1584
			  /* XXX Don't use mbrtowc, we know which conversion
1585
			     to use (UTF-8 -> UCS4).  */
1586
			  memset (&cur_state, 0, sizeof (cur_state));
1587
			  mbclen = mbrtowc (&wc2, (const char *) p, mlen,
1588
					    &cur_state);
1589
			  if (raw + offset - p <= mbclen
1590
			      && mbclen < (size_t) -2)
1591
			    {
1592
			      memset (&pstr->cur_state, '\0',
1593
				      sizeof (mbstate_t));
1594
			      pstr->valid_len = mbclen - (raw + offset - p);
1595
			      wc = wc2;
1596
			    }
1597
			  break;
1598
			}
1599
		}
1600

1601
	      if (wc == WEOF)
1602
		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
1603
	      if (wc == WEOF)
1604
		pstr->tip_context
1605
		  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
1606
	      else
1607
		pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
1608
				      && IS_WIDE_WORD_CHAR (wc))
1609
				     ? CONTEXT_WORD
1610
				     : ((IS_WIDE_NEWLINE (wc)
1611
					 && pstr->newline_anchor)
1612
					? CONTEXT_NEWLINE : 0));
1613
	      if (BE (pstr->valid_len, 0))
1614
		{
1615
		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
1616
		    pstr->wcs[wcs_idx] = WEOF;
1617
		  if (pstr->mbs_allocated)
1618
		    memset (pstr->mbs, 255, pstr->valid_len);
1619
		}
1620
	      pstr->valid_raw_len = pstr->valid_len;
1621
	    }
1622
	  else
1623
#endif /* RE_ENABLE_I18N */
1624
	    {
1625
	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
1626
	      pstr->valid_raw_len = 0;
1627
	      if (pstr->trans)
1628
		c = pstr->trans[c];
1629
	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
1630
				   ? CONTEXT_WORD
1631
				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
1632
				      ? CONTEXT_NEWLINE : 0));
1633
	    }
1634
	}
1635
      if (!BE (pstr->mbs_allocated, 0))
1636
	pstr->mbs += offset;
1637
    }
1638
  pstr->raw_mbs_idx = idx;
1639
  pstr->len -= offset;
1640
  pstr->stop -= offset;
1641

1642
  /* Then build the buffers.  */
1643
#ifdef RE_ENABLE_I18N
1644
  if (pstr->mb_cur_max > 1)
1645
    {
1646
      if (pstr->icase)
1647
	{
1648
	  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
1649
	  if (BE (ret != REG_NOERROR, 0))
1650
	    return ret;
1651
	}
1652
      else
1653
	build_wcs_buffer (pstr);
1654
    }
1655
  else
1656
#endif /* RE_ENABLE_I18N */
1657
    if (BE (pstr->mbs_allocated, 0))
1658
      {
1659
	if (pstr->icase)
1660
	  build_upper_buffer (pstr);
1661
	else if (pstr->trans != NULL)
1662
	  re_string_translate_buffer (pstr);
1663
      }
1664
    else
1665
      pstr->valid_len = pstr->len;
1666

1667
  pstr->cur_idx = 0;
1668
  return REG_NOERROR;
1669
}
1670

1671
static unsigned char
1672
internal_function __attribute ((pure))
1673
re_string_peek_byte_case (const re_string_t *pstr, int idx)
1674
{
1675
  int ch, off;
1676

1677
  /* Handle the common (easiest) cases first.  */
1678
  if (BE (!pstr->mbs_allocated, 1))
1679
    return re_string_peek_byte (pstr, idx);
1680

1681
#ifdef RE_ENABLE_I18N
1682
  if (pstr->mb_cur_max > 1
1683
      && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
1684
    return re_string_peek_byte (pstr, idx);
1685
#endif
1686

1687
  off = pstr->cur_idx + idx;
1688
#ifdef RE_ENABLE_I18N
1689
  if (pstr->offsets_needed)
1690
    off = pstr->offsets[off];
1691
#endif
1692

1693
  ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
1694

1695
#ifdef RE_ENABLE_I18N
1696
  /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
1697
     this function returns CAPITAL LETTER I instead of first byte of
1698
     DOTLESS SMALL LETTER I.  The latter would confuse the parser,
1699
     since peek_byte_case doesn't advance cur_idx in any way.  */
1700
  if (pstr->offsets_needed && !isascii (ch))
1701
    return re_string_peek_byte (pstr, idx);
1702
#endif
1703

1704
  return ch;
1705
}
1706

1707
static unsigned char
1708
internal_function __attribute ((pure))
1709
re_string_fetch_byte_case (re_string_t *pstr)
1710
{
1711
  if (BE (!pstr->mbs_allocated, 1))
1712
    return re_string_fetch_byte (pstr);
1713

1714
#ifdef RE_ENABLE_I18N
1715
  if (pstr->offsets_needed)
1716
    {
1717
      int off, ch;
1718

1719
      /* For tr_TR.UTF-8 [[:islower:]] there is
1720
	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
1721
	 in that case the whole multi-byte character and return
1722
	 the original letter.  On the other side, with
1723
	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
1724
	 anything else would complicate things too much.  */
1725

1726
      if (!re_string_first_byte (pstr, pstr->cur_idx))
1727
	return re_string_fetch_byte (pstr);
1728

1729
      off = pstr->offsets[pstr->cur_idx];
1730
      ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
1731

1732
      if (! isascii (ch))
1733
	return re_string_fetch_byte (pstr);
1734

1735
      re_string_skip_bytes (pstr,
1736
			    re_string_char_size_at (pstr, pstr->cur_idx));
1737
      return ch;
1738
    }
1739
#endif
1740

1741
  return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
1742
}
1743

1744
static void
1745
internal_function
1746
re_string_destruct (re_string_t *pstr)
1747
{
1748
#ifdef RE_ENABLE_I18N
1749
  re_free (pstr->wcs);
1750
  re_free (pstr->offsets);
1751
#endif /* RE_ENABLE_I18N  */
1752
  if (pstr->mbs_allocated)
1753
    re_free (pstr->mbs);
1754
}
1755

1756
/* Return the context at IDX in INPUT.  */
1757

1758
static unsigned int
1759
internal_function
1760
re_string_context_at (const re_string_t *input, int idx, int eflags)
1761
{
1762
  int c;
1763
  if (BE (idx < 0, 0))
1764
    /* In this case, we use the value stored in input->tip_context,
1765
       since we can't know the character in input->mbs[-1] here.  */
1766
    return input->tip_context;
1767
  if (BE (idx == input->len, 0))
1768
    return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
1769
	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
1770
#ifdef RE_ENABLE_I18N
1771
  if (input->mb_cur_max > 1)
1772
    {
1773
      wint_t wc;
1774
      int wc_idx = idx;
1775
      while(input->wcs[wc_idx] == WEOF)
1776
	{
1777
#ifdef DEBUG
1778
	  /* It must not happen.  */
1779
	  assert (wc_idx >= 0);
1780
#endif
1781
	  --wc_idx;
1782
	  if (wc_idx < 0)
1783
	    return input->tip_context;
1784
	}
1785
      wc = input->wcs[wc_idx];
1786
      if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
1787
	return CONTEXT_WORD;
1788
      return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
1789
	      ? CONTEXT_NEWLINE : 0);
1790
    }
1791
  else
1792
#endif
1793
    {
1794
      c = re_string_byte_at (input, idx);
1795
      if (bitset_contain (input->word_char, c))
1796
	return CONTEXT_WORD;
1797
      return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
1798
    }
1799
}
1800

1801
/* Functions for set operations.  */
1802

1803
static reg_errcode_t
1804
internal_function
1805
re_node_set_alloc (re_node_set *set, int size)
1806
{
1807
  set->alloc = size;
1808
  set->nelem = 0;
1809
  set->elems = re_malloc (int, size);
1810
  if (BE (set->elems == NULL, 0))
1811
    return REG_ESPACE;
1812
  return REG_NOERROR;
1813
}
1814

1815
static reg_errcode_t
1816
internal_function
1817
re_node_set_init_1 (re_node_set *set, int elem)
1818
{
1819
  set->alloc = 1;
1820
  set->nelem = 1;
1821
  set->elems = re_malloc (int, 1);
1822
  if (BE (set->elems == NULL, 0))
1823
    {
1824
      set->alloc = set->nelem = 0;
1825
      return REG_ESPACE;
1826
    }
1827
  set->elems[0] = elem;
1828
  return REG_NOERROR;
1829
}
1830

1831
static reg_errcode_t
1832
internal_function
1833
re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
1834
{
1835
  set->alloc = 2;
1836
  set->elems = re_malloc (int, 2);
1837
  if (BE (set->elems == NULL, 0))
1838
    return REG_ESPACE;
1839
  if (elem1 == elem2)
1840
    {
1841
      set->nelem = 1;
1842
      set->elems[0] = elem1;
1843
    }
1844
  else
1845
    {
1846
      set->nelem = 2;
1847
      if (elem1 < elem2)
1848
	{
1849
	  set->elems[0] = elem1;
1850
	  set->elems[1] = elem2;
1851
	}
1852
      else
1853
	{
1854
	  set->elems[0] = elem2;
1855
	  set->elems[1] = elem1;
1856
	}
1857
    }
1858
  return REG_NOERROR;
1859
}
1860

1861
static reg_errcode_t
1862
internal_function
1863
re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1864
{
1865
  dest->nelem = src->nelem;
1866
  if (src->nelem > 0)
1867
    {
1868
      dest->alloc = dest->nelem;
1869
      dest->elems = re_malloc (int, dest->alloc);
1870
      if (BE (dest->elems == NULL, 0))
1871
	{
1872
	  dest->alloc = dest->nelem = 0;
1873
	  return REG_ESPACE;
1874
	}
1875
      memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1876
    }
1877
  else
1878
    re_node_set_init_empty (dest);
1879
  return REG_NOERROR;
1880
}
1881

1882
/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1883
   DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1884
   Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1885

1886
static reg_errcode_t
1887
internal_function
1888
re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1889
			   const re_node_set *src2)
1890
{
1891
  int i1, i2, is, id, delta, sbase;
1892
  if (src1->nelem == 0 || src2->nelem == 0)
1893
    return REG_NOERROR;
1894

1895
  /* We need dest->nelem + 2 * elems_in_intersection; this is a
1896
     conservative estimate.  */
1897
  if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1898
    {
1899
      int new_alloc = src1->nelem + src2->nelem + dest->alloc;
1900
      int *new_elems = re_realloc (dest->elems, int, new_alloc);
1901
      if (BE (new_elems == NULL, 0))
1902
        return REG_ESPACE;
1903
      dest->elems = new_elems;
1904
      dest->alloc = new_alloc;
1905
    }
1906

1907
  /* Find the items in the intersection of SRC1 and SRC2, and copy
1908
     into the top of DEST those that are not already in DEST itself.  */
1909
  sbase = dest->nelem + src1->nelem + src2->nelem;
1910
  i1 = src1->nelem - 1;
1911
  i2 = src2->nelem - 1;
1912
  id = dest->nelem - 1;
1913
  for (;;)
1914
    {
1915
      if (src1->elems[i1] == src2->elems[i2])
1916
	{
1917
	  /* Try to find the item in DEST.  Maybe we could binary search?  */
1918
	  while (id >= 0 && dest->elems[id] > src1->elems[i1])
1919
	    --id;
1920

1921
          if (id < 0 || dest->elems[id] != src1->elems[i1])
1922
            dest->elems[--sbase] = src1->elems[i1];
1923

1924
	  if (--i1 < 0 || --i2 < 0)
1925
	    break;
1926
	}
1927

1928
      /* Lower the highest of the two items.  */
1929
      else if (src1->elems[i1] < src2->elems[i2])
1930
	{
1931
	  if (--i2 < 0)
1932
	    break;
1933
	}
1934
      else
1935
	{
1936
	  if (--i1 < 0)
1937
	    break;
1938
	}
1939
    }
1940

1941
  id = dest->nelem - 1;
1942
  is = dest->nelem + src1->nelem + src2->nelem - 1;
1943
  delta = is - sbase + 1;
1944

1945
  /* Now copy.  When DELTA becomes zero, the remaining
1946
     DEST elements are already in place; this is more or
1947
     less the same loop that is in re_node_set_merge.  */
1948
  dest->nelem += delta;
1949
  if (delta > 0 && id >= 0)
1950
    for (;;)
1951
      {
1952
        if (dest->elems[is] > dest->elems[id])
1953
          {
1954
            /* Copy from the top.  */
1955
            dest->elems[id + delta--] = dest->elems[is--];
1956
            if (delta == 0)
1957
              break;
1958
          }
1959
        else
1960
          {
1961
            /* Slide from the bottom.  */
1962
            dest->elems[id + delta] = dest->elems[id];
1963
            if (--id < 0)
1964
              break;
1965
          }
1966
      }
1967

1968
  /* Copy remaining SRC elements.  */
1969
  memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1970

1971
  return REG_NOERROR;
1972
}
1973

1974
/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1975
   DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1976

1977
static reg_errcode_t
1978
internal_function
1979
re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1980
			const re_node_set *src2)
1981
{
1982
  int i1, i2, id;
1983
  if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1984
    {
1985
      dest->alloc = src1->nelem + src2->nelem;
1986
      dest->elems = re_malloc (int, dest->alloc);
1987
      if (BE (dest->elems == NULL, 0))
1988
	return REG_ESPACE;
1989
    }
1990
  else
1991
    {
1992
      if (src1 != NULL && src1->nelem > 0)
1993
	return re_node_set_init_copy (dest, src1);
1994
      else if (src2 != NULL && src2->nelem > 0)
1995
	return re_node_set_init_copy (dest, src2);
1996
      else
1997
	re_node_set_init_empty (dest);
1998
      return REG_NOERROR;
1999
    }
2000
  for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
2001
    {
2002
      if (src1->elems[i1] > src2->elems[i2])
2003
	{
2004
	  dest->elems[id++] = src2->elems[i2++];
2005
	  continue;
2006
	}
2007
      if (src1->elems[i1] == src2->elems[i2])
2008
	++i2;
2009
      dest->elems[id++] = src1->elems[i1++];
2010
    }
2011
  if (i1 < src1->nelem)
2012
    {
2013
      memcpy (dest->elems + id, src1->elems + i1,
2014
	     (src1->nelem - i1) * sizeof (int));
2015
      id += src1->nelem - i1;
2016
    }
2017
  else if (i2 < src2->nelem)
2018
    {
2019
      memcpy (dest->elems + id, src2->elems + i2,
2020
	     (src2->nelem - i2) * sizeof (int));
2021
      id += src2->nelem - i2;
2022
    }
2023
  dest->nelem = id;
2024
  return REG_NOERROR;
2025
}
2026

2027
/* Calculate the union set of the sets DEST and SRC. And store it to
2028
   DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
2029

2030
static reg_errcode_t
2031
internal_function
2032
re_node_set_merge (re_node_set *dest, const re_node_set *src)
2033
{
2034
  int is, id, sbase, delta;
2035
  if (src == NULL || src->nelem == 0)
2036
    return REG_NOERROR;
2037
  if (dest->alloc < 2 * src->nelem + dest->nelem)
2038
    {
2039
      int new_alloc = 2 * (src->nelem + dest->alloc);
2040
      int *new_buffer = re_realloc (dest->elems, int, new_alloc);
2041
      if (BE (new_buffer == NULL, 0))
2042
	return REG_ESPACE;
2043
      dest->elems = new_buffer;
2044
      dest->alloc = new_alloc;
2045
    }
2046

2047
  if (BE (dest->nelem == 0, 0))
2048
    {
2049
      dest->nelem = src->nelem;
2050
      memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
2051
      return REG_NOERROR;
2052
    }
2053

2054
  /* Copy into the top of DEST the items of SRC that are not
2055
     found in DEST.  Maybe we could binary search in DEST?  */
2056
  for (sbase = dest->nelem + 2 * src->nelem,
2057
       is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
2058
    {
2059
      if (dest->elems[id] == src->elems[is])
2060
        is--, id--;
2061
      else if (dest->elems[id] < src->elems[is])
2062
        dest->elems[--sbase] = src->elems[is--];
2063
      else /* if (dest->elems[id] > src->elems[is]) */
2064
        --id;
2065
    }
2066

2067
  if (is >= 0)
2068
    {
2069
      /* If DEST is exhausted, the remaining items of SRC must be unique.  */
2070
      sbase -= is + 1;
2071
      memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
2072
    }
2073

2074
  id = dest->nelem - 1;
2075
  is = dest->nelem + 2 * src->nelem - 1;
2076
  delta = is - sbase + 1;
2077
  if (delta == 0)
2078
    return REG_NOERROR;
2079

2080
  /* Now copy.  When DELTA becomes zero, the remaining
2081
     DEST elements are already in place.  */
2082
  dest->nelem += delta;
2083
  for (;;)
2084
    {
2085
      if (dest->elems[is] > dest->elems[id])
2086
        {
2087
	  /* Copy from the top.  */
2088
          dest->elems[id + delta--] = dest->elems[is--];
2089
	  if (delta == 0)
2090
	    break;
2091
	}
2092
      else
2093
        {
2094
          /* Slide from the bottom.  */
2095
          dest->elems[id + delta] = dest->elems[id];
2096
	  if (--id < 0)
2097
	    {
2098
	      /* Copy remaining SRC elements.  */
2099
	      memcpy (dest->elems, dest->elems + sbase,
2100
	              delta * sizeof (int));
2101
	      break;
2102
	    }
2103
	}
2104
    }
2105

2106
  return REG_NOERROR;
2107
}
2108

2109
/* Insert the new element ELEM to the re_node_set* SET.
2110
   SET should not already have ELEM.
2111
   return -1 if an error is occured, return 1 otherwise.  */
2112

2113
static int
2114
internal_function
2115
re_node_set_insert (re_node_set *set, int elem)
2116
{
2117
  int idx;
2118
  /* In case the set is empty.  */
2119
  if (set->alloc == 0)
2120
    {
2121
      if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
2122
	return 1;
2123
      else
2124
	return -1;
2125
    }
2126

2127
  if (BE (set->nelem, 0) == 0)
2128
    {
2129
      /* We already guaranteed above that set->alloc != 0.  */
2130
      set->elems[0] = elem;
2131
      ++set->nelem;
2132
      return 1;
2133
    }
2134

2135
  /* Realloc if we need.  */
2136
  if (set->alloc == set->nelem)
2137
    {
2138
      int *new_elems;
2139
      set->alloc = set->alloc * 2;
2140
      new_elems = re_realloc (set->elems, int, set->alloc);
2141
      if (BE (new_elems == NULL, 0))
2142
	return -1;
2143
      set->elems = new_elems;
2144
    }
2145

2146
  /* Move the elements which follows the new element.  Test the
2147
     first element separately to skip a check in the inner loop.  */
2148
  if (elem < set->elems[0])
2149
    {
2150
      idx = 0;
2151
      for (idx = set->nelem; idx > 0; idx--)
2152
        set->elems[idx] = set->elems[idx - 1];
2153
    }
2154
  else
2155
    {
2156
      for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
2157
        set->elems[idx] = set->elems[idx - 1];
2158
    }
2159

2160
  /* Insert the new element.  */
2161
  set->elems[idx] = elem;
2162
  ++set->nelem;
2163
  return 1;
2164
}
2165

2166
/* Insert the new element ELEM to the re_node_set* SET.
2167
   SET should not already have any element greater than or equal to ELEM.
2168
   Return -1 if an error is occured, return 1 otherwise.  */
2169

2170
static int
2171
internal_function
2172
re_node_set_insert_last (re_node_set *set, int elem)
2173
{
2174
  /* Realloc if we need.  */
2175
  if (set->alloc == set->nelem)
2176
    {
2177
      int *new_elems;
2178
      set->alloc = (set->alloc + 1) * 2;
2179
      new_elems = re_realloc (set->elems, int, set->alloc);
2180
      if (BE (new_elems == NULL, 0))
2181
	return -1;
2182
      set->elems = new_elems;
2183
    }
2184

2185
  /* Insert the new element.  */
2186
  set->elems[set->nelem++] = elem;
2187
  return 1;
2188
}
2189

2190
/* Compare two node sets SET1 and SET2.
2191
   return 1 if SET1 and SET2 are equivalent, return 0 otherwise.  */
2192

2193
static int
2194
internal_function __attribute ((pure))
2195
re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
2196
{
2197
  int i;
2198
  if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
2199
    return 0;
2200
  for (i = set1->nelem ; --i >= 0 ; )
2201
    if (set1->elems[i] != set2->elems[i])
2202
      return 0;
2203
  return 1;
2204
}
2205

2206
/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
2207

2208
static int
2209
internal_function __attribute ((pure))
2210
re_node_set_contains (const re_node_set *set, int elem)
2211
{
2212
  unsigned int idx, right, mid;
2213
  if (set->nelem <= 0)
2214
    return 0;
2215

2216
  /* Binary search the element.  */
2217
  idx = 0;
2218
  right = set->nelem - 1;
2219
  while (idx < right)
2220
    {
2221
      mid = (idx + right) / 2;
2222
      if (set->elems[mid] < elem)
2223
	idx = mid + 1;
2224
      else
2225
	right = mid;
2226
    }
2227
  return set->elems[idx] == elem ? idx + 1 : 0;
2228
}
2229

2230
static void
2231
internal_function
2232
re_node_set_remove_at (re_node_set *set, int idx)
2233
{
2234
  if (idx < 0 || idx >= set->nelem)
2235
    return;
2236
  --set->nelem;
2237
  for (; idx < set->nelem; idx++)
2238
    set->elems[idx] = set->elems[idx + 1];
2239
}
2240

2241

2242
/* Add the token TOKEN to dfa->nodes, and return the index of the token.
2243
   Or return -1, if an error will be occured.  */
2244

2245
static int
2246
internal_function
2247
re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
2248
{
2249
  int type = token.type;
2250
  if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
2251
    {
2252
      size_t new_nodes_alloc = dfa->nodes_alloc * 2;
2253
      int *new_nexts, *new_indices;
2254
      re_node_set *new_edests, *new_eclosures;
2255
      re_token_t *new_nodes;
2256

2257
      /* Avoid overflows.  */
2258
      if (BE (new_nodes_alloc < dfa->nodes_alloc, 0))
2259
	return -1;
2260

2261
      new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
2262
      if (BE (new_nodes == NULL, 0))
2263
	return -1;
2264
      dfa->nodes = new_nodes;
2265
      new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
2266
      new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
2267
      new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
2268
      new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
2269
      if (BE (new_nexts == NULL || new_indices == NULL
2270
	      || new_edests == NULL || new_eclosures == NULL, 0))
2271
	return -1;
2272
      dfa->nexts = new_nexts;
2273
      dfa->org_indices = new_indices;
2274
      dfa->edests = new_edests;
2275
      dfa->eclosures = new_eclosures;
2276
      dfa->nodes_alloc = new_nodes_alloc;
2277
    }
2278
  dfa->nodes[dfa->nodes_len] = token;
2279
  dfa->nodes[dfa->nodes_len].constraint = 0;
2280
#ifdef RE_ENABLE_I18N
2281
  dfa->nodes[dfa->nodes_len].accept_mb =
2282
    (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
2283
#endif
2284
  dfa->nexts[dfa->nodes_len] = -1;
2285
  re_node_set_init_empty (dfa->edests + dfa->nodes_len);
2286
  re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
2287
  return dfa->nodes_len++;
2288
}
2289

2290
static inline unsigned int
2291
internal_function
2292
calc_state_hash (const re_node_set *nodes, unsigned int context)
2293
{
2294
  unsigned int hash = nodes->nelem + context;
2295
  int i;
2296
  for (i = 0 ; i < nodes->nelem ; i++)
2297
    hash += nodes->elems[i];
2298
  return hash;
2299
}
2300

2301
/* Search for the state whose node_set is equivalent to NODES.
2302
   Return the pointer to the state, if we found it in the DFA.
2303
   Otherwise create the new one and return it.  In case of an error
2304
   return NULL and set the error code in ERR.
2305
   Note: - We assume NULL as the invalid state, then it is possible that
2306
	   return value is NULL and ERR is REG_NOERROR.
2307
	 - We never return non-NULL value in case of any errors, it is for
2308
	   optimization.  */
2309

2310
static re_dfastate_t *
2311
internal_function
2312
re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
2313
		  const re_node_set *nodes)
2314
{
2315
  unsigned int hash;
2316
  re_dfastate_t *new_state;
2317
  struct re_state_table_entry *spot;
2318
  int i;
2319
  if (BE (nodes->nelem == 0, 0))
2320
    {
2321
      *err = REG_NOERROR;
2322
      return NULL;
2323
    }
2324
  hash = calc_state_hash (nodes, 0);
2325
  spot = dfa->state_table + (hash & dfa->state_hash_mask);
2326

2327
  for (i = 0 ; i < spot->num ; i++)
2328
    {
2329
      re_dfastate_t *state = spot->array[i];
2330
      if (hash != state->hash)
2331
	continue;
2332
      if (re_node_set_compare (&state->nodes, nodes))
2333
	return state;
2334
    }
2335

2336
  /* There are no appropriate state in the dfa, create the new one.  */
2337
  new_state = create_ci_newstate (dfa, nodes, hash);
2338
  if (BE (new_state == NULL, 0))
2339
    *err = REG_ESPACE;
2340

2341
  return new_state;
2342
}
2343

2344
/* Search for the state whose node_set is equivalent to NODES and
2345
   whose context is equivalent to CONTEXT.
2346
   Return the pointer to the state, if we found it in the DFA.
2347
   Otherwise create the new one and return it.  In case of an error
2348
   return NULL and set the error code in ERR.
2349
   Note: - We assume NULL as the invalid state, then it is possible that
2350
	   return value is NULL and ERR is REG_NOERROR.
2351
	 - We never return non-NULL value in case of any errors, it is for
2352
	   optimization.  */
2353

2354
static re_dfastate_t *
2355
internal_function
2356
re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
2357
			  const re_node_set *nodes, unsigned int context)
2358
{
2359
  unsigned int hash;
2360
  re_dfastate_t *new_state;
2361
  struct re_state_table_entry *spot;
2362
  int i;
2363
  if (nodes->nelem == 0)
2364
    {
2365
      *err = REG_NOERROR;
2366
      return NULL;
2367
    }
2368
  hash = calc_state_hash (nodes, context);
2369
  spot = dfa->state_table + (hash & dfa->state_hash_mask);
2370

2371
  for (i = 0 ; i < spot->num ; i++)
2372
    {
2373
      re_dfastate_t *state = spot->array[i];
2374
      if (state->hash == hash
2375
	  && state->context == context
2376
	  && re_node_set_compare (state->entrance_nodes, nodes))
2377
	return state;
2378
    }
2379
  /* There are no appropriate state in `dfa', create the new one.  */
2380
  new_state = create_cd_newstate (dfa, nodes, context, hash);
2381
  if (BE (new_state == NULL, 0))
2382
    *err = REG_ESPACE;
2383

2384
  return new_state;
2385
}
2386

2387
/* Finish initialization of the new state NEWSTATE, and using its hash value
2388
   HASH put in the appropriate bucket of DFA's state table.  Return value
2389
   indicates the error code if failed.  */
2390

2391
static reg_errcode_t
2392
register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
2393
		unsigned int hash)
2394
{
2395
  struct re_state_table_entry *spot;
2396
  reg_errcode_t err;
2397
  int i;
2398

2399
  newstate->hash = hash;
2400
  err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
2401
  if (BE (err != REG_NOERROR, 0))
2402
    return REG_ESPACE;
2403
  for (i = 0; i < newstate->nodes.nelem; i++)
2404
    {
2405
      int elem = newstate->nodes.elems[i];
2406
      if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
2407
        re_node_set_insert_last (&newstate->non_eps_nodes, elem);
2408
    }
2409

2410
  spot = dfa->state_table + (hash & dfa->state_hash_mask);
2411
  if (BE (spot->alloc <= spot->num, 0))
2412
    {
2413
      int new_alloc = 2 * spot->num + 2;
2414
      re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
2415
					      new_alloc);
2416
      if (BE (new_array == NULL, 0))
2417
	return REG_ESPACE;
2418
      spot->array = new_array;
2419
      spot->alloc = new_alloc;
2420
    }
2421
  spot->array[spot->num++] = newstate;
2422
  return REG_NOERROR;
2423
}
2424

2425
static void
2426
free_state (re_dfastate_t *state)
2427
{
2428
  re_node_set_free (&state->non_eps_nodes);
2429
  re_node_set_free (&state->inveclosure);
2430
  if (state->entrance_nodes != &state->nodes)
2431
    {
2432
      re_node_set_free (state->entrance_nodes);
2433
      re_free (state->entrance_nodes);
2434
    }
2435
  re_node_set_free (&state->nodes);
2436
  re_free (state->word_trtable);
2437
  re_free (state->trtable);
2438
  re_free (state);
2439
}
2440

2441
/* Create the new state which is independ of contexts.
2442
   Return the new state if succeeded, otherwise return NULL.  */
2443

2444
static re_dfastate_t *
2445
internal_function
2446
create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
2447
		    unsigned int hash)
2448
{
2449
  int i;
2450
  reg_errcode_t err;
2451
  re_dfastate_t *newstate;
2452

2453
  newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
2454
  if (BE (newstate == NULL, 0))
2455
    return NULL;
2456
  err = re_node_set_init_copy (&newstate->nodes, nodes);
2457
  if (BE (err != REG_NOERROR, 0))
2458
    {
2459
      re_free (newstate);
2460
      return NULL;
2461
    }
2462

2463
  newstate->entrance_nodes = &newstate->nodes;
2464
  for (i = 0 ; i < nodes->nelem ; i++)
2465
    {
2466
      re_token_t *node = dfa->nodes + nodes->elems[i];
2467
      re_token_type_t type = node->type;
2468
      if (type == CHARACTER && !node->constraint)
2469
	continue;
2470
#ifdef RE_ENABLE_I18N
2471
      newstate->accept_mb |= node->accept_mb;
2472
#endif /* RE_ENABLE_I18N */
2473

2474
      /* If the state has the halt node, the state is a halt state.  */
2475
      if (type == END_OF_RE)
2476
	newstate->halt = 1;
2477
      else if (type == OP_BACK_REF)
2478
	newstate->has_backref = 1;
2479
      else if (type == ANCHOR || node->constraint)
2480
	newstate->has_constraint = 1;
2481
    }
2482
  err = register_state (dfa, newstate, hash);
2483
  if (BE (err != REG_NOERROR, 0))
2484
    {
2485
      free_state (newstate);
2486
      newstate = NULL;
2487
    }
2488
  return newstate;
2489
}
2490

2491
/* Create the new state which is depend on the context CONTEXT.
2492
   Return the new state if succeeded, otherwise return NULL.  */
2493

2494
static re_dfastate_t *
2495
internal_function
2496
create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
2497
		    unsigned int context, unsigned int hash)
2498
{
2499
  int i, nctx_nodes = 0;
2500
  reg_errcode_t err;
2501
  re_dfastate_t *newstate;
2502

2503
  newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
2504
  if (BE (newstate == NULL, 0))
2505
    return NULL;
2506
  err = re_node_set_init_copy (&newstate->nodes, nodes);
2507
  if (BE (err != REG_NOERROR, 0))
2508
    {
2509
      re_free (newstate);
2510
      return NULL;
2511
    }
2512

2513
  newstate->context = context;
2514
  newstate->entrance_nodes = &newstate->nodes;
2515

2516
  for (i = 0 ; i < nodes->nelem ; i++)
2517
    {
2518
      unsigned int constraint = 0;
2519
      re_token_t *node = dfa->nodes + nodes->elems[i];
2520
      re_token_type_t type = node->type;
2521
      if (node->constraint)
2522
	constraint = node->constraint;
2523

2524
      if (type == CHARACTER && !constraint)
2525
	continue;
2526
#ifdef RE_ENABLE_I18N
2527
      newstate->accept_mb |= node->accept_mb;
2528
#endif /* RE_ENABLE_I18N */
2529

2530
      /* If the state has the halt node, the state is a halt state.  */
2531
      if (type == END_OF_RE)
2532
	newstate->halt = 1;
2533
      else if (type == OP_BACK_REF)
2534
	newstate->has_backref = 1;
2535
      else if (type == ANCHOR)
2536
	constraint = node->opr.ctx_type;
2537

2538
      if (constraint)
2539
	{
2540
	  if (newstate->entrance_nodes == &newstate->nodes)
2541
	    {
2542
	      newstate->entrance_nodes = re_malloc (re_node_set, 1);
2543
	      if (BE (newstate->entrance_nodes == NULL, 0))
2544
		{
2545
		  free_state (newstate);
2546
		  return NULL;
2547
		}
2548
	      re_node_set_init_copy (newstate->entrance_nodes, nodes);
2549
	      nctx_nodes = 0;
2550
	      newstate->has_constraint = 1;
2551
	    }
2552

2553
	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
2554
	    {
2555
	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
2556
	      ++nctx_nodes;
2557
	    }
2558
	}
2559
    }
2560
  err = register_state (dfa, newstate, hash);
2561
  if (BE (err != REG_NOERROR, 0))
2562
    {
2563
      free_state (newstate);
2564
      newstate = NULL;
2565
    }
2566
  return  newstate;
2567
}
2568

2569
/******************************************************************************/
2570
/******************************************************************************/
2571
/******************************************************************************/
2572
/* GKINCLUDE #include "regcomp.c" */
2573
/******************************************************************************/
2574
/******************************************************************************/
2575
/******************************************************************************/
2576
/* Extended regular expression matching and search library.
2577
   Copyright (C) 2002,2003,2004,2005,2006 Free Software Foundation, Inc.
2578
   This file is part of the GNU C Library.
2579
   Contributed by Isamu Hasegawa <[email protected]>.
2580

2581
   The GNU C Library is free software; you can redistribute it and/or
2582
   modify it under the terms of the GNU Lesser General Public
2583
   License as published by the Free Software Foundation; either
2584
   version 2.1 of the License, or (at your option) any later version.
2585

2586
   The GNU C Library is distributed in the hope that it will be useful,
2587
   but WITHOUT ANY WARRANTY; without even the implied warranty of
2588
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
2589
   Lesser General Public License for more details.
2590

2591
   You should have received a copy of the GNU Lesser General Public
2592
   License along with the GNU C Library; if not, write to the Free
2593
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
2594
   02111-1307 USA.  */
2595

2596
static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
2597
					  size_t length, reg_syntax_t syntax);
2598
static void re_compile_fastmap_iter (regex_t *bufp,
2599
				     const re_dfastate_t *init_state,
2600
				     char *fastmap);
2601
static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
2602
#ifdef RE_ENABLE_I18N
2603
static void free_charset (re_charset_t *cset);
2604
#endif /* RE_ENABLE_I18N */
2605
static void free_workarea_compile (regex_t *preg);
2606
static reg_errcode_t create_initial_state (re_dfa_t *dfa);
2607
#ifdef RE_ENABLE_I18N
2608
static void optimize_utf8 (re_dfa_t *dfa);
2609
#endif
2610
static reg_errcode_t analyze (regex_t *preg);
2611
static reg_errcode_t preorder (bin_tree_t *root,
2612
			       reg_errcode_t (fn (void *, bin_tree_t *)),
2613
			       void *extra);
2614
static reg_errcode_t postorder (bin_tree_t *root,
2615
				reg_errcode_t (fn (void *, bin_tree_t *)),
2616
				void *extra);
2617
static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
2618
static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
2619
static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
2620
				 bin_tree_t *node);
2621
static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
2622
static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
2623
static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
2624
static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
2625
static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
2626
				   unsigned int constraint);
2627
static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
2628
static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
2629
					 int node, int root);
2630
static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
2631
static int fetch_number (re_string_t *input, re_token_t *token,
2632
			 reg_syntax_t syntax);
2633
static int peek_token (re_token_t *token, re_string_t *input,
2634
			reg_syntax_t syntax) internal_function;
2635
static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
2636
			  reg_syntax_t syntax, reg_errcode_t *err);
2637
static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
2638
				  re_token_t *token, reg_syntax_t syntax,
2639
				  int nest, reg_errcode_t *err);
2640
static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
2641
				 re_token_t *token, reg_syntax_t syntax,
2642
				 int nest, reg_errcode_t *err);
2643
static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
2644
				     re_token_t *token, reg_syntax_t syntax,
2645
				     int nest, reg_errcode_t *err);
2646
static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
2647
				  re_token_t *token, reg_syntax_t syntax,
2648
				  int nest, reg_errcode_t *err);
2649
static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
2650
				 re_dfa_t *dfa, re_token_t *token,
2651
				 reg_syntax_t syntax, reg_errcode_t *err);
2652
static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
2653
				      re_token_t *token, reg_syntax_t syntax,
2654
				      reg_errcode_t *err);
2655
static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
2656
					    re_string_t *regexp,
2657
					    re_token_t *token, int token_len,
2658
					    re_dfa_t *dfa,
2659
					    reg_syntax_t syntax,
2660
					    int accept_hyphen);
2661
static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
2662
					  re_string_t *regexp,
2663
					  re_token_t *token);
2664
#ifdef RE_ENABLE_I18N
2665
static reg_errcode_t build_equiv_class (bitset_t sbcset,
2666
					re_charset_t *mbcset,
2667
					int *equiv_class_alloc,
2668
					const unsigned char *name);
2669
static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
2670
				      bitset_t sbcset,
2671
				      re_charset_t *mbcset,
2672
				      int *char_class_alloc,
2673
				      const unsigned char *class_name,
2674
				      reg_syntax_t syntax);
2675
#else  /* not RE_ENABLE_I18N */
2676
static reg_errcode_t build_equiv_class (bitset_t sbcset,
2677
					const unsigned char *name);
2678
static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
2679
				      bitset_t sbcset,
2680
				      const unsigned char *class_name,
2681
				      reg_syntax_t syntax);
2682
#endif /* not RE_ENABLE_I18N */
2683
static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
2684
				       RE_TRANSLATE_TYPE trans,
2685
				       const unsigned char *class_name,
2686
				       const unsigned char *extra,
2687
				       int non_match, reg_errcode_t *err);
2688
static bin_tree_t *create_tree (re_dfa_t *dfa,
2689
				bin_tree_t *left, bin_tree_t *right,
2690
				re_token_type_t type);
2691
static bin_tree_t *create_token_tree (re_dfa_t *dfa,
2692
				      bin_tree_t *left, bin_tree_t *right,
2693
				      const re_token_t *token);
2694
static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
2695
static void free_token (re_token_t *node);
2696
static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
2697
static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
2698

2699
/* This table gives an error message for each of the error codes listed
2700
   in regex.h.  Obviously the order here has to be same as there.
2701
   POSIX doesn't require that we do anything for REG_NOERROR,
2702
   but why not be nice?  */
2703

2704
const char __re_error_msgid[] attribute_hidden =
2705
  {
2706
#define REG_NOERROR_IDX	0
2707
    gettext_noop ("Success")	/* REG_NOERROR */
2708
    "\0"
2709
#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
2710
    gettext_noop ("No match")	/* REG_NOMATCH */
2711
    "\0"
2712
#define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
2713
    gettext_noop ("Invalid regular expression") /* REG_BADPAT */
2714
    "\0"
2715
#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
2716
    gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
2717
    "\0"
2718
#define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
2719
    gettext_noop ("Invalid character class name") /* REG_ECTYPE */
2720
    "\0"
2721
#define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
2722
    gettext_noop ("Trailing backslash") /* REG_EESCAPE */
2723
    "\0"
2724
#define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
2725
    gettext_noop ("Invalid back reference") /* REG_ESUBREG */
2726
    "\0"
2727
#define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
2728
    gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
2729
    "\0"
2730
#define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
2731
    gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
2732
    "\0"
2733
#define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
2734
    gettext_noop ("Unmatched \\{") /* REG_EBRACE */
2735
    "\0"
2736
#define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
2737
    gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
2738
    "\0"
2739
#define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
2740
    gettext_noop ("Invalid range end")	/* REG_ERANGE */
2741
    "\0"
2742
#define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
2743
    gettext_noop ("Memory exhausted") /* REG_ESPACE */
2744
    "\0"
2745
#define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
2746
    gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
2747
    "\0"
2748
#define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
2749
    gettext_noop ("Premature end of regular expression") /* REG_EEND */
2750
    "\0"
2751
#define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
2752
    gettext_noop ("Regular expression too big") /* REG_ESIZE */
2753
    "\0"
2754
#define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
2755
    gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
2756
  };
2757

2758
/* Offsets into __re_error_msgid, one entry per *_IDX macro, listed in the
   same order as the messages of that table.  */
const size_t __re_error_msgid_idx[] attribute_hidden =
  {
    REG_NOERROR_IDX,  REG_NOMATCH_IDX,
    REG_BADPAT_IDX,   REG_ECOLLATE_IDX,
    REG_ECTYPE_IDX,   REG_EESCAPE_IDX,
    REG_ESUBREG_IDX,  REG_EBRACK_IDX,
    REG_EPAREN_IDX,   REG_EBRACE_IDX,
    REG_BADBR_IDX,    REG_ERANGE_IDX,
    REG_ESPACE_IDX,   REG_BADRPT_IDX,
    REG_EEND_IDX,     REG_ESIZE_IDX,
    REG_ERPAREN_IDX
  };
2778

2779
/* Entry points for GNU code.  */
2780

2781
/* re_compile_pattern is the GNU regular expression compiler: it
2782
   compiles PATTERN (of length LENGTH) and puts the result in BUFP.
2783
   Returns 0 if the pattern was valid, otherwise an error string.
2784

2785
   Assumes the `allocated' (and perhaps `buffer') and `translate' fields
2786
   are set in BUFP on entry.  */
2787

2788
const char *
2789
re_compile_pattern (pattern, length, bufp)
2790
    const char *pattern;
2791
    size_t length;
2792
    struct re_pattern_buffer *bufp;
2793
{
2794
  reg_errcode_t ret;
2795

2796
  /* And GNU code determines whether or not to get register information
2797
     by passing null for the REGS argument to re_match, etc., not by
2798
     setting no_sub, unless RE_NO_SUB is set.  */
2799
  bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
2800

2801
  /* Match anchors at newline.  */
2802
  bufp->newline_anchor = 1;
2803

2804
  ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
2805

2806
  if (!ret)
2807
    return NULL;
2808
  return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
2809
}
2810
#ifdef _LIBC
2811
weak_alias (__re_compile_pattern, re_compile_pattern)
2812
#endif
2813

2814
/* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
   also be assigned to arbitrarily: each pattern buffer stores its own
   syntax, so it can be changed between regex compilations.  */
/* This has no initializer because initialized variables in Emacs
   become read-only after dumping.  */
2819
reg_syntax_t re_syntax_options;
2820

2821

2822
/* Specify the precise syntax of regexps for compilation.  This provides
2823
   for compatibility for various utilities which historically have
2824
   different, incompatible syntaxes.
2825

2826
   The argument SYNTAX is a bit mask comprised of the various bits
2827
   defined in regex.h.  We return the old syntax.  */
2828

2829
reg_syntax_t
2830
re_set_syntax (syntax)
2831
    reg_syntax_t syntax;
2832
{
2833
  reg_syntax_t ret = re_syntax_options;
2834

2835
  re_syntax_options = syntax;
2836
  return ret;
2837
}
2838
#ifdef _LIBC
2839
weak_alias (__re_set_syntax, re_set_syntax)
2840
#endif
2841

2842
int
2843
re_compile_fastmap (bufp)
2844
    struct re_pattern_buffer *bufp;
2845
{
2846
  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
2847
  char *fastmap = bufp->fastmap;
2848

2849
  memset (fastmap, '\0', sizeof (char) * SBC_MAX);
2850
  re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
2851
  if (dfa->init_state != dfa->init_state_word)
2852
    re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
2853
  if (dfa->init_state != dfa->init_state_nl)
2854
    re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
2855
  if (dfa->init_state != dfa->init_state_begbuf)
2856
    re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
2857
  bufp->fastmap_accurate = 1;
2858
  return 0;
2859
}
2860
#ifdef _LIBC
2861
weak_alias (__re_compile_fastmap, re_compile_fastmap)
2862
#endif
2863

2864
/* Mark byte CH in FASTMAP; when ICASE is nonzero also mark the value
   tolower returns for it.  */
static inline void
__attribute ((always_inline))
re_set_fastmap (char *fastmap, int icase, int ch)
{
  fastmap[ch] = 1;
  if (!icase)
    return;
  fastmap[tolower (ch)] = 1;
}
2872

2873
/* Helper function for re_compile_fastmap.
2874
   Compile fastmap for the initial_state INIT_STATE.  */
2875

2876
static void
2877
re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
2878
			 char *fastmap)
2879
{
2880
  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
2881
  int node_cnt;
2882
  int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
2883
  for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
2884
    {
2885
      int node = init_state->nodes.elems[node_cnt];
2886
      re_token_type_t type = dfa->nodes[node].type;
2887

2888
      if (type == CHARACTER)
2889
	{
2890
	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
2891
#ifdef RE_ENABLE_I18N
2892
	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
2893
	    {
2894
	      unsigned char *buf = alloca (dfa->mb_cur_max), *p;
2895
	      wchar_t wc;
2896
	      mbstate_t state;
2897

2898
	      p = buf;
2899
	      *p++ = dfa->nodes[node].opr.c;
2900
	      while (++node < dfa->nodes_len
2901
		     &&	dfa->nodes[node].type == CHARACTER
2902
		     && dfa->nodes[node].mb_partial)
2903
		*p++ = dfa->nodes[node].opr.c;
2904
	      memset (&state, '\0', sizeof (state));
2905
	      if (mbrtowc (&wc, (const char *) buf, p - buf,
2906
			   &state) == p - buf
2907
		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
2908
		      != (size_t) -1))
2909
		re_set_fastmap (fastmap, 0, buf[0]);
2910
	    }
2911
#endif
2912
	}
2913
      else if (type == SIMPLE_BRACKET)
2914
	{
2915
	  int i, ch;
2916
	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
2917
	    {
2918
	      int j;
2919
	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
2920
	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
2921
		if (w & ((bitset_word_t) 1 << j))
2922
		  re_set_fastmap (fastmap, icase, ch);
2923
	    }
2924
	}
2925
#ifdef RE_ENABLE_I18N
2926
      else if (type == COMPLEX_BRACKET)
2927
	{
2928
	  int i;
2929
	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
2930
	  if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
2931
	      || cset->nranges || cset->nchar_classes)
2932
	    {
2933
# ifdef _LIBC
2934
	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
2935
		{
2936
		  /* In this case we want to catch the bytes which are
2937
		     the first byte of any collation elements.
2938
		     e.g. In da_DK, we want to catch 'a' since "aa"
2939
			  is a valid collation element, and don't catch
2940
			  'b' since 'b' is the only collation element
2941
			  which starts from 'b'.  */
2942
		  const int32_t *table = (const int32_t *)
2943
		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
2944
		  for (i = 0; i < SBC_MAX; ++i)
2945
		    if (table[i] < 0)
2946
		      re_set_fastmap (fastmap, icase, i);
2947
		}
2948
# else
2949
	      if (dfa->mb_cur_max > 1)
2950
		for (i = 0; i < SBC_MAX; ++i)
2951
		  if (__btowc (i) == WEOF)
2952
		    re_set_fastmap (fastmap, icase, i);
2953
# endif /* not _LIBC */
2954
	    }
2955
	  for (i = 0; i < cset->nmbchars; ++i)
2956
	    {
2957
	      char buf[256];
2958
	      mbstate_t state;
2959
	      memset (&state, '\0', sizeof (state));
2960
	      if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
2961
		re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
2962
	      if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
2963
		{
2964
		  if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
2965
		      != (size_t) -1)
2966
		    re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
2967
		}
2968
	    }
2969
	}
2970
#endif /* RE_ENABLE_I18N */
2971
      else if (type == OP_PERIOD
2972
#ifdef RE_ENABLE_I18N
2973
	       || type == OP_UTF8_PERIOD
2974
#endif /* RE_ENABLE_I18N */
2975
	       || type == END_OF_RE)
2976
	{
2977
	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
2978
	  if (type == END_OF_RE)
2979
	    bufp->can_be_null = 1;
2980
	  return;
2981
	}
2982
    }
2983
}
2984

2985
/* Entry point for POSIX code.  */
2986
/* regcomp takes a regular expression as a string and compiles it.
2987

2988
   PREG is a regex_t *.  We do not expect any fields to be initialized,
2989
   since POSIX says we shouldn't.  Thus, we set
2990

2991
     `buffer' to the compiled pattern;
2992
     `used' to the length of the compiled pattern;
2993
     `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
2994
       REG_EXTENDED bit in CFLAGS is set; otherwise, to
2995
       RE_SYNTAX_POSIX_BASIC;
2996
     `newline_anchor' to REG_NEWLINE being set in CFLAGS;
2997
     `fastmap' to an allocated space for the fastmap;
2998
     `fastmap_accurate' to zero;
2999
     `re_nsub' to the number of subexpressions in PATTERN.
3000

3001
   PATTERN is the address of the pattern string.
3002

3003
   CFLAGS is a series of bits which affect compilation.
3004

3005
     If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
3006
     use POSIX basic syntax.
3007

3008
     If REG_NEWLINE is set, then . and [^...] don't match newline.
3009
     Also, regexec will try a match beginning after every newline.
3010

3011
     If REG_ICASE is set, then we consider upper- and lowercase
3012
     versions of letters to be equivalent when matching.
3013

3014
     If REG_NOSUB is set, then when PREG is passed to regexec, that
3015
     routine will report only success or failure, and nothing about the
3016
     registers.
3017

3018
   It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
3019
   the return codes and their meanings.)  */
3020

3021
int
3022
regcomp (preg, pattern, cflags)
3023
    regex_t *__restrict preg;
3024
    const char *__restrict pattern;
3025
    int cflags;
3026
{
3027
  reg_errcode_t ret;
3028
  reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
3029
			 : RE_SYNTAX_POSIX_BASIC);
3030

3031
  preg->buffer = NULL;
3032
  preg->allocated = 0;
3033
  preg->used = 0;
3034

3035
  /* Try to allocate space for the fastmap.  */
3036
  preg->fastmap = re_malloc (char, SBC_MAX);
3037
  if (BE (preg->fastmap == NULL, 0))
3038
    return REG_ESPACE;
3039

3040
  syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
3041

3042
  /* If REG_NEWLINE is set, newlines are treated differently.  */
3043
  if (cflags & REG_NEWLINE)
3044
    { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
3045
      syntax &= ~RE_DOT_NEWLINE;
3046
      syntax |= RE_HAT_LISTS_NOT_NEWLINE;
3047
      /* It also changes the matching behavior.  */
3048
      preg->newline_anchor = 1;
3049
    }
3050
  else
3051
    preg->newline_anchor = 0;
3052
  preg->no_sub = !!(cflags & REG_NOSUB);
3053
  preg->translate = NULL;
3054

3055
  ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
3056

3057
  /* POSIX doesn't distinguish between an unmatched open-group and an
3058
     unmatched close-group: both are REG_EPAREN.  */
3059
  if (ret == REG_ERPAREN)
3060
    ret = REG_EPAREN;
3061

3062
  /* We have already checked preg->fastmap != NULL.  */
3063
  if (BE (ret == REG_NOERROR, 1))
3064
    /* Compute the fastmap now, since regexec cannot modify the pattern
3065
       buffer.  This function never fails in this implementation.  */
3066
    (void) re_compile_fastmap (preg);
3067
  else
3068
    {
3069
      /* Some error occurred while compiling the expression.  */
3070
      re_free (preg->fastmap);
3071
      preg->fastmap = NULL;
3072
    }
3073

3074
  return (int) ret;
3075
}
3076
#ifdef _LIBC
3077
weak_alias (__regcomp, regcomp)
3078
#endif
3079

3080
/* Returns a message corresponding to an error code, ERRCODE, returned
3081
   from either regcomp or regexec.   We don't use PREG here.  */
3082

3083
/* regerror ( int errcode, preg, errbuf, errbuf_size) */
3084
size_t
3085
regerror (
3086
    int errcode,
3087
    const regex_t *__restrict preg,
3088
    char *__restrict errbuf,
3089
    size_t errbuf_size)
3090
{
3091
  const char *msg;
3092
  size_t msg_size;
3093

3094
  if (BE (errcode < 0
3095
	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
3096
			       / sizeof (__re_error_msgid_idx[0])), 0))
3097
    /* Only error codes returned by the rest of the code should be passed
3098
       to this routine.  If we are given anything else, or if other regex
3099
       code generates an invalid error code, then the program has a bug.
3100
       Dump core so we can fix it.  */
3101
    abort ();
3102

3103
  msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
3104

3105
  msg_size = strlen (msg) + 1; /* Includes the null.  */
3106

3107
  if (BE (errbuf_size != 0, 1))
3108
    {
3109
      if (BE (msg_size > errbuf_size, 0))
3110
	{
3111
#if defined HAVE_MEMPCPY || defined _LIBC
3112
	  *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
3113
#else
3114
	  memcpy (errbuf, msg, errbuf_size - 1);
3115
	  errbuf[errbuf_size - 1] = 0;
3116
#endif
3117
	}
3118
      else
3119
	memcpy (errbuf, msg, msg_size);
3120
    }
3121

3122
  return msg_size;
3123
}
3124
#ifdef _LIBC
3125
weak_alias (__regerror, regerror)
3126
#endif
3127

3128

3129
#ifdef RE_ENABLE_I18N
3130
/* This static array is used for the map to single-byte characters when
3131
   UTF-8 is used.  Otherwise we would allocate memory just to initialize
3132
   it the same all the time.  UTF-8 is the preferred encoding so this is
3133
   a worthwhile optimization.  */
3134
static const bitset_t utf8_sb_map =
3135
{
3136
  /* Set the first 128 bits.  */
3137
  [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
3138
};
3139
#endif
3140

3141

3142
static void
3143
free_dfa_content (re_dfa_t *dfa)
3144
{
3145
  int i, j;
3146

3147
  if (dfa->nodes)
3148
    for (i = 0; i < dfa->nodes_len; ++i)
3149
      free_token (dfa->nodes + i);
3150
  re_free (dfa->nexts);
3151
  for (i = 0; i < dfa->nodes_len; ++i)
3152
    {
3153
      if (dfa->eclosures != NULL)
3154
	re_node_set_free (dfa->eclosures + i);
3155
      if (dfa->inveclosures != NULL)
3156
	re_node_set_free (dfa->inveclosures + i);
3157
      if (dfa->edests != NULL)
3158
	re_node_set_free (dfa->edests + i);
3159
    }
3160
  re_free (dfa->edests);
3161
  re_free (dfa->eclosures);
3162
  re_free (dfa->inveclosures);
3163
  re_free (dfa->nodes);
3164

3165
  if (dfa->state_table)
3166
    for (i = 0; i <= dfa->state_hash_mask; ++i)
3167
      {
3168
	struct re_state_table_entry *entry = dfa->state_table + i;
3169
	for (j = 0; j < entry->num; ++j)
3170
	  {
3171
	    re_dfastate_t *state = entry->array[j];
3172
	    free_state (state);
3173
	  }
3174
        re_free (entry->array);
3175
      }
3176
  re_free (dfa->state_table);
3177
#ifdef RE_ENABLE_I18N
3178
  if (dfa->sb_char != utf8_sb_map)
3179
    re_free (dfa->sb_char);
3180
#endif
3181
  re_free (dfa->subexp_map);
3182
#ifdef DEBUG
3183
  re_free (dfa->re_str);
3184
#endif
3185

3186
  re_free (dfa);
3187
}
3188

3189

3190
/* Free dynamically allocated space used by PREG.  */
3191

3192
void
3193
regfree (preg)
3194
    regex_t *preg;
3195
{
3196
  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3197
  if (BE (dfa != NULL, 1))
3198
    free_dfa_content (dfa);
3199
  preg->buffer = NULL;
3200
  preg->allocated = 0;
3201

3202
  re_free (preg->fastmap);
3203
  preg->fastmap = NULL;
3204

3205
  re_free (preg->translate);
3206
  preg->translate = NULL;
3207
}
3208
#ifdef _LIBC
3209
weak_alias (__regfree, regfree)
3210
#endif
3211

3212
/* Entry points compatible with 4.2 BSD regex library.  We don't define
3213
   them unless specifically requested.  */
3214

3215
#if defined _REGEX_RE_COMP || defined _LIBC
3216

3217
/* BSD has one and only one pattern buffer.  */
3218
static struct re_pattern_buffer re_comp_buf;
3219

3220
char *
3221
# ifdef _LIBC
3222
/* Make these definitions weak in libc, so POSIX programs can redefine
3223
   these names if they don't use our functions, and still use
3224
   regcomp/regexec above without link errors.  */
3225
weak_function
3226
# endif
3227
re_comp (s)
3228
     const char *s;
3229
{
3230
  reg_errcode_t ret;
3231
  char *fastmap;
3232

3233
  if (!s)
3234
    {
3235
      if (!re_comp_buf.buffer)
3236
	return gettext ("No previous regular expression");
3237
      return 0;
3238
    }
3239

3240
  if (re_comp_buf.buffer)
3241
    {
3242
      fastmap = re_comp_buf.fastmap;
3243
      re_comp_buf.fastmap = NULL;
3244
      __regfree (&re_comp_buf);
3245
      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
3246
      re_comp_buf.fastmap = fastmap;
3247
    }
3248

3249
  if (re_comp_buf.fastmap == NULL)
3250
    {
3251
      re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
3252
      if (re_comp_buf.fastmap == NULL)
3253
	return (char *) gettext (__re_error_msgid
3254
				 + __re_error_msgid_idx[(int) REG_ESPACE]);
3255
    }
3256

3257
  /* Since `re_exec' always passes NULL for the `regs' argument, we
3258
     don't need to initialize the pattern buffer fields which affect it.  */
3259

3260
  /* Match anchors at newlines.  */
3261
  re_comp_buf.newline_anchor = 1;
3262

3263
  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
3264

3265
  if (!ret)
3266
    return NULL;
3267

3268
  /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
3269
  return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
3270
}
3271

3272
#ifdef _LIBC
3273
libc_freeres_fn (free_mem)
3274
{
3275
  __regfree (&re_comp_buf);
3276
}
3277
#endif
3278

3279
#endif /* _REGEX_RE_COMP */
3280

3281
/* Internal entry point.
3282
   Compile the regular expression PATTERN, whose length is LENGTH.
3283
   SYNTAX indicate regular expression's syntax.  */
3284

3285
static reg_errcode_t
3286
re_compile_internal (regex_t *preg, const char * pattern, size_t length,
3287
		     reg_syntax_t syntax)
3288
{
3289
  reg_errcode_t err = REG_NOERROR;
3290
  re_dfa_t *dfa;
3291
  re_string_t regexp;
3292

3293
  /* Initialize the pattern buffer.  */
3294
  preg->fastmap_accurate = 0;
3295
  preg->syntax = syntax;
3296
  preg->not_bol = preg->not_eol = 0;
3297
  preg->used = 0;
3298
  preg->re_nsub = 0;
3299
  preg->can_be_null = 0;
3300
  preg->regs_allocated = REGS_UNALLOCATED;
3301

3302
  /* Initialize the dfa.  */
3303
  dfa = (re_dfa_t *) preg->buffer;
3304
  if (BE (preg->allocated < sizeof (re_dfa_t), 0))
3305
    {
3306
      /* If zero allocated, but buffer is non-null, try to realloc
3307
	 enough space.  This loses if buffer's address is bogus, but
3308
	 that is the user's responsibility.  If ->buffer is NULL this
3309
	 is a simple allocation.  */
3310
      dfa = re_realloc (preg->buffer, re_dfa_t, 1);
3311
      if (dfa == NULL)
3312
	return REG_ESPACE;
3313
      preg->allocated = sizeof (re_dfa_t);
3314
      preg->buffer = (unsigned char *) dfa;
3315
    }
3316
  preg->used = sizeof (re_dfa_t);
3317

3318
  err = init_dfa (dfa, length);
3319
  if (BE (err != REG_NOERROR, 0))
3320
    {
3321
      free_dfa_content (dfa);
3322
      preg->buffer = NULL;
3323
      preg->allocated = 0;
3324
      return err;
3325
    }
3326
#ifdef DEBUG
3327
  /* Note: length+1 will not overflow since it is checked in init_dfa.  */
3328
  dfa->re_str = re_malloc (char, length + 1);
3329
  strncpy (dfa->re_str, pattern, length + 1);
3330
#endif
3331

3332
  __libc_lock_init (dfa->lock);
3333

3334
  err = re_string_construct (&regexp, pattern, length, preg->translate,
3335
			     syntax & RE_ICASE, dfa);
3336
  if (BE (err != REG_NOERROR, 0))
3337
    {
3338
    re_compile_internal_free_return:
3339
      free_workarea_compile (preg);
3340
      re_string_destruct (&regexp);
3341
      free_dfa_content (dfa);
3342
      preg->buffer = NULL;
3343
      preg->allocated = 0;
3344
      return err;
3345
    }
3346

3347
  /* Parse the regular expression, and build a structure tree.  */
3348
  preg->re_nsub = 0;
3349
  dfa->str_tree = parse (&regexp, preg, syntax, &err);
3350
  if (BE (dfa->str_tree == NULL, 0))
3351
    goto re_compile_internal_free_return;
3352

3353
  /* Analyze the tree and create the nfa.  */
3354
  err = analyze (preg);
3355
  if (BE (err != REG_NOERROR, 0))
3356
    goto re_compile_internal_free_return;
3357

3358
#ifdef RE_ENABLE_I18N
3359
  /* If possible, do searching in single byte encoding to speed things up.  */
3360
  if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
3361
    optimize_utf8 (dfa);
3362
#endif
3363

3364
  /* Then create the initial state of the dfa.  */
3365
  err = create_initial_state (dfa);
3366

3367
  /* Release work areas.  */
3368
  free_workarea_compile (preg);
3369
  re_string_destruct (&regexp);
3370

3371
  if (BE (err != REG_NOERROR, 0))
3372
    {
3373
      free_dfa_content (dfa);
3374
      preg->buffer = NULL;
3375
      preg->allocated = 0;
3376
    }
3377

3378
  return err;
3379
}
3380

3381
/* Initialize DFA.  We use the length of the regular expression PAT_LEN
3382
   as the initial length of some arrays.  */
3383

3384
static reg_errcode_t
3385
init_dfa (re_dfa_t *dfa, size_t pat_len)
3386
{
3387
  unsigned int table_size;
3388
#ifndef _LIBC
3389
  char *codeset_name;
3390
#endif
3391

3392
  memset (dfa, '\0', sizeof (re_dfa_t));
3393

3394
  /* Force allocation of str_tree_storage the first time.  */
3395
  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
3396

3397
  /* Avoid overflows.  */
3398
  if (pat_len == SIZE_MAX)
3399
    return REG_ESPACE;
3400

3401
  dfa->nodes_alloc = pat_len + 1;
3402
  dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
3403

3404
  /*  table_size = 2 ^ ceil(log pat_len) */
3405
  for (table_size = 1; ; table_size <<= 1)
3406
    if (table_size > pat_len)
3407
      break;
3408

3409
  dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
3410
  dfa->state_hash_mask = table_size - 1;
3411

3412
  dfa->mb_cur_max = MB_CUR_MAX;
3413
#ifdef _LIBC
3414
  if (dfa->mb_cur_max == 6
3415
      && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
3416
    dfa->is_utf8 = 1;
3417
  dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
3418
		       != 0);
3419
#else
3420
# ifdef HAVE_LANGINFO_CODESET
3421
  codeset_name = nl_langinfo (CODESET);
3422
# else
3423
  codeset_name = getenv ("LC_ALL");
3424
  if (codeset_name == NULL || codeset_name[0] == '\0')
3425
    codeset_name = getenv ("LC_CTYPE");
3426
  if (codeset_name == NULL || codeset_name[0] == '\0')
3427
    codeset_name = getenv ("LANG");
3428
  if (codeset_name == NULL)
3429
    codeset_name = "";
3430
  else if (strchr (codeset_name, '.') !=  NULL)
3431
    codeset_name = strchr (codeset_name, '.') + 1;
3432
# endif
3433

3434
  if (strcasecmp (codeset_name, "UTF-8") == 0
3435
      || strcasecmp (codeset_name, "UTF8") == 0)
3436
    dfa->is_utf8 = 1;
3437

3438
  /* We check exhaustively in the loop below if this charset is a
3439
     superset of ASCII.  */
3440
  dfa->map_notascii = 0;
3441
#endif
3442

3443
#ifdef RE_ENABLE_I18N
3444
  if (dfa->mb_cur_max > 1)
3445
    {
3446
      if (dfa->is_utf8)
3447
	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
3448
      else
3449
	{
3450
	  int i, j, ch;
3451

3452
	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3453
	  if (BE (dfa->sb_char == NULL, 0))
3454
	    return REG_ESPACE;
3455

3456
	  /* Set the bits corresponding to single byte chars.  */
3457
	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
3458
	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
3459
	      {
3460
		wint_t wch = __btowc (ch);
3461
		if (wch != WEOF)
3462
		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
3463
# ifndef _LIBC
3464
		if (isascii (ch) && wch != ch)
3465
		  dfa->map_notascii = 1;
3466
# endif
3467
	      }
3468
	}
3469
    }
3470
#endif
3471

3472
  if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
3473
    return REG_ESPACE;
3474
  return REG_NOERROR;
3475
}
3476

3477
/* Initialize WORD_CHAR table, which indicate which character is
3478
   "word".  In this case "word" means that it is the word construction
3479
   character used by some operators like "\<", "\>", etc.  */
3480

3481
static void
3482
internal_function
3483
init_word_char (re_dfa_t *dfa)
3484
{
3485
  int i, j, ch;
3486
  dfa->word_ops_used = 1;
3487
  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
3488
    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
3489
      if (isalnum (ch) || ch == '_')
3490
	dfa->word_char[i] |= (bitset_word_t) 1 << j;
3491
}
3492

3493
/* Free the work area which are only used while compiling.  */
3494

3495
static void
3496
free_workarea_compile (regex_t *preg)
3497
{
3498
  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3499
  bin_tree_storage_t *storage, *next;
3500
  for (storage = dfa->str_tree_storage; storage; storage = next)
3501
    {
3502
      next = storage->next;
3503
      re_free (storage);
3504
    }
3505
  dfa->str_tree_storage = NULL;
3506
  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
3507
  dfa->str_tree = NULL;
3508
  re_free (dfa->org_indices);
3509
  dfa->org_indices = NULL;
3510
}
3511

3512
/* Create initial states for all contexts.  */
3513

3514
static reg_errcode_t
3515
create_initial_state (re_dfa_t *dfa)
3516
{
3517
  int first, i;
3518
  reg_errcode_t err;
3519
  re_node_set init_nodes;
3520

3521
  /* Initial states have the epsilon closure of the node which is
3522
     the first node of the regular expression.  */
3523
  first = dfa->str_tree->first->node_idx;
3524
  dfa->init_node = first;
3525
  err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
3526
  if (BE (err != REG_NOERROR, 0))
3527
    return err;
3528

3529
  /* The back-references which are in initial states can epsilon transit,
3530
     since in this case all of the subexpressions can be null.
3531
     Then we add epsilon closures of the nodes which are the next nodes of
3532
     the back-references.  */
3533
  if (dfa->nbackref > 0)
3534
    for (i = 0; i < init_nodes.nelem; ++i)
3535
      {
3536
	int node_idx = init_nodes.elems[i];
3537
	re_token_type_t type = dfa->nodes[node_idx].type;
3538

3539
	int clexp_idx;
3540
	if (type != OP_BACK_REF)
3541
	  continue;
3542
	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
3543
	  {
3544
	    re_token_t *clexp_node;
3545
	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
3546
	    if (clexp_node->type == OP_CLOSE_SUBEXP
3547
		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
3548
	      break;
3549
	  }
3550
	if (clexp_idx == init_nodes.nelem)
3551
	  continue;
3552

3553
	if (type == OP_BACK_REF)
3554
	  {
3555
	    int dest_idx = dfa->edests[node_idx].elems[0];
3556
	    if (!re_node_set_contains (&init_nodes, dest_idx))
3557
	      {
3558
		re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
3559
		i = 0;
3560
	      }
3561
	  }
3562
      }
3563

3564
  /* It must be the first time to invoke acquire_state.  */
3565
  dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
3566
  /* We don't check ERR here, since the initial state must not be NULL.  */
3567
  if (BE (dfa->init_state == NULL, 0))
3568
    return err;
3569
  if (dfa->init_state->has_constraint)
3570
    {
3571
      dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
3572
						       CONTEXT_WORD);
3573
      dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
3574
						     CONTEXT_NEWLINE);
3575
      dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
3576
							 &init_nodes,
3577
							 CONTEXT_NEWLINE
3578
							 | CONTEXT_BEGBUF);
3579
      if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
3580
	      || dfa->init_state_begbuf == NULL, 0))
3581
	return err;
3582
    }
3583
  else
3584
    dfa->init_state_word = dfa->init_state_nl
3585
      = dfa->init_state_begbuf = dfa->init_state;
3586

3587
  re_node_set_free (&init_nodes);
3588
  return REG_NOERROR;
3589
}
3590

3591
#ifdef RE_ENABLE_I18N
3592
/* If it is possible to do searching in single byte encoding instead of UTF-8
3593
   to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
3594
   DFA nodes where needed.  */
3595

3596
static void
3597
optimize_utf8 (re_dfa_t *dfa)
3598
{
3599
  int node, i, mb_chars = 0, has_period = 0;
3600

3601
  for (node = 0; node < dfa->nodes_len; ++node)
3602
    switch (dfa->nodes[node].type)
3603
      {
3604
      case CHARACTER:
3605
	if (dfa->nodes[node].opr.c >= 0x80)
3606
	  mb_chars = 1;
3607
	break;
3608
      case ANCHOR:
3609
	switch (dfa->nodes[node].opr.idx)
3610
	  {
3611
	  case LINE_FIRST:
3612
	  case LINE_LAST:
3613
	  case BUF_FIRST:
3614
	  case BUF_LAST:
3615
	    break;
3616
	  default:
3617
	    /* Word anchors etc. cannot be handled.  */
3618
	    return;
3619
	  }
3620
	break;
3621
      case OP_PERIOD:
3622
        has_period = 1;
3623
        break;
3624
      case OP_BACK_REF:
3625
      case OP_ALT:
3626
      case END_OF_RE:
3627
      case OP_DUP_ASTERISK:
3628
      case OP_OPEN_SUBEXP:
3629
      case OP_CLOSE_SUBEXP:
3630
	break;
3631
      case COMPLEX_BRACKET:
3632
	return;
3633
      case SIMPLE_BRACKET:
3634
	/* Just double check.  The non-ASCII range starts at 0x80.  */
3635
	assert (0x80 % BITSET_WORD_BITS == 0);
3636
        for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
3637
	  if (dfa->nodes[node].opr.sbcset[i])
3638
	    return;
3639
	break;
3640
      default:
3641
	abort ();
3642
      }
3643

3644
  if (mb_chars || has_period)
3645
    for (node = 0; node < dfa->nodes_len; ++node)
3646
      {
3647
	if (dfa->nodes[node].type == CHARACTER
3648
	    && dfa->nodes[node].opr.c >= 0x80)
3649
	  dfa->nodes[node].mb_partial = 0;
3650
	else if (dfa->nodes[node].type == OP_PERIOD)
3651
	  dfa->nodes[node].type = OP_UTF8_PERIOD;
3652
      }
3653

3654
  /* The search can be in single byte locale.  */
3655
  dfa->mb_cur_max = 1;
3656
  dfa->is_utf8 = 0;
3657
  dfa->has_mb_node = dfa->nbackref > 0 || has_period;
3658
}
3659
#endif
3660

3661
/* Analyze the structure tree, and calculate "first", "next", "edest",
3662
   "eclosure", and "inveclosure".  */
3663

3664
static reg_errcode_t
3665
analyze (regex_t *preg)
3666
{
3667
  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3668
  reg_errcode_t ret;
3669

3670
  /* Allocate arrays.  */
3671
  dfa->nexts = re_malloc (int, dfa->nodes_alloc);
3672
  dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
3673
  dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
3674
  dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
3675
  if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
3676
	  || dfa->eclosures == NULL, 0))
3677
    return REG_ESPACE;
3678

3679
  dfa->subexp_map = re_malloc (int, preg->re_nsub);
3680
  if (dfa->subexp_map != NULL)
3681
    {
3682
      int i;
3683
      for (i = 0; i < preg->re_nsub; i++)
3684
	dfa->subexp_map[i] = i;
3685
      preorder (dfa->str_tree, optimize_subexps, dfa);
3686
      for (i = 0; i < preg->re_nsub; i++)
3687
	if (dfa->subexp_map[i] != i)
3688
	  break;
3689
      if (i == preg->re_nsub)
3690
	{
3691
	  free (dfa->subexp_map);
3692
	  dfa->subexp_map = NULL;
3693
	}
3694
    }
3695

3696
  ret = postorder (dfa->str_tree, lower_subexps, preg);
3697
  if (BE (ret != REG_NOERROR, 0))
3698
    return ret;
3699
  ret = postorder (dfa->str_tree, calc_first, dfa);
3700
  if (BE (ret != REG_NOERROR, 0))
3701
    return ret;
3702
  preorder (dfa->str_tree, calc_next, dfa);
3703
  ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
3704
  if (BE (ret != REG_NOERROR, 0))
3705
    return ret;
3706
  ret = calc_eclosure (dfa);
3707
  if (BE (ret != REG_NOERROR, 0))
3708
    return ret;
3709

3710
  /* We only need this during the prune_impossible_nodes pass in regexec.c;
3711
     skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
3712
  if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
3713
      || dfa->nbackref)
3714
    {
3715
      dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
3716
      if (BE (dfa->inveclosures == NULL, 0))
3717
        return REG_ESPACE;
3718
      ret = calc_inveclosure (dfa);
3719
    }
3720

3721
  return ret;
3722
}
3723

3724
/* Our parse trees are very unbalanced, so we cannot use a stack to
3725
   implement parse tree visits.  Instead, we use parent pointers and
3726
   some hairy code in these two functions.  */
3727
static reg_errcode_t
3728
postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
3729
	   void *extra)
3730
{
3731
  bin_tree_t *node, *prev;
3732

3733
  for (node = root; ; )
3734
    {
3735
      /* Descend down the tree, preferably to the left (or to the right
3736
	 if that's the only child).  */
3737
      while (node->left || node->right)
3738
	if (node->left)
3739
          node = node->left;
3740
        else
3741
          node = node->right;
3742

3743
      do
3744
	{
3745
	  reg_errcode_t err = fn (extra, node);
3746
	  if (BE (err != REG_NOERROR, 0))
3747
	    return err;
3748
          if (node->parent == NULL)
3749
	    return REG_NOERROR;
3750
	  prev = node;
3751
	  node = node->parent;
3752
	}
3753
      /* Go up while we have a node that is reached from the right.  */
3754
      while (node->right == prev || node->right == NULL);
3755
      node = node->right;
3756
    }
3757
}
3758

3759
static reg_errcode_t
3760
preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
3761
	  void *extra)
3762
{
3763
  bin_tree_t *node;
3764

3765
  for (node = root; ; )
3766
    {
3767
      reg_errcode_t err = fn (extra, node);
3768
      if (BE (err != REG_NOERROR, 0))
3769
	return err;
3770

3771
      /* Go to the left node, or up and to the right.  */
3772
      if (node->left)
3773
	node = node->left;
3774
      else
3775
	{
3776
	  bin_tree_t *prev = NULL;
3777
	  while (node->right == prev || node->right == NULL)
3778
	    {
3779
	      prev = node;
3780
	      node = node->parent;
3781
	      if (!node)
3782
	        return REG_NOERROR;
3783
	    }
3784
	  node = node->right;
3785
	}
3786
    }
3787
}
3788

3789
/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
3790
   re_search_internal to map the inner one's opr.idx to this one's.  Adjust
3791
   backreferences as well.  Requires a preorder visit.  */
3792
static reg_errcode_t
3793
optimize_subexps (void *extra, bin_tree_t *node)
3794
{
3795
  re_dfa_t *dfa = (re_dfa_t *) extra;
3796

3797
  if (node->token.type == OP_BACK_REF && dfa->subexp_map)
3798
    {
3799
      int idx = node->token.opr.idx;
3800
      node->token.opr.idx = dfa->subexp_map[idx];
3801
      dfa->used_bkref_map |= 1 << node->token.opr.idx;
3802
    }
3803

3804
  else if (node->token.type == SUBEXP
3805
           && node->left && node->left->token.type == SUBEXP)
3806
    {
3807
      int other_idx = node->left->token.opr.idx;
3808

3809
      node->left = node->left->left;
3810
      if (node->left)
3811
        node->left->parent = node;
3812

3813
      dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
3814
      if (other_idx < BITSET_WORD_BITS)
3815
	  dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
3816
    }
3817

3818
  return REG_NOERROR;
3819
}
3820

3821
/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
3822
   of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
3823
static reg_errcode_t
3824
lower_subexps (void *extra, bin_tree_t *node)
3825
{
3826
  regex_t *preg = (regex_t *) extra;
3827
  reg_errcode_t err = REG_NOERROR;
3828

3829
  if (node->left && node->left->token.type == SUBEXP)
3830
    {
3831
      node->left = lower_subexp (&err, preg, node->left);
3832
      if (node->left)
3833
	node->left->parent = node;
3834
    }
3835
  if (node->right && node->right->token.type == SUBEXP)
3836
    {
3837
      node->right = lower_subexp (&err, preg, node->right);
3838
      if (node->right)
3839
	node->right->parent = node;
3840
    }
3841

3842
  return err;
3843
}
3844

3845
static bin_tree_t *
3846
lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
3847
{
3848
  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3849
  bin_tree_t *body = node->left;
3850
  bin_tree_t *op, *cls, *tree1, *tree;
3851

3852
  if (preg->no_sub
3853
      /* We do not optimize empty subexpressions, because otherwise we may
3854
	 have bad CONCAT nodes with NULL children.  This is obviously not
3855
	 very common, so we do not lose much.  An example that triggers
3856
	 this case is the sed "script" /\(\)/x.  */
3857
      && node->left != NULL
3858
      && (node->token.opr.idx >= BITSET_WORD_BITS
3859
	  || !(dfa->used_bkref_map
3860
	       & ((bitset_word_t) 1 << node->token.opr.idx))))
3861
    return node->left;
3862

3863
  /* Convert the SUBEXP node to the concatenation of an
3864
     OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
3865
  op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
3866
  cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
3867
  tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
3868
  tree = create_tree (dfa, op, tree1, CONCAT);
3869
  if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
3870
    {
3871
      *err = REG_ESPACE;
3872
      return NULL;
3873
    }
3874

3875
  op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
3876
  op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
3877
  return tree;
3878
}
3879

3880
/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
3881
   nodes.  Requires a postorder visit.  */
3882
static reg_errcode_t
3883
calc_first (void *extra, bin_tree_t *node)
3884
{
3885
  re_dfa_t *dfa = (re_dfa_t *) extra;
3886
  if (node->token.type == CONCAT)
3887
    {
3888
      node->first = node->left->first;
3889
      node->node_idx = node->left->node_idx;
3890
    }
3891
  else
3892
    {
3893
      node->first = node;
3894
      node->node_idx = re_dfa_add_node (dfa, node->token);
3895
      if (BE (node->node_idx == -1, 0))
3896
        return REG_ESPACE;
3897
    }
3898
  return REG_NOERROR;
3899
}
3900

3901
/* Pass 2: compute NEXT on the tree.  Preorder visit.  */
3902
static reg_errcode_t
3903
calc_next (void *extra, bin_tree_t *node)
3904
{
3905
  switch (node->token.type)
3906
    {
3907
    case OP_DUP_ASTERISK:
3908
      node->left->next = node;
3909
      break;
3910
    case CONCAT:
3911
      node->left->next = node->right->first;
3912
      node->right->next = node->next;
3913
      break;
3914
    default:
3915
      if (node->left)
3916
	node->left->next = node->next;
3917
      if (node->right)
3918
        node->right->next = node->next;
3919
      break;
3920
    }
3921
  return REG_NOERROR;
3922
}
3923

3924
/* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
3925
static reg_errcode_t
3926
link_nfa_nodes (void *extra, bin_tree_t *node)
3927
{
3928
  re_dfa_t *dfa = (re_dfa_t *) extra;
3929
  int idx = node->node_idx;
3930
  reg_errcode_t err = REG_NOERROR;
3931

3932
  switch (node->token.type)
3933
    {
3934
    case CONCAT:
3935
      break;
3936

3937
    case END_OF_RE:
3938
      assert (node->next == NULL);
3939
      break;
3940

3941
    case OP_DUP_ASTERISK:
3942
    case OP_ALT:
3943
      {
3944
	int left, right;
3945
	dfa->has_plural_match = 1;
3946
	if (node->left != NULL)
3947
	  left = node->left->first->node_idx;
3948
	else
3949
	  left = node->next->node_idx;
3950
	if (node->right != NULL)
3951
	  right = node->right->first->node_idx;
3952
	else
3953
	  right = node->next->node_idx;
3954
	assert (left > -1);
3955
	assert (right > -1);
3956
	err = re_node_set_init_2 (dfa->edests + idx, left, right);
3957
      }
3958
      break;
3959

3960
    case ANCHOR:
3961
    case OP_OPEN_SUBEXP:
3962
    case OP_CLOSE_SUBEXP:
3963
      err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
3964
      break;
3965

3966
    case OP_BACK_REF:
3967
      dfa->nexts[idx] = node->next->node_idx;
3968
      if (node->token.type == OP_BACK_REF)
3969
	re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
3970
      break;
3971

3972
    default:
3973
      assert (!IS_EPSILON_NODE (node->token.type));
3974
      dfa->nexts[idx] = node->next->node_idx;
3975
      break;
3976
    }
3977

3978
  return err;
3979
}
3980

3981
/* Duplicate the epsilon closure of the node ROOT_NODE.
3982
   Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
3983
   to their own constraint.  */
3984

3985
static reg_errcode_t
3986
internal_function
3987
duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
3988
			int root_node, unsigned int init_constraint)
3989
{
3990
  int org_node, clone_node, ret;
3991
  unsigned int constraint = init_constraint;
3992
  for (org_node = top_org_node, clone_node = top_clone_node;;)
3993
    {
3994
      int org_dest, clone_dest;
3995
      if (dfa->nodes[org_node].type == OP_BACK_REF)
3996
	{
3997
	  /* If the back reference epsilon-transit, its destination must
3998
	     also have the constraint.  Then duplicate the epsilon closure
3999
	     of the destination of the back reference, and store it in
4000
	     edests of the back reference.  */
4001
	  org_dest = dfa->nexts[org_node];
4002
	  re_node_set_empty (dfa->edests + clone_node);
4003
	  clone_dest = duplicate_node (dfa, org_dest, constraint);
4004
	  if (BE (clone_dest == -1, 0))
4005
	    return REG_ESPACE;
4006
	  dfa->nexts[clone_node] = dfa->nexts[org_node];
4007
	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
4008
	  if (BE (ret < 0, 0))
4009
	    return REG_ESPACE;
4010
	}
4011
      else if (dfa->edests[org_node].nelem == 0)
4012
	{
4013
	  /* In case of the node can't epsilon-transit, don't duplicate the
4014
	     destination and store the original destination as the
4015
	     destination of the node.  */
4016
	  dfa->nexts[clone_node] = dfa->nexts[org_node];
4017
	  break;
4018
	}
4019
      else if (dfa->edests[org_node].nelem == 1)
4020
	{
4021
	  /* In case of the node can epsilon-transit, and it has only one
4022
	     destination.  */
4023
	  org_dest = dfa->edests[org_node].elems[0];
4024
	  re_node_set_empty (dfa->edests + clone_node);
4025
	  if (dfa->nodes[org_node].type == ANCHOR)
4026
	    {
4027
	      /* In case of the node has another constraint, append it.  */
4028
	      if (org_node == root_node && clone_node != org_node)
4029
		{
4030
		  /* ...but if the node is root_node itself, it means the
4031
		     epsilon closure have a loop, then tie it to the
4032
		     destination of the root_node.  */
4033
		  ret = re_node_set_insert (dfa->edests + clone_node,
4034
					    org_dest);
4035
		  if (BE (ret < 0, 0))
4036
		    return REG_ESPACE;
4037
		  break;
4038
		}
4039
	      constraint |= dfa->nodes[org_node].opr.ctx_type;
4040
	    }
4041
	  clone_dest = duplicate_node (dfa, org_dest, constraint);
4042
	  if (BE (clone_dest == -1, 0))
4043
	    return REG_ESPACE;
4044
	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
4045
	  if (BE (ret < 0, 0))
4046
	    return REG_ESPACE;
4047
	}
4048
      else /* dfa->edests[org_node].nelem == 2 */
4049
	{
4050
	  /* In case of the node can epsilon-transit, and it has two
4051
	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
4052
	  org_dest = dfa->edests[org_node].elems[0];
4053
	  re_node_set_empty (dfa->edests + clone_node);
4054
	  /* Search for a duplicated node which satisfies the constraint.  */
4055
	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
4056
	  if (clone_dest == -1)
4057
	    {
4058
	      /* There are no such a duplicated node, create a new one.  */
4059
	      reg_errcode_t err;
4060
	      clone_dest = duplicate_node (dfa, org_dest, constraint);
4061
	      if (BE (clone_dest == -1, 0))
4062
		return REG_ESPACE;
4063
	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
4064
	      if (BE (ret < 0, 0))
4065
		return REG_ESPACE;
4066
	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
4067
					    root_node, constraint);
4068
	      if (BE (err != REG_NOERROR, 0))
4069
		return err;
4070
	    }
4071
	  else
4072
	    {
4073
	      /* There are a duplicated node which satisfy the constraint,
4074
		 use it to avoid infinite loop.  */
4075
	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
4076
	      if (BE (ret < 0, 0))
4077
		return REG_ESPACE;
4078
	    }
4079

4080
	  org_dest = dfa->edests[org_node].elems[1];
4081
	  clone_dest = duplicate_node (dfa, org_dest, constraint);
4082
	  if (BE (clone_dest == -1, 0))
4083
	    return REG_ESPACE;
4084
	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
4085
	  if (BE (ret < 0, 0))
4086
	    return REG_ESPACE;
4087
	}
4088
      org_node = org_dest;
4089
      clone_node = clone_dest;
4090
    }
4091
  return REG_NOERROR;
4092
}
4093

4094
/* Search for a node which is duplicated from the node ORG_NODE, and
4095
   satisfies the constraint CONSTRAINT.  */
4096

4097
static int
4098
search_duplicated_node (const re_dfa_t *dfa, int org_node,
4099
			unsigned int constraint)
4100
{
4101
  int idx;
4102
  for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
4103
    {
4104
      if (org_node == dfa->org_indices[idx]
4105
	  && constraint == dfa->nodes[idx].constraint)
4106
	return idx; /* Found.  */
4107
    }
4108
  return -1; /* Not found.  */
4109
}
4110

4111
/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
4112
   Return the index of the new node, or -1 if insufficient storage is
4113
   available.  */
4114

4115
static int
4116
duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
4117
{
4118
  int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
4119
  if (BE (dup_idx != -1, 1))
4120
    {
4121
      dfa->nodes[dup_idx].constraint = constraint;
4122
      if (dfa->nodes[org_idx].type == ANCHOR)
4123
	dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
4124
      dfa->nodes[dup_idx].duplicated = 1;
4125

4126
      /* Store the index of the original node.  */
4127
      dfa->org_indices[dup_idx] = org_idx;
4128
    }
4129
  return dup_idx;
4130
}
4131

4132
static reg_errcode_t
4133
calc_inveclosure (re_dfa_t *dfa)
4134
{
4135
  int src, idx, ret;
4136
  for (idx = 0; idx < dfa->nodes_len; ++idx)
4137
    re_node_set_init_empty (dfa->inveclosures + idx);
4138

4139
  for (src = 0; src < dfa->nodes_len; ++src)
4140
    {
4141
      int *elems = dfa->eclosures[src].elems;
4142
      for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
4143
	{
4144
	  ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
4145
	  if (BE (ret == -1, 0))
4146
	    return REG_ESPACE;
4147
	}
4148
    }
4149

4150
  return REG_NOERROR;
4151
}
4152

4153
/* Calculate "eclosure" for all the node in DFA.  */
4154

4155
static reg_errcode_t
4156
calc_eclosure (re_dfa_t *dfa)
4157
{
4158
  int node_idx, incomplete;
4159
#ifdef DEBUG
4160
  assert (dfa->nodes_len > 0);
4161
#endif
4162
  incomplete = 0;
4163
  /* For each nodes, calculate epsilon closure.  */
4164
  for (node_idx = 0; ; ++node_idx)
4165
    {
4166
      reg_errcode_t err;
4167
      re_node_set eclosure_elem;
4168
      if (node_idx == dfa->nodes_len)
4169
	{
4170
	  if (!incomplete)
4171
	    break;
4172
	  incomplete = 0;
4173
	  node_idx = 0;
4174
	}
4175

4176
#ifdef DEBUG
4177
      assert (dfa->eclosures[node_idx].nelem != -1);
4178
#endif
4179

4180
      /* If we have already calculated, skip it.  */
4181
      if (dfa->eclosures[node_idx].nelem != 0)
4182
	continue;
4183
      /* Calculate epsilon closure of `node_idx'.  */
4184
      err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
4185
      if (BE (err != REG_NOERROR, 0))
4186
	return err;
4187

4188
      if (dfa->eclosures[node_idx].nelem == 0)
4189
	{
4190
	  incomplete = 1;
4191
	  re_node_set_free (&eclosure_elem);
4192
	}
4193
    }
4194
  return REG_NOERROR;
4195
}
4196

4197
/* Calculate epsilon closure of NODE.  */
4198

4199
static reg_errcode_t
4200
calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
4201
{
4202
  reg_errcode_t err;
4203
  unsigned int constraint;
4204
  int i, incomplete;
4205
  re_node_set eclosure;
4206
  incomplete = 0;
4207
  err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
4208
  if (BE (err != REG_NOERROR, 0))
4209
    return err;
4210

4211
  /* This indicates that we are calculating this node now.
4212
     We reference this value to avoid infinite loop.  */
4213
  dfa->eclosures[node].nelem = -1;
4214

4215
  constraint = ((dfa->nodes[node].type == ANCHOR)
4216
		? dfa->nodes[node].opr.ctx_type : 0);
4217
  /* If the current node has constraints, duplicate all nodes.
4218
     Since they must inherit the constraints.  */
4219
  if (constraint
4220
      && dfa->edests[node].nelem
4221
      && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
4222
    {
4223
      err = duplicate_node_closure (dfa, node, node, node, constraint);
4224
      if (BE (err != REG_NOERROR, 0))
4225
	return err;
4226
    }
4227

4228
  /* Expand each epsilon destination nodes.  */
4229
  if (IS_EPSILON_NODE(dfa->nodes[node].type))
4230
    for (i = 0; i < dfa->edests[node].nelem; ++i)
4231
      {
4232
	re_node_set eclosure_elem;
4233
	int edest = dfa->edests[node].elems[i];
4234
	/* If calculating the epsilon closure of `edest' is in progress,
4235
	   return intermediate result.  */
4236
	if (dfa->eclosures[edest].nelem == -1)
4237
	  {
4238
	    incomplete = 1;
4239
	    continue;
4240
	  }
4241
	/* If we haven't calculated the epsilon closure of `edest' yet,
4242
	   calculate now. Otherwise use calculated epsilon closure.  */
4243
	if (dfa->eclosures[edest].nelem == 0)
4244
	  {
4245
	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
4246
	    if (BE (err != REG_NOERROR, 0))
4247
	      return err;
4248
	  }
4249
	else
4250
	  eclosure_elem = dfa->eclosures[edest];
4251
	/* Merge the epsilon closure of `edest'.  */
4252
	re_node_set_merge (&eclosure, &eclosure_elem);
4253
	/* If the epsilon closure of `edest' is incomplete,
4254
	   the epsilon closure of this node is also incomplete.  */
4255
	if (dfa->eclosures[edest].nelem == 0)
4256
	  {
4257
	    incomplete = 1;
4258
	    re_node_set_free (&eclosure_elem);
4259
	  }
4260
      }
4261

4262
  /* Epsilon closures include itself.  */
4263
  re_node_set_insert (&eclosure, node);
4264
  if (incomplete && !root)
4265
    dfa->eclosures[node].nelem = 0;
4266
  else
4267
    dfa->eclosures[node] = eclosure;
4268
  *new_set = eclosure;
4269
  return REG_NOERROR;
4270
}
4271

4272
/* Functions for token which are used in the parser.  */
4273

4274
/* Fetch a token from INPUT.
4275
   We must not use this function inside bracket expressions.  */
4276

4277
static void
4278
internal_function
4279
fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
4280
{
4281
  re_string_skip_bytes (input, peek_token (result, input, syntax));
4282
}
4283

4284
/* Peek a token from INPUT, and return the length of the token.
4285
   We must not use this function inside bracket expressions.  */
4286

4287
static int
4288
internal_function
4289
peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
4290
{
4291
  unsigned char c;
4292

4293
  if (re_string_eoi (input))
4294
    {
4295
      token->type = END_OF_RE;
4296
      return 0;
4297
    }
4298

4299
  c = re_string_peek_byte (input, 0);
4300
  token->opr.c = c;
4301

4302
  token->word_char = 0;
4303
#ifdef RE_ENABLE_I18N
4304
  token->mb_partial = 0;
4305
  if (input->mb_cur_max > 1 &&
4306
      !re_string_first_byte (input, re_string_cur_idx (input)))
4307
    {
4308
      token->type = CHARACTER;
4309
      token->mb_partial = 1;
4310
      return 1;
4311
    }
4312
#endif
4313
  if (c == '\\')
4314
    {
4315
      unsigned char c2;
4316
      if (re_string_cur_idx (input) + 1 >= re_string_length (input))
4317
	{
4318
	  token->type = BACK_SLASH;
4319
	  return 1;
4320
	}
4321

4322
      c2 = re_string_peek_byte_case (input, 1);
4323
      token->opr.c = c2;
4324
      token->type = CHARACTER;
4325
#ifdef RE_ENABLE_I18N
4326
      if (input->mb_cur_max > 1)
4327
	{
4328
	  wint_t wc = re_string_wchar_at (input,
4329
					  re_string_cur_idx (input) + 1);
4330
	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
4331
	}
4332
      else
4333
#endif
4334
	token->word_char = IS_WORD_CHAR (c2) != 0;
4335

4336
      switch (c2)
4337
	{
4338
	case '|':
4339
	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
4340
	    token->type = OP_ALT;
4341
	  break;
4342
	case '1': case '2': case '3': case '4': case '5':
4343
	case '6': case '7': case '8': case '9':
4344
	  if (!(syntax & RE_NO_BK_REFS))
4345
	    {
4346
	      token->type = OP_BACK_REF;
4347
	      token->opr.idx = c2 - '1';
4348
	    }
4349
	  break;
4350
	case '<':
4351
	  if (!(syntax & RE_NO_GNU_OPS))
4352
	    {
4353
	      token->type = ANCHOR;
4354
	      token->opr.ctx_type = WORD_FIRST;
4355
	    }
4356
	  break;
4357
	case '>':
4358
	  if (!(syntax & RE_NO_GNU_OPS))
4359
	    {
4360
	      token->type = ANCHOR;
4361
	      token->opr.ctx_type = WORD_LAST;
4362
	    }
4363
	  break;
4364
	case 'b':
4365
	  if (!(syntax & RE_NO_GNU_OPS))
4366
	    {
4367
	      token->type = ANCHOR;
4368
	      token->opr.ctx_type = WORD_DELIM;
4369
	    }
4370
	  break;
4371
	case 'B':
4372
	  if (!(syntax & RE_NO_GNU_OPS))
4373
	    {
4374
	      token->type = ANCHOR;
4375
	      token->opr.ctx_type = NOT_WORD_DELIM;
4376
	    }
4377
	  break;
4378
	case 'w':
4379
	  if (!(syntax & RE_NO_GNU_OPS))
4380
	    token->type = OP_WORD;
4381
	  break;
4382
	case 'W':
4383
	  if (!(syntax & RE_NO_GNU_OPS))
4384
	    token->type = OP_NOTWORD;
4385
	  break;
4386
	case 's':
4387
	  if (!(syntax & RE_NO_GNU_OPS))
4388
	    token->type = OP_SPACE;
4389
	  break;
4390
	case 'S':
4391
	  if (!(syntax & RE_NO_GNU_OPS))
4392
	    token->type = OP_NOTSPACE;
4393
	  break;
4394
	case '`':
4395
	  if (!(syntax & RE_NO_GNU_OPS))
4396
	    {
4397
	      token->type = ANCHOR;
4398
	      token->opr.ctx_type = BUF_FIRST;
4399
	    }
4400
	  break;
4401
	case '\'':
4402
	  if (!(syntax & RE_NO_GNU_OPS))
4403
	    {
4404
	      token->type = ANCHOR;
4405
	      token->opr.ctx_type = BUF_LAST;
4406
	    }
4407
	  break;
4408
	case '(':
4409
	  if (!(syntax & RE_NO_BK_PARENS))
4410
	    token->type = OP_OPEN_SUBEXP;
4411
	  break;
4412
	case ')':
4413
	  if (!(syntax & RE_NO_BK_PARENS))
4414
	    token->type = OP_CLOSE_SUBEXP;
4415
	  break;
4416
	case '+':
4417
	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
4418
	    token->type = OP_DUP_PLUS;
4419
	  break;
4420
	case '?':
4421
	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
4422
	    token->type = OP_DUP_QUESTION;
4423
	  break;
4424
	case '{':
4425
	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
4426
	    token->type = OP_OPEN_DUP_NUM;
4427
	  break;
4428
	case '}':
4429
	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
4430
	    token->type = OP_CLOSE_DUP_NUM;
4431
	  break;
4432
	default:
4433
	  break;
4434
	}
4435
      return 2;
4436
    }
4437

4438
  token->type = CHARACTER;
4439
#ifdef RE_ENABLE_I18N
4440
  if (input->mb_cur_max > 1)
4441
    {
4442
      wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
4443
      token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
4444
    }
4445
  else
4446
#endif
4447
    token->word_char = IS_WORD_CHAR (token->opr.c);
4448

4449
  switch (c)
4450
    {
4451
    case '\n':
4452
      if (syntax & RE_NEWLINE_ALT)
4453
	token->type = OP_ALT;
4454
      break;
4455
    case '|':
4456
      if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
4457
	token->type = OP_ALT;
4458
      break;
4459
    case '*':
4460
      token->type = OP_DUP_ASTERISK;
4461
      break;
4462
    case '+':
4463
      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
4464
	token->type = OP_DUP_PLUS;
4465
      break;
4466
    case '?':
4467
      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
4468
	token->type = OP_DUP_QUESTION;
4469
      break;
4470
    case '{':
4471
      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
4472
	token->type = OP_OPEN_DUP_NUM;
4473
      break;
4474
    case '}':
4475
      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
4476
	token->type = OP_CLOSE_DUP_NUM;
4477
      break;
4478
    case '(':
4479
      if (syntax & RE_NO_BK_PARENS)
4480
	token->type = OP_OPEN_SUBEXP;
4481
      break;
4482
    case ')':
4483
      if (syntax & RE_NO_BK_PARENS)
4484
	token->type = OP_CLOSE_SUBEXP;
4485
      break;
4486
    case '[':
4487
      token->type = OP_OPEN_BRACKET;
4488
      break;
4489
    case '.':
4490
      token->type = OP_PERIOD;
4491
      break;
4492
    case '^':
4493
      if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
4494
	  re_string_cur_idx (input) != 0)
4495
	{
4496
	  char prev = re_string_peek_byte (input, -1);
4497
	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
4498
	    break;
4499
	}
4500
      token->type = ANCHOR;
4501
      token->opr.ctx_type = LINE_FIRST;
4502
      break;
4503
    case '$':
4504
      if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
4505
	  re_string_cur_idx (input) + 1 != re_string_length (input))
4506
	{
4507
	  re_token_t next;
4508
	  re_string_skip_bytes (input, 1);
4509
	  peek_token (&next, input, syntax);
4510
	  re_string_skip_bytes (input, -1);
4511
	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
4512
	    break;
4513
	}
4514
      token->type = ANCHOR;
4515
      token->opr.ctx_type = LINE_LAST;
4516
      break;
4517
    default:
4518
      break;
4519
    }
4520
  return 1;
4521
}
4522

4523
/* Peek a token from INPUT, and return the length of the token.
4524
   We must not use this function out of bracket expressions.  */
4525

4526
static int
4527
internal_function
4528
peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
4529
{
4530
  unsigned char c;
4531
  if (re_string_eoi (input))
4532
    {
4533
      token->type = END_OF_RE;
4534
      return 0;
4535
    }
4536
  c = re_string_peek_byte (input, 0);
4537
  token->opr.c = c;
4538

4539
#ifdef RE_ENABLE_I18N
4540
  if (input->mb_cur_max > 1 &&
4541
      !re_string_first_byte (input, re_string_cur_idx (input)))
4542
    {
4543
      token->type = CHARACTER;
4544
      return 1;
4545
    }
4546
#endif /* RE_ENABLE_I18N */
4547

4548
  if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
4549
      && re_string_cur_idx (input) + 1 < re_string_length (input))
4550
    {
4551
      /* In this case, '\' escape a character.  */
4552
      unsigned char c2;
4553
      re_string_skip_bytes (input, 1);
4554
      c2 = re_string_peek_byte (input, 0);
4555
      token->opr.c = c2;
4556
      token->type = CHARACTER;
4557
      return 1;
4558
    }
4559
  if (c == '[') /* '[' is a special char in a bracket exps.  */
4560
    {
4561
      unsigned char c2;
4562
      int token_len;
4563
      if (re_string_cur_idx (input) + 1 < re_string_length (input))
4564
	c2 = re_string_peek_byte (input, 1);
4565
      else
4566
	c2 = 0;
4567
      token->opr.c = c2;
4568
      token_len = 2;
4569
      switch (c2)
4570
	{
4571
	case '.':
4572
	  token->type = OP_OPEN_COLL_ELEM;
4573
	  break;
4574
	case '=':
4575
	  token->type = OP_OPEN_EQUIV_CLASS;
4576
	  break;
4577
	case ':':
4578
	  if (syntax & RE_CHAR_CLASSES)
4579
	    {
4580
	      token->type = OP_OPEN_CHAR_CLASS;
4581
	      break;
4582
	    }
4583
	  /* else fall through.  */
4584
	default:
4585
	  token->type = CHARACTER;
4586
	  token->opr.c = c;
4587
	  token_len = 1;
4588
	  break;
4589
	}
4590
      return token_len;
4591
    }
4592
  switch (c)
4593
    {
4594
    case '-':
4595
      token->type = OP_CHARSET_RANGE;
4596
      break;
4597
    case ']':
4598
      token->type = OP_CLOSE_BRACKET;
4599
      break;
4600
    case '^':
4601
      token->type = OP_NON_MATCH_LIST;
4602
      break;
4603
    default:
4604
      token->type = CHARACTER;
4605
    }
4606
  return 1;
4607
}
4608

4609
/* Functions for parser.  */
4610

4611
/* Entry point of the parser.
4612
   Parse the regular expression REGEXP and return the structure tree.
4613
   If an error is occured, ERR is set by error code, and return NULL.
4614
   This function build the following tree, from regular expression <reg_exp>:
4615
	   CAT
4616
	   / \
4617
	  /   \
4618
   <reg_exp>  EOR
4619

4620
   CAT means concatenation.
4621
   EOR means end of regular expression.  */
4622

4623
static bin_tree_t *
4624
parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
4625
       reg_errcode_t *err)
4626
{
4627
  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
4628
  bin_tree_t *tree, *eor, *root;
4629
  re_token_t current_token;
4630
  dfa->syntax = syntax;
4631
  fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
4632
  tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
4633
  if (BE (*err != REG_NOERROR && tree == NULL, 0))
4634
    return NULL;
4635
  eor = create_tree (dfa, NULL, NULL, END_OF_RE);
4636
  if (tree != NULL)
4637
    root = create_tree (dfa, tree, eor, CONCAT);
4638
  else
4639
    root = eor;
4640
  if (BE (eor == NULL || root == NULL, 0))
4641
    {
4642
      *err = REG_ESPACE;
4643
      return NULL;
4644
    }
4645
  return root;
4646
}
4647

4648
/* This function build the following tree, from regular expression
4649
   <branch1>|<branch2>:
4650
	   ALT
4651
	   / \
4652
	  /   \
4653
   <branch1> <branch2>
4654

4655
   ALT means alternative, which represents the operator `|'.  */
4656

4657
static bin_tree_t *
4658
parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
4659
	       reg_syntax_t syntax, int nest, reg_errcode_t *err)
4660
{
4661
  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
4662
  bin_tree_t *tree, *branch = NULL;
4663
  tree = parse_branch (regexp, preg, token, syntax, nest, err);
4664
  if (BE (*err != REG_NOERROR && tree == NULL, 0))
4665
    return NULL;
4666

4667
  while (token->type == OP_ALT)
4668
    {
4669
      fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
4670
      if (token->type != OP_ALT && token->type != END_OF_RE
4671
	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
4672
	{
4673
	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
4674
	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
4675
	    return NULL;
4676
	}
4677
      else
4678
	branch = NULL;
4679
      tree = create_tree (dfa, tree, branch, OP_ALT);
4680
      if (BE (tree == NULL, 0))
4681
	{
4682
	  *err = REG_ESPACE;
4683
	  return NULL;
4684
	}
4685
    }
4686
  return tree;
4687
}
4688

4689
/* This function build the following tree, from regular expression
4690
   <exp1><exp2>:
4691
	CAT
4692
	/ \
4693
       /   \
4694
   <exp1> <exp2>
4695

4696
   CAT means concatenation.  */
4697

4698
static bin_tree_t *
4699
parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
4700
	      reg_syntax_t syntax, int nest, reg_errcode_t *err)
4701
{
4702
  bin_tree_t *tree, *exp;
4703
  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
4704
  tree = parse_expression (regexp, preg, token, syntax, nest, err);
4705
  if (BE (*err != REG_NOERROR && tree == NULL, 0))
4706
    return NULL;
4707

4708
  while (token->type != OP_ALT && token->type != END_OF_RE
4709
	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
4710
    {
4711
      exp = parse_expression (regexp, preg, token, syntax, nest, err);
4712
      if (BE (*err != REG_NOERROR && exp == NULL, 0))
4713
	{
4714
	  return NULL;
4715
	}
4716
      if (tree != NULL && exp != NULL)
4717
	{
4718
	  tree = create_tree (dfa, tree, exp, CONCAT);
4719
	  if (tree == NULL)
4720
	    {
4721
	      *err = REG_ESPACE;
4722
	      return NULL;
4723
	    }
4724
	}
4725
      else if (tree == NULL)
4726
	tree = exp;
4727
      /* Otherwise exp == NULL, we don't need to create new tree.  */
4728
    }
4729
  return tree;
4730
}
4731

4732
/* This function build the following tree, from regular expression a*:
4733
	 *
4734
	 |
4735
	 a
4736
*/
4737

4738
static bin_tree_t *
4739
parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
4740
		  reg_syntax_t syntax, int nest, reg_errcode_t *err)
4741
{
4742
  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
4743
  bin_tree_t *tree;
4744
  switch (token->type)
4745
    {
4746
    case CHARACTER:
4747
      tree = create_token_tree (dfa, NULL, NULL, token);
4748
      if (BE (tree == NULL, 0))
4749
	{
4750
	  *err = REG_ESPACE;
4751
	  return NULL;
4752
	}
4753
#ifdef RE_ENABLE_I18N
4754
      if (dfa->mb_cur_max > 1)
4755
	{
4756
	  while (!re_string_eoi (regexp)
4757
		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
4758
	    {
4759
	      bin_tree_t *mbc_remain;
4760
	      fetch_token (token, regexp, syntax);
4761
	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
4762
	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
4763
	      if (BE (mbc_remain == NULL || tree == NULL, 0))
4764
		{
4765
		  *err = REG_ESPACE;
4766
		  return NULL;
4767
		}
4768
	    }
4769
	}
4770
#endif
4771
      break;
4772
    case OP_OPEN_SUBEXP:
4773
      tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
4774
      if (BE (*err != REG_NOERROR && tree == NULL, 0))
4775
	return NULL;
4776
      break;
4777
    case OP_OPEN_BRACKET:
4778
      tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
4779
      if (BE (*err != REG_NOERROR && tree == NULL, 0))
4780
	return NULL;
4781
      break;
4782
    case OP_BACK_REF:
4783
      if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
4784
	{
4785
	  *err = REG_ESUBREG;
4786
	  return NULL;
4787
	}
4788
      dfa->used_bkref_map |= 1 << token->opr.idx;
4789
      tree = create_token_tree (dfa, NULL, NULL, token);
4790
      if (BE (tree == NULL, 0))
4791
	{
4792
	  *err = REG_ESPACE;
4793
	  return NULL;
4794
	}
4795
      ++dfa->nbackref;
4796
      dfa->has_mb_node = 1;
4797
      break;
4798
    case OP_OPEN_DUP_NUM:
4799
      if (syntax & RE_CONTEXT_INVALID_DUP)
4800
	{
4801
	  *err = REG_BADRPT;
4802
	  return NULL;
4803
	}
4804
      /* FALLTHROUGH */
4805
    case OP_DUP_ASTERISK:
4806
    case OP_DUP_PLUS:
4807
    case OP_DUP_QUESTION:
4808
      if (syntax & RE_CONTEXT_INVALID_OPS)
4809
	{
4810
	  *err = REG_BADRPT;
4811
	  return NULL;
4812
	}
4813
      else if (syntax & RE_CONTEXT_INDEP_OPS)
4814
	{
4815
	  fetch_token (token, regexp, syntax);
4816
	  return parse_expression (regexp, preg, token, syntax, nest, err);
4817
	}
4818
      /* else fall through  */
4819
    case OP_CLOSE_SUBEXP:
4820
      if ((token->type == OP_CLOSE_SUBEXP) &&
4821
	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
4822
	{
4823
	  *err = REG_ERPAREN;
4824
	  return NULL;
4825
	}
4826
      /* else fall through  */
4827
    case OP_CLOSE_DUP_NUM:
4828
      /* We treat it as a normal character.  */
4829

4830
      /* Then we can these characters as normal characters.  */
4831
      token->type = CHARACTER;
4832
      /* mb_partial and word_char bits should be initialized already
4833
	 by peek_token.  */
4834
      tree = create_token_tree (dfa, NULL, NULL, token);
4835
      if (BE (tree == NULL, 0))
4836
	{
4837
	  *err = REG_ESPACE;
4838
	  return NULL;
4839
	}
4840
      break;
4841
    case ANCHOR:
4842
      if ((token->opr.ctx_type
4843
	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
4844
	  && dfa->word_ops_used == 0)
4845
	init_word_char (dfa);
4846
      if (token->opr.ctx_type == WORD_DELIM
4847
          || token->opr.ctx_type == NOT_WORD_DELIM)
4848
	{
4849
	  bin_tree_t *tree_first, *tree_last;
4850
	  if (token->opr.ctx_type == WORD_DELIM)
4851
	    {
4852
	      token->opr.ctx_type = WORD_FIRST;
4853
	      tree_first = create_token_tree (dfa, NULL, NULL, token);
4854
	      token->opr.ctx_type = WORD_LAST;
4855
            }
4856
          else
4857
            {
4858
	      token->opr.ctx_type = INSIDE_WORD;
4859
	      tree_first = create_token_tree (dfa, NULL, NULL, token);
4860
	      token->opr.ctx_type = INSIDE_NOTWORD;
4861
            }
4862
	  tree_last = create_token_tree (dfa, NULL, NULL, token);
4863
	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
4864
	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
4865
	    {
4866
	      *err = REG_ESPACE;
4867
	      return NULL;
4868
	    }
4869
	}
4870
      else
4871
	{
4872
	  tree = create_token_tree (dfa, NULL, NULL, token);
4873
	  if (BE (tree == NULL, 0))
4874
	    {
4875
	      *err = REG_ESPACE;
4876
	      return NULL;
4877
	    }
4878
	}
4879
      /* We must return here, since ANCHORs can't be followed
4880
	 by repetition operators.
4881
	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
4882
	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
4883
      fetch_token (token, regexp, syntax);
4884
      return tree;
4885
    case OP_PERIOD:
4886
      tree = create_token_tree (dfa, NULL, NULL, token);
4887
      if (BE (tree == NULL, 0))
4888
	{
4889
	  *err = REG_ESPACE;
4890
	  return NULL;
4891
	}
4892
      if (dfa->mb_cur_max > 1)
4893
	dfa->has_mb_node = 1;
4894
      break;
4895
    case OP_WORD:
4896
    case OP_NOTWORD:
4897
      tree = build_charclass_op (dfa, regexp->trans,
4898
				 (const unsigned char *) "alnum",
4899
				 (const unsigned char *) "_",
4900
				 token->type == OP_NOTWORD, err);
4901
      if (BE (*err != REG_NOERROR && tree == NULL, 0))
4902
	return NULL;
4903
      break;
4904
    case OP_SPACE:
4905
    case OP_NOTSPACE:
4906
      tree = build_charclass_op (dfa, regexp->trans,
4907
				 (const unsigned char *) "space",
4908
				 (const unsigned char *) "",
4909
				 token->type == OP_NOTSPACE, err);
4910
      if (BE (*err != REG_NOERROR && tree == NULL, 0))
4911
	return NULL;
4912
      break;
4913
    case OP_ALT:
4914
    case END_OF_RE:
4915
      return NULL;
4916
    case BACK_SLASH:
4917
      *err = REG_EESCAPE;
4918
      return NULL;
4919
    default:
4920
      /* Must not happen?  */
4921
#ifdef DEBUG
4922
      assert (0);
4923
#endif
4924
      return NULL;
4925
    }
4926
  fetch_token (token, regexp, syntax);
4927

4928
  while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
4929
	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
4930
    {
4931
      tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
4932
      if (BE (*err != REG_NOERROR && tree == NULL, 0))
4933
	return NULL;
4934
      /* In BRE consecutive duplications are not allowed.  */
4935
      if ((syntax & RE_CONTEXT_INVALID_DUP)
4936
	  && (token->type == OP_DUP_ASTERISK
4937
	      || token->type == OP_OPEN_DUP_NUM))
4938
	{
4939
	  *err = REG_BADRPT;
4940
	  return NULL;
4941
	}
4942
    }
4943

4944
  return tree;
4945
}
4946

4947
/* This function build the following tree, from regular expression
4948
   (<reg_exp>):
4949
	 SUBEXP
4950
	    |
4951
	<reg_exp>
4952
*/
4953

4954
static bin_tree_t *
4955
parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
4956
	       reg_syntax_t syntax, int nest, reg_errcode_t *err)
4957
{
4958
  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
4959
  bin_tree_t *tree;
4960
  size_t cur_nsub;
4961
  cur_nsub = preg->re_nsub++;
4962

4963
  fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
4964

4965
  /* The subexpression may be a null string.  */
4966
  if (token->type == OP_CLOSE_SUBEXP)
4967
    tree = NULL;
4968
  else
4969
    {
4970
      tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
4971
      if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
4972
        *err = REG_EPAREN;
4973
      if (BE (*err != REG_NOERROR, 0))
4974
	return NULL;
4975
    }
4976

4977
  if (cur_nsub <= '9' - '1')
4978
    dfa->completed_bkref_map |= 1 << cur_nsub;
4979

4980
  tree = create_tree (dfa, tree, NULL, SUBEXP);
4981
  if (BE (tree == NULL, 0))
4982
    {
4983
      *err = REG_ESPACE;
4984
      return NULL;
4985
    }
4986
  tree->token.opr.idx = cur_nsub;
4987
  return tree;
4988
}
4989

4990
/* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
4991

4992
static bin_tree_t *
4993
parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
4994
	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
4995
{
4996
  bin_tree_t *tree = NULL, *old_tree = NULL;
4997
  int i, start, end, start_idx = re_string_cur_idx (regexp);
4998
  re_token_t start_token = *token;
4999

5000
  if (token->type == OP_OPEN_DUP_NUM)
5001
    {
5002
      end = 0;
5003
      start = fetch_number (regexp, token, syntax);
5004
      if (start == -1)
5005
	{
5006
	  if (token->type == CHARACTER && token->opr.c == ',')
5007
	    start = 0; /* We treat "{,m}" as "{0,m}".  */
5008
	  else
5009
	    {
5010
	      *err = REG_BADBR; /* <re>{} is invalid.  */
5011
	      return NULL;
5012
	    }
5013
	}
5014
      if (BE (start != -2, 1))
5015
	{
5016
	  /* We treat "{n}" as "{n,n}".  */
5017
	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
5018
		 : ((token->type == CHARACTER && token->opr.c == ',')
5019
		    ? fetch_number (regexp, token, syntax) : -2));
5020
	}
5021
      if (BE (start == -2 || end == -2, 0))
5022
	{
5023
	  /* Invalid sequence.  */
5024
	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
5025
	    {
5026
	      if (token->type == END_OF_RE)
5027
		*err = REG_EBRACE;
5028
	      else
5029
		*err = REG_BADBR;
5030

5031
	      return NULL;
5032
	    }
5033

5034
	  /* If the syntax bit is set, rollback.  */
5035
	  re_string_set_index (regexp, start_idx);
5036
	  *token = start_token;
5037
	  token->type = CHARACTER;
5038
	  /* mb_partial and word_char bits should be already initialized by
5039
	     peek_token.  */
5040
	  return elem;
5041
	}
5042

5043
      if (BE (end != -1 && start > end, 0))
5044
	{
5045
	  /* First number greater than second.  */
5046
	  *err = REG_BADBR;
5047
	  return NULL;
5048
	}
5049
    }
5050
  else
5051
    {
5052
      start = (token->type == OP_DUP_PLUS) ? 1 : 0;
5053
      end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
5054
    }
5055

5056
  fetch_token (token, regexp, syntax);
5057

5058
  if (BE (elem == NULL, 0))
5059
    return NULL;
5060
  if (BE (start == 0 && end == 0, 0))
5061
    {
5062
      postorder (elem, free_tree, NULL);
5063
      return NULL;
5064
    }
5065

5066
  /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
5067
  if (BE (start > 0, 0))
5068
    {
5069
      tree = elem;
5070
      for (i = 2; i <= start; ++i)
5071
	{
5072
	  elem = duplicate_tree (elem, dfa);
5073
	  tree = create_tree (dfa, tree, elem, CONCAT);
5074
	  if (BE (elem == NULL || tree == NULL, 0))
5075
	    goto parse_dup_op_espace;
5076
	}
5077

5078
      if (start == end)
5079
	return tree;
5080

5081
      /* Duplicate ELEM before it is marked optional.  */
5082
      elem = duplicate_tree (elem, dfa);
5083
      old_tree = tree;
5084
    }
5085
  else
5086
    old_tree = NULL;
5087

5088
  if (elem->token.type == SUBEXP)
5089
    postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
5090

5091
  tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
5092
  if (BE (tree == NULL, 0))
5093
    goto parse_dup_op_espace;
5094

5095
  /* This loop is actually executed only when end != -1,
5096
     to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
5097
     already created the start+1-th copy.  */
5098
  for (i = start + 2; i <= end; ++i)
5099
    {
5100
      elem = duplicate_tree (elem, dfa);
5101
      tree = create_tree (dfa, tree, elem, CONCAT);
5102
      if (BE (elem == NULL || tree == NULL, 0))
5103
        goto parse_dup_op_espace;
5104

5105
      tree = create_tree (dfa, tree, NULL, OP_ALT);
5106
      if (BE (tree == NULL, 0))
5107
        goto parse_dup_op_espace;
5108
    }
5109

5110
  if (old_tree)
5111
    tree = create_tree (dfa, old_tree, tree, CONCAT);
5112

5113
  return tree;
5114

5115
 parse_dup_op_espace:
5116
  *err = REG_ESPACE;
5117
  return NULL;
5118
}
5119

5120
/* Size of the names for collating symbol/equivalence_class/character_class.
5121
   I'm not sure, but maybe enough.  */
5122
/* parse_bracket_exp sizes its start_name_buf and end_name_buf arrays
   with this value.  */
#define BRACKET_NAME_BUF_SIZE 32
5123

5124
#ifndef _LIBC
5125
  /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
5126
     Build the range expression which starts from START_ELEM, and ends
5127
     at END_ELEM.  The result are written to MBCSET and SBCSET.
5128
     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
5129
     mbcset->range_ends, is a pointer argument sinse we may
5130
     update it.  */
5131

5132
static reg_errcode_t
5133
internal_function
5134
# ifdef RE_ENABLE_I18N
5135
build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
5136
		 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
5137
# else /* not RE_ENABLE_I18N */
5138
build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
5139
		 bracket_elem_t *end_elem)
5140
# endif /* not RE_ENABLE_I18N */
5141
{
5142
  unsigned int start_ch, end_ch;
5143
  /* Equivalence Classes and Character Classes can't be a range start/end.  */
5144
  if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
5145
	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
5146
	  0))
5147
    return REG_ERANGE;
5148

5149
  /* We can handle no multi character collating elements without libc
5150
     support.  */
5151
  if (BE ((start_elem->type == COLL_SYM
5152
	   && strlen ((char *) start_elem->opr.name) > 1)
5153
	  || (end_elem->type == COLL_SYM
5154
	      && strlen ((char *) end_elem->opr.name) > 1), 0))
5155
    return REG_ECOLLATE;
5156

5157
# ifdef RE_ENABLE_I18N
5158
  {
5159
    wchar_t wc;
5160
    wint_t start_wc;
5161
    wint_t end_wc;
5162
    wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
5163

5164
    start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
5165
		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
5166
		   : 0));
5167
    end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
5168
	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
5169
		 : 0));
5170
    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
5171
		? __btowc (start_ch) : start_elem->opr.wch);
5172
    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
5173
	      ? __btowc (end_ch) : end_elem->opr.wch);
5174
    if (start_wc == WEOF || end_wc == WEOF)
5175
      return REG_ECOLLATE;
5176
    cmp_buf[0] = start_wc;
5177
    cmp_buf[4] = end_wc;
5178
    if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
5179
      return REG_ERANGE;
5180

5181
    /* Got valid collation sequence values, add them as a new entry.
5182
       However, for !_LIBC we have no collation elements: if the
5183
       character set is single byte, the single byte character set
5184
       that we build below suffices.  parse_bracket_exp passes
5185
       no MBCSET if dfa->mb_cur_max == 1.  */
5186
    if (mbcset)
5187
      {
5188
        /* Check the space of the arrays.  */
5189
        if (BE (*range_alloc == mbcset->nranges, 0))
5190
          {
5191
	    /* There is not enough space, need realloc.  */
5192
	    wchar_t *new_array_start, *new_array_end;
5193
	    int new_nranges;
5194

5195
	    /* +1 in case of mbcset->nranges is 0.  */
5196
	    new_nranges = 2 * mbcset->nranges + 1;
5197
	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
5198
	       are NULL if *range_alloc == 0.  */
5199
	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
5200
				          new_nranges);
5201
	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
5202
				        new_nranges);
5203

5204
	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
5205
	      return REG_ESPACE;
5206

5207
	    mbcset->range_starts = new_array_start;
5208
	    mbcset->range_ends = new_array_end;
5209
	    *range_alloc = new_nranges;
5210
          }
5211

5212
        mbcset->range_starts[mbcset->nranges] = start_wc;
5213
        mbcset->range_ends[mbcset->nranges++] = end_wc;
5214
      }
5215

5216
    /* Build the table for single byte characters.  */
5217
    for (wc = 0; wc < SBC_MAX; ++wc)
5218
      {
5219
	cmp_buf[2] = wc;
5220
	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
5221
	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
5222
	  bitset_set (sbcset, wc);
5223
      }
5224
  }
5225
# else /* not RE_ENABLE_I18N */
5226
  {
5227
    unsigned int ch;
5228
    start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
5229
		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
5230
		   : 0));
5231
    end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
5232
	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
5233
		 : 0));
5234
    if (start_ch > end_ch)
5235
      return REG_ERANGE;
5236
    /* Build the table for single byte characters.  */
5237
    for (ch = 0; ch < SBC_MAX; ++ch)
5238
      if (start_ch <= ch  && ch <= end_ch)
5239
	bitset_set (sbcset, ch);
5240
  }
5241
# endif /* not RE_ENABLE_I18N */
5242
  return REG_NOERROR;
5243
}
5244
#endif /* not _LIBC */
5245

5246
#ifndef _LIBC
5247
/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
5248
   Build the collating element which is represented by NAME.
5249
   The result are written to MBCSET and SBCSET.
5250
   COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
5251
   pointer argument since we may update it.  */
5252

5253
static reg_errcode_t
5254
internal_function
5255
# ifdef RE_ENABLE_I18N
5256
build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
5257
			int *coll_sym_alloc, const unsigned char *name)
5258
# else /* not RE_ENABLE_I18N */
5259
build_collating_symbol (bitset_t sbcset, const unsigned char *name)
5260
# endif /* not RE_ENABLE_I18N */
5261
{
5262
  size_t name_len = strlen ((const char *) name);
5263
  if (BE (name_len != 1, 0))
5264
    return REG_ECOLLATE;
5265
  else
5266
    {
5267
      bitset_set (sbcset, name[0]);
5268
      return REG_NOERROR;
5269
    }
5270
}
5271
#endif /* not _LIBC */
5272

5273
/* This function parse bracket expression like "[abc]", "[a-c]",
5274
   "[[.a-a.]]" etc.  */
5275

5276
static bin_tree_t *
5277
parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
5278
		   reg_syntax_t syntax, reg_errcode_t *err)
5279
{
5280
#ifdef _LIBC
5281
  const unsigned char *collseqmb;
5282
  const char *collseqwc;
5283
  uint32_t nrules;
5284
  int32_t table_size;
5285
  const int32_t *symb_table;
5286
  const unsigned char *extra;
5287

5288
  /* Local function for parse_bracket_exp used in _LIBC environment.
5289
     Seek the collating symbol entry correspondings to NAME.
5290
     Return the index of the symbol in the SYMB_TABLE.  */
5291

5292
  auto inline int32_t
5293
  __attribute ((always_inline))
5294
  seek_collating_symbol_entry (name, name_len)
5295
	 const unsigned char *name;
5296
	 size_t name_len;
5297
    {
5298
      int32_t hash = elem_hash ((const char *) name, name_len);
5299
      int32_t elem = hash % table_size;
5300
      if (symb_table[2 * elem] != 0)
5301
	{
5302
	  int32_t second = hash % (table_size - 2) + 1;
5303

5304
	  do
5305
	    {
5306
	      /* First compare the hashing value.  */
5307
	      if (symb_table[2 * elem] == hash
5308
		  /* Compare the length of the name.  */
5309
		  && name_len == extra[symb_table[2 * elem + 1]]
5310
		  /* Compare the name.  */
5311
		  && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
5312
			     name_len) == 0)
5313
		{
5314
		  /* Yep, this is the entry.  */
5315
		  break;
5316
		}
5317

5318
	      /* Next entry.  */
5319
	      elem += second;
5320
	    }
5321
	  while (symb_table[2 * elem] != 0);
5322
	}
5323
      return elem;
5324
    }
5325

5326
  /* Local function for parse_bracket_exp used in _LIBC environment.
5327
     Look up the collation sequence value of BR_ELEM.
5328
     Return the value if succeeded, UINT_MAX otherwise.  */
5329

5330
  auto inline unsigned int
5331
  __attribute ((always_inline))
5332
  lookup_collation_sequence_value (br_elem)
5333
	 bracket_elem_t *br_elem;
5334
    {
5335
      if (br_elem->type == SB_CHAR)
5336
	{
5337
	  /*
5338
	  if (MB_CUR_MAX == 1)
5339
	  */
5340
	  if (nrules == 0)
5341
	    return collseqmb[br_elem->opr.ch];
5342
	  else
5343
	    {
5344
	      wint_t wc = __btowc (br_elem->opr.ch);
5345
	      return __collseq_table_lookup (collseqwc, wc);
5346
	    }
5347
	}
5348
      else if (br_elem->type == MB_CHAR)
5349
	{
5350
	  return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
5351
	}
5352
      else if (br_elem->type == COLL_SYM)
5353
	{
5354
	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
5355
	  if (nrules != 0)
5356
	    {
5357
	      int32_t elem, idx;
5358
	      elem = seek_collating_symbol_entry (br_elem->opr.name,
5359
						  sym_name_len);
5360
	      if (symb_table[2 * elem] != 0)
5361
		{
5362
		  /* We found the entry.  */
5363
		  idx = symb_table[2 * elem + 1];
5364
		  /* Skip the name of collating element name.  */
5365
		  idx += 1 + extra[idx];
5366
		  /* Skip the byte sequence of the collating element.  */
5367
		  idx += 1 + extra[idx];
5368
		  /* Adjust for the alignment.  */
5369
		  idx = (idx + 3) & ~3;
5370
		  /* Skip the multibyte collation sequence value.  */
5371
		  idx += sizeof (unsigned int);
5372
		  /* Skip the wide char sequence of the collating element.  */
5373
		  idx += sizeof (unsigned int) *
5374
		    (1 + *(unsigned int *) (extra + idx));
5375
		  /* Return the collation sequence value.  */
5376
		  return *(unsigned int *) (extra + idx);
5377
		}
5378
	      else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
5379
		{
5380
		  /* No valid character.  Match it as a single byte
5381
		     character.  */
5382
		  return collseqmb[br_elem->opr.name[0]];
5383
		}
5384
	    }
5385
	  else if (sym_name_len == 1)
5386
	    return collseqmb[br_elem->opr.name[0]];
5387
	}
5388
      return UINT_MAX;
5389
    }
5390

5391
  /* Local function for parse_bracket_exp used in _LIBC environment.
5392
     Build the range expression which starts from START_ELEM, and ends
5393
     at END_ELEM.  The result are written to MBCSET and SBCSET.
5394
     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
5395
     mbcset->range_ends, is a pointer argument since we may
5396
     update it.  */
5397

5398
  auto inline reg_errcode_t
5399
  __attribute ((always_inline))
5400
  build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
5401
	 re_charset_t *mbcset;
5402
	 int *range_alloc;
5403
	 bitset_t sbcset;
5404
	 bracket_elem_t *start_elem, *end_elem;
5405
    {
5406
      unsigned int ch;
5407
      uint32_t start_collseq;
5408
      uint32_t end_collseq;
5409

5410
      /* Equivalence Classes and Character Classes can't be a range
5411
	 start/end.  */
5412
      if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
5413
	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
5414
	      0))
5415
	return REG_ERANGE;
5416

5417
      start_collseq = lookup_collation_sequence_value (start_elem);
5418
      end_collseq = lookup_collation_sequence_value (end_elem);
5419
      /* Check start/end collation sequence values.  */
5420
      if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
5421
	return REG_ECOLLATE;
5422
      if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
5423
	return REG_ERANGE;
5424

5425
      /* Got valid collation sequence values, add them as a new entry.
5426
	 However, if we have no collation elements, and the character set
5427
	 is single byte, the single byte character set that we
5428
	 build below suffices. */
5429
      if (nrules > 0 || dfa->mb_cur_max > 1)
5430
	{
5431
          /* Check the space of the arrays.  */
5432
          if (BE (*range_alloc == mbcset->nranges, 0))
5433
	    {
5434
	      /* There is not enough space, need realloc.  */
5435
	      uint32_t *new_array_start;
5436
	      uint32_t *new_array_end;
5437
	      int new_nranges;
5438

5439
	      /* +1 in case of mbcset->nranges is 0.  */
5440
	      new_nranges = 2 * mbcset->nranges + 1;
5441
	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
5442
					    new_nranges);
5443
	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
5444
				          new_nranges);
5445

5446
	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
5447
	        return REG_ESPACE;
5448

5449
	      mbcset->range_starts = new_array_start;
5450
	      mbcset->range_ends = new_array_end;
5451
	      *range_alloc = new_nranges;
5452
	    }
5453

5454
          mbcset->range_starts[mbcset->nranges] = start_collseq;
5455
          mbcset->range_ends[mbcset->nranges++] = end_collseq;
5456
	}
5457

5458
      /* Build the table for single byte characters.  */
5459
      for (ch = 0; ch < SBC_MAX; ch++)
5460
	{
5461
	  uint32_t ch_collseq;
5462
	  /*
5463
	  if (MB_CUR_MAX == 1)
5464
	  */
5465
	  if (nrules == 0)
5466
	    ch_collseq = collseqmb[ch];
5467
	  else
5468
	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
5469
	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
5470
	    bitset_set (sbcset, ch);
5471
	}
5472
      return REG_NOERROR;
5473
    }
5474

5475
  /* Local function for parse_bracket_exp used in _LIBC environment.
5476
     Build the collating element which is represented by NAME.
5477
     The result are written to MBCSET and SBCSET.
5478
     COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
5479
     pointer argument since we may update it.  */
5480

5481
  auto inline reg_errcode_t
5482
  __attribute ((always_inline))
5483
  build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
5484
	 re_charset_t *mbcset;
5485
	 int *coll_sym_alloc;
5486
	 bitset_t sbcset;
5487
	 const unsigned char *name;
5488
    {
5489
      int32_t elem, idx;
5490
      size_t name_len = strlen ((const char *) name);
5491
      if (nrules != 0)
5492
	{
5493
	  elem = seek_collating_symbol_entry (name, name_len);
5494
	  if (symb_table[2 * elem] != 0)
5495
	    {
5496
	      /* We found the entry.  */
5497
	      idx = symb_table[2 * elem + 1];
5498
	      /* Skip the name of collating element name.  */
5499
	      idx += 1 + extra[idx];
5500
	    }
5501
	  else if (symb_table[2 * elem] == 0 && name_len == 1)
5502
	    {
5503
	      /* No valid character, treat it as a normal
5504
		 character.  */
5505
	      bitset_set (sbcset, name[0]);
5506
	      return REG_NOERROR;
5507
	    }
5508
	  else
5509
	    return REG_ECOLLATE;
5510

5511
	  /* Got valid collation sequence, add it as a new entry.  */
5512
	  /* Check the space of the arrays.  */
5513
	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
5514
	    {
5515
	      /* Not enough, realloc it.  */
5516
	      /* +1 in case of mbcset->ncoll_syms is 0.  */
5517
	      int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
5518
	      /* Use realloc since mbcset->coll_syms is NULL
5519
		 if *alloc == 0.  */
5520
	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
5521
						   new_coll_sym_alloc);
5522
	      if (BE (new_coll_syms == NULL, 0))
5523
		return REG_ESPACE;
5524
	      mbcset->coll_syms = new_coll_syms;
5525
	      *coll_sym_alloc = new_coll_sym_alloc;
5526
	    }
5527
	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
5528
	  return REG_NOERROR;
5529
	}
5530
      else
5531
	{
5532
	  if (BE (name_len != 1, 0))
5533
	    return REG_ECOLLATE;
5534
	  else
5535
	    {
5536
	      bitset_set (sbcset, name[0]);
5537
	      return REG_NOERROR;
5538
	    }
5539
	}
5540
    }
5541
#endif
5542

5543
  re_token_t br_token;
5544
  re_bitset_ptr_t sbcset;
5545
#ifdef RE_ENABLE_I18N
5546
  re_charset_t *mbcset;
5547
  int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
5548
  int equiv_class_alloc = 0, char_class_alloc = 0;
5549
#endif /* not RE_ENABLE_I18N */
5550
  int non_match = 0;
5551
  bin_tree_t *work_tree;
5552
  int token_len;
5553
  int first_round = 1;
5554
#ifdef _LIBC
5555
  collseqmb = (const unsigned char *)
5556
    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
5557
  nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
5558
  if (nrules)
5559
    {
5560
      /*
5561
      if (MB_CUR_MAX > 1)
5562
      */
5563
      collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
5564
      table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
5565
      symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
5566
						  _NL_COLLATE_SYMB_TABLEMB);
5567
      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
5568
						   _NL_COLLATE_SYMB_EXTRAMB);
5569
    }
5570
#endif
5571
  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
5572
#ifdef RE_ENABLE_I18N
5573
  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
5574
#endif /* RE_ENABLE_I18N */
5575
#ifdef RE_ENABLE_I18N
5576
  if (BE (sbcset == NULL || mbcset == NULL, 0))
5577
#else
5578
  if (BE (sbcset == NULL, 0))
5579
#endif /* RE_ENABLE_I18N */
5580
    {
5581
      *err = REG_ESPACE;
5582
      return NULL;
5583
    }
5584

5585
  token_len = peek_token_bracket (token, regexp, syntax);
5586
  if (BE (token->type == END_OF_RE, 0))
5587
    {
5588
      *err = REG_BADPAT;
5589
      goto parse_bracket_exp_free_return;
5590
    }
5591
  if (token->type == OP_NON_MATCH_LIST)
5592
    {
5593
#ifdef RE_ENABLE_I18N
5594
      mbcset->non_match = 1;
5595
#endif /* not RE_ENABLE_I18N */
5596
      non_match = 1;
5597
      if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
5598
	bitset_set (sbcset, '\0');
5599
      re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
5600
      token_len = peek_token_bracket (token, regexp, syntax);
5601
      if (BE (token->type == END_OF_RE, 0))
5602
	{
5603
	  *err = REG_BADPAT;
5604
	  goto parse_bracket_exp_free_return;
5605
	}
5606
    }
5607

5608
  /* We treat the first ']' as a normal character.  */
5609
  if (token->type == OP_CLOSE_BRACKET)
5610
    token->type = CHARACTER;
5611

5612
  while (1)
5613
    {
5614
      bracket_elem_t start_elem, end_elem;
5615
      unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
5616
      unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
5617
      reg_errcode_t ret;
5618
      int token_len2 = 0, is_range_exp = 0;
5619
      re_token_t token2;
5620

5621
      start_elem.opr.name = start_name_buf;
5622
      ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
5623
				   syntax, first_round);
5624
      if (BE (ret != REG_NOERROR, 0))
5625
	{
5626
	  *err = ret;
5627
	  goto parse_bracket_exp_free_return;
5628
	}
5629
      first_round = 0;
5630

5631
      /* Get information about the next token.  We need it in any case.  */
5632
      token_len = peek_token_bracket (token, regexp, syntax);
5633

5634
      /* Do not check for ranges if we know they are not allowed.  */
5635
      if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
5636
	{
5637
	  if (BE (token->type == END_OF_RE, 0))
5638
	    {
5639
	      *err = REG_EBRACK;
5640
	      goto parse_bracket_exp_free_return;
5641
	    }
5642
	  if (token->type == OP_CHARSET_RANGE)
5643
	    {
5644
	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
5645
	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
5646
	      if (BE (token2.type == END_OF_RE, 0))
5647
		{
5648
		  *err = REG_EBRACK;
5649
		  goto parse_bracket_exp_free_return;
5650
		}
5651
	      if (token2.type == OP_CLOSE_BRACKET)
5652
		{
5653
		  /* We treat the last '-' as a normal character.  */
5654
		  re_string_skip_bytes (regexp, -token_len);
5655
		  token->type = CHARACTER;
5656
		}
5657
	      else
5658
		is_range_exp = 1;
5659
	    }
5660
	}
5661

5662
      if (is_range_exp == 1)
5663
	{
5664
	  end_elem.opr.name = end_name_buf;
5665
	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
5666
				       dfa, syntax, 1);
5667
	  if (BE (ret != REG_NOERROR, 0))
5668
	    {
5669
	      *err = ret;
5670
	      goto parse_bracket_exp_free_return;
5671
	    }
5672

5673
	  token_len = peek_token_bracket (token, regexp, syntax);
5674

5675
#ifdef _LIBC
5676
	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
5677
				  &start_elem, &end_elem);
5678
#else
5679
# ifdef RE_ENABLE_I18N
5680
	  *err = build_range_exp (sbcset,
5681
				  dfa->mb_cur_max > 1 ? mbcset : NULL,
5682
				  &range_alloc, &start_elem, &end_elem);
5683
# else
5684
	  *err = build_range_exp (sbcset, &start_elem, &end_elem);
5685
# endif
5686
#endif /* RE_ENABLE_I18N */
5687
	  if (BE (*err != REG_NOERROR, 0))
5688
	    goto parse_bracket_exp_free_return;
5689
	}
5690
      else
5691
	{
5692
	  switch (start_elem.type)
5693
	    {
5694
	    case SB_CHAR:
5695
	      bitset_set (sbcset, start_elem.opr.ch);
5696
	      break;
5697
#ifdef RE_ENABLE_I18N
5698
	    case MB_CHAR:
5699
	      /* Check whether the array has enough space.  */
5700
	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
5701
		{
5702
		  wchar_t *new_mbchars;
5703
		  /* Not enough, realloc it.  */
5704
		  /* +1 in case of mbcset->nmbchars is 0.  */
5705
		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
5706
		  /* Use realloc since array is NULL if *alloc == 0.  */
5707
		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
5708
					    mbchar_alloc);
5709
		  if (BE (new_mbchars == NULL, 0))
5710
		    goto parse_bracket_exp_espace;
5711
		  mbcset->mbchars = new_mbchars;
5712
		}
5713
	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
5714
	      break;
5715
#endif /* RE_ENABLE_I18N */
5716
	    case EQUIV_CLASS:
5717
	      *err = build_equiv_class (sbcset,
5718
#ifdef RE_ENABLE_I18N
5719
					mbcset, &equiv_class_alloc,
5720
#endif /* RE_ENABLE_I18N */
5721
					start_elem.opr.name);
5722
	      if (BE (*err != REG_NOERROR, 0))
5723
		goto parse_bracket_exp_free_return;
5724
	      break;
5725
	    case COLL_SYM:
5726
	      *err = build_collating_symbol (sbcset,
5727
#ifdef RE_ENABLE_I18N
5728
					     mbcset, &coll_sym_alloc,
5729
#endif /* RE_ENABLE_I18N */
5730
					     start_elem.opr.name);
5731
	      if (BE (*err != REG_NOERROR, 0))
5732
		goto parse_bracket_exp_free_return;
5733
	      break;
5734
	    case CHAR_CLASS:
5735
	      *err = build_charclass (regexp->trans, sbcset,
5736
#ifdef RE_ENABLE_I18N
5737
				      mbcset, &char_class_alloc,
5738
#endif /* RE_ENABLE_I18N */
5739
				      start_elem.opr.name, syntax);
5740
	      if (BE (*err != REG_NOERROR, 0))
5741
	       goto parse_bracket_exp_free_return;
5742
	      break;
5743
	    default:
5744
	      assert (0);
5745
	      break;
5746
	    }
5747
	}
5748
      if (BE (token->type == END_OF_RE, 0))
5749
	{
5750
	  *err = REG_EBRACK;
5751
	  goto parse_bracket_exp_free_return;
5752
	}
5753
      if (token->type == OP_CLOSE_BRACKET)
5754
	break;
5755
    }
5756

5757
  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
5758

5759
  /* If it is non-matching list.  */
5760
  if (non_match)
5761
    bitset_not (sbcset);
5762

5763
#ifdef RE_ENABLE_I18N
5764
  /* Ensure only single byte characters are set.  */
5765
  if (dfa->mb_cur_max > 1)
5766
    bitset_mask (sbcset, dfa->sb_char);
5767

5768
  if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
5769
      || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
5770
						     || mbcset->non_match)))
5771
    {
5772
      bin_tree_t *mbc_tree;
5773
      int sbc_idx;
5774
      /* Build a tree for complex bracket.  */
5775
      dfa->has_mb_node = 1;
5776
      br_token.type = COMPLEX_BRACKET;
5777
      br_token.opr.mbcset = mbcset;
5778
      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
5779
      if (BE (mbc_tree == NULL, 0))
5780
	goto parse_bracket_exp_espace;
5781
      for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
5782
	if (sbcset[sbc_idx])
5783
	  break;
5784
      /* If there are no bits set in sbcset, there is no point
5785
	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
5786
      if (sbc_idx < BITSET_WORDS)
5787
	{
5788
          /* Build a tree for simple bracket.  */
5789
          br_token.type = SIMPLE_BRACKET;
5790
          br_token.opr.sbcset = sbcset;
5791
          work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
5792
          if (BE (work_tree == NULL, 0))
5793
            goto parse_bracket_exp_espace;
5794

5795
          /* Then join them by ALT node.  */
5796
          work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
5797
          if (BE (work_tree == NULL, 0))
5798
            goto parse_bracket_exp_espace;
5799
	}
5800
      else
5801
	{
5802
	  re_free (sbcset);
5803
	  work_tree = mbc_tree;
5804
	}
5805
    }
5806
  else
5807
#endif /* not RE_ENABLE_I18N */
5808
    {
5809
#ifdef RE_ENABLE_I18N
5810
      free_charset (mbcset);
5811
#endif
5812
      /* Build a tree for simple bracket.  */
5813
      br_token.type = SIMPLE_BRACKET;
5814
      br_token.opr.sbcset = sbcset;
5815
      work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
5816
      if (BE (work_tree == NULL, 0))
5817
        goto parse_bracket_exp_espace;
5818
    }
5819
  return work_tree;
5820

5821
 parse_bracket_exp_espace:
5822
  *err = REG_ESPACE;
5823
 parse_bracket_exp_free_return:
5824
  re_free (sbcset);
5825
#ifdef RE_ENABLE_I18N
5826
  free_charset (mbcset);
5827
#endif /* RE_ENABLE_I18N */
5828
  return NULL;
5829
}
5830

5831
/* Parse an element in the bracket expression.  */
5832

5833
static reg_errcode_t
5834
parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
5835
		       re_token_t *token, int token_len, re_dfa_t *dfa,
5836
		       reg_syntax_t syntax, int accept_hyphen)
5837
{
5838
#ifdef RE_ENABLE_I18N
5839
  int cur_char_size;
5840
  cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
5841
  if (cur_char_size > 1)
5842
    {
5843
      elem->type = MB_CHAR;
5844
      elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
5845
      re_string_skip_bytes (regexp, cur_char_size);
5846
      return REG_NOERROR;
5847
    }
5848
#endif /* RE_ENABLE_I18N */
5849
  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
5850
  if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
5851
      || token->type == OP_OPEN_EQUIV_CLASS)
5852
    return parse_bracket_symbol (elem, regexp, token);
5853
  if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
5854
    {
5855
      /* A '-' must only appear as anything but a range indicator before
5856
	 the closing bracket.  Everything else is an error.  */
5857
      re_token_t token2;
5858
      (void) peek_token_bracket (&token2, regexp, syntax);
5859
      if (token2.type != OP_CLOSE_BRACKET)
5860
	/* The actual error value is not standardized since this whole
5861
	   case is undefined.  But ERANGE makes good sense.  */
5862
	return REG_ERANGE;
5863
    }
5864
  elem->type = SB_CHAR;
5865
  elem->opr.ch = token->opr.c;
5866
  return REG_NOERROR;
5867
}
5868

5869
/* Parse a bracket symbol in the bracket expression.  Bracket symbols are
5870
   such as [:<character_class>:], [.<collating_element>.], and
5871
   [=<equivalent_class>=].  */
5872

5873
static reg_errcode_t
5874
parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
5875
		      re_token_t *token)
5876
{
5877
  unsigned char ch, delim = token->opr.c;
5878
  int i = 0;
5879
  if (re_string_eoi(regexp))
5880
    return REG_EBRACK;
5881
  for (;; ++i)
5882
    {
5883
      if (i >= BRACKET_NAME_BUF_SIZE)
5884
	return REG_EBRACK;
5885
      if (token->type == OP_OPEN_CHAR_CLASS)
5886
	ch = re_string_fetch_byte_case (regexp);
5887
      else
5888
	ch = re_string_fetch_byte (regexp);
5889
      if (re_string_eoi(regexp))
5890
	return REG_EBRACK;
5891
      if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
5892
	break;
5893
      elem->opr.name[i] = ch;
5894
    }
5895
  re_string_skip_bytes (regexp, 1);
5896
  elem->opr.name[i] = '\0';
5897
  switch (token->type)
5898
    {
5899
    case OP_OPEN_COLL_ELEM:
5900
      elem->type = COLL_SYM;
5901
      break;
5902
    case OP_OPEN_EQUIV_CLASS:
5903
      elem->type = EQUIV_CLASS;
5904
      break;
5905
    case OP_OPEN_CHAR_CLASS:
5906
      elem->type = CHAR_CLASS;
5907
      break;
5908
    default:
5909
      break;
5910
    }
5911
  return REG_NOERROR;
5912
}
5913

5914
  /* Helper function for parse_bracket_exp.
5915
     Build the equivalence class which is represented by NAME.
5916
     The result are written to MBCSET and SBCSET.
5917
     EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
5918
     is a pointer argument sinse we may update it.  */
5919

5920
static reg_errcode_t
5921
#ifdef RE_ENABLE_I18N
5922
build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
5923
		   int *equiv_class_alloc, const unsigned char *name)
5924
#else /* not RE_ENABLE_I18N */
5925
build_equiv_class (bitset_t sbcset, const unsigned char *name)
5926
#endif /* not RE_ENABLE_I18N */
5927
{
5928
#ifdef _LIBC
5929
  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
5930
  if (nrules != 0)
5931
    {
5932
      const int32_t *table, *indirect;
5933
      const unsigned char *weights, *extra, *cp;
5934
      unsigned char char_buf[2];
5935
      int32_t idx1, idx2;
5936
      unsigned int ch;
5937
      size_t len;
5938
      /* This #include defines a local function!  */
5939
# include <locale/weight.h>
5940
      /* Calculate the index for equivalence class.  */
5941
      cp = name;
5942
      table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
5943
      weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
5944
					       _NL_COLLATE_WEIGHTMB);
5945
      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
5946
						   _NL_COLLATE_EXTRAMB);
5947
      indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
5948
						_NL_COLLATE_INDIRECTMB);
5949
      idx1 = findidx (&cp);
5950
      if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
5951
	/* This isn't a valid character.  */
5952
	return REG_ECOLLATE;
5953

5954
      /* Build single byte matcing table for this equivalence class.  */
5955
      char_buf[1] = (unsigned char) '\0';
5956
      len = weights[idx1];
5957
      for (ch = 0; ch < SBC_MAX; ++ch)
5958
	{
5959
	  char_buf[0] = ch;
5960
	  cp = char_buf;
5961
	  idx2 = findidx (&cp);
5962
/*
5963
	  idx2 = table[ch];
5964
*/
5965
	  if (idx2 == 0)
5966
	    /* This isn't a valid character.  */
5967
	    continue;
5968
	  if (len == weights[idx2])
5969
	    {
5970
	      int cnt = 0;
5971
	      while (cnt <= len &&
5972
		     weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
5973
		++cnt;
5974

5975
	      if (cnt > len)
5976
		bitset_set (sbcset, ch);
5977
	    }
5978
	}
5979
      /* Check whether the array has enough space.  */
5980
      if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
5981
	{
5982
	  /* Not enough, realloc it.  */
5983
	  /* +1 in case of mbcset->nequiv_classes is 0.  */
5984
	  int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
5985
	  /* Use realloc since the array is NULL if *alloc == 0.  */
5986
	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
5987
						   int32_t,
5988
						   new_equiv_class_alloc);
5989
	  if (BE (new_equiv_classes == NULL, 0))
5990
	    return REG_ESPACE;
5991
	  mbcset->equiv_classes = new_equiv_classes;
5992
	  *equiv_class_alloc = new_equiv_class_alloc;
5993
	}
5994
      mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
5995
    }
5996
  else
5997
#endif /* _LIBC */
5998
    {
5999
      if (BE (strlen ((const char *) name) != 1, 0))
6000
	return REG_ECOLLATE;
6001
      bitset_set (sbcset, *name);
6002
    }
6003
  return REG_NOERROR;
6004
}
6005

6006
  /* Helper function for parse_bracket_exp.
6007
     Build the character class which is represented by NAME.
6008
     The result are written to MBCSET and SBCSET.
6009
     CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
6010
     is a pointer argument sinse we may update it.  */
6011

6012
static reg_errcode_t
6013
#ifdef RE_ENABLE_I18N
6014
build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
6015
		 re_charset_t *mbcset, int *char_class_alloc,
6016
		 const unsigned char *class_name, reg_syntax_t syntax)
6017
#else /* not RE_ENABLE_I18N */
6018
build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
6019
		 const unsigned char *class_name, reg_syntax_t syntax)
6020
#endif /* not RE_ENABLE_I18N */
6021
{
6022
  int i;
6023
  const char *name = (const char *) class_name;
6024

6025
  /* In case of REG_ICASE "upper" and "lower" match the both of
6026
     upper and lower cases.  */
6027
  if ((syntax & RE_ICASE)
6028
      && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
6029
    name = "alpha";
6030

6031
#ifdef RE_ENABLE_I18N
6032
  /* Check the space of the arrays.  */
6033
  if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
6034
    {
6035
      /* Not enough, realloc it.  */
6036
      /* +1 in case of mbcset->nchar_classes is 0.  */
6037
      int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
6038
      /* Use realloc since array is NULL if *alloc == 0.  */
6039
      wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
6040
					       new_char_class_alloc);
6041
      if (BE (new_char_classes == NULL, 0))
6042
	return REG_ESPACE;
6043
      mbcset->char_classes = new_char_classes;
6044
      *char_class_alloc = new_char_class_alloc;
6045
    }
6046
  mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
6047
#endif /* RE_ENABLE_I18N */
6048

6049
#define BUILD_CHARCLASS_LOOP(ctype_func)	\
6050
  do {						\
6051
    if (BE (trans != NULL, 0))			\
6052
      {						\
6053
	for (i = 0; i < SBC_MAX; ++i)		\
6054
  	  if (ctype_func (i))			\
6055
	    bitset_set (sbcset, trans[i]);	\
6056
      }						\
6057
    else					\
6058
      {						\
6059
	for (i = 0; i < SBC_MAX; ++i)		\
6060
  	  if (ctype_func (i))			\
6061
	    bitset_set (sbcset, i);		\
6062
      }						\
6063
  } while (0)
6064

6065
  if (strcmp (name, "alnum") == 0)
6066
    BUILD_CHARCLASS_LOOP (isalnum);
6067
  else if (strcmp (name, "cntrl") == 0)
6068
    BUILD_CHARCLASS_LOOP (iscntrl);
6069
  else if (strcmp (name, "lower") == 0)
6070
    BUILD_CHARCLASS_LOOP (islower);
6071
  else if (strcmp (name, "space") == 0)
6072
    BUILD_CHARCLASS_LOOP (isspace);
6073
  else if (strcmp (name, "alpha") == 0)
6074
    BUILD_CHARCLASS_LOOP (isalpha);
6075
  else if (strcmp (name, "digit") == 0)
6076
    BUILD_CHARCLASS_LOOP (isdigit);
6077
  else if (strcmp (name, "print") == 0)
6078
    BUILD_CHARCLASS_LOOP (isprint);
6079
  else if (strcmp (name, "upper") == 0)
6080
    BUILD_CHARCLASS_LOOP (isupper);
6081
  else if (strcmp (name, "blank") == 0)
6082
    BUILD_CHARCLASS_LOOP (isblank);
6083
  else if (strcmp (name, "graph") == 0)
6084
    BUILD_CHARCLASS_LOOP (isgraph);
6085
  else if (strcmp (name, "punct") == 0)
6086
    BUILD_CHARCLASS_LOOP (ispunct);
6087
  else if (strcmp (name, "xdigit") == 0)
6088
    BUILD_CHARCLASS_LOOP (isxdigit);
6089
  else
6090
    return REG_ECTYPE;
6091

6092
  return REG_NOERROR;
6093
}
6094

6095
static bin_tree_t *
6096
build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
6097
		    const unsigned char *class_name,
6098
		    const unsigned char *extra, int non_match,
6099
		    reg_errcode_t *err)
6100
{
6101
  re_bitset_ptr_t sbcset;
6102
#ifdef RE_ENABLE_I18N
6103
  re_charset_t *mbcset;
6104
  int alloc = 0;
6105
#endif /* not RE_ENABLE_I18N */
6106
  reg_errcode_t ret;
6107
  re_token_t br_token;
6108
  bin_tree_t *tree;
6109

6110
  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
6111
#ifdef RE_ENABLE_I18N
6112
  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
6113
#endif /* RE_ENABLE_I18N */
6114

6115
#ifdef RE_ENABLE_I18N
6116
  if (BE (sbcset == NULL || mbcset == NULL, 0))
6117
#else /* not RE_ENABLE_I18N */
6118
  if (BE (sbcset == NULL, 0))
6119
#endif /* not RE_ENABLE_I18N */
6120
    {
6121
      *err = REG_ESPACE;
6122
      return NULL;
6123
    }
6124

6125
  if (non_match)
6126
    {
6127
#ifdef RE_ENABLE_I18N
6128
      /*
6129
      if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
6130
	bitset_set(cset->sbcset, '\0');
6131
      */
6132
      mbcset->non_match = 1;
6133
#endif /* not RE_ENABLE_I18N */
6134
    }
6135

6136
  /* We don't care the syntax in this case.  */
6137
  ret = build_charclass (trans, sbcset,
6138
#ifdef RE_ENABLE_I18N
6139
			 mbcset, &alloc,
6140
#endif /* RE_ENABLE_I18N */
6141
			 class_name, 0);
6142

6143
  if (BE (ret != REG_NOERROR, 0))
6144
    {
6145
      re_free (sbcset);
6146
#ifdef RE_ENABLE_I18N
6147
      free_charset (mbcset);
6148
#endif /* RE_ENABLE_I18N */
6149
      *err = ret;
6150
      return NULL;
6151
    }
6152
  /* \w match '_' also.  */
6153
  for (; *extra; extra++)
6154
    bitset_set (sbcset, *extra);
6155

6156
  /* If it is non-matching list.  */
6157
  if (non_match)
6158
    bitset_not (sbcset);
6159

6160
#ifdef RE_ENABLE_I18N
6161
  /* Ensure only single byte characters are set.  */
6162
  if (dfa->mb_cur_max > 1)
6163
    bitset_mask (sbcset, dfa->sb_char);
6164
#endif
6165

6166
  /* Build a tree for simple bracket.  */
6167
  br_token.type = SIMPLE_BRACKET;
6168
  br_token.opr.sbcset = sbcset;
6169
  tree = create_token_tree (dfa, NULL, NULL, &br_token);
6170
  if (BE (tree == NULL, 0))
6171
    goto build_word_op_espace;
6172

6173
#ifdef RE_ENABLE_I18N
6174
  if (dfa->mb_cur_max > 1)
6175
    {
6176
      bin_tree_t *mbc_tree;
6177
      /* Build a tree for complex bracket.  */
6178
      br_token.type = COMPLEX_BRACKET;
6179
      br_token.opr.mbcset = mbcset;
6180
      dfa->has_mb_node = 1;
6181
      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
6182
      if (BE (mbc_tree == NULL, 0))
6183
	goto build_word_op_espace;
6184
      /* Then join them by ALT node.  */
6185
      tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
6186
      if (BE (mbc_tree != NULL, 1))
6187
	return tree;
6188
    }
6189
  else
6190
    {
6191
      free_charset (mbcset);
6192
      return tree;
6193
    }
6194
#else /* not RE_ENABLE_I18N */
6195
  return tree;
6196
#endif /* not RE_ENABLE_I18N */
6197

6198
 build_word_op_espace:
6199
  re_free (sbcset);
6200
#ifdef RE_ENABLE_I18N
6201
  free_charset (mbcset);
6202
#endif /* RE_ENABLE_I18N */
6203
  *err = REG_ESPACE;
6204
  return NULL;
6205
}
6206

6207
/* This is intended for the expressions like "a{1,3}".
6208
   Fetch a number from `input', and return the number.
6209
   Return -1, if the number field is empty like "{,1}".
6210
   Return -2, If an error is occured.  */
6211

6212
static int
6213
fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
6214
{
6215
  int num = -1;
6216
  unsigned char c;
6217
  while (1)
6218
    {
6219
      fetch_token (token, input, syntax);
6220
      c = token->opr.c;
6221
      if (BE (token->type == END_OF_RE, 0))
6222
	return -2;
6223
      if (token->type == OP_CLOSE_DUP_NUM || c == ',')
6224
	break;
6225
      num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
6226
	     ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
6227
      num = (num > RE_DUP_MAX) ? -2 : num;
6228
    }
6229
  return num;
6230
}
6231

6232
#ifdef RE_ENABLE_I18N
6233
static void
6234
free_charset (re_charset_t *cset)
6235
{
6236
  re_free (cset->mbchars);
6237
# ifdef _LIBC
6238
  re_free (cset->coll_syms);
6239
  re_free (cset->equiv_classes);
6240
  re_free (cset->range_starts);
6241
  re_free (cset->range_ends);
6242
# endif
6243
  re_free (cset->char_classes);
6244
  re_free (cset);
6245
}
6246
#endif /* RE_ENABLE_I18N */
6247

6248
/* Functions for binary tree operation.  */
6249

6250
/* Create a tree node.  */
6251

6252
static bin_tree_t *
6253
create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
6254
	     re_token_type_t type)
6255
{
6256
  re_token_t t;
6257
  t.type = type;
6258
  return create_token_tree (dfa, left, right, &t);
6259
}
6260

6261
static bin_tree_t *
6262
create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
6263
		   const re_token_t *token)
6264
{
6265
  bin_tree_t *tree;
6266
  if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
6267
    {
6268
      bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
6269

6270
      if (storage == NULL)
6271
	return NULL;
6272
      storage->next = dfa->str_tree_storage;
6273
      dfa->str_tree_storage = storage;
6274
      dfa->str_tree_storage_idx = 0;
6275
    }
6276
  tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
6277

6278
  tree->parent = NULL;
6279
  tree->left = left;
6280
  tree->right = right;
6281
  tree->token = *token;
6282
  tree->token.duplicated = 0;
6283
  tree->token.opt_subexp = 0;
6284
  tree->first = NULL;
6285
  tree->next = NULL;
6286
  tree->node_idx = -1;
6287

6288
  if (left != NULL)
6289
    left->parent = tree;
6290
  if (right != NULL)
6291
    right->parent = tree;
6292
  return tree;
6293
}
6294

6295
/* Mark the tree SRC as an optional subexpression.
6296
   To be called from preorder or postorder.  */
6297

6298
static reg_errcode_t
6299
mark_opt_subexp (void *extra, bin_tree_t *node)
6300
{
6301
  int idx = (int) (long) extra;
6302
  if (node->token.type == SUBEXP && node->token.opr.idx == idx)
6303
    node->token.opt_subexp = 1;
6304

6305
  return REG_NOERROR;
6306
}
6307

6308
/* Free the allocated memory inside NODE. */
6309

6310
static void
6311
free_token (re_token_t *node)
6312
{
6313
#ifdef RE_ENABLE_I18N
6314
  if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
6315
    free_charset (node->opr.mbcset);
6316
  else
6317
#endif /* RE_ENABLE_I18N */
6318
    if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
6319
      re_free (node->opr.sbcset);
6320
}
6321

6322
/* Worker function for tree walking.  Free the allocated memory inside NODE
6323
   and its children. */
6324

6325
static reg_errcode_t
6326
free_tree (void *extra, bin_tree_t *node)
6327
{
6328
  free_token (&node->token);
6329
  return REG_NOERROR;
6330
}
6331

6332

6333
/* Duplicate the node SRC, and return new node.  This is a preorder
6334
   visit similar to the one implemented by the generic visitor, but
6335
   we need more infrastructure to maintain two parallel trees --- so,
6336
   it's easier to duplicate.  */
6337

6338
static bin_tree_t *
6339
duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
6340
{
6341
  const bin_tree_t *node;
6342
  bin_tree_t *dup_root;
6343
  bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
6344

6345
  for (node = root; ; )
6346
    {
6347
      /* Create a new tree and link it back to the current parent.  */
6348
      *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
6349
      if (*p_new == NULL)
6350
	return NULL;
6351
      (*p_new)->parent = dup_node;
6352
      (*p_new)->token.duplicated = 1;
6353
      dup_node = *p_new;
6354

6355
      /* Go to the left node, or up and to the right.  */
6356
      if (node->left)
6357
	{
6358
	  node = node->left;
6359
	  p_new = &dup_node->left;
6360
	}
6361
      else
6362
	{
6363
	  const bin_tree_t *prev = NULL;
6364
	  while (node->right == prev || node->right == NULL)
6365
	    {
6366
	      prev = node;
6367
	      node = node->parent;
6368
	      dup_node = dup_node->parent;
6369
	      if (!node)
6370
	        return dup_root;
6371
	    }
6372
	  node = node->right;
6373
	  p_new = &dup_node->right;
6374
	}
6375
    }
6376
}
6377

6378
/******************************************************************************/
6379
/******************************************************************************/
6380
/******************************************************************************/
6381
/* GKINCLUDE #include "regexec.c" */
6382
/******************************************************************************/
6383
/******************************************************************************/
6384
/******************************************************************************/
6385
static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
6386
				     int n) internal_function;
6387
static void match_ctx_clean (re_match_context_t *mctx) internal_function;
6388
static void match_ctx_free (re_match_context_t *cache) internal_function;
6389
static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
6390
					  int str_idx, int from, int to)
6391
     internal_function;
6392
static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
6393
     internal_function;
6394
static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
6395
					   int str_idx) internal_function;
6396
static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
6397
						   int node, int str_idx)
6398
     internal_function;
6399
static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
6400
			   re_dfastate_t **limited_sts, int last_node,
6401
			   int last_str_idx)
6402
     internal_function;
6403
static reg_errcode_t re_search_internal (const regex_t *preg,
6404
					 const char *string, int length,
6405
					 int start, int range, int stop,
6406
					 size_t nmatch, regmatch_t pmatch[],
6407
					 int eflags) internal_function;
6408
static int re_search_2_stub (struct re_pattern_buffer *bufp,
6409
			     const char *string1, int length1,
6410
			     const char *string2, int length2,
6411
			     int start, int range, struct re_registers *regs,
6412
			     int stop, int ret_len) internal_function;
6413
static int re_search_stub (struct re_pattern_buffer *bufp,
6414
			   const char *string, int length, int start,
6415
			   int range, int stop, struct re_registers *regs,
6416
			   int ret_len) internal_function;
6417
static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
6418
			      int nregs, int regs_allocated) internal_function;
6419
static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
6420
     internal_function;
6421
static int check_matching (re_match_context_t *mctx, int fl_longest_match,
6422
			   int *p_match_first) internal_function;
6423
static int check_halt_state_context (const re_match_context_t *mctx,
6424
				     const re_dfastate_t *state, int idx)
6425
     internal_function;
6426
static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
6427
			 regmatch_t *prev_idx_match, int cur_node,
6428
			 int cur_idx, int nmatch) internal_function;
6429
static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
6430
				      int str_idx, int dest_node, int nregs,
6431
				      regmatch_t *regs,
6432
				      re_node_set *eps_via_nodes)
6433
     internal_function;
6434
static reg_errcode_t set_regs (const regex_t *preg,
6435
			       const re_match_context_t *mctx,
6436
			       size_t nmatch, regmatch_t *pmatch,
6437
			       int fl_backtrack) internal_function;
6438
static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
6439
     internal_function;
6440

6441
#ifdef RE_ENABLE_I18N
6442
static int sift_states_iter_mb (const re_match_context_t *mctx,
6443
				re_sift_context_t *sctx,
6444
				int node_idx, int str_idx, int max_str_idx)
6445
     internal_function;
6446
#endif /* RE_ENABLE_I18N */
6447
static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
6448
					   re_sift_context_t *sctx)
6449
     internal_function;
6450
static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
6451
					  re_sift_context_t *sctx, int str_idx,
6452
					  re_node_set *cur_dest)
6453
     internal_function;
6454
static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
6455
					      re_sift_context_t *sctx,
6456
					      int str_idx,
6457
					      re_node_set *dest_nodes)
6458
     internal_function;
6459
static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
6460
					    re_node_set *dest_nodes,
6461
					    const re_node_set *candidates)
6462
     internal_function;
6463
static int check_dst_limits (const re_match_context_t *mctx,
6464
			     re_node_set *limits,
6465
			     int dst_node, int dst_idx, int src_node,
6466
			     int src_idx) internal_function;
6467
static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
6468
					int boundaries, int subexp_idx,
6469
					int from_node, int bkref_idx)
6470
     internal_function;
6471
static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
6472
				      int limit, int subexp_idx,
6473
				      int node, int str_idx,
6474
				      int bkref_idx) internal_function;
6475
static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
6476
					  re_node_set *dest_nodes,
6477
					  const re_node_set *candidates,
6478
					  re_node_set *limits,
6479
					  struct re_backref_cache_entry *bkref_ents,
6480
					  int str_idx) internal_function;
6481
static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
6482
					re_sift_context_t *sctx,
6483
					int str_idx, const re_node_set *candidates)
6484
     internal_function;
6485
static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
6486
					re_dfastate_t **dst,
6487
					re_dfastate_t **src, int num)
6488
     internal_function;
6489
static re_dfastate_t *find_recover_state (reg_errcode_t *err,
6490
					 re_match_context_t *mctx) internal_function;
6491
static re_dfastate_t *transit_state (reg_errcode_t *err,
6492
				     re_match_context_t *mctx,
6493
				     re_dfastate_t *state) internal_function;
6494
static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
6495
					    re_match_context_t *mctx,
6496
					    re_dfastate_t *next_state)
6497
     internal_function;
6498
static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
6499
						re_node_set *cur_nodes,
6500
						int str_idx) internal_function;
6501
#if 0
6502
static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
6503
					re_match_context_t *mctx,
6504
					re_dfastate_t *pstate)
6505
     internal_function;
6506
#endif
6507
#ifdef RE_ENABLE_I18N
6508
static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
6509
				       re_dfastate_t *pstate)
6510
     internal_function;
6511
#endif /* RE_ENABLE_I18N */
6512
static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
6513
					  const re_node_set *nodes)
6514
     internal_function;
6515
static reg_errcode_t get_subexp (re_match_context_t *mctx,
6516
				 int bkref_node, int bkref_str_idx)
6517
     internal_function;
6518
static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
6519
				     const re_sub_match_top_t *sub_top,
6520
				     re_sub_match_last_t *sub_last,
6521
				     int bkref_node, int bkref_str)
6522
     internal_function;
6523
static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
6524
			     int subexp_idx, int type) internal_function;
6525
static reg_errcode_t check_arrival (re_match_context_t *mctx,
6526
				    state_array_t *path, int top_node,
6527
				    int top_str, int last_node, int last_str,
6528
				    int type) internal_function;
6529
static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
6530
						   int str_idx,
6531
						   re_node_set *cur_nodes,
6532
						   re_node_set *next_nodes)
6533
     internal_function;
6534
static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
6535
					       re_node_set *cur_nodes,
6536
					       int ex_subexp, int type)
6537
     internal_function;
6538
static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
6539
						   re_node_set *dst_nodes,
6540
						   int target, int ex_subexp,
6541
						   int type) internal_function;
6542
static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
6543
					 re_node_set *cur_nodes, int cur_str,
6544
					 int subexp_num, int type)
6545
     internal_function;
6546
static int build_trtable (const re_dfa_t *dfa,
6547
			  re_dfastate_t *state) internal_function;
6548
#ifdef RE_ENABLE_I18N
6549
static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
6550
				    const re_string_t *input, int idx)
6551
     internal_function;
6552
# ifdef _LIBC
6553
static unsigned int find_collation_sequence_value (const unsigned char *mbs,
6554
						   size_t name_len)
6555
     internal_function;
6556
# endif /* _LIBC */
6557
#endif /* RE_ENABLE_I18N */
6558
static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
6559
				       const re_dfastate_t *state,
6560
				       re_node_set *states_node,
6561
				       bitset_t *states_ch) internal_function;
6562
static int check_node_accept (const re_match_context_t *mctx,
6563
			      const re_token_t *node, int idx)
6564
     internal_function;
6565
static reg_errcode_t extend_buffers (re_match_context_t *mctx)
6566
     internal_function;
6567

6568
/* Entry point for POSIX code.  */
6569

6570
/* regexec searches for a given pattern, specified by PREG, in the
6571
   string STRING.
6572

6573
   If NMATCH is zero or REG_NOSUB was set in the cflags argument to
6574
   `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
6575
   least NMATCH elements, and we set them to the offsets of the
6576
   corresponding matched substrings.
6577

6578
   EFLAGS specifies `execution flags' which affect matching: if
6579
   REG_NOTBOL is set, then ^ does not match at the beginning of the
6580
   string; if REG_NOTEOL is set, then $ does not match at the end.
6581

6582
   We return 0 if we find a match and REG_NOMATCH if not.  */
6583

6584
int
6585
regexec (preg, string, nmatch, pmatch, eflags)
6586
    const regex_t *__restrict preg;
6587
    const char *__restrict string;
6588
    size_t nmatch;
6589
    regmatch_t pmatch[];
6590
    int eflags;
6591
{
6592
  reg_errcode_t err;
6593
  int start, length;
6594
  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
6595

6596
  if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
6597
    return REG_BADPAT;
6598

6599
  if (eflags & REG_STARTEND)
6600
    {
6601
      start = pmatch[0].rm_so;
6602
      length = pmatch[0].rm_eo;
6603
    }
6604
  else
6605
    {
6606
      start = 0;
6607
      length = strlen (string);
6608
    }
6609

6610
  __libc_lock_lock (dfa->lock);
6611
  if (preg->no_sub)
6612
    err = re_search_internal (preg, string, length, start, length - start,
6613
			      length, 0, NULL, eflags);
6614
  else
6615
    err = re_search_internal (preg, string, length, start, length - start,
6616
			      length, nmatch, pmatch, eflags);
6617
  __libc_lock_unlock (dfa->lock);
6618
  return err != REG_NOERROR;
6619
}
6620

6621
#ifdef _LIBC
6622
# include <shlib-compat.h>
6623
versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
6624

6625
# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
6626
__typeof__ (__regexec) __compat_regexec;
6627

6628
int
6629
attribute_compat_text_section
6630
__compat_regexec (const regex_t *__restrict preg,
6631
		  const char *__restrict string, size_t nmatch,
6632
		  regmatch_t pmatch[], int eflags)
6633
{
6634
  return regexec (preg, string, nmatch, pmatch,
6635
		  eflags & (REG_NOTBOL | REG_NOTEOL));
6636
}
6637
compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
6638
# endif
6639
#endif
6640

6641
/* Entry points for GNU code.  */
6642

6643
/* re_match, re_search, re_match_2, re_search_2
6644

6645
   The former two functions operate on STRING with length LENGTH,
6646
   while the latter two operate on the concatenation of STRING1 and STRING2
6647
   with lengths LENGTH1 and LENGTH2, respectively.
6648

6649
   re_match() matches the compiled pattern in BUFP against the string,
6650
   starting at index START.
6651

6652
   re_search() first tries matching at index START, then it tries to match
6653
   starting from index START + 1, and so on.  The last start position tried
6654
   is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
6655
   way as re_match().)
6656

6657
   The parameter STOP of re_{match,search}_2 specifies that no match exceeding
6658
   the first STOP characters of the concatenation of the strings should be
6659
   considered.
6660

6661
   If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
6662
   and all groups are stored in REGS.  (For the "_2" variants, the offsets are
6663
   computed relative to the concatenation, not relative to the individual
6664
   strings.)
6665

6666
   On success, re_match* functions return the length of the match, re_search*
6667
   return the position of the start of the match.  Return value -1 means no
6668
   match was found and -2 indicates an internal error.  */
6669

6670
/* Match the compiled pattern BUFP against STRING (LENGTH bytes) at index
   START only: the search range is 0 and the stop position is LENGTH.
   The final argument 1 asks re_search_stub to return the length of the
   match rather than its position.  */
int
re_match (struct re_pattern_buffer *bufp, const char *string, int length,
	  int start, struct re_registers *regs)
{
  return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
}
6679
#ifdef _LIBC
6680
weak_alias (__re_match, re_match)
6681
#endif
6682

6683
/* Search STRING (LENGTH bytes) for the compiled pattern BUFP, trying the
   start positions START .. START + RANGE, with the stop position set to
   LENGTH.  The final argument 0 asks re_search_stub to return the
   position of the match rather than its length.  */
int
re_search (struct re_pattern_buffer *bufp, const char *string, int length,
	   int start, int range, struct re_registers *regs)
{
  return re_search_stub (bufp, string, length, start, range, length, regs, 0);
}
6692
#ifdef _LIBC
6693
weak_alias (__re_search, re_search)
6694
#endif
6695

6696
/* Like re_match, but operates on the concatenation of STRING1 (LENGTH1
   bytes) and STRING2 (LENGTH2 bytes), with STOP passed through as the
   stop position.  Range is 0 and the match length is returned.  */
int
re_match_2 (struct re_pattern_buffer *bufp,
	    const char *string1, int length1,
	    const char *string2, int length2,
	    int start, struct re_registers *regs, int stop)
{
  return re_search_2_stub (bufp, string1, length1, string2, length2,
			   start, 0, regs, stop, 1);
}
6706
#ifdef _LIBC
6707
weak_alias (__re_match_2, re_match_2)
6708
#endif
6709

6710
/* Like re_search, but operates on the concatenation of STRING1 (LENGTH1
   bytes) and STRING2 (LENGTH2 bytes), with STOP passed through as the
   stop position.  The position of the match is returned.  */
int
re_search_2 (struct re_pattern_buffer *bufp,
	     const char *string1, int length1,
	     const char *string2, int length2,
	     int start, int range, struct re_registers *regs, int stop)
{
  return re_search_2_stub (bufp, string1, length1, string2, length2,
			   start, range, regs, stop, 0);
}
6720
#ifdef _LIBC
6721
weak_alias (__re_search_2, re_search_2)
6722
#endif
6723

6724
static int
6725
re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
6726
		  stop, ret_len)
6727
    struct re_pattern_buffer *bufp;
6728
    const char *string1, *string2;
6729
    int length1, length2, start, range, stop, ret_len;
6730
    struct re_registers *regs;
6731
{
6732
  const char *str;
6733
  int rval;
6734
  int len = length1 + length2;
6735
  int free_str = 0;
6736

6737
  if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
6738
    return -2;
6739

6740
  /* Concatenate the strings.  */
6741
  if (length2 > 0)
6742
    if (length1 > 0)
6743
      {
6744
	char *s = re_malloc (char, len);
6745

6746
	if (BE (s == NULL, 0))
6747
	  return -2;
6748
#ifdef _LIBC
6749
	memcpy (__mempcpy (s, string1, length1), string2, length2);
6750
#else
6751
	memcpy (s, string1, length1);
6752
	memcpy (s + length1, string2, length2);
6753
#endif
6754
	str = s;
6755
	free_str = 1;
6756
      }
6757
    else
6758
      str = string2;
6759
  else
6760
    str = string1;
6761

6762
  rval = re_search_stub (bufp, str, len, start, range, stop, regs,
6763
			 ret_len);
6764
  if (free_str)
6765
    re_free ((char *) str);
6766
  return rval;
6767
}
6768

6769
/* The parameters have the same meaning as those of re_search.
6770
   Additional parameters:
6771
   If RET_LEN is nonzero the length of the match is returned (re_match style);
6772
   otherwise the position of the match is returned.  */
6773

6774
static int
6775
re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
6776
    struct re_pattern_buffer *bufp;
6777
    const char *string;
6778
    int length, start, range, stop, ret_len;
6779
    struct re_registers *regs;
6780
{
6781
  reg_errcode_t result;
6782
  regmatch_t *pmatch;
6783
  int nregs, rval;
6784
  int eflags = 0;
6785
  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
6786

6787
  /* Check for out-of-range.  */
6788
  if (BE (start < 0 || start > length, 0))
6789
    return -1;
6790
  if (BE (start + range > length, 0))
6791
    range = length - start;
6792
  else if (BE (start + range < 0, 0))
6793
    range = -start;
6794

6795
  __libc_lock_lock (dfa->lock);
6796

6797
  eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
6798
  eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
6799

6800
  /* Compile fastmap if we haven't yet.  */
6801
  if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
6802
    re_compile_fastmap (bufp);
6803

6804
  if (BE (bufp->no_sub, 0))
6805
    regs = NULL;
6806

6807
  /* We need at least 1 register.  */
6808
  if (regs == NULL)
6809
    nregs = 1;
6810
  else if (BE (bufp->regs_allocated == REGS_FIXED &&
6811
	       regs->num_regs < bufp->re_nsub + 1, 0))
6812
    {
6813
      nregs = regs->num_regs;
6814
      if (BE (nregs < 1, 0))
6815
	{
6816
	  /* Nothing can be copied to regs.  */
6817
	  regs = NULL;
6818
	  nregs = 1;
6819
	}
6820
    }
6821
  else
6822
    nregs = bufp->re_nsub + 1;
6823
  pmatch = re_malloc (regmatch_t, nregs);
6824
  if (BE (pmatch == NULL, 0))
6825
    {
6826
      rval = -2;
6827
      goto out;
6828
    }
6829

6830
  result = re_search_internal (bufp, string, length, start, range, stop,
6831
			       nregs, pmatch, eflags);
6832

6833
  rval = 0;
6834

6835
  /* I hope we needn't fill ther regs with -1's when no match was found.  */
6836
  if (result != REG_NOERROR)
6837
    rval = -1;
6838
  else if (regs != NULL)
6839
    {
6840
      /* If caller wants register contents data back, copy them.  */
6841
      bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
6842
					   bufp->regs_allocated);
6843
      if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
6844
	rval = -2;
6845
    }
6846

6847
  if (BE (rval == 0, 1))
6848
    {
6849
      if (ret_len)
6850
	{
6851
	  assert (pmatch[0].rm_so == start);
6852
	  rval = pmatch[0].rm_eo - start;
6853
	}
6854
      else
6855
	rval = pmatch[0].rm_so;
6856
    }
6857
  re_free (pmatch);
6858
 out:
6859
  __libc_lock_unlock (dfa->lock);
6860
  return rval;
6861
}
6862

6863
static unsigned
6864
re_copy_regs (regs, pmatch, nregs, regs_allocated)
6865
    struct re_registers *regs;
6866
    regmatch_t *pmatch;
6867
    int nregs, regs_allocated;
6868
{
6869
  int rval = REGS_REALLOCATE;
6870
  int i;
6871
  int need_regs = nregs + 1;
6872
  /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
6873
     uses.  */
6874

6875
  /* Have the register data arrays been allocated?  */
6876
  if (regs_allocated == REGS_UNALLOCATED)
6877
    { /* No.  So allocate them with malloc.  */
6878
      regs->start = re_malloc (regoff_t, need_regs);
6879
      regs->end = re_malloc (regoff_t, need_regs);
6880
      if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
6881
	return REGS_UNALLOCATED;
6882
      regs->num_regs = need_regs;
6883
    }
6884
  else if (regs_allocated == REGS_REALLOCATE)
6885
    { /* Yes.  If we need more elements than were already
6886
	 allocated, reallocate them.  If we need fewer, just
6887
	 leave it alone.  */
6888
      if (BE (need_regs > regs->num_regs, 0))
6889
	{
6890
	  regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
6891
	  regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
6892
	  if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
6893
	    return REGS_UNALLOCATED;
6894
	  regs->start = new_start;
6895
	  regs->end = new_end;
6896
	  regs->num_regs = need_regs;
6897
	}
6898
    }
6899
  else
6900
    {
6901
      assert (regs_allocated == REGS_FIXED);
6902
      /* This function may not be called with REGS_FIXED and nregs too big.  */
6903
      assert (regs->num_regs >= nregs);
6904
      rval = REGS_FIXED;
6905
    }
6906

6907
  /* Copy the regs.  */
6908
  for (i = 0; i < nregs; ++i)
6909
    {
6910
      regs->start[i] = pmatch[i].rm_so;
6911
      regs->end[i] = pmatch[i].rm_eo;
6912
    }
6913
  for ( ; i < regs->num_regs; ++i)
6914
    regs->start[i] = regs->end[i] = -1;
6915

6916
  return rval;
6917
}
6918

6919
/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
6920
   ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
6921
   this memory for recording register information.  STARTS and ENDS
6922
   must be allocated using the malloc library routine, and must each
6923
   be at least NUM_REGS * sizeof (regoff_t) bytes long.
6924

6925
   If NUM_REGS == 0, then subsequent matches should allocate their own
6926
   register data.
6927

6928
   Unless this function is called, the first search or match using
6929
   PATTERN_BUFFER will allocate its own register data, without
6930
   freeing the old data.  */
6931

6932
void
6933
re_set_registers (bufp, regs, num_regs, starts, ends)
6934
    struct re_pattern_buffer *bufp;
6935
    struct re_registers *regs;
6936
    unsigned num_regs;
6937
    regoff_t *starts, *ends;
6938
{
6939
  if (num_regs)
6940
    {
6941
      bufp->regs_allocated = REGS_REALLOCATE;
6942
      regs->num_regs = num_regs;
6943
      regs->start = starts;
6944
      regs->end = ends;
6945
    }
6946
  else
6947
    {
6948
      bufp->regs_allocated = REGS_UNALLOCATED;
6949
      regs->num_regs = 0;
6950
      regs->start = regs->end = (regoff_t *) 0;
6951
    }
6952
}
6953
#ifdef _LIBC
6954
weak_alias (__re_set_registers, re_set_registers)
6955
#endif
6956

6957
/* Entry points compatible with 4.2 BSD regex library.  We don't define
6958
   them unless specifically requested.  */
6959

6960
#if defined _REGEX_RE_COMP || defined _LIBC
6961
int
6962
# ifdef _LIBC
6963
weak_function
6964
# endif
6965
re_exec (s)
6966
     const char *s;
6967
{
6968
  return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
6969
}
6970
#endif /* _REGEX_RE_COMP */
6971

6972
/* Internal entry point.  */
6973

6974
/* Searches for a compiled pattern PREG in the string STRING, whose
6975
   length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
6976
   mingings with regexec.  START, and RANGE have the same meanings
6977
   with re_search.
6978
   Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
6979
   otherwise return the error code.
6980
   Note: We assume front end functions already check ranges.
6981
   (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
6982

6983
static reg_errcode_t
6984
re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
6985
		    eflags)
6986
    const regex_t *preg;
6987
    const char *string;
6988
    int length, start, range, stop, eflags;
6989
    size_t nmatch;
6990
    regmatch_t pmatch[];
6991
{
6992
  reg_errcode_t err;
6993
  const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
6994
  int left_lim, right_lim, incr;
6995
  int fl_longest_match, match_first, match_kind, match_last = -1;
6996
  int extra_nmatch;
6997
  int sb, ch;
6998
#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
6999
  re_match_context_t mctx = { .dfa = dfa };
7000
#else
7001
  re_match_context_t mctx;
7002
#endif
7003
  char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
7004
		   && range && !preg->can_be_null) ? preg->fastmap : NULL;
7005
  RE_TRANSLATE_TYPE t = preg->translate;
7006

7007
#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
7008
  memset (&mctx, '\0', sizeof (re_match_context_t));
7009
  mctx.dfa = dfa;
7010
#endif
7011

7012
  extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
7013
  nmatch -= extra_nmatch;
7014

7015
  /* Check if the DFA haven't been compiled.  */
7016
  if (BE (preg->used == 0 || dfa->init_state == NULL
7017
	  || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
7018
	  || dfa->init_state_begbuf == NULL, 0))
7019
    return REG_NOMATCH;
7020

7021
#ifdef DEBUG
7022
  /* We assume front-end functions already check them.  */
7023
  assert (start + range >= 0 && start + range <= length);
7024
#endif
7025

7026
  /* If initial states with non-begbuf contexts have no elements,
7027
     the regex must be anchored.  If preg->newline_anchor is set,
7028
     we'll never use init_state_nl, so do not check it.  */
7029
  if (dfa->init_state->nodes.nelem == 0
7030
      && dfa->init_state_word->nodes.nelem == 0
7031
      && (dfa->init_state_nl->nodes.nelem == 0
7032
	  || !preg->newline_anchor))
7033
    {
7034
      if (start != 0 && start + range != 0)
7035
        return REG_NOMATCH;
7036
      start = range = 0;
7037
    }
7038

7039
  /* We must check the longest matching, if nmatch > 0.  */
7040
  fl_longest_match = (nmatch != 0 || dfa->nbackref);
7041

7042
  err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
7043
			    preg->translate, preg->syntax & RE_ICASE, dfa);
7044
  if (BE (err != REG_NOERROR, 0))
7045
    goto free_return;
7046
  mctx.input.stop = stop;
7047
  mctx.input.raw_stop = stop;
7048
  mctx.input.newline_anchor = preg->newline_anchor;
7049

7050
  err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
7051
  if (BE (err != REG_NOERROR, 0))
7052
    goto free_return;
7053

7054
  /* We will log all the DFA states through which the dfa pass,
7055
     if nmatch > 1, or this dfa has "multibyte node", which is a
7056
     back-reference or a node which can accept multibyte character or
7057
     multi character collating element.  */
7058
  if (nmatch > 1 || dfa->has_mb_node)
7059
    {
7060
      mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
7061
      if (BE (mctx.state_log == NULL, 0))
7062
	{
7063
	  err = REG_ESPACE;
7064
	  goto free_return;
7065
	}
7066
    }
7067
  else
7068
    mctx.state_log = NULL;
7069

7070
  match_first = start;
7071
  mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
7072
			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
7073

7074
  /* Check incrementally whether of not the input string match.  */
7075
  incr = (range < 0) ? -1 : 1;
7076
  left_lim = (range < 0) ? start + range : start;
7077
  right_lim = (range < 0) ? start : start + range;
7078
  sb = dfa->mb_cur_max == 1;
7079
  match_kind =
7080
    (fastmap
7081
     ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
7082
	| (range >= 0 ? 2 : 0)
7083
	| (t != NULL ? 1 : 0))
7084
     : 8);
7085

7086
  for (;; match_first += incr)
7087
    {
7088
      err = REG_NOMATCH;
7089
      if (match_first < left_lim || right_lim < match_first)
7090
	goto free_return;
7091

7092
      /* Advance as rapidly as possible through the string, until we
7093
	 find a plausible place to start matching.  This may be done
7094
	 with varying efficiency, so there are various possibilities:
7095
	 only the most common of them are specialized, in order to
7096
	 save on code size.  We use a switch statement for speed.  */
7097
      switch (match_kind)
7098
	{
7099
	case 8:
7100
	  /* No fastmap.  */
7101
	  break;
7102

7103
	case 7:
7104
	  /* Fastmap with single-byte translation, match forward.  */
7105
	  while (BE (match_first < right_lim, 1)
7106
		 && !fastmap[t[(unsigned char) string[match_first]]])
7107
	    ++match_first;
7108
	  goto forward_match_found_start_or_reached_end;
7109

7110
	case 6:
7111
	  /* Fastmap without translation, match forward.  */
7112
	  while (BE (match_first < right_lim, 1)
7113
		 && !fastmap[(unsigned char) string[match_first]])
7114
	    ++match_first;
7115

7116
	forward_match_found_start_or_reached_end:
7117
	  if (BE (match_first == right_lim, 0))
7118
	    {
7119
	      ch = match_first >= length
7120
		       ? 0 : (unsigned char) string[match_first];
7121
	      if (!fastmap[t ? t[ch] : ch])
7122
		goto free_return;
7123
	    }
7124
	  break;
7125

7126
	case 4:
7127
	case 5:
7128
	  /* Fastmap without multi-byte translation, match backwards.  */
7129
	  while (match_first >= left_lim)
7130
	    {
7131
	      ch = match_first >= length
7132
		       ? 0 : (unsigned char) string[match_first];
7133
	      if (fastmap[t ? t[ch] : ch])
7134
		break;
7135
	      --match_first;
7136
	    }
7137
	  if (match_first < left_lim)
7138
	    goto free_return;
7139
	  break;
7140

7141
	default:
7142
	  /* In this case, we can't determine easily the current byte,
7143
	     since it might be a component byte of a multibyte
7144
	     character.  Then we use the constructed buffer instead.  */
7145
	  for (;;)
7146
	    {
7147
	      /* If MATCH_FIRST is out of the valid range, reconstruct the
7148
		 buffers.  */
7149
	      unsigned int offset = match_first - mctx.input.raw_mbs_idx;
7150
	      if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
7151
		{
7152
		  err = re_string_reconstruct (&mctx.input, match_first,
7153
					       eflags);
7154
		  if (BE (err != REG_NOERROR, 0))
7155
		    goto free_return;
7156

7157
		  offset = match_first - mctx.input.raw_mbs_idx;
7158
		}
7159
	      /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
7160
		 Note that MATCH_FIRST must not be smaller than 0.  */
7161
	      ch = (match_first >= length
7162
		    ? 0 : re_string_byte_at (&mctx.input, offset));
7163
	      if (fastmap[ch])
7164
		break;
7165
	      match_first += incr;
7166
	      if (match_first < left_lim || match_first > right_lim)
7167
	        {
7168
	          err = REG_NOMATCH;
7169
	          goto free_return;
7170
	        }
7171
	    }
7172
	  break;
7173
	}
7174

7175
      /* Reconstruct the buffers so that the matcher can assume that
7176
	 the matching starts from the beginning of the buffer.  */
7177
      err = re_string_reconstruct (&mctx.input, match_first, eflags);
7178
      if (BE (err != REG_NOERROR, 0))
7179
	goto free_return;
7180

7181
#ifdef RE_ENABLE_I18N
7182
     /* Don't consider this char as a possible match start if it part,
7183
	yet isn't the head, of a multibyte character.  */
7184
      if (!sb && !re_string_first_byte (&mctx.input, 0))
7185
	continue;
7186
#endif
7187

7188
      /* It seems to be appropriate one, then use the matcher.  */
7189
      /* We assume that the matching starts from 0.  */
7190
      mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
7191
      match_last = check_matching (&mctx, fl_longest_match,
7192
				   range >= 0 ? &match_first : NULL);
7193
      if (match_last != -1)
7194
	{
7195
	  if (BE (match_last == -2, 0))
7196
	    {
7197
	      err = REG_ESPACE;
7198
	      goto free_return;
7199
	    }
7200
	  else
7201
	    {
7202
	      mctx.match_last = match_last;
7203
	      if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
7204
		{
7205
		  re_dfastate_t *pstate = mctx.state_log[match_last];
7206
		  mctx.last_node = check_halt_state_context (&mctx, pstate,
7207
							     match_last);
7208
		}
7209
	      if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
7210
		  || dfa->nbackref)
7211
		{
7212
		  err = prune_impossible_nodes (&mctx);
7213
		  if (err == REG_NOERROR)
7214
		    break;
7215
		  if (BE (err != REG_NOMATCH, 0))
7216
		    goto free_return;
7217
		  match_last = -1;
7218
		}
7219
	      else
7220
		break; /* We found a match.  */
7221
	    }
7222
	}
7223

7224
      match_ctx_clean (&mctx);
7225
    }
7226

7227
#ifdef DEBUG
7228
  assert (match_last != -1);
7229
  assert (err == REG_NOERROR);
7230
#endif
7231

7232
  /* Set pmatch[] if we need.  */
7233
  if (nmatch > 0)
7234
    {
7235
      int reg_idx;
7236

7237
      /* Initialize registers.  */
7238
      for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
7239
	pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
7240

7241
      /* Set the points where matching start/end.  */
7242
      pmatch[0].rm_so = 0;
7243
      pmatch[0].rm_eo = mctx.match_last;
7244

7245
      if (!preg->no_sub && nmatch > 1)
7246
	{
7247
	  err = set_regs (preg, &mctx, nmatch, pmatch,
7248
			  dfa->has_plural_match && dfa->nbackref > 0);
7249
	  if (BE (err != REG_NOERROR, 0))
7250
	    goto free_return;
7251
	}
7252

7253
      /* At last, add the offset to the each registers, since we slided
7254
	 the buffers so that we could assume that the matching starts
7255
	 from 0.  */
7256
      for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
7257
	if (pmatch[reg_idx].rm_so != -1)
7258
	  {
7259
#ifdef RE_ENABLE_I18N
7260
	    if (BE (mctx.input.offsets_needed != 0, 0))
7261
	      {
7262
		pmatch[reg_idx].rm_so =
7263
		  (pmatch[reg_idx].rm_so == mctx.input.valid_len
7264
		   ? mctx.input.valid_raw_len
7265
		   : mctx.input.offsets[pmatch[reg_idx].rm_so]);
7266
		pmatch[reg_idx].rm_eo =
7267
		  (pmatch[reg_idx].rm_eo == mctx.input.valid_len
7268
		   ? mctx.input.valid_raw_len
7269
		   : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
7270
	      }
7271
#else
7272
	    assert (mctx.input.offsets_needed == 0);
7273
#endif
7274
	    pmatch[reg_idx].rm_so += match_first;
7275
	    pmatch[reg_idx].rm_eo += match_first;
7276
	  }
7277
      for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
7278
	{
7279
	  pmatch[nmatch + reg_idx].rm_so = -1;
7280
	  pmatch[nmatch + reg_idx].rm_eo = -1;
7281
	}
7282

7283
      if (dfa->subexp_map)
7284
        for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
7285
          if (dfa->subexp_map[reg_idx] != reg_idx)
7286
            {
7287
              pmatch[reg_idx + 1].rm_so
7288
                = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
7289
              pmatch[reg_idx + 1].rm_eo
7290
                = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
7291
            }
7292
    }
7293

7294
 free_return:
7295
  re_free (mctx.state_log);
7296
  if (dfa->nbackref)
7297
    match_ctx_free (&mctx);
7298
  re_string_destruct (&mctx.input);
7299
  return err;
7300
}
7301

7302
static reg_errcode_t
7303
prune_impossible_nodes (mctx)
7304
     re_match_context_t *mctx;
7305
{
7306
  const re_dfa_t *const dfa = mctx->dfa;
7307
  int halt_node, match_last;
7308
  reg_errcode_t ret;
7309
  re_dfastate_t **sifted_states;
7310
  re_dfastate_t **lim_states = NULL;
7311
  re_sift_context_t sctx;
7312
#ifdef DEBUG
7313
  assert (mctx->state_log != NULL);
7314
#endif
7315
  match_last = mctx->match_last;
7316
  halt_node = mctx->last_node;
7317
  sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
7318
  if (BE (sifted_states == NULL, 0))
7319
    {
7320
      ret = REG_ESPACE;
7321
      goto free_return;
7322
    }
7323
  if (dfa->nbackref)
7324
    {
7325
      lim_states = re_malloc (re_dfastate_t *, match_last + 1);
7326
      if (BE (lim_states == NULL, 0))
7327
	{
7328
	  ret = REG_ESPACE;
7329
	  goto free_return;
7330
	}
7331
      while (1)
7332
	{
7333
	  memset (lim_states, '\0',
7334
		  sizeof (re_dfastate_t *) * (match_last + 1));
7335
	  sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
7336
			 match_last);
7337
	  ret = sift_states_backward (mctx, &sctx);
7338
	  re_node_set_free (&sctx.limits);
7339
	  if (BE (ret != REG_NOERROR, 0))
7340
	      goto free_return;
7341
	  if (sifted_states[0] != NULL || lim_states[0] != NULL)
7342
	    break;
7343
	  do
7344
	    {
7345
	      --match_last;
7346
	      if (match_last < 0)
7347
		{
7348
		  ret = REG_NOMATCH;
7349
		  goto free_return;
7350
		}
7351
	    } while (mctx->state_log[match_last] == NULL
7352
		     || !mctx->state_log[match_last]->halt);
7353
	  halt_node = check_halt_state_context (mctx,
7354
						mctx->state_log[match_last],
7355
						match_last);
7356
	}
7357
      ret = merge_state_array (dfa, sifted_states, lim_states,
7358
			       match_last + 1);
7359
      re_free (lim_states);
7360
      lim_states = NULL;
7361
      if (BE (ret != REG_NOERROR, 0))
7362
	goto free_return;
7363
    }
7364
  else
7365
    {
7366
      sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
7367
      ret = sift_states_backward (mctx, &sctx);
7368
      re_node_set_free (&sctx.limits);
7369
      if (BE (ret != REG_NOERROR, 0))
7370
	goto free_return;
7371
    }
7372
  re_free (mctx->state_log);
7373
  mctx->state_log = sifted_states;
7374
  sifted_states = NULL;
7375
  mctx->last_node = halt_node;
7376
  mctx->match_last = match_last;
7377
  ret = REG_NOERROR;
7378
 free_return:
7379
  re_free (sifted_states);
7380
  re_free (lim_states);
7381
  return ret;
7382
}
7383

7384
/* Acquire an initial state and return it.
7385
   We must select appropriate initial state depending on the context,
7386
   since initial states may have constraints like "\<", "^", etc..  */
7387

7388
static inline re_dfastate_t *
7389
__attribute ((always_inline)) internal_function
7390
acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
7391
			    int idx)
7392
{
7393
  const re_dfa_t *const dfa = mctx->dfa;
7394
  if (dfa->init_state->has_constraint)
7395
    {
7396
      unsigned int context;
7397
      context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
7398
      if (IS_WORD_CONTEXT (context))
7399
	return dfa->init_state_word;
7400
      else if (IS_ORDINARY_CONTEXT (context))
7401
	return dfa->init_state;
7402
      else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
7403
	return dfa->init_state_begbuf;
7404
      else if (IS_NEWLINE_CONTEXT (context))
7405
	return dfa->init_state_nl;
7406
      else if (IS_BEGBUF_CONTEXT (context))
7407
	{
7408
	  /* It is relatively rare case, then calculate on demand.  */
7409
	  return re_acquire_state_context (err, dfa,
7410
					   dfa->init_state->entrance_nodes,
7411
					   context);
7412
	}
7413
      else
7414
	/* Must not happen?  */
7415
	return dfa->init_state;
7416
    }
7417
  else
7418
    return dfa->init_state;
7419
}
7420

7421
/* Check whether the regular expression match input string INPUT or not,
7422
   and return the index where the matching end, return -1 if not match,
7423
   or return -2 in case of an error.
7424
   FL_LONGEST_MATCH means we want the POSIX longest matching.
7425
   If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
7426
   next place where we may want to try matching.
7427
   Note that the matcher assume that the maching starts from the current
7428
   index of the buffer.  */
7429

7430
static int
7431
internal_function
7432
check_matching (re_match_context_t *mctx, int fl_longest_match,
7433
		int *p_match_first)
7434
{
7435
  const re_dfa_t *const dfa = mctx->dfa;
7436
  reg_errcode_t err;
7437
  int match = 0;
7438
  int match_last = -1;
7439
  int cur_str_idx = re_string_cur_idx (&mctx->input);
7440
  re_dfastate_t *cur_state;
7441
  int at_init_state = p_match_first != NULL;
7442
  int next_start_idx = cur_str_idx;
7443

7444
  err = REG_NOERROR;
7445
  cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
7446
  /* An initial state must not be NULL (invalid).  */
7447
  if (BE (cur_state == NULL, 0))
7448
    {
7449
      assert (err == REG_ESPACE);
7450
      return -2;
7451
    }
7452

7453
  if (mctx->state_log != NULL)
7454
    {
7455
      mctx->state_log[cur_str_idx] = cur_state;
7456

7457
      /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
7458
	 later.  E.g. Processing back references.  */
7459
      if (BE (dfa->nbackref, 0))
7460
	{
7461
	  at_init_state = 0;
7462
	  err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
7463
	  if (BE (err != REG_NOERROR, 0))
7464
	    return err;
7465

7466
	  if (cur_state->has_backref)
7467
	    {
7468
	      err = transit_state_bkref (mctx, &cur_state->nodes);
7469
	      if (BE (err != REG_NOERROR, 0))
7470
	        return err;
7471
	    }
7472
	}
7473
    }
7474

7475
  /* If the RE accepts NULL string.  */
7476
  if (BE (cur_state->halt, 0))
7477
    {
7478
      if (!cur_state->has_constraint
7479
	  || check_halt_state_context (mctx, cur_state, cur_str_idx))
7480
	{
7481
	  if (!fl_longest_match)
7482
	    return cur_str_idx;
7483
	  else
7484
	    {
7485
	      match_last = cur_str_idx;
7486
	      match = 1;
7487
	    }
7488
	}
7489
    }
7490

7491
  while (!re_string_eoi (&mctx->input))
7492
    {
7493
      re_dfastate_t *old_state = cur_state;
7494
      int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
7495

7496
      if (BE (next_char_idx >= mctx->input.bufs_len, 0)
7497
          || (BE (next_char_idx >= mctx->input.valid_len, 0)
7498
              && mctx->input.valid_len < mctx->input.len))
7499
        {
7500
          err = extend_buffers (mctx);
7501
          if (BE (err != REG_NOERROR, 0))
7502
	    {
7503
	      assert (err == REG_ESPACE);
7504
	      return -2;
7505
	    }
7506
        }
7507

7508
      cur_state = transit_state (&err, mctx, cur_state);
7509
      if (mctx->state_log != NULL)
7510
	cur_state = merge_state_with_log (&err, mctx, cur_state);
7511

7512
      if (cur_state == NULL)
7513
	{
7514
	  /* Reached the invalid state or an error.  Try to recover a valid
7515
	     state using the state log, if available and if we have not
7516
	     already found a valid (even if not the longest) match.  */
7517
	  if (BE (err != REG_NOERROR, 0))
7518
	    return -2;
7519

7520
	  if (mctx->state_log == NULL
7521
	      || (match && !fl_longest_match)
7522
	      || (cur_state = find_recover_state (&err, mctx)) == NULL)
7523
	    break;
7524
	}
7525

7526
      if (BE (at_init_state, 0))
7527
	{
7528
	  if (old_state == cur_state)
7529
	    next_start_idx = next_char_idx;
7530
	  else
7531
	    at_init_state = 0;
7532
	}
7533

7534
      if (cur_state->halt)
7535
	{
7536
	  /* Reached a halt state.
7537
	     Check the halt state can satisfy the current context.  */
7538
	  if (!cur_state->has_constraint
7539
	      || check_halt_state_context (mctx, cur_state,
7540
					   re_string_cur_idx (&mctx->input)))
7541
	    {
7542
	      /* We found an appropriate halt state.  */
7543
	      match_last = re_string_cur_idx (&mctx->input);
7544
	      match = 1;
7545

7546
	      /* We found a match, do not modify match_first below.  */
7547
	      p_match_first = NULL;
7548
	      if (!fl_longest_match)
7549
		break;
7550
	    }
7551
	}
7552
    }
7553

7554
  if (p_match_first)
7555
    *p_match_first += next_start_idx;
7556

7557
  return match_last;
7558
}
7559

7560
/* Check NODE match the current context.  */
7561

7562
static int
7563
internal_function
7564
check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
7565
{
7566
  re_token_type_t type = dfa->nodes[node].type;
7567
  unsigned int constraint = dfa->nodes[node].constraint;
7568
  if (type != END_OF_RE)
7569
    return 0;
7570
  if (!constraint)
7571
    return 1;
7572
  if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
7573
    return 0;
7574
  return 1;
7575
}
7576

7577
/* Check the halt state STATE match the current context.
7578
   Return 0 if not match, if the node, STATE has, is a halt node and
7579
   match the context, return the node.  */
7580

7581
static int
7582
internal_function
7583
check_halt_state_context (const re_match_context_t *mctx,
7584
			  const re_dfastate_t *state, int idx)
7585
{
7586
  int i;
7587
  unsigned int context;
7588
#ifdef DEBUG
7589
  assert (state->halt);
7590
#endif
7591
  context = re_string_context_at (&mctx->input, idx, mctx->eflags);
7592
  for (i = 0; i < state->nodes.nelem; ++i)
7593
    if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
7594
      return state->nodes.elems[i];
7595
  return 0;
7596
}
7597

7598
/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
7599
   corresponding to the DFA).
7600
   Return the destination node, and update EPS_VIA_NODES, return -1 in case
7601
   of errors.  */
7602

7603
static int
7604
internal_function
7605
proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
7606
		   int *pidx, int node, re_node_set *eps_via_nodes,
7607
		   struct re_fail_stack_t *fs)
7608
{
7609
  const re_dfa_t *const dfa = mctx->dfa;
7610
  int i, err;
7611
  if (IS_EPSILON_NODE (dfa->nodes[node].type))
7612
    {
7613
      re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
7614
      re_node_set *edests = &dfa->edests[node];
7615
      int dest_node;
7616
      err = re_node_set_insert (eps_via_nodes, node);
7617
      if (BE (err < 0, 0))
7618
	return -2;
7619
      /* Pick up a valid destination, or return -1 if none is found.  */
7620
      for (dest_node = -1, i = 0; i < edests->nelem; ++i)
7621
	{
7622
	  int candidate = edests->elems[i];
7623
	  if (!re_node_set_contains (cur_nodes, candidate))
7624
	    continue;
7625
          if (dest_node == -1)
7626
	    dest_node = candidate;
7627

7628
          else
7629
	    {
7630
	      /* In order to avoid infinite loop like "(a*)*", return the second
7631
	         epsilon-transition if the first was already considered.  */
7632
	      if (re_node_set_contains (eps_via_nodes, dest_node))
7633
	        return candidate;
7634

7635
	      /* Otherwise, push the second epsilon-transition on the fail stack.  */
7636
	      else if (fs != NULL
7637
		       && push_fail_stack (fs, *pidx, candidate, nregs, regs,
7638
				           eps_via_nodes))
7639
		return -2;
7640

7641
	      /* We know we are going to exit.  */
7642
	      break;
7643
	    }
7644
	}
7645
      return dest_node;
7646
    }
7647
  else
7648
    {
7649
      int naccepted = 0;
7650
      re_token_type_t type = dfa->nodes[node].type;
7651

7652
#ifdef RE_ENABLE_I18N
7653
      if (dfa->nodes[node].accept_mb)
7654
	naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
7655
      else
7656
#endif /* RE_ENABLE_I18N */
7657
      if (type == OP_BACK_REF)
7658
	{
7659
	  int subexp_idx = dfa->nodes[node].opr.idx + 1;
7660
	  naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
7661
	  if (fs != NULL)
7662
	    {
7663
	      if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
7664
		return -1;
7665
	      else if (naccepted)
7666
		{
7667
		  char *buf = (char *) re_string_get_buffer (&mctx->input);
7668
		  if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
7669
			      naccepted) != 0)
7670
		    return -1;
7671
		}
7672
	    }
7673

7674
	  if (naccepted == 0)
7675
	    {
7676
	      int dest_node;
7677
	      err = re_node_set_insert (eps_via_nodes, node);
7678
	      if (BE (err < 0, 0))
7679
		return -2;
7680
	      dest_node = dfa->edests[node].elems[0];
7681
	      if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
7682
					dest_node))
7683
		return dest_node;
7684
	    }
7685
	}
7686

7687
      if (naccepted != 0
7688
	  || check_node_accept (mctx, dfa->nodes + node, *pidx))
7689
	{
7690
	  int dest_node = dfa->nexts[node];
7691
	  *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
7692
	  if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
7693
		     || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
7694
					       dest_node)))
7695
	    return -1;
7696
	  re_node_set_empty (eps_via_nodes);
7697
	  return dest_node;
7698
	}
7699
    }
7700
  return -1;
7701
}
7702

7703
static reg_errcode_t
7704
internal_function
7705
push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
7706
		 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
7707
{
7708
  reg_errcode_t err;
7709
  int num = fs->num++;
7710
  if (fs->num == fs->alloc)
7711
    {
7712
      struct re_fail_stack_ent_t *new_array;
7713
      new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
7714
				       * fs->alloc * 2));
7715
      if (new_array == NULL)
7716
	return REG_ESPACE;
7717
      fs->alloc *= 2;
7718
      fs->stack = new_array;
7719
    }
7720
  fs->stack[num].idx = str_idx;
7721
  fs->stack[num].node = dest_node;
7722
  fs->stack[num].regs = re_malloc (regmatch_t, nregs);
7723
  if (fs->stack[num].regs == NULL)
7724
    return REG_ESPACE;
7725
  memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
7726
  err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
7727
  return err;
7728
}
7729

7730
static int
7731
internal_function
7732
pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
7733
		regmatch_t *regs, re_node_set *eps_via_nodes)
7734
{
7735
  int num = --fs->num;
7736
  assert (num >= 0);
7737
  *pidx = fs->stack[num].idx;
7738
  memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
7739
  re_node_set_free (eps_via_nodes);
7740
  re_free (fs->stack[num].regs);
7741
  *eps_via_nodes = fs->stack[num].eps_via_nodes;
7742
  return fs->stack[num].node;
7743
}
7744

7745
/* Set the positions where the subexpressions are starts/ends to registers
7746
   PMATCH.
7747
   Note: We assume that pmatch[0] is already set, and
7748
   pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
7749

7750
static reg_errcode_t
7751
internal_function
7752
set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
7753
	  regmatch_t *pmatch, int fl_backtrack)
7754
{
7755
  const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
7756
  int idx, cur_node;
7757
  re_node_set eps_via_nodes;
7758
  struct re_fail_stack_t *fs;
7759
  struct re_fail_stack_t fs_body = { 0, 2, NULL };
7760
  regmatch_t *prev_idx_match;
7761
  int prev_idx_match_malloced = 0;
7762

7763
#ifdef DEBUG
7764
  assert (nmatch > 1);
7765
  assert (mctx->state_log != NULL);
7766
#endif
7767
  if (fl_backtrack)
7768
    {
7769
      fs = &fs_body;
7770
      fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
7771
      if (fs->stack == NULL)
7772
	return REG_ESPACE;
7773
    }
7774
  else
7775
    fs = NULL;
7776

7777
  cur_node = dfa->init_node;
7778
  re_node_set_init_empty (&eps_via_nodes);
7779

7780
  if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
7781
    prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
7782
  else
7783
    {
7784
      prev_idx_match = re_malloc (regmatch_t, nmatch);
7785
      if (prev_idx_match == NULL)
7786
	{
7787
	  free_fail_stack_return (fs);
7788
	  return REG_ESPACE;
7789
	}
7790
      prev_idx_match_malloced = 1;
7791
    }
7792
  memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
7793

7794
  for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
7795
    {
7796
      update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
7797

7798
      if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
7799
	{
7800
	  int reg_idx;
7801
	  if (fs)
7802
	    {
7803
	      for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
7804
		if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
7805
		  break;
7806
	      if (reg_idx == nmatch)
7807
		{
7808
		  re_node_set_free (&eps_via_nodes);
7809
		  if (prev_idx_match_malloced)
7810
		    re_free (prev_idx_match);
7811
		  return free_fail_stack_return (fs);
7812
		}
7813
	      cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
7814
					 &eps_via_nodes);
7815
	    }
7816
	  else
7817
	    {
7818
	      re_node_set_free (&eps_via_nodes);
7819
	      if (prev_idx_match_malloced)
7820
		re_free (prev_idx_match);
7821
	      return REG_NOERROR;
7822
	    }
7823
	}
7824

7825
      /* Proceed to next node.  */
7826
      cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
7827
				    &eps_via_nodes, fs);
7828

7829
      if (BE (cur_node < 0, 0))
7830
	{
7831
	  if (BE (cur_node == -2, 0))
7832
	    {
7833
	      re_node_set_free (&eps_via_nodes);
7834
	      if (prev_idx_match_malloced)
7835
		re_free (prev_idx_match);
7836
	      free_fail_stack_return (fs);
7837
	      return REG_ESPACE;
7838
	    }
7839
	  if (fs)
7840
	    cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
7841
				       &eps_via_nodes);
7842
	  else
7843
	    {
7844
	      re_node_set_free (&eps_via_nodes);
7845
	      if (prev_idx_match_malloced)
7846
		re_free (prev_idx_match);
7847
	      return REG_NOMATCH;
7848
	    }
7849
	}
7850
    }
7851
  re_node_set_free (&eps_via_nodes);
7852
  if (prev_idx_match_malloced)
7853
    re_free (prev_idx_match);
7854
  return free_fail_stack_return (fs);
7855
}
7856

7857
static reg_errcode_t
7858
internal_function
7859
free_fail_stack_return (struct re_fail_stack_t *fs)
7860
{
7861
  if (fs)
7862
    {
7863
      int fs_idx;
7864
      for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
7865
	{
7866
	  re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
7867
	  re_free (fs->stack[fs_idx].regs);
7868
	}
7869
      re_free (fs->stack);
7870
    }
7871
  return REG_NOERROR;
7872
}
7873

7874
static void
7875
internal_function
7876
update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
7877
	     regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
7878
{
7879
  int type = dfa->nodes[cur_node].type;
7880
  if (type == OP_OPEN_SUBEXP)
7881
    {
7882
      int reg_num = dfa->nodes[cur_node].opr.idx + 1;
7883

7884
      /* We are at the first node of this sub expression.  */
7885
      if (reg_num < nmatch)
7886
	{
7887
	  pmatch[reg_num].rm_so = cur_idx;
7888
	  pmatch[reg_num].rm_eo = -1;
7889
	}
7890
    }
7891
  else if (type == OP_CLOSE_SUBEXP)
7892
    {
7893
      int reg_num = dfa->nodes[cur_node].opr.idx + 1;
7894
      if (reg_num < nmatch)
7895
	{
7896
	  /* We are at the last node of this sub expression.  */
7897
	  if (pmatch[reg_num].rm_so < cur_idx)
7898
	    {
7899
	      pmatch[reg_num].rm_eo = cur_idx;
7900
	      /* This is a non-empty match or we are not inside an optional
7901
		 subexpression.  Accept this right away.  */
7902
	      memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
7903
	    }
7904
	  else
7905
	    {
7906
	      if (dfa->nodes[cur_node].opt_subexp
7907
		  && prev_idx_match[reg_num].rm_so != -1)
7908
		/* We transited through an empty match for an optional
7909
		   subexpression, like (a?)*, and this is not the subexp's
7910
		   first match.  Copy back the old content of the registers
7911
		   so that matches of an inner subexpression are undone as
7912
		   well, like in ((a?))*.  */
7913
		memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
7914
	      else
7915
		/* We completed a subexpression, but it may be part of
7916
		   an optional one, so do not update PREV_IDX_MATCH.  */
7917
		pmatch[reg_num].rm_eo = cur_idx;
7918
	    }
7919
	}
7920
    }
7921
}
7922

7923
/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
7924
   and sifts the nodes in each state according to the following rules.
7925
   The updated state_log is written to STATE_LOG.
7926

7927
   Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
7928
     1. When STR_IDX == MATCH_LAST(the last index in the state_log):
7929
	If `a' isn't the LAST_NODE and `a' can't epsilon transit to
7930
	the LAST_NODE, we throw away the node `a'.
7931
     2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
7932
	string `s' and transit to `b':
7933
	i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
7934
	   away the node `a'.
7935
	ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
7936
	    thrown away, we throw away the node `a'.
7937
     3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
7938
	i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
7939
	   node `a'.
7940
	ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
7941
	    we throw away the node `a'.  */
7942

7943
/* True when STATE is non-NULL and re_node_set_contains reports NODE as a
   member of STATE's node set.  */
#define STATE_NODE_CONTAINS(state,node) \
  ((state) != NULL && re_node_set_contains (&(state)->nodes, (node)))
7945

7946
static reg_errcode_t
7947
internal_function
7948
sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
7949
{
7950
  reg_errcode_t err;
7951
  int null_cnt = 0;
7952
  int str_idx = sctx->last_str_idx;
7953
  re_node_set cur_dest;
7954

7955
#ifdef DEBUG
7956
  assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
7957
#endif
7958

7959
  /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
7960
     transit to the last_node and the last_node itself.  */
7961
  err = re_node_set_init_1 (&cur_dest, sctx->last_node);
7962
  if (BE (err != REG_NOERROR, 0))
7963
    return err;
7964
  err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
7965
  if (BE (err != REG_NOERROR, 0))
7966
    goto free_return;
7967

7968
  /* Then check each states in the state_log.  */
7969
  while (str_idx > 0)
7970
    {
7971
      /* Update counters.  */
7972
      null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
7973
      if (null_cnt > mctx->max_mb_elem_len)
7974
	{
7975
	  memset (sctx->sifted_states, '\0',
7976
		  sizeof (re_dfastate_t *) * str_idx);
7977
	  re_node_set_free (&cur_dest);
7978
	  return REG_NOERROR;
7979
	}
7980
      re_node_set_empty (&cur_dest);
7981
      --str_idx;
7982

7983
      if (mctx->state_log[str_idx])
7984
	{
7985
	  err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
7986
          if (BE (err != REG_NOERROR, 0))
7987
	    goto free_return;
7988
	}
7989

7990
      /* Add all the nodes which satisfy the following conditions:
7991
	 - It can epsilon transit to a node in CUR_DEST.
7992
	 - It is in CUR_SRC.
7993
	 And update state_log.  */
7994
      err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
7995
      if (BE (err != REG_NOERROR, 0))
7996
	goto free_return;
7997
    }
7998
  err = REG_NOERROR;
7999
 free_return:
8000
  re_node_set_free (&cur_dest);
8001
  return err;
8002
}
8003

8004
static reg_errcode_t
8005
internal_function
8006
build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
8007
		     int str_idx, re_node_set *cur_dest)
8008
{
8009
  const re_dfa_t *const dfa = mctx->dfa;
8010
  const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
8011
  int i;
8012

8013
  /* Then build the next sifted state.
8014
     We build the next sifted state on `cur_dest', and update
8015
     `sifted_states[str_idx]' with `cur_dest'.
8016
     Note:
8017
     `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
8018
     `cur_src' points the node_set of the old `state_log[str_idx]'
8019
     (with the epsilon nodes pre-filtered out).  */
8020
  for (i = 0; i < cur_src->nelem; i++)
8021
    {
8022
      int prev_node = cur_src->elems[i];
8023
      int naccepted = 0;
8024
      int ret;
8025

8026
#ifdef DEBUG
8027
      re_token_type_t type = dfa->nodes[prev_node].type;
8028
      assert (!IS_EPSILON_NODE (type));
8029
#endif
8030
#ifdef RE_ENABLE_I18N
8031
      /* If the node may accept `multi byte'.  */
8032
      if (dfa->nodes[prev_node].accept_mb)
8033
	naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
8034
					 str_idx, sctx->last_str_idx);
8035
#endif /* RE_ENABLE_I18N */
8036

8037
      /* We don't check backreferences here.
8038
	 See update_cur_sifted_state().  */
8039
      if (!naccepted
8040
	  && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
8041
	  && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
8042
				  dfa->nexts[prev_node]))
8043
	naccepted = 1;
8044

8045
      if (naccepted == 0)
8046
	continue;
8047

8048
      if (sctx->limits.nelem)
8049
	{
8050
	  int to_idx = str_idx + naccepted;
8051
	  if (check_dst_limits (mctx, &sctx->limits,
8052
				dfa->nexts[prev_node], to_idx,
8053
				prev_node, str_idx))
8054
	    continue;
8055
	}
8056
      ret = re_node_set_insert (cur_dest, prev_node);
8057
      if (BE (ret == -1, 0))
8058
	return REG_ESPACE;
8059
    }
8060

8061
  return REG_NOERROR;
8062
}
8063

8064
/* Helper functions.  */
8065

8066
static reg_errcode_t
8067
internal_function
8068
clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
8069
{
8070
  int top = mctx->state_log_top;
8071

8072
  if (next_state_log_idx >= mctx->input.bufs_len
8073
      || (next_state_log_idx >= mctx->input.valid_len
8074
	  && mctx->input.valid_len < mctx->input.len))
8075
    {
8076
      reg_errcode_t err;
8077
      err = extend_buffers (mctx);
8078
      if (BE (err != REG_NOERROR, 0))
8079
	return err;
8080
    }
8081

8082
  if (top < next_state_log_idx)
8083
    {
8084
      memset (mctx->state_log + top + 1, '\0',
8085
	      sizeof (re_dfastate_t *) * (next_state_log_idx - top));
8086
      mctx->state_log_top = next_state_log_idx;
8087
    }
8088
  return REG_NOERROR;
8089
}
8090

8091
static reg_errcode_t
8092
internal_function
8093
merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
8094
		   re_dfastate_t **src, int num)
8095
{
8096
  int st_idx;
8097
  reg_errcode_t err;
8098
  for (st_idx = 0; st_idx < num; ++st_idx)
8099
    {
8100
      if (dst[st_idx] == NULL)
8101
	dst[st_idx] = src[st_idx];
8102
      else if (src[st_idx] != NULL)
8103
	{
8104
	  re_node_set merged_set;
8105
	  err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
8106
					&src[st_idx]->nodes);
8107
	  if (BE (err != REG_NOERROR, 0))
8108
	    return err;
8109
	  dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
8110
	  re_node_set_free (&merged_set);
8111
	  if (BE (err != REG_NOERROR, 0))
8112
	    return err;
8113
	}
8114
    }
8115
  return REG_NOERROR;
8116
}
8117

8118
static reg_errcode_t
8119
internal_function
8120
update_cur_sifted_state (const re_match_context_t *mctx,
8121
			 re_sift_context_t *sctx, int str_idx,
8122
			 re_node_set *dest_nodes)
8123
{
8124
  const re_dfa_t *const dfa = mctx->dfa;
8125
  reg_errcode_t err = REG_NOERROR;
8126
  const re_node_set *candidates;
8127
  candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
8128
		: &mctx->state_log[str_idx]->nodes);
8129

8130
  if (dest_nodes->nelem == 0)
8131
    sctx->sifted_states[str_idx] = NULL;
8132
  else
8133
    {
8134
      if (candidates)
8135
	{
8136
	  /* At first, add the nodes which can epsilon transit to a node in
8137
	     DEST_NODE.  */
8138
	  err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
8139
	  if (BE (err != REG_NOERROR, 0))
8140
	    return err;
8141

8142
	  /* Then, check the limitations in the current sift_context.  */
8143
	  if (sctx->limits.nelem)
8144
	    {
8145
	      err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
8146
					 mctx->bkref_ents, str_idx);
8147
	      if (BE (err != REG_NOERROR, 0))
8148
		return err;
8149
	    }
8150
	}
8151

8152
      sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
8153
      if (BE (err != REG_NOERROR, 0))
8154
	return err;
8155
    }
8156

8157
  if (candidates && mctx->state_log[str_idx]->has_backref)
8158
    {
8159
      err = sift_states_bkref (mctx, sctx, str_idx, candidates);
8160
      if (BE (err != REG_NOERROR, 0))
8161
	return err;
8162
    }
8163
  return REG_NOERROR;
8164
}
8165

8166
static reg_errcode_t
8167
internal_function
8168
add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
8169
		       const re_node_set *candidates)
8170
{
8171
  reg_errcode_t err = REG_NOERROR;
8172
  int i;
8173

8174
  re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
8175
  if (BE (err != REG_NOERROR, 0))
8176
    return err;
8177

8178
  if (!state->inveclosure.alloc)
8179
    {
8180
      err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
8181
      if (BE (err != REG_NOERROR, 0))
8182
        return REG_ESPACE;
8183
      for (i = 0; i < dest_nodes->nelem; i++)
8184
        re_node_set_merge (&state->inveclosure,
8185
			   dfa->inveclosures + dest_nodes->elems[i]);
8186
    }
8187
  return re_node_set_add_intersect (dest_nodes, candidates,
8188
				    &state->inveclosure);
8189
}
8190

8191
static reg_errcode_t
8192
internal_function
8193
sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
8194
		       const re_node_set *candidates)
8195
{
8196
    int ecl_idx;
8197
    reg_errcode_t err;
8198
    re_node_set *inv_eclosure = dfa->inveclosures + node;
8199
    re_node_set except_nodes;
8200
    re_node_set_init_empty (&except_nodes);
8201
    for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
8202
      {
8203
	int cur_node = inv_eclosure->elems[ecl_idx];
8204
	if (cur_node == node)
8205
	  continue;
8206
	if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
8207
	  {
8208
	    int edst1 = dfa->edests[cur_node].elems[0];
8209
	    int edst2 = ((dfa->edests[cur_node].nelem > 1)
8210
			 ? dfa->edests[cur_node].elems[1] : -1);
8211
	    if ((!re_node_set_contains (inv_eclosure, edst1)
8212
		 && re_node_set_contains (dest_nodes, edst1))
8213
		|| (edst2 > 0
8214
		    && !re_node_set_contains (inv_eclosure, edst2)
8215
		    && re_node_set_contains (dest_nodes, edst2)))
8216
	      {
8217
		err = re_node_set_add_intersect (&except_nodes, candidates,
8218
						 dfa->inveclosures + cur_node);
8219
		if (BE (err != REG_NOERROR, 0))
8220
		  {
8221
		    re_node_set_free (&except_nodes);
8222
		    return err;
8223
		  }
8224
	      }
8225
	  }
8226
      }
8227
    for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
8228
      {
8229
	int cur_node = inv_eclosure->elems[ecl_idx];
8230
	if (!re_node_set_contains (&except_nodes, cur_node))
8231
	  {
8232
	    int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
8233
	    re_node_set_remove_at (dest_nodes, idx);
8234
	  }
8235
      }
8236
    re_node_set_free (&except_nodes);
8237
    return REG_NOERROR;
8238
}
8239

8240
static int
8241
internal_function
8242
check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
8243
		  int dst_node, int dst_idx, int src_node, int src_idx)
8244
{
8245
  const re_dfa_t *const dfa = mctx->dfa;
8246
  int lim_idx, src_pos, dst_pos;
8247

8248
  int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
8249
  int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
8250
  for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
8251
    {
8252
      int subexp_idx;
8253
      struct re_backref_cache_entry *ent;
8254
      ent = mctx->bkref_ents + limits->elems[lim_idx];
8255
      subexp_idx = dfa->nodes[ent->node].opr.idx;
8256

8257
      dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
8258
					   subexp_idx, dst_node, dst_idx,
8259
					   dst_bkref_idx);
8260
      src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
8261
					   subexp_idx, src_node, src_idx,
8262
					   src_bkref_idx);
8263

8264
      /* In case of:
8265
	 <src> <dst> ( <subexp> )
8266
	 ( <subexp> ) <src> <dst>
8267
	 ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
8268
      if (src_pos == dst_pos)
8269
	continue; /* This is unrelated limitation.  */
8270
      else
8271
	return 1;
8272
    }
8273
  return 0;
8274
}
8275

8276
static int
8277
internal_function
8278
check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
8279
			     int subexp_idx, int from_node, int bkref_idx)
8280
{
8281
  const re_dfa_t *const dfa = mctx->dfa;
8282
  const re_node_set *eclosures = dfa->eclosures + from_node;
8283
  int node_idx;
8284

8285
  /* Else, we are on the boundary: examine the nodes on the epsilon
8286
     closure.  */
8287
  for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
8288
    {
8289
      int node = eclosures->elems[node_idx];
8290
      switch (dfa->nodes[node].type)
8291
	{
8292
	case OP_BACK_REF:
8293
	  if (bkref_idx != -1)
8294
	    {
8295
	      struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
8296
	      do
8297
	        {
8298
		  int dst, cpos;
8299

8300
		  if (ent->node != node)
8301
		    continue;
8302

8303
		  if (subexp_idx < BITSET_WORD_BITS
8304
		      && !(ent->eps_reachable_subexps_map
8305
			   & ((bitset_word_t) 1 << subexp_idx)))
8306
		    continue;
8307

8308
		  /* Recurse trying to reach the OP_OPEN_SUBEXP and
8309
		     OP_CLOSE_SUBEXP cases below.  But, if the
8310
		     destination node is the same node as the source
8311
		     node, don't recurse because it would cause an
8312
		     infinite loop: a regex that exhibits this behavior
8313
		     is ()\1*\1*  */
8314
		  dst = dfa->edests[node].elems[0];
8315
		  if (dst == from_node)
8316
		    {
8317
		      if (boundaries & 1)
8318
		        return -1;
8319
		      else /* if (boundaries & 2) */
8320
		        return 0;
8321
		    }
8322

8323
		  cpos =
8324
		    check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
8325
						 dst, bkref_idx);
8326
		  if (cpos == -1 /* && (boundaries & 1) */)
8327
		    return -1;
8328
		  if (cpos == 0 && (boundaries & 2))
8329
		    return 0;
8330

8331
		  if (subexp_idx < BITSET_WORD_BITS)
8332
		    ent->eps_reachable_subexps_map
8333
		      &= ~((bitset_word_t) 1 << subexp_idx);
8334
	        }
8335
	      while (ent++->more);
8336
	    }
8337
	  break;
8338

8339
	case OP_OPEN_SUBEXP:
8340
	  if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
8341
	    return -1;
8342
	  break;
8343

8344
	case OP_CLOSE_SUBEXP:
8345
	  if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
8346
	    return 0;
8347
	  break;
8348

8349
	default:
8350
	    break;
8351
	}
8352
    }
8353

8354
  return (boundaries & 2) ? 1 : 0;
8355
}
8356

8357
static int
8358
internal_function
8359
check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
8360
			   int subexp_idx, int from_node, int str_idx,
8361
			   int bkref_idx)
8362
{
8363
  struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
8364
  int boundaries;
8365

8366
  /* If we are outside the range of the subexpression, return -1 or 1.  */
8367
  if (str_idx < lim->subexp_from)
8368
    return -1;
8369

8370
  if (lim->subexp_to < str_idx)
8371
    return 1;
8372

8373
  /* If we are within the subexpression, return 0.  */
8374
  boundaries = (str_idx == lim->subexp_from);
8375
  boundaries |= (str_idx == lim->subexp_to) << 1;
8376
  if (boundaries == 0)
8377
    return 0;
8378

8379
  /* Else, examine epsilon closure.  */
8380
  return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
8381
				      from_node, bkref_idx);
8382
}
8383

8384
/* Check the limitations of sub expressions LIMITS, and remove the nodes
8385
   which are against limitations from DEST_NODES. */
8386

8387
static reg_errcode_t
8388
internal_function
8389
check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
8390
		     const re_node_set *candidates, re_node_set *limits,
8391
		     struct re_backref_cache_entry *bkref_ents, int str_idx)
8392
{
8393
  reg_errcode_t err;
8394
  int node_idx, lim_idx;
8395

8396
  for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
8397
    {
8398
      int subexp_idx;
8399
      struct re_backref_cache_entry *ent;
8400
      ent = bkref_ents + limits->elems[lim_idx];
8401

8402
      if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
8403
	continue; /* This is unrelated limitation.  */
8404

8405
      subexp_idx = dfa->nodes[ent->node].opr.idx;
8406
      if (ent->subexp_to == str_idx)
8407
	{
8408
	  int ops_node = -1;
8409
	  int cls_node = -1;
8410
	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
8411
	    {
8412
	      int node = dest_nodes->elems[node_idx];
8413
	      re_token_type_t type = dfa->nodes[node].type;
8414
	      if (type == OP_OPEN_SUBEXP
8415
		  && subexp_idx == dfa->nodes[node].opr.idx)
8416
		ops_node = node;
8417
	      else if (type == OP_CLOSE_SUBEXP
8418
		       && subexp_idx == dfa->nodes[node].opr.idx)
8419
		cls_node = node;
8420
	    }
8421

8422
	  /* Check the limitation of the open subexpression.  */
8423
	  /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
8424
	  if (ops_node >= 0)
8425
	    {
8426
	      err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
8427
					   candidates);
8428
	      if (BE (err != REG_NOERROR, 0))
8429
		return err;
8430
	    }
8431

8432
	  /* Check the limitation of the close subexpression.  */
8433
	  if (cls_node >= 0)
8434
	    for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
8435
	      {
8436
		int node = dest_nodes->elems[node_idx];
8437
		if (!re_node_set_contains (dfa->inveclosures + node,
8438
					   cls_node)
8439
		    && !re_node_set_contains (dfa->eclosures + node,
8440
					      cls_node))
8441
		  {
8442
		    /* It is against this limitation.
8443
		       Remove it form the current sifted state.  */
8444
		    err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
8445
						 candidates);
8446
		    if (BE (err != REG_NOERROR, 0))
8447
		      return err;
8448
		    --node_idx;
8449
		  }
8450
	      }
8451
	}
8452
      else /* (ent->subexp_to != str_idx)  */
8453
	{
8454
	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
8455
	    {
8456
	      int node = dest_nodes->elems[node_idx];
8457
	      re_token_type_t type = dfa->nodes[node].type;
8458
	      if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
8459
		{
8460
		  if (subexp_idx != dfa->nodes[node].opr.idx)
8461
		    continue;
8462
		  /* It is against this limitation.
8463
		     Remove it form the current sifted state.  */
8464
		  err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
8465
					       candidates);
8466
		  if (BE (err != REG_NOERROR, 0))
8467
		    return err;
8468
		}
8469
	    }
8470
	}
8471
    }
8472
  return REG_NOERROR;
8473
}
8474

8475
static reg_errcode_t
8476
internal_function
8477
sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
8478
		   int str_idx, const re_node_set *candidates)
8479
{
8480
  const re_dfa_t *const dfa = mctx->dfa;
8481
  reg_errcode_t err;
8482
  int node_idx, node;
8483
  re_sift_context_t local_sctx;
8484
  int first_idx = search_cur_bkref_entry (mctx, str_idx);
8485

8486
  if (first_idx == -1)
8487
    return REG_NOERROR;
8488

8489
  local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
8490

8491
  for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
8492
    {
8493
      int enabled_idx;
8494
      re_token_type_t type;
8495
      struct re_backref_cache_entry *entry;
8496
      node = candidates->elems[node_idx];
8497
      type = dfa->nodes[node].type;
8498
      /* Avoid infinite loop for the REs like "()\1+".  */
8499
      if (node == sctx->last_node && str_idx == sctx->last_str_idx)
8500
	continue;
8501
      if (type != OP_BACK_REF)
8502
	continue;
8503

8504
      entry = mctx->bkref_ents + first_idx;
8505
      enabled_idx = first_idx;
8506
      do
8507
	{
8508
	  int subexp_len;
8509
	  int to_idx;
8510
	  int dst_node;
8511
	  int ret;
8512
	  re_dfastate_t *cur_state;
8513

8514
	  if (entry->node != node)
8515
	    continue;
8516
	  subexp_len = entry->subexp_to - entry->subexp_from;
8517
	  to_idx = str_idx + subexp_len;
8518
	  dst_node = (subexp_len ? dfa->nexts[node]
8519
		      : dfa->edests[node].elems[0]);
8520

8521
	  if (to_idx > sctx->last_str_idx
8522
	      || sctx->sifted_states[to_idx] == NULL
8523
	      || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
8524
	      || check_dst_limits (mctx, &sctx->limits, node,
8525
				   str_idx, dst_node, to_idx))
8526
	    continue;
8527

8528
	  if (local_sctx.sifted_states == NULL)
8529
	    {
8530
	      local_sctx = *sctx;
8531
	      err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
8532
	      if (BE (err != REG_NOERROR, 0))
8533
		goto free_return;
8534
	    }
8535
	  local_sctx.last_node = node;
8536
	  local_sctx.last_str_idx = str_idx;
8537
	  ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
8538
	  if (BE (ret < 0, 0))
8539
	    {
8540
	      err = REG_ESPACE;
8541
	      goto free_return;
8542
	    }
8543
	  cur_state = local_sctx.sifted_states[str_idx];
8544
	  err = sift_states_backward (mctx, &local_sctx);
8545
	  if (BE (err != REG_NOERROR, 0))
8546
	    goto free_return;
8547
	  if (sctx->limited_states != NULL)
8548
	    {
8549
	      err = merge_state_array (dfa, sctx->limited_states,
8550
				       local_sctx.sifted_states,
8551
				       str_idx + 1);
8552
	      if (BE (err != REG_NOERROR, 0))
8553
		goto free_return;
8554
	    }
8555
	  local_sctx.sifted_states[str_idx] = cur_state;
8556
	  re_node_set_remove (&local_sctx.limits, enabled_idx);
8557

8558
	  /* mctx->bkref_ents may have changed, reload the pointer.  */
8559
          entry = mctx->bkref_ents + enabled_idx;
8560
	}
8561
      while (enabled_idx++, entry++->more);
8562
    }
8563
  err = REG_NOERROR;
8564
 free_return:
8565
  if (local_sctx.sifted_states != NULL)
8566
    {
8567
      re_node_set_free (&local_sctx.limits);
8568
    }
8569

8570
  return err;
8571
}
8572

8573

8574
#ifdef RE_ENABLE_I18N
8575
static int
8576
internal_function
8577
sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
8578
		     int node_idx, int str_idx, int max_str_idx)
8579
{
8580
  const re_dfa_t *const dfa = mctx->dfa;
8581
  int naccepted;
8582
  /* Check the node can accept `multi byte'.  */
8583
  naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
8584
  if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
8585
      !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
8586
			    dfa->nexts[node_idx]))
8587
    /* The node can't accept the `multi byte', or the
8588
       destination was already thrown away, then the node
8589
       could't accept the current input `multi byte'.   */
8590
    naccepted = 0;
8591
  /* Otherwise, it is sure that the node could accept
8592
     `naccepted' bytes input.  */
8593
  return naccepted;
8594
}
8595
#endif /* RE_ENABLE_I18N */
8596

8597

8598
/* Functions for state transition.  */
8599

8600
/* Return the next state to which the current state STATE will transit by
8601
   accepting the current input byte, and update STATE_LOG if necessary.
8602
   If STATE can accept a multibyte char/collating element/back reference
8603
   update the destination of STATE_LOG.  */
8604

8605
static re_dfastate_t *
8606
internal_function
8607
transit_state (reg_errcode_t *err, re_match_context_t *mctx,
8608
	       re_dfastate_t *state)
8609
{
8610
  re_dfastate_t **trtable;
8611
  unsigned char ch;
8612

8613
#ifdef RE_ENABLE_I18N
8614
  /* If the current state can accept multibyte.  */
8615
  if (BE (state->accept_mb, 0))
8616
    {
8617
      *err = transit_state_mb (mctx, state);
8618
      if (BE (*err != REG_NOERROR, 0))
8619
	return NULL;
8620
    }
8621
#endif /* RE_ENABLE_I18N */
8622

8623
  /* Then decide the next state with the single byte.  */
8624
#if 0
8625
  if (0)
8626
    /* don't use transition table  */
8627
    return transit_state_sb (err, mctx, state);
8628
#endif
8629

8630
  /* Use transition table  */
8631
  ch = re_string_fetch_byte (&mctx->input);
8632
  for (;;)
8633
    {
8634
      trtable = state->trtable;
8635
      if (BE (trtable != NULL, 1))
8636
	return trtable[ch];
8637

8638
      trtable = state->word_trtable;
8639
      if (BE (trtable != NULL, 1))
8640
        {
8641
	  unsigned int context;
8642
	  context
8643
	    = re_string_context_at (&mctx->input,
8644
				    re_string_cur_idx (&mctx->input) - 1,
8645
				    mctx->eflags);
8646
	  if (IS_WORD_CONTEXT (context))
8647
	    return trtable[ch + SBC_MAX];
8648
	  else
8649
	    return trtable[ch];
8650
	}
8651

8652
      if (!build_trtable (mctx->dfa, state))
8653
	{
8654
	  *err = REG_ESPACE;
8655
	  return NULL;
8656
	}
8657

8658
      /* Retry, we now have a transition table.  */
8659
    }
8660
}
8661

8662
/* Update the state_log if we need */
8663
re_dfastate_t *
8664
internal_function
8665
merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
8666
		      re_dfastate_t *next_state)
8667
{
8668
  const re_dfa_t *const dfa = mctx->dfa;
8669
  int cur_idx = re_string_cur_idx (&mctx->input);
8670

8671
  if (cur_idx > mctx->state_log_top)
8672
    {
8673
      mctx->state_log[cur_idx] = next_state;
8674
      mctx->state_log_top = cur_idx;
8675
    }
8676
  else if (mctx->state_log[cur_idx] == 0)
8677
    {
8678
      mctx->state_log[cur_idx] = next_state;
8679
    }
8680
  else
8681
    {
8682
      re_dfastate_t *pstate;
8683
      unsigned int context;
8684
      re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
8685
      /* If (state_log[cur_idx] != 0), it implies that cur_idx is
8686
         the destination of a multibyte char/collating element/
8687
         back reference.  Then the next state is the union set of
8688
         these destinations and the results of the transition table.  */
8689
      pstate = mctx->state_log[cur_idx];
8690
      log_nodes = pstate->entrance_nodes;
8691
      if (next_state != NULL)
8692
        {
8693
          table_nodes = next_state->entrance_nodes;
8694
          *err = re_node_set_init_union (&next_nodes, table_nodes,
8695
					     log_nodes);
8696
          if (BE (*err != REG_NOERROR, 0))
8697
	    return NULL;
8698
        }
8699
      else
8700
        next_nodes = *log_nodes;
8701
      /* Note: We already add the nodes of the initial state,
8702
	 then we don't need to add them here.  */
8703

8704
      context = re_string_context_at (&mctx->input,
8705
				      re_string_cur_idx (&mctx->input) - 1,
8706
				      mctx->eflags);
8707
      next_state = mctx->state_log[cur_idx]
8708
        = re_acquire_state_context (err, dfa, &next_nodes, context);
8709
      /* We don't need to check errors here, since the return value of
8710
         this function is next_state and ERR is already set.  */
8711

8712
      if (table_nodes != NULL)
8713
        re_node_set_free (&next_nodes);
8714
    }
8715

8716
  if (BE (dfa->nbackref, 0) && next_state != NULL)
8717
    {
8718
      /* Check OP_OPEN_SUBEXP in the current state in case that we use them
8719
	 later.  We must check them here, since the back references in the
8720
	 next state might use them.  */
8721
      *err = check_subexp_matching_top (mctx, &next_state->nodes,
8722
					cur_idx);
8723
      if (BE (*err != REG_NOERROR, 0))
8724
	return NULL;
8725

8726
      /* If the next state has back references.  */
8727
      if (next_state->has_backref)
8728
	{
8729
	  *err = transit_state_bkref (mctx, &next_state->nodes);
8730
	  if (BE (*err != REG_NOERROR, 0))
8731
	    return NULL;
8732
	  next_state = mctx->state_log[cur_idx];
8733
	}
8734
    }
8735

8736
  return next_state;
8737
}
8738

8739
/* Skip bytes in the input that correspond to part of a
8740
   multi-byte match, then look in the log for a state
8741
   from which to restart matching.  */
8742
re_dfastate_t *
8743
internal_function
8744
find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
8745
{
8746
  re_dfastate_t *cur_state;
8747
  do
8748
    {
8749
      int max = mctx->state_log_top;
8750
      int cur_str_idx = re_string_cur_idx (&mctx->input);
8751

8752
      do
8753
	{
8754
          if (++cur_str_idx > max)
8755
            return NULL;
8756
          re_string_skip_bytes (&mctx->input, 1);
8757
	}
8758
      while (mctx->state_log[cur_str_idx] == NULL);
8759

8760
      cur_state = merge_state_with_log (err, mctx, NULL);
8761
    }
8762
  while (*err == REG_NOERROR && cur_state == NULL);
8763
  return cur_state;
8764
}
8765

8766
/* Helper functions for transit_state.  */
8767

8768
/* From the node set CUR_NODES, pick up the nodes whose types are
8769
   OP_OPEN_SUBEXP and which have corresponding back references in the regular
8770
   expression. And register them to use them later for evaluating the
8771
   correspoding back references.  */
8772

8773
static reg_errcode_t
8774
internal_function
8775
check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
8776
			   int str_idx)
8777
{
8778
  const re_dfa_t *const dfa = mctx->dfa;
8779
  int node_idx;
8780
  reg_errcode_t err;
8781

8782
  /* TODO: This isn't efficient.
8783
	   Because there might be more than one nodes whose types are
8784
	   OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
8785
	   nodes.
8786
	   E.g. RE: (a){2}  */
8787
  for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
8788
    {
8789
      int node = cur_nodes->elems[node_idx];
8790
      if (dfa->nodes[node].type == OP_OPEN_SUBEXP
8791
	  && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
8792
	  && (dfa->used_bkref_map
8793
	      & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
8794
	{
8795
	  err = match_ctx_add_subtop (mctx, node, str_idx);
8796
	  if (BE (err != REG_NOERROR, 0))
8797
	    return err;
8798
	}
8799
    }
8800
  return REG_NOERROR;
8801
}
8802

8803
#if 0
8804
/* Return the next state to which the current state STATE will transit by
8805
   accepting the current input byte.  */
8806

8807
static re_dfastate_t *
8808
transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
8809
		  re_dfastate_t *state)
8810
{
8811
  const re_dfa_t *const dfa = mctx->dfa;
8812
  re_node_set next_nodes;
8813
  re_dfastate_t *next_state;
8814
  int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
8815
  unsigned int context;
8816

8817
  *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
8818
  if (BE (*err != REG_NOERROR, 0))
8819
    return NULL;
8820
  for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
8821
    {
8822
      int cur_node = state->nodes.elems[node_cnt];
8823
      if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
8824
	{
8825
	  *err = re_node_set_merge (&next_nodes,
8826
				    dfa->eclosures + dfa->nexts[cur_node]);
8827
	  if (BE (*err != REG_NOERROR, 0))
8828
	    {
8829
	      re_node_set_free (&next_nodes);
8830
	      return NULL;
8831
	    }
8832
	}
8833
    }
8834
  context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
8835
  next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
8836
  /* We don't need to check errors here, since the return value of
8837
     this function is next_state and ERR is already set.  */
8838

8839
  re_node_set_free (&next_nodes);
8840
  re_string_skip_bytes (&mctx->input, 1);
8841
  return next_state;
8842
}
8843
#endif
8844

8845
#ifdef RE_ENABLE_I18N
8846
static reg_errcode_t
8847
internal_function
8848
transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
8849
{
8850
  const re_dfa_t *const dfa = mctx->dfa;
8851
  reg_errcode_t err;
8852
  int i;
8853

8854
  for (i = 0; i < pstate->nodes.nelem; ++i)
8855
    {
8856
      re_node_set dest_nodes, *new_nodes;
8857
      int cur_node_idx = pstate->nodes.elems[i];
8858
      int naccepted, dest_idx;
8859
      unsigned int context;
8860
      re_dfastate_t *dest_state;
8861

8862
      if (!dfa->nodes[cur_node_idx].accept_mb)
8863
        continue;
8864

8865
      if (dfa->nodes[cur_node_idx].constraint)
8866
	{
8867
	  context = re_string_context_at (&mctx->input,
8868
					  re_string_cur_idx (&mctx->input),
8869
					  mctx->eflags);
8870
	  if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
8871
					   context))
8872
	    continue;
8873
	}
8874

8875
      /* How many bytes the node can accept?  */
8876
      naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
8877
					   re_string_cur_idx (&mctx->input));
8878
      if (naccepted == 0)
8879
	continue;
8880

8881
      /* The node can accepts `naccepted' bytes.  */
8882
      dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
8883
      mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
8884
			       : mctx->max_mb_elem_len);
8885
      err = clean_state_log_if_needed (mctx, dest_idx);
8886
      if (BE (err != REG_NOERROR, 0))
8887
	return err;
8888
#ifdef DEBUG
8889
      assert (dfa->nexts[cur_node_idx] != -1);
8890
#endif
8891
      new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
8892

8893
      dest_state = mctx->state_log[dest_idx];
8894
      if (dest_state == NULL)
8895
	dest_nodes = *new_nodes;
8896
      else
8897
	{
8898
	  err = re_node_set_init_union (&dest_nodes,
8899
					dest_state->entrance_nodes, new_nodes);
8900
	  if (BE (err != REG_NOERROR, 0))
8901
	    return err;
8902
	}
8903
      context = re_string_context_at (&mctx->input, dest_idx - 1,
8904
				      mctx->eflags);
8905
      mctx->state_log[dest_idx]
8906
	= re_acquire_state_context (&err, dfa, &dest_nodes, context);
8907
      if (dest_state != NULL)
8908
	re_node_set_free (&dest_nodes);
8909
      if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
8910
	return err;
8911
    }
8912
  return REG_NOERROR;
8913
}
8914
#endif /* RE_ENABLE_I18N */
8915

8916
static reg_errcode_t
8917
internal_function
8918
transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
8919
{
8920
  const re_dfa_t *const dfa = mctx->dfa;
8921
  reg_errcode_t err;
8922
  int i;
8923
  int cur_str_idx = re_string_cur_idx (&mctx->input);
8924

8925
  for (i = 0; i < nodes->nelem; ++i)
8926
    {
8927
      int dest_str_idx, prev_nelem, bkc_idx;
8928
      int node_idx = nodes->elems[i];
8929
      unsigned int context;
8930
      const re_token_t *node = dfa->nodes + node_idx;
8931
      re_node_set *new_dest_nodes;
8932

8933
      /* Check whether `node' is a backreference or not.  */
8934
      if (node->type != OP_BACK_REF)
8935
	continue;
8936

8937
      if (node->constraint)
8938
	{
8939
	  context = re_string_context_at (&mctx->input, cur_str_idx,
8940
					  mctx->eflags);
8941
	  if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
8942
	    continue;
8943
	}
8944

8945
      /* `node' is a backreference.
8946
	 Check the substring which the substring matched.  */
8947
      bkc_idx = mctx->nbkref_ents;
8948
      err = get_subexp (mctx, node_idx, cur_str_idx);
8949
      if (BE (err != REG_NOERROR, 0))
8950
	goto free_return;
8951

8952
      /* And add the epsilon closures (which is `new_dest_nodes') of
8953
	 the backreference to appropriate state_log.  */
8954
#ifdef DEBUG
8955
      assert (dfa->nexts[node_idx] != -1);
8956
#endif
8957
      for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
8958
	{
8959
	  int subexp_len;
8960
	  re_dfastate_t *dest_state;
8961
	  struct re_backref_cache_entry *bkref_ent;
8962
	  bkref_ent = mctx->bkref_ents + bkc_idx;
8963
	  if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
8964
	    continue;
8965
	  subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
8966
	  new_dest_nodes = (subexp_len == 0
8967
			    ? dfa->eclosures + dfa->edests[node_idx].elems[0]
8968
			    : dfa->eclosures + dfa->nexts[node_idx]);
8969
	  dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
8970
			  - bkref_ent->subexp_from);
8971
	  context = re_string_context_at (&mctx->input, dest_str_idx - 1,
8972
					  mctx->eflags);
8973
	  dest_state = mctx->state_log[dest_str_idx];
8974
	  prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
8975
			: mctx->state_log[cur_str_idx]->nodes.nelem);
8976
	  /* Add `new_dest_node' to state_log.  */
8977
	  if (dest_state == NULL)
8978
	    {
8979
	      mctx->state_log[dest_str_idx]
8980
		= re_acquire_state_context (&err, dfa, new_dest_nodes,
8981
					    context);
8982
	      if (BE (mctx->state_log[dest_str_idx] == NULL
8983
		      && err != REG_NOERROR, 0))
8984
		goto free_return;
8985
	    }
8986
	  else
8987
	    {
8988
	      re_node_set dest_nodes;
8989
	      err = re_node_set_init_union (&dest_nodes,
8990
					    dest_state->entrance_nodes,
8991
					    new_dest_nodes);
8992
	      if (BE (err != REG_NOERROR, 0))
8993
		{
8994
		  re_node_set_free (&dest_nodes);
8995
		  goto free_return;
8996
		}
8997
	      mctx->state_log[dest_str_idx]
8998
		= re_acquire_state_context (&err, dfa, &dest_nodes, context);
8999
	      re_node_set_free (&dest_nodes);
9000
	      if (BE (mctx->state_log[dest_str_idx] == NULL
9001
		      && err != REG_NOERROR, 0))
9002
		goto free_return;
9003
	    }
9004
	  /* We need to check recursively if the backreference can epsilon
9005
	     transit.  */
9006
	  if (subexp_len == 0
9007
	      && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
9008
	    {
9009
	      err = check_subexp_matching_top (mctx, new_dest_nodes,
9010
					       cur_str_idx);
9011
	      if (BE (err != REG_NOERROR, 0))
9012
		goto free_return;
9013
	      err = transit_state_bkref (mctx, new_dest_nodes);
9014
	      if (BE (err != REG_NOERROR, 0))
9015
		goto free_return;
9016
	    }
9017
	}
9018
    }
9019
  err = REG_NOERROR;
9020
 free_return:
9021
  return err;
9022
}
9023

9024
/* Enumerate all the candidates which the backreference BKREF_NODE can match
9025
   at BKREF_STR_IDX, and register them by match_ctx_add_entry().
9026
   Note that we might collect inappropriate candidates here.
9027
   However, the cost of checking them strictly here is too high, then we
9028
   delay these checking for prune_impossible_nodes().  */
9029

9030
static reg_errcode_t
9031
internal_function
9032
get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
9033
{
9034
  const re_dfa_t *const dfa = mctx->dfa;
9035
  int subexp_num, sub_top_idx;
9036
  const char *buf = (const char *) re_string_get_buffer (&mctx->input);
9037
  /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
9038
  int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
9039
  if (cache_idx != -1)
9040
    {
9041
      const struct re_backref_cache_entry *entry
9042
	= mctx->bkref_ents + cache_idx;
9043
      do
9044
        if (entry->node == bkref_node)
9045
	  return REG_NOERROR; /* We already checked it.  */
9046
      while (entry++->more);
9047
    }
9048

9049
  subexp_num = dfa->nodes[bkref_node].opr.idx;
9050

9051
  /* For each sub expression  */
9052
  for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
9053
    {
9054
      reg_errcode_t err;
9055
      re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
9056
      re_sub_match_last_t *sub_last;
9057
      int sub_last_idx, sl_str, bkref_str_off;
9058

9059
      if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
9060
	continue; /* It isn't related.  */
9061

9062
      sl_str = sub_top->str_idx;
9063
      bkref_str_off = bkref_str_idx;
9064
      /* At first, check the last node of sub expressions we already
9065
	 evaluated.  */
9066
      for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
9067
	{
9068
	  int sl_str_diff;
9069
	  sub_last = sub_top->lasts[sub_last_idx];
9070
	  sl_str_diff = sub_last->str_idx - sl_str;
9071
	  /* The matched string by the sub expression match with the substring
9072
	     at the back reference?  */
9073
	  if (sl_str_diff > 0)
9074
	    {
9075
	      if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
9076
		{
9077
		  /* Not enough chars for a successful match.  */
9078
		  if (bkref_str_off + sl_str_diff > mctx->input.len)
9079
		    break;
9080

9081
		  err = clean_state_log_if_needed (mctx,
9082
						   bkref_str_off
9083
						   + sl_str_diff);
9084
		  if (BE (err != REG_NOERROR, 0))
9085
		    return err;
9086
		  buf = (const char *) re_string_get_buffer (&mctx->input);
9087
		}
9088
	      if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
9089
		/* We don't need to search this sub expression any more.  */
9090
		break;
9091
	    }
9092
	  bkref_str_off += sl_str_diff;
9093
	  sl_str += sl_str_diff;
9094
	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
9095
				bkref_str_idx);
9096

9097
	  /* Reload buf, since the preceding call might have reallocated
9098
	     the buffer.  */
9099
	  buf = (const char *) re_string_get_buffer (&mctx->input);
9100

9101
	  if (err == REG_NOMATCH)
9102
	    continue;
9103
	  if (BE (err != REG_NOERROR, 0))
9104
	    return err;
9105
	}
9106

9107
      if (sub_last_idx < sub_top->nlasts)
9108
	continue;
9109
      if (sub_last_idx > 0)
9110
	++sl_str;
9111
      /* Then, search for the other last nodes of the sub expression.  */
9112
      for (; sl_str <= bkref_str_idx; ++sl_str)
9113
	{
9114
	  int cls_node, sl_str_off;
9115
	  const re_node_set *nodes;
9116
	  sl_str_off = sl_str - sub_top->str_idx;
9117
	  /* The matched string by the sub expression match with the substring
9118
	     at the back reference?  */
9119
	  if (sl_str_off > 0)
9120
	    {
9121
	      if (BE (bkref_str_off >= mctx->input.valid_len, 0))
9122
		{
9123
		  /* If we are at the end of the input, we cannot match.  */
9124
		  if (bkref_str_off >= mctx->input.len)
9125
		    break;
9126

9127
		  err = extend_buffers (mctx);
9128
		  if (BE (err != REG_NOERROR, 0))
9129
		    return err;
9130

9131
		  buf = (const char *) re_string_get_buffer (&mctx->input);
9132
		}
9133
	      if (buf [bkref_str_off++] != buf[sl_str - 1])
9134
		break; /* We don't need to search this sub expression
9135
			  any more.  */
9136
	    }
9137
	  if (mctx->state_log[sl_str] == NULL)
9138
	    continue;
9139
	  /* Does this state have a ')' of the sub expression?  */
9140
	  nodes = &mctx->state_log[sl_str]->nodes;
9141
	  cls_node = find_subexp_node (dfa, nodes, subexp_num,
9142
				       OP_CLOSE_SUBEXP);
9143
	  if (cls_node == -1)
9144
	    continue; /* No.  */
9145
	  if (sub_top->path == NULL)
9146
	    {
9147
	      sub_top->path = calloc (sizeof (state_array_t),
9148
				      sl_str - sub_top->str_idx + 1);
9149
	      if (sub_top->path == NULL)
9150
		return REG_ESPACE;
9151
	    }
9152
	  /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
9153
	     in the current context?  */
9154
	  err = check_arrival (mctx, sub_top->path, sub_top->node,
9155
			       sub_top->str_idx, cls_node, sl_str,
9156
			       OP_CLOSE_SUBEXP);
9157
	  if (err == REG_NOMATCH)
9158
	      continue;
9159
	  if (BE (err != REG_NOERROR, 0))
9160
	      return err;
9161
	  sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
9162
	  if (BE (sub_last == NULL, 0))
9163
	    return REG_ESPACE;
9164
	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
9165
				bkref_str_idx);
9166
	  if (err == REG_NOMATCH)
9167
	    continue;
9168
	}
9169
    }
9170
  return REG_NOERROR;
9171
}
9172

9173
/* Helper functions for get_subexp().  */
9174

9175
/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
9176
   If it can arrive, register the sub expression expressed with SUB_TOP
9177
   and SUB_LAST.  */
9178

9179
static reg_errcode_t
9180
internal_function
9181
get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
9182
		re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
9183
{
9184
  reg_errcode_t err;
9185
  int to_idx;
9186
  /* Can the subexpression arrive the back reference?  */
9187
  err = check_arrival (mctx, &sub_last->path, sub_last->node,
9188
		       sub_last->str_idx, bkref_node, bkref_str,
9189
		       OP_OPEN_SUBEXP);
9190
  if (err != REG_NOERROR)
9191
    return err;
9192
  err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
9193
			     sub_last->str_idx);
9194
  if (BE (err != REG_NOERROR, 0))
9195
    return err;
9196
  to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
9197
  return clean_state_log_if_needed (mctx, to_idx);
9198
}
9199

9200
/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
9201
   Search '(' if FL_OPEN, or search ')' otherwise.
9202
   TODO: This function isn't efficient...
9203
	 Because there might be more than one nodes whose types are
9204
	 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
9205
	 nodes.
9206
	 E.g. RE: (a){2}  */
9207

9208
static int
9209
internal_function
9210
find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
9211
		  int subexp_idx, int type)
9212
{
9213
  int cls_idx;
9214
  for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
9215
    {
9216
      int cls_node = nodes->elems[cls_idx];
9217
      const re_token_t *node = dfa->nodes + cls_node;
9218
      if (node->type == type
9219
	  && node->opr.idx == subexp_idx)
9220
	return cls_node;
9221
    }
9222
  return -1;
9223
}
9224

9225
/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
9226
   LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
9227
   heavily reused.
9228
   Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
9229

9230
static reg_errcode_t
9231
internal_function
9232
check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
9233
	       int top_str, int last_node, int last_str, int type)
9234
{
9235
  const re_dfa_t *const dfa = mctx->dfa;
9236
  reg_errcode_t err = REG_NOERROR;
9237
  int subexp_num, backup_cur_idx, str_idx, null_cnt;
9238
  re_dfastate_t *cur_state = NULL;
9239
  re_node_set *cur_nodes, next_nodes;
9240
  re_dfastate_t **backup_state_log;
9241
  unsigned int context;
9242

9243
  subexp_num = dfa->nodes[top_node].opr.idx;
9244
  /* Extend the buffer if we need.  */
9245
  if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
9246
    {
9247
      re_dfastate_t **new_array;
9248
      int old_alloc = path->alloc;
9249
      path->alloc += last_str + mctx->max_mb_elem_len + 1;
9250
      new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
9251
      if (BE (new_array == NULL, 0))
9252
	{
9253
	  path->alloc = old_alloc;
9254
	  return REG_ESPACE;
9255
	}
9256
      path->array = new_array;
9257
      memset (new_array + old_alloc, '\0',
9258
	      sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
9259
    }
9260

9261
  str_idx = path->next_idx ? path->next_idx : top_str;
9262

9263
  /* Temporary modify MCTX.  */
9264
  backup_state_log = mctx->state_log;
9265
  backup_cur_idx = mctx->input.cur_idx;
9266
  mctx->state_log = path->array;
9267
  mctx->input.cur_idx = str_idx;
9268

9269
  /* Setup initial node set.  */
9270
  context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
9271
  if (str_idx == top_str)
9272
    {
9273
      err = re_node_set_init_1 (&next_nodes, top_node);
9274
      if (BE (err != REG_NOERROR, 0))
9275
	return err;
9276
      err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
9277
      if (BE (err != REG_NOERROR, 0))
9278
	{
9279
	  re_node_set_free (&next_nodes);
9280
	  return err;
9281
	}
9282
    }
9283
  else
9284
    {
9285
      cur_state = mctx->state_log[str_idx];
9286
      if (cur_state && cur_state->has_backref)
9287
	{
9288
	  err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
9289
	  if (BE (err != REG_NOERROR, 0))
9290
	    return err;
9291
	}
9292
      else
9293
	re_node_set_init_empty (&next_nodes);
9294
    }
9295
  if (str_idx == top_str || (cur_state && cur_state->has_backref))
9296
    {
9297
      if (next_nodes.nelem)
9298
	{
9299
	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
9300
				    subexp_num, type);
9301
	  if (BE (err != REG_NOERROR, 0))
9302
	    {
9303
	      re_node_set_free (&next_nodes);
9304
	      return err;
9305
	    }
9306
	}
9307
      cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
9308
      if (BE (cur_state == NULL && err != REG_NOERROR, 0))
9309
	{
9310
	  re_node_set_free (&next_nodes);
9311
	  return err;
9312
	}
9313
      mctx->state_log[str_idx] = cur_state;
9314
    }
9315

9316
  for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
9317
    {
9318
      re_node_set_empty (&next_nodes);
9319
      if (mctx->state_log[str_idx + 1])
9320
	{
9321
	  err = re_node_set_merge (&next_nodes,
9322
				   &mctx->state_log[str_idx + 1]->nodes);
9323
	  if (BE (err != REG_NOERROR, 0))
9324
	    {
9325
	      re_node_set_free (&next_nodes);
9326
	      return err;
9327
	    }
9328
	}
9329
      if (cur_state)
9330
	{
9331
	  err = check_arrival_add_next_nodes (mctx, str_idx,
9332
					      &cur_state->non_eps_nodes,
9333
					      &next_nodes);
9334
	  if (BE (err != REG_NOERROR, 0))
9335
	    {
9336
	      re_node_set_free (&next_nodes);
9337
	      return err;
9338
	    }
9339
	}
9340
      ++str_idx;
9341
      if (next_nodes.nelem)
9342
	{
9343
	  err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
9344
	  if (BE (err != REG_NOERROR, 0))
9345
	    {
9346
	      re_node_set_free (&next_nodes);
9347
	      return err;
9348
	    }
9349
	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
9350
				    subexp_num, type);
9351
	  if (BE (err != REG_NOERROR, 0))
9352
	    {
9353
	      re_node_set_free (&next_nodes);
9354
	      return err;
9355
	    }
9356
	}
9357
      context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
9358
      cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
9359
      if (BE (cur_state == NULL && err != REG_NOERROR, 0))
9360
	{
9361
	  re_node_set_free (&next_nodes);
9362
	  return err;
9363
	}
9364
      mctx->state_log[str_idx] = cur_state;
9365
      null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
9366
    }
9367
  re_node_set_free (&next_nodes);
9368
  cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
9369
	       : &mctx->state_log[last_str]->nodes);
9370
  path->next_idx = str_idx;
9371

9372
  /* Fix MCTX.  */
9373
  mctx->state_log = backup_state_log;
9374
  mctx->input.cur_idx = backup_cur_idx;
9375

9376
  /* Then check the current node set has the node LAST_NODE.  */
9377
  if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
9378
    return REG_NOERROR;
9379

9380
  return REG_NOMATCH;
9381
}
9382

9383
/* Helper functions for check_arrival.  */
9384

9385
/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
9386
   to NEXT_NODES.
9387
   TODO: This function is similar to the functions transit_state*(),
9388
	 however this function has many additional works.
9389
	 Can't we unify them?  */
9390

9391
static reg_errcode_t
9392
internal_function
9393
check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
9394
			      re_node_set *cur_nodes, re_node_set *next_nodes)
9395
{
9396
  const re_dfa_t *const dfa = mctx->dfa;
9397
  int result;
9398
  int cur_idx;
9399
  reg_errcode_t err = REG_NOERROR;
9400
  re_node_set union_set;
9401
  re_node_set_init_empty (&union_set);
9402
  for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
9403
    {
9404
      int naccepted = 0;
9405
      int cur_node = cur_nodes->elems[cur_idx];
9406
#ifdef DEBUG
9407
      re_token_type_t type = dfa->nodes[cur_node].type;
9408
      assert (!IS_EPSILON_NODE (type));
9409
#endif
9410
#ifdef RE_ENABLE_I18N
9411
      /* If the node may accept `multi byte'.  */
9412
      if (dfa->nodes[cur_node].accept_mb)
9413
	{
9414
	  naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
9415
					       str_idx);
9416
	  if (naccepted > 1)
9417
	    {
9418
	      re_dfastate_t *dest_state;
9419
	      int next_node = dfa->nexts[cur_node];
9420
	      int next_idx = str_idx + naccepted;
9421
	      dest_state = mctx->state_log[next_idx];
9422
	      re_node_set_empty (&union_set);
9423
	      if (dest_state)
9424
		{
9425
		  err = re_node_set_merge (&union_set, &dest_state->nodes);
9426
		  if (BE (err != REG_NOERROR, 0))
9427
		    {
9428
		      re_node_set_free (&union_set);
9429
		      return err;
9430
		    }
9431
		}
9432
	      result = re_node_set_insert (&union_set, next_node);
9433
	      if (BE (result < 0, 0))
9434
		{
9435
		  re_node_set_free (&union_set);
9436
		  return REG_ESPACE;
9437
		}
9438
	      mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
9439
							    &union_set);
9440
	      if (BE (mctx->state_log[next_idx] == NULL
9441
		      && err != REG_NOERROR, 0))
9442
		{
9443
		  re_node_set_free (&union_set);
9444
		  return err;
9445
		}
9446
	    }
9447
	}
9448
#endif /* RE_ENABLE_I18N */
9449
      if (naccepted
9450
	  || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
9451
	{
9452
	  result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
9453
	  if (BE (result < 0, 0))
9454
	    {
9455
	      re_node_set_free (&union_set);
9456
	      return REG_ESPACE;
9457
	    }
9458
	}
9459
    }
9460
  re_node_set_free (&union_set);
9461
  return REG_NOERROR;
9462
}
9463

9464
/* For all the nodes in CUR_NODES, add the epsilon closures of them to
9465
   CUR_NODES, however exclude the nodes which are:
9466
    - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
9467
    - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
9468
*/
9469

9470
static reg_errcode_t
9471
internal_function
9472
check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
9473
			  int ex_subexp, int type)
9474
{
9475
  reg_errcode_t err;
9476
  int idx, outside_node;
9477
  re_node_set new_nodes;
9478
#ifdef DEBUG
9479
  assert (cur_nodes->nelem);
9480
#endif
9481
  err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
9482
  if (BE (err != REG_NOERROR, 0))
9483
    return err;
9484
  /* Create a new node set NEW_NODES with the nodes which are epsilon
9485
     closures of the node in CUR_NODES.  */
9486

9487
  for (idx = 0; idx < cur_nodes->nelem; ++idx)
9488
    {
9489
      int cur_node = cur_nodes->elems[idx];
9490
      const re_node_set *eclosure = dfa->eclosures + cur_node;
9491
      outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
9492
      if (outside_node == -1)
9493
	{
9494
	  /* There are no problematic nodes, just merge them.  */
9495
	  err = re_node_set_merge (&new_nodes, eclosure);
9496
	  if (BE (err != REG_NOERROR, 0))
9497
	    {
9498
	      re_node_set_free (&new_nodes);
9499
	      return err;
9500
	    }
9501
	}
9502
      else
9503
	{
9504
	  /* There are problematic nodes, re-calculate incrementally.  */
9505
	  err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
9506
					      ex_subexp, type);
9507
	  if (BE (err != REG_NOERROR, 0))
9508
	    {
9509
	      re_node_set_free (&new_nodes);
9510
	      return err;
9511
	    }
9512
	}
9513
    }
9514
  re_node_set_free (cur_nodes);
9515
  *cur_nodes = new_nodes;
9516
  return REG_NOERROR;
9517
}
9518

9519
/* Helper function for check_arrival_expand_ecl.
9520
   Check incrementally the epsilon closure of TARGET, and if it isn't
9521
   problematic append it to DST_NODES.  */
9522

9523
static reg_errcode_t
9524
internal_function
9525
check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
9526
			      int target, int ex_subexp, int type)
9527
{
9528
  int cur_node;
9529
  for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
9530
    {
9531
      int err;
9532

9533
      if (dfa->nodes[cur_node].type == type
9534
	  && dfa->nodes[cur_node].opr.idx == ex_subexp)
9535
	{
9536
	  if (type == OP_CLOSE_SUBEXP)
9537
	    {
9538
	      err = re_node_set_insert (dst_nodes, cur_node);
9539
	      if (BE (err == -1, 0))
9540
		return REG_ESPACE;
9541
	    }
9542
	  break;
9543
	}
9544
      err = re_node_set_insert (dst_nodes, cur_node);
9545
      if (BE (err == -1, 0))
9546
	return REG_ESPACE;
9547
      if (dfa->edests[cur_node].nelem == 0)
9548
	break;
9549
      if (dfa->edests[cur_node].nelem == 2)
9550
	{
9551
	  err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
9552
					      dfa->edests[cur_node].elems[1],
9553
					      ex_subexp, type);
9554
	  if (BE (err != REG_NOERROR, 0))
9555
	    return err;
9556
	}
9557
      cur_node = dfa->edests[cur_node].elems[0];
9558
    }
9559
  return REG_NOERROR;
9560
}
9561

9562

9563
/* For all the back references in the current state, calculate the
9564
   destination of the back references by the appropriate entry
9565
   in MCTX->BKREF_ENTS.  */
9566

9567
static reg_errcode_t
9568
internal_function
9569
expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
9570
		    int cur_str, int subexp_num, int type)
9571
{
9572
  const re_dfa_t *const dfa = mctx->dfa;
9573
  reg_errcode_t err;
9574
  int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
9575
  struct re_backref_cache_entry *ent;
9576

9577
  if (cache_idx_start == -1)
9578
    return REG_NOERROR;
9579

9580
 restart:
9581
  ent = mctx->bkref_ents + cache_idx_start;
9582
  do
9583
    {
9584
      int to_idx, next_node;
9585

9586
      /* Is this entry ENT is appropriate?  */
9587
      if (!re_node_set_contains (cur_nodes, ent->node))
9588
	continue; /* No.  */
9589

9590
      to_idx = cur_str + ent->subexp_to - ent->subexp_from;
9591
      /* Calculate the destination of the back reference, and append it
9592
	 to MCTX->STATE_LOG.  */
9593
      if (to_idx == cur_str)
9594
	{
9595
	  /* The backreference did epsilon transit, we must re-check all the
9596
	     node in the current state.  */
9597
	  re_node_set new_dests;
9598
	  reg_errcode_t err2, err3;
9599
	  next_node = dfa->edests[ent->node].elems[0];
9600
	  if (re_node_set_contains (cur_nodes, next_node))
9601
	    continue;
9602
	  err = re_node_set_init_1 (&new_dests, next_node);
9603
	  err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
9604
	  err3 = re_node_set_merge (cur_nodes, &new_dests);
9605
	  re_node_set_free (&new_dests);
9606
	  if (BE (err != REG_NOERROR || err2 != REG_NOERROR
9607
		  || err3 != REG_NOERROR, 0))
9608
	    {
9609
	      err = (err != REG_NOERROR ? err
9610
		     : (err2 != REG_NOERROR ? err2 : err3));
9611
	      return err;
9612
	    }
9613
	  /* TODO: It is still inefficient...  */
9614
	  goto restart;
9615
	}
9616
      else
9617
	{
9618
	  re_node_set union_set;
9619
	  next_node = dfa->nexts[ent->node];
9620
	  if (mctx->state_log[to_idx])
9621
	    {
9622
	      int ret;
9623
	      if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
9624
					next_node))
9625
		continue;
9626
	      err = re_node_set_init_copy (&union_set,
9627
					   &mctx->state_log[to_idx]->nodes);
9628
	      ret = re_node_set_insert (&union_set, next_node);
9629
	      if (BE (err != REG_NOERROR || ret < 0, 0))
9630
		{
9631
		  re_node_set_free (&union_set);
9632
		  err = err != REG_NOERROR ? err : REG_ESPACE;
9633
		  return err;
9634
		}
9635
	    }
9636
	  else
9637
	    {
9638
	      err = re_node_set_init_1 (&union_set, next_node);
9639
	      if (BE (err != REG_NOERROR, 0))
9640
		return err;
9641
	    }
9642
	  mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
9643
	  re_node_set_free (&union_set);
9644
	  if (BE (mctx->state_log[to_idx] == NULL
9645
		  && err != REG_NOERROR, 0))
9646
	    return err;
9647
	}
9648
    }
9649
  while (ent++->more);
9650
  return REG_NOERROR;
9651
}
9652

9653
/* Build transition table for the state.
9654
   Return 1 if succeeded, otherwise return 0.  */
9655

9656
static int
9657
internal_function
9658
build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
9659
{
9660
  reg_errcode_t err;
9661
  int i, j, ch, need_word_trtable = 0;
9662
  bitset_word_t elem, mask;
9663
  bool dests_node_malloced = false;
9664
  bool dest_states_malloced = false;
9665
  int ndests; /* Number of the destination states from `state'.  */
9666
  re_dfastate_t **trtable;
9667
  re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
9668
  re_node_set follows, *dests_node;
9669
  bitset_t *dests_ch;
9670
  bitset_t acceptable;
9671

9672
  struct dests_alloc
9673
  {
9674
    re_node_set dests_node[SBC_MAX];
9675
    bitset_t dests_ch[SBC_MAX];
9676
  } *dests_alloc;
9677

9678
  /* We build DFA states which corresponds to the destination nodes
9679
     from `state'.  `dests_node[i]' represents the nodes which i-th
9680
     destination state contains, and `dests_ch[i]' represents the
9681
     characters which i-th destination state accepts.  */
9682
  if (__libc_use_alloca (sizeof (struct dests_alloc)))
9683
    dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
9684
  else
9685
    {
9686
      dests_alloc = re_malloc (struct dests_alloc, 1);
9687
      if (BE (dests_alloc == NULL, 0))
9688
	return 0;
9689
      dests_node_malloced = true;
9690
    }
9691
  dests_node = dests_alloc->dests_node;
9692
  dests_ch = dests_alloc->dests_ch;
9693

9694
  /* Initialize transiton table.  */
9695
  state->word_trtable = state->trtable = NULL;
9696

9697
  /* At first, group all nodes belonging to `state' into several
9698
     destinations.  */
9699
  ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
9700
  if (BE (ndests <= 0, 0))
9701
    {
9702
      if (dests_node_malloced)
9703
	free (dests_alloc);
9704
      /* Return 0 in case of an error, 1 otherwise.  */
9705
      if (ndests == 0)
9706
	{
9707
	  state->trtable = (re_dfastate_t **)
9708
	    calloc (sizeof (re_dfastate_t *), SBC_MAX);
9709
	  return 1;
9710
	}
9711
      return 0;
9712
    }
9713

9714
  err = re_node_set_alloc (&follows, ndests + 1);
9715
  if (BE (err != REG_NOERROR, 0))
9716
    goto out_free;
9717

9718
  if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
9719
			 + ndests * 3 * sizeof (re_dfastate_t *)))
9720
    dest_states = (re_dfastate_t **)
9721
      alloca (ndests * 3 * sizeof (re_dfastate_t *));
9722
  else
9723
    {
9724
      dest_states = (re_dfastate_t **)
9725
	malloc (ndests * 3 * sizeof (re_dfastate_t *));
9726
      if (BE (dest_states == NULL, 0))
9727
	{
9728
out_free:
9729
	  if (dest_states_malloced)
9730
	    free (dest_states);
9731
	  re_node_set_free (&follows);
9732
	  for (i = 0; i < ndests; ++i)
9733
	    re_node_set_free (dests_node + i);
9734
	  if (dests_node_malloced)
9735
	    free (dests_alloc);
9736
	  return 0;
9737
	}
9738
      dest_states_malloced = true;
9739
    }
9740
  dest_states_word = dest_states + ndests;
9741
  dest_states_nl = dest_states_word + ndests;
9742
  bitset_empty (acceptable);
9743

9744
  /* Then build the states for all destinations.  */
9745
  for (i = 0; i < ndests; ++i)
9746
    {
9747
      int next_node;
9748
      re_node_set_empty (&follows);
9749
      /* Merge the follows of this destination states.  */
9750
      for (j = 0; j < dests_node[i].nelem; ++j)
9751
	{
9752
	  next_node = dfa->nexts[dests_node[i].elems[j]];
9753
	  if (next_node != -1)
9754
	    {
9755
	      err = re_node_set_merge (&follows, dfa->eclosures + next_node);
9756
	      if (BE (err != REG_NOERROR, 0))
9757
		goto out_free;
9758
	    }
9759
	}
9760
      dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
9761
      if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
9762
	goto out_free;
9763
      /* If the new state has context constraint,
9764
	 build appropriate states for these contexts.  */
9765
      if (dest_states[i]->has_constraint)
9766
	{
9767
	  dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
9768
							  CONTEXT_WORD);
9769
	  if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
9770
	    goto out_free;
9771

9772
	  if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
9773
	    need_word_trtable = 1;
9774

9775
	  dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
9776
							CONTEXT_NEWLINE);
9777
	  if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
9778
	    goto out_free;
9779
 	}
9780
      else
9781
	{
9782
	  dest_states_word[i] = dest_states[i];
9783
	  dest_states_nl[i] = dest_states[i];
9784
	}
9785
      bitset_merge (acceptable, dests_ch[i]);
9786
    }
9787

9788
  if (!BE (need_word_trtable, 0))
9789
    {
9790
      /* We don't care about whether the following character is a word
9791
	 character, or we are in a single-byte character set so we can
9792
	 discern by looking at the character code: allocate a
9793
	 256-entry transition table.  */
9794
      trtable = state->trtable =
9795
	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
9796
      if (BE (trtable == NULL, 0))
9797
	goto out_free;
9798

9799
      /* For all characters ch...:  */
9800
      for (i = 0; i < BITSET_WORDS; ++i)
9801
	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
9802
	     elem;
9803
	     mask <<= 1, elem >>= 1, ++ch)
9804
	  if (BE (elem & 1, 0))
9805
	    {
9806
	      /* There must be exactly one destination which accepts
9807
		 character ch.  See group_nodes_into_DFAstates.  */
9808
	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
9809
		;
9810

9811
	      /* j-th destination accepts the word character ch.  */
9812
	      if (dfa->word_char[i] & mask)
9813
		trtable[ch] = dest_states_word[j];
9814
	      else
9815
		trtable[ch] = dest_states[j];
9816
	    }
9817
    }
9818
  else
9819
    {
9820
      /* We care about whether the following character is a word
9821
	 character, and we are in a multi-byte character set: discern
9822
	 by looking at the character code: build two 256-entry
9823
	 transition tables, one starting at trtable[0] and one
9824
	 starting at trtable[SBC_MAX].  */
9825
      trtable = state->word_trtable =
9826
	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
9827
      if (BE (trtable == NULL, 0))
9828
	goto out_free;
9829

9830
      /* For all characters ch...:  */
9831
      for (i = 0; i < BITSET_WORDS; ++i)
9832
	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
9833
	     elem;
9834
	     mask <<= 1, elem >>= 1, ++ch)
9835
	  if (BE (elem & 1, 0))
9836
	    {
9837
	      /* There must be exactly one destination which accepts
9838
		 character ch.  See group_nodes_into_DFAstates.  */
9839
	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
9840
		;
9841

9842
	      /* j-th destination accepts the word character ch.  */
9843
	      trtable[ch] = dest_states[j];
9844
	      trtable[ch + SBC_MAX] = dest_states_word[j];
9845
	    }
9846
    }
9847

9848
  /* new line */
9849
  if (bitset_contain (acceptable, NEWLINE_CHAR))
9850
    {
9851
      /* The current state accepts newline character.  */
9852
      for (j = 0; j < ndests; ++j)
9853
	if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
9854
	  {
9855
	    /* k-th destination accepts newline character.  */
9856
	    trtable[NEWLINE_CHAR] = dest_states_nl[j];
9857
	    if (need_word_trtable)
9858
	      trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
9859
	    /* There must be only one destination which accepts
9860
	       newline.  See group_nodes_into_DFAstates.  */
9861
	    break;
9862
	  }
9863
    }
9864

9865
  if (dest_states_malloced)
9866
    free (dest_states);
9867

9868
  re_node_set_free (&follows);
9869
  for (i = 0; i < ndests; ++i)
9870
    re_node_set_free (dests_node + i);
9871

9872
  if (dests_node_malloced)
9873
    free (dests_alloc);
9874

9875
  return 1;
9876
}
9877

9878
/* Group all nodes belonging to STATE into several destinations.
9879
   Then for all destinations, set the nodes belonging to the destination
9880
   to DESTS_NODE[i] and set the characters accepted by the destination
9881
   to DESTS_CH[i].  This function returns the number of destinations,
   or -1 if an error occurs.  */
9882

9883
static int
9884
internal_function
9885
group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
9886
			    re_node_set *dests_node, bitset_t *dests_ch)
9887
{
9888
  reg_errcode_t err;
9889
  int result;
9890
  int i, j, k;
9891
  int ndests; /* Number of the destinations from `state'.  */
9892
  bitset_t accepts; /* Characters a node can accept.  */
9893
  const re_node_set *cur_nodes = &state->nodes;
9894
  bitset_empty (accepts);
9895
  ndests = 0;
9896

9897
  /* For all the nodes belonging to `state',  */
9898
  for (i = 0; i < cur_nodes->nelem; ++i)
9899
    {
9900
      re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
9901
      re_token_type_t type = node->type;
9902
      unsigned int constraint = node->constraint;
9903

9904
      /* Enumerate all single byte character this node can accept.  */
9905
      if (type == CHARACTER)
9906
	bitset_set (accepts, node->opr.c);
9907
      else if (type == SIMPLE_BRACKET)
9908
	{
9909
	  bitset_merge (accepts, node->opr.sbcset);
9910
	}
9911
      else if (type == OP_PERIOD)
9912
	{
9913
#ifdef RE_ENABLE_I18N
9914
	  if (dfa->mb_cur_max > 1)
9915
	    bitset_merge (accepts, dfa->sb_char);
9916
	  else
9917
#endif
9918
	    bitset_set_all (accepts);
9919
	  if (!(dfa->syntax & RE_DOT_NEWLINE))
9920
	    bitset_clear (accepts, '\n');
9921
	  if (dfa->syntax & RE_DOT_NOT_NULL)
9922
	    bitset_clear (accepts, '\0');
9923
	}
9924
#ifdef RE_ENABLE_I18N
9925
      else if (type == OP_UTF8_PERIOD)
9926
        {
9927
	  memset (accepts, '\xff', sizeof (bitset_t) / 2);
9928
	  if (!(dfa->syntax & RE_DOT_NEWLINE))
9929
	    bitset_clear (accepts, '\n');
9930
	  if (dfa->syntax & RE_DOT_NOT_NULL)
9931
	    bitset_clear (accepts, '\0');
9932
        }
9933
#endif
9934
      else
9935
	continue;
9936

9937
      /* Check the `accepts' and sift the characters which are not
9938
	 match it the context.  */
9939
      if (constraint)
9940
	{
9941
	  if (constraint & NEXT_NEWLINE_CONSTRAINT)
9942
	    {
9943
	      bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
9944
	      bitset_empty (accepts);
9945
	      if (accepts_newline)
9946
		bitset_set (accepts, NEWLINE_CHAR);
9947
	      else
9948
		continue;
9949
	    }
9950
	  if (constraint & NEXT_ENDBUF_CONSTRAINT)
9951
	    {
9952
	      bitset_empty (accepts);
9953
	      continue;
9954
	    }
9955

9956
	  if (constraint & NEXT_WORD_CONSTRAINT)
9957
	    {
9958
	      bitset_word_t any_set = 0;
9959
	      if (type == CHARACTER && !node->word_char)
9960
		{
9961
		  bitset_empty (accepts);
9962
		  continue;
9963
		}
9964
#ifdef RE_ENABLE_I18N
9965
	      if (dfa->mb_cur_max > 1)
9966
		for (j = 0; j < BITSET_WORDS; ++j)
9967
		  any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
9968
	      else
9969
#endif
9970
		for (j = 0; j < BITSET_WORDS; ++j)
9971
		  any_set |= (accepts[j] &= dfa->word_char[j]);
9972
	      if (!any_set)
9973
		continue;
9974
	    }
9975
	  if (constraint & NEXT_NOTWORD_CONSTRAINT)
9976
	    {
9977
	      bitset_word_t any_set = 0;
9978
	      if (type == CHARACTER && node->word_char)
9979
		{
9980
		  bitset_empty (accepts);
9981
		  continue;
9982
		}
9983
#ifdef RE_ENABLE_I18N
9984
	      if (dfa->mb_cur_max > 1)
9985
		for (j = 0; j < BITSET_WORDS; ++j)
9986
		  any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
9987
	      else
9988
#endif
9989
		for (j = 0; j < BITSET_WORDS; ++j)
9990
		  any_set |= (accepts[j] &= ~dfa->word_char[j]);
9991
	      if (!any_set)
9992
		continue;
9993
	    }
9994
	}
9995

9996
      /* Then divide `accepts' into DFA states, or create a new
9997
	 state.  Above, we make sure that accepts is not empty.  */
9998
      for (j = 0; j < ndests; ++j)
9999
	{
10000
	  bitset_t intersec; /* Intersection sets, see below.  */
10001
	  bitset_t remains;
10002
	  /* Flags, see below.  */
10003
	  bitset_word_t has_intersec, not_subset, not_consumed;
10004

10005
	  /* Optimization, skip if this state doesn't accept the character.  */
10006
	  if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
10007
	    continue;
10008

10009
	  /* Enumerate the intersection set of this state and `accepts'.  */
10010
	  has_intersec = 0;
10011
	  for (k = 0; k < BITSET_WORDS; ++k)
10012
	    has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
10013
	  /* And skip if the intersection set is empty.  */
10014
	  if (!has_intersec)
10015
	    continue;
10016

10017
	  /* Then check if this state is a subset of `accepts'.  */
10018
	  not_subset = not_consumed = 0;
10019
	  for (k = 0; k < BITSET_WORDS; ++k)
10020
	    {
10021
	      not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
10022
	      not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
10023
	    }
10024

10025
	  /* If this state isn't a subset of `accepts', create a
10026
	     new group state, which has the `remains'. */
10027
	  if (not_subset)
10028
	    {
10029
	      bitset_copy (dests_ch[ndests], remains);
10030
	      bitset_copy (dests_ch[j], intersec);
10031
	      err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
10032
	      if (BE (err != REG_NOERROR, 0))
10033
		goto error_return;
10034
	      ++ndests;
10035
	    }
10036

10037
	  /* Put the position in the current group. */
10038
	  result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
10039
	  if (BE (result < 0, 0))
10040
	    goto error_return;
10041

10042
	  /* If all characters are consumed, go to next node. */
10043
	  if (!not_consumed)
10044
	    break;
10045
	}
10046
      /* Some characters remain, create a new group. */
10047
      if (j == ndests)
10048
	{
10049
	  bitset_copy (dests_ch[ndests], accepts);
10050
	  err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
10051
	  if (BE (err != REG_NOERROR, 0))
10052
	    goto error_return;
10053
	  ++ndests;
10054
	  bitset_empty (accepts);
10055
	}
10056
    }
10057
  return ndests;
10058
 error_return:
10059
  for (j = 0; j < ndests; ++j)
10060
    re_node_set_free (dests_node + j);
10061
  return -1;
10062
}
10063

10064
#ifdef RE_ENABLE_I18N
10065
/* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
10066
   Return the number of the bytes the node accepts.
10067
   STR_IDX is the current index of the input string.
10068

10069
   This function handles the nodes which can accept one character, or
10070
   one collating element like '.', '[a-z]', opposite to the other nodes
10071
   can only accept one byte.  */
10072

10073
static int
10074
internal_function
10075
check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
10076
			 const re_string_t *input, int str_idx)
10077
{
10078
  const re_token_t *node = dfa->nodes + node_idx;
10079
  int char_len, elem_len;
10080
  int i;
10081

10082
  if (BE (node->type == OP_UTF8_PERIOD, 0))
10083
    {
10084
      unsigned char c = re_string_byte_at (input, str_idx), d;
10085
      if (BE (c < 0xc2, 1))
10086
	return 0;
10087

10088
      if (str_idx + 2 > input->len)
10089
	return 0;
10090

10091
      d = re_string_byte_at (input, str_idx + 1);
10092
      if (c < 0xe0)
10093
	return (d < 0x80 || d > 0xbf) ? 0 : 2;
10094
      else if (c < 0xf0)
10095
	{
10096
	  char_len = 3;
10097
	  if (c == 0xe0 && d < 0xa0)
10098
	    return 0;
10099
	}
10100
      else if (c < 0xf8)
10101
	{
10102
	  char_len = 4;
10103
	  if (c == 0xf0 && d < 0x90)
10104
	    return 0;
10105
	}
10106
      else if (c < 0xfc)
10107
	{
10108
	  char_len = 5;
10109
	  if (c == 0xf8 && d < 0x88)
10110
	    return 0;
10111
	}
10112
      else if (c < 0xfe)
10113
	{
10114
	  char_len = 6;
10115
	  if (c == 0xfc && d < 0x84)
10116
	    return 0;
10117
	}
10118
      else
10119
	return 0;
10120

10121
      if (str_idx + char_len > input->len)
10122
	return 0;
10123

10124
      for (i = 1; i < char_len; ++i)
10125
	{
10126
	  d = re_string_byte_at (input, str_idx + i);
10127
	  if (d < 0x80 || d > 0xbf)
10128
	    return 0;
10129
	}
10130
      return char_len;
10131
    }
10132

10133
  char_len = re_string_char_size_at (input, str_idx);
10134
  if (node->type == OP_PERIOD)
10135
    {
10136
      if (char_len <= 1)
10137
        return 0;
10138
      /* FIXME: I don't think this if is needed, as both '\n'
10139
	 and '\0' are char_len == 1.  */
10140
      /* '.' accepts any one character except the following two cases.  */
10141
      if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
10142
	   re_string_byte_at (input, str_idx) == '\n') ||
10143
	  ((dfa->syntax & RE_DOT_NOT_NULL) &&
10144
	   re_string_byte_at (input, str_idx) == '\0'))
10145
	return 0;
10146
      return char_len;
10147
    }
10148

10149
  elem_len = re_string_elem_size_at (input, str_idx);
10150
  if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
10151
    return 0;
10152

10153
  if (node->type == COMPLEX_BRACKET)
10154
    {
10155
      const re_charset_t *cset = node->opr.mbcset;
10156
# ifdef _LIBC
10157
      const unsigned char *pin
10158
	= ((const unsigned char *) re_string_get_buffer (input) + str_idx);
10159
      int j;
10160
      uint32_t nrules;
10161
# endif /* _LIBC */
10162
      int match_len = 0;
10163
      wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
10164
		    ? re_string_wchar_at (input, str_idx) : 0);
10165

10166
      /* match with multibyte character?  */
10167
      for (i = 0; i < cset->nmbchars; ++i)
10168
	if (wc == cset->mbchars[i])
10169
	  {
10170
	    match_len = char_len;
10171
	    goto check_node_accept_bytes_match;
10172
	  }
10173
      /* match with character_class?  */
10174
      for (i = 0; i < cset->nchar_classes; ++i)
10175
	{
10176
	  wctype_t wt = cset->char_classes[i];
10177
	  if (__iswctype (wc, wt))
10178
	    {
10179
	      match_len = char_len;
10180
	      goto check_node_accept_bytes_match;
10181
	    }
10182
	}
10183

10184
# ifdef _LIBC
10185
      nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
10186
      if (nrules != 0)
10187
	{
10188
	  unsigned int in_collseq = 0;
10189
	  const int32_t *table, *indirect;
10190
	  const unsigned char *weights, *extra;
10191
	  const char *collseqwc;
10192
	  int32_t idx;
10193
	  /* This #include defines a local function!  */
10194
#  include <locale/weight.h>
10195

10196
	  /* match with collating_symbol?  */
10197
	  if (cset->ncoll_syms)
10198
	    extra = (const unsigned char *)
10199
	      _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
10200
	  for (i = 0; i < cset->ncoll_syms; ++i)
10201
	    {
10202
	      const unsigned char *coll_sym = extra + cset->coll_syms[i];
10203
	      /* Compare the length of input collating element and
10204
		 the length of current collating element.  */
10205
	      if (*coll_sym != elem_len)
10206
		continue;
10207
	      /* Compare each bytes.  */
10208
	      for (j = 0; j < *coll_sym; j++)
10209
		if (pin[j] != coll_sym[1 + j])
10210
		  break;
10211
	      if (j == *coll_sym)
10212
		{
10213
		  /* Match if every bytes is equal.  */
10214
		  match_len = j;
10215
		  goto check_node_accept_bytes_match;
10216
		}
10217
	    }
10218

10219
	  if (cset->nranges)
10220
	    {
10221
	      if (elem_len <= char_len)
10222
		{
10223
		  collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
10224
		  in_collseq = __collseq_table_lookup (collseqwc, wc);
10225
		}
10226
	      else
10227
		in_collseq = find_collation_sequence_value (pin, elem_len);
10228
	    }
10229
	  /* match with range expression?  */
10230
	  for (i = 0; i < cset->nranges; ++i)
10231
	    if (cset->range_starts[i] <= in_collseq
10232
		&& in_collseq <= cset->range_ends[i])
10233
	      {
10234
		match_len = elem_len;
10235
		goto check_node_accept_bytes_match;
10236
	      }
10237

10238
	  /* match with equivalence_class?  */
10239
	  if (cset->nequiv_classes)
10240
	    {
10241
	      const unsigned char *cp = pin;
10242
	      table = (const int32_t *)
10243
		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
10244
	      weights = (const unsigned char *)
10245
		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
10246
	      extra = (const unsigned char *)
10247
		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
10248
	      indirect = (const int32_t *)
10249
		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
10250
	      idx = findidx (&cp);
10251
	      if (idx > 0)
10252
		for (i = 0; i < cset->nequiv_classes; ++i)
10253
		  {
10254
		    int32_t equiv_class_idx = cset->equiv_classes[i];
10255
		    size_t weight_len = weights[idx];
10256
		    if (weight_len == weights[equiv_class_idx])
10257
		      {
10258
			int cnt = 0;
10259
			while (cnt <= weight_len
10260
			       && (weights[equiv_class_idx + 1 + cnt]
10261
				   == weights[idx + 1 + cnt]))
10262
			  ++cnt;
10263
			if (cnt > weight_len)
10264
			  {
10265
			    match_len = elem_len;
10266
			    goto check_node_accept_bytes_match;
10267
			  }
10268
		      }
10269
		  }
10270
	    }
10271
	}
10272
      else
10273
# endif /* _LIBC */
10274
	{
10275
	  /* match with range expression?  */
10276
#if __GNUC__ >= 2
10277
	  wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
10278
#else
10279
	  wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
10280
	  cmp_buf[2] = wc;
10281
#endif
10282
	  for (i = 0; i < cset->nranges; ++i)
10283
	    {
10284
	      cmp_buf[0] = cset->range_starts[i];
10285
	      cmp_buf[4] = cset->range_ends[i];
10286
	      if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
10287
		  && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
10288
		{
10289
		  match_len = char_len;
10290
		  goto check_node_accept_bytes_match;
10291
		}
10292
	    }
10293
	}
10294
    check_node_accept_bytes_match:
10295
      if (!cset->non_match)
10296
	return match_len;
10297
      else
10298
	{
10299
	  if (match_len > 0)
10300
	    return 0;
10301
	  else
10302
	    return (elem_len > char_len) ? elem_len : char_len;
10303
	}
10304
    }
10305
  return 0;
10306
}
10307

10308
# ifdef _LIBC
10309
static unsigned int
10310
internal_function
10311
find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
10312
{
10313
  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
10314
  if (nrules == 0)
10315
    {
10316
      if (mbs_len == 1)
10317
	{
10318
	  /* No valid character.  Match it as a single byte character.  */
10319
	  const unsigned char *collseq = (const unsigned char *)
10320
	    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
10321
	  return collseq[mbs[0]];
10322
	}
10323
      return UINT_MAX;
10324
    }
10325
  else
10326
    {
10327
      int32_t idx;
10328
      const unsigned char *extra = (const unsigned char *)
10329
	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
10330
      int32_t extrasize = (const unsigned char *)
10331
	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
10332

10333
      for (idx = 0; idx < extrasize;)
10334
	{
10335
	  int mbs_cnt, found = 0;
10336
	  int32_t elem_mbs_len;
10337
	  /* Skip the name of collating element name.  */
10338
	  idx = idx + extra[idx] + 1;
10339
	  elem_mbs_len = extra[idx++];
10340
	  if (mbs_len == elem_mbs_len)
10341
	    {
10342
	      for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
10343
		if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
10344
		  break;
10345
	      if (mbs_cnt == elem_mbs_len)
10346
		/* Found the entry.  */
10347
		found = 1;
10348
	    }
10349
	  /* Skip the byte sequence of the collating element.  */
10350
	  idx += elem_mbs_len;
10351
	  /* Adjust for the alignment.  */
10352
	  idx = (idx + 3) & ~3;
10353
	  /* Skip the collation sequence value.  */
10354
	  idx += sizeof (uint32_t);
10355
	  /* Skip the wide char sequence of the collating element.  */
10356
	  idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
10357
	  /* If we found the entry, return the sequence value.  */
10358
	  if (found)
10359
	    return *(uint32_t *) (extra + idx);
10360
	  /* Skip the collation sequence value.  */
10361
	  idx += sizeof (uint32_t);
10362
	}
10363
      return UINT_MAX;
10364
    }
10365
}
10366
# endif /* _LIBC */
10367
#endif /* RE_ENABLE_I18N */
10368

10369
/* Check whether the node accepts the byte which is IDX-th
10370
   byte of the INPUT.  */
10371

10372
static int
10373
internal_function
10374
check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
10375
		   int idx)
10376
{
10377
  unsigned char ch;
10378
  ch = re_string_byte_at (&mctx->input, idx);
10379
  switch (node->type)
10380
    {
10381
    case CHARACTER:
10382
      if (node->opr.c != ch)
10383
        return 0;
10384
      break;
10385

10386
    case SIMPLE_BRACKET:
10387
      if (!bitset_contain (node->opr.sbcset, ch))
10388
        return 0;
10389
      break;
10390

10391
#ifdef RE_ENABLE_I18N
10392
    case OP_UTF8_PERIOD:
10393
      if (ch >= 0x80)
10394
        return 0;
10395
      /* FALLTHROUGH */
10396
#endif
10397
    case OP_PERIOD:
10398
      if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
10399
	  || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
10400
	return 0;
10401
      break;
10402

10403
    default:
10404
      return 0;
10405
    }
10406

10407
  if (node->constraint)
10408
    {
10409
      /* The node has constraints.  Check whether the current context
10410
	 satisfies the constraints.  */
10411
      unsigned int context = re_string_context_at (&mctx->input, idx,
10412
						   mctx->eflags);
10413
      if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
10414
	return 0;
10415
    }
10416

10417
  return 1;
10418
}
10419

10420
/* Extend the buffers, if the buffers have run out.  */
10421

10422
static reg_errcode_t
10423
internal_function
10424
extend_buffers (re_match_context_t *mctx)
10425
{
10426
  reg_errcode_t ret;
10427
  re_string_t *pstr = &mctx->input;
10428

10429
  /* Double the lengthes of the buffers.  */
10430
  ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
10431
  if (BE (ret != REG_NOERROR, 0))
10432
    return ret;
10433

10434
  if (mctx->state_log != NULL)
10435
    {
10436
      /* And double the length of state_log.  */
10437
      /* XXX We have no indication of the size of this buffer.  If this
10438
	 allocation fail we have no indication that the state_log array
10439
	 does not have the right size.  */
10440
      re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
10441
					      pstr->bufs_len + 1);
10442
      if (BE (new_array == NULL, 0))
10443
	return REG_ESPACE;
10444
      mctx->state_log = new_array;
10445
    }
10446

10447
  /* Then reconstruct the buffers.  */
10448
  if (pstr->icase)
10449
    {
10450
#ifdef RE_ENABLE_I18N
10451
      if (pstr->mb_cur_max > 1)
10452
	{
10453
	  ret = build_wcs_upper_buffer (pstr);
10454
	  if (BE (ret != REG_NOERROR, 0))
10455
	    return ret;
10456
	}
10457
      else
10458
#endif /* RE_ENABLE_I18N  */
10459
	build_upper_buffer (pstr);
10460
    }
10461
  else
10462
    {
10463
#ifdef RE_ENABLE_I18N
10464
      if (pstr->mb_cur_max > 1)
10465
	build_wcs_buffer (pstr);
10466
      else
10467
#endif /* RE_ENABLE_I18N  */
10468
	{
10469
	  if (pstr->trans != NULL)
10470
	    re_string_translate_buffer (pstr);
10471
	}
10472
    }
10473
  return REG_NOERROR;
10474
}
10475

10476

10477
/* Functions for matching context.  */
10478

10479
/* Initialize MCTX.  */
10480

10481
static reg_errcode_t
10482
internal_function
10483
match_ctx_init (re_match_context_t *mctx, int eflags, int n)
10484
{
10485
  mctx->eflags = eflags;
10486
  mctx->match_last = -1;
10487
  if (n > 0)
10488
    {
10489
      mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
10490
      mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
10491
      if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
10492
	return REG_ESPACE;
10493
    }
10494
  /* Already zero-ed by the caller.
10495
     else
10496
       mctx->bkref_ents = NULL;
10497
     mctx->nbkref_ents = 0;
10498
     mctx->nsub_tops = 0;  */
10499
  mctx->abkref_ents = n;
10500
  mctx->max_mb_elem_len = 1;
10501
  mctx->asub_tops = n;
10502
  return REG_NOERROR;
10503
}
10504

10505
/* Clean the entries which depend on the current input in MCTX.
10506
   This function must be invoked when the matcher changes the start index
10507
   of the input, or changes the input string.  */
10508

10509
static void
10510
internal_function
10511
match_ctx_clean (re_match_context_t *mctx)
10512
{
10513
  int st_idx;
10514
  for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
10515
    {
10516
      int sl_idx;
10517
      re_sub_match_top_t *top = mctx->sub_tops[st_idx];
10518
      for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
10519
	{
10520
	  re_sub_match_last_t *last = top->lasts[sl_idx];
10521
	  re_free (last->path.array);
10522
	  re_free (last);
10523
	}
10524
      re_free (top->lasts);
10525
      if (top->path)
10526
	{
10527
	  re_free (top->path->array);
10528
	  re_free (top->path);
10529
	}
10530
      free (top);
10531
    }
10532

10533
  mctx->nsub_tops = 0;
10534
  mctx->nbkref_ents = 0;
10535
}
10536

10537
/* Free all the memory associated with MCTX.  */
10538

10539
static void
10540
internal_function
10541
match_ctx_free (re_match_context_t *mctx)
10542
{
10543
  /* First, free all the memory associated with MCTX->SUB_TOPS.  */
10544
  match_ctx_clean (mctx);
10545
  re_free (mctx->sub_tops);
10546
  re_free (mctx->bkref_ents);
10547
}
10548

10549
/* Add a new backreference entry to MCTX.
10550
   Note that we assume that caller never call this function with duplicate
10551
   entry, and call with STR_IDX which isn't smaller than any existing entry.
10552
*/
10553

10554
static reg_errcode_t
10555
internal_function
10556
match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
10557
		     int to)
10558
{
10559
  if (mctx->nbkref_ents >= mctx->abkref_ents)
10560
    {
10561
      struct re_backref_cache_entry* new_entry;
10562
      new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
10563
			      mctx->abkref_ents * 2);
10564
      if (BE (new_entry == NULL, 0))
10565
	{
10566
	  re_free (mctx->bkref_ents);
10567
	  return REG_ESPACE;
10568
	}
10569
      mctx->bkref_ents = new_entry;
10570
      memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
10571
	      sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
10572
      mctx->abkref_ents *= 2;
10573
    }
10574
  if (mctx->nbkref_ents > 0
10575
      && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
10576
    mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
10577

10578
  mctx->bkref_ents[mctx->nbkref_ents].node = node;
10579
  mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
10580
  mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
10581
  mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
10582

10583
  /* This is a cache that saves negative results of check_dst_limits_calc_pos.
10584
     If bit N is clear, means that this entry won't epsilon-transition to
10585
     an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
10586
     it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
10587
     such node.
10588

10589
     A backreference does not epsilon-transition unless it is empty, so set
10590
     to all zeros if FROM != TO.  */
10591
  mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
10592
    = (from == to ? ~0 : 0);
10593

10594
  mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
10595
  if (mctx->max_mb_elem_len < to - from)
10596
    mctx->max_mb_elem_len = to - from;
10597
  return REG_NOERROR;
10598
}
10599

10600
/* Search for the first entry which has the same str_idx, or -1 if none is
10601
   found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
10602

10603
static int
10604
internal_function
10605
search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
10606
{
10607
  int left, right, mid, last;
10608
  last = right = mctx->nbkref_ents;
10609
  for (left = 0; left < right;)
10610
    {
10611
      mid = (left + right) / 2;
10612
      if (mctx->bkref_ents[mid].str_idx < str_idx)
10613
	left = mid + 1;
10614
      else
10615
	right = mid;
10616
    }
10617
  if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
10618
    return left;
10619
  else
10620
    return -1;
10621
}
10622

10623
/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
10624
   at STR_IDX.  */
10625

10626
static reg_errcode_t
10627
internal_function
10628
match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
10629
{
10630
#ifdef DEBUG
10631
  assert (mctx->sub_tops != NULL);
10632
  assert (mctx->asub_tops > 0);
10633
#endif
10634
  if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
10635
    {
10636
      int new_asub_tops = mctx->asub_tops * 2;
10637
      re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
10638
						   re_sub_match_top_t *,
10639
						   new_asub_tops);
10640
      if (BE (new_array == NULL, 0))
10641
	return REG_ESPACE;
10642
      mctx->sub_tops = new_array;
10643
      mctx->asub_tops = new_asub_tops;
10644
    }
10645
  mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
10646
  if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
10647
    return REG_ESPACE;
10648
  mctx->sub_tops[mctx->nsub_tops]->node = node;
10649
  mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
10650
  return REG_NOERROR;
10651
}
10652

10653
/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
10654
   at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
10655

10656
static re_sub_match_last_t *
10657
internal_function
10658
match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
10659
{
10660
  re_sub_match_last_t *new_entry;
10661
  if (BE (subtop->nlasts == subtop->alasts, 0))
10662
    {
10663
      int new_alasts = 2 * subtop->alasts + 1;
10664
      re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
10665
						    re_sub_match_last_t *,
10666
						    new_alasts);
10667
      if (BE (new_array == NULL, 0))
10668
	return NULL;
10669
      subtop->lasts = new_array;
10670
      subtop->alasts = new_alasts;
10671
    }
10672
  new_entry = calloc (1, sizeof (re_sub_match_last_t));
10673
  if (BE (new_entry != NULL, 1))
10674
    {
10675
      subtop->lasts[subtop->nlasts] = new_entry;
10676
      new_entry->node = node;
10677
      new_entry->str_idx = str_idx;
10678
      ++subtop->nlasts;
10679
    }
10680
  return new_entry;
10681
}
10682

10683
static void
10684
internal_function
10685
sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
10686
	       re_dfastate_t **limited_sts, int last_node, int last_str_idx)
10687
{
10688
  sctx->sifted_states = sifted_sts;
10689
  sctx->limited_states = limited_sts;
10690
  sctx->last_node = last_node;
10691
  sctx->last_str_idx = last_str_idx;
10692
  re_node_set_init_empty (&sctx->limits);
10693
}
10694

10695

10696
/* Binary backward compatibility.  This section is only compiled as part
   of the C library itself: for the symbol versions selected by
   SHLIB_COMPAT it keeps defining the obsolete 're_max_failures'
   variable and attaches a link-time warning to it.  */
#if _LIBC
# include <shlib-compat.h>
# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3)
link_warning (re_max_failures, "the 're_max_failures' variable is obsolete and will go away.")
int re_max_failures = 2000;
# endif /* SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3) */
#endif /* _LIBC */
10704
#endif
10705

10706
Product

Resources

Company