Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
ElmerCSC
GitHub Repository: ElmerCSC/elmerfem
Path: blob/devel/elmergrid/src/metis-5.1.0/GKlib/gkregex.c
3206 views
1
/* Extended regular expression matching and search library.
2
Copyright (C) 2002, 2003, 2005 Free Software Foundation, Inc.
3
This file is part of the GNU C Library.
4
Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6
The GNU C Library is free software; you can redistribute it and/or
7
modify it under the terms of the GNU Lesser General Public
8
License as published by the Free Software Foundation; either
9
version 2.1 of the License, or (at your option) any later version.
10
11
The GNU C Library is distributed in the hope that it will be useful,
12
but WITHOUT ANY WARRANTY; without even the implied warranty of
13
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
Lesser General Public License for more details.
15
16
You should have received a copy of the GNU Lesser General Public
17
License along with the GNU C Library; if not, write to the Free
18
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19
02111-1307 USA. */
20
21
/* this is for removing a compiler warning */
22
/* Dummy external definition placed outside the USE_GKREGEX conditional so
   that this file always defines something (presumably to avoid an
   empty-translation-unit warning).  Takes no arguments, does nothing. */
void gkfooo(void) { return; }
23
24
#ifdef USE_GKREGEX
25
26
#ifdef HAVE_CONFIG_H
27
#include "config.h"
28
#endif
29
30
#ifdef _LIBC
/* We have to keep the namespace clean: when built as part of the C library
   every public entry point is mapped to its double-underscore name. */
# define regfree(preg) __regfree (preg)
# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
# define regerror(errcode, preg, errbuf, errbuf_size) \
	__regerror(errcode, preg, errbuf, errbuf_size)
# define re_set_registers(bu, re, nu, st, en) \
	__re_set_registers (bu, re, nu, st, en)
# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
	__re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
# define re_match(bufp, string, size, pos, regs) \
	__re_match (bufp, string, size, pos, regs)
# define re_search(bufp, string, size, startpos, range, regs) \
	__re_search (bufp, string, size, startpos, range, regs)
# define re_compile_pattern(pattern, length, bufp) \
	__re_compile_pattern (pattern, length, bufp)
# define re_set_syntax(syntax) __re_set_syntax (syntax)
# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
	__re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)

# include "../locale/localeinfo.h"
#endif
54
55
#include "GKlib.h"
56
57
58
/******************************************************************************/
59
/******************************************************************************/
60
/******************************************************************************/
61
/* GKINCLUDE #include "regex_internal.h" */
62
/******************************************************************************/
63
/******************************************************************************/
64
/******************************************************************************/
65
/* Extended regular expression matching and search library.
66
Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
67
This file is part of the GNU C Library.
68
Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
69
70
The GNU C Library is free software; you can redistribute it and/or
71
modify it under the terms of the GNU Lesser General Public
72
License as published by the Free Software Foundation; either
73
version 2.1 of the License, or (at your option) any later version.
74
75
The GNU C Library is distributed in the hope that it will be useful,
76
but WITHOUT ANY WARRANTY; without even the implied warranty of
77
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
78
Lesser General Public License for more details.
79
80
You should have received a copy of the GNU Lesser General Public
81
License along with the GNU C Library; if not, write to the Free
82
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
83
02111-1307 USA. */
84
85
#ifndef _REGEX_INTERNAL_H
86
#define _REGEX_INTERNAL_H 1
87
88
#include <assert.h>
89
#include <ctype.h>
90
#include <stdio.h>
91
#include <stdlib.h>
92
#include <string.h>
93
94
#if defined(__MINGW32_VERSION) || defined(_MSC_VER)
95
#define strcasecmp stricmp
96
#endif
97
98
#if defined HAVE_LANGINFO_H || defined HAVE_LANGINFO_CODESET || defined _LIBC
99
# include <langinfo.h>
100
#endif
101
#if defined HAVE_LOCALE_H || defined _LIBC
102
# include <locale.h>
103
#endif
104
#if defined HAVE_WCHAR_H || defined _LIBC
105
# include <wchar.h>
106
#endif /* HAVE_WCHAR_H || _LIBC */
107
#if defined HAVE_WCTYPE_H || defined _LIBC
108
# include <wctype.h>
109
#endif /* HAVE_WCTYPE_H || _LIBC */
110
#if defined HAVE_STDBOOL_H || defined _LIBC
111
# include <stdbool.h>
112
#else
113
typedef enum { false, true } bool;
114
#endif /* HAVE_STDBOOL_H || _LIBC */
115
#if defined HAVE_STDINT_H || defined _LIBC
116
# include <stdint.h>
117
#endif /* HAVE_STDINT_H || _LIBC */
118
/* Locking primitives: real ones inside the C library, no-ops elsewhere. */
#if defined _LIBC
# include <bits/libc-lock.h>
#else
# define __libc_lock_define(CLASS,NAME)
# define __libc_lock_init(NAME) do { } while (0)
# define __libc_lock_lock(NAME) do { } while (0)
# define __libc_lock_unlock(NAME) do { } while (0)
#endif

/* In case that the system doesn't have isblank().
   Note: this fallback evaluates CH twice. */
#if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
# define isblank(ch) ((ch) == ' ' || (ch) == '\t')
#endif

#ifdef _LIBC
# ifndef _RE_DEFINE_LOCALE_FUNCTIONS
#  define _RE_DEFINE_LOCALE_FUNCTIONS 1
#  include <locale/localeinfo.h>
#  include <locale/elem-hash.h>
#  include <locale/coll-lookup.h>
# endif
#endif

/* This is for other GNU distributions with internationalized messages.
   Without libintl, gettext() simply yields its argument. */
#if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC
# include <libintl.h>
# ifdef _LIBC
#  undef gettext
#  define gettext(msgid) \
  INTUSE(__dcgettext) (_libc_intl_domainname, msgid, LC_MESSAGES)
# endif
#else
# define gettext(msgid) (msgid)
#endif

#ifndef gettext_noop
/* This define is so xgettext can find the internationalizable
   strings. */
# define gettext_noop(String) String
#endif

/* Fallback for systems whose headers do not define SIZE_MAX. */
#ifndef SIZE_MAX
# define SIZE_MAX ((size_t) -1)
#endif

/* Multibyte/wide-character support is compiled in only when every required
   header and function has been detected (or when building inside libc). */
#if (defined MB_CUR_MAX && HAVE_LOCALE_H && HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_WCRTOMB && HAVE_MBRTOWC && HAVE_WCSCOLL) || _LIBC
# define RE_ENABLE_I18N
#endif

/* BE (expr, val): branch-prediction hint; evaluates to EXPR.  Compilers that
   do not report __GNUC__ >= 3 also get `inline' defined away. */
#if __GNUC__ >= 3
# define BE(expr, val) __builtin_expect (expr, val)
#else
# define BE(expr, val) (expr)
# define inline
#endif
174
175
/* Number of distinct single-byte character values. */
#define SBC_MAX 256

#define COLL_ELEM_LEN_MAX 8

/* The character which represents newline. */
#define NEWLINE_CHAR '\n'
#define WIDE_NEWLINE_CHAR L'\n'

/* Rename to standard API for using out of glibc. */
#ifndef _LIBC
# define __wctype wctype
# define __iswctype iswctype
# define __btowc btowc
# define __mempcpy mempcpy
# define __wcrtomb wcrtomb
# define __regfree regfree
# define attribute_hidden
#endif /* not _LIBC */

/* __attribute (arg) expands to a GNU attribute, or to nothing elsewhere. */
#ifdef __GNUC__
# define __attribute(arg) __attribute__ (arg)
#else
# define __attribute(arg)
#endif

extern const char __re_error_msgid[] attribute_hidden;
extern const size_t __re_error_msgid_idx[] attribute_hidden;

/* An integer used to represent a set of bits.  It must be unsigned,
   and must be at least as wide as unsigned int. */
typedef unsigned long int bitset_word_t;
/* All bits set in a bitset_word_t. */
#define BITSET_WORD_MAX ULONG_MAX
/* Number of bits in a bitset_word_t. */
#define BITSET_WORD_BITS (sizeof (bitset_word_t) * CHAR_BIT)
/* Number of bitset_word_t in a bit_set. */
#define BITSET_WORDS (SBC_MAX / BITSET_WORD_BITS)
typedef bitset_word_t bitset_t[BITSET_WORDS];
typedef bitset_word_t *re_bitset_ptr_t;
typedef const bitset_word_t *re_const_bitset_ptr_t;

/* Single-bit operations.  SET and I are fully parenthesised so that
   arbitrary expressions (e.g. `base + off') select the intended word and
   bit; I is evaluated twice, so it must not have side effects. */
#define bitset_set(set,i) \
  ((set)[(i) / BITSET_WORD_BITS] \
   |= ((bitset_word_t) 1 << ((i) % BITSET_WORD_BITS)))
#define bitset_clear(set,i) \
  ((set)[(i) / BITSET_WORD_BITS] \
   &= ~((bitset_word_t) 1 << ((i) % BITSET_WORD_BITS)))
#define bitset_contain(set,i) \
  ((set)[(i) / BITSET_WORD_BITS] \
   & ((bitset_word_t) 1 << ((i) % BITSET_WORD_BITS)))
/* Whole-set operations; each works on sizeof (bitset_t) bytes. */
#define bitset_empty(set) memset (set, '\0', sizeof (bitset_t))
#define bitset_set_all(set) memset (set, '\xff', sizeof (bitset_t))
#define bitset_copy(dest,src) memcpy (dest, src, sizeof (bitset_t))
226
227
/* Individual context-constraint bits.  PREV_* bits concern the context
   before a position, NEXT_* bits the context after it. */
#define PREV_WORD_CONSTRAINT 0x0001
#define PREV_NOTWORD_CONSTRAINT 0x0002
#define NEXT_WORD_CONSTRAINT 0x0004
#define NEXT_NOTWORD_CONSTRAINT 0x0008
#define PREV_NEWLINE_CONSTRAINT 0x0010
#define NEXT_NEWLINE_CONSTRAINT 0x0020
#define PREV_BEGBUF_CONSTRAINT 0x0040
#define NEXT_ENDBUF_CONSTRAINT 0x0080
#define WORD_DELIM_CONSTRAINT 0x0100
#define NOT_WORD_DELIM_CONSTRAINT 0x0200

/* Named combinations of the constraint bits above. */
typedef enum
{
  INSIDE_WORD = PREV_WORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
  WORD_FIRST = PREV_NOTWORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
  WORD_LAST = PREV_WORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT,
  INSIDE_NOTWORD = PREV_NOTWORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT,
  LINE_FIRST = PREV_NEWLINE_CONSTRAINT,
  LINE_LAST = NEXT_NEWLINE_CONSTRAINT,
  BUF_FIRST = PREV_BEGBUF_CONSTRAINT,
  BUF_LAST = NEXT_ENDBUF_CONSTRAINT,
  WORD_DELIM = WORD_DELIM_CONSTRAINT,
  NOT_WORD_DELIM = NOT_WORD_DELIM_CONSTRAINT
} re_context_type;
251
252
/* A set of node indices stored in the array ELEMS. */
typedef struct
{
  int alloc;	/* Presumably the allocated capacity of ELEMS -- confirm.  */
  int nelem;	/* Element count; re_node_set_empty resets it to 0.  */
  int *elems;	/* Element storage; released by re_node_set_free.  */
} re_node_set;
258
259
/* Type tag shared by tokens, nodes and parse-tree elements. */
typedef enum
{
  NON_TYPE = 0,

  /* Node type, These are used by token, node, tree. */
  CHARACTER = 1,
  END_OF_RE = 2,
  SIMPLE_BRACKET = 3,
  OP_BACK_REF = 4,
  OP_PERIOD = 5,
#ifdef RE_ENABLE_I18N
  COMPLEX_BRACKET = 6,
  OP_UTF8_PERIOD = 7,
#endif /* RE_ENABLE_I18N */

  /* We define EPSILON_BIT as a macro so that OP_OPEN_SUBEXP is used
     when the debugger shows values of this enum type. */
#define EPSILON_BIT 8
  OP_OPEN_SUBEXP = EPSILON_BIT | 0,
  OP_CLOSE_SUBEXP = EPSILON_BIT | 1,
  OP_ALT = EPSILON_BIT | 2,
  OP_DUP_ASTERISK = EPSILON_BIT | 3,
  ANCHOR = EPSILON_BIT | 4,

  /* Tree type, these are used only by tree. */
  CONCAT = 16,
  SUBEXP = 17,

  /* Token type, these are used only by token. */
  OP_DUP_PLUS = 18,
  OP_DUP_QUESTION,
  OP_OPEN_BRACKET,
  OP_CLOSE_BRACKET,
  OP_CHARSET_RANGE,
  OP_OPEN_DUP_NUM,
  OP_CLOSE_DUP_NUM,
  OP_NON_MATCH_LIST,
  OP_OPEN_COLL_ELEM,
  OP_CLOSE_COLL_ELEM,
  OP_OPEN_EQUIV_CLASS,
  OP_CLOSE_EQUIV_CLASS,
  OP_OPEN_CHAR_CLASS,
  OP_CLOSE_CHAR_CLASS,
  OP_WORD,
  OP_NOTWORD,
  OP_SPACE,
  OP_NOTSPACE,
  BACK_SLASH

} re_token_type_t;
309
310
#ifdef RE_ENABLE_I18N
/* Contents of a bracket expression that needs multibyte handling
   (referenced from re_token_t.opr.mbcset for COMPLEX_BRACKET). */
typedef struct
{
  /* Multibyte characters. */
  wchar_t *mbchars;

  /* Collating symbols. */
# ifdef _LIBC
  int32_t *coll_syms;
# endif

  /* Equivalence classes. */
# ifdef _LIBC
  int32_t *equiv_classes;
# endif

  /* Range expressions. */
# ifdef _LIBC
  uint32_t *range_starts;
  uint32_t *range_ends;
# else /* not _LIBC */
  wchar_t *range_starts;
  wchar_t *range_ends;
# endif /* not _LIBC */

  /* Character classes. */
  wctype_t *char_classes;

  /* If this character set is the non-matching list. */
  unsigned int non_match : 1;

  /* # of multibyte characters. */
  int nmbchars;

  /* # of collating symbols. */
  int ncoll_syms;

  /* # of equivalence classes. */
  int nequiv_classes;

  /* # of range expressions. */
  int nranges;

  /* # of character classes. */
  int nchar_classes;
} re_charset_t;
#endif /* RE_ENABLE_I18N */
357
358
typedef struct
359
{
360
union
361
{
362
unsigned char c; /* for CHARACTER */
363
re_bitset_ptr_t sbcset; /* for SIMPLE_BRACKET */
364
#ifdef RE_ENABLE_I18N
365
re_charset_t *mbcset; /* for COMPLEX_BRACKET */
366
#endif /* RE_ENABLE_I18N */
367
int idx; /* for BACK_REF */
368
re_context_type ctx_type; /* for ANCHOR */
369
} opr;
370
#if __GNUC__ >= 2
371
re_token_type_t type : 8;
372
#else
373
re_token_type_t type;
374
#endif
375
unsigned int constraint : 10; /* context constraint */
376
unsigned int duplicated : 1;
377
unsigned int opt_subexp : 1;
378
#ifdef RE_ENABLE_I18N
379
unsigned int accept_mb : 1;
380
/* These 2 bits can be moved into the union if needed (e.g. if running out
381
of bits; move opr.c to opr.c.c and move the flags to opr.c.flags). */
382
unsigned int mb_partial : 1;
383
#endif
384
unsigned int word_char : 1;
385
} re_token_t;
386
387
/* Nonzero when TYPE has EPSILON_BIT set. */
#define IS_EPSILON_NODE(type) ((type) & EPSILON_BIT)
388
389
struct re_string_t
390
{
391
/* Indicate the raw buffer which is the original string passed as an
392
argument of regexec(), re_search(), etc.. */
393
const unsigned char *raw_mbs;
394
/* Store the multibyte string. In case of "case insensitive mode" like
395
REG_ICASE, upper cases of the string are stored, otherwise MBS points
396
the same address that RAW_MBS points. */
397
unsigned char *mbs;
398
#ifdef RE_ENABLE_I18N
399
/* Store the wide character string which is corresponding to MBS. */
400
wint_t *wcs;
401
int *offsets;
402
mbstate_t cur_state;
403
#endif
404
/* Index in RAW_MBS. Each character mbs[i] corresponds to
405
raw_mbs[raw_mbs_idx + i]. */
406
int raw_mbs_idx;
407
/* The length of the valid characters in the buffers. */
408
int valid_len;
409
/* The corresponding number of bytes in raw_mbs array. */
410
int valid_raw_len;
411
/* The length of the buffers MBS and WCS. */
412
int bufs_len;
413
/* The index in MBS, which is updated by re_string_fetch_byte. */
414
int cur_idx;
415
/* length of RAW_MBS array. */
416
int raw_len;
417
/* This is RAW_LEN - RAW_MBS_IDX + VALID_LEN - VALID_RAW_LEN. */
418
int len;
419
/* End of the buffer may be shorter than its length in the cases such
420
as re_match_2, re_search_2. Then, we use STOP for end of the buffer
421
instead of LEN. */
422
int raw_stop;
423
/* This is RAW_STOP - RAW_MBS_IDX adjusted through OFFSETS. */
424
int stop;
425
426
/* The context of mbs[0]. We store the context independently, since
427
the context of mbs[0] may be different from raw_mbs[0], which is
428
the beginning of the input string. */
429
unsigned int tip_context;
430
/* The translation passed as a part of an argument of re_compile_pattern. */
431
RE_TRANSLATE_TYPE trans;
432
/* Copy of re_dfa_t's word_char. */
433
re_const_bitset_ptr_t word_char;
434
/* 1 if REG_ICASE. */
435
unsigned char icase;
436
unsigned char is_utf8;
437
unsigned char map_notascii;
438
unsigned char mbs_allocated;
439
unsigned char offsets_needed;
440
unsigned char newline_anchor;
441
unsigned char word_ops_used;
442
int mb_cur_max;
443
};
444
typedef struct re_string_t re_string_t;
445
446
447
struct re_dfa_t;
448
typedef struct re_dfa_t re_dfa_t;
449
450
#ifndef _LIBC
451
# ifdef __i386__
452
# define internal_function __attribute ((regparm (3), stdcall))
453
# else
454
# define internal_function
455
# endif
456
#endif
457
458
static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
459
int new_buf_len)
460
internal_function;
461
#ifdef RE_ENABLE_I18N
462
static void build_wcs_buffer (re_string_t *pstr) internal_function;
463
static int build_wcs_upper_buffer (re_string_t *pstr) internal_function;
464
#endif /* RE_ENABLE_I18N */
465
static void build_upper_buffer (re_string_t *pstr) internal_function;
466
static void re_string_translate_buffer (re_string_t *pstr) internal_function;
467
static unsigned int re_string_context_at (const re_string_t *input, int idx,
468
int eflags)
469
internal_function __attribute ((pure));
470
/* Accessors for re_string_t.  PSTR is a pointer to the string object; every
   macro argument is parenthesised so compound expressions are safe. */

/* Byte at OFFSET positions from the cursor; the cursor is not moved. */
#define re_string_peek_byte(pstr, offset) \
  ((pstr)->mbs[(pstr)->cur_idx + (offset)])
/* Byte at the cursor; the cursor is then advanced by one. */
#define re_string_fetch_byte(pstr) \
  ((pstr)->mbs[(pstr)->cur_idx++])
/* True if IDX is the end of the valid data or WCS[IDX] is not WEOF. */
#define re_string_first_byte(pstr, idx) \
  ((idx) == (pstr)->valid_len || (pstr)->wcs[(idx)] != WEOF)
/* True if WCS[IDX] is not WEOF and the next slot is either past the valid
   data or also not WEOF. */
#define re_string_is_single_byte_char(pstr, idx) \
  ((pstr)->wcs[(idx)] != WEOF && ((pstr)->valid_len == (idx) + 1 \
				  || (pstr)->wcs[(idx) + 1] != WEOF))
/* True once the cursor has reached STOP. */
#define re_string_eoi(pstr) ((pstr)->stop <= (pstr)->cur_idx)
#define re_string_cur_idx(pstr) ((pstr)->cur_idx)
#define re_string_get_buffer(pstr) ((pstr)->mbs)
#define re_string_length(pstr) ((pstr)->len)
#define re_string_byte_at(pstr,idx) ((pstr)->mbs[(idx)])
#define re_string_skip_bytes(pstr,idx) ((pstr)->cur_idx += (idx))
#define re_string_set_index(pstr,idx) ((pstr)->cur_idx = (idx))
486
487
/* Select an alloca implementation: the GNU builtin, MSVC's _alloca, or a
   hard compile error when neither is available. */
#ifdef __GNUC__
# define alloca(size) __builtin_alloca (size)
# define HAVE_ALLOCA 1
#elif defined(_MSC_VER)
# include <malloc.h>
# define alloca _alloca
# define HAVE_ALLOCA 1
#else
# error No alloca()
#endif

#ifndef _LIBC
# if HAVE_ALLOCA
/* The OS usually guarantees only one guard page at the bottom of the stack,
   and a page size can be as small as 4096 bytes.  So we cannot safely
   allocate anything larger than 4096 bytes.  Also care for the possibility
   of a few compiler-allocated temporary stack slots. */
#  define __libc_use_alloca(n) ((n) < 4032)
# else
/* alloca is implemented with malloc, so just use malloc. */
#  define __libc_use_alloca(n) 0
# endif
#endif
510
511
/* Typed allocation helpers: T is the element type, N the element count.
   N * sizeof (T) is not checked for overflow here; callers must keep N in
   range. */
#define re_malloc(t,n) ((t *) malloc ((n) * sizeof (t)))
#define re_realloc(p,t,n) ((t *) realloc (p, (n) * sizeof (t)))
#define re_free(p) free (p)
514
515
struct bin_tree_t
516
{
517
struct bin_tree_t *parent;
518
struct bin_tree_t *left;
519
struct bin_tree_t *right;
520
struct bin_tree_t *first;
521
struct bin_tree_t *next;
522
523
re_token_t token;
524
525
/* `node_idx' is the index in dfa->nodes, if `type' == 0.
526
Otherwise `type' indicate the type of this node. */
527
int node_idx;
528
};
529
typedef struct bin_tree_t bin_tree_t;
530
531
#define BIN_TREE_STORAGE_SIZE \
532
((1024 - sizeof (void *)) / sizeof (bin_tree_t))
533
534
struct bin_tree_storage_t
535
{
536
struct bin_tree_storage_t *next;
537
bin_tree_t data[BIN_TREE_STORAGE_SIZE];
538
};
539
typedef struct bin_tree_storage_t bin_tree_storage_t;
540
541
/* Context flag bits. */
#define CONTEXT_WORD 1
#define CONTEXT_NEWLINE (CONTEXT_WORD << 1)
#define CONTEXT_BEGBUF (CONTEXT_NEWLINE << 1)
#define CONTEXT_ENDBUF (CONTEXT_BEGBUF << 1)

/* Tests on a context value C. */
#define IS_WORD_CONTEXT(c) ((c) & CONTEXT_WORD)
#define IS_NEWLINE_CONTEXT(c) ((c) & CONTEXT_NEWLINE)
#define IS_BEGBUF_CONTEXT(c) ((c) & CONTEXT_BEGBUF)
#define IS_ENDBUF_CONTEXT(c) ((c) & CONTEXT_ENDBUF)
#define IS_ORDINARY_CONTEXT(c) ((c) == 0)

/* Character classification.  CH is evaluated more than once. */
#define IS_WORD_CHAR(ch) (isalnum (ch) || (ch) == '_')
#define IS_NEWLINE(ch) ((ch) == NEWLINE_CHAR)
#define IS_WIDE_WORD_CHAR(ch) (iswalnum (ch) || (ch) == L'_')
#define IS_WIDE_NEWLINE(ch) ((ch) == WIDE_NEWLINE_CHAR)

/* True if CONTEXT violates any PREV_* bit set in CONSTRAINT.  CONSTRAINT is
   parenthesised in every term so that expressions such as `a | b' are
   tested as a whole. */
#define NOT_SATISFY_PREV_CONSTRAINT(constraint,context) \
 ((((constraint) & PREV_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \
  || (((constraint) & PREV_NOTWORD_CONSTRAINT) && IS_WORD_CONTEXT (context)) \
  || (((constraint) & PREV_NEWLINE_CONSTRAINT) && !IS_NEWLINE_CONTEXT (context))\
  || (((constraint) & PREV_BEGBUF_CONSTRAINT) && !IS_BEGBUF_CONTEXT (context)))

/* True if CONTEXT violates any NEXT_* bit set in CONSTRAINT. */
#define NOT_SATISFY_NEXT_CONSTRAINT(constraint,context) \
 ((((constraint) & NEXT_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \
  || (((constraint) & NEXT_NOTWORD_CONSTRAINT) && IS_WORD_CONTEXT (context)) \
  || (((constraint) & NEXT_NEWLINE_CONSTRAINT) && !IS_NEWLINE_CONTEXT (context)) \
  || (((constraint) & NEXT_ENDBUF_CONSTRAINT) && !IS_ENDBUF_CONTEXT (context)))
568
569
struct re_dfastate_t
570
{
571
unsigned int hash;
572
re_node_set nodes;
573
re_node_set non_eps_nodes;
574
re_node_set inveclosure;
575
re_node_set *entrance_nodes;
576
struct re_dfastate_t **trtable, **word_trtable;
577
unsigned int context : 4;
578
unsigned int halt : 1;
579
/* If this state can accept `multi byte'.
580
Note that we refer to multibyte characters, and multi character
581
collating elements as `multi byte'. */
582
unsigned int accept_mb : 1;
583
/* If this state has backreference node(s). */
584
unsigned int has_backref : 1;
585
unsigned int has_constraint : 1;
586
};
587
typedef struct re_dfastate_t re_dfastate_t;
588
589
struct re_state_table_entry
590
{
591
int num;
592
int alloc;
593
re_dfastate_t **array;
594
};
595
596
/* Array type used in re_sub_match_last_t and re_sub_match_top_t. */
597
598
typedef struct
599
{
600
int next_idx;
601
int alloc;
602
re_dfastate_t **array;
603
} state_array_t;
604
605
/* Store information about the node NODE whose type is OP_CLOSE_SUBEXP. */
606
607
typedef struct
608
{
609
int node;
610
int str_idx; /* The position NODE match at. */
611
state_array_t path;
612
} re_sub_match_last_t;
613
614
/* Store information about the node NODE whose type is OP_OPEN_SUBEXP.
615
And information about the node, whose type is OP_CLOSE_SUBEXP,
616
corresponding to NODE is stored in LASTS. */
617
618
typedef struct
619
{
620
int str_idx;
621
int node;
622
state_array_t *path;
623
int alasts; /* Allocation size of LASTS. */
624
int nlasts; /* The number of LASTS. */
625
re_sub_match_last_t **lasts;
626
} re_sub_match_top_t;
627
628
/* One entry of the back-reference cache (re_match_context_t.bkref_ents). */
struct re_backref_cache_entry
{
  int node;
  int str_idx;
  int subexp_from;
  int subexp_to;
  char more;
  char unused;
  unsigned short int eps_reachable_subexps_map;
};
638
639
typedef struct
640
{
641
/* The string object corresponding to the input string. */
642
re_string_t input;
643
#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
644
const re_dfa_t *const dfa;
645
#else
646
const re_dfa_t *dfa;
647
#endif
648
/* EFLAGS of the argument of regexec. */
649
int eflags;
650
/* Where the matching ends. */
651
int match_last;
652
int last_node;
653
/* The state log used by the matcher. */
654
re_dfastate_t **state_log;
655
int state_log_top;
656
/* Back reference cache. */
657
int nbkref_ents;
658
int abkref_ents;
659
struct re_backref_cache_entry *bkref_ents;
660
int max_mb_elem_len;
661
int nsub_tops;
662
int asub_tops;
663
re_sub_match_top_t **sub_tops;
664
} re_match_context_t;
665
666
typedef struct
667
{
668
re_dfastate_t **sifted_states;
669
re_dfastate_t **limited_states;
670
int last_node;
671
int last_str_idx;
672
re_node_set limits;
673
} re_sift_context_t;
674
675
struct re_fail_stack_ent_t
676
{
677
int idx;
678
int node;
679
regmatch_t *regs;
680
re_node_set eps_via_nodes;
681
};
682
683
/* Array-backed stack of re_fail_stack_ent_t entries. */
struct re_fail_stack_t
{
  int num;	/* Presumably the number of entries in use -- confirm.  */
  int alloc;	/* Presumably the capacity of STACK -- confirm.  */
  struct re_fail_stack_ent_t *stack;
};
689
690
struct re_dfa_t
691
{
692
re_token_t *nodes;
693
size_t nodes_alloc;
694
size_t nodes_len;
695
int *nexts;
696
int *org_indices;
697
re_node_set *edests;
698
re_node_set *eclosures;
699
re_node_set *inveclosures;
700
struct re_state_table_entry *state_table;
701
re_dfastate_t *init_state;
702
re_dfastate_t *init_state_word;
703
re_dfastate_t *init_state_nl;
704
re_dfastate_t *init_state_begbuf;
705
bin_tree_t *str_tree;
706
bin_tree_storage_t *str_tree_storage;
707
re_bitset_ptr_t sb_char;
708
int str_tree_storage_idx;
709
710
/* number of subexpressions `re_nsub' is in regex_t. */
711
unsigned int state_hash_mask;
712
int init_node;
713
int nbackref; /* The number of backreference in this dfa. */
714
715
/* Bitmap expressing which backreference is used. */
716
bitset_word_t used_bkref_map;
717
bitset_word_t completed_bkref_map;
718
719
unsigned int has_plural_match : 1;
720
/* If this dfa has "multibyte node", which is a backreference or
721
a node which can accept multibyte character or multi character
722
collating element. */
723
unsigned int has_mb_node : 1;
724
unsigned int is_utf8 : 1;
725
unsigned int map_notascii : 1;
726
unsigned int word_ops_used : 1;
727
int mb_cur_max;
728
bitset_t word_char;
729
reg_syntax_t syntax;
730
int *subexp_map;
731
#ifdef DEBUG
732
char* re_str;
733
#endif
734
__libc_lock_define (, lock)
735
};
736
737
/* Zero every field of *SET (SET is a pointer to re_node_set). */
#define re_node_set_init_empty(set) memset (set, '\0', sizeof (re_node_set))
/* Remove ID from SET via re_node_set_remove_at, using the value returned by
   re_node_set_contains minus one as the position. */
#define re_node_set_remove(set,id) \
  (re_node_set_remove_at (set, re_node_set_contains (set, id) - 1))
/* Reset the element count of *P to zero; storage is left untouched. */
#define re_node_set_empty(p) ((p)->nelem = 0)
/* Release the element array of *SET. */
#define re_node_set_free(set) re_free ((set)->elems)
742
743
744
/* Kind tag for bracket_elem_t. */
typedef enum
{
  SB_CHAR,
  MB_CHAR,
  EQUIV_CLASS,
  COLL_SYM,
  CHAR_CLASS
} bracket_elem_type;
752
753
typedef struct
754
{
755
bracket_elem_type type;
756
union
757
{
758
unsigned char ch;
759
unsigned char *name;
760
wchar_t wch;
761
} opr;
762
} bracket_elem_t;
763
764
765
/* Inline functions for bitset operation. */
766
static inline void
767
bitset_not (bitset_t set)
768
{
769
int bitset_i;
770
for (bitset_i = 0; bitset_i < BITSET_WORDS; ++bitset_i)
771
set[bitset_i] = ~set[bitset_i];
772
}
773
774
static inline void
775
bitset_merge (bitset_t dest, const bitset_t src)
776
{
777
int bitset_i;
778
for (bitset_i = 0; bitset_i < BITSET_WORDS; ++bitset_i)
779
dest[bitset_i] |= src[bitset_i];
780
}
781
782
static inline void
783
bitset_mask (bitset_t dest, const bitset_t src)
784
{
785
int bitset_i;
786
for (bitset_i = 0; bitset_i < BITSET_WORDS; ++bitset_i)
787
dest[bitset_i] &= src[bitset_i];
788
}
789
790
#ifdef RE_ENABLE_I18N
791
/* Inline functions for re_string. */
792
static inline int
793
internal_function __attribute ((pure))
794
re_string_char_size_at (const re_string_t *pstr, int idx)
795
{
796
int byte_idx;
797
if (pstr->mb_cur_max == 1)
798
return 1;
799
for (byte_idx = 1; idx + byte_idx < pstr->valid_len; ++byte_idx)
800
if (pstr->wcs[idx + byte_idx] != WEOF)
801
break;
802
return byte_idx;
803
}
804
805
static inline wint_t
806
internal_function __attribute ((pure))
807
re_string_wchar_at (const re_string_t *pstr, int idx)
808
{
809
if (pstr->mb_cur_max == 1)
810
return (wint_t) pstr->mbs[idx];
811
return (wint_t) pstr->wcs[idx];
812
}
813
814
static int
815
internal_function __attribute ((pure))
816
re_string_elem_size_at (const re_string_t *pstr, int idx)
817
{
818
# ifdef _LIBC
819
const unsigned char *p, *extra;
820
const int32_t *table, *indirect;
821
int32_t tmp;
822
# include <locale/weight.h>
823
uint_fast32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
824
825
if (nrules != 0)
826
{
827
table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
828
extra = (const unsigned char *)
829
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
830
indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
831
_NL_COLLATE_INDIRECTMB);
832
p = pstr->mbs + idx;
833
tmp = findidx (&p);
834
return p - pstr->mbs - idx;
835
}
836
else
837
# endif /* _LIBC */
838
return 1;
839
}
840
#endif /* RE_ENABLE_I18N */
841
842
#endif /* _REGEX_INTERNAL_H */
843
844
/******************************************************************************/
845
/******************************************************************************/
846
/******************************************************************************/
847
/* GKINCLUDE #include "regex_internal.c" */
848
/******************************************************************************/
849
/******************************************************************************/
850
/******************************************************************************/
851
/* Extended regular expression matching and search library.
852
Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
853
This file is part of the GNU C Library.
854
Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
855
856
The GNU C Library is free software; you can redistribute it and/or
857
modify it under the terms of the GNU Lesser General Public
858
License as published by the Free Software Foundation; either
859
version 2.1 of the License, or (at your option) any later version.
860
861
The GNU C Library is distributed in the hope that it will be useful,
862
but WITHOUT ANY WARRANTY; without even the implied warranty of
863
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
864
Lesser General Public License for more details.
865
866
You should have received a copy of the GNU Lesser General Public
867
License along with the GNU C Library; if not, write to the Free
868
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
869
02111-1307 USA. */
870
871
static void re_string_construct_common (const char *str, int len,
872
re_string_t *pstr,
873
RE_TRANSLATE_TYPE trans, int icase,
874
const re_dfa_t *dfa) internal_function;
875
static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
876
const re_node_set *nodes,
877
unsigned int hash) internal_function;
878
static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
879
const re_node_set *nodes,
880
unsigned int context,
881
unsigned int hash) internal_function;
882
883
/* Functions for string operation. */
884
885
/* This function allocates the buffers. It is necessary to call
886
re_string_reconstruct before using the object. */
887
888
static reg_errcode_t
889
internal_function
890
re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
891
RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
892
{
893
reg_errcode_t ret;
894
int init_buf_len;
895
896
/* Ensure at least one character fits into the buffers. */
897
if (init_len < dfa->mb_cur_max)
898
init_len = dfa->mb_cur_max;
899
init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
900
re_string_construct_common (str, len, pstr, trans, icase, dfa);
901
902
ret = re_string_realloc_buffers (pstr, init_buf_len);
903
if (BE (ret != REG_NOERROR, 0))
904
return ret;
905
906
pstr->word_char = dfa->word_char;
907
pstr->word_ops_used = dfa->word_ops_used;
908
pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
909
pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
910
pstr->valid_raw_len = pstr->valid_len;
911
return REG_NOERROR;
912
}
913
914
/* This function allocates the buffers, and initializes them. */
915
916
static reg_errcode_t
917
internal_function
918
re_string_construct (re_string_t *pstr, const char *str, int len,
919
RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
920
{
921
reg_errcode_t ret;
922
memset (pstr, '\0', sizeof (re_string_t));
923
re_string_construct_common (str, len, pstr, trans, icase, dfa);
924
925
if (len > 0)
926
{
927
ret = re_string_realloc_buffers (pstr, len + 1);
928
if (BE (ret != REG_NOERROR, 0))
929
return ret;
930
}
931
pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
932
933
if (icase)
934
{
935
#ifdef RE_ENABLE_I18N
936
if (dfa->mb_cur_max > 1)
937
{
938
while (1)
939
{
940
ret = build_wcs_upper_buffer (pstr);
941
if (BE (ret != REG_NOERROR, 0))
942
return ret;
943
if (pstr->valid_raw_len >= len)
944
break;
945
if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
946
break;
947
ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
948
if (BE (ret != REG_NOERROR, 0))
949
return ret;
950
}
951
}
952
else
953
#endif /* RE_ENABLE_I18N */
954
build_upper_buffer (pstr);
955
}
956
else
957
{
958
#ifdef RE_ENABLE_I18N
959
if (dfa->mb_cur_max > 1)
960
build_wcs_buffer (pstr);
961
else
962
#endif /* RE_ENABLE_I18N */
963
{
964
if (trans != NULL)
965
re_string_translate_buffer (pstr);
966
else
967
{
968
pstr->valid_len = pstr->bufs_len;
969
pstr->valid_raw_len = pstr->bufs_len;
970
}
971
}
972
}
973
974
return REG_NOERROR;
975
}
976
977
/* Helper functions for re_string_allocate, and re_string_construct. */
978
979
static reg_errcode_t
980
internal_function
981
re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
982
{
983
#ifdef RE_ENABLE_I18N
984
if (pstr->mb_cur_max > 1)
985
{
986
wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
987
if (BE (new_wcs == NULL, 0))
988
return REG_ESPACE;
989
pstr->wcs = new_wcs;
990
if (pstr->offsets != NULL)
991
{
992
int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
993
if (BE (new_offsets == NULL, 0))
994
return REG_ESPACE;
995
pstr->offsets = new_offsets;
996
}
997
}
998
#endif /* RE_ENABLE_I18N */
999
if (pstr->mbs_allocated)
1000
{
1001
unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
1002
new_buf_len);
1003
if (BE (new_mbs == NULL, 0))
1004
return REG_ESPACE;
1005
pstr->mbs = new_mbs;
1006
}
1007
pstr->bufs_len = new_buf_len;
1008
return REG_NOERROR;
1009
}
1010
1011
1012
static void
1013
internal_function
1014
re_string_construct_common (const char *str, int len, re_string_t *pstr,
1015
RE_TRANSLATE_TYPE trans, int icase,
1016
const re_dfa_t *dfa)
1017
{
1018
pstr->raw_mbs = (const unsigned char *) str;
1019
pstr->len = len;
1020
pstr->raw_len = len;
1021
pstr->trans = trans;
1022
pstr->icase = icase ? 1 : 0;
1023
pstr->mbs_allocated = (trans != NULL || icase);
1024
pstr->mb_cur_max = dfa->mb_cur_max;
1025
pstr->is_utf8 = dfa->is_utf8;
1026
pstr->map_notascii = dfa->map_notascii;
1027
pstr->stop = pstr->len;
1028
pstr->raw_stop = pstr->stop;
1029
}
1030
1031
#ifdef RE_ENABLE_I18N
1032
1033
/* Build wide character buffer PSTR->WCS.
1034
If the byte sequence of the string are:
1035
<mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
1036
Then wide character buffer will be:
1037
<wc1> , WEOF , <wc2> , WEOF , <wc3>
1038
We use WEOF for padding, they indicate that the position isn't
1039
a first byte of a multibyte character.
1040
1041
Note that this function assumes PSTR->VALID_LEN elements are already
1042
built and starts from PSTR->VALID_LEN. */
1043
1044
static void
1045
internal_function
1046
build_wcs_buffer (re_string_t *pstr)
1047
{
1048
#ifdef _LIBC
1049
unsigned char buf[MB_LEN_MAX];
1050
assert (MB_LEN_MAX >= pstr->mb_cur_max);
1051
#else
1052
unsigned char buf[64];
1053
#endif
1054
mbstate_t prev_st;
1055
int byte_idx, end_idx, remain_len;
1056
size_t mbclen;
1057
1058
/* Build the buffers from pstr->valid_len to either pstr->len or
1059
pstr->bufs_len. */
1060
end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
1061
for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
1062
{
1063
wchar_t wc;
1064
const char *p;
1065
1066
remain_len = end_idx - byte_idx;
1067
prev_st = pstr->cur_state;
1068
/* Apply the translation if we need. */
1069
if (BE (pstr->trans != NULL, 0))
1070
{
1071
int i, ch;
1072
1073
for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
1074
{
1075
ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
1076
buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
1077
}
1078
p = (const char *) buf;
1079
}
1080
else
1081
p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
1082
mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
1083
if (BE (mbclen == (size_t) -2, 0))
1084
{
1085
/* The buffer doesn't have enough space, finish to build. */
1086
pstr->cur_state = prev_st;
1087
break;
1088
}
1089
else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
1090
{
1091
/* We treat these cases as a singlebyte character. */
1092
mbclen = 1;
1093
wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
1094
if (BE (pstr->trans != NULL, 0))
1095
wc = pstr->trans[wc];
1096
pstr->cur_state = prev_st;
1097
}
1098
1099
/* Write wide character and padding. */
1100
pstr->wcs[byte_idx++] = wc;
1101
/* Write paddings. */
1102
for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
1103
pstr->wcs[byte_idx++] = WEOF;
1104
}
1105
pstr->valid_len = byte_idx;
1106
pstr->valid_raw_len = byte_idx;
1107
}
1108
1109
/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
1110
but for REG_ICASE. */
1111
1112
static reg_errcode_t
1113
internal_function
1114
build_wcs_upper_buffer (re_string_t *pstr)
1115
{
1116
mbstate_t prev_st;
1117
int src_idx, byte_idx, end_idx, remain_len;
1118
size_t mbclen;
1119
#ifdef _LIBC
1120
char buf[MB_LEN_MAX];
1121
assert (MB_LEN_MAX >= pstr->mb_cur_max);
1122
#else
1123
char buf[64];
1124
#endif
1125
1126
byte_idx = pstr->valid_len;
1127
end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
1128
1129
/* The following optimization assumes that ASCII characters can be
1130
mapped to wide characters with a simple cast. */
1131
if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
1132
{
1133
while (byte_idx < end_idx)
1134
{
1135
wchar_t wc;
1136
1137
if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
1138
&& mbsinit (&pstr->cur_state))
1139
{
1140
/* In case of a singlebyte character. */
1141
pstr->mbs[byte_idx]
1142
= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
1143
/* The next step uses the assumption that wchar_t is encoded
1144
ASCII-safe: all ASCII values can be converted like this. */
1145
pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
1146
++byte_idx;
1147
continue;
1148
}
1149
1150
remain_len = end_idx - byte_idx;
1151
prev_st = pstr->cur_state;
1152
mbclen = mbrtowc (&wc,
1153
((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
1154
+ byte_idx), remain_len, &pstr->cur_state);
1155
if (BE (mbclen + 2 > 2, 1))
1156
{
1157
wchar_t wcu = wc;
1158
if (iswlower (wc))
1159
{
1160
size_t mbcdlen;
1161
1162
wcu = towupper (wc);
1163
mbcdlen = wcrtomb (buf, wcu, &prev_st);
1164
if (BE (mbclen == mbcdlen, 1))
1165
memcpy (pstr->mbs + byte_idx, buf, mbclen);
1166
else
1167
{
1168
src_idx = byte_idx;
1169
goto offsets_needed;
1170
}
1171
}
1172
else
1173
memcpy (pstr->mbs + byte_idx,
1174
pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
1175
pstr->wcs[byte_idx++] = wcu;
1176
/* Write paddings. */
1177
for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
1178
pstr->wcs[byte_idx++] = WEOF;
1179
}
1180
else if (mbclen == (size_t) -1 || mbclen == 0)
1181
{
1182
/* It is an invalid character or '\0'. Just use the byte. */
1183
int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
1184
pstr->mbs[byte_idx] = ch;
1185
/* And also cast it to wide char. */
1186
pstr->wcs[byte_idx++] = (wchar_t) ch;
1187
if (BE (mbclen == (size_t) -1, 0))
1188
pstr->cur_state = prev_st;
1189
}
1190
else
1191
{
1192
/* The buffer doesn't have enough space, finish to build. */
1193
pstr->cur_state = prev_st;
1194
break;
1195
}
1196
}
1197
pstr->valid_len = byte_idx;
1198
pstr->valid_raw_len = byte_idx;
1199
return REG_NOERROR;
1200
}
1201
else
1202
for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
1203
{
1204
wchar_t wc;
1205
const char *p;
1206
offsets_needed:
1207
remain_len = end_idx - byte_idx;
1208
prev_st = pstr->cur_state;
1209
if (BE (pstr->trans != NULL, 0))
1210
{
1211
int i, ch;
1212
1213
for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
1214
{
1215
ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
1216
buf[i] = pstr->trans[ch];
1217
}
1218
p = (const char *) buf;
1219
}
1220
else
1221
p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
1222
mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
1223
if (BE (mbclen + 2 > 2, 1))
1224
{
1225
wchar_t wcu = wc;
1226
if (iswlower (wc))
1227
{
1228
size_t mbcdlen;
1229
1230
wcu = towupper (wc);
1231
mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
1232
if (BE (mbclen == mbcdlen, 1))
1233
memcpy (pstr->mbs + byte_idx, buf, mbclen);
1234
else if (mbcdlen != (size_t) -1)
1235
{
1236
size_t i;
1237
1238
if (byte_idx + mbcdlen > pstr->bufs_len)
1239
{
1240
pstr->cur_state = prev_st;
1241
break;
1242
}
1243
1244
if (pstr->offsets == NULL)
1245
{
1246
pstr->offsets = re_malloc (int, pstr->bufs_len);
1247
1248
if (pstr->offsets == NULL)
1249
return REG_ESPACE;
1250
}
1251
if (!pstr->offsets_needed)
1252
{
1253
for (i = 0; i < (size_t) byte_idx; ++i)
1254
pstr->offsets[i] = i;
1255
pstr->offsets_needed = 1;
1256
}
1257
1258
memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
1259
pstr->wcs[byte_idx] = wcu;
1260
pstr->offsets[byte_idx] = src_idx;
1261
for (i = 1; i < mbcdlen; ++i)
1262
{
1263
pstr->offsets[byte_idx + i]
1264
= src_idx + (i < mbclen ? i : mbclen - 1);
1265
pstr->wcs[byte_idx + i] = WEOF;
1266
}
1267
pstr->len += mbcdlen - mbclen;
1268
if (pstr->raw_stop > src_idx)
1269
pstr->stop += mbcdlen - mbclen;
1270
end_idx = (pstr->bufs_len > pstr->len)
1271
? pstr->len : pstr->bufs_len;
1272
byte_idx += mbcdlen;
1273
src_idx += mbclen;
1274
continue;
1275
}
1276
else
1277
memcpy (pstr->mbs + byte_idx, p, mbclen);
1278
}
1279
else
1280
memcpy (pstr->mbs + byte_idx, p, mbclen);
1281
1282
if (BE (pstr->offsets_needed != 0, 0))
1283
{
1284
size_t i;
1285
for (i = 0; i < mbclen; ++i)
1286
pstr->offsets[byte_idx + i] = src_idx + i;
1287
}
1288
src_idx += mbclen;
1289
1290
pstr->wcs[byte_idx++] = wcu;
1291
/* Write paddings. */
1292
for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
1293
pstr->wcs[byte_idx++] = WEOF;
1294
}
1295
else if (mbclen == (size_t) -1 || mbclen == 0)
1296
{
1297
/* It is an invalid character or '\0'. Just use the byte. */
1298
int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
1299
1300
if (BE (pstr->trans != NULL, 0))
1301
ch = pstr->trans [ch];
1302
pstr->mbs[byte_idx] = ch;
1303
1304
if (BE (pstr->offsets_needed != 0, 0))
1305
pstr->offsets[byte_idx] = src_idx;
1306
++src_idx;
1307
1308
/* And also cast it to wide char. */
1309
pstr->wcs[byte_idx++] = (wchar_t) ch;
1310
if (BE (mbclen == (size_t) -1, 0))
1311
pstr->cur_state = prev_st;
1312
}
1313
else
1314
{
1315
/* The buffer doesn't have enough space, finish to build. */
1316
pstr->cur_state = prev_st;
1317
break;
1318
}
1319
}
1320
pstr->valid_len = byte_idx;
1321
pstr->valid_raw_len = src_idx;
1322
return REG_NOERROR;
1323
}
1324
1325
/* Skip characters until the index becomes greater than NEW_RAW_IDX.
1326
Return the index. */
1327
1328
static int
1329
internal_function
1330
re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
1331
{
1332
mbstate_t prev_st;
1333
int rawbuf_idx;
1334
size_t mbclen;
1335
wchar_t wc = WEOF;
1336
1337
/* Skip the characters which are not necessary to check. */
1338
for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
1339
rawbuf_idx < new_raw_idx;)
1340
{
1341
int remain_len;
1342
remain_len = pstr->len - rawbuf_idx;
1343
prev_st = pstr->cur_state;
1344
mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
1345
remain_len, &pstr->cur_state);
1346
if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
1347
{
1348
/* We treat these cases as a single byte character. */
1349
if (mbclen == 0 || remain_len == 0)
1350
wc = L'\0';
1351
else
1352
wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
1353
mbclen = 1;
1354
pstr->cur_state = prev_st;
1355
}
1356
/* Then proceed the next character. */
1357
rawbuf_idx += mbclen;
1358
}
1359
*last_wc = (wint_t) wc;
1360
return rawbuf_idx;
1361
}
1362
#endif /* RE_ENABLE_I18N */
1363
1364
/* Build the buffer PSTR->MBS, and apply the translation if we need.
1365
This function is used in case of REG_ICASE. */
1366
1367
static void
1368
internal_function
1369
build_upper_buffer (re_string_t *pstr)
1370
{
1371
int char_idx, end_idx;
1372
end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
1373
1374
for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
1375
{
1376
int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
1377
if (BE (pstr->trans != NULL, 0))
1378
ch = pstr->trans[ch];
1379
if (islower (ch))
1380
pstr->mbs[char_idx] = toupper (ch);
1381
else
1382
pstr->mbs[char_idx] = ch;
1383
}
1384
pstr->valid_len = char_idx;
1385
pstr->valid_raw_len = char_idx;
1386
}
1387
1388
/* Apply TRANS to the buffer in PSTR. */
1389
1390
static void
1391
internal_function
1392
re_string_translate_buffer (re_string_t *pstr)
1393
{
1394
int buf_idx, end_idx;
1395
end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
1396
1397
for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
1398
{
1399
int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
1400
pstr->mbs[buf_idx] = pstr->trans[ch];
1401
}
1402
1403
pstr->valid_len = buf_idx;
1404
pstr->valid_raw_len = buf_idx;
1405
}
1406
1407
/* This function re-construct the buffers.
1408
Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
1409
convert to upper case in case of REG_ICASE, apply translation. */
1410
1411
static reg_errcode_t
1412
internal_function
1413
re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
1414
{
1415
int offset = idx - pstr->raw_mbs_idx;
1416
if (BE (offset < 0, 0))
1417
{
1418
/* Reset buffer. */
1419
#ifdef RE_ENABLE_I18N
1420
if (pstr->mb_cur_max > 1)
1421
memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
1422
#endif /* RE_ENABLE_I18N */
1423
pstr->len = pstr->raw_len;
1424
pstr->stop = pstr->raw_stop;
1425
pstr->valid_len = 0;
1426
pstr->raw_mbs_idx = 0;
1427
pstr->valid_raw_len = 0;
1428
pstr->offsets_needed = 0;
1429
pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
1430
: CONTEXT_NEWLINE | CONTEXT_BEGBUF);
1431
if (!pstr->mbs_allocated)
1432
pstr->mbs = (unsigned char *) pstr->raw_mbs;
1433
offset = idx;
1434
}
1435
1436
if (BE (offset != 0, 1))
1437
{
1438
/* Should the already checked characters be kept? */
1439
if (BE (offset < pstr->valid_raw_len, 1))
1440
{
1441
/* Yes, move them to the front of the buffer. */
1442
#ifdef RE_ENABLE_I18N
1443
if (BE (pstr->offsets_needed, 0))
1444
{
1445
int low = 0, high = pstr->valid_len, mid;
1446
do
1447
{
1448
mid = (high + low) / 2;
1449
if (pstr->offsets[mid] > offset)
1450
high = mid;
1451
else if (pstr->offsets[mid] < offset)
1452
low = mid + 1;
1453
else
1454
break;
1455
}
1456
while (low < high);
1457
if (pstr->offsets[mid] < offset)
1458
++mid;
1459
pstr->tip_context = re_string_context_at (pstr, mid - 1,
1460
eflags);
1461
/* This can be quite complicated, so handle specially
1462
only the common and easy case where the character with
1463
different length representation of lower and upper
1464
case is present at or after offset. */
1465
if (pstr->valid_len > offset
1466
&& mid == offset && pstr->offsets[mid] == offset)
1467
{
1468
memmove (pstr->wcs, pstr->wcs + offset,
1469
(pstr->valid_len - offset) * sizeof (wint_t));
1470
memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
1471
pstr->valid_len -= offset;
1472
pstr->valid_raw_len -= offset;
1473
for (low = 0; low < pstr->valid_len; low++)
1474
pstr->offsets[low] = pstr->offsets[low + offset] - offset;
1475
}
1476
else
1477
{
1478
/* Otherwise, just find out how long the partial multibyte
1479
character at offset is and fill it with WEOF/255. */
1480
pstr->len = pstr->raw_len - idx + offset;
1481
pstr->stop = pstr->raw_stop - idx + offset;
1482
pstr->offsets_needed = 0;
1483
while (mid > 0 && pstr->offsets[mid - 1] == offset)
1484
--mid;
1485
while (mid < pstr->valid_len)
1486
if (pstr->wcs[mid] != WEOF)
1487
break;
1488
else
1489
++mid;
1490
if (mid == pstr->valid_len)
1491
pstr->valid_len = 0;
1492
else
1493
{
1494
pstr->valid_len = pstr->offsets[mid] - offset;
1495
if (pstr->valid_len)
1496
{
1497
for (low = 0; low < pstr->valid_len; ++low)
1498
pstr->wcs[low] = WEOF;
1499
memset (pstr->mbs, 255, pstr->valid_len);
1500
}
1501
}
1502
pstr->valid_raw_len = pstr->valid_len;
1503
}
1504
}
1505
else
1506
#endif
1507
{
1508
pstr->tip_context = re_string_context_at (pstr, offset - 1,
1509
eflags);
1510
#ifdef RE_ENABLE_I18N
1511
if (pstr->mb_cur_max > 1)
1512
memmove (pstr->wcs, pstr->wcs + offset,
1513
(pstr->valid_len - offset) * sizeof (wint_t));
1514
#endif /* RE_ENABLE_I18N */
1515
if (BE (pstr->mbs_allocated, 0))
1516
memmove (pstr->mbs, pstr->mbs + offset,
1517
pstr->valid_len - offset);
1518
pstr->valid_len -= offset;
1519
pstr->valid_raw_len -= offset;
1520
#if DEBUG
1521
assert (pstr->valid_len > 0);
1522
#endif
1523
}
1524
}
1525
else
1526
{
1527
/* No, skip all characters until IDX. */
1528
int prev_valid_len = pstr->valid_len;
1529
1530
#ifdef RE_ENABLE_I18N
1531
if (BE (pstr->offsets_needed, 0))
1532
{
1533
pstr->len = pstr->raw_len - idx + offset;
1534
pstr->stop = pstr->raw_stop - idx + offset;
1535
pstr->offsets_needed = 0;
1536
}
1537
#endif
1538
pstr->valid_len = 0;
1539
#ifdef RE_ENABLE_I18N
1540
if (pstr->mb_cur_max > 1)
1541
{
1542
int wcs_idx;
1543
wint_t wc = WEOF;
1544
1545
if (pstr->is_utf8)
1546
{
1547
const unsigned char *raw, *p, *q, *end;
1548
1549
/* Special case UTF-8. Multi-byte chars start with any
1550
byte other than 0x80 - 0xbf. */
1551
raw = pstr->raw_mbs + pstr->raw_mbs_idx;
1552
end = raw + (offset - pstr->mb_cur_max);
1553
if (end < pstr->raw_mbs)
1554
end = pstr->raw_mbs;
1555
p = raw + offset - 1;
1556
#ifdef _LIBC
1557
/* We know the wchar_t encoding is UCS4, so for the simple
1558
case, ASCII characters, skip the conversion step. */
1559
if (isascii (*p) && BE (pstr->trans == NULL, 1))
1560
{
1561
memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
1562
/* pstr->valid_len = 0; */
1563
wc = (wchar_t) *p;
1564
}
1565
else
1566
#endif
1567
for (; p >= end; --p)
1568
if ((*p & 0xc0) != 0x80)
1569
{
1570
mbstate_t cur_state;
1571
wchar_t wc2;
1572
int mlen = raw + pstr->len - p;
1573
unsigned char buf[6];
1574
size_t mbclen;
1575
1576
q = p;
1577
if (BE (pstr->trans != NULL, 0))
1578
{
1579
int i = mlen < 6 ? mlen : 6;
1580
while (--i >= 0)
1581
buf[i] = pstr->trans[p[i]];
1582
q = buf;
1583
}
1584
/* XXX Don't use mbrtowc, we know which conversion
1585
to use (UTF-8 -> UCS4). */
1586
memset (&cur_state, 0, sizeof (cur_state));
1587
mbclen = mbrtowc (&wc2, (const char *) p, mlen,
1588
&cur_state);
1589
if (raw + offset - p <= mbclen
1590
&& mbclen < (size_t) -2)
1591
{
1592
memset (&pstr->cur_state, '\0',
1593
sizeof (mbstate_t));
1594
pstr->valid_len = mbclen - (raw + offset - p);
1595
wc = wc2;
1596
}
1597
break;
1598
}
1599
}
1600
1601
if (wc == WEOF)
1602
pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
1603
if (wc == WEOF)
1604
pstr->tip_context
1605
= re_string_context_at (pstr, prev_valid_len - 1, eflags);
1606
else
1607
pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
1608
&& IS_WIDE_WORD_CHAR (wc))
1609
? CONTEXT_WORD
1610
: ((IS_WIDE_NEWLINE (wc)
1611
&& pstr->newline_anchor)
1612
? CONTEXT_NEWLINE : 0));
1613
if (BE (pstr->valid_len, 0))
1614
{
1615
for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
1616
pstr->wcs[wcs_idx] = WEOF;
1617
if (pstr->mbs_allocated)
1618
memset (pstr->mbs, 255, pstr->valid_len);
1619
}
1620
pstr->valid_raw_len = pstr->valid_len;
1621
}
1622
else
1623
#endif /* RE_ENABLE_I18N */
1624
{
1625
int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
1626
pstr->valid_raw_len = 0;
1627
if (pstr->trans)
1628
c = pstr->trans[c];
1629
pstr->tip_context = (bitset_contain (pstr->word_char, c)
1630
? CONTEXT_WORD
1631
: ((IS_NEWLINE (c) && pstr->newline_anchor)
1632
? CONTEXT_NEWLINE : 0));
1633
}
1634
}
1635
if (!BE (pstr->mbs_allocated, 0))
1636
pstr->mbs += offset;
1637
}
1638
pstr->raw_mbs_idx = idx;
1639
pstr->len -= offset;
1640
pstr->stop -= offset;
1641
1642
/* Then build the buffers. */
1643
#ifdef RE_ENABLE_I18N
1644
if (pstr->mb_cur_max > 1)
1645
{
1646
if (pstr->icase)
1647
{
1648
reg_errcode_t ret = build_wcs_upper_buffer (pstr);
1649
if (BE (ret != REG_NOERROR, 0))
1650
return ret;
1651
}
1652
else
1653
build_wcs_buffer (pstr);
1654
}
1655
else
1656
#endif /* RE_ENABLE_I18N */
1657
if (BE (pstr->mbs_allocated, 0))
1658
{
1659
if (pstr->icase)
1660
build_upper_buffer (pstr);
1661
else if (pstr->trans != NULL)
1662
re_string_translate_buffer (pstr);
1663
}
1664
else
1665
pstr->valid_len = pstr->len;
1666
1667
pstr->cur_idx = 0;
1668
return REG_NOERROR;
1669
}
1670
1671
static unsigned char
1672
internal_function __attribute ((pure))
1673
re_string_peek_byte_case (const re_string_t *pstr, int idx)
1674
{
1675
int ch, off;
1676
1677
/* Handle the common (easiest) cases first. */
1678
if (BE (!pstr->mbs_allocated, 1))
1679
return re_string_peek_byte (pstr, idx);
1680
1681
#ifdef RE_ENABLE_I18N
1682
if (pstr->mb_cur_max > 1
1683
&& ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
1684
return re_string_peek_byte (pstr, idx);
1685
#endif
1686
1687
off = pstr->cur_idx + idx;
1688
#ifdef RE_ENABLE_I18N
1689
if (pstr->offsets_needed)
1690
off = pstr->offsets[off];
1691
#endif
1692
1693
ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
1694
1695
#ifdef RE_ENABLE_I18N
1696
/* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
1697
this function returns CAPITAL LETTER I instead of first byte of
1698
DOTLESS SMALL LETTER I. The latter would confuse the parser,
1699
since peek_byte_case doesn't advance cur_idx in any way. */
1700
if (pstr->offsets_needed && !isascii (ch))
1701
return re_string_peek_byte (pstr, idx);
1702
#endif
1703
1704
return ch;
1705
}
1706
1707
static unsigned char
1708
internal_function __attribute ((pure))
1709
re_string_fetch_byte_case (re_string_t *pstr)
1710
{
1711
if (BE (!pstr->mbs_allocated, 1))
1712
return re_string_fetch_byte (pstr);
1713
1714
#ifdef RE_ENABLE_I18N
1715
if (pstr->offsets_needed)
1716
{
1717
int off, ch;
1718
1719
/* For tr_TR.UTF-8 [[:islower:]] there is
1720
[[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
1721
in that case the whole multi-byte character and return
1722
the original letter. On the other side, with
1723
[[: DOTLESS SMALL LETTER I return [[:I, as doing
1724
anything else would complicate things too much. */
1725
1726
if (!re_string_first_byte (pstr, pstr->cur_idx))
1727
return re_string_fetch_byte (pstr);
1728
1729
off = pstr->offsets[pstr->cur_idx];
1730
ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
1731
1732
if (! isascii (ch))
1733
return re_string_fetch_byte (pstr);
1734
1735
re_string_skip_bytes (pstr,
1736
re_string_char_size_at (pstr, pstr->cur_idx));
1737
return ch;
1738
}
1739
#endif
1740
1741
return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
1742
}
1743
1744
static void
1745
internal_function
1746
re_string_destruct (re_string_t *pstr)
1747
{
1748
#ifdef RE_ENABLE_I18N
1749
re_free (pstr->wcs);
1750
re_free (pstr->offsets);
1751
#endif /* RE_ENABLE_I18N */
1752
if (pstr->mbs_allocated)
1753
re_free (pstr->mbs);
1754
}
1755
1756
/* Return the context at IDX in INPUT. */
1757
1758
static unsigned int
1759
internal_function
1760
re_string_context_at (const re_string_t *input, int idx, int eflags)
1761
{
1762
int c;
1763
if (BE (idx < 0, 0))
1764
/* In this case, we use the value stored in input->tip_context,
1765
since we can't know the character in input->mbs[-1] here. */
1766
return input->tip_context;
1767
if (BE (idx == input->len, 0))
1768
return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
1769
: CONTEXT_NEWLINE | CONTEXT_ENDBUF);
1770
#ifdef RE_ENABLE_I18N
1771
if (input->mb_cur_max > 1)
1772
{
1773
wint_t wc;
1774
int wc_idx = idx;
1775
while(input->wcs[wc_idx] == WEOF)
1776
{
1777
#ifdef DEBUG
1778
/* It must not happen. */
1779
assert (wc_idx >= 0);
1780
#endif
1781
--wc_idx;
1782
if (wc_idx < 0)
1783
return input->tip_context;
1784
}
1785
wc = input->wcs[wc_idx];
1786
if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
1787
return CONTEXT_WORD;
1788
return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
1789
? CONTEXT_NEWLINE : 0);
1790
}
1791
else
1792
#endif
1793
{
1794
c = re_string_byte_at (input, idx);
1795
if (bitset_contain (input->word_char, c))
1796
return CONTEXT_WORD;
1797
return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
1798
}
1799
}
1800
1801
/* Functions for set operations. */
1802
1803
static reg_errcode_t
1804
internal_function
1805
re_node_set_alloc (re_node_set *set, int size)
1806
{
1807
set->alloc = size;
1808
set->nelem = 0;
1809
set->elems = re_malloc (int, size);
1810
if (BE (set->elems == NULL, 0))
1811
return REG_ESPACE;
1812
return REG_NOERROR;
1813
}
1814
1815
static reg_errcode_t
1816
internal_function
1817
re_node_set_init_1 (re_node_set *set, int elem)
1818
{
1819
set->alloc = 1;
1820
set->nelem = 1;
1821
set->elems = re_malloc (int, 1);
1822
if (BE (set->elems == NULL, 0))
1823
{
1824
set->alloc = set->nelem = 0;
1825
return REG_ESPACE;
1826
}
1827
set->elems[0] = elem;
1828
return REG_NOERROR;
1829
}
1830
1831
static reg_errcode_t
1832
internal_function
1833
re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
1834
{
1835
set->alloc = 2;
1836
set->elems = re_malloc (int, 2);
1837
if (BE (set->elems == NULL, 0))
1838
return REG_ESPACE;
1839
if (elem1 == elem2)
1840
{
1841
set->nelem = 1;
1842
set->elems[0] = elem1;
1843
}
1844
else
1845
{
1846
set->nelem = 2;
1847
if (elem1 < elem2)
1848
{
1849
set->elems[0] = elem1;
1850
set->elems[1] = elem2;
1851
}
1852
else
1853
{
1854
set->elems[0] = elem2;
1855
set->elems[1] = elem1;
1856
}
1857
}
1858
return REG_NOERROR;
1859
}
1860
1861
static reg_errcode_t
1862
internal_function
1863
re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1864
{
1865
dest->nelem = src->nelem;
1866
if (src->nelem > 0)
1867
{
1868
dest->alloc = dest->nelem;
1869
dest->elems = re_malloc (int, dest->alloc);
1870
if (BE (dest->elems == NULL, 0))
1871
{
1872
dest->alloc = dest->nelem = 0;
1873
return REG_ESPACE;
1874
}
1875
memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1876
}
1877
else
1878
re_node_set_init_empty (dest);
1879
return REG_NOERROR;
1880
}
1881
1882
/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1883
DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1884
Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1885
1886
static reg_errcode_t
1887
internal_function
1888
re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1889
const re_node_set *src2)
1890
{
1891
int i1, i2, is, id, delta, sbase;
1892
if (src1->nelem == 0 || src2->nelem == 0)
1893
return REG_NOERROR;
1894
1895
/* We need dest->nelem + 2 * elems_in_intersection; this is a
1896
conservative estimate. */
1897
if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1898
{
1899
int new_alloc = src1->nelem + src2->nelem + dest->alloc;
1900
int *new_elems = re_realloc (dest->elems, int, new_alloc);
1901
if (BE (new_elems == NULL, 0))
1902
return REG_ESPACE;
1903
dest->elems = new_elems;
1904
dest->alloc = new_alloc;
1905
}
1906
1907
/* Find the items in the intersection of SRC1 and SRC2, and copy
1908
into the top of DEST those that are not already in DEST itself. */
1909
sbase = dest->nelem + src1->nelem + src2->nelem;
1910
i1 = src1->nelem - 1;
1911
i2 = src2->nelem - 1;
1912
id = dest->nelem - 1;
1913
for (;;)
1914
{
1915
if (src1->elems[i1] == src2->elems[i2])
1916
{
1917
/* Try to find the item in DEST. Maybe we could binary search? */
1918
while (id >= 0 && dest->elems[id] > src1->elems[i1])
1919
--id;
1920
1921
if (id < 0 || dest->elems[id] != src1->elems[i1])
1922
dest->elems[--sbase] = src1->elems[i1];
1923
1924
if (--i1 < 0 || --i2 < 0)
1925
break;
1926
}
1927
1928
/* Lower the highest of the two items. */
1929
else if (src1->elems[i1] < src2->elems[i2])
1930
{
1931
if (--i2 < 0)
1932
break;
1933
}
1934
else
1935
{
1936
if (--i1 < 0)
1937
break;
1938
}
1939
}
1940
1941
id = dest->nelem - 1;
1942
is = dest->nelem + src1->nelem + src2->nelem - 1;
1943
delta = is - sbase + 1;
1944
1945
/* Now copy. When DELTA becomes zero, the remaining
1946
DEST elements are already in place; this is more or
1947
less the same loop that is in re_node_set_merge. */
1948
dest->nelem += delta;
1949
if (delta > 0 && id >= 0)
1950
for (;;)
1951
{
1952
if (dest->elems[is] > dest->elems[id])
1953
{
1954
/* Copy from the top. */
1955
dest->elems[id + delta--] = dest->elems[is--];
1956
if (delta == 0)
1957
break;
1958
}
1959
else
1960
{
1961
/* Slide from the bottom. */
1962
dest->elems[id + delta] = dest->elems[id];
1963
if (--id < 0)
1964
break;
1965
}
1966
}
1967
1968
/* Copy remaining SRC elements. */
1969
memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1970
1971
return REG_NOERROR;
1972
}
1973
1974
/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1975
DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1976
1977
static reg_errcode_t
1978
internal_function
1979
re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1980
const re_node_set *src2)
1981
{
1982
int i1, i2, id;
1983
if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1984
{
1985
dest->alloc = src1->nelem + src2->nelem;
1986
dest->elems = re_malloc (int, dest->alloc);
1987
if (BE (dest->elems == NULL, 0))
1988
return REG_ESPACE;
1989
}
1990
else
1991
{
1992
if (src1 != NULL && src1->nelem > 0)
1993
return re_node_set_init_copy (dest, src1);
1994
else if (src2 != NULL && src2->nelem > 0)
1995
return re_node_set_init_copy (dest, src2);
1996
else
1997
re_node_set_init_empty (dest);
1998
return REG_NOERROR;
1999
}
2000
for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
2001
{
2002
if (src1->elems[i1] > src2->elems[i2])
2003
{
2004
dest->elems[id++] = src2->elems[i2++];
2005
continue;
2006
}
2007
if (src1->elems[i1] == src2->elems[i2])
2008
++i2;
2009
dest->elems[id++] = src1->elems[i1++];
2010
}
2011
if (i1 < src1->nelem)
2012
{
2013
memcpy (dest->elems + id, src1->elems + i1,
2014
(src1->nelem - i1) * sizeof (int));
2015
id += src1->nelem - i1;
2016
}
2017
else if (i2 < src2->nelem)
2018
{
2019
memcpy (dest->elems + id, src2->elems + i2,
2020
(src2->nelem - i2) * sizeof (int));
2021
id += src2->nelem - i2;
2022
}
2023
dest->nelem = id;
2024
return REG_NOERROR;
2025
}
2026
2027
/* Calculate the union set of the sets DEST and SRC. And store it to
2028
DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
2029
2030
static reg_errcode_t
2031
internal_function
2032
re_node_set_merge (re_node_set *dest, const re_node_set *src)
2033
{
2034
int is, id, sbase, delta;
2035
if (src == NULL || src->nelem == 0)
2036
return REG_NOERROR;
2037
if (dest->alloc < 2 * src->nelem + dest->nelem)
2038
{
2039
int new_alloc = 2 * (src->nelem + dest->alloc);
2040
int *new_buffer = re_realloc (dest->elems, int, new_alloc);
2041
if (BE (new_buffer == NULL, 0))
2042
return REG_ESPACE;
2043
dest->elems = new_buffer;
2044
dest->alloc = new_alloc;
2045
}
2046
2047
if (BE (dest->nelem == 0, 0))
2048
{
2049
dest->nelem = src->nelem;
2050
memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
2051
return REG_NOERROR;
2052
}
2053
2054
/* Copy into the top of DEST the items of SRC that are not
2055
found in DEST. Maybe we could binary search in DEST? */
2056
for (sbase = dest->nelem + 2 * src->nelem,
2057
is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
2058
{
2059
if (dest->elems[id] == src->elems[is])
2060
is--, id--;
2061
else if (dest->elems[id] < src->elems[is])
2062
dest->elems[--sbase] = src->elems[is--];
2063
else /* if (dest->elems[id] > src->elems[is]) */
2064
--id;
2065
}
2066
2067
if (is >= 0)
2068
{
2069
/* If DEST is exhausted, the remaining items of SRC must be unique. */
2070
sbase -= is + 1;
2071
memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
2072
}
2073
2074
id = dest->nelem - 1;
2075
is = dest->nelem + 2 * src->nelem - 1;
2076
delta = is - sbase + 1;
2077
if (delta == 0)
2078
return REG_NOERROR;
2079
2080
/* Now copy. When DELTA becomes zero, the remaining
2081
DEST elements are already in place. */
2082
dest->nelem += delta;
2083
for (;;)
2084
{
2085
if (dest->elems[is] > dest->elems[id])
2086
{
2087
/* Copy from the top. */
2088
dest->elems[id + delta--] = dest->elems[is--];
2089
if (delta == 0)
2090
break;
2091
}
2092
else
2093
{
2094
/* Slide from the bottom. */
2095
dest->elems[id + delta] = dest->elems[id];
2096
if (--id < 0)
2097
{
2098
/* Copy remaining SRC elements. */
2099
memcpy (dest->elems, dest->elems + sbase,
2100
delta * sizeof (int));
2101
break;
2102
}
2103
}
2104
}
2105
2106
return REG_NOERROR;
2107
}
2108
2109
/* Insert the new element ELEM to the re_node_set* SET.
2110
SET should not already have ELEM.
2111
return -1 if an error is occured, return 1 otherwise. */
2112
2113
static int
2114
internal_function
2115
re_node_set_insert (re_node_set *set, int elem)
2116
{
2117
int idx;
2118
/* In case the set is empty. */
2119
if (set->alloc == 0)
2120
{
2121
if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
2122
return 1;
2123
else
2124
return -1;
2125
}
2126
2127
if (BE (set->nelem, 0) == 0)
2128
{
2129
/* We already guaranteed above that set->alloc != 0. */
2130
set->elems[0] = elem;
2131
++set->nelem;
2132
return 1;
2133
}
2134
2135
/* Realloc if we need. */
2136
if (set->alloc == set->nelem)
2137
{
2138
int *new_elems;
2139
set->alloc = set->alloc * 2;
2140
new_elems = re_realloc (set->elems, int, set->alloc);
2141
if (BE (new_elems == NULL, 0))
2142
return -1;
2143
set->elems = new_elems;
2144
}
2145
2146
/* Move the elements which follows the new element. Test the
2147
first element separately to skip a check in the inner loop. */
2148
if (elem < set->elems[0])
2149
{
2150
idx = 0;
2151
for (idx = set->nelem; idx > 0; idx--)
2152
set->elems[idx] = set->elems[idx - 1];
2153
}
2154
else
2155
{
2156
for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
2157
set->elems[idx] = set->elems[idx - 1];
2158
}
2159
2160
/* Insert the new element. */
2161
set->elems[idx] = elem;
2162
++set->nelem;
2163
return 1;
2164
}
2165
2166
/* Insert the new element ELEM to the re_node_set* SET.
2167
SET should not already have any element greater than or equal to ELEM.
2168
Return -1 if an error is occured, return 1 otherwise. */
2169
2170
static int
2171
internal_function
2172
re_node_set_insert_last (re_node_set *set, int elem)
2173
{
2174
/* Realloc if we need. */
2175
if (set->alloc == set->nelem)
2176
{
2177
int *new_elems;
2178
set->alloc = (set->alloc + 1) * 2;
2179
new_elems = re_realloc (set->elems, int, set->alloc);
2180
if (BE (new_elems == NULL, 0))
2181
return -1;
2182
set->elems = new_elems;
2183
}
2184
2185
/* Insert the new element. */
2186
set->elems[set->nelem++] = elem;
2187
return 1;
2188
}
2189
2190
/* Compare two node sets SET1 and SET2.
2191
return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
2192
2193
static int
2194
internal_function __attribute ((pure))
2195
re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
2196
{
2197
int i;
2198
if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
2199
return 0;
2200
for (i = set1->nelem ; --i >= 0 ; )
2201
if (set1->elems[i] != set2->elems[i])
2202
return 0;
2203
return 1;
2204
}
2205
2206
/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
2207
2208
static int
2209
internal_function __attribute ((pure))
2210
re_node_set_contains (const re_node_set *set, int elem)
2211
{
2212
unsigned int idx, right, mid;
2213
if (set->nelem <= 0)
2214
return 0;
2215
2216
/* Binary search the element. */
2217
idx = 0;
2218
right = set->nelem - 1;
2219
while (idx < right)
2220
{
2221
mid = (idx + right) / 2;
2222
if (set->elems[mid] < elem)
2223
idx = mid + 1;
2224
else
2225
right = mid;
2226
}
2227
return set->elems[idx] == elem ? idx + 1 : 0;
2228
}
2229
2230
static void
2231
internal_function
2232
re_node_set_remove_at (re_node_set *set, int idx)
2233
{
2234
if (idx < 0 || idx >= set->nelem)
2235
return;
2236
--set->nelem;
2237
for (; idx < set->nelem; idx++)
2238
set->elems[idx] = set->elems[idx + 1];
2239
}
2240
2241
2242
/* Add the token TOKEN to dfa->nodes, and return the index of the token.
2243
Or return -1, if an error will be occured. */
2244
2245
static int
2246
internal_function
2247
re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
2248
{
2249
int type = token.type;
2250
if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
2251
{
2252
size_t new_nodes_alloc = dfa->nodes_alloc * 2;
2253
int *new_nexts, *new_indices;
2254
re_node_set *new_edests, *new_eclosures;
2255
re_token_t *new_nodes;
2256
2257
/* Avoid overflows. */
2258
if (BE (new_nodes_alloc < dfa->nodes_alloc, 0))
2259
return -1;
2260
2261
new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
2262
if (BE (new_nodes == NULL, 0))
2263
return -1;
2264
dfa->nodes = new_nodes;
2265
new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
2266
new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
2267
new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
2268
new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
2269
if (BE (new_nexts == NULL || new_indices == NULL
2270
|| new_edests == NULL || new_eclosures == NULL, 0))
2271
return -1;
2272
dfa->nexts = new_nexts;
2273
dfa->org_indices = new_indices;
2274
dfa->edests = new_edests;
2275
dfa->eclosures = new_eclosures;
2276
dfa->nodes_alloc = new_nodes_alloc;
2277
}
2278
dfa->nodes[dfa->nodes_len] = token;
2279
dfa->nodes[dfa->nodes_len].constraint = 0;
2280
#ifdef RE_ENABLE_I18N
2281
dfa->nodes[dfa->nodes_len].accept_mb =
2282
(type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
2283
#endif
2284
dfa->nexts[dfa->nodes_len] = -1;
2285
re_node_set_init_empty (dfa->edests + dfa->nodes_len);
2286
re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
2287
return dfa->nodes_len++;
2288
}
2289
2290
static inline unsigned int
2291
internal_function
2292
calc_state_hash (const re_node_set *nodes, unsigned int context)
2293
{
2294
unsigned int hash = nodes->nelem + context;
2295
int i;
2296
for (i = 0 ; i < nodes->nelem ; i++)
2297
hash += nodes->elems[i];
2298
return hash;
2299
}
2300
2301
/* Search for the state whose node_set is equivalent to NODES.
2302
Return the pointer to the state, if we found it in the DFA.
2303
Otherwise create the new one and return it. In case of an error
2304
return NULL and set the error code in ERR.
2305
Note: - We assume NULL as the invalid state, then it is possible that
2306
return value is NULL and ERR is REG_NOERROR.
2307
- We never return non-NULL value in case of any errors, it is for
2308
optimization. */
2309
2310
static re_dfastate_t *
2311
internal_function
2312
re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
2313
const re_node_set *nodes)
2314
{
2315
unsigned int hash;
2316
re_dfastate_t *new_state;
2317
struct re_state_table_entry *spot;
2318
int i;
2319
if (BE (nodes->nelem == 0, 0))
2320
{
2321
*err = REG_NOERROR;
2322
return NULL;
2323
}
2324
hash = calc_state_hash (nodes, 0);
2325
spot = dfa->state_table + (hash & dfa->state_hash_mask);
2326
2327
for (i = 0 ; i < spot->num ; i++)
2328
{
2329
re_dfastate_t *state = spot->array[i];
2330
if (hash != state->hash)
2331
continue;
2332
if (re_node_set_compare (&state->nodes, nodes))
2333
return state;
2334
}
2335
2336
/* There are no appropriate state in the dfa, create the new one. */
2337
new_state = create_ci_newstate (dfa, nodes, hash);
2338
if (BE (new_state == NULL, 0))
2339
*err = REG_ESPACE;
2340
2341
return new_state;
2342
}
2343
2344
/* Search for the state whose node_set is equivalent to NODES and
2345
whose context is equivalent to CONTEXT.
2346
Return the pointer to the state, if we found it in the DFA.
2347
Otherwise create the new one and return it. In case of an error
2348
return NULL and set the error code in ERR.
2349
Note: - We assume NULL as the invalid state, then it is possible that
2350
return value is NULL and ERR is REG_NOERROR.
2351
- We never return non-NULL value in case of any errors, it is for
2352
optimization. */
2353
2354
static re_dfastate_t *
2355
internal_function
2356
re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
2357
const re_node_set *nodes, unsigned int context)
2358
{
2359
unsigned int hash;
2360
re_dfastate_t *new_state;
2361
struct re_state_table_entry *spot;
2362
int i;
2363
if (nodes->nelem == 0)
2364
{
2365
*err = REG_NOERROR;
2366
return NULL;
2367
}
2368
hash = calc_state_hash (nodes, context);
2369
spot = dfa->state_table + (hash & dfa->state_hash_mask);
2370
2371
for (i = 0 ; i < spot->num ; i++)
2372
{
2373
re_dfastate_t *state = spot->array[i];
2374
if (state->hash == hash
2375
&& state->context == context
2376
&& re_node_set_compare (state->entrance_nodes, nodes))
2377
return state;
2378
}
2379
/* There are no appropriate state in `dfa', create the new one. */
2380
new_state = create_cd_newstate (dfa, nodes, context, hash);
2381
if (BE (new_state == NULL, 0))
2382
*err = REG_ESPACE;
2383
2384
return new_state;
2385
}
2386
2387
/* Finish initialization of the new state NEWSTATE, and using its hash value
2388
HASH put in the appropriate bucket of DFA's state table. Return value
2389
indicates the error code if failed. */
2390
2391
static reg_errcode_t
2392
register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
2393
unsigned int hash)
2394
{
2395
struct re_state_table_entry *spot;
2396
reg_errcode_t err;
2397
int i;
2398
2399
newstate->hash = hash;
2400
err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
2401
if (BE (err != REG_NOERROR, 0))
2402
return REG_ESPACE;
2403
for (i = 0; i < newstate->nodes.nelem; i++)
2404
{
2405
int elem = newstate->nodes.elems[i];
2406
if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
2407
re_node_set_insert_last (&newstate->non_eps_nodes, elem);
2408
}
2409
2410
spot = dfa->state_table + (hash & dfa->state_hash_mask);
2411
if (BE (spot->alloc <= spot->num, 0))
2412
{
2413
int new_alloc = 2 * spot->num + 2;
2414
re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
2415
new_alloc);
2416
if (BE (new_array == NULL, 0))
2417
return REG_ESPACE;
2418
spot->array = new_array;
2419
spot->alloc = new_alloc;
2420
}
2421
spot->array[spot->num++] = newstate;
2422
return REG_NOERROR;
2423
}
2424
2425
static void
2426
free_state (re_dfastate_t *state)
2427
{
2428
re_node_set_free (&state->non_eps_nodes);
2429
re_node_set_free (&state->inveclosure);
2430
if (state->entrance_nodes != &state->nodes)
2431
{
2432
re_node_set_free (state->entrance_nodes);
2433
re_free (state->entrance_nodes);
2434
}
2435
re_node_set_free (&state->nodes);
2436
re_free (state->word_trtable);
2437
re_free (state->trtable);
2438
re_free (state);
2439
}
2440
2441
/* Create the new state which is independ of contexts.
2442
Return the new state if succeeded, otherwise return NULL. */
2443
2444
static re_dfastate_t *
2445
internal_function
2446
create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
2447
unsigned int hash)
2448
{
2449
int i;
2450
reg_errcode_t err;
2451
re_dfastate_t *newstate;
2452
2453
newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
2454
if (BE (newstate == NULL, 0))
2455
return NULL;
2456
err = re_node_set_init_copy (&newstate->nodes, nodes);
2457
if (BE (err != REG_NOERROR, 0))
2458
{
2459
re_free (newstate);
2460
return NULL;
2461
}
2462
2463
newstate->entrance_nodes = &newstate->nodes;
2464
for (i = 0 ; i < nodes->nelem ; i++)
2465
{
2466
re_token_t *node = dfa->nodes + nodes->elems[i];
2467
re_token_type_t type = node->type;
2468
if (type == CHARACTER && !node->constraint)
2469
continue;
2470
#ifdef RE_ENABLE_I18N
2471
newstate->accept_mb |= node->accept_mb;
2472
#endif /* RE_ENABLE_I18N */
2473
2474
/* If the state has the halt node, the state is a halt state. */
2475
if (type == END_OF_RE)
2476
newstate->halt = 1;
2477
else if (type == OP_BACK_REF)
2478
newstate->has_backref = 1;
2479
else if (type == ANCHOR || node->constraint)
2480
newstate->has_constraint = 1;
2481
}
2482
err = register_state (dfa, newstate, hash);
2483
if (BE (err != REG_NOERROR, 0))
2484
{
2485
free_state (newstate);
2486
newstate = NULL;
2487
}
2488
return newstate;
2489
}
2490
2491
/* Create the new state which is depend on the context CONTEXT.
2492
Return the new state if succeeded, otherwise return NULL. */
2493
2494
static re_dfastate_t *
2495
internal_function
2496
create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
2497
unsigned int context, unsigned int hash)
2498
{
2499
int i, nctx_nodes = 0;
2500
reg_errcode_t err;
2501
re_dfastate_t *newstate;
2502
2503
newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
2504
if (BE (newstate == NULL, 0))
2505
return NULL;
2506
err = re_node_set_init_copy (&newstate->nodes, nodes);
2507
if (BE (err != REG_NOERROR, 0))
2508
{
2509
re_free (newstate);
2510
return NULL;
2511
}
2512
2513
newstate->context = context;
2514
newstate->entrance_nodes = &newstate->nodes;
2515
2516
for (i = 0 ; i < nodes->nelem ; i++)
2517
{
2518
unsigned int constraint = 0;
2519
re_token_t *node = dfa->nodes + nodes->elems[i];
2520
re_token_type_t type = node->type;
2521
if (node->constraint)
2522
constraint = node->constraint;
2523
2524
if (type == CHARACTER && !constraint)
2525
continue;
2526
#ifdef RE_ENABLE_I18N
2527
newstate->accept_mb |= node->accept_mb;
2528
#endif /* RE_ENABLE_I18N */
2529
2530
/* If the state has the halt node, the state is a halt state. */
2531
if (type == END_OF_RE)
2532
newstate->halt = 1;
2533
else if (type == OP_BACK_REF)
2534
newstate->has_backref = 1;
2535
else if (type == ANCHOR)
2536
constraint = node->opr.ctx_type;
2537
2538
if (constraint)
2539
{
2540
if (newstate->entrance_nodes == &newstate->nodes)
2541
{
2542
newstate->entrance_nodes = re_malloc (re_node_set, 1);
2543
if (BE (newstate->entrance_nodes == NULL, 0))
2544
{
2545
free_state (newstate);
2546
return NULL;
2547
}
2548
re_node_set_init_copy (newstate->entrance_nodes, nodes);
2549
nctx_nodes = 0;
2550
newstate->has_constraint = 1;
2551
}
2552
2553
if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
2554
{
2555
re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
2556
++nctx_nodes;
2557
}
2558
}
2559
}
2560
err = register_state (dfa, newstate, hash);
2561
if (BE (err != REG_NOERROR, 0))
2562
{
2563
free_state (newstate);
2564
newstate = NULL;
2565
}
2566
return newstate;
2567
}
2568
2569
/******************************************************************************/
2570
/******************************************************************************/
2571
/******************************************************************************/
2572
/* GKINCLUDE #include "regcomp.c" */
2573
/******************************************************************************/
2574
/******************************************************************************/
2575
/******************************************************************************/
2576
/* Extended regular expression matching and search library.
2577
Copyright (C) 2002,2003,2004,2005,2006 Free Software Foundation, Inc.
2578
This file is part of the GNU C Library.
2579
Contributed by Isamu Hasegawa <[email protected]>.
2580
2581
The GNU C Library is free software; you can redistribute it and/or
2582
modify it under the terms of the GNU Lesser General Public
2583
License as published by the Free Software Foundation; either
2584
version 2.1 of the License, or (at your option) any later version.
2585
2586
The GNU C Library is distributed in the hope that it will be useful,
2587
but WITHOUT ANY WARRANTY; without even the implied warranty of
2588
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
2589
Lesser General Public License for more details.
2590
2591
You should have received a copy of the GNU Lesser General Public
2592
License along with the GNU C Library; if not, write to the Free
2593
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
2594
02111-1307 USA. */
2595
2596
static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
2597
size_t length, reg_syntax_t syntax);
2598
static void re_compile_fastmap_iter (regex_t *bufp,
2599
const re_dfastate_t *init_state,
2600
char *fastmap);
2601
static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
2602
#ifdef RE_ENABLE_I18N
2603
static void free_charset (re_charset_t *cset);
2604
#endif /* RE_ENABLE_I18N */
2605
static void free_workarea_compile (regex_t *preg);
2606
static reg_errcode_t create_initial_state (re_dfa_t *dfa);
2607
#ifdef RE_ENABLE_I18N
2608
static void optimize_utf8 (re_dfa_t *dfa);
2609
#endif
2610
static reg_errcode_t analyze (regex_t *preg);
2611
static reg_errcode_t preorder (bin_tree_t *root,
2612
reg_errcode_t (fn (void *, bin_tree_t *)),
2613
void *extra);
2614
static reg_errcode_t postorder (bin_tree_t *root,
2615
reg_errcode_t (fn (void *, bin_tree_t *)),
2616
void *extra);
2617
static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
2618
static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
2619
static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
2620
bin_tree_t *node);
2621
static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
2622
static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
2623
static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
2624
static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
2625
static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
2626
unsigned int constraint);
2627
static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
2628
static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
2629
int node, int root);
2630
static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
2631
static int fetch_number (re_string_t *input, re_token_t *token,
2632
reg_syntax_t syntax);
2633
static int peek_token (re_token_t *token, re_string_t *input,
2634
reg_syntax_t syntax) internal_function;
2635
static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
2636
reg_syntax_t syntax, reg_errcode_t *err);
2637
static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
2638
re_token_t *token, reg_syntax_t syntax,
2639
int nest, reg_errcode_t *err);
2640
static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
2641
re_token_t *token, reg_syntax_t syntax,
2642
int nest, reg_errcode_t *err);
2643
static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
2644
re_token_t *token, reg_syntax_t syntax,
2645
int nest, reg_errcode_t *err);
2646
static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
2647
re_token_t *token, reg_syntax_t syntax,
2648
int nest, reg_errcode_t *err);
2649
static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
2650
re_dfa_t *dfa, re_token_t *token,
2651
reg_syntax_t syntax, reg_errcode_t *err);
2652
static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
2653
re_token_t *token, reg_syntax_t syntax,
2654
reg_errcode_t *err);
2655
static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
2656
re_string_t *regexp,
2657
re_token_t *token, int token_len,
2658
re_dfa_t *dfa,
2659
reg_syntax_t syntax,
2660
int accept_hyphen);
2661
static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
2662
re_string_t *regexp,
2663
re_token_t *token);
2664
#ifdef RE_ENABLE_I18N
2665
static reg_errcode_t build_equiv_class (bitset_t sbcset,
2666
re_charset_t *mbcset,
2667
int *equiv_class_alloc,
2668
const unsigned char *name);
2669
static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
2670
bitset_t sbcset,
2671
re_charset_t *mbcset,
2672
int *char_class_alloc,
2673
const unsigned char *class_name,
2674
reg_syntax_t syntax);
2675
#else /* not RE_ENABLE_I18N */
2676
static reg_errcode_t build_equiv_class (bitset_t sbcset,
2677
const unsigned char *name);
2678
static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
2679
bitset_t sbcset,
2680
const unsigned char *class_name,
2681
reg_syntax_t syntax);
2682
#endif /* not RE_ENABLE_I18N */
2683
static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
2684
RE_TRANSLATE_TYPE trans,
2685
const unsigned char *class_name,
2686
const unsigned char *extra,
2687
int non_match, reg_errcode_t *err);
2688
static bin_tree_t *create_tree (re_dfa_t *dfa,
2689
bin_tree_t *left, bin_tree_t *right,
2690
re_token_type_t type);
2691
static bin_tree_t *create_token_tree (re_dfa_t *dfa,
2692
bin_tree_t *left, bin_tree_t *right,
2693
const re_token_t *token);
2694
static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
2695
static void free_token (re_token_t *node);
2696
static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
2697
static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
2698
2699
/* This table gives an error message for each of the error codes listed
2700
in regex.h. Obviously the order here has to be same as there.
2701
POSIX doesn't require that we do anything for REG_NOERROR,
2702
but why not be nice? */
2703
2704
const char __re_error_msgid[] attribute_hidden =
2705
{
2706
#define REG_NOERROR_IDX 0
2707
gettext_noop ("Success") /* REG_NOERROR */
2708
"\0"
2709
#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
2710
gettext_noop ("No match") /* REG_NOMATCH */
2711
"\0"
2712
#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
2713
gettext_noop ("Invalid regular expression") /* REG_BADPAT */
2714
"\0"
2715
#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
2716
gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
2717
"\0"
2718
#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
2719
gettext_noop ("Invalid character class name") /* REG_ECTYPE */
2720
"\0"
2721
#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
2722
gettext_noop ("Trailing backslash") /* REG_EESCAPE */
2723
"\0"
2724
#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
2725
gettext_noop ("Invalid back reference") /* REG_ESUBREG */
2726
"\0"
2727
#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
2728
gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
2729
"\0"
2730
#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
2731
gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
2732
"\0"
2733
#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
2734
gettext_noop ("Unmatched \\{") /* REG_EBRACE */
2735
"\0"
2736
#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
2737
gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
2738
"\0"
2739
#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
2740
gettext_noop ("Invalid range end") /* REG_ERANGE */
2741
"\0"
2742
#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
2743
gettext_noop ("Memory exhausted") /* REG_ESPACE */
2744
"\0"
2745
#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
2746
gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
2747
"\0"
2748
#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
2749
gettext_noop ("Premature end of regular expression") /* REG_EEND */
2750
"\0"
2751
#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
2752
gettext_noop ("Regular expression too big") /* REG_ESIZE */
2753
"\0"
2754
#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
2755
gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
2756
};
2757
2758
/* Start offset of each message inside __re_error_msgid, in error-code
   order; re_compile_pattern indexes this table with the error code.  */
const size_t __re_error_msgid_idx[] attribute_hidden =
  {
    REG_NOERROR_IDX,  REG_NOMATCH_IDX,  REG_BADPAT_IDX,
    REG_ECOLLATE_IDX, REG_ECTYPE_IDX,   REG_EESCAPE_IDX,
    REG_ESUBREG_IDX,  REG_EBRACK_IDX,   REG_EPAREN_IDX,
    REG_EBRACE_IDX,   REG_BADBR_IDX,    REG_ERANGE_IDX,
    REG_ESPACE_IDX,   REG_BADRPT_IDX,   REG_EEND_IDX,
    REG_ESIZE_IDX,    REG_ERPAREN_IDX
  };
2778
2779
/* Entry points for GNU code. */
2780
2781
/* re_compile_pattern is the GNU regular expression compiler: it
2782
compiles PATTERN (of length LENGTH) and puts the result in BUFP.
2783
Returns 0 if the pattern was valid, otherwise an error string.
2784
2785
Assumes the `allocated' (and perhaps `buffer') and `translate' fields
2786
are set in BUFP on entry. */
2787
2788
const char *
2789
re_compile_pattern (pattern, length, bufp)
2790
const char *pattern;
2791
size_t length;
2792
struct re_pattern_buffer *bufp;
2793
{
2794
reg_errcode_t ret;
2795
2796
/* And GNU code determines whether or not to get register information
2797
by passing null for the REGS argument to re_match, etc., not by
2798
setting no_sub, unless RE_NO_SUB is set. */
2799
bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
2800
2801
/* Match anchors at newline. */
2802
bufp->newline_anchor = 1;
2803
2804
ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
2805
2806
if (!ret)
2807
return NULL;
2808
return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
2809
}
2810
#ifdef _LIBC
2811
weak_alias (__re_compile_pattern, re_compile_pattern)
2812
#endif
2813
2814
/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
2815
also be assigned to arbitrarily: each pattern buffer stores its own
2816
syntax, so it can be changed between regex compilations. */
2817
/* This has no initializer because initialized variables in Emacs
2818
become read-only after dumping. */
2819
reg_syntax_t re_syntax_options; /* Read by re_compile_pattern; replaced by re_set_syntax.  */
2820
2821
2822
/* Specify the precise syntax of regexps for compilation. This provides
2823
for compatibility for various utilities which historically have
2824
different, incompatible syntaxes.
2825
2826
The argument SYNTAX is a bit mask comprised of the various bits
2827
defined in regex.h. We return the old syntax. */
2828
2829
reg_syntax_t
2830
re_set_syntax (syntax)
2831
reg_syntax_t syntax;
2832
{
2833
reg_syntax_t ret = re_syntax_options;
2834
2835
re_syntax_options = syntax;
2836
return ret;
2837
}
2838
#ifdef _LIBC
2839
weak_alias (__re_set_syntax, re_set_syntax)
2840
#endif
2841
2842
int
2843
re_compile_fastmap (bufp)
2844
struct re_pattern_buffer *bufp;
2845
{
2846
re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
2847
char *fastmap = bufp->fastmap;
2848
2849
memset (fastmap, '\0', sizeof (char) * SBC_MAX);
2850
re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
2851
if (dfa->init_state != dfa->init_state_word)
2852
re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
2853
if (dfa->init_state != dfa->init_state_nl)
2854
re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
2855
if (dfa->init_state != dfa->init_state_begbuf)
2856
re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
2857
bufp->fastmap_accurate = 1;
2858
return 0;
2859
}
2860
#ifdef _LIBC
2861
weak_alias (__re_compile_fastmap, re_compile_fastmap)
2862
#endif
2863
2864
/* Mark byte CH in FASTMAP; when ICASE is nonzero also mark
   tolower (CH).  */
static inline void
__attribute ((always_inline))
re_set_fastmap (char *fastmap, int icase, int ch)
{
  char *slot = fastmap + ch;

  *slot = 1;
  if (icase != 0)
    {
      const int folded = tolower (ch);
      fastmap[folded] = 1;
    }
}
2872
2873
/* Helper function for re_compile_fastmap.
2874
Compile fastmap for the initial_state INIT_STATE. */
2875
2876
static void
2877
re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
2878
char *fastmap)
2879
{
2880
re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
2881
int node_cnt;
2882
int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
2883
for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
2884
{
2885
int node = init_state->nodes.elems[node_cnt];
2886
re_token_type_t type = dfa->nodes[node].type;
2887
2888
if (type == CHARACTER)
2889
{
2890
re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
2891
#ifdef RE_ENABLE_I18N
2892
if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
2893
{
2894
unsigned char *buf = alloca (dfa->mb_cur_max), *p;
2895
wchar_t wc;
2896
mbstate_t state;
2897
2898
p = buf;
2899
*p++ = dfa->nodes[node].opr.c;
2900
while (++node < dfa->nodes_len
2901
&& dfa->nodes[node].type == CHARACTER
2902
&& dfa->nodes[node].mb_partial)
2903
*p++ = dfa->nodes[node].opr.c;
2904
memset (&state, '\0', sizeof (state));
2905
if (mbrtowc (&wc, (const char *) buf, p - buf,
2906
&state) == p - buf
2907
&& (__wcrtomb ((char *) buf, towlower (wc), &state)
2908
!= (size_t) -1))
2909
re_set_fastmap (fastmap, 0, buf[0]);
2910
}
2911
#endif
2912
}
2913
else if (type == SIMPLE_BRACKET)
2914
{
2915
int i, ch;
2916
for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
2917
{
2918
int j;
2919
bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
2920
for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
2921
if (w & ((bitset_word_t) 1 << j))
2922
re_set_fastmap (fastmap, icase, ch);
2923
}
2924
}
2925
#ifdef RE_ENABLE_I18N
2926
else if (type == COMPLEX_BRACKET)
2927
{
2928
int i;
2929
re_charset_t *cset = dfa->nodes[node].opr.mbcset;
2930
if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
2931
|| cset->nranges || cset->nchar_classes)
2932
{
2933
# ifdef _LIBC
2934
if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
2935
{
2936
/* In this case we want to catch the bytes which are
2937
the first byte of any collation elements.
2938
e.g. In da_DK, we want to catch 'a' since "aa"
2939
is a valid collation element, and don't catch
2940
'b' since 'b' is the only collation element
2941
which starts from 'b'. */
2942
const int32_t *table = (const int32_t *)
2943
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
2944
for (i = 0; i < SBC_MAX; ++i)
2945
if (table[i] < 0)
2946
re_set_fastmap (fastmap, icase, i);
2947
}
2948
# else
2949
if (dfa->mb_cur_max > 1)
2950
for (i = 0; i < SBC_MAX; ++i)
2951
if (__btowc (i) == WEOF)
2952
re_set_fastmap (fastmap, icase, i);
2953
# endif /* not _LIBC */
2954
}
2955
for (i = 0; i < cset->nmbchars; ++i)
2956
{
2957
char buf[256];
2958
mbstate_t state;
2959
memset (&state, '\0', sizeof (state));
2960
if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
2961
re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
2962
if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
2963
{
2964
if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
2965
!= (size_t) -1)
2966
re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
2967
}
2968
}
2969
}
2970
#endif /* RE_ENABLE_I18N */
2971
else if (type == OP_PERIOD
2972
#ifdef RE_ENABLE_I18N
2973
|| type == OP_UTF8_PERIOD
2974
#endif /* RE_ENABLE_I18N */
2975
|| type == END_OF_RE)
2976
{
2977
memset (fastmap, '\1', sizeof (char) * SBC_MAX);
2978
if (type == END_OF_RE)
2979
bufp->can_be_null = 1;
2980
return;
2981
}
2982
}
2983
}
2984
2985
/* Entry point for POSIX code. */
2986
/* regcomp takes a regular expression as a string and compiles it.
2987
2988
PREG is a regex_t *. We do not expect any fields to be initialized,
2989
since POSIX says we shouldn't. Thus, we set
2990
2991
`buffer' to the compiled pattern;
2992
`used' to the length of the compiled pattern;
2993
`syntax' to RE_SYNTAX_POSIX_EXTENDED if the
2994
REG_EXTENDED bit in CFLAGS is set; otherwise, to
2995
RE_SYNTAX_POSIX_BASIC;
2996
`newline_anchor' to REG_NEWLINE being set in CFLAGS;
2997
`fastmap' to an allocated space for the fastmap;
2998
`fastmap_accurate' to zero;
2999
`re_nsub' to the number of subexpressions in PATTERN.
3000
3001
PATTERN is the address of the pattern string.
3002
3003
CFLAGS is a series of bits which affect compilation.
3004
3005
If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
3006
use POSIX basic syntax.
3007
3008
If REG_NEWLINE is set, then . and [^...] don't match newline.
3009
Also, regexec will try a match beginning after every newline.
3010
3011
If REG_ICASE is set, then we considers upper- and lowercase
3012
versions of letters to be equivalent when matching.
3013
3014
If REG_NOSUB is set, then when PREG is passed to regexec, that
3015
routine will report only success or failure, and nothing about the
3016
registers.
3017
3018
It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
3019
the return codes and their meanings.) */
3020
3021
int
3022
regcomp (preg, pattern, cflags)
3023
regex_t *__restrict preg;
3024
const char *__restrict pattern;
3025
int cflags;
3026
{
3027
reg_errcode_t ret;
3028
reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
3029
: RE_SYNTAX_POSIX_BASIC);
3030
3031
preg->buffer = NULL;
3032
preg->allocated = 0;
3033
preg->used = 0;
3034
3035
/* Try to allocate space for the fastmap. */
3036
preg->fastmap = re_malloc (char, SBC_MAX);
3037
if (BE (preg->fastmap == NULL, 0))
3038
return REG_ESPACE;
3039
3040
syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
3041
3042
/* If REG_NEWLINE is set, newlines are treated differently. */
3043
if (cflags & REG_NEWLINE)
3044
{ /* REG_NEWLINE implies neither . nor [^...] match newline. */
3045
syntax &= ~RE_DOT_NEWLINE;
3046
syntax |= RE_HAT_LISTS_NOT_NEWLINE;
3047
/* It also changes the matching behavior. */
3048
preg->newline_anchor = 1;
3049
}
3050
else
3051
preg->newline_anchor = 0;
3052
preg->no_sub = !!(cflags & REG_NOSUB);
3053
preg->translate = NULL;
3054
3055
ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
3056
3057
/* POSIX doesn't distinguish between an unmatched open-group and an
3058
unmatched close-group: both are REG_EPAREN. */
3059
if (ret == REG_ERPAREN)
3060
ret = REG_EPAREN;
3061
3062
/* We have already checked preg->fastmap != NULL. */
3063
if (BE (ret == REG_NOERROR, 1))
3064
/* Compute the fastmap now, since regexec cannot modify the pattern
3065
buffer. This function never fails in this implementation. */
3066
(void) re_compile_fastmap (preg);
3067
else
3068
{
3069
/* Some error occurred while compiling the expression. */
3070
re_free (preg->fastmap);
3071
preg->fastmap = NULL;
3072
}
3073
3074
return (int) ret;
3075
}
3076
#ifdef _LIBC
3077
weak_alias (__regcomp, regcomp)
3078
#endif
3079
3080
/* Returns a message corresponding to an error code, ERRCODE, returned
3081
from either regcomp or regexec. We don't use PREG here. */
3082
3083
/* regerror ( int errcode, preg, errbuf, errbuf_size) */
3084
size_t
3085
regerror (
3086
int errcode,
3087
const regex_t *__restrict preg,
3088
char *__restrict errbuf,
3089
size_t errbuf_size)
3090
{
3091
const char *msg;
3092
size_t msg_size;
3093
3094
if (BE (errcode < 0
3095
|| errcode >= (int) (sizeof (__re_error_msgid_idx)
3096
/ sizeof (__re_error_msgid_idx[0])), 0))
3097
/* Only error codes returned by the rest of the code should be passed
3098
to this routine. If we are given anything else, or if other regex
3099
code generates an invalid error code, then the program has a bug.
3100
Dump core so we can fix it. */
3101
abort ();
3102
3103
msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
3104
3105
msg_size = strlen (msg) + 1; /* Includes the null. */
3106
3107
if (BE (errbuf_size != 0, 1))
3108
{
3109
if (BE (msg_size > errbuf_size, 0))
3110
{
3111
#if defined HAVE_MEMPCPY || defined _LIBC
3112
*((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
3113
#else
3114
memcpy (errbuf, msg, errbuf_size - 1);
3115
errbuf[errbuf_size - 1] = 0;
3116
#endif
3117
}
3118
else
3119
memcpy (errbuf, msg, msg_size);
3120
}
3121
3122
return msg_size;
3123
}
3124
#ifdef _LIBC
3125
weak_alias (__regerror, regerror)
3126
#endif
3127
3128
3129
#ifdef RE_ENABLE_I18N
3130
/* This static array is used for the map to single-byte characters when
3131
UTF-8 is used. Otherwise we would allocate memory just to initialize
3132
it the same all the time. UTF-8 is the preferred encoding so this is
3133
a worthwhile optimization. */
3134
static const bitset_t utf8_sb_map =
3135
{
3136
/* Set the first 128 bits. */
3137
[0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
3138
};
3139
#endif
3140
3141
3142
static void
3143
free_dfa_content (re_dfa_t *dfa)
3144
{
3145
int i, j;
3146
3147
if (dfa->nodes)
3148
for (i = 0; i < dfa->nodes_len; ++i)
3149
free_token (dfa->nodes + i);
3150
re_free (dfa->nexts);
3151
for (i = 0; i < dfa->nodes_len; ++i)
3152
{
3153
if (dfa->eclosures != NULL)
3154
re_node_set_free (dfa->eclosures + i);
3155
if (dfa->inveclosures != NULL)
3156
re_node_set_free (dfa->inveclosures + i);
3157
if (dfa->edests != NULL)
3158
re_node_set_free (dfa->edests + i);
3159
}
3160
re_free (dfa->edests);
3161
re_free (dfa->eclosures);
3162
re_free (dfa->inveclosures);
3163
re_free (dfa->nodes);
3164
3165
if (dfa->state_table)
3166
for (i = 0; i <= dfa->state_hash_mask; ++i)
3167
{
3168
struct re_state_table_entry *entry = dfa->state_table + i;
3169
for (j = 0; j < entry->num; ++j)
3170
{
3171
re_dfastate_t *state = entry->array[j];
3172
free_state (state);
3173
}
3174
re_free (entry->array);
3175
}
3176
re_free (dfa->state_table);
3177
#ifdef RE_ENABLE_I18N
3178
if (dfa->sb_char != utf8_sb_map)
3179
re_free (dfa->sb_char);
3180
#endif
3181
re_free (dfa->subexp_map);
3182
#ifdef DEBUG
3183
re_free (dfa->re_str);
3184
#endif
3185
3186
re_free (dfa);
3187
}
3188
3189
3190
/* Free dynamically allocated space used by PREG. */
3191
3192
void
3193
regfree (preg)
3194
regex_t *preg;
3195
{
3196
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3197
if (BE (dfa != NULL, 1))
3198
free_dfa_content (dfa);
3199
preg->buffer = NULL;
3200
preg->allocated = 0;
3201
3202
re_free (preg->fastmap);
3203
preg->fastmap = NULL;
3204
3205
re_free (preg->translate);
3206
preg->translate = NULL;
3207
}
3208
#ifdef _LIBC
3209
weak_alias (__regfree, regfree)
3210
#endif
3211
3212
/* Entry points compatible with 4.2 BSD regex library. We don't define
3213
them unless specifically requested. */
3214
3215
#if defined _REGEX_RE_COMP || defined _LIBC

/* BSD has one and only one pattern buffer.  */
static struct re_pattern_buffer re_comp_buf;

/* Compile S into the single BSD pattern buffer.

   If S is NULL, nothing is compiled: the result is 0 when a pattern has
   been compiled before, and the message "No previous regular expression"
   otherwise.

   For a non-NULL S, any previously compiled pattern is released (its
   fastmap allocation is kept for reuse), a fastmap of SBC_MAX bytes is
   allocated if there is none yet, and S is compiled using
   re_syntax_options with anchors matching at newlines.

   Returns NULL on success, or the message string for the error.  */
char *
# ifdef _LIBC
/* Make these definitions weak in libc, so POSIX programs can redefine
   these names if they don't use our functions, and still use
   regcomp/regexec above without link errors.  */
weak_function
# endif
re_comp (const char *s)
{
  reg_errcode_t ret;
  char *fastmap;

  if (!s)
    {
      if (!re_comp_buf.buffer)
        return gettext ("No previous regular expression");
      return 0;
    }

  if (re_comp_buf.buffer)
    {
      /* Detach the fastmap so that __regfree does not release it, wipe
         the buffer, and then put the fastmap back.  */
      fastmap = re_comp_buf.fastmap;
      re_comp_buf.fastmap = NULL;
      __regfree (&re_comp_buf);
      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
      re_comp_buf.fastmap = fastmap;
    }

  if (re_comp_buf.fastmap == NULL)
    {
      re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
      if (re_comp_buf.fastmap == NULL)
        return (char *) gettext (__re_error_msgid
                                 + __re_error_msgid_idx[(int) REG_ESPACE]);
    }

  /* Since `re_exec' always passes NULL for the `regs' argument, we
     don't need to initialize the pattern buffer fields which affect it.  */

  /* Match anchors at newlines.  */
  re_comp_buf.newline_anchor = 1;

  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);

  if (!ret)
    return NULL;

  /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
  return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
}

#ifdef _LIBC
/* Release the BSD pattern buffer; registered through libc_freeres_fn.  */
libc_freeres_fn (free_mem)
{
  __regfree (&re_comp_buf);
}
#endif

#endif /* _REGEX_RE_COMP */
3280
3281
/* Internal entry point.
3282
Compile the regular expression PATTERN, whose length is LENGTH.
3283
SYNTAX indicate regular expression's syntax. */
3284
3285
static reg_errcode_t
3286
re_compile_internal (regex_t *preg, const char * pattern, size_t length,
3287
reg_syntax_t syntax)
3288
{
3289
reg_errcode_t err = REG_NOERROR;
3290
re_dfa_t *dfa;
3291
re_string_t regexp;
3292
3293
/* Initialize the pattern buffer. */
3294
preg->fastmap_accurate = 0;
3295
preg->syntax = syntax;
3296
preg->not_bol = preg->not_eol = 0;
3297
preg->used = 0;
3298
preg->re_nsub = 0;
3299
preg->can_be_null = 0;
3300
preg->regs_allocated = REGS_UNALLOCATED;
3301
3302
/* Initialize the dfa. */
3303
dfa = (re_dfa_t *) preg->buffer;
3304
if (BE (preg->allocated < sizeof (re_dfa_t), 0))
3305
{
3306
/* If zero allocated, but buffer is non-null, try to realloc
3307
enough space. This loses if buffer's address is bogus, but
3308
that is the user's responsibility. If ->buffer is NULL this
3309
is a simple allocation. */
3310
dfa = re_realloc (preg->buffer, re_dfa_t, 1);
3311
if (dfa == NULL)
3312
return REG_ESPACE;
3313
preg->allocated = sizeof (re_dfa_t);
3314
preg->buffer = (unsigned char *) dfa;
3315
}
3316
preg->used = sizeof (re_dfa_t);
3317
3318
err = init_dfa (dfa, length);
3319
if (BE (err != REG_NOERROR, 0))
3320
{
3321
free_dfa_content (dfa);
3322
preg->buffer = NULL;
3323
preg->allocated = 0;
3324
return err;
3325
}
3326
#ifdef DEBUG
3327
/* Note: length+1 will not overflow since it is checked in init_dfa. */
3328
dfa->re_str = re_malloc (char, length + 1);
3329
strncpy (dfa->re_str, pattern, length + 1);
3330
#endif
3331
3332
__libc_lock_init (dfa->lock);
3333
3334
err = re_string_construct (&regexp, pattern, length, preg->translate,
3335
syntax & RE_ICASE, dfa);
3336
if (BE (err != REG_NOERROR, 0))
3337
{
3338
re_compile_internal_free_return:
3339
free_workarea_compile (preg);
3340
re_string_destruct (&regexp);
3341
free_dfa_content (dfa);
3342
preg->buffer = NULL;
3343
preg->allocated = 0;
3344
return err;
3345
}
3346
3347
/* Parse the regular expression, and build a structure tree. */
3348
preg->re_nsub = 0;
3349
dfa->str_tree = parse (&regexp, preg, syntax, &err);
3350
if (BE (dfa->str_tree == NULL, 0))
3351
goto re_compile_internal_free_return;
3352
3353
/* Analyze the tree and create the nfa. */
3354
err = analyze (preg);
3355
if (BE (err != REG_NOERROR, 0))
3356
goto re_compile_internal_free_return;
3357
3358
#ifdef RE_ENABLE_I18N
3359
/* If possible, do searching in single byte encoding to speed things up. */
3360
if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
3361
optimize_utf8 (dfa);
3362
#endif
3363
3364
/* Then create the initial state of the dfa. */
3365
err = create_initial_state (dfa);
3366
3367
/* Release work areas. */
3368
free_workarea_compile (preg);
3369
re_string_destruct (&regexp);
3370
3371
if (BE (err != REG_NOERROR, 0))
3372
{
3373
free_dfa_content (dfa);
3374
preg->buffer = NULL;
3375
preg->allocated = 0;
3376
}
3377
3378
return err;
3379
}
3380
3381
/* Initialize DFA. We use the length of the regular expression PAT_LEN
3382
as the initial length of some arrays. */
3383
3384
static reg_errcode_t
3385
init_dfa (re_dfa_t *dfa, size_t pat_len)
3386
{
3387
unsigned int table_size;
3388
#ifndef _LIBC
3389
char *codeset_name;
3390
#endif
3391
3392
memset (dfa, '\0', sizeof (re_dfa_t));
3393
3394
/* Force allocation of str_tree_storage the first time. */
3395
dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
3396
3397
/* Avoid overflows. */
3398
if (pat_len == SIZE_MAX)
3399
return REG_ESPACE;
3400
3401
dfa->nodes_alloc = pat_len + 1;
3402
dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
3403
3404
/* table_size = 2 ^ ceil(log pat_len) */
3405
for (table_size = 1; ; table_size <<= 1)
3406
if (table_size > pat_len)
3407
break;
3408
3409
dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
3410
dfa->state_hash_mask = table_size - 1;
3411
3412
dfa->mb_cur_max = MB_CUR_MAX;
3413
#ifdef _LIBC
3414
if (dfa->mb_cur_max == 6
3415
&& strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
3416
dfa->is_utf8 = 1;
3417
dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
3418
!= 0);
3419
#else
3420
# ifdef HAVE_LANGINFO_CODESET
3421
codeset_name = nl_langinfo (CODESET);
3422
# else
3423
codeset_name = getenv ("LC_ALL");
3424
if (codeset_name == NULL || codeset_name[0] == '\0')
3425
codeset_name = getenv ("LC_CTYPE");
3426
if (codeset_name == NULL || codeset_name[0] == '\0')
3427
codeset_name = getenv ("LANG");
3428
if (codeset_name == NULL)
3429
codeset_name = "";
3430
else if (strchr (codeset_name, '.') != NULL)
3431
codeset_name = strchr (codeset_name, '.') + 1;
3432
# endif
3433
3434
if (strcasecmp (codeset_name, "UTF-8") == 0
3435
|| strcasecmp (codeset_name, "UTF8") == 0)
3436
dfa->is_utf8 = 1;
3437
3438
/* We check exhaustively in the loop below if this charset is a
3439
superset of ASCII. */
3440
dfa->map_notascii = 0;
3441
#endif
3442
3443
#ifdef RE_ENABLE_I18N
3444
if (dfa->mb_cur_max > 1)
3445
{
3446
if (dfa->is_utf8)
3447
dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
3448
else
3449
{
3450
int i, j, ch;
3451
3452
dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3453
if (BE (dfa->sb_char == NULL, 0))
3454
return REG_ESPACE;
3455
3456
/* Set the bits corresponding to single byte chars. */
3457
for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
3458
for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
3459
{
3460
wint_t wch = __btowc (ch);
3461
if (wch != WEOF)
3462
dfa->sb_char[i] |= (bitset_word_t) 1 << j;
3463
# ifndef _LIBC
3464
if (isascii (ch) && wch != ch)
3465
dfa->map_notascii = 1;
3466
# endif
3467
}
3468
}
3469
}
3470
#endif
3471
3472
if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
3473
return REG_ESPACE;
3474
return REG_NOERROR;
3475
}
3476
3477
/* Initialize WORD_CHAR table, which indicate which character is
3478
"word". In this case "word" means that it is the word construction
3479
character used by some operators like "\<", "\>", etc. */
3480
3481
static void
3482
internal_function
3483
init_word_char (re_dfa_t *dfa)
3484
{
3485
int i, j, ch;
3486
dfa->word_ops_used = 1;
3487
for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
3488
for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
3489
if (isalnum (ch) || ch == '_')
3490
dfa->word_char[i] |= (bitset_word_t) 1 << j;
3491
}
3492
3493
/* Free the work area which are only used while compiling. */
3494
3495
static void
3496
free_workarea_compile (regex_t *preg)
3497
{
3498
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3499
bin_tree_storage_t *storage, *next;
3500
for (storage = dfa->str_tree_storage; storage; storage = next)
3501
{
3502
next = storage->next;
3503
re_free (storage);
3504
}
3505
dfa->str_tree_storage = NULL;
3506
dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
3507
dfa->str_tree = NULL;
3508
re_free (dfa->org_indices);
3509
dfa->org_indices = NULL;
3510
}
3511
3512
/* Create initial states for all contexts. */
3513
3514
static reg_errcode_t
3515
create_initial_state (re_dfa_t *dfa)
3516
{
3517
int first, i;
3518
reg_errcode_t err;
3519
re_node_set init_nodes;
3520
3521
/* Initial states have the epsilon closure of the node which is
3522
the first node of the regular expression. */
3523
first = dfa->str_tree->first->node_idx;
3524
dfa->init_node = first;
3525
err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
3526
if (BE (err != REG_NOERROR, 0))
3527
return err;
3528
3529
/* The back-references which are in initial states can epsilon transit,
3530
since in this case all of the subexpressions can be null.
3531
Then we add epsilon closures of the nodes which are the next nodes of
3532
the back-references. */
3533
if (dfa->nbackref > 0)
3534
for (i = 0; i < init_nodes.nelem; ++i)
3535
{
3536
int node_idx = init_nodes.elems[i];
3537
re_token_type_t type = dfa->nodes[node_idx].type;
3538
3539
int clexp_idx;
3540
if (type != OP_BACK_REF)
3541
continue;
3542
for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
3543
{
3544
re_token_t *clexp_node;
3545
clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
3546
if (clexp_node->type == OP_CLOSE_SUBEXP
3547
&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
3548
break;
3549
}
3550
if (clexp_idx == init_nodes.nelem)
3551
continue;
3552
3553
if (type == OP_BACK_REF)
3554
{
3555
int dest_idx = dfa->edests[node_idx].elems[0];
3556
if (!re_node_set_contains (&init_nodes, dest_idx))
3557
{
3558
re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
3559
i = 0;
3560
}
3561
}
3562
}
3563
3564
/* It must be the first time to invoke acquire_state. */
3565
dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
3566
/* We don't check ERR here, since the initial state must not be NULL. */
3567
if (BE (dfa->init_state == NULL, 0))
3568
return err;
3569
if (dfa->init_state->has_constraint)
3570
{
3571
dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
3572
CONTEXT_WORD);
3573
dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
3574
CONTEXT_NEWLINE);
3575
dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
3576
&init_nodes,
3577
CONTEXT_NEWLINE
3578
| CONTEXT_BEGBUF);
3579
if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
3580
|| dfa->init_state_begbuf == NULL, 0))
3581
return err;
3582
}
3583
else
3584
dfa->init_state_word = dfa->init_state_nl
3585
= dfa->init_state_begbuf = dfa->init_state;
3586
3587
re_node_set_free (&init_nodes);
3588
return REG_NOERROR;
3589
}
3590
3591
#ifdef RE_ENABLE_I18N
3592
/* If it is possible to do searching in single byte encoding instead of UTF-8
3593
to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
3594
DFA nodes where needed. */
3595
3596
static void
3597
optimize_utf8 (re_dfa_t *dfa)
3598
{
3599
int node, i, mb_chars = 0, has_period = 0;
3600
3601
for (node = 0; node < dfa->nodes_len; ++node)
3602
switch (dfa->nodes[node].type)
3603
{
3604
case CHARACTER:
3605
if (dfa->nodes[node].opr.c >= 0x80)
3606
mb_chars = 1;
3607
break;
3608
case ANCHOR:
3609
switch (dfa->nodes[node].opr.idx)
3610
{
3611
case LINE_FIRST:
3612
case LINE_LAST:
3613
case BUF_FIRST:
3614
case BUF_LAST:
3615
break;
3616
default:
3617
/* Word anchors etc. cannot be handled. */
3618
return;
3619
}
3620
break;
3621
case OP_PERIOD:
3622
has_period = 1;
3623
break;
3624
case OP_BACK_REF:
3625
case OP_ALT:
3626
case END_OF_RE:
3627
case OP_DUP_ASTERISK:
3628
case OP_OPEN_SUBEXP:
3629
case OP_CLOSE_SUBEXP:
3630
break;
3631
case COMPLEX_BRACKET:
3632
return;
3633
case SIMPLE_BRACKET:
3634
/* Just double check. The non-ASCII range starts at 0x80. */
3635
assert (0x80 % BITSET_WORD_BITS == 0);
3636
for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
3637
if (dfa->nodes[node].opr.sbcset[i])
3638
return;
3639
break;
3640
default:
3641
abort ();
3642
}
3643
3644
if (mb_chars || has_period)
3645
for (node = 0; node < dfa->nodes_len; ++node)
3646
{
3647
if (dfa->nodes[node].type == CHARACTER
3648
&& dfa->nodes[node].opr.c >= 0x80)
3649
dfa->nodes[node].mb_partial = 0;
3650
else if (dfa->nodes[node].type == OP_PERIOD)
3651
dfa->nodes[node].type = OP_UTF8_PERIOD;
3652
}
3653
3654
/* The search can be in single byte locale. */
3655
dfa->mb_cur_max = 1;
3656
dfa->is_utf8 = 0;
3657
dfa->has_mb_node = dfa->nbackref > 0 || has_period;
3658
}
3659
#endif
3660
3661
/* Analyze the structure tree, and calculate "first", "next", "edest",
3662
"eclosure", and "inveclosure". */
3663
3664
static reg_errcode_t
3665
analyze (regex_t *preg)
3666
{
3667
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3668
reg_errcode_t ret;
3669
3670
/* Allocate arrays. */
3671
dfa->nexts = re_malloc (int, dfa->nodes_alloc);
3672
dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
3673
dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
3674
dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
3675
if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
3676
|| dfa->eclosures == NULL, 0))
3677
return REG_ESPACE;
3678
3679
dfa->subexp_map = re_malloc (int, preg->re_nsub);
3680
if (dfa->subexp_map != NULL)
3681
{
3682
int i;
3683
for (i = 0; i < preg->re_nsub; i++)
3684
dfa->subexp_map[i] = i;
3685
preorder (dfa->str_tree, optimize_subexps, dfa);
3686
for (i = 0; i < preg->re_nsub; i++)
3687
if (dfa->subexp_map[i] != i)
3688
break;
3689
if (i == preg->re_nsub)
3690
{
3691
free (dfa->subexp_map);
3692
dfa->subexp_map = NULL;
3693
}
3694
}
3695
3696
ret = postorder (dfa->str_tree, lower_subexps, preg);
3697
if (BE (ret != REG_NOERROR, 0))
3698
return ret;
3699
ret = postorder (dfa->str_tree, calc_first, dfa);
3700
if (BE (ret != REG_NOERROR, 0))
3701
return ret;
3702
preorder (dfa->str_tree, calc_next, dfa);
3703
ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
3704
if (BE (ret != REG_NOERROR, 0))
3705
return ret;
3706
ret = calc_eclosure (dfa);
3707
if (BE (ret != REG_NOERROR, 0))
3708
return ret;
3709
3710
/* We only need this during the prune_impossible_nodes pass in regexec.c;
3711
skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
3712
if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
3713
|| dfa->nbackref)
3714
{
3715
dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
3716
if (BE (dfa->inveclosures == NULL, 0))
3717
return REG_ESPACE;
3718
ret = calc_inveclosure (dfa);
3719
}
3720
3721
return ret;
3722
}
3723
3724
/* Our parse trees are very unbalanced, so we cannot use a stack to
3725
implement parse tree visits. Instead, we use parent pointers and
3726
some hairy code in these two functions. */
3727
static reg_errcode_t
3728
postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
3729
void *extra)
3730
{
3731
bin_tree_t *node, *prev;
3732
3733
for (node = root; ; )
3734
{
3735
/* Descend down the tree, preferably to the left (or to the right
3736
if that's the only child). */
3737
while (node->left || node->right)
3738
if (node->left)
3739
node = node->left;
3740
else
3741
node = node->right;
3742
3743
do
3744
{
3745
reg_errcode_t err = fn (extra, node);
3746
if (BE (err != REG_NOERROR, 0))
3747
return err;
3748
if (node->parent == NULL)
3749
return REG_NOERROR;
3750
prev = node;
3751
node = node->parent;
3752
}
3753
/* Go up while we have a node that is reached from the right. */
3754
while (node->right == prev || node->right == NULL);
3755
node = node->right;
3756
}
3757
}
3758
3759
static reg_errcode_t
3760
preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
3761
void *extra)
3762
{
3763
bin_tree_t *node;
3764
3765
for (node = root; ; )
3766
{
3767
reg_errcode_t err = fn (extra, node);
3768
if (BE (err != REG_NOERROR, 0))
3769
return err;
3770
3771
/* Go to the left node, or up and to the right. */
3772
if (node->left)
3773
node = node->left;
3774
else
3775
{
3776
bin_tree_t *prev = NULL;
3777
while (node->right == prev || node->right == NULL)
3778
{
3779
prev = node;
3780
node = node->parent;
3781
if (!node)
3782
return REG_NOERROR;
3783
}
3784
node = node->right;
3785
}
3786
}
3787
}
3788
3789
/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
3790
re_search_internal to map the inner one's opr.idx to this one's. Adjust
3791
backreferences as well. Requires a preorder visit. */
3792
static reg_errcode_t
3793
optimize_subexps (void *extra, bin_tree_t *node)
3794
{
3795
re_dfa_t *dfa = (re_dfa_t *) extra;
3796
3797
if (node->token.type == OP_BACK_REF && dfa->subexp_map)
3798
{
3799
int idx = node->token.opr.idx;
3800
node->token.opr.idx = dfa->subexp_map[idx];
3801
dfa->used_bkref_map |= 1 << node->token.opr.idx;
3802
}
3803
3804
else if (node->token.type == SUBEXP
3805
&& node->left && node->left->token.type == SUBEXP)
3806
{
3807
int other_idx = node->left->token.opr.idx;
3808
3809
node->left = node->left->left;
3810
if (node->left)
3811
node->left->parent = node;
3812
3813
dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
3814
if (other_idx < BITSET_WORD_BITS)
3815
dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
3816
}
3817
3818
return REG_NOERROR;
3819
}
3820
3821
/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
3822
of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
3823
static reg_errcode_t
3824
lower_subexps (void *extra, bin_tree_t *node)
3825
{
3826
regex_t *preg = (regex_t *) extra;
3827
reg_errcode_t err = REG_NOERROR;
3828
3829
if (node->left && node->left->token.type == SUBEXP)
3830
{
3831
node->left = lower_subexp (&err, preg, node->left);
3832
if (node->left)
3833
node->left->parent = node;
3834
}
3835
if (node->right && node->right->token.type == SUBEXP)
3836
{
3837
node->right = lower_subexp (&err, preg, node->right);
3838
if (node->right)
3839
node->right->parent = node;
3840
}
3841
3842
return err;
3843
}
3844
3845
static bin_tree_t *
3846
lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
3847
{
3848
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3849
bin_tree_t *body = node->left;
3850
bin_tree_t *op, *cls, *tree1, *tree;
3851
3852
if (preg->no_sub
3853
/* We do not optimize empty subexpressions, because otherwise we may
3854
have bad CONCAT nodes with NULL children. This is obviously not
3855
very common, so we do not lose much. An example that triggers
3856
this case is the sed "script" /\(\)/x. */
3857
&& node->left != NULL
3858
&& (node->token.opr.idx >= BITSET_WORD_BITS
3859
|| !(dfa->used_bkref_map
3860
& ((bitset_word_t) 1 << node->token.opr.idx))))
3861
return node->left;
3862
3863
/* Convert the SUBEXP node to the concatenation of an
3864
OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
3865
op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
3866
cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
3867
tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
3868
tree = create_tree (dfa, op, tree1, CONCAT);
3869
if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
3870
{
3871
*err = REG_ESPACE;
3872
return NULL;
3873
}
3874
3875
op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
3876
op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
3877
return tree;
3878
}
3879
3880
/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
3881
nodes. Requires a postorder visit. */
3882
static reg_errcode_t
3883
calc_first (void *extra, bin_tree_t *node)
3884
{
3885
re_dfa_t *dfa = (re_dfa_t *) extra;
3886
if (node->token.type == CONCAT)
3887
{
3888
node->first = node->left->first;
3889
node->node_idx = node->left->node_idx;
3890
}
3891
else
3892
{
3893
node->first = node;
3894
node->node_idx = re_dfa_add_node (dfa, node->token);
3895
if (BE (node->node_idx == -1, 0))
3896
return REG_ESPACE;
3897
}
3898
return REG_NOERROR;
3899
}
3900
3901
/* Pass 2: compute NEXT on the tree. Preorder visit. */
3902
static reg_errcode_t
3903
calc_next (void *extra, bin_tree_t *node)
3904
{
3905
switch (node->token.type)
3906
{
3907
case OP_DUP_ASTERISK:
3908
node->left->next = node;
3909
break;
3910
case CONCAT:
3911
node->left->next = node->right->first;
3912
node->right->next = node->next;
3913
break;
3914
default:
3915
if (node->left)
3916
node->left->next = node->next;
3917
if (node->right)
3918
node->right->next = node->next;
3919
break;
3920
}
3921
return REG_NOERROR;
3922
}
3923
3924
/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
3925
static reg_errcode_t
3926
link_nfa_nodes (void *extra, bin_tree_t *node)
3927
{
3928
re_dfa_t *dfa = (re_dfa_t *) extra;
3929
int idx = node->node_idx;
3930
reg_errcode_t err = REG_NOERROR;
3931
3932
switch (node->token.type)
3933
{
3934
case CONCAT:
3935
break;
3936
3937
case END_OF_RE:
3938
assert (node->next == NULL);
3939
break;
3940
3941
case OP_DUP_ASTERISK:
3942
case OP_ALT:
3943
{
3944
int left, right;
3945
dfa->has_plural_match = 1;
3946
if (node->left != NULL)
3947
left = node->left->first->node_idx;
3948
else
3949
left = node->next->node_idx;
3950
if (node->right != NULL)
3951
right = node->right->first->node_idx;
3952
else
3953
right = node->next->node_idx;
3954
assert (left > -1);
3955
assert (right > -1);
3956
err = re_node_set_init_2 (dfa->edests + idx, left, right);
3957
}
3958
break;
3959
3960
case ANCHOR:
3961
case OP_OPEN_SUBEXP:
3962
case OP_CLOSE_SUBEXP:
3963
err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
3964
break;
3965
3966
case OP_BACK_REF:
3967
dfa->nexts[idx] = node->next->node_idx;
3968
if (node->token.type == OP_BACK_REF)
3969
re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
3970
break;
3971
3972
default:
3973
assert (!IS_EPSILON_NODE (node->token.type));
3974
dfa->nexts[idx] = node->next->node_idx;
3975
break;
3976
}
3977
3978
return err;
3979
}
3980
3981
/* Duplicate the epsilon closure of the node ROOT_NODE.
3982
Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
3983
to their own constraint. */
3984
3985
static reg_errcode_t
3986
internal_function
3987
duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
3988
int root_node, unsigned int init_constraint)
3989
{
3990
int org_node, clone_node, ret;
3991
unsigned int constraint = init_constraint;
3992
for (org_node = top_org_node, clone_node = top_clone_node;;)
3993
{
3994
int org_dest, clone_dest;
3995
if (dfa->nodes[org_node].type == OP_BACK_REF)
3996
{
3997
/* If the back reference epsilon-transit, its destination must
3998
also have the constraint. Then duplicate the epsilon closure
3999
of the destination of the back reference, and store it in
4000
edests of the back reference. */
4001
org_dest = dfa->nexts[org_node];
4002
re_node_set_empty (dfa->edests + clone_node);
4003
clone_dest = duplicate_node (dfa, org_dest, constraint);
4004
if (BE (clone_dest == -1, 0))
4005
return REG_ESPACE;
4006
dfa->nexts[clone_node] = dfa->nexts[org_node];
4007
ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
4008
if (BE (ret < 0, 0))
4009
return REG_ESPACE;
4010
}
4011
else if (dfa->edests[org_node].nelem == 0)
4012
{
4013
/* In case of the node can't epsilon-transit, don't duplicate the
4014
destination and store the original destination as the
4015
destination of the node. */
4016
dfa->nexts[clone_node] = dfa->nexts[org_node];
4017
break;
4018
}
4019
else if (dfa->edests[org_node].nelem == 1)
4020
{
4021
/* In case of the node can epsilon-transit, and it has only one
4022
destination. */
4023
org_dest = dfa->edests[org_node].elems[0];
4024
re_node_set_empty (dfa->edests + clone_node);
4025
if (dfa->nodes[org_node].type == ANCHOR)
4026
{
4027
/* In case of the node has another constraint, append it. */
4028
if (org_node == root_node && clone_node != org_node)
4029
{
4030
/* ...but if the node is root_node itself, it means the
4031
epsilon closure have a loop, then tie it to the
4032
destination of the root_node. */
4033
ret = re_node_set_insert (dfa->edests + clone_node,
4034
org_dest);
4035
if (BE (ret < 0, 0))
4036
return REG_ESPACE;
4037
break;
4038
}
4039
constraint |= dfa->nodes[org_node].opr.ctx_type;
4040
}
4041
clone_dest = duplicate_node (dfa, org_dest, constraint);
4042
if (BE (clone_dest == -1, 0))
4043
return REG_ESPACE;
4044
ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
4045
if (BE (ret < 0, 0))
4046
return REG_ESPACE;
4047
}
4048
else /* dfa->edests[org_node].nelem == 2 */
4049
{
4050
/* In case of the node can epsilon-transit, and it has two
4051
destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
4052
org_dest = dfa->edests[org_node].elems[0];
4053
re_node_set_empty (dfa->edests + clone_node);
4054
/* Search for a duplicated node which satisfies the constraint. */
4055
clone_dest = search_duplicated_node (dfa, org_dest, constraint);
4056
if (clone_dest == -1)
4057
{
4058
/* There are no such a duplicated node, create a new one. */
4059
reg_errcode_t err;
4060
clone_dest = duplicate_node (dfa, org_dest, constraint);
4061
if (BE (clone_dest == -1, 0))
4062
return REG_ESPACE;
4063
ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
4064
if (BE (ret < 0, 0))
4065
return REG_ESPACE;
4066
err = duplicate_node_closure (dfa, org_dest, clone_dest,
4067
root_node, constraint);
4068
if (BE (err != REG_NOERROR, 0))
4069
return err;
4070
}
4071
else
4072
{
4073
/* There are a duplicated node which satisfy the constraint,
4074
use it to avoid infinite loop. */
4075
ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
4076
if (BE (ret < 0, 0))
4077
return REG_ESPACE;
4078
}
4079
4080
org_dest = dfa->edests[org_node].elems[1];
4081
clone_dest = duplicate_node (dfa, org_dest, constraint);
4082
if (BE (clone_dest == -1, 0))
4083
return REG_ESPACE;
4084
ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
4085
if (BE (ret < 0, 0))
4086
return REG_ESPACE;
4087
}
4088
org_node = org_dest;
4089
clone_node = clone_dest;
4090
}
4091
return REG_NOERROR;
4092
}
4093
4094
/* Search for a node which is duplicated from the node ORG_NODE, and
4095
satisfies the constraint CONSTRAINT. */
4096
4097
static int
4098
search_duplicated_node (const re_dfa_t *dfa, int org_node,
4099
unsigned int constraint)
4100
{
4101
int idx;
4102
for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
4103
{
4104
if (org_node == dfa->org_indices[idx]
4105
&& constraint == dfa->nodes[idx].constraint)
4106
return idx; /* Found. */
4107
}
4108
return -1; /* Not found. */
4109
}
4110
4111
/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
4112
Return the index of the new node, or -1 if insufficient storage is
4113
available. */
4114
4115
static int
4116
duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
4117
{
4118
int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
4119
if (BE (dup_idx != -1, 1))
4120
{
4121
dfa->nodes[dup_idx].constraint = constraint;
4122
if (dfa->nodes[org_idx].type == ANCHOR)
4123
dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
4124
dfa->nodes[dup_idx].duplicated = 1;
4125
4126
/* Store the index of the original node. */
4127
dfa->org_indices[dup_idx] = org_idx;
4128
}
4129
return dup_idx;
4130
}
4131
4132
static reg_errcode_t
4133
calc_inveclosure (re_dfa_t *dfa)
4134
{
4135
int src, idx, ret;
4136
for (idx = 0; idx < dfa->nodes_len; ++idx)
4137
re_node_set_init_empty (dfa->inveclosures + idx);
4138
4139
for (src = 0; src < dfa->nodes_len; ++src)
4140
{
4141
int *elems = dfa->eclosures[src].elems;
4142
for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
4143
{
4144
ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
4145
if (BE (ret == -1, 0))
4146
return REG_ESPACE;
4147
}
4148
}
4149
4150
return REG_NOERROR;
4151
}
4152
4153
/* Calculate "eclosure" for all the node in DFA. */
4154
4155
static reg_errcode_t
4156
calc_eclosure (re_dfa_t *dfa)
4157
{
4158
int node_idx, incomplete;
4159
#ifdef DEBUG
4160
assert (dfa->nodes_len > 0);
4161
#endif
4162
incomplete = 0;
4163
/* For each nodes, calculate epsilon closure. */
4164
for (node_idx = 0; ; ++node_idx)
4165
{
4166
reg_errcode_t err;
4167
re_node_set eclosure_elem;
4168
if (node_idx == dfa->nodes_len)
4169
{
4170
if (!incomplete)
4171
break;
4172
incomplete = 0;
4173
node_idx = 0;
4174
}
4175
4176
#ifdef DEBUG
4177
assert (dfa->eclosures[node_idx].nelem != -1);
4178
#endif
4179
4180
/* If we have already calculated, skip it. */
4181
if (dfa->eclosures[node_idx].nelem != 0)
4182
continue;
4183
/* Calculate epsilon closure of `node_idx'. */
4184
err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
4185
if (BE (err != REG_NOERROR, 0))
4186
return err;
4187
4188
if (dfa->eclosures[node_idx].nelem == 0)
4189
{
4190
incomplete = 1;
4191
re_node_set_free (&eclosure_elem);
4192
}
4193
}
4194
return REG_NOERROR;
4195
}
4196
4197
/* Calculate epsilon closure of NODE. */
4198
4199
static reg_errcode_t
4200
calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
4201
{
4202
reg_errcode_t err;
4203
unsigned int constraint;
4204
int i, incomplete;
4205
re_node_set eclosure;
4206
incomplete = 0;
4207
err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
4208
if (BE (err != REG_NOERROR, 0))
4209
return err;
4210
4211
/* This indicates that we are calculating this node now.
4212
We reference this value to avoid infinite loop. */
4213
dfa->eclosures[node].nelem = -1;
4214
4215
constraint = ((dfa->nodes[node].type == ANCHOR)
4216
? dfa->nodes[node].opr.ctx_type : 0);
4217
/* If the current node has constraints, duplicate all nodes.
4218
Since they must inherit the constraints. */
4219
if (constraint
4220
&& dfa->edests[node].nelem
4221
&& !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
4222
{
4223
err = duplicate_node_closure (dfa, node, node, node, constraint);
4224
if (BE (err != REG_NOERROR, 0))
4225
return err;
4226
}
4227
4228
/* Expand each epsilon destination nodes. */
4229
if (IS_EPSILON_NODE(dfa->nodes[node].type))
4230
for (i = 0; i < dfa->edests[node].nelem; ++i)
4231
{
4232
re_node_set eclosure_elem;
4233
int edest = dfa->edests[node].elems[i];
4234
/* If calculating the epsilon closure of `edest' is in progress,
4235
return intermediate result. */
4236
if (dfa->eclosures[edest].nelem == -1)
4237
{
4238
incomplete = 1;
4239
continue;
4240
}
4241
/* If we haven't calculated the epsilon closure of `edest' yet,
4242
calculate now. Otherwise use calculated epsilon closure. */
4243
if (dfa->eclosures[edest].nelem == 0)
4244
{
4245
err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
4246
if (BE (err != REG_NOERROR, 0))
4247
return err;
4248
}
4249
else
4250
eclosure_elem = dfa->eclosures[edest];
4251
/* Merge the epsilon closure of `edest'. */
4252
re_node_set_merge (&eclosure, &eclosure_elem);
4253
/* If the epsilon closure of `edest' is incomplete,
4254
the epsilon closure of this node is also incomplete. */
4255
if (dfa->eclosures[edest].nelem == 0)
4256
{
4257
incomplete = 1;
4258
re_node_set_free (&eclosure_elem);
4259
}
4260
}
4261
4262
/* Epsilon closures include itself. */
4263
re_node_set_insert (&eclosure, node);
4264
if (incomplete && !root)
4265
dfa->eclosures[node].nelem = 0;
4266
else
4267
dfa->eclosures[node] = eclosure;
4268
*new_set = eclosure;
4269
return REG_NOERROR;
4270
}
4271
4272
/* Functions for token which are used in the parser. */
4273
4274
/* Fetch a token from INPUT.
4275
We must not use this function inside bracket expressions. */
4276
4277
static void
4278
internal_function
4279
fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
4280
{
4281
re_string_skip_bytes (input, peek_token (result, input, syntax));
4282
}
4283
4284
/* Peek a token from INPUT, and return the length of the token.
4285
We must not use this function inside bracket expressions. */
4286
4287
static int
4288
internal_function
4289
peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
4290
{
4291
unsigned char c;
4292
4293
if (re_string_eoi (input))
4294
{
4295
token->type = END_OF_RE;
4296
return 0;
4297
}
4298
4299
c = re_string_peek_byte (input, 0);
4300
token->opr.c = c;
4301
4302
token->word_char = 0;
4303
#ifdef RE_ENABLE_I18N
4304
token->mb_partial = 0;
4305
if (input->mb_cur_max > 1 &&
4306
!re_string_first_byte (input, re_string_cur_idx (input)))
4307
{
4308
token->type = CHARACTER;
4309
token->mb_partial = 1;
4310
return 1;
4311
}
4312
#endif
4313
if (c == '\\')
4314
{
4315
unsigned char c2;
4316
if (re_string_cur_idx (input) + 1 >= re_string_length (input))
4317
{
4318
token->type = BACK_SLASH;
4319
return 1;
4320
}
4321
4322
c2 = re_string_peek_byte_case (input, 1);
4323
token->opr.c = c2;
4324
token->type = CHARACTER;
4325
#ifdef RE_ENABLE_I18N
4326
if (input->mb_cur_max > 1)
4327
{
4328
wint_t wc = re_string_wchar_at (input,
4329
re_string_cur_idx (input) + 1);
4330
token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
4331
}
4332
else
4333
#endif
4334
token->word_char = IS_WORD_CHAR (c2) != 0;
4335
4336
switch (c2)
4337
{
4338
case '|':
4339
if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
4340
token->type = OP_ALT;
4341
break;
4342
case '1': case '2': case '3': case '4': case '5':
4343
case '6': case '7': case '8': case '9':
4344
if (!(syntax & RE_NO_BK_REFS))
4345
{
4346
token->type = OP_BACK_REF;
4347
token->opr.idx = c2 - '1';
4348
}
4349
break;
4350
case '<':
4351
if (!(syntax & RE_NO_GNU_OPS))
4352
{
4353
token->type = ANCHOR;
4354
token->opr.ctx_type = WORD_FIRST;
4355
}
4356
break;
4357
case '>':
4358
if (!(syntax & RE_NO_GNU_OPS))
4359
{
4360
token->type = ANCHOR;
4361
token->opr.ctx_type = WORD_LAST;
4362
}
4363
break;
4364
case 'b':
4365
if (!(syntax & RE_NO_GNU_OPS))
4366
{
4367
token->type = ANCHOR;
4368
token->opr.ctx_type = WORD_DELIM;
4369
}
4370
break;
4371
case 'B':
4372
if (!(syntax & RE_NO_GNU_OPS))
4373
{
4374
token->type = ANCHOR;
4375
token->opr.ctx_type = NOT_WORD_DELIM;
4376
}
4377
break;
4378
case 'w':
4379
if (!(syntax & RE_NO_GNU_OPS))
4380
token->type = OP_WORD;
4381
break;
4382
case 'W':
4383
if (!(syntax & RE_NO_GNU_OPS))
4384
token->type = OP_NOTWORD;
4385
break;
4386
case 's':
4387
if (!(syntax & RE_NO_GNU_OPS))
4388
token->type = OP_SPACE;
4389
break;
4390
case 'S':
4391
if (!(syntax & RE_NO_GNU_OPS))
4392
token->type = OP_NOTSPACE;
4393
break;
4394
case '`':
4395
if (!(syntax & RE_NO_GNU_OPS))
4396
{
4397
token->type = ANCHOR;
4398
token->opr.ctx_type = BUF_FIRST;
4399
}
4400
break;
4401
case '\'':
4402
if (!(syntax & RE_NO_GNU_OPS))
4403
{
4404
token->type = ANCHOR;
4405
token->opr.ctx_type = BUF_LAST;
4406
}
4407
break;
4408
case '(':
4409
if (!(syntax & RE_NO_BK_PARENS))
4410
token->type = OP_OPEN_SUBEXP;
4411
break;
4412
case ')':
4413
if (!(syntax & RE_NO_BK_PARENS))
4414
token->type = OP_CLOSE_SUBEXP;
4415
break;
4416
case '+':
4417
if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
4418
token->type = OP_DUP_PLUS;
4419
break;
4420
case '?':
4421
if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
4422
token->type = OP_DUP_QUESTION;
4423
break;
4424
case '{':
4425
if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
4426
token->type = OP_OPEN_DUP_NUM;
4427
break;
4428
case '}':
4429
if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
4430
token->type = OP_CLOSE_DUP_NUM;
4431
break;
4432
default:
4433
break;
4434
}
4435
return 2;
4436
}
4437
4438
token->type = CHARACTER;
4439
#ifdef RE_ENABLE_I18N
4440
if (input->mb_cur_max > 1)
4441
{
4442
wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
4443
token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
4444
}
4445
else
4446
#endif
4447
token->word_char = IS_WORD_CHAR (token->opr.c);
4448
4449
switch (c)
4450
{
4451
case '\n':
4452
if (syntax & RE_NEWLINE_ALT)
4453
token->type = OP_ALT;
4454
break;
4455
case '|':
4456
if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
4457
token->type = OP_ALT;
4458
break;
4459
case '*':
4460
token->type = OP_DUP_ASTERISK;
4461
break;
4462
case '+':
4463
if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
4464
token->type = OP_DUP_PLUS;
4465
break;
4466
case '?':
4467
if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
4468
token->type = OP_DUP_QUESTION;
4469
break;
4470
case '{':
4471
if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
4472
token->type = OP_OPEN_DUP_NUM;
4473
break;
4474
case '}':
4475
if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
4476
token->type = OP_CLOSE_DUP_NUM;
4477
break;
4478
case '(':
4479
if (syntax & RE_NO_BK_PARENS)
4480
token->type = OP_OPEN_SUBEXP;
4481
break;
4482
case ')':
4483
if (syntax & RE_NO_BK_PARENS)
4484
token->type = OP_CLOSE_SUBEXP;
4485
break;
4486
case '[':
4487
token->type = OP_OPEN_BRACKET;
4488
break;
4489
case '.':
4490
token->type = OP_PERIOD;
4491
break;
4492
case '^':
4493
if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
4494
re_string_cur_idx (input) != 0)
4495
{
4496
char prev = re_string_peek_byte (input, -1);
4497
if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
4498
break;
4499
}
4500
token->type = ANCHOR;
4501
token->opr.ctx_type = LINE_FIRST;
4502
break;
4503
case '$':
4504
if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
4505
re_string_cur_idx (input) + 1 != re_string_length (input))
4506
{
4507
re_token_t next;
4508
re_string_skip_bytes (input, 1);
4509
peek_token (&next, input, syntax);
4510
re_string_skip_bytes (input, -1);
4511
if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
4512
break;
4513
}
4514
token->type = ANCHOR;
4515
token->opr.ctx_type = LINE_LAST;
4516
break;
4517
default:
4518
break;
4519
}
4520
return 1;
4521
}
4522
4523
/* Peek a token from INPUT, and return the length of the token.
4524
We must not use this function out of bracket expressions. */
4525
4526
static int
4527
internal_function
4528
peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
4529
{
4530
unsigned char c;
4531
if (re_string_eoi (input))
4532
{
4533
token->type = END_OF_RE;
4534
return 0;
4535
}
4536
c = re_string_peek_byte (input, 0);
4537
token->opr.c = c;
4538
4539
#ifdef RE_ENABLE_I18N
4540
if (input->mb_cur_max > 1 &&
4541
!re_string_first_byte (input, re_string_cur_idx (input)))
4542
{
4543
token->type = CHARACTER;
4544
return 1;
4545
}
4546
#endif /* RE_ENABLE_I18N */
4547
4548
if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
4549
&& re_string_cur_idx (input) + 1 < re_string_length (input))
4550
{
4551
/* In this case, '\' escape a character. */
4552
unsigned char c2;
4553
re_string_skip_bytes (input, 1);
4554
c2 = re_string_peek_byte (input, 0);
4555
token->opr.c = c2;
4556
token->type = CHARACTER;
4557
return 1;
4558
}
4559
if (c == '[') /* '[' is a special char in a bracket exps. */
4560
{
4561
unsigned char c2;
4562
int token_len;
4563
if (re_string_cur_idx (input) + 1 < re_string_length (input))
4564
c2 = re_string_peek_byte (input, 1);
4565
else
4566
c2 = 0;
4567
token->opr.c = c2;
4568
token_len = 2;
4569
switch (c2)
4570
{
4571
case '.':
4572
token->type = OP_OPEN_COLL_ELEM;
4573
break;
4574
case '=':
4575
token->type = OP_OPEN_EQUIV_CLASS;
4576
break;
4577
case ':':
4578
if (syntax & RE_CHAR_CLASSES)
4579
{
4580
token->type = OP_OPEN_CHAR_CLASS;
4581
break;
4582
}
4583
/* else fall through. */
4584
default:
4585
token->type = CHARACTER;
4586
token->opr.c = c;
4587
token_len = 1;
4588
break;
4589
}
4590
return token_len;
4591
}
4592
switch (c)
4593
{
4594
case '-':
4595
token->type = OP_CHARSET_RANGE;
4596
break;
4597
case ']':
4598
token->type = OP_CLOSE_BRACKET;
4599
break;
4600
case '^':
4601
token->type = OP_NON_MATCH_LIST;
4602
break;
4603
default:
4604
token->type = CHARACTER;
4605
}
4606
return 1;
4607
}
4608
4609
/* Functions for parser. */
4610
4611
/* Entry point of the parser.
4612
Parse the regular expression REGEXP and return the structure tree.
4613
If an error is occured, ERR is set by error code, and return NULL.
4614
This function build the following tree, from regular expression <reg_exp>:
4615
CAT
4616
/ \
4617
/ \
4618
<reg_exp> EOR
4619
4620
CAT means concatenation.
4621
EOR means end of regular expression. */
4622
4623
static bin_tree_t *
4624
parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
4625
reg_errcode_t *err)
4626
{
4627
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
4628
bin_tree_t *tree, *eor, *root;
4629
re_token_t current_token;
4630
dfa->syntax = syntax;
4631
fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
4632
tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
4633
if (BE (*err != REG_NOERROR && tree == NULL, 0))
4634
return NULL;
4635
eor = create_tree (dfa, NULL, NULL, END_OF_RE);
4636
if (tree != NULL)
4637
root = create_tree (dfa, tree, eor, CONCAT);
4638
else
4639
root = eor;
4640
if (BE (eor == NULL || root == NULL, 0))
4641
{
4642
*err = REG_ESPACE;
4643
return NULL;
4644
}
4645
return root;
4646
}
4647
4648
/* This function build the following tree, from regular expression
4649
<branch1>|<branch2>:
4650
ALT
4651
/ \
4652
/ \
4653
<branch1> <branch2>
4654
4655
ALT means alternative, which represents the operator `|'. */
4656
4657
static bin_tree_t *
4658
parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
4659
reg_syntax_t syntax, int nest, reg_errcode_t *err)
4660
{
4661
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
4662
bin_tree_t *tree, *branch = NULL;
4663
tree = parse_branch (regexp, preg, token, syntax, nest, err);
4664
if (BE (*err != REG_NOERROR && tree == NULL, 0))
4665
return NULL;
4666
4667
while (token->type == OP_ALT)
4668
{
4669
fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
4670
if (token->type != OP_ALT && token->type != END_OF_RE
4671
&& (nest == 0 || token->type != OP_CLOSE_SUBEXP))
4672
{
4673
branch = parse_branch (regexp, preg, token, syntax, nest, err);
4674
if (BE (*err != REG_NOERROR && branch == NULL, 0))
4675
return NULL;
4676
}
4677
else
4678
branch = NULL;
4679
tree = create_tree (dfa, tree, branch, OP_ALT);
4680
if (BE (tree == NULL, 0))
4681
{
4682
*err = REG_ESPACE;
4683
return NULL;
4684
}
4685
}
4686
return tree;
4687
}
4688
4689
/* This function build the following tree, from regular expression
4690
<exp1><exp2>:
4691
CAT
4692
/ \
4693
/ \
4694
<exp1> <exp2>
4695
4696
CAT means concatenation. */
4697
4698
static bin_tree_t *
4699
parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
4700
reg_syntax_t syntax, int nest, reg_errcode_t *err)
4701
{
4702
bin_tree_t *tree, *exp;
4703
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
4704
tree = parse_expression (regexp, preg, token, syntax, nest, err);
4705
if (BE (*err != REG_NOERROR && tree == NULL, 0))
4706
return NULL;
4707
4708
while (token->type != OP_ALT && token->type != END_OF_RE
4709
&& (nest == 0 || token->type != OP_CLOSE_SUBEXP))
4710
{
4711
exp = parse_expression (regexp, preg, token, syntax, nest, err);
4712
if (BE (*err != REG_NOERROR && exp == NULL, 0))
4713
{
4714
return NULL;
4715
}
4716
if (tree != NULL && exp != NULL)
4717
{
4718
tree = create_tree (dfa, tree, exp, CONCAT);
4719
if (tree == NULL)
4720
{
4721
*err = REG_ESPACE;
4722
return NULL;
4723
}
4724
}
4725
else if (tree == NULL)
4726
tree = exp;
4727
/* Otherwise exp == NULL, we don't need to create new tree. */
4728
}
4729
return tree;
4730
}
4731
4732
/* This function build the following tree, from regular expression a*:
4733
*
4734
|
4735
a
4736
*/
4737
4738
static bin_tree_t *
4739
parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
4740
reg_syntax_t syntax, int nest, reg_errcode_t *err)
4741
{
4742
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
4743
bin_tree_t *tree;
4744
switch (token->type)
4745
{
4746
case CHARACTER:
4747
tree = create_token_tree (dfa, NULL, NULL, token);
4748
if (BE (tree == NULL, 0))
4749
{
4750
*err = REG_ESPACE;
4751
return NULL;
4752
}
4753
#ifdef RE_ENABLE_I18N
4754
if (dfa->mb_cur_max > 1)
4755
{
4756
while (!re_string_eoi (regexp)
4757
&& !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
4758
{
4759
bin_tree_t *mbc_remain;
4760
fetch_token (token, regexp, syntax);
4761
mbc_remain = create_token_tree (dfa, NULL, NULL, token);
4762
tree = create_tree (dfa, tree, mbc_remain, CONCAT);
4763
if (BE (mbc_remain == NULL || tree == NULL, 0))
4764
{
4765
*err = REG_ESPACE;
4766
return NULL;
4767
}
4768
}
4769
}
4770
#endif
4771
break;
4772
case OP_OPEN_SUBEXP:
4773
tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
4774
if (BE (*err != REG_NOERROR && tree == NULL, 0))
4775
return NULL;
4776
break;
4777
case OP_OPEN_BRACKET:
4778
tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
4779
if (BE (*err != REG_NOERROR && tree == NULL, 0))
4780
return NULL;
4781
break;
4782
case OP_BACK_REF:
4783
if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
4784
{
4785
*err = REG_ESUBREG;
4786
return NULL;
4787
}
4788
dfa->used_bkref_map |= 1 << token->opr.idx;
4789
tree = create_token_tree (dfa, NULL, NULL, token);
4790
if (BE (tree == NULL, 0))
4791
{
4792
*err = REG_ESPACE;
4793
return NULL;
4794
}
4795
++dfa->nbackref;
4796
dfa->has_mb_node = 1;
4797
break;
4798
case OP_OPEN_DUP_NUM:
4799
if (syntax & RE_CONTEXT_INVALID_DUP)
4800
{
4801
*err = REG_BADRPT;
4802
return NULL;
4803
}
4804
/* FALLTHROUGH */
4805
case OP_DUP_ASTERISK:
4806
case OP_DUP_PLUS:
4807
case OP_DUP_QUESTION:
4808
if (syntax & RE_CONTEXT_INVALID_OPS)
4809
{
4810
*err = REG_BADRPT;
4811
return NULL;
4812
}
4813
else if (syntax & RE_CONTEXT_INDEP_OPS)
4814
{
4815
fetch_token (token, regexp, syntax);
4816
return parse_expression (regexp, preg, token, syntax, nest, err);
4817
}
4818
/* else fall through */
4819
case OP_CLOSE_SUBEXP:
4820
if ((token->type == OP_CLOSE_SUBEXP) &&
4821
!(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
4822
{
4823
*err = REG_ERPAREN;
4824
return NULL;
4825
}
4826
/* else fall through */
4827
case OP_CLOSE_DUP_NUM:
4828
/* We treat it as a normal character. */
4829
4830
/* Then we can these characters as normal characters. */
4831
token->type = CHARACTER;
4832
/* mb_partial and word_char bits should be initialized already
4833
by peek_token. */
4834
tree = create_token_tree (dfa, NULL, NULL, token);
4835
if (BE (tree == NULL, 0))
4836
{
4837
*err = REG_ESPACE;
4838
return NULL;
4839
}
4840
break;
4841
case ANCHOR:
4842
if ((token->opr.ctx_type
4843
& (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
4844
&& dfa->word_ops_used == 0)
4845
init_word_char (dfa);
4846
if (token->opr.ctx_type == WORD_DELIM
4847
|| token->opr.ctx_type == NOT_WORD_DELIM)
4848
{
4849
bin_tree_t *tree_first, *tree_last;
4850
if (token->opr.ctx_type == WORD_DELIM)
4851
{
4852
token->opr.ctx_type = WORD_FIRST;
4853
tree_first = create_token_tree (dfa, NULL, NULL, token);
4854
token->opr.ctx_type = WORD_LAST;
4855
}
4856
else
4857
{
4858
token->opr.ctx_type = INSIDE_WORD;
4859
tree_first = create_token_tree (dfa, NULL, NULL, token);
4860
token->opr.ctx_type = INSIDE_NOTWORD;
4861
}
4862
tree_last = create_token_tree (dfa, NULL, NULL, token);
4863
tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
4864
if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
4865
{
4866
*err = REG_ESPACE;
4867
return NULL;
4868
}
4869
}
4870
else
4871
{
4872
tree = create_token_tree (dfa, NULL, NULL, token);
4873
if (BE (tree == NULL, 0))
4874
{
4875
*err = REG_ESPACE;
4876
return NULL;
4877
}
4878
}
4879
/* We must return here, since ANCHORs can't be followed
4880
by repetition operators.
4881
eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
4882
it must not be "<ANCHOR(^)><REPEAT(*)>". */
4883
fetch_token (token, regexp, syntax);
4884
return tree;
4885
case OP_PERIOD:
4886
tree = create_token_tree (dfa, NULL, NULL, token);
4887
if (BE (tree == NULL, 0))
4888
{
4889
*err = REG_ESPACE;
4890
return NULL;
4891
}
4892
if (dfa->mb_cur_max > 1)
4893
dfa->has_mb_node = 1;
4894
break;
4895
case OP_WORD:
4896
case OP_NOTWORD:
4897
tree = build_charclass_op (dfa, regexp->trans,
4898
(const unsigned char *) "alnum",
4899
(const unsigned char *) "_",
4900
token->type == OP_NOTWORD, err);
4901
if (BE (*err != REG_NOERROR && tree == NULL, 0))
4902
return NULL;
4903
break;
4904
case OP_SPACE:
4905
case OP_NOTSPACE:
4906
tree = build_charclass_op (dfa, regexp->trans,
4907
(const unsigned char *) "space",
4908
(const unsigned char *) "",
4909
token->type == OP_NOTSPACE, err);
4910
if (BE (*err != REG_NOERROR && tree == NULL, 0))
4911
return NULL;
4912
break;
4913
case OP_ALT:
4914
case END_OF_RE:
4915
return NULL;
4916
case BACK_SLASH:
4917
*err = REG_EESCAPE;
4918
return NULL;
4919
default:
4920
/* Must not happen? */
4921
#ifdef DEBUG
4922
assert (0);
4923
#endif
4924
return NULL;
4925
}
4926
fetch_token (token, regexp, syntax);
4927
4928
while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
4929
|| token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
4930
{
4931
tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
4932
if (BE (*err != REG_NOERROR && tree == NULL, 0))
4933
return NULL;
4934
/* In BRE consecutive duplications are not allowed. */
4935
if ((syntax & RE_CONTEXT_INVALID_DUP)
4936
&& (token->type == OP_DUP_ASTERISK
4937
|| token->type == OP_OPEN_DUP_NUM))
4938
{
4939
*err = REG_BADRPT;
4940
return NULL;
4941
}
4942
}
4943
4944
return tree;
4945
}
4946
4947
/* This function build the following tree, from regular expression
4948
(<reg_exp>):
4949
SUBEXP
4950
|
4951
<reg_exp>
4952
*/
4953
4954
static bin_tree_t *
4955
parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
4956
reg_syntax_t syntax, int nest, reg_errcode_t *err)
4957
{
4958
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
4959
bin_tree_t *tree;
4960
size_t cur_nsub;
4961
cur_nsub = preg->re_nsub++;
4962
4963
fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
4964
4965
/* The subexpression may be a null string. */
4966
if (token->type == OP_CLOSE_SUBEXP)
4967
tree = NULL;
4968
else
4969
{
4970
tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
4971
if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
4972
*err = REG_EPAREN;
4973
if (BE (*err != REG_NOERROR, 0))
4974
return NULL;
4975
}
4976
4977
if (cur_nsub <= '9' - '1')
4978
dfa->completed_bkref_map |= 1 << cur_nsub;
4979
4980
tree = create_tree (dfa, tree, NULL, SUBEXP);
4981
if (BE (tree == NULL, 0))
4982
{
4983
*err = REG_ESPACE;
4984
return NULL;
4985
}
4986
tree->token.opr.idx = cur_nsub;
4987
return tree;
4988
}
4989
4990
/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
4991
4992
static bin_tree_t *
4993
parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
4994
re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
4995
{
4996
bin_tree_t *tree = NULL, *old_tree = NULL;
4997
int i, start, end, start_idx = re_string_cur_idx (regexp);
4998
re_token_t start_token = *token;
4999
5000
if (token->type == OP_OPEN_DUP_NUM)
5001
{
5002
end = 0;
5003
start = fetch_number (regexp, token, syntax);
5004
if (start == -1)
5005
{
5006
if (token->type == CHARACTER && token->opr.c == ',')
5007
start = 0; /* We treat "{,m}" as "{0,m}". */
5008
else
5009
{
5010
*err = REG_BADBR; /* <re>{} is invalid. */
5011
return NULL;
5012
}
5013
}
5014
if (BE (start != -2, 1))
5015
{
5016
/* We treat "{n}" as "{n,n}". */
5017
end = ((token->type == OP_CLOSE_DUP_NUM) ? start
5018
: ((token->type == CHARACTER && token->opr.c == ',')
5019
? fetch_number (regexp, token, syntax) : -2));
5020
}
5021
if (BE (start == -2 || end == -2, 0))
5022
{
5023
/* Invalid sequence. */
5024
if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
5025
{
5026
if (token->type == END_OF_RE)
5027
*err = REG_EBRACE;
5028
else
5029
*err = REG_BADBR;
5030
5031
return NULL;
5032
}
5033
5034
/* If the syntax bit is set, rollback. */
5035
re_string_set_index (regexp, start_idx);
5036
*token = start_token;
5037
token->type = CHARACTER;
5038
/* mb_partial and word_char bits should be already initialized by
5039
peek_token. */
5040
return elem;
5041
}
5042
5043
if (BE (end != -1 && start > end, 0))
5044
{
5045
/* First number greater than second. */
5046
*err = REG_BADBR;
5047
return NULL;
5048
}
5049
}
5050
else
5051
{
5052
start = (token->type == OP_DUP_PLUS) ? 1 : 0;
5053
end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
5054
}
5055
5056
fetch_token (token, regexp, syntax);
5057
5058
if (BE (elem == NULL, 0))
5059
return NULL;
5060
if (BE (start == 0 && end == 0, 0))
5061
{
5062
postorder (elem, free_tree, NULL);
5063
return NULL;
5064
}
5065
5066
/* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
5067
if (BE (start > 0, 0))
5068
{
5069
tree = elem;
5070
for (i = 2; i <= start; ++i)
5071
{
5072
elem = duplicate_tree (elem, dfa);
5073
tree = create_tree (dfa, tree, elem, CONCAT);
5074
if (BE (elem == NULL || tree == NULL, 0))
5075
goto parse_dup_op_espace;
5076
}
5077
5078
if (start == end)
5079
return tree;
5080
5081
/* Duplicate ELEM before it is marked optional. */
5082
elem = duplicate_tree (elem, dfa);
5083
old_tree = tree;
5084
}
5085
else
5086
old_tree = NULL;
5087
5088
if (elem->token.type == SUBEXP)
5089
postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
5090
5091
tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
5092
if (BE (tree == NULL, 0))
5093
goto parse_dup_op_espace;
5094
5095
/* This loop is actually executed only when end != -1,
5096
to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
5097
already created the start+1-th copy. */
5098
for (i = start + 2; i <= end; ++i)
5099
{
5100
elem = duplicate_tree (elem, dfa);
5101
tree = create_tree (dfa, tree, elem, CONCAT);
5102
if (BE (elem == NULL || tree == NULL, 0))
5103
goto parse_dup_op_espace;
5104
5105
tree = create_tree (dfa, tree, NULL, OP_ALT);
5106
if (BE (tree == NULL, 0))
5107
goto parse_dup_op_espace;
5108
}
5109
5110
if (old_tree)
5111
tree = create_tree (dfa, old_tree, tree, CONCAT);
5112
5113
return tree;
5114
5115
parse_dup_op_espace:
5116
*err = REG_ESPACE;
5117
return NULL;
5118
}
5119
5120
/* Size of the buffer used for the name of a collating symbol, an
   equivalence class or a character class.  The value is a guess that is
   assumed to be large enough.  */
5122
#define BRACKET_NAME_BUF_SIZE 32
5123
5124
#ifndef _LIBC
5125
/* Local function for parse_bracket_exp only used in case of NOT _LIBC.
5126
Build the range expression which starts from START_ELEM, and ends
5127
at END_ELEM. The result are written to MBCSET and SBCSET.
5128
RANGE_ALLOC is the allocated size of mbcset->range_starts, and
5129
mbcset->range_ends, is a pointer argument sinse we may
5130
update it. */
5131
5132
static reg_errcode_t
5133
internal_function
5134
# ifdef RE_ENABLE_I18N
5135
build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
5136
bracket_elem_t *start_elem, bracket_elem_t *end_elem)
5137
# else /* not RE_ENABLE_I18N */
5138
build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
5139
bracket_elem_t *end_elem)
5140
# endif /* not RE_ENABLE_I18N */
5141
{
5142
unsigned int start_ch, end_ch;
5143
/* Equivalence Classes and Character Classes can't be a range start/end. */
5144
if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
5145
|| end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
5146
0))
5147
return REG_ERANGE;
5148
5149
/* We can handle no multi character collating elements without libc
5150
support. */
5151
if (BE ((start_elem->type == COLL_SYM
5152
&& strlen ((char *) start_elem->opr.name) > 1)
5153
|| (end_elem->type == COLL_SYM
5154
&& strlen ((char *) end_elem->opr.name) > 1), 0))
5155
return REG_ECOLLATE;
5156
5157
# ifdef RE_ENABLE_I18N
5158
{
5159
wchar_t wc;
5160
wint_t start_wc;
5161
wint_t end_wc;
5162
wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
5163
5164
start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
5165
: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
5166
: 0));
5167
end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
5168
: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
5169
: 0));
5170
start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
5171
? __btowc (start_ch) : start_elem->opr.wch);
5172
end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
5173
? __btowc (end_ch) : end_elem->opr.wch);
5174
if (start_wc == WEOF || end_wc == WEOF)
5175
return REG_ECOLLATE;
5176
cmp_buf[0] = start_wc;
5177
cmp_buf[4] = end_wc;
5178
if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
5179
return REG_ERANGE;
5180
5181
/* Got valid collation sequence values, add them as a new entry.
5182
However, for !_LIBC we have no collation elements: if the
5183
character set is single byte, the single byte character set
5184
that we build below suffices. parse_bracket_exp passes
5185
no MBCSET if dfa->mb_cur_max == 1. */
5186
if (mbcset)
5187
{
5188
/* Check the space of the arrays. */
5189
if (BE (*range_alloc == mbcset->nranges, 0))
5190
{
5191
/* There is not enough space, need realloc. */
5192
wchar_t *new_array_start, *new_array_end;
5193
int new_nranges;
5194
5195
/* +1 in case of mbcset->nranges is 0. */
5196
new_nranges = 2 * mbcset->nranges + 1;
5197
/* Use realloc since mbcset->range_starts and mbcset->range_ends
5198
are NULL if *range_alloc == 0. */
5199
new_array_start = re_realloc (mbcset->range_starts, wchar_t,
5200
new_nranges);
5201
new_array_end = re_realloc (mbcset->range_ends, wchar_t,
5202
new_nranges);
5203
5204
if (BE (new_array_start == NULL || new_array_end == NULL, 0))
5205
return REG_ESPACE;
5206
5207
mbcset->range_starts = new_array_start;
5208
mbcset->range_ends = new_array_end;
5209
*range_alloc = new_nranges;
5210
}
5211
5212
mbcset->range_starts[mbcset->nranges] = start_wc;
5213
mbcset->range_ends[mbcset->nranges++] = end_wc;
5214
}
5215
5216
/* Build the table for single byte characters. */
5217
for (wc = 0; wc < SBC_MAX; ++wc)
5218
{
5219
cmp_buf[2] = wc;
5220
if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
5221
&& wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
5222
bitset_set (sbcset, wc);
5223
}
5224
}
5225
# else /* not RE_ENABLE_I18N */
5226
{
5227
unsigned int ch;
5228
start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
5229
: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
5230
: 0));
5231
end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
5232
: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
5233
: 0));
5234
if (start_ch > end_ch)
5235
return REG_ERANGE;
5236
/* Build the table for single byte characters. */
5237
for (ch = 0; ch < SBC_MAX; ++ch)
5238
if (start_ch <= ch && ch <= end_ch)
5239
bitset_set (sbcset, ch);
5240
}
5241
# endif /* not RE_ENABLE_I18N */
5242
return REG_NOERROR;
5243
}
5244
#endif /* not _LIBC */
5245
5246
#ifndef _LIBC
5247
/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
5248
Build the collating element which is represented by NAME.
5249
The result are written to MBCSET and SBCSET.
5250
COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
5251
pointer argument since we may update it. */
5252
5253
static reg_errcode_t
5254
internal_function
5255
# ifdef RE_ENABLE_I18N
5256
build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
5257
int *coll_sym_alloc, const unsigned char *name)
5258
# else /* not RE_ENABLE_I18N */
5259
build_collating_symbol (bitset_t sbcset, const unsigned char *name)
5260
# endif /* not RE_ENABLE_I18N */
5261
{
5262
size_t name_len = strlen ((const char *) name);
5263
if (BE (name_len != 1, 0))
5264
return REG_ECOLLATE;
5265
else
5266
{
5267
bitset_set (sbcset, name[0]);
5268
return REG_NOERROR;
5269
}
5270
}
5271
#endif /* not _LIBC */
5272
5273
/* This function parse bracket expression like "[abc]", "[a-c]",
5274
"[[.a-a.]]" etc. */
5275
5276
static bin_tree_t *
5277
parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
5278
reg_syntax_t syntax, reg_errcode_t *err)
5279
{
5280
#ifdef _LIBC
5281
const unsigned char *collseqmb;
5282
const char *collseqwc;
5283
uint32_t nrules;
5284
int32_t table_size;
5285
const int32_t *symb_table;
5286
const unsigned char *extra;
5287
5288
/* Local function for parse_bracket_exp used in _LIBC environement.
5289
Seek the collating symbol entry correspondings to NAME.
5290
Return the index of the symbol in the SYMB_TABLE. */
5291
5292
auto inline int32_t
5293
__attribute ((always_inline))
5294
seek_collating_symbol_entry (name, name_len)
5295
const unsigned char *name;
5296
size_t name_len;
5297
{
5298
int32_t hash = elem_hash ((const char *) name, name_len);
5299
int32_t elem = hash % table_size;
5300
if (symb_table[2 * elem] != 0)
5301
{
5302
int32_t second = hash % (table_size - 2) + 1;
5303
5304
do
5305
{
5306
/* First compare the hashing value. */
5307
if (symb_table[2 * elem] == hash
5308
/* Compare the length of the name. */
5309
&& name_len == extra[symb_table[2 * elem + 1]]
5310
/* Compare the name. */
5311
&& memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
5312
name_len) == 0)
5313
{
5314
/* Yep, this is the entry. */
5315
break;
5316
}
5317
5318
/* Next entry. */
5319
elem += second;
5320
}
5321
while (symb_table[2 * elem] != 0);
5322
}
5323
return elem;
5324
}
5325
5326
/* Local function for parse_bracket_exp used in _LIBC environement.
5327
Look up the collation sequence value of BR_ELEM.
5328
Return the value if succeeded, UINT_MAX otherwise. */
5329
5330
auto inline unsigned int
5331
__attribute ((always_inline))
5332
lookup_collation_sequence_value (br_elem)
5333
bracket_elem_t *br_elem;
5334
{
5335
if (br_elem->type == SB_CHAR)
5336
{
5337
/*
5338
if (MB_CUR_MAX == 1)
5339
*/
5340
if (nrules == 0)
5341
return collseqmb[br_elem->opr.ch];
5342
else
5343
{
5344
wint_t wc = __btowc (br_elem->opr.ch);
5345
return __collseq_table_lookup (collseqwc, wc);
5346
}
5347
}
5348
else if (br_elem->type == MB_CHAR)
5349
{
5350
return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
5351
}
5352
else if (br_elem->type == COLL_SYM)
5353
{
5354
size_t sym_name_len = strlen ((char *) br_elem->opr.name);
5355
if (nrules != 0)
5356
{
5357
int32_t elem, idx;
5358
elem = seek_collating_symbol_entry (br_elem->opr.name,
5359
sym_name_len);
5360
if (symb_table[2 * elem] != 0)
5361
{
5362
/* We found the entry. */
5363
idx = symb_table[2 * elem + 1];
5364
/* Skip the name of collating element name. */
5365
idx += 1 + extra[idx];
5366
/* Skip the byte sequence of the collating element. */
5367
idx += 1 + extra[idx];
5368
/* Adjust for the alignment. */
5369
idx = (idx + 3) & ~3;
5370
/* Skip the multibyte collation sequence value. */
5371
idx += sizeof (unsigned int);
5372
/* Skip the wide char sequence of the collating element. */
5373
idx += sizeof (unsigned int) *
5374
(1 + *(unsigned int *) (extra + idx));
5375
/* Return the collation sequence value. */
5376
return *(unsigned int *) (extra + idx);
5377
}
5378
else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
5379
{
5380
/* No valid character. Match it as a single byte
5381
character. */
5382
return collseqmb[br_elem->opr.name[0]];
5383
}
5384
}
5385
else if (sym_name_len == 1)
5386
return collseqmb[br_elem->opr.name[0]];
5387
}
5388
return UINT_MAX;
5389
}
5390
5391
/* Local function for parse_bracket_exp used in _LIBC environement.
5392
Build the range expression which starts from START_ELEM, and ends
5393
at END_ELEM. The result are written to MBCSET and SBCSET.
5394
RANGE_ALLOC is the allocated size of mbcset->range_starts, and
5395
mbcset->range_ends, is a pointer argument sinse we may
5396
update it. */
5397
5398
auto inline reg_errcode_t
5399
__attribute ((always_inline))
5400
build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
5401
re_charset_t *mbcset;
5402
int *range_alloc;
5403
bitset_t sbcset;
5404
bracket_elem_t *start_elem, *end_elem;
5405
{
5406
unsigned int ch;
5407
uint32_t start_collseq;
5408
uint32_t end_collseq;
5409
5410
/* Equivalence Classes and Character Classes can't be a range
5411
start/end. */
5412
if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
5413
|| end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
5414
0))
5415
return REG_ERANGE;
5416
5417
start_collseq = lookup_collation_sequence_value (start_elem);
5418
end_collseq = lookup_collation_sequence_value (end_elem);
5419
/* Check start/end collation sequence values. */
5420
if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
5421
return REG_ECOLLATE;
5422
if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
5423
return REG_ERANGE;
5424
5425
/* Got valid collation sequence values, add them as a new entry.
5426
However, if we have no collation elements, and the character set
5427
is single byte, the single byte character set that we
5428
build below suffices. */
5429
if (nrules > 0 || dfa->mb_cur_max > 1)
5430
{
5431
/* Check the space of the arrays. */
5432
if (BE (*range_alloc == mbcset->nranges, 0))
5433
{
5434
/* There is not enough space, need realloc. */
5435
uint32_t *new_array_start;
5436
uint32_t *new_array_end;
5437
int new_nranges;
5438
5439
/* +1 in case of mbcset->nranges is 0. */
5440
new_nranges = 2 * mbcset->nranges + 1;
5441
new_array_start = re_realloc (mbcset->range_starts, uint32_t,
5442
new_nranges);
5443
new_array_end = re_realloc (mbcset->range_ends, uint32_t,
5444
new_nranges);
5445
5446
if (BE (new_array_start == NULL || new_array_end == NULL, 0))
5447
return REG_ESPACE;
5448
5449
mbcset->range_starts = new_array_start;
5450
mbcset->range_ends = new_array_end;
5451
*range_alloc = new_nranges;
5452
}
5453
5454
mbcset->range_starts[mbcset->nranges] = start_collseq;
5455
mbcset->range_ends[mbcset->nranges++] = end_collseq;
5456
}
5457
5458
/* Build the table for single byte characters. */
5459
for (ch = 0; ch < SBC_MAX; ch++)
5460
{
5461
uint32_t ch_collseq;
5462
/*
5463
if (MB_CUR_MAX == 1)
5464
*/
5465
if (nrules == 0)
5466
ch_collseq = collseqmb[ch];
5467
else
5468
ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
5469
if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
5470
bitset_set (sbcset, ch);
5471
}
5472
return REG_NOERROR;
5473
}
5474
5475
/* Local function for parse_bracket_exp used in _LIBC environement.
5476
Build the collating element which is represented by NAME.
5477
The result are written to MBCSET and SBCSET.
5478
COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
5479
pointer argument sinse we may update it. */
5480
5481
auto inline reg_errcode_t
5482
__attribute ((always_inline))
5483
build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
5484
re_charset_t *mbcset;
5485
int *coll_sym_alloc;
5486
bitset_t sbcset;
5487
const unsigned char *name;
5488
{
5489
int32_t elem, idx;
5490
size_t name_len = strlen ((const char *) name);
5491
if (nrules != 0)
5492
{
5493
elem = seek_collating_symbol_entry (name, name_len);
5494
if (symb_table[2 * elem] != 0)
5495
{
5496
/* We found the entry. */
5497
idx = symb_table[2 * elem + 1];
5498
/* Skip the name of collating element name. */
5499
idx += 1 + extra[idx];
5500
}
5501
else if (symb_table[2 * elem] == 0 && name_len == 1)
5502
{
5503
/* No valid character, treat it as a normal
5504
character. */
5505
bitset_set (sbcset, name[0]);
5506
return REG_NOERROR;
5507
}
5508
else
5509
return REG_ECOLLATE;
5510
5511
/* Got valid collation sequence, add it as a new entry. */
5512
/* Check the space of the arrays. */
5513
if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
5514
{
5515
/* Not enough, realloc it. */
5516
/* +1 in case of mbcset->ncoll_syms is 0. */
5517
int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
5518
/* Use realloc since mbcset->coll_syms is NULL
5519
if *alloc == 0. */
5520
int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
5521
new_coll_sym_alloc);
5522
if (BE (new_coll_syms == NULL, 0))
5523
return REG_ESPACE;
5524
mbcset->coll_syms = new_coll_syms;
5525
*coll_sym_alloc = new_coll_sym_alloc;
5526
}
5527
mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
5528
return REG_NOERROR;
5529
}
5530
else
5531
{
5532
if (BE (name_len != 1, 0))
5533
return REG_ECOLLATE;
5534
else
5535
{
5536
bitset_set (sbcset, name[0]);
5537
return REG_NOERROR;
5538
}
5539
}
5540
}
5541
#endif
5542
5543
re_token_t br_token;
5544
re_bitset_ptr_t sbcset;
5545
#ifdef RE_ENABLE_I18N
5546
re_charset_t *mbcset;
5547
int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
5548
int equiv_class_alloc = 0, char_class_alloc = 0;
5549
#endif /* not RE_ENABLE_I18N */
5550
int non_match = 0;
5551
bin_tree_t *work_tree;
5552
int token_len;
5553
int first_round = 1;
5554
#ifdef _LIBC
5555
collseqmb = (const unsigned char *)
5556
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
5557
nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
5558
if (nrules)
5559
{
5560
/*
5561
if (MB_CUR_MAX > 1)
5562
*/
5563
collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
5564
table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
5565
symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
5566
_NL_COLLATE_SYMB_TABLEMB);
5567
extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
5568
_NL_COLLATE_SYMB_EXTRAMB);
5569
}
5570
#endif
5571
sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
5572
#ifdef RE_ENABLE_I18N
5573
mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
5574
#endif /* RE_ENABLE_I18N */
5575
#ifdef RE_ENABLE_I18N
5576
if (BE (sbcset == NULL || mbcset == NULL, 0))
5577
#else
5578
if (BE (sbcset == NULL, 0))
5579
#endif /* RE_ENABLE_I18N */
5580
{
5581
*err = REG_ESPACE;
5582
return NULL;
5583
}
5584
5585
token_len = peek_token_bracket (token, regexp, syntax);
5586
if (BE (token->type == END_OF_RE, 0))
5587
{
5588
*err = REG_BADPAT;
5589
goto parse_bracket_exp_free_return;
5590
}
5591
if (token->type == OP_NON_MATCH_LIST)
5592
{
5593
#ifdef RE_ENABLE_I18N
5594
mbcset->non_match = 1;
5595
#endif /* not RE_ENABLE_I18N */
5596
non_match = 1;
5597
if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
5598
bitset_set (sbcset, '\0');
5599
re_string_skip_bytes (regexp, token_len); /* Skip a token. */
5600
token_len = peek_token_bracket (token, regexp, syntax);
5601
if (BE (token->type == END_OF_RE, 0))
5602
{
5603
*err = REG_BADPAT;
5604
goto parse_bracket_exp_free_return;
5605
}
5606
}
5607
5608
/* We treat the first ']' as a normal character. */
5609
if (token->type == OP_CLOSE_BRACKET)
5610
token->type = CHARACTER;
5611
5612
while (1)
5613
{
5614
bracket_elem_t start_elem, end_elem;
5615
unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
5616
unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
5617
reg_errcode_t ret;
5618
int token_len2 = 0, is_range_exp = 0;
5619
re_token_t token2;
5620
5621
start_elem.opr.name = start_name_buf;
5622
ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
5623
syntax, first_round);
5624
if (BE (ret != REG_NOERROR, 0))
5625
{
5626
*err = ret;
5627
goto parse_bracket_exp_free_return;
5628
}
5629
first_round = 0;
5630
5631
/* Get information about the next token. We need it in any case. */
5632
token_len = peek_token_bracket (token, regexp, syntax);
5633
5634
/* Do not check for ranges if we know they are not allowed. */
5635
if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
5636
{
5637
if (BE (token->type == END_OF_RE, 0))
5638
{
5639
*err = REG_EBRACK;
5640
goto parse_bracket_exp_free_return;
5641
}
5642
if (token->type == OP_CHARSET_RANGE)
5643
{
5644
re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
5645
token_len2 = peek_token_bracket (&token2, regexp, syntax);
5646
if (BE (token2.type == END_OF_RE, 0))
5647
{
5648
*err = REG_EBRACK;
5649
goto parse_bracket_exp_free_return;
5650
}
5651
if (token2.type == OP_CLOSE_BRACKET)
5652
{
5653
/* We treat the last '-' as a normal character. */
5654
re_string_skip_bytes (regexp, -token_len);
5655
token->type = CHARACTER;
5656
}
5657
else
5658
is_range_exp = 1;
5659
}
5660
}
5661
5662
if (is_range_exp == 1)
5663
{
5664
end_elem.opr.name = end_name_buf;
5665
ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
5666
dfa, syntax, 1);
5667
if (BE (ret != REG_NOERROR, 0))
5668
{
5669
*err = ret;
5670
goto parse_bracket_exp_free_return;
5671
}
5672
5673
token_len = peek_token_bracket (token, regexp, syntax);
5674
5675
#ifdef _LIBC
5676
*err = build_range_exp (sbcset, mbcset, &range_alloc,
5677
&start_elem, &end_elem);
5678
#else
5679
# ifdef RE_ENABLE_I18N
5680
*err = build_range_exp (sbcset,
5681
dfa->mb_cur_max > 1 ? mbcset : NULL,
5682
&range_alloc, &start_elem, &end_elem);
5683
# else
5684
*err = build_range_exp (sbcset, &start_elem, &end_elem);
5685
# endif
5686
#endif /* RE_ENABLE_I18N */
5687
if (BE (*err != REG_NOERROR, 0))
5688
goto parse_bracket_exp_free_return;
5689
}
5690
else
5691
{
5692
switch (start_elem.type)
5693
{
5694
case SB_CHAR:
5695
bitset_set (sbcset, start_elem.opr.ch);
5696
break;
5697
#ifdef RE_ENABLE_I18N
5698
case MB_CHAR:
5699
/* Check whether the array has enough space. */
5700
if (BE (mbchar_alloc == mbcset->nmbchars, 0))
5701
{
5702
wchar_t *new_mbchars;
5703
/* Not enough, realloc it. */
5704
/* +1 in case of mbcset->nmbchars is 0. */
5705
mbchar_alloc = 2 * mbcset->nmbchars + 1;
5706
/* Use realloc since array is NULL if *alloc == 0. */
5707
new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
5708
mbchar_alloc);
5709
if (BE (new_mbchars == NULL, 0))
5710
goto parse_bracket_exp_espace;
5711
mbcset->mbchars = new_mbchars;
5712
}
5713
mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
5714
break;
5715
#endif /* RE_ENABLE_I18N */
5716
case EQUIV_CLASS:
5717
*err = build_equiv_class (sbcset,
5718
#ifdef RE_ENABLE_I18N
5719
mbcset, &equiv_class_alloc,
5720
#endif /* RE_ENABLE_I18N */
5721
start_elem.opr.name);
5722
if (BE (*err != REG_NOERROR, 0))
5723
goto parse_bracket_exp_free_return;
5724
break;
5725
case COLL_SYM:
5726
*err = build_collating_symbol (sbcset,
5727
#ifdef RE_ENABLE_I18N
5728
mbcset, &coll_sym_alloc,
5729
#endif /* RE_ENABLE_I18N */
5730
start_elem.opr.name);
5731
if (BE (*err != REG_NOERROR, 0))
5732
goto parse_bracket_exp_free_return;
5733
break;
5734
case CHAR_CLASS:
5735
*err = build_charclass (regexp->trans, sbcset,
5736
#ifdef RE_ENABLE_I18N
5737
mbcset, &char_class_alloc,
5738
#endif /* RE_ENABLE_I18N */
5739
start_elem.opr.name, syntax);
5740
if (BE (*err != REG_NOERROR, 0))
5741
goto parse_bracket_exp_free_return;
5742
break;
5743
default:
5744
assert (0);
5745
break;
5746
}
5747
}
5748
if (BE (token->type == END_OF_RE, 0))
5749
{
5750
*err = REG_EBRACK;
5751
goto parse_bracket_exp_free_return;
5752
}
5753
if (token->type == OP_CLOSE_BRACKET)
5754
break;
5755
}
5756
5757
re_string_skip_bytes (regexp, token_len); /* Skip a token. */
5758
5759
/* If it is non-matching list. */
5760
if (non_match)
5761
bitset_not (sbcset);
5762
5763
#ifdef RE_ENABLE_I18N
5764
/* Ensure only single byte characters are set. */
5765
if (dfa->mb_cur_max > 1)
5766
bitset_mask (sbcset, dfa->sb_char);
5767
5768
if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
5769
|| mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
5770
|| mbcset->non_match)))
5771
{
5772
bin_tree_t *mbc_tree;
5773
int sbc_idx;
5774
/* Build a tree for complex bracket. */
5775
dfa->has_mb_node = 1;
5776
br_token.type = COMPLEX_BRACKET;
5777
br_token.opr.mbcset = mbcset;
5778
mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
5779
if (BE (mbc_tree == NULL, 0))
5780
goto parse_bracket_exp_espace;
5781
for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
5782
if (sbcset[sbc_idx])
5783
break;
5784
/* If there are no bits set in sbcset, there is no point
5785
of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
5786
if (sbc_idx < BITSET_WORDS)
5787
{
5788
/* Build a tree for simple bracket. */
5789
br_token.type = SIMPLE_BRACKET;
5790
br_token.opr.sbcset = sbcset;
5791
work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
5792
if (BE (work_tree == NULL, 0))
5793
goto parse_bracket_exp_espace;
5794
5795
/* Then join them by ALT node. */
5796
work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
5797
if (BE (work_tree == NULL, 0))
5798
goto parse_bracket_exp_espace;
5799
}
5800
else
5801
{
5802
re_free (sbcset);
5803
work_tree = mbc_tree;
5804
}
5805
}
5806
else
5807
#endif /* not RE_ENABLE_I18N */
5808
{
5809
#ifdef RE_ENABLE_I18N
5810
free_charset (mbcset);
5811
#endif
5812
/* Build a tree for simple bracket. */
5813
br_token.type = SIMPLE_BRACKET;
5814
br_token.opr.sbcset = sbcset;
5815
work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
5816
if (BE (work_tree == NULL, 0))
5817
goto parse_bracket_exp_espace;
5818
}
5819
return work_tree;
5820
5821
parse_bracket_exp_espace:
5822
*err = REG_ESPACE;
5823
parse_bracket_exp_free_return:
5824
re_free (sbcset);
5825
#ifdef RE_ENABLE_I18N
5826
free_charset (mbcset);
5827
#endif /* RE_ENABLE_I18N */
5828
return NULL;
5829
}
5830
5831
/* Parse an element in the bracket expression. */
5832
5833
static reg_errcode_t
5834
parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
5835
re_token_t *token, int token_len, re_dfa_t *dfa,
5836
reg_syntax_t syntax, int accept_hyphen)
5837
{
5838
#ifdef RE_ENABLE_I18N
5839
int cur_char_size;
5840
cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
5841
if (cur_char_size > 1)
5842
{
5843
elem->type = MB_CHAR;
5844
elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
5845
re_string_skip_bytes (regexp, cur_char_size);
5846
return REG_NOERROR;
5847
}
5848
#endif /* RE_ENABLE_I18N */
5849
re_string_skip_bytes (regexp, token_len); /* Skip a token. */
5850
if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
5851
|| token->type == OP_OPEN_EQUIV_CLASS)
5852
return parse_bracket_symbol (elem, regexp, token);
5853
if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
5854
{
5855
/* A '-' must only appear as anything but a range indicator before
5856
the closing bracket. Everything else is an error. */
5857
re_token_t token2;
5858
(void) peek_token_bracket (&token2, regexp, syntax);
5859
if (token2.type != OP_CLOSE_BRACKET)
5860
/* The actual error value is not standardized since this whole
5861
case is undefined. But ERANGE makes good sense. */
5862
return REG_ERANGE;
5863
}
5864
elem->type = SB_CHAR;
5865
elem->opr.ch = token->opr.c;
5866
return REG_NOERROR;
5867
}
5868
5869
/* Parse a bracket symbol in the bracket expression. Bracket symbols are
5870
such as [:<character_class>:], [.<collating_element>.], and
5871
[=<equivalent_class>=]. */
5872
5873
static reg_errcode_t
5874
parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
5875
re_token_t *token)
5876
{
5877
unsigned char ch, delim = token->opr.c;
5878
int i = 0;
5879
if (re_string_eoi(regexp))
5880
return REG_EBRACK;
5881
for (;; ++i)
5882
{
5883
if (i >= BRACKET_NAME_BUF_SIZE)
5884
return REG_EBRACK;
5885
if (token->type == OP_OPEN_CHAR_CLASS)
5886
ch = re_string_fetch_byte_case (regexp);
5887
else
5888
ch = re_string_fetch_byte (regexp);
5889
if (re_string_eoi(regexp))
5890
return REG_EBRACK;
5891
if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
5892
break;
5893
elem->opr.name[i] = ch;
5894
}
5895
re_string_skip_bytes (regexp, 1);
5896
elem->opr.name[i] = '\0';
5897
switch (token->type)
5898
{
5899
case OP_OPEN_COLL_ELEM:
5900
elem->type = COLL_SYM;
5901
break;
5902
case OP_OPEN_EQUIV_CLASS:
5903
elem->type = EQUIV_CLASS;
5904
break;
5905
case OP_OPEN_CHAR_CLASS:
5906
elem->type = CHAR_CLASS;
5907
break;
5908
default:
5909
break;
5910
}
5911
return REG_NOERROR;
5912
}
5913
5914
/* Helper function for parse_bracket_exp.
5915
Build the equivalence class which is represented by NAME.
5916
The result are written to MBCSET and SBCSET.
5917
EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
5918
is a pointer argument sinse we may update it. */
5919
5920
static reg_errcode_t
5921
#ifdef RE_ENABLE_I18N
5922
build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
5923
int *equiv_class_alloc, const unsigned char *name)
5924
#else /* not RE_ENABLE_I18N */
5925
build_equiv_class (bitset_t sbcset, const unsigned char *name)
5926
#endif /* not RE_ENABLE_I18N */
5927
{
5928
#ifdef _LIBC
5929
uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
5930
if (nrules != 0)
5931
{
5932
const int32_t *table, *indirect;
5933
const unsigned char *weights, *extra, *cp;
5934
unsigned char char_buf[2];
5935
int32_t idx1, idx2;
5936
unsigned int ch;
5937
size_t len;
5938
/* This #include defines a local function! */
5939
# include <locale/weight.h>
5940
/* Calculate the index for equivalence class. */
5941
cp = name;
5942
table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
5943
weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
5944
_NL_COLLATE_WEIGHTMB);
5945
extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
5946
_NL_COLLATE_EXTRAMB);
5947
indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
5948
_NL_COLLATE_INDIRECTMB);
5949
idx1 = findidx (&cp);
5950
if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
5951
/* This isn't a valid character. */
5952
return REG_ECOLLATE;
5953
5954
/* Build single byte matcing table for this equivalence class. */
5955
char_buf[1] = (unsigned char) '\0';
5956
len = weights[idx1];
5957
for (ch = 0; ch < SBC_MAX; ++ch)
5958
{
5959
char_buf[0] = ch;
5960
cp = char_buf;
5961
idx2 = findidx (&cp);
5962
/*
5963
idx2 = table[ch];
5964
*/
5965
if (idx2 == 0)
5966
/* This isn't a valid character. */
5967
continue;
5968
if (len == weights[idx2])
5969
{
5970
int cnt = 0;
5971
while (cnt <= len &&
5972
weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
5973
++cnt;
5974
5975
if (cnt > len)
5976
bitset_set (sbcset, ch);
5977
}
5978
}
5979
/* Check whether the array has enough space. */
5980
if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
5981
{
5982
/* Not enough, realloc it. */
5983
/* +1 in case of mbcset->nequiv_classes is 0. */
5984
int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
5985
/* Use realloc since the array is NULL if *alloc == 0. */
5986
int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
5987
int32_t,
5988
new_equiv_class_alloc);
5989
if (BE (new_equiv_classes == NULL, 0))
5990
return REG_ESPACE;
5991
mbcset->equiv_classes = new_equiv_classes;
5992
*equiv_class_alloc = new_equiv_class_alloc;
5993
}
5994
mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
5995
}
5996
else
5997
#endif /* _LIBC */
5998
{
5999
if (BE (strlen ((const char *) name) != 1, 0))
6000
return REG_ECOLLATE;
6001
bitset_set (sbcset, *name);
6002
}
6003
return REG_NOERROR;
6004
}
6005
6006
/* Helper function for parse_bracket_exp.
6007
Build the character class which is represented by NAME.
6008
The result are written to MBCSET and SBCSET.
6009
CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
6010
is a pointer argument sinse we may update it. */
6011
6012
static reg_errcode_t
6013
#ifdef RE_ENABLE_I18N
6014
build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
6015
re_charset_t *mbcset, int *char_class_alloc,
6016
const unsigned char *class_name, reg_syntax_t syntax)
6017
#else /* not RE_ENABLE_I18N */
6018
build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
6019
const unsigned char *class_name, reg_syntax_t syntax)
6020
#endif /* not RE_ENABLE_I18N */
6021
{
6022
int i;
6023
const char *name = (const char *) class_name;
6024
6025
/* In case of REG_ICASE "upper" and "lower" match the both of
6026
upper and lower cases. */
6027
if ((syntax & RE_ICASE)
6028
&& (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
6029
name = "alpha";
6030
6031
#ifdef RE_ENABLE_I18N
6032
/* Check the space of the arrays. */
6033
if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
6034
{
6035
/* Not enough, realloc it. */
6036
/* +1 in case of mbcset->nchar_classes is 0. */
6037
int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
6038
/* Use realloc since array is NULL if *alloc == 0. */
6039
wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
6040
new_char_class_alloc);
6041
if (BE (new_char_classes == NULL, 0))
6042
return REG_ESPACE;
6043
mbcset->char_classes = new_char_classes;
6044
*char_class_alloc = new_char_class_alloc;
6045
}
6046
mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
6047
#endif /* RE_ENABLE_I18N */
6048
6049
#define BUILD_CHARCLASS_LOOP(ctype_func) \
6050
do { \
6051
if (BE (trans != NULL, 0)) \
6052
{ \
6053
for (i = 0; i < SBC_MAX; ++i) \
6054
if (ctype_func (i)) \
6055
bitset_set (sbcset, trans[i]); \
6056
} \
6057
else \
6058
{ \
6059
for (i = 0; i < SBC_MAX; ++i) \
6060
if (ctype_func (i)) \
6061
bitset_set (sbcset, i); \
6062
} \
6063
} while (0)
6064
6065
if (strcmp (name, "alnum") == 0)
6066
BUILD_CHARCLASS_LOOP (isalnum);
6067
else if (strcmp (name, "cntrl") == 0)
6068
BUILD_CHARCLASS_LOOP (iscntrl);
6069
else if (strcmp (name, "lower") == 0)
6070
BUILD_CHARCLASS_LOOP (islower);
6071
else if (strcmp (name, "space") == 0)
6072
BUILD_CHARCLASS_LOOP (isspace);
6073
else if (strcmp (name, "alpha") == 0)
6074
BUILD_CHARCLASS_LOOP (isalpha);
6075
else if (strcmp (name, "digit") == 0)
6076
BUILD_CHARCLASS_LOOP (isdigit);
6077
else if (strcmp (name, "print") == 0)
6078
BUILD_CHARCLASS_LOOP (isprint);
6079
else if (strcmp (name, "upper") == 0)
6080
BUILD_CHARCLASS_LOOP (isupper);
6081
else if (strcmp (name, "blank") == 0)
6082
BUILD_CHARCLASS_LOOP (isblank);
6083
else if (strcmp (name, "graph") == 0)
6084
BUILD_CHARCLASS_LOOP (isgraph);
6085
else if (strcmp (name, "punct") == 0)
6086
BUILD_CHARCLASS_LOOP (ispunct);
6087
else if (strcmp (name, "xdigit") == 0)
6088
BUILD_CHARCLASS_LOOP (isxdigit);
6089
else
6090
return REG_ECTYPE;
6091
6092
return REG_NOERROR;
6093
}
6094
6095
static bin_tree_t *
6096
build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
6097
const unsigned char *class_name,
6098
const unsigned char *extra, int non_match,
6099
reg_errcode_t *err)
6100
{
6101
re_bitset_ptr_t sbcset;
6102
#ifdef RE_ENABLE_I18N
6103
re_charset_t *mbcset;
6104
int alloc = 0;
6105
#endif /* not RE_ENABLE_I18N */
6106
reg_errcode_t ret;
6107
re_token_t br_token;
6108
bin_tree_t *tree;
6109
6110
sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
6111
#ifdef RE_ENABLE_I18N
6112
mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
6113
#endif /* RE_ENABLE_I18N */
6114
6115
#ifdef RE_ENABLE_I18N
6116
if (BE (sbcset == NULL || mbcset == NULL, 0))
6117
#else /* not RE_ENABLE_I18N */
6118
if (BE (sbcset == NULL, 0))
6119
#endif /* not RE_ENABLE_I18N */
6120
{
6121
*err = REG_ESPACE;
6122
return NULL;
6123
}
6124
6125
if (non_match)
6126
{
6127
#ifdef RE_ENABLE_I18N
6128
/*
6129
if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
6130
bitset_set(cset->sbcset, '\0');
6131
*/
6132
mbcset->non_match = 1;
6133
#endif /* not RE_ENABLE_I18N */
6134
}
6135
6136
/* We don't care the syntax in this case. */
6137
ret = build_charclass (trans, sbcset,
6138
#ifdef RE_ENABLE_I18N
6139
mbcset, &alloc,
6140
#endif /* RE_ENABLE_I18N */
6141
class_name, 0);
6142
6143
if (BE (ret != REG_NOERROR, 0))
6144
{
6145
re_free (sbcset);
6146
#ifdef RE_ENABLE_I18N
6147
free_charset (mbcset);
6148
#endif /* RE_ENABLE_I18N */
6149
*err = ret;
6150
return NULL;
6151
}
6152
/* \w match '_' also. */
6153
for (; *extra; extra++)
6154
bitset_set (sbcset, *extra);
6155
6156
/* If it is non-matching list. */
6157
if (non_match)
6158
bitset_not (sbcset);
6159
6160
#ifdef RE_ENABLE_I18N
6161
/* Ensure only single byte characters are set. */
6162
if (dfa->mb_cur_max > 1)
6163
bitset_mask (sbcset, dfa->sb_char);
6164
#endif
6165
6166
/* Build a tree for simple bracket. */
6167
br_token.type = SIMPLE_BRACKET;
6168
br_token.opr.sbcset = sbcset;
6169
tree = create_token_tree (dfa, NULL, NULL, &br_token);
6170
if (BE (tree == NULL, 0))
6171
goto build_word_op_espace;
6172
6173
#ifdef RE_ENABLE_I18N
6174
if (dfa->mb_cur_max > 1)
6175
{
6176
bin_tree_t *mbc_tree;
6177
/* Build a tree for complex bracket. */
6178
br_token.type = COMPLEX_BRACKET;
6179
br_token.opr.mbcset = mbcset;
6180
dfa->has_mb_node = 1;
6181
mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
6182
if (BE (mbc_tree == NULL, 0))
6183
goto build_word_op_espace;
6184
/* Then join them by ALT node. */
6185
tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
6186
if (BE (mbc_tree != NULL, 1))
6187
return tree;
6188
}
6189
else
6190
{
6191
free_charset (mbcset);
6192
return tree;
6193
}
6194
#else /* not RE_ENABLE_I18N */
6195
return tree;
6196
#endif /* not RE_ENABLE_I18N */
6197
6198
build_word_op_espace:
6199
re_free (sbcset);
6200
#ifdef RE_ENABLE_I18N
6201
free_charset (mbcset);
6202
#endif /* RE_ENABLE_I18N */
6203
*err = REG_ESPACE;
6204
return NULL;
6205
}
6206
6207
/* This is intended for the expressions like "a{1,3}".
6208
Fetch a number from `input', and return the number.
6209
Return -1, if the number field is empty like "{,1}".
6210
Return -2, If an error is occured. */
6211
6212
static int
6213
fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
6214
{
6215
int num = -1;
6216
unsigned char c;
6217
while (1)
6218
{
6219
fetch_token (token, input, syntax);
6220
c = token->opr.c;
6221
if (BE (token->type == END_OF_RE, 0))
6222
return -2;
6223
if (token->type == OP_CLOSE_DUP_NUM || c == ',')
6224
break;
6225
num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
6226
? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
6227
num = (num > RE_DUP_MAX) ? -2 : num;
6228
}
6229
return num;
6230
}
6231
6232
#ifdef RE_ENABLE_I18N
/* Release CSET together with every array it owns.  CSET must not be
   NULL; members that were never allocated are NULL and are ignored by
   the underlying free.  */

static void
free_charset (re_charset_t *cset)
{
  re_free (cset->mbchars);
# ifdef _LIBC
  re_free (cset->coll_syms);
  re_free (cset->equiv_classes);
# endif
  /* The range arrays are filled by build_range_exp in both the _LIBC
     and the non-_LIBC build, so they must be released in both.  */
  re_free (cset->range_starts);
  re_free (cset->range_ends);
  re_free (cset->char_classes);
  re_free (cset);
}
#endif /* RE_ENABLE_I18N */
6247
6248
/* Functions for binary tree operation. */
6249
6250
/* Create a tree node. */
6251
6252
static bin_tree_t *
6253
create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
6254
re_token_type_t type)
6255
{
6256
re_token_t t;
6257
t.type = type;
6258
return create_token_tree (dfa, left, right, &t);
6259
}
6260
6261
static bin_tree_t *
6262
create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
6263
const re_token_t *token)
6264
{
6265
bin_tree_t *tree;
6266
if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
6267
{
6268
bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
6269
6270
if (storage == NULL)
6271
return NULL;
6272
storage->next = dfa->str_tree_storage;
6273
dfa->str_tree_storage = storage;
6274
dfa->str_tree_storage_idx = 0;
6275
}
6276
tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
6277
6278
tree->parent = NULL;
6279
tree->left = left;
6280
tree->right = right;
6281
tree->token = *token;
6282
tree->token.duplicated = 0;
6283
tree->token.opt_subexp = 0;
6284
tree->first = NULL;
6285
tree->next = NULL;
6286
tree->node_idx = -1;
6287
6288
if (left != NULL)
6289
left->parent = tree;
6290
if (right != NULL)
6291
right->parent = tree;
6292
return tree;
6293
}
6294
6295
/* Mark the tree SRC as an optional subexpression.
6296
To be called from preorder or postorder. */
6297
6298
static reg_errcode_t
6299
mark_opt_subexp (void *extra, bin_tree_t *node)
6300
{
6301
int idx = (int) (long) extra;
6302
if (node->token.type == SUBEXP && node->token.opr.idx == idx)
6303
node->token.opt_subexp = 1;
6304
6305
return REG_NOERROR;
6306
}
6307
6308
/* Free the allocated memory inside NODE. */
6309
6310
static void
6311
free_token (re_token_t *node)
6312
{
6313
#ifdef RE_ENABLE_I18N
6314
if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
6315
free_charset (node->opr.mbcset);
6316
else
6317
#endif /* RE_ENABLE_I18N */
6318
if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
6319
re_free (node->opr.sbcset);
6320
}
6321
6322
/* Worker function for tree walking. Free the allocated memory inside NODE
6323
and its children. */
6324
6325
static reg_errcode_t
6326
free_tree (void *extra, bin_tree_t *node)
6327
{
6328
free_token (&node->token);
6329
return REG_NOERROR;
6330
}
6331
6332
6333
/* Duplicate the node SRC, and return new node. This is a preorder
6334
visit similar to the one implemented by the generic visitor, but
6335
we need more infrastructure to maintain two parallel trees --- so,
6336
it's easier to duplicate. */
6337
6338
static bin_tree_t *
6339
duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
6340
{
6341
const bin_tree_t *node;
6342
bin_tree_t *dup_root;
6343
bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
6344
6345
for (node = root; ; )
6346
{
6347
/* Create a new tree and link it back to the current parent. */
6348
*p_new = create_token_tree (dfa, NULL, NULL, &node->token);
6349
if (*p_new == NULL)
6350
return NULL;
6351
(*p_new)->parent = dup_node;
6352
(*p_new)->token.duplicated = 1;
6353
dup_node = *p_new;
6354
6355
/* Go to the left node, or up and to the right. */
6356
if (node->left)
6357
{
6358
node = node->left;
6359
p_new = &dup_node->left;
6360
}
6361
else
6362
{
6363
const bin_tree_t *prev = NULL;
6364
while (node->right == prev || node->right == NULL)
6365
{
6366
prev = node;
6367
node = node->parent;
6368
dup_node = dup_node->parent;
6369
if (!node)
6370
return dup_root;
6371
}
6372
node = node->right;
6373
p_new = &dup_node->right;
6374
}
6375
}
6376
}
6377
6378
/******************************************************************************/
6379
/******************************************************************************/
6380
/******************************************************************************/
6381
/* GKINCLUDE #include "regexec.c" */
6382
/******************************************************************************/
6383
/******************************************************************************/
6384
/******************************************************************************/
6385
static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
6386
int n) internal_function;
6387
static void match_ctx_clean (re_match_context_t *mctx) internal_function;
6388
static void match_ctx_free (re_match_context_t *cache) internal_function;
6389
static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
6390
int str_idx, int from, int to)
6391
internal_function;
6392
static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
6393
internal_function;
6394
static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
6395
int str_idx) internal_function;
6396
static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
6397
int node, int str_idx)
6398
internal_function;
6399
static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
6400
re_dfastate_t **limited_sts, int last_node,
6401
int last_str_idx)
6402
internal_function;
6403
static reg_errcode_t re_search_internal (const regex_t *preg,
6404
const char *string, int length,
6405
int start, int range, int stop,
6406
size_t nmatch, regmatch_t pmatch[],
6407
int eflags) internal_function;
6408
static int re_search_2_stub (struct re_pattern_buffer *bufp,
6409
const char *string1, int length1,
6410
const char *string2, int length2,
6411
int start, int range, struct re_registers *regs,
6412
int stop, int ret_len) internal_function;
6413
static int re_search_stub (struct re_pattern_buffer *bufp,
6414
const char *string, int length, int start,
6415
int range, int stop, struct re_registers *regs,
6416
int ret_len) internal_function;
6417
static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
6418
int nregs, int regs_allocated) internal_function;
6419
static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
6420
internal_function;
6421
static int check_matching (re_match_context_t *mctx, int fl_longest_match,
6422
int *p_match_first) internal_function;
6423
static int check_halt_state_context (const re_match_context_t *mctx,
6424
const re_dfastate_t *state, int idx)
6425
internal_function;
6426
static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
6427
regmatch_t *prev_idx_match, int cur_node,
6428
int cur_idx, int nmatch) internal_function;
6429
static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
6430
int str_idx, int dest_node, int nregs,
6431
regmatch_t *regs,
6432
re_node_set *eps_via_nodes)
6433
internal_function;
6434
static reg_errcode_t set_regs (const regex_t *preg,
6435
const re_match_context_t *mctx,
6436
size_t nmatch, regmatch_t *pmatch,
6437
int fl_backtrack) internal_function;
6438
static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
6439
internal_function;
6440
6441
#ifdef RE_ENABLE_I18N
6442
static int sift_states_iter_mb (const re_match_context_t *mctx,
6443
re_sift_context_t *sctx,
6444
int node_idx, int str_idx, int max_str_idx)
6445
internal_function;
6446
#endif /* RE_ENABLE_I18N */
6447
static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
6448
re_sift_context_t *sctx)
6449
internal_function;
6450
static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
6451
re_sift_context_t *sctx, int str_idx,
6452
re_node_set *cur_dest)
6453
internal_function;
6454
static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
6455
re_sift_context_t *sctx,
6456
int str_idx,
6457
re_node_set *dest_nodes)
6458
internal_function;
6459
static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
6460
re_node_set *dest_nodes,
6461
const re_node_set *candidates)
6462
internal_function;
6463
static int check_dst_limits (const re_match_context_t *mctx,
6464
re_node_set *limits,
6465
int dst_node, int dst_idx, int src_node,
6466
int src_idx) internal_function;
6467
static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
6468
int boundaries, int subexp_idx,
6469
int from_node, int bkref_idx)
6470
internal_function;
6471
static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
6472
int limit, int subexp_idx,
6473
int node, int str_idx,
6474
int bkref_idx) internal_function;
6475
static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
6476
re_node_set *dest_nodes,
6477
const re_node_set *candidates,
6478
re_node_set *limits,
6479
struct re_backref_cache_entry *bkref_ents,
6480
int str_idx) internal_function;
6481
static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
6482
re_sift_context_t *sctx,
6483
int str_idx, const re_node_set *candidates)
6484
internal_function;
6485
static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
6486
re_dfastate_t **dst,
6487
re_dfastate_t **src, int num)
6488
internal_function;
6489
static re_dfastate_t *find_recover_state (reg_errcode_t *err,
6490
re_match_context_t *mctx) internal_function;
6491
static re_dfastate_t *transit_state (reg_errcode_t *err,
6492
re_match_context_t *mctx,
6493
re_dfastate_t *state) internal_function;
6494
static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
6495
re_match_context_t *mctx,
6496
re_dfastate_t *next_state)
6497
internal_function;
6498
static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
6499
re_node_set *cur_nodes,
6500
int str_idx) internal_function;
6501
#if 0
6502
static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
6503
re_match_context_t *mctx,
6504
re_dfastate_t *pstate)
6505
internal_function;
6506
#endif
6507
#ifdef RE_ENABLE_I18N
6508
static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
6509
re_dfastate_t *pstate)
6510
internal_function;
6511
#endif /* RE_ENABLE_I18N */
6512
static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
6513
const re_node_set *nodes)
6514
internal_function;
6515
static reg_errcode_t get_subexp (re_match_context_t *mctx,
6516
int bkref_node, int bkref_str_idx)
6517
internal_function;
6518
static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
6519
const re_sub_match_top_t *sub_top,
6520
re_sub_match_last_t *sub_last,
6521
int bkref_node, int bkref_str)
6522
internal_function;
6523
static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
6524
int subexp_idx, int type) internal_function;
6525
static reg_errcode_t check_arrival (re_match_context_t *mctx,
6526
state_array_t *path, int top_node,
6527
int top_str, int last_node, int last_str,
6528
int type) internal_function;
6529
static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
6530
int str_idx,
6531
re_node_set *cur_nodes,
6532
re_node_set *next_nodes)
6533
internal_function;
6534
static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
6535
re_node_set *cur_nodes,
6536
int ex_subexp, int type)
6537
internal_function;
6538
static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
6539
re_node_set *dst_nodes,
6540
int target, int ex_subexp,
6541
int type) internal_function;
6542
static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
6543
re_node_set *cur_nodes, int cur_str,
6544
int subexp_num, int type)
6545
internal_function;
6546
static int build_trtable (const re_dfa_t *dfa,
6547
re_dfastate_t *state) internal_function;
6548
#ifdef RE_ENABLE_I18N
6549
static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
6550
const re_string_t *input, int idx)
6551
internal_function;
6552
# ifdef _LIBC
6553
static unsigned int find_collation_sequence_value (const unsigned char *mbs,
6554
size_t name_len)
6555
internal_function;
6556
# endif /* _LIBC */
6557
#endif /* RE_ENABLE_I18N */
6558
static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
6559
const re_dfastate_t *state,
6560
re_node_set *states_node,
6561
bitset_t *states_ch) internal_function;
6562
static int check_node_accept (const re_match_context_t *mctx,
6563
const re_token_t *node, int idx)
6564
internal_function;
6565
static reg_errcode_t extend_buffers (re_match_context_t *mctx)
6566
internal_function;
6567
6568
/* Entry point for POSIX code. */
6569
6570
/* regexec searches for a given pattern, specified by PREG, in the
6571
string STRING.
6572
6573
If NMATCH is zero or REG_NOSUB was set in the cflags argument to
6574
`regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
6575
least NMATCH elements, and we set them to the offsets of the
6576
corresponding matched substrings.
6577
6578
EFLAGS specifies `execution flags' which affect matching: if
6579
REG_NOTBOL is set, then ^ does not match at the beginning of the
6580
string; if REG_NOTEOL is set, then $ does not match at the end.
6581
6582
We return 0 if we find a match and REG_NOMATCH if not. */
6583
6584
int
6585
regexec (preg, string, nmatch, pmatch, eflags)
6586
const regex_t *__restrict preg;
6587
const char *__restrict string;
6588
size_t nmatch;
6589
regmatch_t pmatch[];
6590
int eflags;
6591
{
6592
reg_errcode_t err;
6593
int start, length;
6594
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
6595
6596
if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
6597
return REG_BADPAT;
6598
6599
if (eflags & REG_STARTEND)
6600
{
6601
start = pmatch[0].rm_so;
6602
length = pmatch[0].rm_eo;
6603
}
6604
else
6605
{
6606
start = 0;
6607
length = strlen (string);
6608
}
6609
6610
__libc_lock_lock (dfa->lock);
6611
if (preg->no_sub)
6612
err = re_search_internal (preg, string, length, start, length - start,
6613
length, 0, NULL, eflags);
6614
else
6615
err = re_search_internal (preg, string, length, start, length - start,
6616
length, nmatch, pmatch, eflags);
6617
__libc_lock_unlock (dfa->lock);
6618
return err != REG_NOERROR;
6619
}
6620
6621
#ifdef _LIBC
# include <shlib-compat.h>
versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);

# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
__typeof__ (__regexec) __compat_regexec;

/* Entry point exported under the GLIBC_2_0 symbol version.  It forwards
   to regexec after masking EFLAGS down to REG_NOTBOL | REG_NOTEOL, so
   any other bit is dropped here rather than seen by regexec.  */
int
attribute_compat_text_section
__compat_regexec (const regex_t *__restrict preg,
		  const char *__restrict string, size_t nmatch,
		  regmatch_t pmatch[], int eflags)
{
  const int kept_flags = eflags & (REG_NOTBOL | REG_NOTEOL);

  return regexec (preg, string, nmatch, pmatch, kept_flags);
}
compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
# endif
#endif
6640
6641
/* Entry points for GNU code. */
6642
6643
/* re_match, re_search, re_match_2, re_search_2
6644
6645
The former two functions operate on STRING with length LENGTH,
6646
while the latter two operate on the concatenation of STRING1 and STRING2
6647
with lengths LENGTH1 and LENGTH2, respectively.
6648
6649
re_match() matches the compiled pattern in BUFP against the string,
6650
starting at index START.
6651
6652
re_search() first tries matching at index START, then it tries to match
6653
starting from index START + 1, and so on. The last start position tried
6654
is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
6655
way as re_match().)
6656
6657
The parameter STOP of re_{match,search}_2 specifies that no match exceeding
6658
the first STOP characters of the concatenation of the strings should be
6659
concerned.
6660
6661
If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
6662
and all groups are stored in REGS. (For the "_2" variants, the offsets are
6663
computed relative to the concatenation, not relative to the individual
6664
strings.)
6665
6666
On success, re_match* functions return the length of the match, re_search*
6667
return the position of the start of the match. Return value -1 means no
6668
match was found and -2 indicates an internal error. */
6669
6670
/* Match the compiled pattern BUFP against STRING (of length LENGTH)
   at index START only: forwards to re_search_stub with a range of 0,
   LENGTH as the stop position, and the "return the match length"
   mode.  */
int
re_match (struct re_pattern_buffer *bufp, const char *string, int length,
	  int start, struct re_registers *regs)
{
  return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
}
#ifdef _LIBC
weak_alias (__re_match, re_match)
#endif
6682
6683
/* Search STRING (of length LENGTH) for the compiled pattern BUFP,
   trying start positions from START through START + RANGE: forwards to
   re_search_stub with LENGTH as the stop position and the "return the
   match position" mode.  */
int
re_search (struct re_pattern_buffer *bufp, const char *string, int length,
	   int start, int range, struct re_registers *regs)
{
  return re_search_stub (bufp, string, length, start, range, length, regs, 0);
}
#ifdef _LIBC
weak_alias (__re_search, re_search)
#endif
6695
6696
/* Like re_match, but operates on the concatenation of STRING1 (length
   LENGTH1) and STRING2 (length LENGTH2), with STOP passed through as
   the stop position: forwards to re_search_2_stub with a range of 0
   and the "return the match length" mode.  */
int
re_match_2 (struct re_pattern_buffer *bufp, const char *string1, int length1,
	    const char *string2, int length2, int start,
	    struct re_registers *regs, int stop)
{
  return re_search_2_stub (bufp, string1, length1, string2, length2,
			   start, 0, regs, stop, 1);
}
#ifdef _LIBC
weak_alias (__re_match_2, re_match_2)
#endif
6709
6710
/* Like re_search, but operates on the concatenation of STRING1 (length
   LENGTH1) and STRING2 (length LENGTH2), with STOP passed through as
   the stop position: forwards to re_search_2_stub with the "return the
   match position" mode.  */
int
re_search_2 (struct re_pattern_buffer *bufp, const char *string1, int length1,
	     const char *string2, int length2, int start, int range,
	     struct re_registers *regs, int stop)
{
  return re_search_2_stub (bufp, string1, length1, string2, length2,
			   start, range, regs, stop, 0);
}
#ifdef _LIBC
weak_alias (__re_search_2, re_search_2)
#endif
6723
6724
static int
6725
re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
6726
stop, ret_len)
6727
struct re_pattern_buffer *bufp;
6728
const char *string1, *string2;
6729
int length1, length2, start, range, stop, ret_len;
6730
struct re_registers *regs;
6731
{
6732
const char *str;
6733
int rval;
6734
int len = length1 + length2;
6735
int free_str = 0;
6736
6737
if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
6738
return -2;
6739
6740
/* Concatenate the strings. */
6741
if (length2 > 0)
6742
if (length1 > 0)
6743
{
6744
char *s = re_malloc (char, len);
6745
6746
if (BE (s == NULL, 0))
6747
return -2;
6748
#ifdef _LIBC
6749
memcpy (__mempcpy (s, string1, length1), string2, length2);
6750
#else
6751
memcpy (s, string1, length1);
6752
memcpy (s + length1, string2, length2);
6753
#endif
6754
str = s;
6755
free_str = 1;
6756
}
6757
else
6758
str = string2;
6759
else
6760
str = string1;
6761
6762
rval = re_search_stub (bufp, str, len, start, range, stop, regs,
6763
ret_len);
6764
if (free_str)
6765
re_free ((char *) str);
6766
return rval;
6767
}
6768
6769
/* The parameters have the same meaning as those of re_search.
6770
Additional parameters:
6771
If RET_LEN is nonzero the length of the match is returned (re_match style);
6772
otherwise the position of the match is returned. */
6773
6774
static int
6775
re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
6776
struct re_pattern_buffer *bufp;
6777
const char *string;
6778
int length, start, range, stop, ret_len;
6779
struct re_registers *regs;
6780
{
6781
reg_errcode_t result;
6782
regmatch_t *pmatch;
6783
int nregs, rval;
6784
int eflags = 0;
6785
re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
6786
6787
/* Check for out-of-range. */
6788
if (BE (start < 0 || start > length, 0))
6789
return -1;
6790
if (BE (start + range > length, 0))
6791
range = length - start;
6792
else if (BE (start + range < 0, 0))
6793
range = -start;
6794
6795
__libc_lock_lock (dfa->lock);
6796
6797
eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
6798
eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
6799
6800
/* Compile fastmap if we haven't yet. */
6801
if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
6802
re_compile_fastmap (bufp);
6803
6804
if (BE (bufp->no_sub, 0))
6805
regs = NULL;
6806
6807
/* We need at least 1 register. */
6808
if (regs == NULL)
6809
nregs = 1;
6810
else if (BE (bufp->regs_allocated == REGS_FIXED &&
6811
regs->num_regs < bufp->re_nsub + 1, 0))
6812
{
6813
nregs = regs->num_regs;
6814
if (BE (nregs < 1, 0))
6815
{
6816
/* Nothing can be copied to regs. */
6817
regs = NULL;
6818
nregs = 1;
6819
}
6820
}
6821
else
6822
nregs = bufp->re_nsub + 1;
6823
pmatch = re_malloc (regmatch_t, nregs);
6824
if (BE (pmatch == NULL, 0))
6825
{
6826
rval = -2;
6827
goto out;
6828
}
6829
6830
result = re_search_internal (bufp, string, length, start, range, stop,
6831
nregs, pmatch, eflags);
6832
6833
rval = 0;
6834
6835
/* I hope we needn't fill ther regs with -1's when no match was found. */
6836
if (result != REG_NOERROR)
6837
rval = -1;
6838
else if (regs != NULL)
6839
{
6840
/* If caller wants register contents data back, copy them. */
6841
bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
6842
bufp->regs_allocated);
6843
if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
6844
rval = -2;
6845
}
6846
6847
if (BE (rval == 0, 1))
6848
{
6849
if (ret_len)
6850
{
6851
assert (pmatch[0].rm_so == start);
6852
rval = pmatch[0].rm_eo - start;
6853
}
6854
else
6855
rval = pmatch[0].rm_so;
6856
}
6857
re_free (pmatch);
6858
out:
6859
__libc_lock_unlock (dfa->lock);
6860
return rval;
6861
}
6862
6863
static unsigned
6864
re_copy_regs (regs, pmatch, nregs, regs_allocated)
6865
struct re_registers *regs;
6866
regmatch_t *pmatch;
6867
int nregs, regs_allocated;
6868
{
6869
int rval = REGS_REALLOCATE;
6870
int i;
6871
int need_regs = nregs + 1;
6872
/* We need one extra element beyond `num_regs' for the `-1' marker GNU code
6873
uses. */
6874
6875
/* Have the register data arrays been allocated? */
6876
if (regs_allocated == REGS_UNALLOCATED)
6877
{ /* No. So allocate them with malloc. */
6878
regs->start = re_malloc (regoff_t, need_regs);
6879
regs->end = re_malloc (regoff_t, need_regs);
6880
if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
6881
return REGS_UNALLOCATED;
6882
regs->num_regs = need_regs;
6883
}
6884
else if (regs_allocated == REGS_REALLOCATE)
6885
{ /* Yes. If we need more elements than were already
6886
allocated, reallocate them. If we need fewer, just
6887
leave it alone. */
6888
if (BE (need_regs > regs->num_regs, 0))
6889
{
6890
regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
6891
regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
6892
if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
6893
return REGS_UNALLOCATED;
6894
regs->start = new_start;
6895
regs->end = new_end;
6896
regs->num_regs = need_regs;
6897
}
6898
}
6899
else
6900
{
6901
assert (regs_allocated == REGS_FIXED);
6902
/* This function may not be called with REGS_FIXED and nregs too big. */
6903
assert (regs->num_regs >= nregs);
6904
rval = REGS_FIXED;
6905
}
6906
6907
/* Copy the regs. */
6908
for (i = 0; i < nregs; ++i)
6909
{
6910
regs->start[i] = pmatch[i].rm_so;
6911
regs->end[i] = pmatch[i].rm_eo;
6912
}
6913
for ( ; i < regs->num_regs; ++i)
6914
regs->start[i] = regs->end[i] = -1;
6915
6916
return rval;
6917
}
6918
6919
/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
6920
ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
6921
this memory for recording register information. STARTS and ENDS
6922
must be allocated using the malloc library routine, and must each
6923
be at least NUM_REGS * sizeof (regoff_t) bytes long.
6924
6925
If NUM_REGS == 0, then subsequent matches should allocate their own
6926
register data.
6927
6928
Unless this function is called, the first search or match using
6929
PATTERN_BUFFER will allocate its own register data, without
6930
freeing the old data. */
6931
6932
void
6933
re_set_registers (bufp, regs, num_regs, starts, ends)
6934
struct re_pattern_buffer *bufp;
6935
struct re_registers *regs;
6936
unsigned num_regs;
6937
regoff_t *starts, *ends;
6938
{
6939
if (num_regs)
6940
{
6941
bufp->regs_allocated = REGS_REALLOCATE;
6942
regs->num_regs = num_regs;
6943
regs->start = starts;
6944
regs->end = ends;
6945
}
6946
else
6947
{
6948
bufp->regs_allocated = REGS_UNALLOCATED;
6949
regs->num_regs = 0;
6950
regs->start = regs->end = (regoff_t *) 0;
6951
}
6952
}
6953
#ifdef _LIBC
6954
weak_alias (__re_set_registers, re_set_registers)
6955
#endif
6956
6957
/* Entry points compatible with 4.2 BSD regex library.  We don't define
   them unless specifically requested.  */

#if defined _REGEX_RE_COMP || defined _LIBC
/* Match S against the pattern held in the file-level buffer
   `re_comp_buf'.  Returns 1 if regexec reports a match (returns 0) and
   0 otherwise; no sub-match offsets are requested.  */
int
# ifdef _LIBC
weak_function
# endif
re_exec (const char *s)
{
  return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
}
#endif /* _REGEX_RE_COMP */
6971
6972
/* Internal entry point. */
6973
6974
/* Searches for a compiled pattern PREG in the string STRING, whose
6975
length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
6976
meanings as for regexec. START and RANGE have the same meanings
as for re_search.
6978
Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
6979
otherwise return the error code.
6980
Note: We assume front end functions already check ranges.
6981
(START + RANGE >= 0 && START + RANGE <= LENGTH) */
6982
6983
static reg_errcode_t
6984
re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
6985
eflags)
6986
const regex_t *preg;
6987
const char *string;
6988
int length, start, range, stop, eflags;
6989
size_t nmatch;
6990
regmatch_t pmatch[];
6991
{
6992
reg_errcode_t err;
6993
const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
6994
int left_lim, right_lim, incr;
6995
int fl_longest_match, match_first, match_kind, match_last = -1;
6996
int extra_nmatch;
6997
int sb, ch;
6998
#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
6999
re_match_context_t mctx = { .dfa = dfa };
7000
#else
7001
re_match_context_t mctx;
7002
#endif
7003
char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
7004
&& range && !preg->can_be_null) ? preg->fastmap : NULL;
7005
RE_TRANSLATE_TYPE t = preg->translate;
7006
7007
#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
7008
memset (&mctx, '\0', sizeof (re_match_context_t));
7009
mctx.dfa = dfa;
7010
#endif
7011
7012
extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
7013
nmatch -= extra_nmatch;
7014
7015
/* Check if the DFA haven't been compiled. */
7016
if (BE (preg->used == 0 || dfa->init_state == NULL
7017
|| dfa->init_state_word == NULL || dfa->init_state_nl == NULL
7018
|| dfa->init_state_begbuf == NULL, 0))
7019
return REG_NOMATCH;
7020
7021
#ifdef DEBUG
7022
/* We assume front-end functions already check them. */
7023
assert (start + range >= 0 && start + range <= length);
7024
#endif
7025
7026
/* If initial states with non-begbuf contexts have no elements,
7027
the regex must be anchored. If preg->newline_anchor is set,
7028
we'll never use init_state_nl, so do not check it. */
7029
if (dfa->init_state->nodes.nelem == 0
7030
&& dfa->init_state_word->nodes.nelem == 0
7031
&& (dfa->init_state_nl->nodes.nelem == 0
7032
|| !preg->newline_anchor))
7033
{
7034
if (start != 0 && start + range != 0)
7035
return REG_NOMATCH;
7036
start = range = 0;
7037
}
7038
7039
/* We must check the longest matching, if nmatch > 0. */
7040
fl_longest_match = (nmatch != 0 || dfa->nbackref);
7041
7042
err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
7043
preg->translate, preg->syntax & RE_ICASE, dfa);
7044
if (BE (err != REG_NOERROR, 0))
7045
goto free_return;
7046
mctx.input.stop = stop;
7047
mctx.input.raw_stop = stop;
7048
mctx.input.newline_anchor = preg->newline_anchor;
7049
7050
err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
7051
if (BE (err != REG_NOERROR, 0))
7052
goto free_return;
7053
7054
/* We will log all the DFA states through which the dfa pass,
7055
if nmatch > 1, or this dfa has "multibyte node", which is a
7056
back-reference or a node which can accept multibyte character or
7057
multi character collating element. */
7058
if (nmatch > 1 || dfa->has_mb_node)
7059
{
7060
mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
7061
if (BE (mctx.state_log == NULL, 0))
7062
{
7063
err = REG_ESPACE;
7064
goto free_return;
7065
}
7066
}
7067
else
7068
mctx.state_log = NULL;
7069
7070
match_first = start;
7071
mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
7072
: CONTEXT_NEWLINE | CONTEXT_BEGBUF;
7073
7074
/* Check incrementally whether of not the input string match. */
7075
incr = (range < 0) ? -1 : 1;
7076
left_lim = (range < 0) ? start + range : start;
7077
right_lim = (range < 0) ? start : start + range;
7078
sb = dfa->mb_cur_max == 1;
7079
match_kind =
7080
(fastmap
7081
? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
7082
| (range >= 0 ? 2 : 0)
7083
| (t != NULL ? 1 : 0))
7084
: 8);
7085
7086
for (;; match_first += incr)
7087
{
7088
err = REG_NOMATCH;
7089
if (match_first < left_lim || right_lim < match_first)
7090
goto free_return;
7091
7092
/* Advance as rapidly as possible through the string, until we
7093
find a plausible place to start matching. This may be done
7094
with varying efficiency, so there are various possibilities:
7095
only the most common of them are specialized, in order to
7096
save on code size. We use a switch statement for speed. */
7097
switch (match_kind)
7098
{
7099
case 8:
7100
/* No fastmap. */
7101
break;
7102
7103
case 7:
7104
/* Fastmap with single-byte translation, match forward. */
7105
while (BE (match_first < right_lim, 1)
7106
&& !fastmap[t[(unsigned char) string[match_first]]])
7107
++match_first;
7108
goto forward_match_found_start_or_reached_end;
7109
7110
case 6:
7111
/* Fastmap without translation, match forward. */
7112
while (BE (match_first < right_lim, 1)
7113
&& !fastmap[(unsigned char) string[match_first]])
7114
++match_first;
7115
7116
forward_match_found_start_or_reached_end:
7117
if (BE (match_first == right_lim, 0))
7118
{
7119
ch = match_first >= length
7120
? 0 : (unsigned char) string[match_first];
7121
if (!fastmap[t ? t[ch] : ch])
7122
goto free_return;
7123
}
7124
break;
7125
7126
case 4:
7127
case 5:
7128
/* Fastmap without multi-byte translation, match backwards. */
7129
while (match_first >= left_lim)
7130
{
7131
ch = match_first >= length
7132
? 0 : (unsigned char) string[match_first];
7133
if (fastmap[t ? t[ch] : ch])
7134
break;
7135
--match_first;
7136
}
7137
if (match_first < left_lim)
7138
goto free_return;
7139
break;
7140
7141
default:
7142
/* In this case, we can't determine easily the current byte,
7143
since it might be a component byte of a multibyte
7144
character. Then we use the constructed buffer instead. */
7145
for (;;)
7146
{
7147
/* If MATCH_FIRST is out of the valid range, reconstruct the
7148
buffers. */
7149
unsigned int offset = match_first - mctx.input.raw_mbs_idx;
7150
if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
7151
{
7152
err = re_string_reconstruct (&mctx.input, match_first,
7153
eflags);
7154
if (BE (err != REG_NOERROR, 0))
7155
goto free_return;
7156
7157
offset = match_first - mctx.input.raw_mbs_idx;
7158
}
7159
/* If MATCH_FIRST is out of the buffer, leave it as '\0'.
7160
Note that MATCH_FIRST must not be smaller than 0. */
7161
ch = (match_first >= length
7162
? 0 : re_string_byte_at (&mctx.input, offset));
7163
if (fastmap[ch])
7164
break;
7165
match_first += incr;
7166
if (match_first < left_lim || match_first > right_lim)
7167
{
7168
err = REG_NOMATCH;
7169
goto free_return;
7170
}
7171
}
7172
break;
7173
}
7174
7175
/* Reconstruct the buffers so that the matcher can assume that
7176
the matching starts from the beginning of the buffer. */
7177
err = re_string_reconstruct (&mctx.input, match_first, eflags);
7178
if (BE (err != REG_NOERROR, 0))
7179
goto free_return;
7180
7181
#ifdef RE_ENABLE_I18N
7182
/* Don't consider this char as a possible match start if it part,
7183
yet isn't the head, of a multibyte character. */
7184
if (!sb && !re_string_first_byte (&mctx.input, 0))
7185
continue;
7186
#endif
7187
7188
/* It seems to be appropriate one, then use the matcher. */
7189
/* We assume that the matching starts from 0. */
7190
mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
7191
match_last = check_matching (&mctx, fl_longest_match,
7192
range >= 0 ? &match_first : NULL);
7193
if (match_last != -1)
7194
{
7195
if (BE (match_last == -2, 0))
7196
{
7197
err = REG_ESPACE;
7198
goto free_return;
7199
}
7200
else
7201
{
7202
mctx.match_last = match_last;
7203
if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
7204
{
7205
re_dfastate_t *pstate = mctx.state_log[match_last];
7206
mctx.last_node = check_halt_state_context (&mctx, pstate,
7207
match_last);
7208
}
7209
if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
7210
|| dfa->nbackref)
7211
{
7212
err = prune_impossible_nodes (&mctx);
7213
if (err == REG_NOERROR)
7214
break;
7215
if (BE (err != REG_NOMATCH, 0))
7216
goto free_return;
7217
match_last = -1;
7218
}
7219
else
7220
break; /* We found a match. */
7221
}
7222
}
7223
7224
match_ctx_clean (&mctx);
7225
}
7226
7227
#ifdef DEBUG
7228
assert (match_last != -1);
7229
assert (err == REG_NOERROR);
7230
#endif
7231
7232
/* Set pmatch[] if we need. */
7233
if (nmatch > 0)
7234
{
7235
int reg_idx;
7236
7237
/* Initialize registers. */
7238
for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
7239
pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
7240
7241
/* Set the points where matching start/end. */
7242
pmatch[0].rm_so = 0;
7243
pmatch[0].rm_eo = mctx.match_last;
7244
7245
if (!preg->no_sub && nmatch > 1)
7246
{
7247
err = set_regs (preg, &mctx, nmatch, pmatch,
7248
dfa->has_plural_match && dfa->nbackref > 0);
7249
if (BE (err != REG_NOERROR, 0))
7250
goto free_return;
7251
}
7252
7253
/* At last, add the offset to each register, since we slid
7254
the buffers so that we could assume that the matching starts
7255
from 0. */
7256
for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
7257
if (pmatch[reg_idx].rm_so != -1)
7258
{
7259
#ifdef RE_ENABLE_I18N
7260
if (BE (mctx.input.offsets_needed != 0, 0))
7261
{
7262
pmatch[reg_idx].rm_so =
7263
(pmatch[reg_idx].rm_so == mctx.input.valid_len
7264
? mctx.input.valid_raw_len
7265
: mctx.input.offsets[pmatch[reg_idx].rm_so]);
7266
pmatch[reg_idx].rm_eo =
7267
(pmatch[reg_idx].rm_eo == mctx.input.valid_len
7268
? mctx.input.valid_raw_len
7269
: mctx.input.offsets[pmatch[reg_idx].rm_eo]);
7270
}
7271
#else
7272
assert (mctx.input.offsets_needed == 0);
7273
#endif
7274
pmatch[reg_idx].rm_so += match_first;
7275
pmatch[reg_idx].rm_eo += match_first;
7276
}
7277
for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
7278
{
7279
pmatch[nmatch + reg_idx].rm_so = -1;
7280
pmatch[nmatch + reg_idx].rm_eo = -1;
7281
}
7282
7283
if (dfa->subexp_map)
7284
for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
7285
if (dfa->subexp_map[reg_idx] != reg_idx)
7286
{
7287
pmatch[reg_idx + 1].rm_so
7288
= pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
7289
pmatch[reg_idx + 1].rm_eo
7290
= pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
7291
}
7292
}
7293
7294
free_return:
7295
re_free (mctx.state_log);
7296
if (dfa->nbackref)
7297
match_ctx_free (&mctx);
7298
re_string_destruct (&mctx.input);
7299
return err;
7300
}
7301
7302
static reg_errcode_t
7303
prune_impossible_nodes (mctx)
7304
re_match_context_t *mctx;
7305
{
7306
const re_dfa_t *const dfa = mctx->dfa;
7307
int halt_node, match_last;
7308
reg_errcode_t ret;
7309
re_dfastate_t **sifted_states;
7310
re_dfastate_t **lim_states = NULL;
7311
re_sift_context_t sctx;
7312
#ifdef DEBUG
7313
assert (mctx->state_log != NULL);
7314
#endif
7315
match_last = mctx->match_last;
7316
halt_node = mctx->last_node;
7317
sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
7318
if (BE (sifted_states == NULL, 0))
7319
{
7320
ret = REG_ESPACE;
7321
goto free_return;
7322
}
7323
if (dfa->nbackref)
7324
{
7325
lim_states = re_malloc (re_dfastate_t *, match_last + 1);
7326
if (BE (lim_states == NULL, 0))
7327
{
7328
ret = REG_ESPACE;
7329
goto free_return;
7330
}
7331
while (1)
7332
{
7333
memset (lim_states, '\0',
7334
sizeof (re_dfastate_t *) * (match_last + 1));
7335
sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
7336
match_last);
7337
ret = sift_states_backward (mctx, &sctx);
7338
re_node_set_free (&sctx.limits);
7339
if (BE (ret != REG_NOERROR, 0))
7340
goto free_return;
7341
if (sifted_states[0] != NULL || lim_states[0] != NULL)
7342
break;
7343
do
7344
{
7345
--match_last;
7346
if (match_last < 0)
7347
{
7348
ret = REG_NOMATCH;
7349
goto free_return;
7350
}
7351
} while (mctx->state_log[match_last] == NULL
7352
|| !mctx->state_log[match_last]->halt);
7353
halt_node = check_halt_state_context (mctx,
7354
mctx->state_log[match_last],
7355
match_last);
7356
}
7357
ret = merge_state_array (dfa, sifted_states, lim_states,
7358
match_last + 1);
7359
re_free (lim_states);
7360
lim_states = NULL;
7361
if (BE (ret != REG_NOERROR, 0))
7362
goto free_return;
7363
}
7364
else
7365
{
7366
sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
7367
ret = sift_states_backward (mctx, &sctx);
7368
re_node_set_free (&sctx.limits);
7369
if (BE (ret != REG_NOERROR, 0))
7370
goto free_return;
7371
}
7372
re_free (mctx->state_log);
7373
mctx->state_log = sifted_states;
7374
sifted_states = NULL;
7375
mctx->last_node = halt_node;
7376
mctx->match_last = match_last;
7377
ret = REG_NOERROR;
7378
free_return:
7379
re_free (sifted_states);
7380
re_free (lim_states);
7381
return ret;
7382
}
7383
7384
/* Acquire an initial state and return it.
7385
We must select appropriate initial state depending on the context,
7386
since initial states may have constraints like "\<", "^", etc.. */
7387
7388
static inline re_dfastate_t *
7389
__attribute ((always_inline)) internal_function
7390
acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
7391
int idx)
7392
{
7393
const re_dfa_t *const dfa = mctx->dfa;
7394
if (dfa->init_state->has_constraint)
7395
{
7396
unsigned int context;
7397
context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
7398
if (IS_WORD_CONTEXT (context))
7399
return dfa->init_state_word;
7400
else if (IS_ORDINARY_CONTEXT (context))
7401
return dfa->init_state;
7402
else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
7403
return dfa->init_state_begbuf;
7404
else if (IS_NEWLINE_CONTEXT (context))
7405
return dfa->init_state_nl;
7406
else if (IS_BEGBUF_CONTEXT (context))
7407
{
7408
/* It is relatively rare case, then calculate on demand. */
7409
return re_acquire_state_context (err, dfa,
7410
dfa->init_state->entrance_nodes,
7411
context);
7412
}
7413
else
7414
/* Must not happen? */
7415
return dfa->init_state;
7416
}
7417
else
7418
return dfa->init_state;
7419
}
7420
7421
/* Check whether the regular expression match input string INPUT or not,
7422
and return the index where the matching end, return -1 if not match,
7423
or return -2 in case of an error.
7424
FL_LONGEST_MATCH means we want the POSIX longest matching.
7425
If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
7426
next place where we may want to try matching.
7427
Note that the matcher assume that the maching starts from the current
7428
index of the buffer. */
7429
7430
static int
7431
internal_function
7432
check_matching (re_match_context_t *mctx, int fl_longest_match,
7433
int *p_match_first)
7434
{
7435
const re_dfa_t *const dfa = mctx->dfa;
7436
reg_errcode_t err;
7437
int match = 0;
7438
int match_last = -1;
7439
int cur_str_idx = re_string_cur_idx (&mctx->input);
7440
re_dfastate_t *cur_state;
7441
int at_init_state = p_match_first != NULL;
7442
int next_start_idx = cur_str_idx;
7443
7444
err = REG_NOERROR;
7445
cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
7446
/* An initial state must not be NULL (invalid). */
7447
if (BE (cur_state == NULL, 0))
7448
{
7449
assert (err == REG_ESPACE);
7450
return -2;
7451
}
7452
7453
if (mctx->state_log != NULL)
7454
{
7455
mctx->state_log[cur_str_idx] = cur_state;
7456
7457
/* Check OP_OPEN_SUBEXP in the initial state in case that we use them
7458
later. E.g. Processing back references. */
7459
if (BE (dfa->nbackref, 0))
7460
{
7461
at_init_state = 0;
7462
err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
7463
if (BE (err != REG_NOERROR, 0))
7464
return err;
7465
7466
if (cur_state->has_backref)
7467
{
7468
err = transit_state_bkref (mctx, &cur_state->nodes);
7469
if (BE (err != REG_NOERROR, 0))
7470
return err;
7471
}
7472
}
7473
}
7474
7475
/* If the RE accepts NULL string. */
7476
if (BE (cur_state->halt, 0))
7477
{
7478
if (!cur_state->has_constraint
7479
|| check_halt_state_context (mctx, cur_state, cur_str_idx))
7480
{
7481
if (!fl_longest_match)
7482
return cur_str_idx;
7483
else
7484
{
7485
match_last = cur_str_idx;
7486
match = 1;
7487
}
7488
}
7489
}
7490
7491
while (!re_string_eoi (&mctx->input))
7492
{
7493
re_dfastate_t *old_state = cur_state;
7494
int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
7495
7496
if (BE (next_char_idx >= mctx->input.bufs_len, 0)
7497
|| (BE (next_char_idx >= mctx->input.valid_len, 0)
7498
&& mctx->input.valid_len < mctx->input.len))
7499
{
7500
err = extend_buffers (mctx);
7501
if (BE (err != REG_NOERROR, 0))
7502
{
7503
assert (err == REG_ESPACE);
7504
return -2;
7505
}
7506
}
7507
7508
cur_state = transit_state (&err, mctx, cur_state);
7509
if (mctx->state_log != NULL)
7510
cur_state = merge_state_with_log (&err, mctx, cur_state);
7511
7512
if (cur_state == NULL)
7513
{
7514
/* Reached the invalid state or an error. Try to recover a valid
7515
state using the state log, if available and if we have not
7516
already found a valid (even if not the longest) match. */
7517
if (BE (err != REG_NOERROR, 0))
7518
return -2;
7519
7520
if (mctx->state_log == NULL
7521
|| (match && !fl_longest_match)
7522
|| (cur_state = find_recover_state (&err, mctx)) == NULL)
7523
break;
7524
}
7525
7526
if (BE (at_init_state, 0))
7527
{
7528
if (old_state == cur_state)
7529
next_start_idx = next_char_idx;
7530
else
7531
at_init_state = 0;
7532
}
7533
7534
if (cur_state->halt)
7535
{
7536
/* Reached a halt state.
7537
Check the halt state can satisfy the current context. */
7538
if (!cur_state->has_constraint
7539
|| check_halt_state_context (mctx, cur_state,
7540
re_string_cur_idx (&mctx->input)))
7541
{
7542
/* We found an appropriate halt state. */
7543
match_last = re_string_cur_idx (&mctx->input);
7544
match = 1;
7545
7546
/* We found a match, do not modify match_first below. */
7547
p_match_first = NULL;
7548
if (!fl_longest_match)
7549
break;
7550
}
7551
}
7552
}
7553
7554
if (p_match_first)
7555
*p_match_first += next_start_idx;
7556
7557
return match_last;
7558
}
7559
7560
/* Check NODE match the current context. */
7561
7562
static int
7563
internal_function
7564
check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
7565
{
7566
re_token_type_t type = dfa->nodes[node].type;
7567
unsigned int constraint = dfa->nodes[node].constraint;
7568
if (type != END_OF_RE)
7569
return 0;
7570
if (!constraint)
7571
return 1;
7572
if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
7573
return 0;
7574
return 1;
7575
}
7576
7577
/* Check the halt state STATE match the current context.
7578
Return 0 if not match, if the node, STATE has, is a halt node and
7579
match the context, return the node. */
7580
7581
static int
7582
internal_function
7583
check_halt_state_context (const re_match_context_t *mctx,
7584
const re_dfastate_t *state, int idx)
7585
{
7586
int i;
7587
unsigned int context;
7588
#ifdef DEBUG
7589
assert (state->halt);
7590
#endif
7591
context = re_string_context_at (&mctx->input, idx, mctx->eflags);
7592
for (i = 0; i < state->nodes.nelem; ++i)
7593
if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
7594
return state->nodes.elems[i];
7595
return 0;
7596
}
7597
7598
/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
7599
corresponding to the DFA).
7600
Return the destination node, and update EPS_VIA_NODES, return -1 in case
7601
of errors. */
7602
7603
static int
7604
internal_function
7605
proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
7606
int *pidx, int node, re_node_set *eps_via_nodes,
7607
struct re_fail_stack_t *fs)
7608
{
7609
const re_dfa_t *const dfa = mctx->dfa;
7610
int i, err;
7611
if (IS_EPSILON_NODE (dfa->nodes[node].type))
7612
{
7613
re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
7614
re_node_set *edests = &dfa->edests[node];
7615
int dest_node;
7616
err = re_node_set_insert (eps_via_nodes, node);
7617
if (BE (err < 0, 0))
7618
return -2;
7619
/* Pick up a valid destination, or return -1 if none is found. */
7620
for (dest_node = -1, i = 0; i < edests->nelem; ++i)
7621
{
7622
int candidate = edests->elems[i];
7623
if (!re_node_set_contains (cur_nodes, candidate))
7624
continue;
7625
if (dest_node == -1)
7626
dest_node = candidate;
7627
7628
else
7629
{
7630
/* In order to avoid infinite loop like "(a*)*", return the second
7631
epsilon-transition if the first was already considered. */
7632
if (re_node_set_contains (eps_via_nodes, dest_node))
7633
return candidate;
7634
7635
/* Otherwise, push the second epsilon-transition on the fail stack. */
7636
else if (fs != NULL
7637
&& push_fail_stack (fs, *pidx, candidate, nregs, regs,
7638
eps_via_nodes))
7639
return -2;
7640
7641
/* We know we are going to exit. */
7642
break;
7643
}
7644
}
7645
return dest_node;
7646
}
7647
else
7648
{
7649
int naccepted = 0;
7650
re_token_type_t type = dfa->nodes[node].type;
7651
7652
#ifdef RE_ENABLE_I18N
7653
if (dfa->nodes[node].accept_mb)
7654
naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
7655
else
7656
#endif /* RE_ENABLE_I18N */
7657
if (type == OP_BACK_REF)
7658
{
7659
int subexp_idx = dfa->nodes[node].opr.idx + 1;
7660
naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
7661
if (fs != NULL)
7662
{
7663
if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
7664
return -1;
7665
else if (naccepted)
7666
{
7667
char *buf = (char *) re_string_get_buffer (&mctx->input);
7668
if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
7669
naccepted) != 0)
7670
return -1;
7671
}
7672
}
7673
7674
if (naccepted == 0)
7675
{
7676
int dest_node;
7677
err = re_node_set_insert (eps_via_nodes, node);
7678
if (BE (err < 0, 0))
7679
return -2;
7680
dest_node = dfa->edests[node].elems[0];
7681
if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
7682
dest_node))
7683
return dest_node;
7684
}
7685
}
7686
7687
if (naccepted != 0
7688
|| check_node_accept (mctx, dfa->nodes + node, *pidx))
7689
{
7690
int dest_node = dfa->nexts[node];
7691
*pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
7692
if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
7693
|| !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
7694
dest_node)))
7695
return -1;
7696
re_node_set_empty (eps_via_nodes);
7697
return dest_node;
7698
}
7699
}
7700
return -1;
7701
}
7702
7703
static reg_errcode_t
7704
internal_function
7705
push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
7706
int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
7707
{
7708
reg_errcode_t err;
7709
int num = fs->num++;
7710
if (fs->num == fs->alloc)
7711
{
7712
struct re_fail_stack_ent_t *new_array;
7713
new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
7714
* fs->alloc * 2));
7715
if (new_array == NULL)
7716
return REG_ESPACE;
7717
fs->alloc *= 2;
7718
fs->stack = new_array;
7719
}
7720
fs->stack[num].idx = str_idx;
7721
fs->stack[num].node = dest_node;
7722
fs->stack[num].regs = re_malloc (regmatch_t, nregs);
7723
if (fs->stack[num].regs == NULL)
7724
return REG_ESPACE;
7725
memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
7726
err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
7727
return err;
7728
}
7729
7730
static int
7731
internal_function
7732
pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
7733
regmatch_t *regs, re_node_set *eps_via_nodes)
7734
{
7735
int num = --fs->num;
7736
assert (num >= 0);
7737
*pidx = fs->stack[num].idx;
7738
memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
7739
re_node_set_free (eps_via_nodes);
7740
re_free (fs->stack[num].regs);
7741
*eps_via_nodes = fs->stack[num].eps_via_nodes;
7742
return fs->stack[num].node;
7743
}
7744
7745
/* Set the positions where the subexpressions are starts/ends to registers
7746
PMATCH.
7747
Note: We assume that pmatch[0] is already set, and
7748
pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
7749
7750
static reg_errcode_t
7751
internal_function
7752
set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
7753
regmatch_t *pmatch, int fl_backtrack)
7754
{
7755
const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
7756
int idx, cur_node;
7757
re_node_set eps_via_nodes;
7758
struct re_fail_stack_t *fs;
7759
struct re_fail_stack_t fs_body = { 0, 2, NULL };
7760
regmatch_t *prev_idx_match;
7761
int prev_idx_match_malloced = 0;
7762
7763
#ifdef DEBUG
7764
assert (nmatch > 1);
7765
assert (mctx->state_log != NULL);
7766
#endif
7767
if (fl_backtrack)
7768
{
7769
fs = &fs_body;
7770
fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
7771
if (fs->stack == NULL)
7772
return REG_ESPACE;
7773
}
7774
else
7775
fs = NULL;
7776
7777
cur_node = dfa->init_node;
7778
re_node_set_init_empty (&eps_via_nodes);
7779
7780
if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
7781
prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
7782
else
7783
{
7784
prev_idx_match = re_malloc (regmatch_t, nmatch);
7785
if (prev_idx_match == NULL)
7786
{
7787
free_fail_stack_return (fs);
7788
return REG_ESPACE;
7789
}
7790
prev_idx_match_malloced = 1;
7791
}
7792
memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
7793
7794
for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
7795
{
7796
update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
7797
7798
if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
7799
{
7800
int reg_idx;
7801
if (fs)
7802
{
7803
for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
7804
if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
7805
break;
7806
if (reg_idx == nmatch)
7807
{
7808
re_node_set_free (&eps_via_nodes);
7809
if (prev_idx_match_malloced)
7810
re_free (prev_idx_match);
7811
return free_fail_stack_return (fs);
7812
}
7813
cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
7814
&eps_via_nodes);
7815
}
7816
else
7817
{
7818
re_node_set_free (&eps_via_nodes);
7819
if (prev_idx_match_malloced)
7820
re_free (prev_idx_match);
7821
return REG_NOERROR;
7822
}
7823
}
7824
7825
/* Proceed to next node. */
7826
cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
7827
&eps_via_nodes, fs);
7828
7829
if (BE (cur_node < 0, 0))
7830
{
7831
if (BE (cur_node == -2, 0))
7832
{
7833
re_node_set_free (&eps_via_nodes);
7834
if (prev_idx_match_malloced)
7835
re_free (prev_idx_match);
7836
free_fail_stack_return (fs);
7837
return REG_ESPACE;
7838
}
7839
if (fs)
7840
cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
7841
&eps_via_nodes);
7842
else
7843
{
7844
re_node_set_free (&eps_via_nodes);
7845
if (prev_idx_match_malloced)
7846
re_free (prev_idx_match);
7847
return REG_NOMATCH;
7848
}
7849
}
7850
}
7851
re_node_set_free (&eps_via_nodes);
7852
if (prev_idx_match_malloced)
7853
re_free (prev_idx_match);
7854
return free_fail_stack_return (fs);
7855
}
7856
7857
static reg_errcode_t
7858
internal_function
7859
free_fail_stack_return (struct re_fail_stack_t *fs)
7860
{
7861
if (fs)
7862
{
7863
int fs_idx;
7864
for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
7865
{
7866
re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
7867
re_free (fs->stack[fs_idx].regs);
7868
}
7869
re_free (fs->stack);
7870
}
7871
return REG_NOERROR;
7872
}
7873
7874
static void
7875
internal_function
7876
update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
7877
regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
7878
{
7879
int type = dfa->nodes[cur_node].type;
7880
if (type == OP_OPEN_SUBEXP)
7881
{
7882
int reg_num = dfa->nodes[cur_node].opr.idx + 1;
7883
7884
/* We are at the first node of this sub expression. */
7885
if (reg_num < nmatch)
7886
{
7887
pmatch[reg_num].rm_so = cur_idx;
7888
pmatch[reg_num].rm_eo = -1;
7889
}
7890
}
7891
else if (type == OP_CLOSE_SUBEXP)
7892
{
7893
int reg_num = dfa->nodes[cur_node].opr.idx + 1;
7894
if (reg_num < nmatch)
7895
{
7896
/* We are at the last node of this sub expression. */
7897
if (pmatch[reg_num].rm_so < cur_idx)
7898
{
7899
pmatch[reg_num].rm_eo = cur_idx;
7900
/* This is a non-empty match or we are not inside an optional
7901
subexpression. Accept this right away. */
7902
memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
7903
}
7904
else
7905
{
7906
if (dfa->nodes[cur_node].opt_subexp
7907
&& prev_idx_match[reg_num].rm_so != -1)
7908
/* We transited through an empty match for an optional
7909
subexpression, like (a?)*, and this is not the subexp's
7910
first match. Copy back the old content of the registers
7911
so that matches of an inner subexpression are undone as
7912
well, like in ((a?))*. */
7913
memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
7914
else
7915
/* We completed a subexpression, but it may be part of
7916
an optional one, so do not update PREV_IDX_MATCH. */
7917
pmatch[reg_num].rm_eo = cur_idx;
7918
}
7919
}
7920
}
7921
}
7922
7923
/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
7924
and sift the nodes in each states according to the following rules.
7925
The updated state_log will be written to STATE_LOG.
7926
7927
Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
7928
1. When STR_IDX == MATCH_LAST(the last index in the state_log):
7929
If `a' isn't the LAST_NODE and `a' can't epsilon transit to
7930
the LAST_NODE, we throw away the node `a'.
7931
2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
7932
string `s' and transit to `b':
7933
i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
7934
away the node `a'.
7935
ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
7936
thrown away, we throw away the node `a'.
7937
3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
7938
i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
7939
node `a'.
7940
ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
7941
we throw away the node `a'. */
7942
7943
/* Nonzero iff STATE is not NULL and its node set contains NODE.  */
#define STATE_NODE_CONTAINS(state,node) \
  ((state) != NULL && re_node_set_contains (&(state)->nodes, (node)))
7945
7946
static reg_errcode_t
7947
internal_function
7948
sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
7949
{
7950
reg_errcode_t err;
7951
int null_cnt = 0;
7952
int str_idx = sctx->last_str_idx;
7953
re_node_set cur_dest;
7954
7955
#ifdef DEBUG
7956
assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
7957
#endif
7958
7959
/* Build sifted state_log[str_idx]. It has the nodes which can epsilon
7960
transit to the last_node and the last_node itself. */
7961
err = re_node_set_init_1 (&cur_dest, sctx->last_node);
7962
if (BE (err != REG_NOERROR, 0))
7963
return err;
7964
err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
7965
if (BE (err != REG_NOERROR, 0))
7966
goto free_return;
7967
7968
/* Then check each states in the state_log. */
7969
while (str_idx > 0)
7970
{
7971
/* Update counters. */
7972
null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
7973
if (null_cnt > mctx->max_mb_elem_len)
7974
{
7975
memset (sctx->sifted_states, '\0',
7976
sizeof (re_dfastate_t *) * str_idx);
7977
re_node_set_free (&cur_dest);
7978
return REG_NOERROR;
7979
}
7980
re_node_set_empty (&cur_dest);
7981
--str_idx;
7982
7983
if (mctx->state_log[str_idx])
7984
{
7985
err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
7986
if (BE (err != REG_NOERROR, 0))
7987
goto free_return;
7988
}
7989
7990
/* Add all the nodes which satisfy the following conditions:
7991
- It can epsilon transit to a node in CUR_DEST.
7992
- It is in CUR_SRC.
7993
And update state_log. */
7994
err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
7995
if (BE (err != REG_NOERROR, 0))
7996
goto free_return;
7997
}
7998
err = REG_NOERROR;
7999
free_return:
8000
re_node_set_free (&cur_dest);
8001
return err;
8002
}
8003
8004
static reg_errcode_t
8005
internal_function
8006
build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
8007
int str_idx, re_node_set *cur_dest)
8008
{
8009
const re_dfa_t *const dfa = mctx->dfa;
8010
const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
8011
int i;
8012
8013
/* Then build the next sifted state.
8014
We build the next sifted state on `cur_dest', and update
8015
`sifted_states[str_idx]' with `cur_dest'.
8016
Note:
8017
`cur_dest' is the sifted state from `state_log[str_idx + 1]'.
8018
`cur_src' points the node_set of the old `state_log[str_idx]'
8019
(with the epsilon nodes pre-filtered out). */
8020
for (i = 0; i < cur_src->nelem; i++)
8021
{
8022
int prev_node = cur_src->elems[i];
8023
int naccepted = 0;
8024
int ret;
8025
8026
#ifdef DEBUG
8027
re_token_type_t type = dfa->nodes[prev_node].type;
8028
assert (!IS_EPSILON_NODE (type));
8029
#endif
8030
#ifdef RE_ENABLE_I18N
8031
/* If the node may accept `multi byte'. */
8032
if (dfa->nodes[prev_node].accept_mb)
8033
naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
8034
str_idx, sctx->last_str_idx);
8035
#endif /* RE_ENABLE_I18N */
8036
8037
/* We don't check backreferences here.
8038
See update_cur_sifted_state(). */
8039
if (!naccepted
8040
&& check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
8041
&& STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
8042
dfa->nexts[prev_node]))
8043
naccepted = 1;
8044
8045
if (naccepted == 0)
8046
continue;
8047
8048
if (sctx->limits.nelem)
8049
{
8050
int to_idx = str_idx + naccepted;
8051
if (check_dst_limits (mctx, &sctx->limits,
8052
dfa->nexts[prev_node], to_idx,
8053
prev_node, str_idx))
8054
continue;
8055
}
8056
ret = re_node_set_insert (cur_dest, prev_node);
8057
if (BE (ret == -1, 0))
8058
return REG_ESPACE;
8059
}
8060
8061
return REG_NOERROR;
8062
}
8063
8064
/* Helper functions. */
8065
8066
static reg_errcode_t
8067
internal_function
8068
clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
8069
{
8070
int top = mctx->state_log_top;
8071
8072
if (next_state_log_idx >= mctx->input.bufs_len
8073
|| (next_state_log_idx >= mctx->input.valid_len
8074
&& mctx->input.valid_len < mctx->input.len))
8075
{
8076
reg_errcode_t err;
8077
err = extend_buffers (mctx);
8078
if (BE (err != REG_NOERROR, 0))
8079
return err;
8080
}
8081
8082
if (top < next_state_log_idx)
8083
{
8084
memset (mctx->state_log + top + 1, '\0',
8085
sizeof (re_dfastate_t *) * (next_state_log_idx - top));
8086
mctx->state_log_top = next_state_log_idx;
8087
}
8088
return REG_NOERROR;
8089
}
8090
8091
static reg_errcode_t
8092
internal_function
8093
merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
8094
re_dfastate_t **src, int num)
8095
{
8096
int st_idx;
8097
reg_errcode_t err;
8098
for (st_idx = 0; st_idx < num; ++st_idx)
8099
{
8100
if (dst[st_idx] == NULL)
8101
dst[st_idx] = src[st_idx];
8102
else if (src[st_idx] != NULL)
8103
{
8104
re_node_set merged_set;
8105
err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
8106
&src[st_idx]->nodes);
8107
if (BE (err != REG_NOERROR, 0))
8108
return err;
8109
dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
8110
re_node_set_free (&merged_set);
8111
if (BE (err != REG_NOERROR, 0))
8112
return err;
8113
}
8114
}
8115
return REG_NOERROR;
8116
}
8117
8118
static reg_errcode_t
8119
internal_function
8120
update_cur_sifted_state (const re_match_context_t *mctx,
8121
re_sift_context_t *sctx, int str_idx,
8122
re_node_set *dest_nodes)
8123
{
8124
const re_dfa_t *const dfa = mctx->dfa;
8125
reg_errcode_t err = REG_NOERROR;
8126
const re_node_set *candidates;
8127
candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
8128
: &mctx->state_log[str_idx]->nodes);
8129
8130
if (dest_nodes->nelem == 0)
8131
sctx->sifted_states[str_idx] = NULL;
8132
else
8133
{
8134
if (candidates)
8135
{
8136
/* At first, add the nodes which can epsilon transit to a node in
8137
DEST_NODE. */
8138
err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
8139
if (BE (err != REG_NOERROR, 0))
8140
return err;
8141
8142
/* Then, check the limitations in the current sift_context. */
8143
if (sctx->limits.nelem)
8144
{
8145
err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
8146
mctx->bkref_ents, str_idx);
8147
if (BE (err != REG_NOERROR, 0))
8148
return err;
8149
}
8150
}
8151
8152
sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
8153
if (BE (err != REG_NOERROR, 0))
8154
return err;
8155
}
8156
8157
if (candidates && mctx->state_log[str_idx]->has_backref)
8158
{
8159
err = sift_states_bkref (mctx, sctx, str_idx, candidates);
8160
if (BE (err != REG_NOERROR, 0))
8161
return err;
8162
}
8163
return REG_NOERROR;
8164
}
8165
8166
static reg_errcode_t
8167
internal_function
8168
add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
8169
const re_node_set *candidates)
8170
{
8171
reg_errcode_t err = REG_NOERROR;
8172
int i;
8173
8174
re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
8175
if (BE (err != REG_NOERROR, 0))
8176
return err;
8177
8178
if (!state->inveclosure.alloc)
8179
{
8180
err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
8181
if (BE (err != REG_NOERROR, 0))
8182
return REG_ESPACE;
8183
for (i = 0; i < dest_nodes->nelem; i++)
8184
re_node_set_merge (&state->inveclosure,
8185
dfa->inveclosures + dest_nodes->elems[i]);
8186
}
8187
return re_node_set_add_intersect (dest_nodes, candidates,
8188
&state->inveclosure);
8189
}
8190
8191
static reg_errcode_t
8192
internal_function
8193
sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
8194
const re_node_set *candidates)
8195
{
8196
int ecl_idx;
8197
reg_errcode_t err;
8198
re_node_set *inv_eclosure = dfa->inveclosures + node;
8199
re_node_set except_nodes;
8200
re_node_set_init_empty (&except_nodes);
8201
for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
8202
{
8203
int cur_node = inv_eclosure->elems[ecl_idx];
8204
if (cur_node == node)
8205
continue;
8206
if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
8207
{
8208
int edst1 = dfa->edests[cur_node].elems[0];
8209
int edst2 = ((dfa->edests[cur_node].nelem > 1)
8210
? dfa->edests[cur_node].elems[1] : -1);
8211
if ((!re_node_set_contains (inv_eclosure, edst1)
8212
&& re_node_set_contains (dest_nodes, edst1))
8213
|| (edst2 > 0
8214
&& !re_node_set_contains (inv_eclosure, edst2)
8215
&& re_node_set_contains (dest_nodes, edst2)))
8216
{
8217
err = re_node_set_add_intersect (&except_nodes, candidates,
8218
dfa->inveclosures + cur_node);
8219
if (BE (err != REG_NOERROR, 0))
8220
{
8221
re_node_set_free (&except_nodes);
8222
return err;
8223
}
8224
}
8225
}
8226
}
8227
for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
8228
{
8229
int cur_node = inv_eclosure->elems[ecl_idx];
8230
if (!re_node_set_contains (&except_nodes, cur_node))
8231
{
8232
int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
8233
re_node_set_remove_at (dest_nodes, idx);
8234
}
8235
}
8236
re_node_set_free (&except_nodes);
8237
return REG_NOERROR;
8238
}
8239
8240
static int
8241
internal_function
8242
check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
8243
int dst_node, int dst_idx, int src_node, int src_idx)
8244
{
8245
const re_dfa_t *const dfa = mctx->dfa;
8246
int lim_idx, src_pos, dst_pos;
8247
8248
int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
8249
int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
8250
for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
8251
{
8252
int subexp_idx;
8253
struct re_backref_cache_entry *ent;
8254
ent = mctx->bkref_ents + limits->elems[lim_idx];
8255
subexp_idx = dfa->nodes[ent->node].opr.idx;
8256
8257
dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
8258
subexp_idx, dst_node, dst_idx,
8259
dst_bkref_idx);
8260
src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
8261
subexp_idx, src_node, src_idx,
8262
src_bkref_idx);
8263
8264
/* In case of:
8265
<src> <dst> ( <subexp> )
8266
( <subexp> ) <src> <dst>
8267
( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
8268
if (src_pos == dst_pos)
8269
continue; /* This is unrelated limitation. */
8270
else
8271
return 1;
8272
}
8273
return 0;
8274
}
8275
8276
static int
8277
internal_function
8278
check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
8279
int subexp_idx, int from_node, int bkref_idx)
8280
{
8281
const re_dfa_t *const dfa = mctx->dfa;
8282
const re_node_set *eclosures = dfa->eclosures + from_node;
8283
int node_idx;
8284
8285
/* Else, we are on the boundary: examine the nodes on the epsilon
8286
closure. */
8287
for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
8288
{
8289
int node = eclosures->elems[node_idx];
8290
switch (dfa->nodes[node].type)
8291
{
8292
case OP_BACK_REF:
8293
if (bkref_idx != -1)
8294
{
8295
struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
8296
do
8297
{
8298
int dst, cpos;
8299
8300
if (ent->node != node)
8301
continue;
8302
8303
if (subexp_idx < BITSET_WORD_BITS
8304
&& !(ent->eps_reachable_subexps_map
8305
& ((bitset_word_t) 1 << subexp_idx)))
8306
continue;
8307
8308
/* Recurse trying to reach the OP_OPEN_SUBEXP and
8309
OP_CLOSE_SUBEXP cases below. But, if the
8310
destination node is the same node as the source
8311
node, don't recurse because it would cause an
8312
infinite loop: a regex that exhibits this behavior
8313
is ()\1*\1* */
8314
dst = dfa->edests[node].elems[0];
8315
if (dst == from_node)
8316
{
8317
if (boundaries & 1)
8318
return -1;
8319
else /* if (boundaries & 2) */
8320
return 0;
8321
}
8322
8323
cpos =
8324
check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
8325
dst, bkref_idx);
8326
if (cpos == -1 /* && (boundaries & 1) */)
8327
return -1;
8328
if (cpos == 0 && (boundaries & 2))
8329
return 0;
8330
8331
if (subexp_idx < BITSET_WORD_BITS)
8332
ent->eps_reachable_subexps_map
8333
&= ~((bitset_word_t) 1 << subexp_idx);
8334
}
8335
while (ent++->more);
8336
}
8337
break;
8338
8339
case OP_OPEN_SUBEXP:
8340
if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
8341
return -1;
8342
break;
8343
8344
case OP_CLOSE_SUBEXP:
8345
if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
8346
return 0;
8347
break;
8348
8349
default:
8350
break;
8351
}
8352
}
8353
8354
return (boundaries & 2) ? 1 : 0;
8355
}
8356
8357
static int
8358
internal_function
8359
check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
8360
int subexp_idx, int from_node, int str_idx,
8361
int bkref_idx)
8362
{
8363
struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
8364
int boundaries;
8365
8366
/* If we are outside the range of the subexpression, return -1 or 1. */
8367
if (str_idx < lim->subexp_from)
8368
return -1;
8369
8370
if (lim->subexp_to < str_idx)
8371
return 1;
8372
8373
/* If we are within the subexpression, return 0. */
8374
boundaries = (str_idx == lim->subexp_from);
8375
boundaries |= (str_idx == lim->subexp_to) << 1;
8376
if (boundaries == 0)
8377
return 0;
8378
8379
/* Else, examine epsilon closure. */
8380
return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
8381
from_node, bkref_idx);
8382
}
8383
8384
/* Check the limitations of sub expressions LIMITS, and remove the nodes
8385
which are against limitations from DEST_NODES. */
8386
8387
static reg_errcode_t
8388
internal_function
8389
check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
8390
const re_node_set *candidates, re_node_set *limits,
8391
struct re_backref_cache_entry *bkref_ents, int str_idx)
8392
{
8393
reg_errcode_t err;
8394
int node_idx, lim_idx;
8395
8396
for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
8397
{
8398
int subexp_idx;
8399
struct re_backref_cache_entry *ent;
8400
ent = bkref_ents + limits->elems[lim_idx];
8401
8402
if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
8403
continue; /* This is unrelated limitation. */
8404
8405
subexp_idx = dfa->nodes[ent->node].opr.idx;
8406
if (ent->subexp_to == str_idx)
8407
{
8408
int ops_node = -1;
8409
int cls_node = -1;
8410
for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
8411
{
8412
int node = dest_nodes->elems[node_idx];
8413
re_token_type_t type = dfa->nodes[node].type;
8414
if (type == OP_OPEN_SUBEXP
8415
&& subexp_idx == dfa->nodes[node].opr.idx)
8416
ops_node = node;
8417
else if (type == OP_CLOSE_SUBEXP
8418
&& subexp_idx == dfa->nodes[node].opr.idx)
8419
cls_node = node;
8420
}
8421
8422
/* Check the limitation of the open subexpression. */
8423
/* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
8424
if (ops_node >= 0)
8425
{
8426
err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
8427
candidates);
8428
if (BE (err != REG_NOERROR, 0))
8429
return err;
8430
}
8431
8432
/* Check the limitation of the close subexpression. */
8433
if (cls_node >= 0)
8434
for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
8435
{
8436
int node = dest_nodes->elems[node_idx];
8437
if (!re_node_set_contains (dfa->inveclosures + node,
8438
cls_node)
8439
&& !re_node_set_contains (dfa->eclosures + node,
8440
cls_node))
8441
{
8442
/* It is against this limitation.
8443
Remove it form the current sifted state. */
8444
err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
8445
candidates);
8446
if (BE (err != REG_NOERROR, 0))
8447
return err;
8448
--node_idx;
8449
}
8450
}
8451
}
8452
else /* (ent->subexp_to != str_idx) */
8453
{
8454
for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
8455
{
8456
int node = dest_nodes->elems[node_idx];
8457
re_token_type_t type = dfa->nodes[node].type;
8458
if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
8459
{
8460
if (subexp_idx != dfa->nodes[node].opr.idx)
8461
continue;
8462
/* It is against this limitation.
8463
Remove it form the current sifted state. */
8464
err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
8465
candidates);
8466
if (BE (err != REG_NOERROR, 0))
8467
return err;
8468
}
8469
}
8470
}
8471
}
8472
return REG_NOERROR;
8473
}
8474
8475
static reg_errcode_t
8476
internal_function
8477
sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
8478
int str_idx, const re_node_set *candidates)
8479
{
8480
const re_dfa_t *const dfa = mctx->dfa;
8481
reg_errcode_t err;
8482
int node_idx, node;
8483
re_sift_context_t local_sctx;
8484
int first_idx = search_cur_bkref_entry (mctx, str_idx);
8485
8486
if (first_idx == -1)
8487
return REG_NOERROR;
8488
8489
local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
8490
8491
for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
8492
{
8493
int enabled_idx;
8494
re_token_type_t type;
8495
struct re_backref_cache_entry *entry;
8496
node = candidates->elems[node_idx];
8497
type = dfa->nodes[node].type;
8498
/* Avoid infinite loop for the REs like "()\1+". */
8499
if (node == sctx->last_node && str_idx == sctx->last_str_idx)
8500
continue;
8501
if (type != OP_BACK_REF)
8502
continue;
8503
8504
entry = mctx->bkref_ents + first_idx;
8505
enabled_idx = first_idx;
8506
do
8507
{
8508
int subexp_len;
8509
int to_idx;
8510
int dst_node;
8511
int ret;
8512
re_dfastate_t *cur_state;
8513
8514
if (entry->node != node)
8515
continue;
8516
subexp_len = entry->subexp_to - entry->subexp_from;
8517
to_idx = str_idx + subexp_len;
8518
dst_node = (subexp_len ? dfa->nexts[node]
8519
: dfa->edests[node].elems[0]);
8520
8521
if (to_idx > sctx->last_str_idx
8522
|| sctx->sifted_states[to_idx] == NULL
8523
|| !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
8524
|| check_dst_limits (mctx, &sctx->limits, node,
8525
str_idx, dst_node, to_idx))
8526
continue;
8527
8528
if (local_sctx.sifted_states == NULL)
8529
{
8530
local_sctx = *sctx;
8531
err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
8532
if (BE (err != REG_NOERROR, 0))
8533
goto free_return;
8534
}
8535
local_sctx.last_node = node;
8536
local_sctx.last_str_idx = str_idx;
8537
ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
8538
if (BE (ret < 0, 0))
8539
{
8540
err = REG_ESPACE;
8541
goto free_return;
8542
}
8543
cur_state = local_sctx.sifted_states[str_idx];
8544
err = sift_states_backward (mctx, &local_sctx);
8545
if (BE (err != REG_NOERROR, 0))
8546
goto free_return;
8547
if (sctx->limited_states != NULL)
8548
{
8549
err = merge_state_array (dfa, sctx->limited_states,
8550
local_sctx.sifted_states,
8551
str_idx + 1);
8552
if (BE (err != REG_NOERROR, 0))
8553
goto free_return;
8554
}
8555
local_sctx.sifted_states[str_idx] = cur_state;
8556
re_node_set_remove (&local_sctx.limits, enabled_idx);
8557
8558
/* mctx->bkref_ents may have changed, reload the pointer. */
8559
entry = mctx->bkref_ents + enabled_idx;
8560
}
8561
while (enabled_idx++, entry++->more);
8562
}
8563
err = REG_NOERROR;
8564
free_return:
8565
if (local_sctx.sifted_states != NULL)
8566
{
8567
re_node_set_free (&local_sctx.limits);
8568
}
8569
8570
return err;
8571
}
8572
8573
8574
#ifdef RE_ENABLE_I18N
8575
static int
8576
internal_function
8577
sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
8578
int node_idx, int str_idx, int max_str_idx)
8579
{
8580
const re_dfa_t *const dfa = mctx->dfa;
8581
int naccepted;
8582
/* Check the node can accept `multi byte'. */
8583
naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
8584
if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
8585
!STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
8586
dfa->nexts[node_idx]))
8587
/* The node can't accept the `multi byte', or the
8588
destination was already thrown away, then the node
8589
could't accept the current input `multi byte'. */
8590
naccepted = 0;
8591
/* Otherwise, it is sure that the node could accept
8592
`naccepted' bytes input. */
8593
return naccepted;
8594
}
8595
#endif /* RE_ENABLE_I18N */
8596
8597
8598
/* Functions for state transition. */
8599
8600
/* Return the next state to which the current state STATE will transit by
8601
accepting the current input byte, and update STATE_LOG if necessary.
8602
If STATE can accept a multibyte char/collating element/back reference
8603
update the destination of STATE_LOG. */
8604
8605
static re_dfastate_t *
8606
internal_function
8607
transit_state (reg_errcode_t *err, re_match_context_t *mctx,
8608
re_dfastate_t *state)
8609
{
8610
re_dfastate_t **trtable;
8611
unsigned char ch;
8612
8613
#ifdef RE_ENABLE_I18N
8614
/* If the current state can accept multibyte. */
8615
if (BE (state->accept_mb, 0))
8616
{
8617
*err = transit_state_mb (mctx, state);
8618
if (BE (*err != REG_NOERROR, 0))
8619
return NULL;
8620
}
8621
#endif /* RE_ENABLE_I18N */
8622
8623
/* Then decide the next state with the single byte. */
8624
#if 0
8625
if (0)
8626
/* don't use transition table */
8627
return transit_state_sb (err, mctx, state);
8628
#endif
8629
8630
/* Use transition table */
8631
ch = re_string_fetch_byte (&mctx->input);
8632
for (;;)
8633
{
8634
trtable = state->trtable;
8635
if (BE (trtable != NULL, 1))
8636
return trtable[ch];
8637
8638
trtable = state->word_trtable;
8639
if (BE (trtable != NULL, 1))
8640
{
8641
unsigned int context;
8642
context
8643
= re_string_context_at (&mctx->input,
8644
re_string_cur_idx (&mctx->input) - 1,
8645
mctx->eflags);
8646
if (IS_WORD_CONTEXT (context))
8647
return trtable[ch + SBC_MAX];
8648
else
8649
return trtable[ch];
8650
}
8651
8652
if (!build_trtable (mctx->dfa, state))
8653
{
8654
*err = REG_ESPACE;
8655
return NULL;
8656
}
8657
8658
/* Retry, we now have a transition table. */
8659
}
8660
}
8661
8662
/* Update the state_log if we need */
8663
re_dfastate_t *
8664
internal_function
8665
merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
8666
re_dfastate_t *next_state)
8667
{
8668
const re_dfa_t *const dfa = mctx->dfa;
8669
int cur_idx = re_string_cur_idx (&mctx->input);
8670
8671
if (cur_idx > mctx->state_log_top)
8672
{
8673
mctx->state_log[cur_idx] = next_state;
8674
mctx->state_log_top = cur_idx;
8675
}
8676
else if (mctx->state_log[cur_idx] == 0)
8677
{
8678
mctx->state_log[cur_idx] = next_state;
8679
}
8680
else
8681
{
8682
re_dfastate_t *pstate;
8683
unsigned int context;
8684
re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
8685
/* If (state_log[cur_idx] != 0), it implies that cur_idx is
8686
the destination of a multibyte char/collating element/
8687
back reference. Then the next state is the union set of
8688
these destinations and the results of the transition table. */
8689
pstate = mctx->state_log[cur_idx];
8690
log_nodes = pstate->entrance_nodes;
8691
if (next_state != NULL)
8692
{
8693
table_nodes = next_state->entrance_nodes;
8694
*err = re_node_set_init_union (&next_nodes, table_nodes,
8695
log_nodes);
8696
if (BE (*err != REG_NOERROR, 0))
8697
return NULL;
8698
}
8699
else
8700
next_nodes = *log_nodes;
8701
/* Note: We already add the nodes of the initial state,
8702
then we don't need to add them here. */
8703
8704
context = re_string_context_at (&mctx->input,
8705
re_string_cur_idx (&mctx->input) - 1,
8706
mctx->eflags);
8707
next_state = mctx->state_log[cur_idx]
8708
= re_acquire_state_context (err, dfa, &next_nodes, context);
8709
/* We don't need to check errors here, since the return value of
8710
this function is next_state and ERR is already set. */
8711
8712
if (table_nodes != NULL)
8713
re_node_set_free (&next_nodes);
8714
}
8715
8716
if (BE (dfa->nbackref, 0) && next_state != NULL)
8717
{
8718
/* Check OP_OPEN_SUBEXP in the current state in case that we use them
8719
later. We must check them here, since the back references in the
8720
next state might use them. */
8721
*err = check_subexp_matching_top (mctx, &next_state->nodes,
8722
cur_idx);
8723
if (BE (*err != REG_NOERROR, 0))
8724
return NULL;
8725
8726
/* If the next state has back references. */
8727
if (next_state->has_backref)
8728
{
8729
*err = transit_state_bkref (mctx, &next_state->nodes);
8730
if (BE (*err != REG_NOERROR, 0))
8731
return NULL;
8732
next_state = mctx->state_log[cur_idx];
8733
}
8734
}
8735
8736
return next_state;
8737
}
8738
8739
/* Skip bytes in the input that correspond to part of a
8740
multi-byte match, then look in the log for a state
8741
from which to restart matching. */
8742
re_dfastate_t *
8743
internal_function
8744
find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
8745
{
8746
re_dfastate_t *cur_state;
8747
do
8748
{
8749
int max = mctx->state_log_top;
8750
int cur_str_idx = re_string_cur_idx (&mctx->input);
8751
8752
do
8753
{
8754
if (++cur_str_idx > max)
8755
return NULL;
8756
re_string_skip_bytes (&mctx->input, 1);
8757
}
8758
while (mctx->state_log[cur_str_idx] == NULL);
8759
8760
cur_state = merge_state_with_log (err, mctx, NULL);
8761
}
8762
while (*err == REG_NOERROR && cur_state == NULL);
8763
return cur_state;
8764
}
8765
8766
/* Helper functions for transit_state. */
8767
8768
/* From the node set CUR_NODES, pick up the nodes whose types are
8769
OP_OPEN_SUBEXP and which have corresponding back references in the regular
8770
expression. And register them to use them later for evaluating the
8771
correspoding back references. */
8772
8773
static reg_errcode_t
8774
internal_function
8775
check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
8776
int str_idx)
8777
{
8778
const re_dfa_t *const dfa = mctx->dfa;
8779
int node_idx;
8780
reg_errcode_t err;
8781
8782
/* TODO: This isn't efficient.
8783
Because there might be more than one nodes whose types are
8784
OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
8785
nodes.
8786
E.g. RE: (a){2} */
8787
for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
8788
{
8789
int node = cur_nodes->elems[node_idx];
8790
if (dfa->nodes[node].type == OP_OPEN_SUBEXP
8791
&& dfa->nodes[node].opr.idx < BITSET_WORD_BITS
8792
&& (dfa->used_bkref_map
8793
& ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
8794
{
8795
err = match_ctx_add_subtop (mctx, node, str_idx);
8796
if (BE (err != REG_NOERROR, 0))
8797
return err;
8798
}
8799
}
8800
return REG_NOERROR;
8801
}
8802
8803
#if 0
8804
/* Return the next state to which the current state STATE will transit by
8805
accepting the current input byte. */
8806
8807
static re_dfastate_t *
8808
transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
8809
re_dfastate_t *state)
8810
{
8811
const re_dfa_t *const dfa = mctx->dfa;
8812
re_node_set next_nodes;
8813
re_dfastate_t *next_state;
8814
int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
8815
unsigned int context;
8816
8817
*err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
8818
if (BE (*err != REG_NOERROR, 0))
8819
return NULL;
8820
for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
8821
{
8822
int cur_node = state->nodes.elems[node_cnt];
8823
if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
8824
{
8825
*err = re_node_set_merge (&next_nodes,
8826
dfa->eclosures + dfa->nexts[cur_node]);
8827
if (BE (*err != REG_NOERROR, 0))
8828
{
8829
re_node_set_free (&next_nodes);
8830
return NULL;
8831
}
8832
}
8833
}
8834
context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
8835
next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
8836
/* We don't need to check errors here, since the return value of
8837
this function is next_state and ERR is already set. */
8838
8839
re_node_set_free (&next_nodes);
8840
re_string_skip_bytes (&mctx->input, 1);
8841
return next_state;
8842
}
8843
#endif
8844
8845
#ifdef RE_ENABLE_I18N
8846
static reg_errcode_t
8847
internal_function
8848
transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
8849
{
8850
const re_dfa_t *const dfa = mctx->dfa;
8851
reg_errcode_t err;
8852
int i;
8853
8854
for (i = 0; i < pstate->nodes.nelem; ++i)
8855
{
8856
re_node_set dest_nodes, *new_nodes;
8857
int cur_node_idx = pstate->nodes.elems[i];
8858
int naccepted, dest_idx;
8859
unsigned int context;
8860
re_dfastate_t *dest_state;
8861
8862
if (!dfa->nodes[cur_node_idx].accept_mb)
8863
continue;
8864
8865
if (dfa->nodes[cur_node_idx].constraint)
8866
{
8867
context = re_string_context_at (&mctx->input,
8868
re_string_cur_idx (&mctx->input),
8869
mctx->eflags);
8870
if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
8871
context))
8872
continue;
8873
}
8874
8875
/* How many bytes the node can accept? */
8876
naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
8877
re_string_cur_idx (&mctx->input));
8878
if (naccepted == 0)
8879
continue;
8880
8881
/* The node can accepts `naccepted' bytes. */
8882
dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
8883
mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
8884
: mctx->max_mb_elem_len);
8885
err = clean_state_log_if_needed (mctx, dest_idx);
8886
if (BE (err != REG_NOERROR, 0))
8887
return err;
8888
#ifdef DEBUG
8889
assert (dfa->nexts[cur_node_idx] != -1);
8890
#endif
8891
new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
8892
8893
dest_state = mctx->state_log[dest_idx];
8894
if (dest_state == NULL)
8895
dest_nodes = *new_nodes;
8896
else
8897
{
8898
err = re_node_set_init_union (&dest_nodes,
8899
dest_state->entrance_nodes, new_nodes);
8900
if (BE (err != REG_NOERROR, 0))
8901
return err;
8902
}
8903
context = re_string_context_at (&mctx->input, dest_idx - 1,
8904
mctx->eflags);
8905
mctx->state_log[dest_idx]
8906
= re_acquire_state_context (&err, dfa, &dest_nodes, context);
8907
if (dest_state != NULL)
8908
re_node_set_free (&dest_nodes);
8909
if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
8910
return err;
8911
}
8912
return REG_NOERROR;
8913
}
8914
#endif /* RE_ENABLE_I18N */
8915
8916
static reg_errcode_t
8917
internal_function
8918
transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
8919
{
8920
const re_dfa_t *const dfa = mctx->dfa;
8921
reg_errcode_t err;
8922
int i;
8923
int cur_str_idx = re_string_cur_idx (&mctx->input);
8924
8925
for (i = 0; i < nodes->nelem; ++i)
8926
{
8927
int dest_str_idx, prev_nelem, bkc_idx;
8928
int node_idx = nodes->elems[i];
8929
unsigned int context;
8930
const re_token_t *node = dfa->nodes + node_idx;
8931
re_node_set *new_dest_nodes;
8932
8933
/* Check whether `node' is a backreference or not. */
8934
if (node->type != OP_BACK_REF)
8935
continue;
8936
8937
if (node->constraint)
8938
{
8939
context = re_string_context_at (&mctx->input, cur_str_idx,
8940
mctx->eflags);
8941
if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
8942
continue;
8943
}
8944
8945
/* `node' is a backreference.
8946
Check the substring which the substring matched. */
8947
bkc_idx = mctx->nbkref_ents;
8948
err = get_subexp (mctx, node_idx, cur_str_idx);
8949
if (BE (err != REG_NOERROR, 0))
8950
goto free_return;
8951
8952
/* And add the epsilon closures (which is `new_dest_nodes') of
8953
the backreference to appropriate state_log. */
8954
#ifdef DEBUG
8955
assert (dfa->nexts[node_idx] != -1);
8956
#endif
8957
for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
8958
{
8959
int subexp_len;
8960
re_dfastate_t *dest_state;
8961
struct re_backref_cache_entry *bkref_ent;
8962
bkref_ent = mctx->bkref_ents + bkc_idx;
8963
if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
8964
continue;
8965
subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
8966
new_dest_nodes = (subexp_len == 0
8967
? dfa->eclosures + dfa->edests[node_idx].elems[0]
8968
: dfa->eclosures + dfa->nexts[node_idx]);
8969
dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
8970
- bkref_ent->subexp_from);
8971
context = re_string_context_at (&mctx->input, dest_str_idx - 1,
8972
mctx->eflags);
8973
dest_state = mctx->state_log[dest_str_idx];
8974
prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
8975
: mctx->state_log[cur_str_idx]->nodes.nelem);
8976
/* Add `new_dest_node' to state_log. */
8977
if (dest_state == NULL)
8978
{
8979
mctx->state_log[dest_str_idx]
8980
= re_acquire_state_context (&err, dfa, new_dest_nodes,
8981
context);
8982
if (BE (mctx->state_log[dest_str_idx] == NULL
8983
&& err != REG_NOERROR, 0))
8984
goto free_return;
8985
}
8986
else
8987
{
8988
re_node_set dest_nodes;
8989
err = re_node_set_init_union (&dest_nodes,
8990
dest_state->entrance_nodes,
8991
new_dest_nodes);
8992
if (BE (err != REG_NOERROR, 0))
8993
{
8994
re_node_set_free (&dest_nodes);
8995
goto free_return;
8996
}
8997
mctx->state_log[dest_str_idx]
8998
= re_acquire_state_context (&err, dfa, &dest_nodes, context);
8999
re_node_set_free (&dest_nodes);
9000
if (BE (mctx->state_log[dest_str_idx] == NULL
9001
&& err != REG_NOERROR, 0))
9002
goto free_return;
9003
}
9004
/* We need to check recursively if the backreference can epsilon
9005
transit. */
9006
if (subexp_len == 0
9007
&& mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
9008
{
9009
err = check_subexp_matching_top (mctx, new_dest_nodes,
9010
cur_str_idx);
9011
if (BE (err != REG_NOERROR, 0))
9012
goto free_return;
9013
err = transit_state_bkref (mctx, new_dest_nodes);
9014
if (BE (err != REG_NOERROR, 0))
9015
goto free_return;
9016
}
9017
}
9018
}
9019
err = REG_NOERROR;
9020
free_return:
9021
return err;
9022
}
9023
9024
/* Enumerate all the candidates which the backreference BKREF_NODE can match
9025
at BKREF_STR_IDX, and register them by match_ctx_add_entry().
9026
Note that we might collect inappropriate candidates here.
9027
However, the cost of checking them strictly here is too high, then we
9028
delay these checking for prune_impossible_nodes(). */
9029
9030
static reg_errcode_t
9031
internal_function
9032
get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
9033
{
9034
const re_dfa_t *const dfa = mctx->dfa;
9035
int subexp_num, sub_top_idx;
9036
const char *buf = (const char *) re_string_get_buffer (&mctx->input);
9037
/* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
9038
int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
9039
if (cache_idx != -1)
9040
{
9041
const struct re_backref_cache_entry *entry
9042
= mctx->bkref_ents + cache_idx;
9043
do
9044
if (entry->node == bkref_node)
9045
return REG_NOERROR; /* We already checked it. */
9046
while (entry++->more);
9047
}
9048
9049
subexp_num = dfa->nodes[bkref_node].opr.idx;
9050
9051
/* For each sub expression */
9052
for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
9053
{
9054
reg_errcode_t err;
9055
re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
9056
re_sub_match_last_t *sub_last;
9057
int sub_last_idx, sl_str, bkref_str_off;
9058
9059
if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
9060
continue; /* It isn't related. */
9061
9062
sl_str = sub_top->str_idx;
9063
bkref_str_off = bkref_str_idx;
9064
/* At first, check the last node of sub expressions we already
9065
evaluated. */
9066
for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
9067
{
9068
int sl_str_diff;
9069
sub_last = sub_top->lasts[sub_last_idx];
9070
sl_str_diff = sub_last->str_idx - sl_str;
9071
/* The matched string by the sub expression match with the substring
9072
at the back reference? */
9073
if (sl_str_diff > 0)
9074
{
9075
if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
9076
{
9077
/* Not enough chars for a successful match. */
9078
if (bkref_str_off + sl_str_diff > mctx->input.len)
9079
break;
9080
9081
err = clean_state_log_if_needed (mctx,
9082
bkref_str_off
9083
+ sl_str_diff);
9084
if (BE (err != REG_NOERROR, 0))
9085
return err;
9086
buf = (const char *) re_string_get_buffer (&mctx->input);
9087
}
9088
if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
9089
/* We don't need to search this sub expression any more. */
9090
break;
9091
}
9092
bkref_str_off += sl_str_diff;
9093
sl_str += sl_str_diff;
9094
err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
9095
bkref_str_idx);
9096
9097
/* Reload buf, since the preceding call might have reallocated
9098
the buffer. */
9099
buf = (const char *) re_string_get_buffer (&mctx->input);
9100
9101
if (err == REG_NOMATCH)
9102
continue;
9103
if (BE (err != REG_NOERROR, 0))
9104
return err;
9105
}
9106
9107
if (sub_last_idx < sub_top->nlasts)
9108
continue;
9109
if (sub_last_idx > 0)
9110
++sl_str;
9111
/* Then, search for the other last nodes of the sub expression. */
9112
for (; sl_str <= bkref_str_idx; ++sl_str)
9113
{
9114
int cls_node, sl_str_off;
9115
const re_node_set *nodes;
9116
sl_str_off = sl_str - sub_top->str_idx;
9117
/* The matched string by the sub expression match with the substring
9118
at the back reference? */
9119
if (sl_str_off > 0)
9120
{
9121
if (BE (bkref_str_off >= mctx->input.valid_len, 0))
9122
{
9123
/* If we are at the end of the input, we cannot match. */
9124
if (bkref_str_off >= mctx->input.len)
9125
break;
9126
9127
err = extend_buffers (mctx);
9128
if (BE (err != REG_NOERROR, 0))
9129
return err;
9130
9131
buf = (const char *) re_string_get_buffer (&mctx->input);
9132
}
9133
if (buf [bkref_str_off++] != buf[sl_str - 1])
9134
break; /* We don't need to search this sub expression
9135
any more. */
9136
}
9137
if (mctx->state_log[sl_str] == NULL)
9138
continue;
9139
/* Does this state have a ')' of the sub expression? */
9140
nodes = &mctx->state_log[sl_str]->nodes;
9141
cls_node = find_subexp_node (dfa, nodes, subexp_num,
9142
OP_CLOSE_SUBEXP);
9143
if (cls_node == -1)
9144
continue; /* No. */
9145
if (sub_top->path == NULL)
9146
{
9147
sub_top->path = calloc (sizeof (state_array_t),
9148
sl_str - sub_top->str_idx + 1);
9149
if (sub_top->path == NULL)
9150
return REG_ESPACE;
9151
}
9152
/* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
9153
in the current context? */
9154
err = check_arrival (mctx, sub_top->path, sub_top->node,
9155
sub_top->str_idx, cls_node, sl_str,
9156
OP_CLOSE_SUBEXP);
9157
if (err == REG_NOMATCH)
9158
continue;
9159
if (BE (err != REG_NOERROR, 0))
9160
return err;
9161
sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
9162
if (BE (sub_last == NULL, 0))
9163
return REG_ESPACE;
9164
err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
9165
bkref_str_idx);
9166
if (err == REG_NOMATCH)
9167
continue;
9168
}
9169
}
9170
return REG_NOERROR;
9171
}
9172
9173
/* Helper functions for get_subexp(). */
9174
9175
/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
9176
If it can arrive, register the sub expression expressed with SUB_TOP
9177
and SUB_LAST. */
9178
9179
static reg_errcode_t
9180
internal_function
9181
get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
9182
re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
9183
{
9184
reg_errcode_t err;
9185
int to_idx;
9186
/* Can the subexpression arrive the back reference? */
9187
err = check_arrival (mctx, &sub_last->path, sub_last->node,
9188
sub_last->str_idx, bkref_node, bkref_str,
9189
OP_OPEN_SUBEXP);
9190
if (err != REG_NOERROR)
9191
return err;
9192
err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
9193
sub_last->str_idx);
9194
if (BE (err != REG_NOERROR, 0))
9195
return err;
9196
to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
9197
return clean_state_log_if_needed (mctx, to_idx);
9198
}
9199
9200
/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
9201
Search '(' if FL_OPEN, or search ')' otherwise.
9202
TODO: This function isn't efficient...
9203
Because there might be more than one nodes whose types are
9204
OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
9205
nodes.
9206
E.g. RE: (a){2} */
9207
9208
static int
9209
internal_function
9210
find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
9211
int subexp_idx, int type)
9212
{
9213
int cls_idx;
9214
for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
9215
{
9216
int cls_node = nodes->elems[cls_idx];
9217
const re_token_t *node = dfa->nodes + cls_node;
9218
if (node->type == type
9219
&& node->opr.idx == subexp_idx)
9220
return cls_node;
9221
}
9222
return -1;
9223
}
9224
9225
/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
9226
LAST_NODE at LAST_STR. We record the path onto PATH since it will be
9227
heavily reused.
9228
Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
9229
9230
static reg_errcode_t
9231
internal_function
9232
check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
9233
int top_str, int last_node, int last_str, int type)
9234
{
9235
const re_dfa_t *const dfa = mctx->dfa;
9236
reg_errcode_t err = REG_NOERROR;
9237
int subexp_num, backup_cur_idx, str_idx, null_cnt;
9238
re_dfastate_t *cur_state = NULL;
9239
re_node_set *cur_nodes, next_nodes;
9240
re_dfastate_t **backup_state_log;
9241
unsigned int context;
9242
9243
subexp_num = dfa->nodes[top_node].opr.idx;
9244
/* Extend the buffer if we need. */
9245
if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
9246
{
9247
re_dfastate_t **new_array;
9248
int old_alloc = path->alloc;
9249
path->alloc += last_str + mctx->max_mb_elem_len + 1;
9250
new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
9251
if (BE (new_array == NULL, 0))
9252
{
9253
path->alloc = old_alloc;
9254
return REG_ESPACE;
9255
}
9256
path->array = new_array;
9257
memset (new_array + old_alloc, '\0',
9258
sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
9259
}
9260
9261
str_idx = path->next_idx ? path->next_idx : top_str;
9262
9263
/* Temporary modify MCTX. */
9264
backup_state_log = mctx->state_log;
9265
backup_cur_idx = mctx->input.cur_idx;
9266
mctx->state_log = path->array;
9267
mctx->input.cur_idx = str_idx;
9268
9269
/* Setup initial node set. */
9270
context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
9271
if (str_idx == top_str)
9272
{
9273
err = re_node_set_init_1 (&next_nodes, top_node);
9274
if (BE (err != REG_NOERROR, 0))
9275
return err;
9276
err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
9277
if (BE (err != REG_NOERROR, 0))
9278
{
9279
re_node_set_free (&next_nodes);
9280
return err;
9281
}
9282
}
9283
else
9284
{
9285
cur_state = mctx->state_log[str_idx];
9286
if (cur_state && cur_state->has_backref)
9287
{
9288
err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
9289
if (BE (err != REG_NOERROR, 0))
9290
return err;
9291
}
9292
else
9293
re_node_set_init_empty (&next_nodes);
9294
}
9295
if (str_idx == top_str || (cur_state && cur_state->has_backref))
9296
{
9297
if (next_nodes.nelem)
9298
{
9299
err = expand_bkref_cache (mctx, &next_nodes, str_idx,
9300
subexp_num, type);
9301
if (BE (err != REG_NOERROR, 0))
9302
{
9303
re_node_set_free (&next_nodes);
9304
return err;
9305
}
9306
}
9307
cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
9308
if (BE (cur_state == NULL && err != REG_NOERROR, 0))
9309
{
9310
re_node_set_free (&next_nodes);
9311
return err;
9312
}
9313
mctx->state_log[str_idx] = cur_state;
9314
}
9315
9316
for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
9317
{
9318
re_node_set_empty (&next_nodes);
9319
if (mctx->state_log[str_idx + 1])
9320
{
9321
err = re_node_set_merge (&next_nodes,
9322
&mctx->state_log[str_idx + 1]->nodes);
9323
if (BE (err != REG_NOERROR, 0))
9324
{
9325
re_node_set_free (&next_nodes);
9326
return err;
9327
}
9328
}
9329
if (cur_state)
9330
{
9331
err = check_arrival_add_next_nodes (mctx, str_idx,
9332
&cur_state->non_eps_nodes,
9333
&next_nodes);
9334
if (BE (err != REG_NOERROR, 0))
9335
{
9336
re_node_set_free (&next_nodes);
9337
return err;
9338
}
9339
}
9340
++str_idx;
9341
if (next_nodes.nelem)
9342
{
9343
err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
9344
if (BE (err != REG_NOERROR, 0))
9345
{
9346
re_node_set_free (&next_nodes);
9347
return err;
9348
}
9349
err = expand_bkref_cache (mctx, &next_nodes, str_idx,
9350
subexp_num, type);
9351
if (BE (err != REG_NOERROR, 0))
9352
{
9353
re_node_set_free (&next_nodes);
9354
return err;
9355
}
9356
}
9357
context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
9358
cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
9359
if (BE (cur_state == NULL && err != REG_NOERROR, 0))
9360
{
9361
re_node_set_free (&next_nodes);
9362
return err;
9363
}
9364
mctx->state_log[str_idx] = cur_state;
9365
null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
9366
}
9367
re_node_set_free (&next_nodes);
9368
cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
9369
: &mctx->state_log[last_str]->nodes);
9370
path->next_idx = str_idx;
9371
9372
/* Fix MCTX. */
9373
mctx->state_log = backup_state_log;
9374
mctx->input.cur_idx = backup_cur_idx;
9375
9376
/* Then check the current node set has the node LAST_NODE. */
9377
if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
9378
return REG_NOERROR;
9379
9380
return REG_NOMATCH;
9381
}
9382
9383
/* Helper functions for check_arrival. */
9384
9385
/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
9386
to NEXT_NODES.
9387
TODO: This function is similar to the functions transit_state*(),
9388
however this function has many additional works.
9389
Can't we unify them? */
9390
9391
static reg_errcode_t
9392
internal_function
9393
check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
9394
re_node_set *cur_nodes, re_node_set *next_nodes)
9395
{
9396
const re_dfa_t *const dfa = mctx->dfa;
9397
int result;
9398
int cur_idx;
9399
reg_errcode_t err = REG_NOERROR;
9400
re_node_set union_set;
9401
re_node_set_init_empty (&union_set);
9402
for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
9403
{
9404
int naccepted = 0;
9405
int cur_node = cur_nodes->elems[cur_idx];
9406
#ifdef DEBUG
9407
re_token_type_t type = dfa->nodes[cur_node].type;
9408
assert (!IS_EPSILON_NODE (type));
9409
#endif
9410
#ifdef RE_ENABLE_I18N
9411
/* If the node may accept `multi byte'. */
9412
if (dfa->nodes[cur_node].accept_mb)
9413
{
9414
naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
9415
str_idx);
9416
if (naccepted > 1)
9417
{
9418
re_dfastate_t *dest_state;
9419
int next_node = dfa->nexts[cur_node];
9420
int next_idx = str_idx + naccepted;
9421
dest_state = mctx->state_log[next_idx];
9422
re_node_set_empty (&union_set);
9423
if (dest_state)
9424
{
9425
err = re_node_set_merge (&union_set, &dest_state->nodes);
9426
if (BE (err != REG_NOERROR, 0))
9427
{
9428
re_node_set_free (&union_set);
9429
return err;
9430
}
9431
}
9432
result = re_node_set_insert (&union_set, next_node);
9433
if (BE (result < 0, 0))
9434
{
9435
re_node_set_free (&union_set);
9436
return REG_ESPACE;
9437
}
9438
mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
9439
&union_set);
9440
if (BE (mctx->state_log[next_idx] == NULL
9441
&& err != REG_NOERROR, 0))
9442
{
9443
re_node_set_free (&union_set);
9444
return err;
9445
}
9446
}
9447
}
9448
#endif /* RE_ENABLE_I18N */
9449
if (naccepted
9450
|| check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
9451
{
9452
result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
9453
if (BE (result < 0, 0))
9454
{
9455
re_node_set_free (&union_set);
9456
return REG_ESPACE;
9457
}
9458
}
9459
}
9460
re_node_set_free (&union_set);
9461
return REG_NOERROR;
9462
}
9463
9464
/* For all the nodes in CUR_NODES, add the epsilon closures of them to
9465
CUR_NODES, however exclude the nodes which are:
9466
- inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
9467
- out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
9468
*/
9469
9470
static reg_errcode_t
9471
internal_function
9472
check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
9473
int ex_subexp, int type)
9474
{
9475
reg_errcode_t err;
9476
int idx, outside_node;
9477
re_node_set new_nodes;
9478
#ifdef DEBUG
9479
assert (cur_nodes->nelem);
9480
#endif
9481
err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
9482
if (BE (err != REG_NOERROR, 0))
9483
return err;
9484
/* Create a new node set NEW_NODES with the nodes which are epsilon
9485
closures of the node in CUR_NODES. */
9486
9487
for (idx = 0; idx < cur_nodes->nelem; ++idx)
9488
{
9489
int cur_node = cur_nodes->elems[idx];
9490
const re_node_set *eclosure = dfa->eclosures + cur_node;
9491
outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
9492
if (outside_node == -1)
9493
{
9494
/* There are no problematic nodes, just merge them. */
9495
err = re_node_set_merge (&new_nodes, eclosure);
9496
if (BE (err != REG_NOERROR, 0))
9497
{
9498
re_node_set_free (&new_nodes);
9499
return err;
9500
}
9501
}
9502
else
9503
{
9504
/* There are problematic nodes, re-calculate incrementally. */
9505
err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
9506
ex_subexp, type);
9507
if (BE (err != REG_NOERROR, 0))
9508
{
9509
re_node_set_free (&new_nodes);
9510
return err;
9511
}
9512
}
9513
}
9514
re_node_set_free (cur_nodes);
9515
*cur_nodes = new_nodes;
9516
return REG_NOERROR;
9517
}
9518
9519
/* Helper function for check_arrival_expand_ecl.
9520
Check incrementally the epsilon closure of TARGET, and if it isn't
9521
problematic append it to DST_NODES. */
9522
9523
static reg_errcode_t
9524
internal_function
9525
check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
9526
int target, int ex_subexp, int type)
9527
{
9528
int cur_node;
9529
for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
9530
{
9531
int err;
9532
9533
if (dfa->nodes[cur_node].type == type
9534
&& dfa->nodes[cur_node].opr.idx == ex_subexp)
9535
{
9536
if (type == OP_CLOSE_SUBEXP)
9537
{
9538
err = re_node_set_insert (dst_nodes, cur_node);
9539
if (BE (err == -1, 0))
9540
return REG_ESPACE;
9541
}
9542
break;
9543
}
9544
err = re_node_set_insert (dst_nodes, cur_node);
9545
if (BE (err == -1, 0))
9546
return REG_ESPACE;
9547
if (dfa->edests[cur_node].nelem == 0)
9548
break;
9549
if (dfa->edests[cur_node].nelem == 2)
9550
{
9551
err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
9552
dfa->edests[cur_node].elems[1],
9553
ex_subexp, type);
9554
if (BE (err != REG_NOERROR, 0))
9555
return err;
9556
}
9557
cur_node = dfa->edests[cur_node].elems[0];
9558
}
9559
return REG_NOERROR;
9560
}
9561
9562
9563
/* For all the back references in the current state, calculate the
9564
destination of the back references by the appropriate entry
9565
in MCTX->BKREF_ENTS. */
9566
9567
static reg_errcode_t
9568
internal_function
9569
expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
9570
int cur_str, int subexp_num, int type)
9571
{
9572
const re_dfa_t *const dfa = mctx->dfa;
9573
reg_errcode_t err;
9574
int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
9575
struct re_backref_cache_entry *ent;
9576
9577
if (cache_idx_start == -1)
9578
return REG_NOERROR;
9579
9580
restart:
9581
ent = mctx->bkref_ents + cache_idx_start;
9582
do
9583
{
9584
int to_idx, next_node;
9585
9586
/* Is this entry ENT is appropriate? */
9587
if (!re_node_set_contains (cur_nodes, ent->node))
9588
continue; /* No. */
9589
9590
to_idx = cur_str + ent->subexp_to - ent->subexp_from;
9591
/* Calculate the destination of the back reference, and append it
9592
to MCTX->STATE_LOG. */
9593
if (to_idx == cur_str)
9594
{
9595
/* The backreference did epsilon transit, we must re-check all the
9596
node in the current state. */
9597
re_node_set new_dests;
9598
reg_errcode_t err2, err3;
9599
next_node = dfa->edests[ent->node].elems[0];
9600
if (re_node_set_contains (cur_nodes, next_node))
9601
continue;
9602
err = re_node_set_init_1 (&new_dests, next_node);
9603
err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
9604
err3 = re_node_set_merge (cur_nodes, &new_dests);
9605
re_node_set_free (&new_dests);
9606
if (BE (err != REG_NOERROR || err2 != REG_NOERROR
9607
|| err3 != REG_NOERROR, 0))
9608
{
9609
err = (err != REG_NOERROR ? err
9610
: (err2 != REG_NOERROR ? err2 : err3));
9611
return err;
9612
}
9613
/* TODO: It is still inefficient... */
9614
goto restart;
9615
}
9616
else
9617
{
9618
re_node_set union_set;
9619
next_node = dfa->nexts[ent->node];
9620
if (mctx->state_log[to_idx])
9621
{
9622
int ret;
9623
if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
9624
next_node))
9625
continue;
9626
err = re_node_set_init_copy (&union_set,
9627
&mctx->state_log[to_idx]->nodes);
9628
ret = re_node_set_insert (&union_set, next_node);
9629
if (BE (err != REG_NOERROR || ret < 0, 0))
9630
{
9631
re_node_set_free (&union_set);
9632
err = err != REG_NOERROR ? err : REG_ESPACE;
9633
return err;
9634
}
9635
}
9636
else
9637
{
9638
err = re_node_set_init_1 (&union_set, next_node);
9639
if (BE (err != REG_NOERROR, 0))
9640
return err;
9641
}
9642
mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
9643
re_node_set_free (&union_set);
9644
if (BE (mctx->state_log[to_idx] == NULL
9645
&& err != REG_NOERROR, 0))
9646
return err;
9647
}
9648
}
9649
while (ent++->more);
9650
return REG_NOERROR;
9651
}
9652
9653
/* Build transition table for the state.
9654
Return 1 if succeeded, otherwise return NULL. */
9655
9656
static int
9657
internal_function
9658
build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
9659
{
9660
reg_errcode_t err;
9661
int i, j, ch, need_word_trtable = 0;
9662
bitset_word_t elem, mask;
9663
bool dests_node_malloced = false;
9664
bool dest_states_malloced = false;
9665
int ndests; /* Number of the destination states from `state'. */
9666
re_dfastate_t **trtable;
9667
re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
9668
re_node_set follows, *dests_node;
9669
bitset_t *dests_ch;
9670
bitset_t acceptable;
9671
9672
struct dests_alloc
9673
{
9674
re_node_set dests_node[SBC_MAX];
9675
bitset_t dests_ch[SBC_MAX];
9676
} *dests_alloc;
9677
9678
/* We build DFA states which corresponds to the destination nodes
9679
from `state'. `dests_node[i]' represents the nodes which i-th
9680
destination state contains, and `dests_ch[i]' represents the
9681
characters which i-th destination state accepts. */
9682
if (__libc_use_alloca (sizeof (struct dests_alloc)))
9683
dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
9684
else
9685
{
9686
dests_alloc = re_malloc (struct dests_alloc, 1);
9687
if (BE (dests_alloc == NULL, 0))
9688
return 0;
9689
dests_node_malloced = true;
9690
}
9691
dests_node = dests_alloc->dests_node;
9692
dests_ch = dests_alloc->dests_ch;
9693
9694
/* Initialize transiton table. */
9695
state->word_trtable = state->trtable = NULL;
9696
9697
/* At first, group all nodes belonging to `state' into several
9698
destinations. */
9699
ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
9700
if (BE (ndests <= 0, 0))
9701
{
9702
if (dests_node_malloced)
9703
free (dests_alloc);
9704
/* Return 0 in case of an error, 1 otherwise. */
9705
if (ndests == 0)
9706
{
9707
state->trtable = (re_dfastate_t **)
9708
calloc (sizeof (re_dfastate_t *), SBC_MAX);
9709
return 1;
9710
}
9711
return 0;
9712
}
9713
9714
err = re_node_set_alloc (&follows, ndests + 1);
9715
if (BE (err != REG_NOERROR, 0))
9716
goto out_free;
9717
9718
if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
9719
+ ndests * 3 * sizeof (re_dfastate_t *)))
9720
dest_states = (re_dfastate_t **)
9721
alloca (ndests * 3 * sizeof (re_dfastate_t *));
9722
else
9723
{
9724
dest_states = (re_dfastate_t **)
9725
malloc (ndests * 3 * sizeof (re_dfastate_t *));
9726
if (BE (dest_states == NULL, 0))
9727
{
9728
out_free:
9729
if (dest_states_malloced)
9730
free (dest_states);
9731
re_node_set_free (&follows);
9732
for (i = 0; i < ndests; ++i)
9733
re_node_set_free (dests_node + i);
9734
if (dests_node_malloced)
9735
free (dests_alloc);
9736
return 0;
9737
}
9738
dest_states_malloced = true;
9739
}
9740
dest_states_word = dest_states + ndests;
9741
dest_states_nl = dest_states_word + ndests;
9742
bitset_empty (acceptable);
9743
9744
/* Then build the states for all destinations. */
9745
for (i = 0; i < ndests; ++i)
9746
{
9747
int next_node;
9748
re_node_set_empty (&follows);
9749
/* Merge the follows of this destination states. */
9750
for (j = 0; j < dests_node[i].nelem; ++j)
9751
{
9752
next_node = dfa->nexts[dests_node[i].elems[j]];
9753
if (next_node != -1)
9754
{
9755
err = re_node_set_merge (&follows, dfa->eclosures + next_node);
9756
if (BE (err != REG_NOERROR, 0))
9757
goto out_free;
9758
}
9759
}
9760
dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
9761
if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
9762
goto out_free;
9763
/* If the new state has context constraint,
9764
build appropriate states for these contexts. */
9765
if (dest_states[i]->has_constraint)
9766
{
9767
dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
9768
CONTEXT_WORD);
9769
if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
9770
goto out_free;
9771
9772
if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
9773
need_word_trtable = 1;
9774
9775
dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
9776
CONTEXT_NEWLINE);
9777
if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
9778
goto out_free;
9779
}
9780
else
9781
{
9782
dest_states_word[i] = dest_states[i];
9783
dest_states_nl[i] = dest_states[i];
9784
}
9785
bitset_merge (acceptable, dests_ch[i]);
9786
}
9787
9788
if (!BE (need_word_trtable, 0))
9789
{
9790
/* We don't care about whether the following character is a word
9791
character, or we are in a single-byte character set so we can
9792
discern by looking at the character code: allocate a
9793
256-entry transition table. */
9794
trtable = state->trtable =
9795
(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
9796
if (BE (trtable == NULL, 0))
9797
goto out_free;
9798
9799
/* For all characters ch...: */
9800
for (i = 0; i < BITSET_WORDS; ++i)
9801
for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
9802
elem;
9803
mask <<= 1, elem >>= 1, ++ch)
9804
if (BE (elem & 1, 0))
9805
{
9806
/* There must be exactly one destination which accepts
9807
character ch. See group_nodes_into_DFAstates. */
9808
for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
9809
;
9810
9811
/* j-th destination accepts the word character ch. */
9812
if (dfa->word_char[i] & mask)
9813
trtable[ch] = dest_states_word[j];
9814
else
9815
trtable[ch] = dest_states[j];
9816
}
9817
}
9818
else
9819
{
9820
/* We care about whether the following character is a word
9821
character, and we are in a multi-byte character set: discern
9822
by looking at the character code: build two 256-entry
9823
transition tables, one starting at trtable[0] and one
9824
starting at trtable[SBC_MAX]. */
9825
trtable = state->word_trtable =
9826
(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
9827
if (BE (trtable == NULL, 0))
9828
goto out_free;
9829
9830
/* For all characters ch...: */
9831
for (i = 0; i < BITSET_WORDS; ++i)
9832
for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
9833
elem;
9834
mask <<= 1, elem >>= 1, ++ch)
9835
if (BE (elem & 1, 0))
9836
{
9837
/* There must be exactly one destination which accepts
9838
character ch. See group_nodes_into_DFAstates. */
9839
for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
9840
;
9841
9842
/* j-th destination accepts the word character ch. */
9843
trtable[ch] = dest_states[j];
9844
trtable[ch + SBC_MAX] = dest_states_word[j];
9845
}
9846
}
9847
9848
/* new line */
9849
if (bitset_contain (acceptable, NEWLINE_CHAR))
9850
{
9851
/* The current state accepts newline character. */
9852
for (j = 0; j < ndests; ++j)
9853
if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
9854
{
9855
/* k-th destination accepts newline character. */
9856
trtable[NEWLINE_CHAR] = dest_states_nl[j];
9857
if (need_word_trtable)
9858
trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
9859
/* There must be only one destination which accepts
9860
newline. See group_nodes_into_DFAstates. */
9861
break;
9862
}
9863
}
9864
9865
if (dest_states_malloced)
9866
free (dest_states);
9867
9868
re_node_set_free (&follows);
9869
for (i = 0; i < ndests; ++i)
9870
re_node_set_free (dests_node + i);
9871
9872
if (dests_node_malloced)
9873
free (dests_alloc);
9874
9875
return 1;
9876
}
9877
9878
/* Group all nodes belonging to STATE into several destinations.
9879
Then for all destinations, set the nodes belonging to the destination
9880
to DESTS_NODE[i] and set the characters accepted by the destination
9881
to DEST_CH[i]. This function return the number of destinations. */
9882
9883
static int
9884
internal_function
9885
group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
9886
re_node_set *dests_node, bitset_t *dests_ch)
9887
{
9888
reg_errcode_t err;
9889
int result;
9890
int i, j, k;
9891
int ndests; /* Number of the destinations from `state'. */
9892
bitset_t accepts; /* Characters a node can accept. */
9893
const re_node_set *cur_nodes = &state->nodes;
9894
bitset_empty (accepts);
9895
ndests = 0;
9896
9897
/* For all the nodes belonging to `state', */
9898
for (i = 0; i < cur_nodes->nelem; ++i)
9899
{
9900
re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
9901
re_token_type_t type = node->type;
9902
unsigned int constraint = node->constraint;
9903
9904
/* Enumerate all single byte character this node can accept. */
9905
if (type == CHARACTER)
9906
bitset_set (accepts, node->opr.c);
9907
else if (type == SIMPLE_BRACKET)
9908
{
9909
bitset_merge (accepts, node->opr.sbcset);
9910
}
9911
else if (type == OP_PERIOD)
9912
{
9913
#ifdef RE_ENABLE_I18N
9914
if (dfa->mb_cur_max > 1)
9915
bitset_merge (accepts, dfa->sb_char);
9916
else
9917
#endif
9918
bitset_set_all (accepts);
9919
if (!(dfa->syntax & RE_DOT_NEWLINE))
9920
bitset_clear (accepts, '\n');
9921
if (dfa->syntax & RE_DOT_NOT_NULL)
9922
bitset_clear (accepts, '\0');
9923
}
9924
#ifdef RE_ENABLE_I18N
9925
else if (type == OP_UTF8_PERIOD)
9926
{
9927
memset (accepts, '\xff', sizeof (bitset_t) / 2);
9928
if (!(dfa->syntax & RE_DOT_NEWLINE))
9929
bitset_clear (accepts, '\n');
9930
if (dfa->syntax & RE_DOT_NOT_NULL)
9931
bitset_clear (accepts, '\0');
9932
}
9933
#endif
9934
else
9935
continue;
9936
9937
/* Check the `accepts' and sift the characters which are not
9938
match it the context. */
9939
if (constraint)
9940
{
9941
if (constraint & NEXT_NEWLINE_CONSTRAINT)
9942
{
9943
bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
9944
bitset_empty (accepts);
9945
if (accepts_newline)
9946
bitset_set (accepts, NEWLINE_CHAR);
9947
else
9948
continue;
9949
}
9950
if (constraint & NEXT_ENDBUF_CONSTRAINT)
9951
{
9952
bitset_empty (accepts);
9953
continue;
9954
}
9955
9956
if (constraint & NEXT_WORD_CONSTRAINT)
9957
{
9958
bitset_word_t any_set = 0;
9959
if (type == CHARACTER && !node->word_char)
9960
{
9961
bitset_empty (accepts);
9962
continue;
9963
}
9964
#ifdef RE_ENABLE_I18N
9965
if (dfa->mb_cur_max > 1)
9966
for (j = 0; j < BITSET_WORDS; ++j)
9967
any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
9968
else
9969
#endif
9970
for (j = 0; j < BITSET_WORDS; ++j)
9971
any_set |= (accepts[j] &= dfa->word_char[j]);
9972
if (!any_set)
9973
continue;
9974
}
9975
if (constraint & NEXT_NOTWORD_CONSTRAINT)
9976
{
9977
bitset_word_t any_set = 0;
9978
if (type == CHARACTER && node->word_char)
9979
{
9980
bitset_empty (accepts);
9981
continue;
9982
}
9983
#ifdef RE_ENABLE_I18N
9984
if (dfa->mb_cur_max > 1)
9985
for (j = 0; j < BITSET_WORDS; ++j)
9986
any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
9987
else
9988
#endif
9989
for (j = 0; j < BITSET_WORDS; ++j)
9990
any_set |= (accepts[j] &= ~dfa->word_char[j]);
9991
if (!any_set)
9992
continue;
9993
}
9994
}
9995
9996
/* Then divide `accepts' into DFA states, or create a new
9997
state. Above, we make sure that accepts is not empty. */
9998
for (j = 0; j < ndests; ++j)
9999
{
10000
bitset_t intersec; /* Intersection sets, see below. */
10001
bitset_t remains;
10002
/* Flags, see below. */
10003
bitset_word_t has_intersec, not_subset, not_consumed;
10004
10005
/* Optimization, skip if this state doesn't accept the character. */
10006
if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
10007
continue;
10008
10009
/* Enumerate the intersection set of this state and `accepts'. */
10010
has_intersec = 0;
10011
for (k = 0; k < BITSET_WORDS; ++k)
10012
has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
10013
/* And skip if the intersection set is empty. */
10014
if (!has_intersec)
10015
continue;
10016
10017
/* Then check if this state is a subset of `accepts'. */
10018
not_subset = not_consumed = 0;
10019
for (k = 0; k < BITSET_WORDS; ++k)
10020
{
10021
not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
10022
not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
10023
}
10024
10025
/* If this state isn't a subset of `accepts', create a
10026
new group state, which has the `remains'. */
10027
if (not_subset)
10028
{
10029
bitset_copy (dests_ch[ndests], remains);
10030
bitset_copy (dests_ch[j], intersec);
10031
err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
10032
if (BE (err != REG_NOERROR, 0))
10033
goto error_return;
10034
++ndests;
10035
}
10036
10037
/* Put the position in the current group. */
10038
result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
10039
if (BE (result < 0, 0))
10040
goto error_return;
10041
10042
/* If all characters are consumed, go to next node. */
10043
if (!not_consumed)
10044
break;
10045
}
10046
/* Some characters remain, create a new group. */
10047
if (j == ndests)
10048
{
10049
bitset_copy (dests_ch[ndests], accepts);
10050
err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
10051
if (BE (err != REG_NOERROR, 0))
10052
goto error_return;
10053
++ndests;
10054
bitset_empty (accepts);
10055
}
10056
}
10057
return ndests;
10058
error_return:
10059
for (j = 0; j < ndests; ++j)
10060
re_node_set_free (dests_node + j);
10061
return -1;
10062
}
10063
10064
#ifdef RE_ENABLE_I18N
10065
/* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
10066
Return the number of the bytes the node accepts.
10067
STR_IDX is the current index of the input string.
10068
10069
This function handles the nodes which can accept one character, or
10070
one collating element like '.', '[a-z]', opposite to the other nodes
10071
can only accept one byte. */
10072
10073
static int
10074
internal_function
10075
check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
10076
const re_string_t *input, int str_idx)
10077
{
10078
const re_token_t *node = dfa->nodes + node_idx;
10079
int char_len, elem_len;
10080
int i;
10081
10082
if (BE (node->type == OP_UTF8_PERIOD, 0))
10083
{
10084
unsigned char c = re_string_byte_at (input, str_idx), d;
10085
if (BE (c < 0xc2, 1))
10086
return 0;
10087
10088
if (str_idx + 2 > input->len)
10089
return 0;
10090
10091
d = re_string_byte_at (input, str_idx + 1);
10092
if (c < 0xe0)
10093
return (d < 0x80 || d > 0xbf) ? 0 : 2;
10094
else if (c < 0xf0)
10095
{
10096
char_len = 3;
10097
if (c == 0xe0 && d < 0xa0)
10098
return 0;
10099
}
10100
else if (c < 0xf8)
10101
{
10102
char_len = 4;
10103
if (c == 0xf0 && d < 0x90)
10104
return 0;
10105
}
10106
else if (c < 0xfc)
10107
{
10108
char_len = 5;
10109
if (c == 0xf8 && d < 0x88)
10110
return 0;
10111
}
10112
else if (c < 0xfe)
10113
{
10114
char_len = 6;
10115
if (c == 0xfc && d < 0x84)
10116
return 0;
10117
}
10118
else
10119
return 0;
10120
10121
if (str_idx + char_len > input->len)
10122
return 0;
10123
10124
for (i = 1; i < char_len; ++i)
10125
{
10126
d = re_string_byte_at (input, str_idx + i);
10127
if (d < 0x80 || d > 0xbf)
10128
return 0;
10129
}
10130
return char_len;
10131
}
10132
10133
char_len = re_string_char_size_at (input, str_idx);
10134
if (node->type == OP_PERIOD)
10135
{
10136
if (char_len <= 1)
10137
return 0;
10138
/* FIXME: I don't think this if is needed, as both '\n'
10139
and '\0' are char_len == 1. */
10140
/* '.' accepts any one character except the following two cases. */
10141
if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
10142
re_string_byte_at (input, str_idx) == '\n') ||
10143
((dfa->syntax & RE_DOT_NOT_NULL) &&
10144
re_string_byte_at (input, str_idx) == '\0'))
10145
return 0;
10146
return char_len;
10147
}
10148
10149
elem_len = re_string_elem_size_at (input, str_idx);
10150
if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
10151
return 0;
10152
10153
if (node->type == COMPLEX_BRACKET)
10154
{
10155
const re_charset_t *cset = node->opr.mbcset;
10156
# ifdef _LIBC
10157
const unsigned char *pin
10158
= ((const unsigned char *) re_string_get_buffer (input) + str_idx);
10159
int j;
10160
uint32_t nrules;
10161
# endif /* _LIBC */
10162
int match_len = 0;
10163
wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
10164
? re_string_wchar_at (input, str_idx) : 0);
10165
10166
/* match with multibyte character? */
10167
for (i = 0; i < cset->nmbchars; ++i)
10168
if (wc == cset->mbchars[i])
10169
{
10170
match_len = char_len;
10171
goto check_node_accept_bytes_match;
10172
}
10173
/* match with character_class? */
10174
for (i = 0; i < cset->nchar_classes; ++i)
10175
{
10176
wctype_t wt = cset->char_classes[i];
10177
if (__iswctype (wc, wt))
10178
{
10179
match_len = char_len;
10180
goto check_node_accept_bytes_match;
10181
}
10182
}
10183
10184
# ifdef _LIBC
10185
nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
10186
if (nrules != 0)
10187
{
10188
unsigned int in_collseq = 0;
10189
const int32_t *table, *indirect;
10190
const unsigned char *weights, *extra;
10191
const char *collseqwc;
10192
int32_t idx;
10193
/* This #include defines a local function! */
10194
# include <locale/weight.h>
10195
10196
/* match with collating_symbol? */
10197
if (cset->ncoll_syms)
10198
extra = (const unsigned char *)
10199
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
10200
for (i = 0; i < cset->ncoll_syms; ++i)
10201
{
10202
const unsigned char *coll_sym = extra + cset->coll_syms[i];
10203
/* Compare the length of input collating element and
10204
the length of current collating element. */
10205
if (*coll_sym != elem_len)
10206
continue;
10207
/* Compare each bytes. */
10208
for (j = 0; j < *coll_sym; j++)
10209
if (pin[j] != coll_sym[1 + j])
10210
break;
10211
if (j == *coll_sym)
10212
{
10213
/* Match if every bytes is equal. */
10214
match_len = j;
10215
goto check_node_accept_bytes_match;
10216
}
10217
}
10218
10219
if (cset->nranges)
10220
{
10221
if (elem_len <= char_len)
10222
{
10223
collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
10224
in_collseq = __collseq_table_lookup (collseqwc, wc);
10225
}
10226
else
10227
in_collseq = find_collation_sequence_value (pin, elem_len);
10228
}
10229
/* match with range expression? */
10230
for (i = 0; i < cset->nranges; ++i)
10231
if (cset->range_starts[i] <= in_collseq
10232
&& in_collseq <= cset->range_ends[i])
10233
{
10234
match_len = elem_len;
10235
goto check_node_accept_bytes_match;
10236
}
10237
10238
/* match with equivalence_class? */
10239
if (cset->nequiv_classes)
10240
{
10241
const unsigned char *cp = pin;
10242
table = (const int32_t *)
10243
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
10244
weights = (const unsigned char *)
10245
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
10246
extra = (const unsigned char *)
10247
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
10248
indirect = (const int32_t *)
10249
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
10250
idx = findidx (&cp);
10251
if (idx > 0)
10252
for (i = 0; i < cset->nequiv_classes; ++i)
10253
{
10254
int32_t equiv_class_idx = cset->equiv_classes[i];
10255
size_t weight_len = weights[idx];
10256
if (weight_len == weights[equiv_class_idx])
10257
{
10258
int cnt = 0;
10259
while (cnt <= weight_len
10260
&& (weights[equiv_class_idx + 1 + cnt]
10261
== weights[idx + 1 + cnt]))
10262
++cnt;
10263
if (cnt > weight_len)
10264
{
10265
match_len = elem_len;
10266
goto check_node_accept_bytes_match;
10267
}
10268
}
10269
}
10270
}
10271
}
10272
else
10273
# endif /* _LIBC */
10274
{
10275
/* match with range expression? */
10276
#if __GNUC__ >= 2
10277
wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
10278
#else
10279
wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
10280
cmp_buf[2] = wc;
10281
#endif
10282
for (i = 0; i < cset->nranges; ++i)
10283
{
10284
cmp_buf[0] = cset->range_starts[i];
10285
cmp_buf[4] = cset->range_ends[i];
10286
if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
10287
&& wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
10288
{
10289
match_len = char_len;
10290
goto check_node_accept_bytes_match;
10291
}
10292
}
10293
}
10294
check_node_accept_bytes_match:
10295
if (!cset->non_match)
10296
return match_len;
10297
else
10298
{
10299
if (match_len > 0)
10300
return 0;
10301
else
10302
return (elem_len > char_len) ? elem_len : char_len;
10303
}
10304
}
10305
return 0;
10306
}
10307
10308
# ifdef _LIBC
10309
static unsigned int
10310
internal_function
10311
find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
10312
{
10313
uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
10314
if (nrules == 0)
10315
{
10316
if (mbs_len == 1)
10317
{
10318
/* No valid character. Match it as a single byte character. */
10319
const unsigned char *collseq = (const unsigned char *)
10320
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
10321
return collseq[mbs[0]];
10322
}
10323
return UINT_MAX;
10324
}
10325
else
10326
{
10327
int32_t idx;
10328
const unsigned char *extra = (const unsigned char *)
10329
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
10330
int32_t extrasize = (const unsigned char *)
10331
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
10332
10333
for (idx = 0; idx < extrasize;)
10334
{
10335
int mbs_cnt, found = 0;
10336
int32_t elem_mbs_len;
10337
/* Skip the name of collating element name. */
10338
idx = idx + extra[idx] + 1;
10339
elem_mbs_len = extra[idx++];
10340
if (mbs_len == elem_mbs_len)
10341
{
10342
for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
10343
if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
10344
break;
10345
if (mbs_cnt == elem_mbs_len)
10346
/* Found the entry. */
10347
found = 1;
10348
}
10349
/* Skip the byte sequence of the collating element. */
10350
idx += elem_mbs_len;
10351
/* Adjust for the alignment. */
10352
idx = (idx + 3) & ~3;
10353
/* Skip the collation sequence value. */
10354
idx += sizeof (uint32_t);
10355
/* Skip the wide char sequence of the collating element. */
10356
idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
10357
/* If we found the entry, return the sequence value. */
10358
if (found)
10359
return *(uint32_t *) (extra + idx);
10360
/* Skip the collation sequence value. */
10361
idx += sizeof (uint32_t);
10362
}
10363
return UINT_MAX;
10364
}
10365
}
10366
# endif /* _LIBC */
10367
#endif /* RE_ENABLE_I18N */
10368
10369
/* Check whether the node accepts the byte which is IDX-th
10370
byte of the INPUT. */
10371
10372
static int
10373
internal_function
10374
check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
10375
int idx)
10376
{
10377
unsigned char ch;
10378
ch = re_string_byte_at (&mctx->input, idx);
10379
switch (node->type)
10380
{
10381
case CHARACTER:
10382
if (node->opr.c != ch)
10383
return 0;
10384
break;
10385
10386
case SIMPLE_BRACKET:
10387
if (!bitset_contain (node->opr.sbcset, ch))
10388
return 0;
10389
break;
10390
10391
#ifdef RE_ENABLE_I18N
10392
case OP_UTF8_PERIOD:
10393
if (ch >= 0x80)
10394
return 0;
10395
/* FALLTHROUGH */
10396
#endif
10397
case OP_PERIOD:
10398
if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
10399
|| (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
10400
return 0;
10401
break;
10402
10403
default:
10404
return 0;
10405
}
10406
10407
if (node->constraint)
10408
{
10409
/* The node has constraints. Check whether the current context
10410
satisfies the constraints. */
10411
unsigned int context = re_string_context_at (&mctx->input, idx,
10412
mctx->eflags);
10413
if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
10414
return 0;
10415
}
10416
10417
return 1;
10418
}
10419
10420
/* Extend the buffers, if the buffers have run out. */
10421
10422
static reg_errcode_t
10423
internal_function
10424
extend_buffers (re_match_context_t *mctx)
10425
{
10426
reg_errcode_t ret;
10427
re_string_t *pstr = &mctx->input;
10428
10429
/* Double the lengthes of the buffers. */
10430
ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
10431
if (BE (ret != REG_NOERROR, 0))
10432
return ret;
10433
10434
if (mctx->state_log != NULL)
10435
{
10436
/* And double the length of state_log. */
10437
/* XXX We have no indication of the size of this buffer. If this
10438
allocation fail we have no indication that the state_log array
10439
does not have the right size. */
10440
re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
10441
pstr->bufs_len + 1);
10442
if (BE (new_array == NULL, 0))
10443
return REG_ESPACE;
10444
mctx->state_log = new_array;
10445
}
10446
10447
/* Then reconstruct the buffers. */
10448
if (pstr->icase)
10449
{
10450
#ifdef RE_ENABLE_I18N
10451
if (pstr->mb_cur_max > 1)
10452
{
10453
ret = build_wcs_upper_buffer (pstr);
10454
if (BE (ret != REG_NOERROR, 0))
10455
return ret;
10456
}
10457
else
10458
#endif /* RE_ENABLE_I18N */
10459
build_upper_buffer (pstr);
10460
}
10461
else
10462
{
10463
#ifdef RE_ENABLE_I18N
10464
if (pstr->mb_cur_max > 1)
10465
build_wcs_buffer (pstr);
10466
else
10467
#endif /* RE_ENABLE_I18N */
10468
{
10469
if (pstr->trans != NULL)
10470
re_string_translate_buffer (pstr);
10471
}
10472
}
10473
return REG_NOERROR;
10474
}
10475
10476
10477
/* Functions for matching context. */
10478
10479
/* Initialize MCTX. */
10480
10481
static reg_errcode_t
10482
internal_function
10483
match_ctx_init (re_match_context_t *mctx, int eflags, int n)
10484
{
10485
mctx->eflags = eflags;
10486
mctx->match_last = -1;
10487
if (n > 0)
10488
{
10489
mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
10490
mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
10491
if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
10492
return REG_ESPACE;
10493
}
10494
/* Already zero-ed by the caller.
10495
else
10496
mctx->bkref_ents = NULL;
10497
mctx->nbkref_ents = 0;
10498
mctx->nsub_tops = 0; */
10499
mctx->abkref_ents = n;
10500
mctx->max_mb_elem_len = 1;
10501
mctx->asub_tops = n;
10502
return REG_NOERROR;
10503
}
10504
10505
/* Clean the entries which depend on the current input in MCTX.
10506
This function must be invoked when the matcher changes the start index
10507
of the input, or changes the input string. */
10508
10509
static void
10510
internal_function
10511
match_ctx_clean (re_match_context_t *mctx)
10512
{
10513
int st_idx;
10514
for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
10515
{
10516
int sl_idx;
10517
re_sub_match_top_t *top = mctx->sub_tops[st_idx];
10518
for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
10519
{
10520
re_sub_match_last_t *last = top->lasts[sl_idx];
10521
re_free (last->path.array);
10522
re_free (last);
10523
}
10524
re_free (top->lasts);
10525
if (top->path)
10526
{
10527
re_free (top->path->array);
10528
re_free (top->path);
10529
}
10530
free (top);
10531
}
10532
10533
mctx->nsub_tops = 0;
10534
mctx->nbkref_ents = 0;
10535
}
10536
10537
/* Free all the memory associated with MCTX. */
10538
10539
static void
10540
internal_function
10541
match_ctx_free (re_match_context_t *mctx)
10542
{
10543
/* First, free all the memory associated with MCTX->SUB_TOPS. */
10544
match_ctx_clean (mctx);
10545
re_free (mctx->sub_tops);
10546
re_free (mctx->bkref_ents);
10547
}
10548
10549
/* Add a new backreference entry to MCTX.
10550
Note that we assume that caller never call this function with duplicate
10551
entry, and call with STR_IDX which isn't smaller than any existing entry.
10552
*/
10553
10554
static reg_errcode_t
10555
internal_function
10556
match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
10557
int to)
10558
{
10559
if (mctx->nbkref_ents >= mctx->abkref_ents)
10560
{
10561
struct re_backref_cache_entry* new_entry;
10562
new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
10563
mctx->abkref_ents * 2);
10564
if (BE (new_entry == NULL, 0))
10565
{
10566
re_free (mctx->bkref_ents);
10567
return REG_ESPACE;
10568
}
10569
mctx->bkref_ents = new_entry;
10570
memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
10571
sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
10572
mctx->abkref_ents *= 2;
10573
}
10574
if (mctx->nbkref_ents > 0
10575
&& mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
10576
mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
10577
10578
mctx->bkref_ents[mctx->nbkref_ents].node = node;
10579
mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
10580
mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
10581
mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
10582
10583
/* This is a cache that saves negative results of check_dst_limits_calc_pos.
10584
If bit N is clear, means that this entry won't epsilon-transition to
10585
an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
10586
it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
10587
such node.
10588
10589
A backreference does not epsilon-transition unless it is empty, so set
10590
to all zeros if FROM != TO. */
10591
mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
10592
= (from == to ? ~0 : 0);
10593
10594
mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
10595
if (mctx->max_mb_elem_len < to - from)
10596
mctx->max_mb_elem_len = to - from;
10597
return REG_NOERROR;
10598
}
10599
10600
/* Search for the first entry which has the same str_idx, or -1 if none is
10601
found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
10602
10603
static int
10604
internal_function
10605
search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
10606
{
10607
int left, right, mid, last;
10608
last = right = mctx->nbkref_ents;
10609
for (left = 0; left < right;)
10610
{
10611
mid = (left + right) / 2;
10612
if (mctx->bkref_ents[mid].str_idx < str_idx)
10613
left = mid + 1;
10614
else
10615
right = mid;
10616
}
10617
if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
10618
return left;
10619
else
10620
return -1;
10621
}
10622
10623
/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
10624
at STR_IDX. */
10625
10626
static reg_errcode_t
10627
internal_function
10628
match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
10629
{
10630
#ifdef DEBUG
10631
assert (mctx->sub_tops != NULL);
10632
assert (mctx->asub_tops > 0);
10633
#endif
10634
if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
10635
{
10636
int new_asub_tops = mctx->asub_tops * 2;
10637
re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
10638
re_sub_match_top_t *,
10639
new_asub_tops);
10640
if (BE (new_array == NULL, 0))
10641
return REG_ESPACE;
10642
mctx->sub_tops = new_array;
10643
mctx->asub_tops = new_asub_tops;
10644
}
10645
mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
10646
if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
10647
return REG_ESPACE;
10648
mctx->sub_tops[mctx->nsub_tops]->node = node;
10649
mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
10650
return REG_NOERROR;
10651
}
10652
10653
/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
10654
at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
10655
10656
static re_sub_match_last_t *
10657
internal_function
10658
match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
10659
{
10660
re_sub_match_last_t *new_entry;
10661
if (BE (subtop->nlasts == subtop->alasts, 0))
10662
{
10663
int new_alasts = 2 * subtop->alasts + 1;
10664
re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
10665
re_sub_match_last_t *,
10666
new_alasts);
10667
if (BE (new_array == NULL, 0))
10668
return NULL;
10669
subtop->lasts = new_array;
10670
subtop->alasts = new_alasts;
10671
}
10672
new_entry = calloc (1, sizeof (re_sub_match_last_t));
10673
if (BE (new_entry != NULL, 1))
10674
{
10675
subtop->lasts[subtop->nlasts] = new_entry;
10676
new_entry->node = node;
10677
new_entry->str_idx = str_idx;
10678
++subtop->nlasts;
10679
}
10680
return new_entry;
10681
}
10682
10683
static void
10684
internal_function
10685
sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
10686
re_dfastate_t **limited_sts, int last_node, int last_str_idx)
10687
{
10688
sctx->sifted_states = sifted_sts;
10689
sctx->limited_states = limited_sts;
10690
sctx->last_node = last_node;
10691
sctx->last_str_idx = last_str_idx;
10692
re_node_set_init_empty (&sctx->limits);
10693
}
10694
10695
10696
/* Binary backward compatibility.  Only when building as part of the C
   library, and only for the symbol versions selected by SHLIB_COMPAT, the
   obsolete variable re_max_failures is still defined, with a link-time
   warning attached to it.  */
#if _LIBC
# include <shlib-compat.h>
# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3)
link_warning (re_max_failures, "the 're_max_failures' variable is obsolete and will go away.")
int re_max_failures = 2000;
# endif
#endif
10704
#endif
10705
10706