Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
att
GitHub Repository: att/ast
Path: blob/master/src/lib/libast/regex/reglib.h
1810 views
1
/***********************************************************************
2
* *
3
* This software is part of the ast package *
4
* Copyright (c) 1985-2012 AT&T Intellectual Property *
5
* and is licensed under the *
6
* Eclipse Public License, Version 1.0 *
7
* by AT&T Intellectual Property *
8
* *
9
* A copy of the License is available at *
10
* http://www.eclipse.org/org/documents/epl-v10.html *
11
* (with md5 checksum b35adb5213ca9657e911e9befb180842) *
12
* *
13
* Information and Software Systems Research *
14
* AT&T Research *
15
* Florham Park NJ *
16
* *
17
* Glenn Fowler <[email protected]> *
18
* David Korn <[email protected]> *
19
* Phong Vo <[email protected]> *
20
* *
21
***********************************************************************/
22
#pragma prototyped
23
24
/*
25
* posix regex implementation
26
*
27
* based on Doug McIlroy's C++ implementation
28
* Knuth-Morris-Pratt adapted from Corman-Leiserson-Rivest
29
* Boyer-Moore from conversations with David Korn, Phong Vo, Andrew Hume
30
*/
31
32
#ifndef _REGLIB_H
33
#define _REGLIB_H
34
35
#define REG_VERSION_EXEC 20020509L
36
#define REG_VERSION_MAP 20030916L /* regdisc_t.re_map */
37
38
#define re_info env
39
40
#define alloc _reg_alloc
41
#define classfun _reg_classfun
42
#define drop _reg_drop
43
#define fatal _reg_fatal
44
#define state _reg_state
45
46
typedef struct regsubop_s
47
{
48
int op; /* REG_SUB_LOWER,REG_SUB_UPPER */
49
int off; /* re_rhs or match[] offset */
50
int len; /* re_rhs len or len==0 match[] */
51
} regsubop_t;
52
53
#define _REG_SUB_PRIVATE_ \
54
char* re_cur; /* re_buf cursor */ \
55
char* re_end; /* re_buf end */ \
56
regsubop_t* re_ops; /* rhs ops */ \
57
char re_rhs[1]; /* substitution rhs */
58
59
#include <ast.h>
60
#include <cdt.h>
61
#include <stk.h>
62
63
#include "regex.h"
64
65
#include <ctype.h>
66
#include <errno.h>
67
68
#if _BLD_DEBUG && !defined(_AST_REGEX_DEBUG)
69
#define _AST_REGEX_DEBUG 1
70
#endif
71
72
#define MBSIZE(p) ((ast.tmp_int=mbsize(p))>0?ast.tmp_int:1)
73
74
#undef RE_DUP_MAX /* posix puts this in limits.h! */
75
#define RE_DUP_MAX (INT_MAX/2-1) /* 2*RE_DUP_MAX won't overflow */
76
#define RE_DUP_INF (RE_DUP_MAX+1) /* infinity, for * */
77
#define BACK_REF_MAX 9
78
79
#define REG_COMP (~REG_EXEC)
80
#define REG_EXEC (REG_ADVANCE|REG_INVERT|REG_NOTBOL|REG_NOTEOL|REG_STARTEND)
81
82
#define REX_NULL 0 /* null string (internal) */
83
#define REX_ALT 1 /* a|b */
84
#define REX_ALT_CATCH 2 /* REX_ALT catcher */
85
#define REX_BACK 3 /* \1, \2, etc */
86
#define REX_BEG 4 /* initial ^ */
87
#define REX_BEG_STR 5 /* initial ^ w/ no newline */
88
#define REX_BM 6 /* Boyer-Moore */
89
#define REX_CAT 7 /* catenation catcher */
90
#define REX_CLASS 8 /* [...] */
91
#define REX_COLL_CLASS 9 /* collation order [...] */
92
#define REX_CONJ 10 /* a&b */
93
#define REX_CONJ_LEFT 11 /* REX_CONJ left catcher */
94
#define REX_CONJ_RIGHT 12 /* REX_CONJ right catcher */
95
#define REX_DONE 13 /* completed match (internal) */
96
#define REX_DOT 14 /* . */
97
#define REX_END 15 /* final $ */
98
#define REX_END_STR 16 /* final $ before tail newline */
99
#define REX_EXEC 17 /* call re.re_exec() */
100
#define REX_FIN_STR 18 /* final $ w/ no newline */
101
#define REX_GROUP 19 /* \(...\) */
102
#define REX_GROUP_CATCH 20 /* REX_GROUP catcher */
103
#define REX_GROUP_AHEAD 21 /* 0-width lookahead */
104
#define REX_GROUP_AHEAD_CATCH 22 /* REX_GROUP_AHEAD catcher */
105
#define REX_GROUP_AHEAD_NOT 23 /* inverted 0-width lookahead */
106
#define REX_GROUP_BEHIND 24 /* 0-width lookbehind */
107
#define REX_GROUP_BEHIND_CATCH 25 /* REX_GROUP_BEHIND catcher */
108
#define REX_GROUP_BEHIND_NOT 26 /* inverted 0-width lookbehind */
109
#define REX_GROUP_BEHIND_NOT_CATCH 27 /* REX_GROUP_BEHIND_NOT catcher */
110
#define REX_GROUP_COND 28 /* conditional group */
111
#define REX_GROUP_COND_CATCH 29 /* conditional group catcher */
112
#define REX_GROUP_CUT 30 /* don't backtrack over this */
113
#define REX_GROUP_CUT_CATCH 31 /* REX_GROUP_CUT catcher */
114
#define REX_KMP 32 /* Knuth-Morris-Pratt */
115
#define REX_NEG 33 /* negation */
116
#define REX_NEG_CATCH 34 /* REX_NEG catcher */
117
#define REX_NEST 35 /* nested match */
118
#define REX_ONECHAR 36 /* a single-character literal */
119
#define REX_REP 37 /* Kleene closure */
120
#define REX_REP_CATCH 38 /* REX_REP catcher */
121
#define REX_STRING 39 /* some chars */
122
#define REX_TRIE 40 /* alternation of strings */
123
#define REX_WBEG 41 /* \< */
124
#define REX_WEND 42 /* \> */
125
#define REX_WORD 43 /* word boundary */
126
#define REX_WORD_NOT 44 /* not word boundary */
127
128
#define T_META ((int)UCHAR_MAX+1)
129
#define T_STAR (T_META+0)
130
#define T_PLUS (T_META+1)
131
#define T_QUES (T_META+2)
132
#define T_BANG (T_META+3)
133
#define T_AT (T_META+4)
134
#define T_TILDE (T_META+5)
135
#define T_PERCENT (T_META+6)
136
#define T_LEFT (T_META+7)
137
#define T_OPEN (T_META+8)
138
#define T_CLOSE (T_OPEN+1)
139
#define T_RIGHT (T_OPEN+2)
140
#define T_CFLX (T_OPEN+3)
141
#define T_DOT (T_OPEN+4)
142
#define T_DOTSTAR (T_OPEN+5)
143
#define T_END (T_OPEN+6)
144
#define T_BAD (T_OPEN+7)
145
#define T_DOLL (T_OPEN+8)
146
#define T_BRA (T_OPEN+9)
147
#define T_BAR (T_OPEN+10)
148
#define T_AND (T_OPEN+11)
149
#define T_LT (T_OPEN+12)
150
#define T_GT (T_OPEN+13)
151
#define T_SLASHPLUS (T_OPEN+14)
152
#define T_GROUP (T_OPEN+15)
153
#define T_WORD (T_OPEN+16)
154
#define T_WORD_NOT (T_WORD+1)
155
#define T_BEG_STR (T_WORD+2)
156
#define T_END_STR (T_WORD+3)
157
#define T_FIN_STR (T_WORD+4)
158
#define T_ESCAPE (T_WORD+5)
159
#define T_ALNUM (T_WORD+6)
160
#define T_ALNUM_NOT (T_ALNUM+1)
161
#define T_DIGIT (T_ALNUM+2)
162
#define T_DIGIT_NOT (T_ALNUM+3)
163
#define T_SPACE (T_ALNUM+4)
164
#define T_SPACE_NOT (T_ALNUM+5)
165
#define T_BACK (T_ALNUM+6)
166
167
#define BRE 0
168
#define ERE 3
169
#define ARE 6
170
#define SRE 9
171
#define KRE 12
172
173
#define HIT SSIZE_MAX
174
175
#define bitclr(p,c) ((p)[((c)>>3)&037]&=(~(1<<((c)&07))))
176
#define bitset(p,c) ((p)[((c)>>3)&037]|=(1<<((c)&07)))
177
#define bittst(p,c) ((p)[((c)>>3)&037]&(1<<((c)&07)))
178
179
#define setadd(p,c) bitset((p)->bits,c)
180
#define setclr(p,c) bitclr((p)->bits,c)
181
#define settst(p,c) bittst((p)->bits,c)
182
183
#if _hdr_wchar && _lib_wctype && _lib_iswctype
184
185
#include <stdio.h> /* because <wchar.h> includes it and we generate it */
186
#include <wchar.h>
187
#if _hdr_wctype
188
#include <wctype.h>
189
#endif
190
191
#if !defined(iswblank) && !_lib_iswblank
192
#define _need_iswblank 1
193
#define iswblank(x) _reg_iswblank(x)
194
extern int _reg_iswblank(wint_t);
195
#endif
196
197
#if !defined(towupper) && !_lib_towupper
198
#define towupper(x) toupper(x)
199
#endif
200
201
#if !defined(towlower) && !_lib_towlower
202
#define towlower(x) tolower(x)
203
#endif
204
205
#else
206
207
#undef _lib_wctype
208
209
#ifndef iswalnum
210
#define iswalnum(x) isalnum(x)
211
#endif
212
#ifndef iswalpha
213
#define iswalpha(x) isalpha(x)
214
#endif
215
#ifndef iswcntrl
216
#define iswcntrl(x) iscntrl(x)
217
#endif
218
#ifndef iswdigit
219
#define iswdigit(x) isdigit(x)
220
#endif
221
#ifndef iswgraph
222
#define iswgraph(x) isgraph(x)
223
#endif
224
#ifndef iswlower
225
#define iswlower(x) islower(x)
226
#endif
227
#ifndef iswprint
228
#define iswprint(x) isprint(x)
229
#endif
230
#ifndef iswpunct
231
#define iswpunct(x) ispunct(x)
232
#endif
233
#ifndef iswspace
234
#define iswspace(x) isspace(x)
235
#endif
236
#ifndef iswupper
237
#define iswupper(x) isupper(x)
238
#endif
239
#ifndef iswxdigit
240
#define iswxdigit(x) isxdigit(x)
241
#endif
242
243
#ifndef towlower
244
#define towlower(x) tolower(x)
245
#endif
246
#ifndef towupper
247
#define towupper(x) toupper(x)
248
#endif
249
250
#endif
251
252
#ifndef iswblank
253
#define iswblank(x) ((x)==' '||(x)=='\t')
254
#endif
255
256
#ifndef iswgraph
257
#define iswgraph(x) (iswprint(x)&&!iswblank(x))
258
#endif
259
260
#define isword(x) (isalnum(x)||(x)=='_')
261
262
/*
263
* collation element support
264
*/
265
266
#define COLL_KEY_MAX 32
267
268
#if COLL_KEY_MAX < MB_LEN_MAX
269
#undef COLL_KEY_MAX
270
#define COLL_KEY_MAX MB_LEN_MAX
271
#endif
272
273
typedef unsigned char Ckey_t[COLL_KEY_MAX+1];
274
275
#define COLL_end 0
276
#define COLL_call 1
277
#define COLL_char 2
278
#define COLL_range 3
279
#define COLL_range_lc 4
280
#define COLL_range_uc 5
281
282
typedef struct Celt_s
283
{
284
short typ;
285
short min;
286
short max;
287
regclass_t fun;
288
Ckey_t beg;
289
Ckey_t end;
290
} Celt_t;
291
292
/*
293
* private stuff hanging off regex_t
294
*/
295
296
typedef struct Stk_pos_s
297
{
298
off_t offset;
299
char* base;
300
} Stk_pos_t;
301
302
typedef struct Vector_s
303
{
304
Stk_t* stk; /* stack pointer */
305
char* vec; /* the data */
306
int inc; /* growth increment */
307
int siz; /* element size */
308
ssize_t max; /* max index */
309
ssize_t cur; /* current index -- user domain */
310
} Vector_t;
311
312
/*
313
* Rex_t subtypes
314
*/
315
316
typedef struct Cond_s
317
{
318
unsigned char* beg; /* beginning of next match */
319
struct Rex_s* next[2]; /* 0:no 1:yes next pattern */
320
struct Rex_s* cont; /* right catcher */
321
int yes; /* yes condition hit */
322
} Cond_t;
323
324
typedef struct Conj_left_s
325
{
326
unsigned char* beg; /* beginning of left match */
327
struct Rex_s* right; /* right pattern */
328
struct Rex_s* cont; /* right catcher */
329
} Conj_left_t;
330
331
typedef struct Conj_right_s
332
{
333
unsigned char* end; /* end of left match */
334
struct Rex_s* cont; /* ambient continuation */
335
} Conj_right_t;
336
337
typedef unsigned int Bm_mask_t;
338
339
typedef struct Bm_s
340
{
341
Bm_mask_t** mask;
342
size_t* skip;
343
size_t* fail;
344
size_t size;
345
ssize_t back;
346
ssize_t left;
347
ssize_t right;
348
size_t complete;
349
} Bm_t;
350
351
typedef struct String_s
352
{
353
int* fail;
354
unsigned char* base;
355
size_t size;
356
} String_t;
357
358
typedef struct Set_s
359
{
360
unsigned char bits[(UCHAR_MAX+1)/CHAR_BIT];
361
} Set_t;
362
363
typedef struct Collate_s
364
{
365
int invert;
366
Celt_t* elements;
367
} Collate_t;
368
369
typedef struct Binary_s
370
{
371
struct Rex_s* left;
372
struct Rex_s* right;
373
int serial;
374
} Binary_t;
375
376
typedef struct Group_s
377
{
378
int number; /* group number */
379
int last; /* last contained group number */
380
int size; /* lookbehind size */
381
int back; /* backreferenced */
382
regflags_t flags; /* group flags */
383
union
384
{
385
Binary_t binary;
386
struct Rex_s* rex;
387
} expr;
388
} Group_t;
389
390
typedef struct Exec_s
391
{
392
void* data;
393
const char* text;
394
size_t size;
395
} Exec_t;
396
397
#define REX_NEST_open 0x01
398
#define REX_NEST_close 0x02
399
#define REX_NEST_escape 0x04
400
#define REX_NEST_quote 0x08
401
#define REX_NEST_literal 0x10
402
#define REX_NEST_delimiter 0x20
403
#define REX_NEST_terminator 0x40
404
#define REX_NEST_separator 0x80
405
406
#define REX_NEST_SHIFT 8
407
408
typedef struct Nest_s
409
{
410
int primary;
411
unsigned short none; /* for Nest_t.type[-1] */
412
unsigned short type[1];
413
} Nest_t;
414
415
/*
416
* REX_ALT catcher, solely to get control at the end of an
417
* alternative to keep records for comparing matches.
418
*/
419
420
typedef struct Alt_catch_s
421
{
422
struct Rex_s* cont;
423
} Alt_catch_t;
424
425
typedef struct Group_catch_s
426
{
427
struct Rex_s* cont;
428
regoff_t* eo;
429
} Group_catch_t;
430
431
typedef struct Behind_catch_s
432
{
433
struct Rex_s* cont;
434
unsigned char* beg;
435
unsigned char* end;
436
} Behind_catch_t;
437
438
/*
439
* REX_NEG catcher determines what string lengths can be matched,
440
* then Neg investigates continuations of other lengths.
441
* This is inefficient. For !POSITIONS expressions, we can do better:
442
* since matches to rex will be enumerated in decreasing order,
443
* we can investigate continuations whenever a length is skipped.
444
*/
445
446
typedef struct Neg_catch_s
447
{
448
unsigned char* beg;
449
unsigned char* index;
450
} Neg_catch_t;
451
452
/*
453
* REX_REP catcher. One is created on the stack for
454
* each iteration of a complex repetition.
455
*/
456
457
typedef struct Rep_catch_s
458
{
459
struct Rex_s* cont;
460
struct Rex_s* ref;
461
unsigned char* beg;
462
int n;
463
} Rep_catch_t;
464
465
/*
466
* data structure for an alternation of pure strings
467
* son points to a subtree of all strings with a common
468
* prefix ending in character c. sib links alternate
469
* letters in the same position of a word. end=1 if
470
* some word ends with c. the order of strings is
471
* irrelevant, except long words must be investigated
472
* before short ones.
473
*/
474
475
typedef struct Trie_node_s
476
{
477
unsigned char c;
478
unsigned char end;
479
struct Trie_node_s* son;
480
struct Trie_node_s* sib;
481
} Trie_node_t;
482
483
typedef struct Trie_s
484
{
485
Trie_node_t** root;
486
int min;
487
int max;
488
} Trie_t;
489
490
/*
491
* Rex_t is a node in a regular expression
492
*/
493
494
typedef struct Rex_s
495
{
496
unsigned char type; /* node type */
497
unsigned char marked; /* already marked */
498
short serial; /* subpattern number */
499
regflags_t flags; /* scoped flags */
500
int explicit; /* scoped explicit match*/
501
struct Rex_s* next; /* remaining parts */
502
int lo; /* lo dup count */
503
int hi; /* hi dup count */
504
unsigned char* map; /* fold and/or ccode map*/
505
union
506
{
507
Alt_catch_t alt_catch; /* alt catcher */
508
Bm_t bm; /* bm */
509
Behind_catch_t behind_catch; /* behind catcher */
510
Set_t* charclass; /* char class */
511
Collate_t collate; /* collation class */
512
Cond_t cond_catch; /* cond catcher */
513
Conj_left_t conj_left; /* conj left catcher */
514
Conj_right_t conj_right; /* conj right catcher */
515
void* data; /* data after Rex_t */
516
Exec_t exec; /* re.re_exec() args */
517
Group_t group; /* a|b or rep */
518
Group_catch_t group_catch; /* group catcher */
519
Neg_catch_t neg_catch; /* neg catcher */
520
Nest_t nest; /* nested match */
521
unsigned char onechar; /* single char */
522
Rep_catch_t rep_catch; /* rep catcher */
523
String_t string; /* string/kmp */
524
Trie_t trie; /* trie */
525
} re;
526
} Rex_t;
527
528
typedef struct reglib_s /* library private regex_t info */
529
{
530
struct Rex_s* rex; /* compiled expression */
531
regdisc_t* disc; /* REG_DISCIPLINE discipline */
532
const regex_t* regex; /* from regexec */
533
unsigned char* beg; /* beginning of string */
534
unsigned char* end; /* end of string */
535
Vector_t* pos; /* posns of certain subpatterns */
536
Vector_t* bestpos; /* ditto for best match */
537
regmatch_t* match; /* subexrs in current match */
538
regmatch_t* best; /* ditto in best match yet */
539
Stk_pos_t stk; /* exec stack pos */
540
size_t min; /* minimum match length */
541
size_t nsub; /* internal re_nsub */
542
regflags_t flags; /* flags from regcomp() */
543
int error; /* last error */
544
int explicit; /* explicit match on this char */
545
int leading; /* leading match on this char */
546
int refs; /* regcomp()+regdup() references*/
547
Rex_t done; /* the last continuation */
548
regstat_t stats; /* for regstat() */
549
unsigned char fold[UCHAR_MAX+1]; /* REG_ICASE map */
550
unsigned char hard; /* hard comp */
551
unsigned char once; /* if 1st parse fails, quit */
552
unsigned char separate; /* cannot combine */
553
unsigned char stack; /* hard comp or exec */
554
unsigned char sub; /* re_sub is valid */
555
unsigned char test; /* debug/test bitmask */
556
} Env_t;
557
558
typedef struct oldregmatch_s /* pre-20120528 regmatch_t */
559
{
560
int rm_so; /* offset of start */
561
int rm_eo; /* offset of end */
562
} oldregmatch_t;
563
564
typedef struct State_s /* shared state */
565
{
566
regmatch_t nomatch;
567
struct
568
{
569
unsigned char key;
570
short val[15];
571
} escape[52];
572
short* magic[UCHAR_MAX+1];
573
regdisc_t disc;
574
int fatal;
575
int initialized;
576
Dt_t* attrs;
577
Dt_t* names;
578
Dtdisc_t dtdisc;
579
} State_t;
580
581
extern State_t state;
582
583
extern void* alloc(regdisc_t*, void*, size_t);
584
extern regclass_t classfun(int);
585
extern void drop(regdisc_t*, Rex_t*);
586
extern int fatal(regdisc_t*, int, const char*);
587
588
#endif
589
590