Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
att
GitHub Repository: att/ast
Path: blob/master/src/lib/libast/regex/regcomp.c
1810 views
1
/***********************************************************************
2
* *
3
* This software is part of the ast package *
4
* Copyright (c) 1985-2012 AT&T Intellectual Property *
5
* and is licensed under the *
6
* Eclipse Public License, Version 1.0 *
7
* by AT&T Intellectual Property *
8
* *
9
* A copy of the License is available at *
10
* http://www.eclipse.org/org/documents/epl-v10.html *
11
* (with md5 checksum b35adb5213ca9657e911e9befb180842) *
12
* *
13
* Information and Software Systems Research *
14
* AT&T Research *
15
* Florham Park NJ *
16
* *
17
* Glenn Fowler <[email protected]> *
18
* David Korn <[email protected]> *
19
* Phong Vo <[email protected]> *
20
* *
21
***********************************************************************/
22
#pragma prototyped
23
24
/*
25
* posix regex compiler
26
*/
27
28
#include "reglib.h"
29
30
#if _PACKAGE_ast
31
#include "lclib.h"
32
#endif
33
34
#define serialize re_serialize /* hp.ia64 <unistd.h>! */
35
36
#define C_ESC (-1)
37
#define C_MB (-2)
38
39
#if _AST_REGEX_DEBUG
40
41
#define DEBUG_TEST(f,y,n) ((debug&(debug_flag=f))?(y):(n))
42
#define DEBUG_CODE(f,y,n) do if(debug&(f)){y}else{n} while(0)
43
#define DEBUG_INIT() do { char* t; if (!debug) { debug = 0x80000000; if (t = getenv("_AST_regex_comp_debug")) debug |= strtoul(t, NiL, 0); } } while (0)
44
45
static unsigned long debug;
46
static unsigned long debug_flag;
47
48
#else
49
50
#define DEBUG_INIT()
51
#define DEBUG_TEST(f,y,n) (n)
52
#define DEBUG_CODE(f,y,n) do {n} while(0)
53
54
#endif
55
56
#if _PACKAGE_ast
57
58
typedef struct Cchr_s
59
{
60
Dtlink_t lnk;
61
unsigned char nam[2];
62
Ckey_t key;
63
} Cchr_t;
64
65
#endif
66
67
#define eat(p) do{if ((p)->token.push)(p)->token.push=0;else (p)->cursor+=(p)->token.len;}while (0)
68
69
/*
70
* determine whether greedy matching will work, i.e. produce
71
* the best match first. such expressions are "easy", and
72
* need no backtracking once a complete match is found.
73
* if an expression has backreferences or alts it's hard
74
* else if it has only one closure it's easy
75
* else if all closures are simple (i.e. one-character) it's easy
76
* else it's hard.
77
*/
78
79
typedef struct Stats_s
80
{
81
unsigned long l; /* min length to left of x */
82
unsigned long k; /* min length to left of y */
83
unsigned long m; /* min length */
84
unsigned long n; /* max length */
85
unsigned short a; /* number of alternations */
86
unsigned short b; /* number of backrefs */
87
unsigned short c; /* number of closures */
88
unsigned short e; /* $ */
89
unsigned short i; /* number of negations */
90
unsigned short p; /* number of named subexpressions */
91
unsigned short s; /* number of simple closures */
92
unsigned short t; /* number of tries */
93
unsigned short u; /* number of unnamed subexpressions */
94
Rex_t* x; /* max length REX_STRING */
95
Rex_t* y; /* max length REX_TRIE */
96
} Stats_t;
97
98
typedef struct Token_s
99
{
100
unsigned long min;
101
unsigned long max;
102
short lex;
103
short len;
104
short esc;
105
short att;
106
short push;
107
} Token_t;
108
109
typedef struct Cenv_s
110
{
111
int delimiter; /* pattern delimiter */
112
int error; /* last error */
113
int explicit; /* explicit match on this char */
114
int mappeddot; /* inverse mapped '.' */
115
int mappednewline; /* inverse mapped '\n' */
116
int mappedslash; /* inverse mapped '/' */
117
regflags_t flags; /* flags arg to regcomp */
118
int type; /* BRE,ERE,ARE,SRE,KRE */
119
unsigned char* cursor; /* curent point in re */
120
unsigned char* pattern; /* the original pattern */
121
unsigned char* literal; /* literal restart pattern */
122
int parno; /* number of last open paren */
123
int parnest; /* paren nest count */
124
int posixkludge; /* to make * nonspecial */
125
Token_t token; /* token lookahead */
126
Stats_t stats; /* RE statistics */
127
int terminator; /* pattern terminator */
128
Rex_t* paren[2*(BACK_REF_MAX+2)];
129
/* paren[i]!=0 if \i defined */
130
regex_t* regex; /* user handle */
131
regdisc_t* disc; /* user discipline */
132
unsigned char* map; /* external to native ccode map */
133
unsigned char* MAP; /* fold and/or map */
134
} Cenv_t;
135
136
/*
137
* allocate a new Rex_t node
138
*/
139
140
#if _BLD_DEBUG
141
#define node(a,b,c,d,e) node(a,b,c,d,e, const char* file, unsigned int line)
142
#endif
143
144
static Rex_t*
145
node(Cenv_t* env, int type, int lo, int hi, size_t extra)
146
{
147
register Rex_t* e;
148
149
DEBUG_TEST(0x0800,(sfprintf(sfstdout, "%s:%d node(%d,%d,%d,%u)\n", file, line, type, lo, hi, sizeof(Rex_t) + extra)),(0));
150
if (e = (Rex_t*)alloc(env->disc, 0, sizeof(Rex_t) + extra))
151
{
152
memset(e, 0, sizeof(Rex_t) + extra);
153
e->type = type;
154
e->marked = 0;
155
e->lo = lo;
156
e->hi = hi;
157
e->flags = env->flags;
158
e->map = (e->flags & REG_ICASE) ? env->MAP : env->map;
159
e->explicit = env->explicit;
160
if (extra)
161
e->re.data = (char*)e + sizeof(Rex_t);
162
}
163
return e;
164
}
165
166
#if _BLD_DEBUG
167
#undef node
168
#define node(a,b,c,d,e) node(a,b,c,d,e,__FILE__,__LINE__)
169
#endif
170
171
/*
172
* free a Trie_node_t node
173
*/
174
175
static void
176
triedrop(regdisc_t* disc, Trie_node_t* e)
177
{
178
if (e)
179
{
180
triedrop(disc, e->son);
181
triedrop(disc, e->sib);
182
alloc(disc, e, 0);
183
}
184
}
185
186
/*
187
* free a Rex_t node
188
*/
189
190
void
191
drop(regdisc_t* disc, Rex_t* e)
192
{
193
int i;
194
Rex_t* x;
195
196
if (e && !(disc->re_flags & REG_NOFREE))
197
do
198
{
199
switch (e->type)
200
{
201
case REX_ALT:
202
case REX_CONJ:
203
drop(disc, e->re.group.expr.binary.left);
204
drop(disc, e->re.group.expr.binary.right);
205
break;
206
case REX_GROUP:
207
case REX_GROUP_AHEAD:
208
case REX_GROUP_AHEAD_NOT:
209
case REX_GROUP_BEHIND:
210
case REX_GROUP_BEHIND_NOT:
211
case REX_GROUP_CUT:
212
case REX_NEG:
213
case REX_REP:
214
drop(disc, e->re.group.expr.rex);
215
break;
216
case REX_TRIE:
217
for (i = 0; i <= UCHAR_MAX; i++)
218
triedrop(disc, e->re.trie.root[i]);
219
break;
220
}
221
x = e->next;
222
alloc(disc, e, 0);
223
} while (e = x);
224
}
225
226
/*
227
* mark e and descendants minimal
228
*/
229
230
static void
231
mark(register Rex_t* e, int set)
232
{
233
if (e && !e->marked)
234
do
235
{
236
e->marked = 1;
237
if (set)
238
e->flags |= REG_MINIMAL;
239
else
240
e->flags &= ~REG_MINIMAL;
241
switch (e->type)
242
{
243
case REX_ALT:
244
case REX_CONJ:
245
case REX_GROUP_COND:
246
if (e->re.group.expr.binary.left)
247
mark(e->re.group.expr.binary.left, set);
248
if (e->re.group.expr.binary.right)
249
mark(e->re.group.expr.binary.right, set);
250
break;
251
case REX_GROUP:
252
case REX_GROUP_AHEAD:
253
case REX_GROUP_AHEAD_NOT:
254
case REX_GROUP_BEHIND:
255
case REX_GROUP_BEHIND_NOT:
256
case REX_GROUP_CUT:
257
case REX_NEG:
258
case REX_REP:
259
case REX_TRIE:
260
mark(e->re.group.expr.rex, set);
261
break;
262
}
263
} while (e = e->next);
264
}
265
266
/*
267
* assign subexpression numbers by a preorder tree walk
268
*/
269
270
static int
271
serialize(Cenv_t* env, Rex_t* e, int n)
272
{
273
do
274
{
275
e->serial = n++;
276
switch (e->type)
277
{
278
case REX_ALT:
279
case REX_GROUP_COND:
280
if (e->re.group.expr.binary.left)
281
n = serialize(env, e->re.group.expr.binary.left, n);
282
e->re.group.expr.binary.serial = n++;
283
if (e->re.group.expr.binary.right)
284
n = serialize(env, e->re.group.expr.binary.right, n);
285
break;
286
case REX_CONJ:
287
n = serialize(env, e->re.group.expr.binary.left, n);
288
n = serialize(env, e->re.group.expr.binary.right, n);
289
break;
290
case REX_GROUP:
291
case REX_GROUP_AHEAD:
292
case REX_GROUP_AHEAD_NOT:
293
case REX_GROUP_BEHIND:
294
case REX_GROUP_BEHIND_NOT:
295
case REX_GROUP_CUT:
296
case REX_NEG:
297
case REX_REP:
298
n = serialize(env, e->re.group.expr.rex, n);
299
break;
300
}
301
} while (e = e->next);
302
return n;
303
}
304
305
/*
306
* catenate e and f into a sequence, collapsing them if possible
307
*/
308
309
static Rex_t*
310
cat(Cenv_t* env, Rex_t* e, Rex_t* f)
311
{
312
Rex_t* g;
313
314
if (!f)
315
{
316
drop(env->disc, e);
317
return 0;
318
}
319
if (e->type == REX_NULL)
320
{
321
drop(env->disc, e);
322
return f;
323
}
324
if (f->type == REX_NULL)
325
{
326
g = f->next;
327
f->next = 0;
328
drop(env->disc, f);
329
f = g;
330
}
331
else if (e->type == REX_DOT && f->type == REX_DOT)
332
{
333
unsigned int m = e->lo + f->lo;
334
unsigned int n = e->hi + f->hi;
335
336
if (m <= RE_DUP_MAX)
337
{
338
if (e->hi > RE_DUP_MAX || f->hi > RE_DUP_MAX)
339
{
340
n = RE_DUP_INF;
341
goto combine;
342
}
343
else if (n <= RE_DUP_MAX)
344
{
345
combine:
346
e->lo = m;
347
e->hi = n;
348
g = f->next;
349
f->next = 0;
350
drop(env->disc, f);
351
f = g;
352
}
353
}
354
}
355
e->next = f;
356
return e;
357
}
358
359
/*
360
* collect re statistics
361
*/
362
363
static int
364
stats(register Cenv_t* env, register Rex_t* e)
365
{
366
register unsigned long n;
367
register unsigned long m;
368
unsigned long cm;
369
unsigned long nm;
370
unsigned long cn;
371
unsigned long nn;
372
unsigned long l;
373
unsigned long k;
374
unsigned long t;
375
Rex_t* q;
376
Rex_t* x;
377
Rex_t* y;
378
unsigned char c;
379
unsigned char b;
380
381
do
382
{
383
switch (e->type)
384
{
385
case REX_ALT:
386
x = env->stats.x;
387
l = env->stats.l;
388
y = env->stats.y;
389
k = env->stats.k;
390
t = env->stats.t;
391
if (++env->stats.a <= 0)
392
return 1;
393
cm = env->stats.m;
394
env->stats.m = 0;
395
cn = env->stats.n;
396
env->stats.n = 0;
397
if (stats(env, e->re.group.expr.binary.left))
398
return 1;
399
m = env->stats.m;
400
env->stats.m = 0;
401
n = env->stats.n;
402
env->stats.n = 0;
403
if (e->re.group.expr.binary.right && stats(env, e->re.group.expr.binary.right))
404
return 1;
405
if (env->stats.m > m)
406
env->stats.m = m;
407
else
408
m = env->stats.m;
409
if ((env->stats.m += cm) < m)
410
return 1;
411
if (env->stats.n < n)
412
env->stats.n = n;
413
else
414
n = env->stats.n;
415
if ((env->stats.n += cn) < n)
416
return 1;
417
env->stats.x = x;
418
env->stats.l = l;
419
env->stats.y = y;
420
env->stats.k = k;
421
env->stats.t = t;
422
break;
423
case REX_BACK:
424
if (++env->stats.b <= 0)
425
return 1;
426
break;
427
case REX_CLASS:
428
case REX_COLL_CLASS:
429
case REX_DOT:
430
case REX_ONECHAR:
431
n = env->stats.m;
432
if ((env->stats.m += e->lo) < n)
433
return 1;
434
if (e->hi != RE_DUP_INF)
435
{
436
n = env->stats.n;
437
if ((env->stats.n += e->hi) < n)
438
return 1;
439
}
440
if (e->lo != e->hi)
441
{
442
if (++env->stats.c <= 0)
443
return 1;
444
if (++env->stats.s <= 0)
445
return 1;
446
}
447
break;
448
case REX_CONJ:
449
cm = env->stats.m;
450
env->stats.m = 0;
451
cn = env->stats.n;
452
env->stats.n = 0;
453
if (stats(env, e->re.group.expr.binary.left))
454
return 1;
455
nm = env->stats.m;
456
env->stats.m = 0;
457
nn = env->stats.n;
458
env->stats.n = 0;
459
if (stats(env, e->re.group.expr.binary.right))
460
return 1;
461
if (env->stats.m < nm)
462
env->stats.m = nm;
463
else
464
nm = env->stats.m;
465
if ((env->stats.m += cm) < nm)
466
return 1;
467
if (env->stats.n < nn)
468
env->stats.n = nn;
469
else
470
nn = env->stats.n;
471
if ((env->stats.n += cn) < nn)
472
return 1;
473
break;
474
case REX_END:
475
env->stats.e = 1;
476
break;
477
case REX_GROUP:
478
if (e->re.group.number && ++env->stats.p <= 0 || !e->re.group.number && ++env->stats.u <= 0)
479
return 1;
480
if (stats(env, e->re.group.expr.rex))
481
return 1;
482
break;
483
case REX_GROUP_AHEAD:
484
case REX_GROUP_AHEAD_NOT:
485
case REX_GROUP_BEHIND:
486
case REX_GROUP_BEHIND_NOT:
487
m = env->stats.m;
488
n = env->stats.n;
489
x = env->stats.x;
490
y = env->stats.y;
491
if (stats(env, e->re.group.expr.rex))
492
return 1;
493
env->stats.m = m;
494
env->stats.n = n;
495
env->stats.x = x;
496
env->stats.y = y;
497
switch (e->type)
498
{
499
case REX_GROUP_AHEAD:
500
case REX_GROUP_BEHIND:
501
if (++env->stats.u <= 0)
502
return 1;
503
break;
504
}
505
break;
506
case REX_GROUP_COND:
507
if (++env->stats.u <= 0)
508
return 1;
509
m = env->stats.m;
510
n = env->stats.n;
511
x = env->stats.x;
512
y = env->stats.y;
513
if (e->re.group.size > 0 && ++env->stats.b <= 0)
514
return 1;
515
if (e->re.group.expr.binary.left && stats(env, e->re.group.expr.binary.left))
516
return 1;
517
if (q = e->re.group.expr.binary.right)
518
{
519
if (q->re.group.expr.binary.left && stats(env, q->re.group.expr.binary.left))
520
return 1;
521
if (q->re.group.expr.binary.right && stats(env, q->re.group.expr.binary.right))
522
return 1;
523
}
524
env->stats.m = m;
525
env->stats.n = n;
526
env->stats.x = x;
527
env->stats.y = y;
528
break;
529
case REX_GROUP_CUT:
530
if (++env->stats.u <= 0)
531
return 1;
532
m = env->stats.m;
533
n = env->stats.n;
534
x = env->stats.x;
535
y = env->stats.y;
536
if (stats(env, e->re.group.expr.rex))
537
return 1;
538
env->stats.m = m;
539
env->stats.n = n;
540
env->stats.x = x;
541
env->stats.y = y;
542
break;
543
case REX_NEG:
544
env->stats.i++;
545
x = env->stats.x;
546
l = env->stats.l;
547
y = env->stats.y;
548
k = env->stats.k;
549
t = env->stats.t;
550
cm = env->stats.m;
551
env->stats.m = 0;
552
if (stats(env, e->re.group.expr.rex))
553
return 1;
554
env->stats.m = !env->stats.m;
555
if ((env->stats.m += cm) < cm)
556
return 1;
557
env->stats.x = x;
558
env->stats.l = l;
559
env->stats.y = y;
560
env->stats.k = k;
561
env->stats.t = t;
562
break;
563
case REX_REP:
564
x = env->stats.x;
565
l = env->stats.l;
566
y = env->stats.y;
567
k = env->stats.k;
568
t = env->stats.t;
569
if (++env->stats.c <= 0)
570
return 1;
571
b = env->stats.b;
572
c = env->stats.c;
573
cm = env->stats.m;
574
env->stats.m = 0;
575
if (stats(env, e->re.group.expr.rex))
576
return 1;
577
if (env->stats.m == 1 && b == env->stats.b && c == env->stats.c && ++env->stats.s <= 0)
578
return 1;
579
if (e->lo < 1)
580
{
581
env->stats.x = x;
582
env->stats.l = l;
583
env->stats.y = y;
584
env->stats.k = k;
585
env->stats.t = t;
586
env->stats.m = cm;
587
}
588
else
589
{
590
m = env->stats.m;
591
if ((env->stats.m *= e->lo) > 0 && env->stats.m < m)
592
return 1;
593
m = env->stats.m;
594
if ((env->stats.m += cm) < m)
595
return 1;
596
if (env->stats.x != x)
597
env->stats.l = cm;
598
if (env->stats.y != y)
599
env->stats.k = cm;
600
}
601
break;
602
case REX_STRING:
603
if (!e->map)
604
{
605
cm = env->stats.m;
606
if ((env->stats.m += e->re.string.size) < cm)
607
return 1;
608
cn = env->stats.n;
609
if ((env->stats.n += e->re.string.size) < cn)
610
return 1;
611
if (!env->stats.x || env->stats.x->re.string.size < e->re.string.size)
612
{
613
env->stats.x = e;
614
env->stats.l = cm;
615
}
616
}
617
break;
618
case REX_TRIE:
619
if (++env->stats.s <= 0)
620
return 1;
621
cm = env->stats.m;
622
if ((env->stats.m += e->re.trie.min) < cm)
623
return 1;
624
cn = env->stats.n;
625
if ((env->stats.n += e->re.trie.max) < cn)
626
return 1;
627
env->stats.t++;
628
if (!env->stats.y || env->stats.y->re.trie.min < e->re.trie.min)
629
{
630
env->stats.y = e;
631
env->stats.k = cm;
632
}
633
break;
634
}
635
} while (e = e->next);
636
return 0;
637
}
638
639
static int token(Cenv_t*);
640
641
static int
642
magic(register Cenv_t* env, register int c, int escaped)
643
{
644
register char* sp;
645
register int n;
646
int o = c;
647
int e = env->error;
648
int l = env->token.len;
649
short* mp;
650
char* ep;
651
652
if (mp = state.magic[c])
653
{
654
c = mp[env->type+escaped];
655
if (c >= T_META)
656
{
657
sp = (char*)env->cursor + env->token.len;
658
switch (c)
659
{
660
case T_LEFT:
661
n = 0;
662
ep = sp;
663
while (*sp >= '0' && *sp <= '9')
664
{
665
if (n > (INT_MAX / 10))
666
{
667
env->error = REG_BADBR;
668
goto bad;
669
}
670
n = n * 10 + *sp++ - '0';
671
}
672
if (sp == ep)
673
{
674
if (env->type < SRE || *sp != ',')
675
{
676
env->error = *sp ? REG_BADBR : REG_EBRACE;
677
goto bad;
678
}
679
}
680
else if (n > RE_DUP_MAX)
681
{
682
env->error = REG_BADBR;
683
goto bad;
684
}
685
env->token.min = n;
686
if (*sp == ',')
687
{
688
n = 0;
689
ep = ++sp;
690
while (*sp >= '0' && *sp <= '9')
691
{
692
if (n > (INT_MAX / 10))
693
{
694
env->error = REG_BADBR;
695
goto bad;
696
}
697
n = n * 10 + *sp++ - '0';
698
}
699
if (sp == ep)
700
n = RE_DUP_INF;
701
else if (n < env->token.min)
702
{
703
env->error = REG_BADBR;
704
goto bad;
705
}
706
}
707
env->token.max = n;
708
switch (*sp)
709
{
710
case 0:
711
env->error = REG_EBRACE;
712
goto bad;
713
case '\\':
714
if (!escaped)
715
{
716
env->error = REG_BADBR;
717
goto bad;
718
}
719
sp++;
720
break;
721
default:
722
if (escaped)
723
{
724
env->error = REG_BADBR;
725
goto bad;
726
}
727
break;
728
}
729
switch (*sp++)
730
{
731
case 0:
732
env->error = REG_EBRACE;
733
goto bad;
734
case '}':
735
break;
736
default:
737
env->error = REG_BADBR;
738
goto bad;
739
}
740
env->token.len = sp - (char*)env->cursor;
741
break;
742
case T_RIGHT:
743
env->error = REG_EBRACE;
744
goto bad;
745
case T_OPEN:
746
if (env->type < SRE && *sp == '?')
747
{
748
env->token.len++;
749
env->token.lex = 0;
750
goto group;
751
}
752
break;
753
case T_ESCAPE:
754
c = chresc(sp - 2, &ep);
755
if (ep < sp)
756
goto bad;
757
env->token.len += ep - sp;
758
if (c >= T_META)
759
{
760
env->token.lex = c;
761
c = C_ESC;
762
}
763
return c;
764
case T_BACK+0:
765
case T_BACK+1:
766
case T_BACK+2:
767
case T_BACK+3:
768
case T_BACK+4:
769
case T_BACK+5:
770
case T_BACK+6:
771
case T_BACK+7:
772
n = chresc(sp - 2, &ep);
773
if (ep > sp + 1)
774
{
775
env->token.len += ep - sp;
776
return n;
777
}
778
/*FALLTHROUGH*/
779
case T_BACK+8:
780
case T_BACK+9:
781
if (env->type == SRE || c == T_BACK && !(env->flags & (REG_LENIENT|REG_REGEXP)))
782
{
783
env->error = REG_BADESC;
784
goto bad;
785
}
786
if ((env->flags & REG_MULTIREF) && isdigit(*sp))
787
{
788
c = (c - T_BACK) * 10 + (*sp - '0');
789
if (c > 0 && c <= env->parno && env->paren[c])
790
c += T_BACK;
791
else
792
c = chresc(sp - 2, &ep);
793
env->token.len++;
794
}
795
if (c == T_BACK)
796
c = 0;
797
break;
798
case T_BAD:
799
if (escaped == 1 && (env->flags & (REG_LENIENT|REG_REGEXP)) && (c = mp[env->type+escaped+2]) >= T_META)
800
return c;
801
goto bad;
802
}
803
if (env->type >= SRE)
804
{
805
if (c == T_DOT)
806
c = '.';
807
else if (c < T_OPEN)
808
{
809
if (env->type == KRE && *(env->cursor + env->token.len) == '-' && *(env->cursor + env->token.len + 1) == '(')
810
{
811
env->token.len++;
812
env->token.att = 1;
813
}
814
if (env->type == KRE && *(env->cursor + env->token.len) == '(')
815
{
816
env->token.len++;
817
switch (c)
818
{
819
case T_AT:
820
break;
821
case T_PERCENT:
822
env->token.lex = c;
823
goto group;
824
case T_TILDE:
825
env->token.lex = 0;
826
goto group;
827
default:
828
env->token.lex = c;
829
break;
830
}
831
c = T_OPEN;
832
}
833
else if (c == T_STAR)
834
c = T_DOTSTAR;
835
else if (c == T_QUES)
836
c = T_DOT;
837
else
838
{
839
c = o;
840
env->token.len = l;
841
}
842
}
843
else if (c > T_BACK)
844
{
845
c = (c - T_BACK) * 2 - 1;
846
c = (c > env->parno || !env->paren[c]) ? o : T_BACK + c;
847
}
848
else if (env->type == KRE && !env->parnest && (env->flags & REG_SHELL_GROUP))
849
{
850
if (c == T_AND)
851
c = '&';
852
else if (c == T_BAR)
853
c = '|';
854
else if (c == T_OPEN)
855
c = '(';
856
}
857
}
858
}
859
}
860
else if (escaped == 2)
861
{
862
if (env->type >= SRE && !(env->flags & REG_SHELL_ESCAPED) || (env->flags & REG_ESCAPE) && (c == '[' || c == '-' || c == ']' || env->delimiter && c == env->delimiter))
863
/*ok*/;
864
else
865
{
866
env->error = REG_BADESC;
867
goto bad;
868
}
869
}
870
else if (escaped && !(env->flags & (REG_LENIENT|REG_REGEXP)) && c != ']')
871
{
872
env->error = REG_BADESC;
873
goto bad;
874
}
875
return c;
876
group:
877
sp = (char*)env->cursor + env->token.len;
878
switch (*sp++)
879
{
880
case ')':
881
break;
882
case '#':
883
for (;;)
884
{
885
switch (*sp++)
886
{
887
case 0:
888
env->error = REG_EPAREN;
889
return T_BAD;
890
case ')':
891
break;
892
default:
893
continue;
894
}
895
break;
896
}
897
break;
898
default:
899
return T_GROUP;
900
}
901
env->cursor = (unsigned char*)sp;
902
return token(env);
903
bad:
904
if (escaped == 2)
905
env->error = e;
906
else if (env->flags & (REG_LENIENT|REG_REGEXP))
907
return o;
908
else if (escaped == 1 && !env->error)
909
{
910
if (mp || o == ']')
911
return o;
912
env->error = REG_BADESC;
913
}
914
return T_BAD;
915
}
916
917
static int
918
token(register Cenv_t* env)
919
{
920
int c;
921
int posixkludge;
922
923
if (env->token.push)
924
return env->token.lex;
925
env->token.att = env->token.esc = 0;
926
if ((env->token.len = MBSIZE(env->cursor)) > 1)
927
return env->token.lex = C_MB;
928
env->token.lex = 0;
929
for (;;)
930
{
931
c = *env->cursor;
932
if (c == 0 || c == env->delimiter || c == env->terminator)
933
return T_END;
934
if (!(env->flags & REG_COMMENT))
935
break;
936
if (c == '#')
937
{
938
do
939
{
940
c = *++env->cursor;
941
if (c == 0 || c == env->delimiter)
942
return T_END;
943
} while (c != '\n');
944
}
945
else if (!isspace(c))
946
break;
947
env->cursor++;
948
}
949
if (c == '\n' && (env->flags & REG_MULTIPLE) && !env->delimiter)
950
{
951
if (env->parnest)
952
{
953
env->error = REG_EPAREN;
954
return T_BAD;
955
}
956
env->parno = 0;
957
env->pattern = env->cursor + 1;
958
return T_BAR;
959
}
960
if (env->flags & REG_LITERAL)
961
return c;
962
if (posixkludge = env->posixkludge)
963
{
964
env->posixkludge = 0;
965
if (c == '*')
966
return c;
967
}
968
if (c == '\\')
969
{
970
if (env->flags & REG_SHELL_ESCAPED)
971
return c;
972
if (!(c = *(env->cursor + 1)) || c == env->terminator)
973
{
974
if (env->flags & (REG_LENIENT|REG_REGEXP))
975
{
976
if (c)
977
{
978
env->token.esc = env->token.len;
979
env->token.len += MBSIZE(env->cursor + 1);
980
return c;
981
}
982
return '\\';
983
}
984
env->error = REG_EESCAPE;
985
return T_BAD;
986
}
987
env->token.esc = env->token.len;
988
env->token.len += MBSIZE(env->cursor + 1);
989
if (env->delimiter && c == 'n')
990
return '\n';
991
else if (c == env->delimiter)
992
return magic(env, c, 0);
993
else if (c == '(' && env->type == BRE)
994
env->posixkludge = 1;
995
else if (c == ')' && env->type == BRE && env->parnest <= 0)
996
{
997
env->error = REG_EPAREN;
998
return T_BAD;
999
}
1000
else if (isspace(c) && (env->flags & REG_COMMENT))
1001
return c;
1002
return magic(env, c, 1);
1003
}
1004
else if (c == '$')
1005
{
1006
if (env->type == BRE && (*(env->cursor + 1) == 0 || *(env->cursor + 1) == env->delimiter || *(env->cursor + 1) == env->terminator || *(env->cursor + 1) == '\\' && *(env->cursor + 2) == ')') || (env->flags & REG_MULTIPLE) && *(env->cursor + 1) == '\n')
1007
return T_DOLL;
1008
}
1009
else if (c == '^')
1010
{
1011
if (env->type == BRE && (env->cursor == env->pattern || posixkludge == 1))
1012
{
1013
env->posixkludge = 2;
1014
return T_CFLX;
1015
}
1016
}
1017
else if (c == ')')
1018
{
1019
if (env->type != BRE && env->parnest <= 0)
1020
return c;
1021
}
1022
else if (c == '/' && env->explicit == env->mappedslash)
1023
{
1024
while (*(env->cursor + env->token.len) == c)
1025
env->token.len++;
1026
return T_SLASHPLUS;
1027
}
1028
return magic(env, c, 0);
1029
}
1030
1031
static Celt_t*
1032
col(Celt_t* ce, int ic, unsigned char* bp, int bw, int bc, unsigned char* ep, int ew, int ec)
1033
{
1034
register char* s;
1035
register unsigned char* k;
1036
register unsigned char* e;
1037
register int c;
1038
register int cc;
1039
int bt;
1040
int et;
1041
Ckey_t key;
1042
1043
cc = 0;
1044
for (;;)
1045
{
1046
k = key;
1047
if (bw == 1)
1048
{
1049
c = bc;
1050
if (ic)
1051
{
1052
if (isupper(c))
1053
{
1054
c = tolower(c);
1055
cc = -1;
1056
}
1057
else if (islower(c))
1058
{
1059
c = toupper(c);
1060
cc = -1;
1061
}
1062
}
1063
*k++ = c;
1064
}
1065
else if (bw < COLL_KEY_MAX)
1066
{
1067
s = (char*)bp;
1068
if (ic)
1069
{
1070
c = mbchar(s);
1071
if (iswupper(c))
1072
{
1073
c = towlower(c);
1074
cc = 1;
1075
}
1076
else if (iswlower(c))
1077
{
1078
c = towupper(c);
1079
cc = 1;
1080
}
1081
}
1082
if (cc > 0)
1083
{
1084
cc = -1;
1085
k += mbconv((char*)k, c);
1086
}
1087
else
1088
for (e = k + bw; k < e; *k++ = *s++);
1089
}
1090
*k = 0;
1091
mbxfrm(ce->beg, key, COLL_KEY_MAX);
1092
if (ep)
1093
{
1094
k = key;
1095
c = mbchar(k);
1096
if (iswupper(c))
1097
bt = COLL_range_uc;
1098
else if (iswlower(c))
1099
bt = COLL_range_lc;
1100
else
1101
bt = COLL_range;
1102
k = key;
1103
if (ew == 1)
1104
{
1105
c = ec;
1106
if (ic)
1107
{
1108
if (isupper(c))
1109
{
1110
c = tolower(c);
1111
cc = -1;
1112
}
1113
else if (islower(c))
1114
{
1115
c = toupper(c);
1116
cc = -1;
1117
}
1118
}
1119
*k++ = c;
1120
}
1121
else if (ew < COLL_KEY_MAX)
1122
{
1123
s = (char*)ep;
1124
if (ic)
1125
{
1126
c = mbchar(s);
1127
if (iswupper(c))
1128
{
1129
c = towlower(c);
1130
cc = 1;
1131
}
1132
else if (iswlower(c))
1133
{
1134
c = towupper(c);
1135
cc = 1;
1136
}
1137
}
1138
if (cc > 0)
1139
{
1140
cc = -1;
1141
k += mbconv((char*)k, c);
1142
}
1143
else
1144
for (e = k + ew; k < e; *k++ = *s++);
1145
}
1146
*k = 0;
1147
mbxfrm(ce->end, key, COLL_KEY_MAX);
1148
k = key;
1149
c = mbchar(k);
1150
if (iswupper(c))
1151
et = COLL_range_uc;
1152
else if (iswlower(c))
1153
et = COLL_range_lc;
1154
else
1155
et = COLL_range;
1156
ce->typ = bt == et ? bt : COLL_range;
1157
}
1158
else
1159
ce->typ = COLL_char;
1160
ce++;
1161
if (!ic || !cc)
1162
break;
1163
ic = 0;
1164
}
1165
return ce;
1166
}
1167
1168
static Rex_t*
1169
bra(Cenv_t* env)
1170
{
1171
Rex_t* e;
1172
int c;
1173
int i;
1174
int w;
1175
int neg;
1176
int last;
1177
int inrange;
1178
int complicated;
1179
int collate;
1180
int elements;
1181
unsigned char* first;
1182
unsigned char* start;
1183
unsigned char* begin;
1184
unsigned char* s;
1185
regclass_t f;
1186
unsigned char buf[4 * (COLL_KEY_MAX + 1)];
1187
#if _PACKAGE_ast
1188
int ic;
1189
char mbc[COLL_KEY_MAX + 1];
1190
#endif
1191
1192
if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t))))
1193
return 0;
1194
collate = complicated = elements = 0;
1195
if (*env->cursor == '^' || env->type >= SRE && *env->cursor == '!')
1196
{
1197
env->cursor++;
1198
complicated = neg = 1;
1199
}
1200
else
1201
neg = 0;
1202
first = env->cursor;
1203
start = first + MBSIZE(first);
1204
if (*env->cursor == 0 || *(env->cursor + 1) == 0 || *env->cursor == env->terminator || *(env->cursor + 1) == env->terminator || (env->flags & REG_ESCAPE) && (*env->cursor == env->delimiter || *env->cursor != '\\' && *(env->cursor + 1) == env->delimiter))
1205
goto error;
1206
begin = env->cursor + MBSIZE(env->cursor);
1207
1208
/*
1209
* inrange: 0=no, 1=possibly, 2=definitely
1210
*/
1211
1212
inrange = 0;
1213
for (;;)
1214
{
1215
if (!(c = *env->cursor) || c == env->terminator || c == env->delimiter && (env->flags & REG_ESCAPE))
1216
goto error;
1217
env->cursor += (w = MBSIZE(env->cursor));
1218
if (c == '\\' && ((env->flags & REG_CLASS_ESCAPE) || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)))
1219
{
1220
if (*env->cursor)
1221
{
1222
if (*env->cursor == 'n')
1223
{
1224
env->cursor++;
1225
c = '\n';
1226
}
1227
else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED))
1228
{
1229
env->token.len = 1;
1230
w = magic(env, *env->cursor, 2);
1231
if (env->token.len > 1 || w != T_BAD)
1232
{
1233
if (env->token.len == 1 && (f = classfun(w)))
1234
{
1235
if (inrange > 1)
1236
{
1237
if (env->type < SRE && !(env->flags & (REG_LENIENT|REG_REGEXP)))
1238
goto erange;
1239
inrange = 0;
1240
}
1241
env->cursor++;
1242
for (c = 0; c <= UCHAR_MAX; c++)
1243
if ((*f)(c))
1244
setadd(e->re.charclass, c);
1245
complicated++;
1246
elements++;
1247
continue;
1248
}
1249
if (env->token.len > 1 || w >= 0 && w < T_META)
1250
{
1251
c = w;
1252
if (c > UCHAR_MAX)
1253
{
1254
if (env->type < SRE && !(env->flags & (REG_LENIENT|REG_REGEXP)) && !mbwide())
1255
goto erange;
1256
c = UCHAR_MAX;
1257
}
1258
env->cursor += env->token.len;
1259
}
1260
}
1261
}
1262
}
1263
}
1264
else if (c == ']')
1265
{
1266
if (env->cursor == begin)
1267
{
1268
last = c;
1269
inrange = 1;
1270
continue;
1271
}
1272
if (inrange != 0)
1273
{
1274
setadd(e->re.charclass, last);
1275
elements++;
1276
if (inrange == 2)
1277
{
1278
setadd(e->re.charclass, '-');
1279
elements++;
1280
}
1281
}
1282
break;
1283
}
1284
else if (c == '-')
1285
{
1286
if (!inrange && env->cursor != begin && *env->cursor != ']')
1287
{
1288
if (env->type < SRE && !(env->flags & (REG_LENIENT|REG_REGEXP)))
1289
goto erange;
1290
continue;
1291
}
1292
else if (inrange == 1)
1293
{
1294
inrange = 2;
1295
complicated++;
1296
continue;
1297
}
1298
}
1299
else if (c == '[')
1300
{
1301
switch (*env->cursor)
1302
{
1303
case 0:
1304
goto error;
1305
case ':':
1306
if (env->flags & REG_REGEXP)
1307
goto normal;
1308
if (inrange == 1)
1309
{
1310
setadd(e->re.charclass, last);
1311
elements++;
1312
}
1313
if (!(f = regclass((char*)env->cursor, (char**)&env->cursor)))
1314
{
1315
if (env->cursor == start && (c = *(env->cursor + 1)))
1316
{
1317
s = start = env->cursor + 1;
1318
while (*++s && *s != ':');
1319
if (*s == ':' && *(s + 1) == ']' && *(s + 2) == ']')
1320
{
1321
if ((i = (s - start)) == 1)
1322
{
1323
switch (c)
1324
{
1325
case '<':
1326
i = REX_WBEG;
1327
break;
1328
case '>':
1329
i = REX_WEND;
1330
break;
1331
default:
1332
i = 0;
1333
break;
1334
}
1335
if (i)
1336
{
1337
env->cursor = s + 3;
1338
drop(env->disc, e);
1339
return node(env, i, 0, 0, 0);
1340
}
1341
}
1342
}
1343
}
1344
env->error = REG_ECTYPE;
1345
goto error;
1346
}
1347
for (c = 0; c <= UCHAR_MAX; c++)
1348
if ((*f)(c))
1349
setadd(e->re.charclass, c);
1350
inrange = 0;
1351
complicated++;
1352
elements++;
1353
continue;
1354
case '=':
1355
if (env->flags & REG_REGEXP)
1356
goto normal;
1357
if (inrange == 2)
1358
goto erange;
1359
if (inrange == 1)
1360
{
1361
setadd(e->re.charclass, last);
1362
elements++;
1363
}
1364
if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf), NiL)) < 0)
1365
goto ecollate;
1366
if (c > 1)
1367
collate++;
1368
else
1369
setadd(e->re.charclass, buf[0]);
1370
c = buf[0];
1371
inrange = 0;
1372
complicated++;
1373
elements++;
1374
continue;
1375
case '.':
1376
if (env->flags & REG_REGEXP)
1377
goto normal;
1378
if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf), NiL)) < 0)
1379
goto ecollate;
1380
if (c > 1)
1381
collate++;
1382
c = buf[0];
1383
complicated++;
1384
break;
1385
default:
1386
normal:
1387
if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE))
1388
goto error;
1389
break;
1390
}
1391
}
1392
else if (w > 1)
1393
complicated++;
1394
if (inrange == 2)
1395
{
1396
if (last <= c)
1397
{
1398
for (i = last; i <= c; i++)
1399
setadd(e->re.charclass, i);
1400
inrange = env->type >= SRE || (env->flags & (REG_LENIENT|REG_REGEXP));
1401
elements += 2;
1402
}
1403
else if (env->type >= SRE)
1404
{
1405
setadd(e->re.charclass, last);
1406
setadd(e->re.charclass, c);
1407
elements += 2;
1408
inrange = 1;
1409
}
1410
else if (!(env->flags & (REG_LENIENT|REG_REGEXP)))
1411
goto erange;
1412
else
1413
inrange = 0;
1414
}
1415
else if (inrange == 1)
1416
{
1417
setadd(e->re.charclass, last);
1418
elements++;
1419
}
1420
else
1421
inrange = 1;
1422
last = c;
1423
}
1424
#if _PACKAGE_ast
1425
if (complicated && mbcoll())
1426
{
1427
Dt_t* dt;
1428
Cchr_t* cc;
1429
Cchr_t* tc;
1430
Cchr_t* xc;
1431
Celt_t* ce;
1432
Cchr_t key;
1433
int rw;
1434
int rc;
1435
wchar_t wc;
1436
unsigned char* rp;
1437
unsigned char* pp;
1438
char cb[2][COLL_KEY_MAX+1];
1439
1440
static Dtdisc_t disc;
1441
1442
static const char primary[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
1443
1444
if (!(dt = (Dt_t*)LCINFO(AST_LC_COLLATE)->data))
1445
{
1446
disc.key = offsetof(Cchr_t, key);
1447
if ((cc = newof(0, Cchr_t, elementsof(primary), 0)) && (dt = dtopen(&disc, Dtoset)))
1448
{
1449
for (i = 0; i < elementsof(primary) - 1; i++, cc++)
1450
{
1451
cc->nam[0] = primary[i];
1452
mbxfrm(cc->key, cc->nam, COLL_KEY_MAX);
1453
dtinsert(dt, cc);
1454
}
1455
for (i = 0; i < elementsof(cc->key); i++)
1456
cc->key[i] = ~0;
1457
dtinsert(dt, cc);
1458
LCINFO(AST_LC_COLLATE)->data = (void*)dt;
1459
}
1460
else
1461
{
1462
if (cc)
1463
free(cc);
1464
drop(env->disc, e);
1465
return 0;
1466
}
1467
}
1468
if (dt)
1469
{
1470
drop(env->disc, e);
1471
if (ic = env->flags & REG_ICASE)
1472
elements *= 2;
1473
if (!(e = node(env, REX_COLL_CLASS, 1, 1, (elements + 3) * sizeof(Celt_t))))
1474
return 0;
1475
ce = (Celt_t*)e->re.data;
1476
e->re.collate.invert = neg;
1477
e->re.collate.elements = ce;
1478
env->cursor = first;
1479
inrange = 0;
1480
for (;;)
1481
{
1482
if ((c = *env->cursor) == 0 || c == env->terminator || (env->flags & REG_ESCAPE) && c == env->delimiter)
1483
goto error;
1484
pp = env->cursor;
1485
env->cursor += (w = MBSIZE(env->cursor));
1486
if (c == '\\' && ((env->flags & REG_CLASS_ESCAPE) || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)))
1487
{
1488
if (*env->cursor)
1489
{
1490
if (*env->cursor == 'n')
1491
{
1492
pp = env->cursor++;
1493
c = '\n';
1494
}
1495
else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED))
1496
{
1497
env->token.len = 1;
1498
w = magic(env, *env->cursor, 2);
1499
if (env->token.len > 1 || w != T_BAD)
1500
{
1501
if (env->token.len == 1 && (f = classfun(w)))
1502
{
1503
if (inrange > 1)
1504
{
1505
if (env->type < SRE && !(env->flags & (REG_LENIENT|REG_REGEXP)))
1506
goto erange;
1507
inrange = 0;
1508
}
1509
env->cursor++;
1510
ce->fun = f;
1511
ce->typ = COLL_call;
1512
ce++;
1513
continue;
1514
}
1515
if (env->token.len > 1 || w >= 0 && w < T_META)
1516
{
1517
c = w;
1518
w = mbconv(mbc, c);
1519
pp = (unsigned char*)mbc;
1520
env->cursor += env->token.len;
1521
}
1522
}
1523
}
1524
}
1525
}
1526
else if (c == ']')
1527
{
1528
if (env->cursor == begin)
1529
{
1530
rp = pp;
1531
rw = w;
1532
inrange = 1;
1533
continue;
1534
}
1535
if (inrange != 0)
1536
{
1537
ce = col(ce, ic, rp, rw, rc, NiL, 0, 0);
1538
if (inrange == 2)
1539
ce = col(ce, ic, NiL, 1, '-', NiL, 0, 0);
1540
}
1541
break;
1542
}
1543
else if (c == '-')
1544
{
1545
if (!inrange && env->cursor != begin && *env->cursor != ']')
1546
{
1547
if (env->type < SRE && !(env->flags & (REG_LENIENT|REG_REGEXP)))
1548
goto erange;
1549
continue;
1550
}
1551
else if (inrange == 1)
1552
{
1553
inrange = 2;
1554
continue;
1555
}
1556
}
1557
else if (c == '[')
1558
{
1559
switch (*env->cursor)
1560
{
1561
case 0:
1562
goto error;
1563
case ':':
1564
if (env->flags & REG_REGEXP)
1565
goto complicated_normal;
1566
if (inrange == 1)
1567
ce = col(ce, ic, rp, rw, rc, NiL, 0, 0);
1568
if (!(f = regclass((char*)env->cursor, (char**)&env->cursor)))
1569
{
1570
if (env->cursor == start && (c = *(env->cursor + 1)) && *(env->cursor + 2) == ':' && *(env->cursor + 3) == ']' && *(env->cursor + 4) == ']')
1571
{
1572
switch (c)
1573
{
1574
case '<':
1575
i = REX_WBEG;
1576
break;
1577
case '>':
1578
i = REX_WEND;
1579
break;
1580
default:
1581
i = 0;
1582
break;
1583
}
1584
if (i)
1585
{
1586
env->cursor += 5;
1587
drop(env->disc, e);
1588
return node(env, i, 0, 0, 0);
1589
}
1590
}
1591
env->error = REG_ECTYPE;
1592
goto error;
1593
}
1594
ce->fun = f;
1595
ce->typ = COLL_call;
1596
ce++;
1597
inrange = 0;
1598
continue;
1599
case '=':
1600
if (env->flags & REG_REGEXP)
1601
goto complicated_normal;
1602
if (inrange == 2)
1603
goto erange;
1604
if (inrange == 1)
1605
ce = col(ce, ic, rp, rw, rc, NiL, 0, 0);
1606
pp = (unsigned char*)cb[inrange];
1607
rp = env->cursor + 1;
1608
if ((rw = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX, &wc)) < 0)
1609
goto ecollate;
1610
c = 0;
1611
if (ic)
1612
{
1613
if (iswupper(wc))
1614
{
1615
wc = towlower(wc);
1616
rw = mbconv((char*)pp, wc);
1617
c = 'u';
1618
}
1619
else if (iswlower(wc))
1620
c = 'l';
1621
}
1622
i = 1;
1623
for (;;)
1624
{
1625
mbxfrm(key.key, (char*)pp, COLL_KEY_MAX);
1626
if (!(cc = (Cchr_t*)dtsearch(dt, &key)) && !(cc = (Cchr_t*)dtprev(dt, &key)))
1627
{
1628
if (i)
1629
{
1630
c = *pp;
1631
goto singleton;
1632
}
1633
goto ecollate;
1634
}
1635
xc = (tc = (Cchr_t*)dtprev(dt, cc)) && !strcasecmp((char*)tc->nam, (char*)cc->nam) ? tc : cc;
1636
if (c == 'l' || c == 'L' && !(c = 0))
1637
ce->typ = COLL_range_lc;
1638
else if (c == 'u' || c == 'U' && !(c = 0))
1639
ce->typ = COLL_range_uc;
1640
else
1641
ce->typ = COLL_range;
1642
strcpy((char*)ce->beg, (char*)xc->key);
1643
if (!(cc = (Cchr_t*)dtnext(dt, cc)))
1644
{
1645
if (i)
1646
{
1647
c = *pp;
1648
goto singleton;
1649
}
1650
goto ecollate;
1651
}
1652
if (!strcasecmp((char*)xc->nam, (char*)cc->nam) && (tc = (Cchr_t*)dtnext(dt, cc)))
1653
cc = tc;
1654
strcpy((char*)ce->end, (char*)cc->key);
1655
ce->max = -1;
1656
ce++;
1657
if (!c)
1658
break;
1659
if (c == 'u')
1660
{
1661
wc = towlower(wc);
1662
c = 'L';
1663
}
1664
else
1665
{
1666
wc = towupper(wc);
1667
c = 'U';
1668
}
1669
rw = mbconv((char*)pp, wc);
1670
i = 0;
1671
}
1672
inrange = 0;
1673
c = *pp;
1674
continue;
1675
case '.':
1676
if (env->flags & REG_REGEXP)
1677
goto complicated_normal;
1678
pp = (unsigned char*)cb[inrange];
1679
if ((w = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX, NiL)) < 0)
1680
goto ecollate;
1681
c = *pp;
1682
break;
1683
default:
1684
complicated_normal:
1685
if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE))
1686
goto error;
1687
break;
1688
}
1689
}
1690
singleton:
1691
if (inrange == 2)
1692
{
1693
ce = col(ce, ic, rp, rw, rc, pp, w, c);
1694
if (strcmp((char*)ce->beg, (char*)ce->end) > 0)
1695
{
1696
if (env->type < SRE && !(env->flags & (REG_LENIENT|REG_REGEXP)))
1697
goto erange;
1698
(ce-1)->typ = COLL_char;
1699
strcpy((char*)ce->beg, (char*)(ce-1)->end);
1700
ce->typ = COLL_char;
1701
ce++;
1702
}
1703
inrange = env->type >= SRE || (env->flags & (REG_LENIENT|REG_REGEXP));
1704
}
1705
else if (inrange == 1)
1706
ce = col(ce, ic, rp, rw, rc, NiL, 0, 0);
1707
else
1708
inrange = 1;
1709
rp = pp;
1710
rw = w;
1711
rc = c;
1712
}
1713
ce->typ = COLL_end;
1714
return e;
1715
}
1716
}
1717
#endif
1718
if (collate)
1719
goto ecollate;
1720
if (env->flags & REG_ICASE)
1721
for (i = 0; i <= UCHAR_MAX; i++)
1722
if (settst(e->re.charclass, i))
1723
{
1724
if (isupper(i))
1725
c = tolower(i);
1726
else if (islower(i))
1727
c = toupper(i);
1728
else
1729
continue;
1730
setadd(e->re.charclass, c);
1731
}
1732
if (neg)
1733
{
1734
for (i = 0; i < elementsof(e->re.charclass->bits); i++)
1735
e->re.charclass->bits[i] ^= ~0;
1736
if (env->explicit >= 0)
1737
setclr(e->re.charclass, env->explicit);
1738
}
1739
return e;
1740
ecollate:
1741
env->error = REG_ECOLLATE;
1742
goto error;
1743
erange:
1744
env->error = REG_ERANGE;
1745
error:
1746
drop(env->disc, e);
1747
if (!env->error)
1748
env->error = REG_EBRACK;
1749
return 0;
1750
}
1751
1752
static Rex_t*
1753
ccl(Cenv_t* env, int type)
1754
{
1755
int i;
1756
Rex_t* e;
1757
Celt_t* ce;
1758
regclass_t f;
1759
1760
if (!(f = classfun(type)))
1761
{
1762
env->error = REG_BADESC;
1763
return 0;
1764
}
1765
if (!mbcoll())
1766
{
1767
if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t))))
1768
return 0;
1769
for (i = 0; i <= UCHAR_MAX; i++)
1770
if ((*f)(i))
1771
setadd(e->re.charclass, i);
1772
if (env->explicit >= 0)
1773
setclr(e->re.charclass, env->explicit);
1774
}
1775
else
1776
{
1777
if (!(e = node(env, REX_COLL_CLASS, 1, 1, 2 * sizeof(Celt_t))))
1778
return 0;
1779
ce = (Celt_t*)e->re.data;
1780
e->re.collate.invert = 0;
1781
e->re.collate.elements = ce;
1782
ce->fun = f;
1783
ce->typ = COLL_call;
1784
ce++;
1785
ce->typ = COLL_end;
1786
}
1787
return e;
1788
}
1789
1790
static Rex_t*
1791
rep(Cenv_t* env, Rex_t* e, int number, int last)
1792
{
1793
Rex_t* f;
1794
unsigned long m = 0;
1795
unsigned long n = RE_DUP_INF;
1796
int minimal = -1;
1797
1798
if (!e)
1799
return 0;
1800
switch (token(env))
1801
{
1802
case T_BANG:
1803
eat(env);
1804
if (!(f = node(env, REX_NEG, m, n, 0)))
1805
{
1806
drop(env->disc, e);
1807
return 0;
1808
}
1809
f->re.group.expr.rex = e;
1810
return f;
1811
case T_QUES:
1812
eat(env);
1813
n = 1;
1814
break;
1815
case T_STAR:
1816
eat(env);
1817
break;
1818
case T_PLUS:
1819
eat(env);
1820
m = 1;
1821
break;
1822
case T_LEFT:
1823
eat(env);
1824
m = env->token.min;
1825
n = env->token.max;
1826
break;
1827
default:
1828
return e;
1829
}
1830
if (env->token.att)
1831
minimal = 1;
1832
else if (env->type < SRE)
1833
switch (token(env))
1834
{
1835
case T_QUES:
1836
eat(env);
1837
minimal = !(env->flags & REG_MINIMAL);
1838
break;
1839
case T_STAR: /*AST*/
1840
eat(env);
1841
minimal = !!(env->flags & REG_MINIMAL);
1842
break;
1843
}
1844
switch (e->type)
1845
{
1846
case REX_DOT:
1847
case REX_CLASS:
1848
case REX_COLL_CLASS:
1849
case REX_ONECHAR:
1850
e->lo = m;
1851
e->hi = n;
1852
if (minimal >= 0)
1853
mark(e, minimal);
1854
return e;
1855
#if HUH_2002_08_07
1856
case REX_BEG:
1857
#endif
1858
case REX_BEG_STR:
1859
case REX_END_STR:
1860
case REX_FIN_STR:
1861
case REX_WBEG:
1862
case REX_WEND:
1863
case REX_WORD:
1864
case REX_WORD_NOT:
1865
env->error = REG_BADRPT;
1866
drop(env->disc, e);
1867
return 0;
1868
}
1869
if (m == 1 && n == 1)
1870
{
1871
if (minimal >= 0)
1872
mark(e, minimal);
1873
return e;
1874
}
1875
if (!(f = node(env, REX_REP, m, n, 0)))
1876
{
1877
drop(env->disc, e);
1878
return 0;
1879
}
1880
f->re.group.expr.rex = e;
1881
f->re.group.number = number;
1882
f->re.group.last = last;
1883
if (minimal >= 0)
1884
mark(f, minimal);
1885
if (m <= n && n)
1886
{
1887
for (; e && e->type >= REX_GROUP && e->type <= REX_GROUP_CUT; e = e->re.group.expr.rex);
1888
if (e && e->type == REX_NEG)
1889
f->type = REX_GROUP;
1890
}
1891
return f;
1892
}
1893
1894
static int
1895
isstring(Rex_t* e)
1896
{
1897
switch (e->type)
1898
{
1899
case REX_ONECHAR:
1900
return e->lo == 1 && e->hi == 1;
1901
case REX_STRING:
1902
return 1;
1903
}
1904
return 0;
1905
}
1906
1907
static Trie_node_t*
1908
trienode(Cenv_t* env, int c)
1909
{
1910
Trie_node_t* t;
1911
1912
if (t = (Trie_node_t*)alloc(env->disc, 0, sizeof(Trie_node_t)))
1913
{
1914
memset(t, 0, sizeof(Trie_node_t));
1915
t->c = c;
1916
}
1917
return t;
1918
}
1919
1920
static int
1921
insert(Cenv_t* env, Rex_t* f, Rex_t* g)
1922
{
1923
unsigned char* s;
1924
unsigned char* e;
1925
Trie_node_t* t;
1926
int len;
1927
unsigned char tmp[2];
1928
1929
switch (f->type)
1930
{
1931
case REX_ONECHAR:
1932
*(s = tmp) = f->re.onechar;
1933
e = s + 1;
1934
break;
1935
case REX_STRING:
1936
s = f->re.string.base;
1937
e = s + f->re.string.size;
1938
break;
1939
default:
1940
return 1;
1941
}
1942
if (!(t = g->re.trie.root[*s]) && !(t = g->re.trie.root[*s] = trienode(env, *s)))
1943
return 1;
1944
for (len = 1;;)
1945
{
1946
if (t->c == *s)
1947
{
1948
if (++s >= e)
1949
break;
1950
if (!t->son && !(t->son = trienode(env, *s)))
1951
return 1;
1952
t = t->son;
1953
len++;
1954
}
1955
else
1956
{
1957
if (!t->sib && !(t->sib = trienode(env, *s)))
1958
return 1;
1959
t = t->sib;
1960
}
1961
}
1962
if (g->re.trie.min > len)
1963
g->re.trie.min = len;
1964
if (g->re.trie.max < len)
1965
g->re.trie.max = len;
1966
t->end = 1;
1967
return 0;
1968
}
1969
1970
/*
1971
* trie() tries to combine nontrivial e and f into a REX_TRIE
1972
* unless 0 is returned, e and f are deleted as far as possible
1973
* also called by regcomb
1974
*/
1975
1976
static Rex_t*
1977
trie(Cenv_t* env, Rex_t* e, Rex_t* f)
1978
{
1979
Rex_t* g;
1980
1981
if (e->next || f->next || !isstring(e) || e->flags != f->flags)
1982
return 0;
1983
if (isstring(f))
1984
{
1985
if (!(g = node(env, REX_TRIE, 0, 0, (UCHAR_MAX + 1) * sizeof(Trie_node_t*))))
1986
return 0;
1987
g->re.trie.min = INT_MAX;
1988
if (insert(env, f, g))
1989
goto nospace;
1990
drop(env->disc, f);
1991
}
1992
else if (f->type != REX_TRIE)
1993
return 0;
1994
else
1995
g = f;
1996
if (insert(env, e, g))
1997
goto nospace;
1998
drop(env->disc, e);
1999
return g;
2000
nospace:
2001
if (g != f)
2002
drop(env->disc, g);
2003
return 0;
2004
}
2005
2006
static Rex_t* alt(Cenv_t*, int, int);
2007
2008
static int
2009
chr(register Cenv_t* env, int* escaped)
2010
{
2011
unsigned char* p;
2012
int c;
2013
2014
*escaped = 0;
2015
if (!(c = *env->cursor))
2016
return -1;
2017
env->cursor++;
2018
if (c == '\\')
2019
{
2020
if (env->flags & REG_SHELL_ESCAPED)
2021
return c;
2022
if (!(c = *(env->cursor + 1)) || c == env->terminator)
2023
{
2024
if (env->flags & (REG_LENIENT|REG_REGEXP))
2025
return c ? c : '\\';
2026
env->error = REG_EESCAPE;
2027
return -1;
2028
}
2029
p = env->cursor;
2030
c = chresc((char*)env->cursor - 1, (char**)&env->cursor);
2031
*escaped = env->cursor - p;
2032
}
2033
return c;
2034
}
2035
2036
/*
2037
* open the perly gates
2038
*/
2039
2040
static Rex_t*
2041
grp(Cenv_t* env, int parno)
2042
{
2043
Rex_t* e;
2044
Rex_t* f;
2045
int c;
2046
int g;
2047
int i;
2048
int n;
2049
int x;
2050
int esc;
2051
int typ;
2052
int beg;
2053
unsigned char* p;
2054
2055
g = env->flags;
2056
beg = env->pattern == env->cursor - env->token.len;
2057
if (!(c = env->token.lex) && (c = *env->cursor))
2058
env->cursor++;
2059
env->token.len = 0;
2060
env->parnest++;
2061
typ = -1;
2062
switch (c)
2063
{
2064
case '-':
2065
case '+':
2066
case 'a':
2067
case 'g':
2068
case 'i':
2069
case 'l':
2070
case 'm':
2071
case 'p':
2072
case 'r':
2073
case 's':
2074
case 'x':
2075
case 'A':
2076
case 'B':
2077
case 'E':
2078
case 'F':
2079
case 'G':
2080
case 'I':
2081
case 'K':
2082
case 'L':
2083
case 'M': /* glob(3) */
2084
case 'N': /* glob(3) */
2085
case 'O': /* glob(3) */
2086
case 'P':
2087
case 'R': /* pcre */
2088
case 'S':
2089
case 'U': /* pcre */
2090
case 'V':
2091
case 'X': /* pcre */
2092
x = REX_GROUP;
2093
i = 1;
2094
env->token.push = 1;
2095
for (;;)
2096
{
2097
switch (c)
2098
{
2099
case ')':
2100
if (!(env->flags & REG_LITERAL))
2101
{
2102
env->error = REG_BADRPT;
2103
return 0;
2104
}
2105
/*FALLTHROUGH*/
2106
case 0:
2107
case T_CLOSE:
2108
x = 0;
2109
goto done;
2110
case ':':
2111
eat(env);
2112
if (token(env) == T_CLOSE)
2113
x = 0;
2114
goto done;
2115
case '-':
2116
i = 0;
2117
break;
2118
case '+':
2119
i = 1;
2120
break;
2121
case 'a':
2122
if (i)
2123
env->flags |= (REG_LEFT|REG_RIGHT);
2124
else
2125
env->flags &= ~(REG_LEFT|REG_RIGHT);
2126
break;
2127
case 'g':
2128
if (i)
2129
env->flags &= ~REG_MINIMAL;
2130
else
2131
env->flags |= REG_MINIMAL;
2132
break;
2133
case 'i':
2134
if (i)
2135
env->flags |= REG_ICASE;
2136
else
2137
env->flags &= ~REG_ICASE;
2138
break;
2139
case 'l':
2140
if (i)
2141
env->flags |= REG_LEFT;
2142
else
2143
env->flags &= ~REG_LEFT;
2144
break;
2145
case 'm':
2146
if (i)
2147
env->flags |= REG_NEWLINE;
2148
else
2149
env->flags &= ~REG_NEWLINE;
2150
env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1;
2151
break;
2152
case 'p':
2153
if (i)
2154
env->flags &= ~REG_LENIENT;
2155
else
2156
env->flags |= REG_LENIENT;
2157
break;
2158
case 'r':
2159
if (i)
2160
env->flags |= REG_RIGHT;
2161
else
2162
env->flags &= ~REG_RIGHT;
2163
break;
2164
case 's':
2165
if (i)
2166
env->flags |= REG_SPAN;
2167
else
2168
env->flags &= ~REG_SPAN;
2169
env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1;
2170
break;
2171
case 'x':
2172
if (i)
2173
env->flags |= REG_COMMENT;
2174
else
2175
env->flags &= ~REG_COMMENT;
2176
break;
2177
case 'X':
2178
if (typ >= 0 || env->type == ERE && (env->flags & REG_CLASS_ESCAPE))
2179
break; /* PCRE_EXTRA */
2180
/*FALLTHROUGH*/
2181
case 'A':
2182
env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT);
2183
env->flags |= REG_AUGMENTED|REG_EXTENDED;
2184
typ = ARE;
2185
break;
2186
case 'B':
2187
case 'G':
2188
env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT);
2189
typ = BRE;
2190
break;
2191
case 'E':
2192
env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT);
2193
env->flags |= REG_EXTENDED;
2194
typ = ERE;
2195
break;
2196
case 'F':
2197
case 'L':
2198
env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT);
2199
env->flags |= REG_LITERAL;
2200
typ = ERE;
2201
break;
2202
case 'K':
2203
env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT);
2204
env->flags |= REG_AUGMENTED|REG_SHELL|REG_LEFT|REG_RIGHT;
2205
typ = KRE;
2206
break;
2207
case 'M':
2208
/* used by caller to disable glob(3) GLOB_BRACE */
2209
break;
2210
case 'N':
2211
/* used by caller to disable glob(3) GLOB_NOCHECK */
2212
break;
2213
case 'O':
2214
/* used by caller to disable glob(3) GLOB_STARSTAR */
2215
break;
2216
case 'P':
2217
env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT);
2218
env->flags |= REG_EXTENDED|REG_CLASS_ESCAPE;
2219
typ = ERE;
2220
break;
2221
case 'S':
2222
env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT);
2223
env->flags |= REG_SHELL|REG_LEFT|REG_RIGHT;
2224
typ = SRE;
2225
break;
2226
case 'U': /* PCRE_UNGREEDY */
2227
if (typ >= 0 || env->type == ERE && (env->flags & REG_CLASS_ESCAPE))
2228
{
2229
if (i)
2230
env->flags |= REG_MINIMAL;
2231
else
2232
env->flags &= ~REG_MINIMAL;
2233
}
2234
break;
2235
case 'V':
2236
env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT);
2237
env->flags |= REG_REGEXP;
2238
typ = BRE;
2239
break;
2240
default:
2241
env->error = REG_BADRPT;
2242
return 0;
2243
}
2244
eat(env);
2245
c = token(env);
2246
}
2247
done:
2248
break;
2249
case ':':
2250
x = REX_GROUP;
2251
break;
2252
case '=':
2253
x = REX_GROUP_AHEAD;
2254
break;
2255
case '!':
2256
x = REX_GROUP_AHEAD_NOT;
2257
break;
2258
case '<':
2259
switch (token(env))
2260
{
2261
case '=':
2262
x = REX_GROUP_BEHIND;
2263
break;
2264
case '!':
2265
case T_BANG:
2266
x = REX_GROUP_BEHIND_NOT;
2267
break;
2268
default:
2269
env->error = REG_BADRPT;
2270
return 0;
2271
}
2272
eat(env);
2273
break;
2274
case '>':
2275
x = REX_GROUP_CUT;
2276
break;
2277
case '%':
2278
case T_PERCENT:
2279
e = node(env, REX_NEST, 0, 0, (UCHAR_MAX + 1) * sizeof(unsigned short));
2280
e->re.nest.primary = isalnum(*env->cursor) ? -1 : *env->cursor;
2281
n = 1;
2282
for (;;)
2283
{
2284
switch (i = chr(env, &esc))
2285
{
2286
case -1:
2287
case 0:
2288
invalid:
2289
env->cursor -= esc + 1;
2290
env->error = REG_EPAREN;
2291
return 0;
2292
case 'D':
2293
x = REX_NEST_delimiter;
2294
/*FALLTHROUGH*/
2295
delimiter:
2296
if ((i = chr(env, &esc)) < 0)
2297
goto invalid;
2298
if (e->re.nest.type[i] & ~x)
2299
goto invalid;
2300
e->re.nest.type[i] = x;
2301
continue;
2302
case 'E':
2303
x = REX_NEST_escape;
2304
goto delimiter;
2305
case 'L':
2306
x = REX_NEST_literal;
2307
goto quote;
2308
case 'O':
2309
switch (i = chr(env, &esc))
2310
{
2311
case 'T':
2312
e->re.nest.type[UCHAR_MAX+1] |= REX_NEST_terminator;
2313
break;
2314
default:
2315
goto invalid;
2316
}
2317
continue;
2318
case 'Q':
2319
x = REX_NEST_quote;
2320
/*FALLTHROUGH*/
2321
quote:
2322
if ((i = chr(env, &esc)) < 0)
2323
goto invalid;
2324
if (e->re.nest.type[i] & ~x)
2325
goto invalid;
2326
e->re.nest.type[i] = x|REX_NEST_open|REX_NEST_close|(i<<REX_NEST_SHIFT);
2327
continue;
2328
case 'S':
2329
x = REX_NEST_separator;
2330
goto delimiter;
2331
case 'T':
2332
x = REX_NEST_terminator;
2333
goto delimiter;
2334
case '|':
2335
case '&':
2336
if (!esc)
2337
goto invalid;
2338
goto nesting;
2339
case '(':
2340
if (!esc)
2341
n++;
2342
goto nesting;
2343
case ')':
2344
if (!esc && !--n)
2345
break;
2346
goto nesting;
2347
default:
2348
nesting:
2349
if (isalnum(i) || (e->re.nest.type[i] & (REX_NEST_close|REX_NEST_escape|REX_NEST_literal|REX_NEST_quote|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator)))
2350
goto invalid;
2351
e->re.nest.type[i] = REX_NEST_open;
2352
if ((x = chr(env, &esc)) < 0 || (e->re.nest.type[x] & (REX_NEST_close|REX_NEST_escape|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator)))
2353
goto invalid;
2354
if (!esc)
2355
{
2356
if (x == ')' && !--n)
2357
goto invalid;
2358
else if (x == '(')
2359
n++;
2360
}
2361
e->re.nest.type[x] |= REX_NEST_close;
2362
e->re.nest.type[i] |= x << REX_NEST_SHIFT;
2363
continue;
2364
}
2365
break;
2366
}
2367
env->parnest--;
2368
if (c == T_PERCENT)
2369
for (n = 0; n < 2; n++)
2370
{
2371
parno = ++env->parno;
2372
if (!(f = node(env, REX_GROUP, 0, 0, 0)))
2373
{
2374
drop(env->disc, e);
2375
return 0;
2376
}
2377
if (parno < elementsof(env->paren))
2378
env->paren[parno] = f;
2379
f->re.group.back = 0;
2380
f->re.group.number = parno;
2381
f->re.group.expr.rex = e;
2382
e = f;
2383
}
2384
return e;
2385
case '(':
2386
c = 0;
2387
if (isdigit(*env->cursor))
2388
{
2389
f = 0;
2390
do
2391
{
2392
if (c > (INT_MAX / 10))
2393
{
2394
env->error = REG_BADRPT;
2395
return 0;
2396
}
2397
c = c * 10 + (*env->cursor++ - '0');
2398
} while (isdigit(*env->cursor));
2399
if (*env->cursor++ != ')')
2400
{
2401
env->error = REG_BADRPT;
2402
return 0;
2403
}
2404
if (c && env->type >= SRE)
2405
c = c * 2 - 1;
2406
if (!c || c > env->parno || !env->paren[c])
2407
{
2408
if (!(env->flags & (REG_LENIENT|REG_REGEXP)))
2409
{
2410
env->error = REG_ESUBREG;
2411
return 0;
2412
}
2413
if (c)
2414
c = -1;
2415
}
2416
}
2417
else
2418
{
2419
if (env->type < SRE && *env->cursor++ != '?')
2420
{
2421
env->error = REG_BADRPT;
2422
return 0;
2423
}
2424
if (!(f = grp(env, parno + 1)) && env->error)
2425
return 0;
2426
}
2427
if (!(e = node(env, REX_GROUP_COND, 0, 0, 0)))
2428
{
2429
drop(env->disc, f);
2430
return 0;
2431
}
2432
e->re.group.size = c;
2433
e->re.group.expr.binary.left = f;
2434
if (!(e->re.group.expr.binary.right = alt(env, parno, 1)))
2435
{
2436
drop(env->disc, e);
2437
return 0;
2438
}
2439
if (token(env) != T_CLOSE)
2440
{
2441
env->error = REG_EPAREN;
2442
return 0;
2443
}
2444
eat(env);
2445
env->parnest--;
2446
return rep(env, e, parno, parno);
2447
case '{':
2448
p = env->cursor;
2449
n = 1;
2450
while (c = *env->cursor)
2451
{
2452
if (c == '\\' && *(env->cursor + 1))
2453
env->cursor++;
2454
else if (c == '{')
2455
n++;
2456
else if (c == '}' && !--n)
2457
break;
2458
else if (c == env->delimiter || c == env->terminator)
2459
break;
2460
env->cursor++;
2461
}
2462
if (c != '}')
2463
{
2464
env->error = REG_EBRACE;
2465
return 0;
2466
}
2467
if (*++env->cursor != ')')
2468
{
2469
env->error = REG_EPAREN;
2470
return 0;
2471
}
2472
env->cursor++;
2473
env->parnest--;
2474
if (env->disc->re_version < REG_VERSION_EXEC)
2475
{
2476
env->error = REG_BADRPT;
2477
return 0;
2478
}
2479
if (!env->disc->re_execf)
2480
return 0;
2481
if (!(e = node(env, REX_EXEC, 0, 0, 0)))
2482
return 0;
2483
e->re.exec.text = (const char*)p;
2484
e->re.exec.size = env->cursor - p - 2;
2485
if (!env->disc->re_compf)
2486
e->re.exec.data = 0;
2487
else
2488
e->re.exec.data = (*env->disc->re_compf)(env->regex, e->re.exec.text, e->re.exec.size, env->disc);
2489
return e;
2490
case '0': case '1': case '2': case '3': case '4':
2491
case '5': case '6': case '7': case '8': case '9':
2492
c -= '0';
2493
while (isdigit(*env->cursor))
2494
{
2495
if (c > (INT_MAX / 10))
2496
{
2497
env->error = REG_ESUBREG;
2498
return 0;
2499
}
2500
c = c * 10 + *env->cursor++ - '0';
2501
}
2502
if (*env->cursor == ')')
2503
{
2504
env->cursor++;
2505
env->parnest--;
2506
env->token.len = 1;
2507
if (c > env->parno || !env->paren[c])
2508
{
2509
env->error = REG_ESUBREG;
2510
return 0;
2511
}
2512
env->paren[c]->re.group.back = 1;
2513
return rep(env, node(env, REX_BACK, c, 0, 0), 0, 0);
2514
}
2515
/*FALLTHROUGH*/
2516
default:
2517
env->error = REG_BADRPT;
2518
return 0;
2519
}
2520
p = env->pattern;
2521
i = env->type;
2522
if (x)
2523
{
2524
if (typ >= 0)
2525
env->type = typ;
2526
if (!(e = alt(env, parno, 0)))
2527
goto nope;
2528
env->flags = g;
2529
env->type = i;
2530
}
2531
c = token(env);
2532
env->parnest--;
2533
if (c != T_CLOSE && (!(env->flags & REG_LITERAL) || c != ')'))
2534
{
2535
env->error = REG_EPAREN;
2536
goto nope;
2537
}
2538
eat(env);
2539
if (typ >= 0 && beg)
2540
env->pattern = env->cursor;
2541
if (!x)
2542
{
2543
if (typ >= 0)
2544
env->type = typ;
2545
return 0;
2546
}
2547
if (!(f = node(env, x, 0, 0, 0)))
2548
{
2549
drop(env->disc, e);
2550
goto nope;
2551
}
2552
f->re.group.expr.rex = e;
2553
if (x == REX_GROUP_BEHIND || x == REX_GROUP_BEHIND_NOT)
2554
{
2555
if (stats(env, e))
2556
{
2557
drop(env->disc, f);
2558
if (!env->error)
2559
env->error = REG_ECOUNT;
2560
goto nope;
2561
}
2562
f->re.group.size = env->stats.m;
2563
memset(&env->stats, 0, sizeof(env->stats));
2564
}
2565
switch (x)
2566
{
2567
case REX_GROUP:
2568
case REX_GROUP_CUT:
2569
f = rep(env, f, parno, env->parno);
2570
break;
2571
}
2572
if (f)
2573
return f;
2574
nope:
2575
env->flags = g;
2576
env->pattern = p;
2577
env->type = i;
2578
return 0;
2579
}
2580
2581
static Rex_t*
2582
seq(Cenv_t* env)
2583
{
2584
Rex_t* e;
2585
Rex_t* f;
2586
Token_t tok;
2587
int c;
2588
int i;
2589
int n;
2590
int x;
2591
int parno;
2592
int type;
2593
regflags_t flags;
2594
unsigned char* s;
2595
unsigned char* p;
2596
unsigned char* t;
2597
unsigned char* u;
2598
unsigned char buf[256];
2599
2600
for (;;)
2601
{
2602
s = buf;
2603
while ((c = token(env)) < T_META && s < &buf[sizeof(buf) - env->token.len])
2604
{
2605
x = c;
2606
p = env->cursor;
2607
if (c >= 0)
2608
{
2609
n = 1;
2610
*s++ = (env->flags & REG_ICASE) ? toupper(c) : c;
2611
}
2612
else if (c == C_ESC || (env->flags & REG_ICASE))
2613
{
2614
c = (c == C_ESC) ? env->token.lex : mbchar(p);
2615
if (env->flags & REG_ICASE)
2616
c = towupper(c);
2617
if ((&buf[sizeof(buf)] - s) < MB_CUR_MAX)
2618
break;
2619
if ((n = mbconv((char*)s, c)) < 0)
2620
*s++ = c;
2621
else if (n)
2622
s += n;
2623
else
2624
{
2625
n = 1;
2626
*s++ = 0;
2627
}
2628
}
2629
else
2630
{
2631
n = env->token.len - env->token.esc;
2632
for (t = p, u = s + n; s < u; *s++ = *t++);
2633
}
2634
eat(env);
2635
}
2636
if (c == T_BAD)
2637
return 0;
2638
if (s > buf)
2639
switch (c)
2640
{
2641
case T_STAR:
2642
case T_PLUS:
2643
case T_LEFT:
2644
case T_QUES:
2645
case T_BANG:
2646
if ((s -= n) == buf)
2647
e = 0;
2648
else
2649
{
2650
i = s - buf;
2651
if (!(e = node(env, REX_STRING, 0, 0, i)))
2652
return 0;
2653
memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, i);
2654
e->re.string.size = i;
2655
}
2656
if (x >= 0)
2657
{
2658
if (!(f = node(env, REX_ONECHAR, 1, 1, 0)))
2659
{
2660
drop(env->disc, e);
2661
return 0;
2662
}
2663
f->re.onechar = (env->flags & REG_ICASE) ? toupper(x) : x;
2664
}
2665
else
2666
{
2667
if (!(f = node(env, REX_STRING, 0, 0, n)))
2668
return 0;
2669
memcpy((char*)(f->re.string.base = (unsigned char*)f->re.data), (char*)p, n);
2670
f->re.string.size = n;
2671
}
2672
if (!(f = rep(env, f, 0, 0)) || !(f = cat(env, f, seq(env))))
2673
{
2674
drop(env->disc, e);
2675
return 0;
2676
}
2677
if (e)
2678
f = cat(env, e, f);
2679
return f;
2680
default:
2681
c = s - buf;
2682
if (!(e = node(env, REX_STRING, 0, 0, c)))
2683
return 0;
2684
memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, c);
2685
e->re.string.size = c;
2686
return cat(env, e, seq(env));
2687
}
2688
else if (c > T_BACK)
2689
{
2690
eat(env);
2691
c -= T_BACK;
2692
if (c > env->parno || !env->paren[c])
2693
{
2694
env->error = REG_ESUBREG;
2695
return 0;
2696
}
2697
env->paren[c]->re.group.back = 1;
2698
e = rep(env, node(env, REX_BACK, c, 0, 0), 0, 0);
2699
}
2700
else
2701
switch (c)
2702
{
2703
case T_AND:
2704
case T_CLOSE:
2705
case T_BAR:
2706
case T_END:
2707
return node(env, REX_NULL, 0, 0, 0);
2708
case T_DOLL:
2709
eat(env);
2710
e = rep(env, node(env, REX_END, 0, 0, 0), 0, 0);
2711
break;
2712
case T_CFLX:
2713
eat(env);
2714
if ((e = node(env, REX_BEG, 0, 0, 0)) && (env->flags & REG_EXTENDED))
2715
e = rep(env, e, 0, 0);
2716
break;
2717
case T_OPEN:
2718
tok = env->token;
2719
eat(env);
2720
flags = env->flags;
2721
type = env->type;
2722
if (env->token.att)
2723
env->flags |= REG_MINIMAL;
2724
env->parnest++;
2725
if (env->type == KRE)
2726
++env->parno;
2727
parno = ++env->parno;
2728
if (!(e = alt(env, parno + 1, 0)))
2729
break;
2730
if (e->type == REX_NULL && env->type == ERE && !(env->flags & (REG_NULL|REG_REGEXP)))
2731
{
2732
drop(env->disc, e);
2733
env->error = (*env->cursor == 0 || *env->cursor == env->delimiter || *env->cursor == env->terminator) ? REG_EPAREN : REG_ENULL;
2734
return 0;
2735
}
2736
if (token(env) != T_CLOSE)
2737
{
2738
drop(env->disc, e);
2739
env->error = REG_EPAREN;
2740
return 0;
2741
}
2742
env->parnest--;
2743
eat(env);
2744
if (!(f = node(env, REX_GROUP, 0, 0, 0)))
2745
{
2746
drop(env->disc, e);
2747
return 0;
2748
}
2749
if (parno < elementsof(env->paren))
2750
env->paren[parno] = f;
2751
f->re.group.back = 0;
2752
f->re.group.number = parno;
2753
f->re.group.expr.rex = e;
2754
if (tok.lex)
2755
{
2756
tok.push = 1;
2757
env->token = tok;
2758
}
2759
if (!(e = rep(env, f, parno, env->parno)))
2760
return 0;
2761
if (env->type == KRE)
2762
{
2763
if (!(f = node(env, REX_GROUP, 0, 0, 0)))
2764
{
2765
drop(env->disc, e);
2766
return 0;
2767
}
2768
if (--parno < elementsof(env->paren))
2769
env->paren[parno] = f;
2770
f->re.group.back = 0;
2771
f->re.group.number = parno;
2772
f->re.group.expr.rex = e;
2773
e = f;
2774
}
2775
env->flags = flags;
2776
env->type = type;
2777
break;
2778
case T_GROUP:
2779
p = env->cursor;
2780
eat(env);
2781
flags = env->flags;
2782
type = env->type;
2783
if (!(e = grp(env, env->parno + 1)))
2784
{
2785
if (env->error)
2786
return 0;
2787
if (env->literal == env->pattern && env->literal == p)
2788
env->literal = env->cursor;
2789
continue;
2790
}
2791
env->flags = flags;
2792
env->type = type;
2793
break;
2794
case T_BRA:
2795
eat(env);
2796
if (e = bra(env))
2797
e = rep(env, e, 0, 0);
2798
break;
2799
case T_ALNUM:
2800
case T_ALNUM_NOT:
2801
case T_DIGIT:
2802
case T_DIGIT_NOT:
2803
case T_SPACE:
2804
case T_SPACE_NOT:
2805
eat(env);
2806
if (e = ccl(env, c))
2807
e = rep(env, e, 0, 0);
2808
break;
2809
case T_LT:
2810
eat(env);
2811
e = rep(env, node(env, REX_WBEG, 0, 0, 0), 0, 0);
2812
break;
2813
case T_GT:
2814
eat(env);
2815
e = rep(env, node(env, REX_WEND, 0, 0, 0), 0, 0);
2816
break;
2817
case T_DOT:
2818
eat(env);
2819
e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0);
2820
break;
2821
case T_DOTSTAR:
2822
eat(env);
2823
env->token.lex = T_STAR;
2824
env->token.push = 1;
2825
e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0);
2826
break;
2827
case T_SLASHPLUS:
2828
eat(env);
2829
env->token.lex = T_PLUS;
2830
env->token.push = 1;
2831
if (e = node(env, REX_ONECHAR, 1, 1, 0))
2832
{
2833
e->re.onechar = '/';
2834
e = rep(env, e, 0, 0);
2835
}
2836
break;
2837
case T_WORD:
2838
eat(env);
2839
e = rep(env, node(env, REX_WORD, 0, 0, 0), 0, 0);
2840
break;
2841
case T_WORD_NOT:
2842
eat(env);
2843
e = rep(env, node(env, REX_WORD_NOT, 0, 0, 0), 0, 0);
2844
break;
2845
case T_BEG_STR:
2846
eat(env);
2847
e = rep(env, node(env, REX_BEG_STR, 0, 0, 0), 0, 0);
2848
break;
2849
case T_END_STR:
2850
eat(env);
2851
e = rep(env, node(env, REX_END_STR, 0, 0, 0), 0, 0);
2852
break;
2853
case T_FIN_STR:
2854
eat(env);
2855
e = rep(env, node(env, REX_FIN_STR, 0, 0, 0), 0, 0);
2856
break;
2857
default:
2858
env->error = REG_BADRPT;
2859
return 0;
2860
}
2861
if (e && *env->cursor != 0 && *env->cursor != env->delimiter && *env->cursor != env->terminator)
2862
e = cat(env, e, seq(env));
2863
return e;
2864
}
2865
}
2866
2867
static Rex_t*
2868
con(Cenv_t* env)
2869
{
2870
Rex_t* e;
2871
Rex_t* f;
2872
Rex_t* g;
2873
2874
if (!(e = seq(env)) || !(env->flags & REG_AUGMENTED) || token(env) != T_AND)
2875
return e;
2876
eat(env);
2877
if (!(f = con(env)))
2878
{
2879
drop(env->disc, e);
2880
return 0;
2881
}
2882
if (!(g = node(env, REX_CONJ, 0, 0, 0)))
2883
{
2884
drop(env->disc, e);
2885
drop(env->disc, f);
2886
return 0;
2887
}
2888
g->re.group.expr.binary.left = e;
2889
g->re.group.expr.binary.right = f;
2890
return g;
2891
}
2892
2893
static Rex_t*
2894
alt(Cenv_t* env, int number, int cond)
2895
{
2896
Rex_t* e;
2897
Rex_t* f;
2898
Rex_t* g;
2899
2900
if (!(e = con(env)))
2901
return 0;
2902
else if (token(env) != T_BAR)
2903
{
2904
if (!cond)
2905
return e;
2906
f = 0;
2907
if (e->type == REX_NULL)
2908
goto bad;
2909
}
2910
else
2911
{
2912
eat(env);
2913
if (!(f = alt(env, number, 0)))
2914
{
2915
drop(env->disc, e);
2916
return 0;
2917
}
2918
if ((e->type == REX_NULL || f->type == REX_NULL) && !(env->flags & (REG_NULL|REG_REGEXP)))
2919
goto bad;
2920
if (!cond && (g = trie(env, e, f)))
2921
return g;
2922
}
2923
if (!(g = node(env, REX_ALT, 0, 0, 0)))
2924
{
2925
env->error = REG_ESPACE;
2926
goto bad;
2927
}
2928
g->re.group.number = number;
2929
g->re.group.last = env->parno;
2930
g->re.group.expr.binary.left = e;
2931
g->re.group.expr.binary.right = f;
2932
return g;
2933
bad:
2934
drop(env->disc, e);
2935
drop(env->disc, f);
2936
if (!env->error)
2937
env->error = REG_ENULL;
2938
return 0;
2939
}
2940
2941
/*
2942
* add v to REX_BM tables
2943
*/
2944
2945
static void
2946
bmstr(Cenv_t* env, register Rex_t* a, unsigned char* v, int n, Bm_mask_t b)
2947
{
2948
int c;
2949
int m;
2950
size_t z;
2951
2952
for (m = 0; m < n; m++)
2953
{
2954
if (!(z = n - m - 1))
2955
z = HIT;
2956
c = v[m];
2957
a->re.bm.mask[m][c] |= b;
2958
if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT)
2959
a->re.bm.skip[c] = z;
2960
if (a->flags & REG_ICASE)
2961
{
2962
if (isupper(c))
2963
c = tolower(c);
2964
else if (islower(c))
2965
c = toupper(c);
2966
else
2967
continue;
2968
a->re.bm.mask[m][c] |= b;
2969
if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT)
2970
a->re.bm.skip[c] = z;
2971
}
2972
}
2973
}
2974
2975
/*
2976
* set up BM table from trie
2977
*/
2978
2979
static int
2980
bmtrie(Cenv_t* env, Rex_t* a, unsigned char* v, Trie_node_t* x, int n, int m, Bm_mask_t b)
2981
{
2982
do
2983
{
2984
v[m] = x->c;
2985
if (m >= (n - 1))
2986
{
2987
bmstr(env, a, v, n, b);
2988
if (!(b <<= 1))
2989
{
2990
b = 1;
2991
a->re.bm.complete = 0;
2992
}
2993
else if (x->son)
2994
a->re.bm.complete = 0;
2995
}
2996
else if (x->son)
2997
b = bmtrie(env, a, v, x->son, n, m + 1, b);
2998
} while (x = x->sib);
2999
return b;
3000
}
3001
3002
/*
3003
* rewrite the expression tree for some special cases
3004
* 1. it is a null expression - illegal
3005
* 2. max length fixed string found -- use BM algorithm
3006
* 3. it begins with an unanchored string - use KMP algorithm
3007
* 0 returned on success
3008
*/
3009
3010
static int
3011
special(Cenv_t* env, regex_t* p)
3012
{
3013
Rex_t* a;
3014
Rex_t* e;
3015
Rex_t* t;
3016
Rex_t* x;
3017
Rex_t* y;
3018
unsigned char* s;
3019
int* f;
3020
int n;
3021
int m;
3022
int k;
3023
3024
DEBUG_INIT();
3025
if (e = p->env->rex)
3026
{
3027
if ((x = env->stats.x) && x->re.string.size < 3)
3028
x = 0;
3029
if ((t = env->stats.y) && t->re.trie.min < 3)
3030
t = 0;
3031
if (x && t)
3032
{
3033
if (x->re.string.size >= t->re.trie.min)
3034
t = 0;
3035
else
3036
x = 0;
3037
}
3038
if (x || t)
3039
{
3040
Bm_mask_t** mask;
3041
Bm_mask_t* h;
3042
unsigned char* v;
3043
size_t* q;
3044
size_t l;
3045
int i;
3046
int j;
3047
3048
if (x)
3049
{
3050
y = x;
3051
n = m = x->re.string.size;
3052
l = env->stats.l;
3053
}
3054
else
3055
{
3056
y = t;
3057
n = t->re.trie.min;
3058
m = t->re.trie.max;
3059
l = env->stats.k;
3060
}
3061
if (!(q = (size_t*)alloc(env->disc, 0, (n + 1) * sizeof(size_t))))
3062
return 1;
3063
if (!(a = node(env, REX_BM, 0, 0, n * (sizeof(Bm_mask_t*) + (UCHAR_MAX + 1) * sizeof(Bm_mask_t)) + (UCHAR_MAX + n + 2) * sizeof(size_t))))
3064
{
3065
alloc(env->disc, q, 0);
3066
return 1;
3067
}
3068
a->flags = y->flags;
3069
a->map = y->map;
3070
a->re.bm.size = n;
3071
a->re.bm.back = (y == e || y == e->re.group.expr.rex) ? (m - n) : -1;
3072
a->re.bm.left = l - 1;
3073
a->re.bm.right = env->stats.m - l - n;
3074
a->re.bm.complete = (env->stats.e || y != e && (e->type != REX_GROUP || y != e->re.group.expr.rex) || e->next || ((a->re.bm.left + a->re.bm.right) >= 0)) ? 0 : n;
3075
h = (Bm_mask_t*)&a->re.bm.mask[n];
3076
a->re.bm.skip = (size_t*)(h + n * (UCHAR_MAX + 1));
3077
a->re.bm.fail = &a->re.bm.skip[UCHAR_MAX + 1];
3078
for (m = 0; m <= UCHAR_MAX; m++)
3079
a->re.bm.skip[m] = n;
3080
a->re.bm.skip[0] = a->re.bm.skip[env->mappednewline] = (y->next && y->next->type == REX_END) ? HIT : (n + a->re.bm.left);
3081
for (i = 1; i <= n; i++)
3082
a->re.bm.fail[i] = 2 * n - i;
3083
mask = a->re.bm.mask;
3084
for (m = 0; m < n; m++)
3085
{
3086
mask[m] = h;
3087
h += UCHAR_MAX + 1;
3088
}
3089
if (x)
3090
bmstr(env, a, x->re.string.base, n, 1);
3091
else
3092
{
3093
v = (unsigned char*)q;
3094
memset(v, 0, n);
3095
m = 1;
3096
for (i = 0; i <= UCHAR_MAX; i++)
3097
if (t->re.trie.root[i])
3098
m = bmtrie(env, a, v, t->re.trie.root[i], n, 0, m);
3099
}
3100
mask--;
3101
memset(q, 0, n * sizeof(*q));
3102
for (k = (j = n) + 1; j > 0; j--, k--)
3103
{
3104
DEBUG_TEST(0x0010,(sfprintf(sfstderr, "BM#0: k=%d j=%d\n", k, j)),(0));
3105
for (q[j] = k; k <= n; k = q[k])
3106
{
3107
for (m = 0; m <= UCHAR_MAX; m++)
3108
if (mask[k][m] == mask[j][m])
3109
{
3110
DEBUG_TEST(0x0010,sfprintf(sfstderr, "CUT1: mask[%d][%c]=mask[%d][%c]\n", k, m, j, m), (0));
3111
goto cut;
3112
}
3113
DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#2: fail[%d]=%d => %d\n", k, a->re.bm.fail[k], (a->re.bm.fail[k] > n - j) ? (n - j) : a->re.bm.fail[k]), (0));
3114
if (a->re.bm.fail[k] > n - j)
3115
a->re.bm.fail[k] = n - j;
3116
}
3117
cut: ;
3118
}
3119
for (i = 1; i <= n; i++)
3120
if (a->re.bm.fail[i] > n + k - i)
3121
{
3122
DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#4: fail[%d]=%d => %d\n", i, a->re.bm.fail[i], n + k - i), (0));
3123
a->re.bm.fail[i] = n + k - i;
3124
}
3125
#if _AST_REGEX_DEBUG
3126
if (DEBUG_TEST(0x0020,(1),(0)))
3127
{
3128
sfprintf(sfstderr, "STAT: complete=%d n=%d k=%d l=%d r=%d y=%d:%d e=%d:%d\n", a->re.bm.complete, n, k, a->re.bm.left, a->re.bm.right, y->type, y->next ? y->next->type : 0, e->type, e->next ? e->next->type : 0);
3129
for (m = 0; m < n; m++)
3130
for (i = 1; i <= UCHAR_MAX; i++)
3131
if (a->re.bm.mask[m][i])
3132
sfprintf(sfstderr, "MASK: [%d]['%c'] = %032..2u\n", m, i, a->re.bm.mask[m][i]);
3133
for (i = ' '; i <= UCHAR_MAX; i++)
3134
if (a->re.bm.skip[i] >= HIT)
3135
sfprintf(sfstderr, "SKIP: ['%c'] = *\n", i);
3136
else if (a->re.bm.skip[i] > 0 && a->re.bm.skip[i] < n)
3137
sfprintf(sfstderr, "SKIP: ['%c'] = %3d\n", i, a->re.bm.skip[i]);
3138
for (j = 31; j >= 0; j--)
3139
{
3140
sfprintf(sfstderr, " ");
3141
next:
3142
for (m = 0; m < n; m++)
3143
{
3144
for (i = 0040; i < 0177; i++)
3145
if (a->re.bm.mask[m][i] & (1 << j))
3146
{
3147
sfprintf(sfstderr, " %c", i);
3148
break;
3149
}
3150
if (i >= 0177)
3151
{
3152
if (j > 0)
3153
{
3154
j--;
3155
goto next;
3156
}
3157
sfprintf(sfstderr, " ?");
3158
}
3159
}
3160
sfprintf(sfstderr, "\n");
3161
}
3162
sfprintf(sfstderr, "FAIL: ");
3163
for (m = 1; m <= n; m++)
3164
sfprintf(sfstderr, "%3d", a->re.bm.fail[m]);
3165
sfprintf(sfstderr, "\n");
3166
}
3167
#endif
3168
alloc(env->disc, q, 0);
3169
a->next = e;
3170
p->env->rex = a;
3171
return 0;
3172
}
3173
switch (e->type)
3174
{
3175
case REX_BEG:
3176
if (env->flags & REG_NEWLINE)
3177
return 0;
3178
break;
3179
case REX_GROUP:
3180
if (env->stats.b)
3181
return 0;
3182
e = e->re.group.expr.rex;
3183
if (e->type != REX_DOT)
3184
return 0;
3185
/*FALLTHROUGH*/
3186
case REX_DOT:
3187
if (e->lo == 0 && e->hi == RE_DUP_INF)
3188
break;
3189
return 0;
3190
case REX_NULL:
3191
if (env->flags & (REG_NULL|REG_REGEXP))
3192
break;
3193
env->error = REG_ENULL;
3194
return 1;
3195
case REX_STRING:
3196
if ((env->flags & (REG_LEFT|REG_LITERAL|REG_RIGHT)) || e->map)
3197
return 0;
3198
s = e->re.string.base;
3199
n = e->re.string.size;
3200
if (!(a = node(env, REX_KMP, 0, 0, n * (sizeof(int*) + 1))))
3201
return 1;
3202
a->flags = e->flags;
3203
a->map = e->map;
3204
f = a->re.string.fail;
3205
memcpy((char*)(a->re.string.base = (unsigned char*)&f[n]), (char*)s, n);
3206
s = a->re.string.base;
3207
a->re.string.size = n;
3208
f[0] = m = -1;
3209
for (k = 1; k < n; k++)
3210
{
3211
while (m >= 0 && s[m+1] != s[k])
3212
m = f[m];
3213
if (s[m+1] == s[k])
3214
m++;
3215
f[k] = m;
3216
}
3217
a->next = e->next;
3218
p->env->rex = a;
3219
e->next = 0;
3220
drop(env->disc, e);
3221
break;
3222
default:
3223
return 0;
3224
}
3225
}
3226
p->env->once = 1;
3227
return 0;
3228
}
3229
3230
int
3231
regcomp(regex_t* p, const char* pattern, regflags_t flags)
3232
{
3233
Rex_t* e;
3234
Rex_t* f;
3235
regdisc_t* disc;
3236
unsigned char* fold;
3237
int i;
3238
Cenv_t env;
3239
3240
if (!p)
3241
return REG_BADPAT;
3242
if (flags & REG_DISCIPLINE)
3243
{
3244
flags &= ~REG_DISCIPLINE;
3245
disc = p->re_disc;
3246
}
3247
else
3248
disc = &state.disc;
3249
if (!disc->re_errorlevel)
3250
disc->re_errorlevel = 2;
3251
p->env = 0;
3252
if (!pattern)
3253
return fatal(disc, REG_BADPAT, pattern);
3254
if (!state.initialized)
3255
{
3256
state.initialized = 1;
3257
for (i = 0; i < elementsof(state.escape); i++)
3258
state.magic[state.escape[i].key] = state.escape[i].val;
3259
}
3260
if (!(fold = (unsigned char*)LCINFO(AST_LC_CTYPE)->data))
3261
{
3262
if (!(fold = newof(0, unsigned char, UCHAR_MAX, 1)))
3263
return fatal(disc, REG_ESPACE, pattern);
3264
for (i = 0; i <= UCHAR_MAX; i++)
3265
fold[i] = toupper(i);
3266
LCINFO(AST_LC_CTYPE)->data = (void*)fold;
3267
}
3268
again:
3269
if (!(p->env = (Env_t*)alloc(disc, 0, sizeof(Env_t))))
3270
return fatal(disc, REG_ESPACE, pattern);
3271
memset(p->env, 0, sizeof(*p->env));
3272
memset(&env, 0, sizeof(env));
3273
env.regex = p;
3274
env.flags = flags;
3275
env.disc = p->env->disc = disc;
3276
if (env.flags & REG_AUGMENTED)
3277
env.flags |= REG_EXTENDED;
3278
env.mappeddot = '.';
3279
env.mappednewline = '\n';
3280
env.mappedslash = '/';
3281
if (disc->re_version >= REG_VERSION_MAP && disc->re_map)
3282
{
3283
env.map = disc->re_map;
3284
env.MAP = p->env->fold;
3285
for (i = 0; i <= UCHAR_MAX; i++)
3286
{
3287
env.MAP[i] = fold[env.map[i]];
3288
if (env.map[i] == '.')
3289
env.mappeddot = i;
3290
if (env.map[i] == '\n')
3291
env.mappednewline = i;
3292
if (env.map[i] == '/')
3293
env.mappedslash = i;
3294
}
3295
}
3296
else
3297
env.MAP = fold;
3298
env.type = (env.flags & REG_AUGMENTED) ? ARE : (env.flags & REG_EXTENDED) ? ERE : BRE;
3299
env.explicit = -1;
3300
if (env.flags & REG_SHELL)
3301
{
3302
if (env.flags & REG_SHELL_PATH)
3303
env.explicit = env.mappedslash;
3304
if (!(env.flags & REG_SHELL_ESCAPED))
3305
env.flags |= REG_CLASS_ESCAPE;
3306
env.flags |= REG_LENIENT|REG_NULL;
3307
env.type = env.type == BRE ? SRE : KRE;
3308
}
3309
else
3310
env.flags &= ~(REG_SHELL_DOT|REG_SHELL_ESCAPED|REG_SHELL_GROUP|REG_SHELL_PATH);
3311
if ((env.flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE)
3312
env.explicit = env.mappednewline;
3313
p->env->leading = (env.flags & REG_SHELL_DOT) ? env.mappeddot : -1;
3314
env.posixkludge = !(env.flags & (REG_EXTENDED|REG_SHELL));
3315
env.token.lex = 0;
3316
env.token.push = 0;
3317
if (env.flags & REG_DELIMITED)
3318
{
3319
switch (env.delimiter = *pattern++)
3320
{
3321
case 0:
3322
case '\\':
3323
case '\n':
3324
case '\r':
3325
env.error = REG_EDELIM;
3326
goto bad;
3327
}
3328
env.terminator = '\n';
3329
}
3330
env.literal = env.pattern = env.cursor = (unsigned char*)pattern;
3331
if (!(p->env->rex = alt(&env, 1, 0)))
3332
goto bad;
3333
if (env.parnest)
3334
{
3335
env.error = REG_EPAREN;
3336
goto bad;
3337
}
3338
if ((env.flags & REG_LEFT) && p->env->rex->type != REX_BEG)
3339
{
3340
if (p->env->rex->type == REX_ALT)
3341
env.flags &= ~REG_FIRST;
3342
if (!(e = node(&env, REX_BEG, 0, 0, 0)))
3343
{
3344
regfree(p);
3345
return fatal(disc, REG_ESPACE, pattern);
3346
}
3347
e->next = p->env->rex;
3348
p->env->rex = e;
3349
p->env->once = 1;
3350
}
3351
for (e = p->env->rex; e->next; e = e->next);
3352
p->env->done.type = REX_DONE;
3353
p->env->done.flags = e->flags;
3354
if ((env.flags & REG_RIGHT) && e->type != REX_END)
3355
{
3356
if (p->env->rex->type == REX_ALT)
3357
env.flags &= ~REG_FIRST;
3358
if (!(f = node(&env, REX_END, 0, 0, 0)))
3359
{
3360
regfree(p);
3361
return fatal(disc, REG_ESPACE, pattern);
3362
}
3363
f->flags = e->flags;
3364
f->map = e->map;
3365
e->next = f;
3366
}
3367
if (stats(&env, p->env->rex))
3368
{
3369
if (!env.error)
3370
env.error = REG_ECOUNT;
3371
goto bad;
3372
}
3373
if (env.stats.b)
3374
p->env->hard = p->env->separate = 1;
3375
else if (!(env.flags & REG_FIRST) && (env.stats.a || env.stats.c > 1 && env.stats.c != env.stats.s || env.stats.t && (env.stats.t > 1 || env.stats.a || env.stats.c)))
3376
p->env->hard = 1;
3377
if (p->env->hard || env.stats.c || env.stats.i)
3378
p->env->stats.re_min = p->env->stats.re_max = -1;
3379
else
3380
{
3381
if (!(p->env->stats.re_min = env.stats.m))
3382
p->env->stats.re_min = -1;
3383
if (!(p->env->stats.re_max = env.stats.n))
3384
p->env->stats.re_max = -1;
3385
}
3386
if (special(&env, p))
3387
goto bad;
3388
serialize(&env, p->env->rex, 1);
3389
p->re_nsub = env.stats.p;
3390
if (env.type == KRE)
3391
p->re_nsub /= 2;
3392
if (env.flags & REG_DELIMITED)
3393
{
3394
p->re_npat = env.cursor - env.pattern + 1;
3395
if (*env.cursor == env.delimiter)
3396
p->re_npat++;
3397
else if (env.flags & REG_MUSTDELIM)
3398
{
3399
env.error = REG_EDELIM;
3400
goto bad;
3401
}
3402
else
3403
env.flags &= ~REG_DELIMITED;
3404
}
3405
p->env->explicit = env.explicit;
3406
p->env->flags = env.flags & REG_COMP;
3407
p->env->min = env.stats.m;
3408
p->env->nsub = env.stats.p + env.stats.u;
3409
p->env->refs = 1;
3410
return 0;
3411
bad:
3412
regfree(p);
3413
if (!env.error)
3414
env.error = REG_ESPACE;
3415
if (env.type >= SRE && env.error != REG_ESPACE && !(flags & REG_LITERAL))
3416
{
3417
flags |= REG_LITERAL;
3418
pattern = (const char*)env.literal;
3419
goto again;
3420
}
3421
return fatal(disc, env.error, pattern);
3422
}
3423
3424
/*
3425
* regcomp() on sized pattern
3426
* the lazy way around adding and checking env.end
3427
*/
3428
3429
int
3430
regncomp(regex_t* p, const char* pattern, size_t size, regflags_t flags)
3431
{
3432
char* s;
3433
int r;
3434
3435
if (!(s = malloc(size + 1)))
3436
return fatal((flags & REG_DISCIPLINE) ? p->re_disc : &state.disc, REG_ESPACE, pattern);
3437
memcpy(s, pattern, size);
3438
s[size] = 0;
3439
r = regcomp(p, s, flags);
3440
free(s);
3441
return r;
3442
}
3443
3444
/*
3445
* combine two compiled regular expressions if possible,
3446
* replacing first with the combination and freeing second.
3447
* return 0 on success.
3448
* the only combinations handled are building a Trie
3449
* from String|Kmp|Trie and String|Kmp
3450
*/
3451
3452
int
3453
regcomb(regex_t* p, regex_t* q)
3454
{
3455
Rex_t* e = p->env->rex;
3456
Rex_t* f = q->env->rex;
3457
Rex_t* g;
3458
Rex_t* h;
3459
Cenv_t env;
3460
3461
if (!e || !f)
3462
return fatal(p->env->disc, REG_BADPAT, NiL);
3463
if (p->env->separate || q->env->separate)
3464
return REG_ESUBREG;
3465
memset(&env, 0, sizeof(env));
3466
env.disc = p->env->disc;
3467
if (e->type == REX_BM)
3468
{
3469
p->env->rex = e->next;
3470
e->next = 0;
3471
drop(env.disc, e);
3472
e = p->env->rex;
3473
}
3474
if (f->type == REX_BM)
3475
{
3476
q->env->rex = f->next;
3477
f->next = 0;
3478
drop(env.disc, f);
3479
f = q->env->rex;
3480
}
3481
if (e->type == REX_BEG && f->type == REX_BEG)
3482
{
3483
p->env->flags |= REG_LEFT;
3484
p->env->rex = e->next;
3485
e->next = 0;
3486
drop(env.disc, e);
3487
e = p->env->rex;
3488
q->env->rex = f->next;
3489
f->next = 0;
3490
drop(env.disc, f);
3491
f = q->env->rex;
3492
}
3493
for (g = e; g->next; g = g->next);
3494
for (h = f; h->next; h = h->next);
3495
if (g->next && g->next->type == REX_END && h->next && h->next->type == REX_END)
3496
{
3497
p->env->flags |= REG_RIGHT;
3498
drop(env.disc, g->next);
3499
g->next = 0;
3500
drop(env.disc, h->next);
3501
h->next = 0;
3502
}
3503
if (!(g = trie(&env, f, e)))
3504
return fatal(p->env->disc, REG_BADPAT, NiL);
3505
p->env->rex = g;
3506
if (!q->env->once)
3507
p->env->once = 0;
3508
q->env->rex = 0;
3509
if (p->env->flags & REG_LEFT)
3510
{
3511
if (!(e = node(&env, REX_BEG, 0, 0, 0)))
3512
{
3513
regfree(p);
3514
return fatal(p->env->disc, REG_ESPACE, NiL);
3515
}
3516
e->next = p->env->rex;
3517
p->env->rex = e;
3518
p->env->once = 1;
3519
}
3520
if (p->env->flags & REG_RIGHT)
3521
{
3522
for (f = p->env->rex; f->next; f = f->next);
3523
if (f->type != REX_END)
3524
{
3525
if (!(e = node(&env, REX_END, 0, 0, 0)))
3526
{
3527
regfree(p);
3528
return fatal(p->env->disc, REG_ESPACE, NiL);
3529
}
3530
f->next = e;
3531
}
3532
}
3533
env.explicit = p->env->explicit;
3534
env.flags = p->env->flags;
3535
env.disc = p->env->disc;
3536
if (stats(&env, p->env->rex))
3537
{
3538
regfree(p);
3539
return fatal(p->env->disc, env.error ? env.error : REG_ECOUNT, NiL);
3540
}
3541
if (special(&env, p))
3542
{
3543
regfree(p);
3544
return fatal(p->env->disc, env.error ? env.error : REG_ESPACE, NiL);
3545
}
3546
p->env->min = g->re.trie.min;
3547
return 0;
3548
}
3549
3550
/*
3551
* copy a reference of p into q
3552
* p and q may then have separate regsubcomp() instantiations
3553
*/
3554
3555
int
3556
regdup(regex_t* p, regex_t* q)
3557
{
3558
if (!p || !q)
3559
return REG_BADPAT;
3560
*q = *p;
3561
p->env->refs++;
3562
q->re_sub = 0;
3563
return 0;
3564
}
3565
3566