Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
srohatgi01
GitHub Repository: srohatgi01/cups
Path: blob/master/vcnet/regex/regcomp.c
1090 views
1
#include <assert.h>
2
#include <sys/types.h>
3
#include <stdio.h>
4
#include <string.h>
5
#include <ctype.h>
6
#include <limits.h>
7
#include <stdlib.h>
8
9
#include "regex.h"
10
#include "regex2.h"
11
12
#ifdef _POSIX2_RE_DUP_MAX
13
#define DUPMAX _POSIX2_RE_DUP_MAX
14
#else
15
#define DUPMAX 255
16
#endif
17
#define INFINITY (DUPMAX + 1)
18
19
/*
20
* parse structure, passed up and down to avoid global variables and
21
* other clumsinesses
22
*/
23
struct parse {
24
char *next; /* next character in RE */
25
char *end; /* end of string (-> NUL normally) */
26
int error; /* has an error been seen? */
27
sop *strip; /* malloced strip */
28
sopno ssize; /* malloced strip size (allocated) */
29
sopno slen; /* malloced strip length (used) */
30
int ncsalloc; /* number of csets allocated */
31
struct re_guts *g;
32
# define NPAREN 10 /* we need to remember () 1-9 for back refs */
33
sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
34
sopno pend[NPAREN]; /* -> ) ([0] unused) */
35
};
36
37
static char nuls[10]; /* place to point scanner in event of error */
38
39
static void p_ere(struct parse *p, int stop);
40
static void p_ere_exp(struct parse *p);
41
static void p_str(struct parse *p);
42
static void p_bre(struct parse *p, int end1,
43
int end2);
44
static int p_simp_re(struct parse *p, int starordinary);
45
static int p_count(struct parse *p);
46
static void p_bracket(struct parse *p);
47
static void p_b_term(struct parse *p, cset *cs);
48
static void p_b_cclass(struct parse *p, cset *cs);
49
static void p_b_eclass(struct parse *p, cset *cs);
50
static char p_b_symbol(struct parse *p);
51
static char p_b_coll_elem(struct parse *p, int endc);
52
static char othercase(int ch);
53
static void bothcases(struct parse *p, int ch);
54
static void ordinary(struct parse *p, int ch);
55
static void nonnewline(struct parse *p);
56
static void repeat(struct parse *p, sopno start, int from, int to);
57
static void seterr(struct parse *p, int e);
58
static cset *allocset(struct parse *p);
59
static void freeset(struct parse *p, cset *cs);
60
static int freezeset(struct parse *p, cset *cs);
61
static int firstch(struct parse *p, cset *cs);
62
static int nch(struct parse *p, cset *cs);
63
static void mcadd(struct parse *p, cset *cs,
64
char *cp);
65
static int isinsets(struct re_guts *g, int c);
66
static int samesets(struct re_guts *g, int c1, int c2);
67
static void categorize(struct parse *p, struct re_guts *g);
68
static sopno dupl(struct parse *p, sopno start, sopno finish);
69
static void doemit(struct parse *p, sop op, sop opnd);
70
static void doinsert(struct parse *p, sop op, sopno opnd, sopno pos);
71
static void dofwd(struct parse *p, sopno pos, sop value);
72
static void enlarge(struct parse *p, sopno size);
73
static void stripsnug(struct parse *p, struct re_guts *g);
74
static void findmust(struct parse *p, struct re_guts *g);
75
static sopno pluscount(struct parse *p, struct re_guts *g);
76
77
/*
78
* macros for use with parse structure
79
* BEWARE: these know that the parse structure is named `p' !!!
80
*/
81
#define PEEK() (*p->next)
82
#define PEEK2() (*(p->next+1))
83
#define MORE() (p->next < p->end)
84
#define MORE2() (p->next+1 < p->end)
85
#define SEE(c) (MORE() && PEEK() == (c))
86
#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
87
#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
88
#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
89
#define NEXT() (p->next++)
90
#define NEXT2() (p->next += 2)
91
#define NEXTn(n) (p->next += (n))
92
#define GETNEXT() (*p->next++)
93
#define REQUIRE(co, e) ((co) ? (void) 0 : seterr(p, (e)))
94
#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
95
#define EMIT(op, sopnd) doemit(p, (sop)(op), (sop)(sopnd))
96
#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
97
#define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
98
#define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
99
#define HERE() (p->slen)
100
#define THERE() (p->slen - 1)
101
#define THERETHERE() (p->slen - 2)
102
#define DROP(n) (p->slen -= (n))
103
104
#ifndef NDEBUG
105
static int never = 0; /* for use in asserts; shuts lint up */
106
#else
107
#define never 0 /* some <assert.h>s have bugs too */
108
#endif
109
110
int /* 0 success, otherwise REG_something */
111
regcomp(preg, pattern, cflags)
112
regex_t *preg;
113
const char *pattern;
114
int cflags;
115
{
116
struct parse pa;
117
struct re_guts *g;
118
struct parse *p = &pa;
119
int i;
120
size_t len;
121
122
if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
123
return(REG_INVARG);
124
125
if (cflags&REG_PEND) {
126
if (preg->re_endp < pattern)
127
return(REG_INVARG);
128
len = (size_t) (preg->re_endp - pattern);
129
} else
130
len = strlen((char *)pattern);
131
132
/* do the mallocs early so failure handling is easy */
133
g = (struct re_guts *)malloc(sizeof(struct re_guts) +
134
(NC-1)*sizeof(cat_t));
135
if (g == NULL)
136
return(REG_ESPACE);
137
{
138
/* Patched for CERT Vulnerability Note VU#695940, Feb 2015. */
139
size_t new_ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
140
if (new_ssize < len || new_ssize > LONG_MAX / sizeof(sop)) {
141
free((char *) g);
142
return REG_INVARG;
143
}
144
p->ssize = (sopno)new_ssize;
145
}
146
p->strip = (sop *)malloc((size_t)p->ssize * sizeof(sop));
147
p->slen = 0;
148
if (p->strip == NULL) {
149
free((char *)g);
150
return(REG_ESPACE);
151
}
152
153
/* set things up */
154
p->g = g;
155
p->next = (char *)pattern; /* convenience; we do not modify it */
156
p->end = p->next + len;
157
p->error = 0;
158
p->ncsalloc = 0;
159
for (i = 0; i < NPAREN; i++) {
160
p->pbegin[i] = 0;
161
p->pend[i] = 0;
162
}
163
g->csetsize = NC;
164
g->sets = NULL;
165
g->setbits = NULL;
166
g->ncsets = 0;
167
g->cflags = cflags;
168
g->iflags = 0;
169
g->nbol = 0;
170
g->neol = 0;
171
g->must = NULL;
172
g->mlen = 0;
173
g->nsub = 0;
174
g->ncategories = 1; /* category 0 is "everything else" */
175
g->categories = &g->catspace[-(CHAR_MIN)];
176
(void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
177
g->backrefs = 0;
178
179
/* do it */
180
EMIT(OEND, 0);
181
g->firststate = THERE();
182
if (cflags&REG_EXTENDED)
183
p_ere(p, OUT);
184
else if (cflags&REG_NOSPEC)
185
p_str(p);
186
else
187
p_bre(p, OUT, OUT);
188
EMIT(OEND, 0);
189
g->laststate = THERE();
190
191
/* tidy up loose ends and fill things in */
192
categorize(p, g);
193
stripsnug(p, g);
194
findmust(p, g);
195
g->nplus = pluscount(p, g);
196
g->magic = MAGIC2;
197
preg->re_nsub = g->nsub;
198
preg->re_g = g;
199
preg->re_magic = MAGIC1;
200
#ifndef REDEBUG
201
/* not debugging, so can't rely on the assert() in regexec() */
202
if (g->iflags&BAD)
203
seterr(p, REG_ASSERT);
204
#endif
205
206
/* win or lose, we're done */
207
if (p->error != 0) /* lose */
208
regfree(preg);
209
return(p->error);
210
}
211
212
/*
213
- p_ere - ERE parser top level, concatenation and alternation
214
*/
215
static void
216
p_ere(p, stop)
217
struct parse *p;
218
int stop; /* character this ERE should end at */
219
{
220
char c;
221
sopno prevback;
222
sopno prevfwd;
223
sopno conc;
224
int first = 1; /* is this the first alternative? */
225
226
for (;;) {
227
/* do a bunch of concatenated expressions */
228
conc = HERE();
229
while (MORE() && (c = PEEK()) != '|' && c != stop)
230
p_ere_exp(p);
231
REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
232
233
if (!EAT('|'))
234
break; /* NOTE BREAK OUT */
235
236
if (first) {
237
INSERT(OCH_, conc); /* offset is wrong */
238
prevfwd = conc;
239
prevback = conc;
240
first = 0;
241
}
242
ASTERN(OOR1, prevback);
243
prevback = THERE();
244
AHEAD(prevfwd); /* fix previous offset */
245
prevfwd = HERE();
246
EMIT(OOR2, 0); /* offset is very wrong */
247
}
248
249
if (!first) { /* tail-end fixups */
250
AHEAD(prevfwd);
251
ASTERN(O_CH, prevback);
252
}
253
254
assert(!MORE() || SEE(stop));
255
}
256
257
/*
258
- p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
259
*/
260
static void
261
p_ere_exp(p)
262
struct parse *p;
263
{
264
char c;
265
sopno pos;
266
int count;
267
int count2;
268
sopno subno;
269
int wascaret = 0;
270
271
assert(MORE()); /* caller should have ensured this */
272
c = GETNEXT();
273
274
pos = HERE();
275
switch (c) {
276
case '(':
277
REQUIRE(MORE(), REG_EPAREN);
278
p->g->nsub++;
279
subno = (sopno)p->g->nsub;
280
if (subno < NPAREN)
281
p->pbegin[subno] = HERE();
282
EMIT(OLPAREN, subno);
283
if (!SEE(')'))
284
p_ere(p, ')');
285
if (subno < NPAREN) {
286
p->pend[subno] = HERE();
287
assert(p->pend[subno] != 0);
288
}
289
EMIT(ORPAREN, subno);
290
MUSTEAT(')', REG_EPAREN);
291
break;
292
#ifndef POSIX_MISTAKE
293
case ')': /* happens only if no current unmatched ( */
294
/*
295
* You may ask, why the ifndef? Because I didn't notice
296
* this until slightly too late for 1003.2, and none of the
297
* other 1003.2 regular-expression reviewers noticed it at
298
* all. So an unmatched ) is legal POSIX, at least until
299
* we can get it fixed.
300
*/
301
seterr(p, REG_EPAREN);
302
break;
303
#endif
304
case '^':
305
EMIT(OBOL, 0);
306
p->g->iflags |= USEBOL;
307
p->g->nbol++;
308
wascaret = 1;
309
break;
310
case '$':
311
EMIT(OEOL, 0);
312
p->g->iflags |= USEEOL;
313
p->g->neol++;
314
break;
315
case '|':
316
seterr(p, REG_EMPTY);
317
break;
318
case '*':
319
case '+':
320
case '?':
321
seterr(p, REG_BADRPT);
322
break;
323
case '.':
324
if (p->g->cflags&REG_NEWLINE)
325
nonnewline(p);
326
else
327
EMIT(OANY, 0);
328
break;
329
case '[':
330
p_bracket(p);
331
break;
332
case '\\':
333
REQUIRE(MORE(), REG_EESCAPE);
334
c = GETNEXT();
335
ordinary(p, c);
336
break;
337
case '{': /* okay as ordinary except if digit follows */
338
REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT);
339
/* FALLTHROUGH */
340
default:
341
ordinary(p, c);
342
break;
343
}
344
345
if (!MORE())
346
return;
347
c = PEEK();
348
/* we call { a repetition if followed by a digit */
349
if (!( c == '*' || c == '+' || c == '?' ||
350
(c == '{' && MORE2() && isdigit(PEEK2())) ))
351
return; /* no repetition, we're done */
352
NEXT();
353
354
REQUIRE(!wascaret, REG_BADRPT);
355
switch (c) {
356
case '*': /* implemented as +? */
357
/* this case does not require the (y|) trick, noKLUDGE */
358
INSERT(OPLUS_, pos);
359
ASTERN(O_PLUS, pos);
360
INSERT(OQUEST_, pos);
361
ASTERN(O_QUEST, pos);
362
break;
363
case '+':
364
INSERT(OPLUS_, pos);
365
ASTERN(O_PLUS, pos);
366
break;
367
case '?':
368
/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
369
INSERT(OCH_, pos); /* offset slightly wrong */
370
ASTERN(OOR1, pos); /* this one's right */
371
AHEAD(pos); /* fix the OCH_ */
372
EMIT(OOR2, 0); /* offset very wrong... */
373
AHEAD(THERE()); /* ...so fix it */
374
ASTERN(O_CH, THERETHERE());
375
break;
376
case '{':
377
count = p_count(p);
378
if (EAT(',')) {
379
if (isdigit(PEEK())) {
380
count2 = p_count(p);
381
REQUIRE(count <= count2, REG_BADBR);
382
} else /* single number with comma */
383
count2 = INFINITY;
384
} else /* just a single number */
385
count2 = count;
386
repeat(p, pos, count, count2);
387
if (!EAT('}')) { /* error heuristics */
388
while (MORE() && PEEK() != '}')
389
NEXT();
390
REQUIRE(MORE(), REG_EBRACE);
391
seterr(p, REG_BADBR);
392
}
393
break;
394
}
395
396
if (!MORE())
397
return;
398
c = PEEK();
399
if (!( c == '*' || c == '+' || c == '?' ||
400
(c == '{' && MORE2() && isdigit(PEEK2())) ) )
401
return;
402
seterr(p, REG_BADRPT);
403
}
404
405
/*
406
- p_str - string (no metacharacters) "parser"
407
*/
408
static void
409
p_str(p)
410
struct parse *p;
411
{
412
REQUIRE(MORE(), REG_EMPTY);
413
while (MORE())
414
ordinary(p, GETNEXT());
415
}
416
417
/*
418
- p_bre - BRE parser top level, anchoring and concatenation
419
* Giving end1 as OUT essentially eliminates the end1/end2 check.
420
*
421
* This implementation is a bit of a kludge, in that a trailing $ is first
422
* taken as an ordinary character and then revised to be an anchor. The
423
* only undesirable side effect is that '$' gets included as a character
424
* category in such cases. This is fairly harmless; not worth fixing.
425
* The amount of lookahead needed to avoid this kludge is excessive.
426
*/
427
static void
428
p_bre(p, end1, end2)
429
struct parse *p;
430
int end1; /* first terminating character */
431
int end2; /* second terminating character */
432
{
433
sopno start = HERE();
434
int first = 1; /* first subexpression? */
435
int wasdollar = 0;
436
437
if (EAT('^')) {
438
EMIT(OBOL, 0);
439
p->g->iflags |= USEBOL;
440
p->g->nbol++;
441
}
442
while (MORE() && !SEETWO(end1, end2)) {
443
wasdollar = p_simp_re(p, first);
444
first = 0;
445
}
446
if (wasdollar) { /* oops, that was a trailing anchor */
447
DROP(1);
448
EMIT(OEOL, 0);
449
p->g->iflags |= USEEOL;
450
p->g->neol++;
451
}
452
453
REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
454
}
455
456
/*
457
- p_simp_re - parse a simple RE, an atom possibly followed by a repetition
458
*/
459
static int /* was the simple RE an unbackslashed $? */
460
p_simp_re(p, starordinary)
461
struct parse *p;
462
int starordinary; /* is a leading * an ordinary character? */
463
{
464
int c;
465
int count;
466
int count2;
467
sopno pos;
468
int i;
469
sopno subno;
470
# define BACKSL (1<<CHAR_BIT)
471
472
pos = HERE(); /* repetion op, if any, covers from here */
473
474
assert(MORE()); /* caller should have ensured this */
475
c = GETNEXT();
476
if (c == '\\') {
477
REQUIRE(MORE(), REG_EESCAPE);
478
c = BACKSL | (unsigned char)GETNEXT();
479
}
480
switch (c) {
481
case '.':
482
if (p->g->cflags&REG_NEWLINE)
483
nonnewline(p);
484
else
485
EMIT(OANY, 0);
486
break;
487
case '[':
488
p_bracket(p);
489
break;
490
case BACKSL|'{':
491
seterr(p, REG_BADRPT);
492
break;
493
case BACKSL|'(':
494
p->g->nsub++;
495
subno = (sopno)p->g->nsub;
496
if (subno < NPAREN)
497
p->pbegin[subno] = HERE();
498
EMIT(OLPAREN, subno);
499
/* the MORE here is an error heuristic */
500
if (MORE() && !SEETWO('\\', ')'))
501
p_bre(p, '\\', ')');
502
if (subno < NPAREN) {
503
p->pend[subno] = HERE();
504
assert(p->pend[subno] != 0);
505
}
506
EMIT(ORPAREN, subno);
507
REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
508
break;
509
case BACKSL|')': /* should not get here -- must be user */
510
case BACKSL|'}':
511
seterr(p, REG_EPAREN);
512
break;
513
case BACKSL|'1':
514
case BACKSL|'2':
515
case BACKSL|'3':
516
case BACKSL|'4':
517
case BACKSL|'5':
518
case BACKSL|'6':
519
case BACKSL|'7':
520
case BACKSL|'8':
521
case BACKSL|'9':
522
i = (c&~BACKSL) - '0';
523
assert(i < NPAREN);
524
if (p->pend[i] != 0) {
525
assert((size_t) i <= p->g->nsub);
526
EMIT(OBACK_, i);
527
assert(p->pbegin[i] != 0);
528
assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
529
assert(OP(p->strip[p->pend[i]]) == ORPAREN);
530
(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
531
EMIT(O_BACK, i);
532
} else
533
seterr(p, REG_ESUBREG);
534
p->g->backrefs = 1;
535
break;
536
case '*':
537
REQUIRE(starordinary, REG_BADRPT);
538
/* FALLTHROUGH */
539
default:
540
ordinary(p, (char)c); /* takes off BACKSL, if any */
541
break;
542
}
543
544
if (EAT('*')) { /* implemented as +? */
545
/* this case does not require the (y|) trick, noKLUDGE */
546
INSERT(OPLUS_, pos);
547
ASTERN(O_PLUS, pos);
548
INSERT(OQUEST_, pos);
549
ASTERN(O_QUEST, pos);
550
} else if (EATTWO('\\', '{')) {
551
count = p_count(p);
552
if (EAT(',')) {
553
if (MORE() && isdigit(PEEK())) {
554
count2 = p_count(p);
555
REQUIRE(count <= count2, REG_BADBR);
556
} else /* single number with comma */
557
count2 = INFINITY;
558
} else /* just a single number */
559
count2 = count;
560
repeat(p, pos, count, count2);
561
if (!EATTWO('\\', '}')) { /* error heuristics */
562
while (MORE() && !SEETWO('\\', '}'))
563
NEXT();
564
REQUIRE(MORE(), REG_EBRACE);
565
seterr(p, REG_BADBR);
566
}
567
} else if (c == (unsigned char)'$') /* $ (but not \$) ends it */
568
return(1);
569
570
return(0);
571
}
572
573
/*
574
- p_count - parse a repetition count
575
*/
576
static int /* the value */
577
p_count(p)
578
struct parse *p;
579
{
580
int count = 0;
581
int ndigits = 0;
582
583
while (MORE() && isdigit(PEEK()) && count <= DUPMAX) {
584
count = count*10 + (GETNEXT() - '0');
585
ndigits++;
586
}
587
588
REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
589
return(count);
590
}
591
592
/*
593
- p_bracket - parse a bracketed character list
594
*
595
* Note a significant property of this code: if the allocset() did seterr,
596
* no set operations are done.
597
*/
598
static void
599
p_bracket(p)
600
struct parse *p;
601
{
602
cset *cs = allocset(p);
603
int invert = 0;
604
605
/* Dept of Truly Sickening Special-Case Kludges */
606
if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
607
EMIT(OBOW, 0);
608
NEXTn(6);
609
return;
610
}
611
if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
612
EMIT(OEOW, 0);
613
NEXTn(6);
614
return;
615
}
616
617
if (EAT('^'))
618
invert++; /* make note to invert set at end */
619
if (EAT(']'))
620
CHadd(cs, ']');
621
else if (EAT('-'))
622
CHadd(cs, '-');
623
while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
624
p_b_term(p, cs);
625
if (EAT('-'))
626
CHadd(cs, '-');
627
MUSTEAT(']', REG_EBRACK);
628
629
if (p->error != 0) /* don't mess things up further */
630
return;
631
632
if (p->g->cflags&REG_ICASE) {
633
int i;
634
int ci;
635
636
for (i = p->g->csetsize - 1; i >= 0; i--)
637
if (CHIN(cs, i) && isalpha(i)) {
638
ci = othercase(i);
639
if (ci != i)
640
CHadd(cs, ci);
641
}
642
assert(cs->multis == NULL); /* xxx */
643
#if 0
644
if (cs->multis != NULL)
645
mccase(p, cs);
646
#endif
647
}
648
if (invert) {
649
int i;
650
651
for (i = p->g->csetsize - 1; i >= 0; i--)
652
if (CHIN(cs, i))
653
CHsub(cs, i);
654
else
655
CHadd(cs, i);
656
if (p->g->cflags&REG_NEWLINE)
657
CHsub(cs, '\n');
658
assert(cs->multis == NULL); /* xxx */
659
#if 0
660
if (cs->multis != NULL)
661
mcinvert(p, cs);
662
#endif
663
}
664
665
assert(cs->multis == NULL); /* xxx */
666
667
if (nch(p, cs) == 1) { /* optimize singleton sets */
668
ordinary(p, firstch(p, cs));
669
freeset(p, cs);
670
} else
671
EMIT(OANYOF, freezeset(p, cs));
672
}
673
674
/*
675
- p_b_term - parse one term of a bracketed character list
676
*/
677
static void
678
p_b_term(p, cs)
679
struct parse *p;
680
cset *cs;
681
{
682
char c;
683
char start, finish;
684
int i;
685
686
/* classify what we've got */
687
switch ((MORE()) ? PEEK() : '\0') {
688
case '[':
689
c = (MORE2()) ? PEEK2() : '\0';
690
break;
691
case '-':
692
seterr(p, REG_ERANGE);
693
return; /* NOTE RETURN */
694
break;
695
default:
696
c = '\0';
697
break;
698
}
699
700
switch (c) {
701
case ':': /* character class */
702
NEXT2();
703
REQUIRE(MORE(), REG_EBRACK);
704
c = PEEK();
705
REQUIRE(c != '-' && c != ']', REG_ECTYPE);
706
p_b_cclass(p, cs);
707
REQUIRE(MORE(), REG_EBRACK);
708
REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
709
break;
710
case '=': /* equivalence class */
711
NEXT2();
712
REQUIRE(MORE(), REG_EBRACK);
713
c = PEEK();
714
REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
715
p_b_eclass(p, cs);
716
REQUIRE(MORE(), REG_EBRACK);
717
REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
718
break;
719
default: /* symbol, ordinary character, or range */
720
/* xxx revision needed for multichar stuff */
721
start = p_b_symbol(p);
722
if (SEE('-') && MORE2() && PEEK2() != ']') {
723
/* range */
724
NEXT();
725
if (EAT('-'))
726
finish = '-';
727
else
728
finish = p_b_symbol(p);
729
} else
730
finish = start;
731
/* xxx what about signed chars here... */
732
REQUIRE(start <= finish, REG_ERANGE);
733
for (i = start; i <= finish; i++)
734
CHadd(cs, i);
735
break;
736
}
737
}
738
739
/* Character-class table. */
740
static struct cclass {
741
char *name;
742
char *chars;
743
char *multis;
744
} cclasses[] = {
745
{"alnum", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", ""},
746
{"alpha", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", ""},
747
{"blank", " \t", ""},
748
{"cntrl", "\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\25\26\27\30\31\32\33\34\35\36\37\177", ""},
749
{"digit", "0123456789", ""},
750
{"graph", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", ""},
751
{"lower", "abcdefghijklmnopqrstuvwxyz", ""},
752
{"print", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ", ""},
753
{"punct", "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", ""},
754
{"space", "\t\n\v\f\r ", ""},
755
{"upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", ""},
756
{"xdigit", "0123456789ABCDEFabcdef", ""},
757
{NULL, 0, ""}
758
};
759
760
/*
761
- p_b_cclass - parse a character-class name and deal with it
762
*/
763
static void
764
p_b_cclass(p, cs)
765
struct parse *p;
766
cset *cs;
767
{
768
char *sp = p->next;
769
struct cclass *cp;
770
size_t len;
771
char *u;
772
char c;
773
774
while (MORE() && isalpha(PEEK()))
775
NEXT();
776
len = (size_t)(p->next - sp);
777
for (cp = cclasses; cp->name != NULL; cp++)
778
if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
779
break;
780
if (cp->name == NULL) {
781
/* oops, didn't find it */
782
seterr(p, REG_ECTYPE);
783
return;
784
}
785
786
u = cp->chars;
787
while ((c = *u++) != '\0')
788
CHadd(cs, c);
789
for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
790
MCadd(p, cs, u);
791
}
792
793
/*
794
- p_b_eclass - parse an equivalence-class name and deal with it
795
*
796
* This implementation is incomplete. xxx
797
*/
798
static void
799
p_b_eclass(p, cs)
800
struct parse *p;
801
cset *cs;
802
{
803
char c;
804
805
c = p_b_coll_elem(p, '=');
806
CHadd(cs, c);
807
}
808
809
/*
810
- p_b_symbol - parse a character or [..]ed multicharacter collating symbol
811
*/
812
static char /* value of symbol */
813
p_b_symbol(p)
814
struct parse *p;
815
{
816
char value;
817
818
REQUIRE(MORE(), REG_EBRACK);
819
if (!EATTWO('[', '.'))
820
return(GETNEXT());
821
822
/* collating symbol */
823
value = p_b_coll_elem(p, '.');
824
REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
825
return(value);
826
}
827
828
/* character-name table */
829
static struct cname {
830
char *name;
831
char code;
832
} cnames[] = {
833
{"NUL", '\0'},
834
{"SOH", '\001'},
835
{"STX", '\002'},
836
{"ETX", '\003'},
837
{"EOT", '\004'},
838
{"ENQ", '\005'},
839
{"ACK", '\006'},
840
{"BEL", '\007'},
841
{"alert", '\007'},
842
{"BS", '\010'},
843
{"backspace", '\b'},
844
{"HT", '\011'},
845
{"tab", '\t'},
846
{"LF", '\012'},
847
{"newline", '\n'},
848
{"VT", '\013'},
849
{"vertical-tab", '\v'},
850
{"FF", '\014'},
851
{"form-feed", '\f'},
852
{"CR", '\015'},
853
{"carriage-return", '\r'},
854
{"SO", '\016'},
855
{"SI", '\017'},
856
{"DLE", '\020'},
857
{"DC1", '\021'},
858
{"DC2", '\022'},
859
{"DC3", '\023'},
860
{"DC4", '\024'},
861
{"NAK", '\025'},
862
{"SYN", '\026'},
863
{"ETB", '\027'},
864
{"CAN", '\030'},
865
{"EM", '\031'},
866
{"SUB", '\032'},
867
{"ESC", '\033'},
868
{"IS4", '\034'},
869
{"FS", '\034'},
870
{"IS3", '\035'},
871
{"GS", '\035'},
872
{"IS2", '\036'},
873
{"RS", '\036'},
874
{"IS1", '\037'},
875
{"US", '\037'},
876
{"space", ' '},
877
{"exclamation-mark", '!'},
878
{"quotation-mark", '"'},
879
{"number-sign", '#'},
880
{"dollar-sign", '$'},
881
{"percent-sign", '%'},
882
{"ampersand", '&'},
883
{"apostrophe", '\''},
884
{"left-parenthesis", '('},
885
{"right-parenthesis", ')'},
886
{"asterisk", '*'},
887
{"plus-sign", '+'},
888
{"comma", ','},
889
{"hyphen", '-'},
890
{"hyphen-minus", '-'},
891
{"period", '.'},
892
{"full-stop", '.'},
893
{"slash", '/'},
894
{"solidus", '/'},
895
{"zero", '0'},
896
{"one", '1'},
897
{"two", '2'},
898
{"three", '3'},
899
{"four", '4'},
900
{"five", '5'},
901
{"six", '6'},
902
{"seven", '7'},
903
{"eight", '8'},
904
{"nine", '9'},
905
{"colon", ':'},
906
{"semicolon", ';'},
907
{"less-than-sign", '<'},
908
{"equals-sign", '='},
909
{"greater-than-sign", '>'},
910
{"question-mark", '?'},
911
{"commercial-at", '@'},
912
{"left-square-bracket", '['},
913
{"backslash", '\\'},
914
{"reverse-solidus", '\\'},
915
{"right-square-bracket", ']'},
916
{"circumflex", '^'},
917
{"circumflex-accent", '^'},
918
{"underscore", '_'},
919
{"low-line", '_'},
920
{"grave-accent", '`'},
921
{"left-brace", '{'},
922
{"left-curly-bracket", '{'},
923
{"vertical-line", '|'},
924
{"right-brace", '}'},
925
{"right-curly-bracket", '}'},
926
{"tilde", '~'},
927
{"DEL", '\177'},
928
{NULL, 0},
929
};
930
931
/*
932
- p_b_coll_elem - parse a collating-element name and look it up
933
*/
934
static char /* value of collating element */
935
p_b_coll_elem(p, endc)
936
struct parse *p;
937
int endc; /* name ended by endc,']' */
938
{
939
char *sp = p->next;
940
struct cname *cp;
941
size_t len;
942
943
while (MORE() && !SEETWO(endc, ']'))
944
NEXT();
945
if (!MORE()) {
946
seterr(p, REG_EBRACK);
947
return(0);
948
}
949
len = (size_t)(p->next - sp);
950
for (cp = cnames; cp->name != NULL; cp++)
951
if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
952
return(cp->code); /* known name */
953
if (len == 1)
954
return(*sp); /* single character */
955
seterr(p, REG_ECOLLATE); /* neither */
956
return(0);
957
}
958
959
/*
960
- othercase - return the case counterpart of an alphabetic
961
*/
962
static char /* if no counterpart, return ch */
963
othercase(ch)
964
int ch;
965
{
966
assert(isalpha(ch));
967
if (isupper(ch))
968
return((char)tolower(ch));
969
else if (islower(ch))
970
return((char)toupper(ch));
971
else /* peculiar, but could happen */
972
return((char)ch);
973
}
974
975
/*
976
- bothcases - emit a dualcase version of a two-case character
977
*
978
* Boy, is this implementation ever a kludge...
979
*/
980
static void
981
bothcases(p, ch)
982
struct parse *p;
983
int ch;
984
{
985
char *oldnext = p->next;
986
char *oldend = p->end;
987
char bracket[3];
988
989
assert(othercase(ch) != ch); /* p_bracket() would recurse */
990
p->next = bracket;
991
p->end = bracket+2;
992
bracket[0] = (char)ch;
993
bracket[1] = ']';
994
bracket[2] = '\0';
995
p_bracket(p);
996
assert(p->next == bracket+2);
997
p->next = oldnext;
998
p->end = oldend;
999
}
1000
1001
/*
1002
- ordinary - emit an ordinary character
1003
*/
1004
static void
1005
ordinary(p, ch)
1006
struct parse *p;
1007
int ch;
1008
{
1009
cat_t *cap = p->g->categories;
1010
1011
if ((p->g->cflags&REG_ICASE) && isalpha(ch) && othercase(ch) != ch)
1012
bothcases(p, ch);
1013
else {
1014
EMIT(OCHAR, (sopno)ch);
1015
if (cap[ch] == 0)
1016
cap[ch] = (cat_t)(p->g->ncategories++);
1017
}
1018
}
1019
1020
/*
1021
- nonnewline - emit REG_NEWLINE version of OANY
1022
*
1023
* Boy, is this implementation ever a kludge...
1024
*/
1025
static void
1026
nonnewline(p)
1027
struct parse *p;
1028
{
1029
char *oldnext = p->next;
1030
char *oldend = p->end;
1031
char bracket[4];
1032
1033
p->next = bracket;
1034
p->end = bracket+3;
1035
bracket[0] = '^';
1036
bracket[1] = '\n';
1037
bracket[2] = ']';
1038
bracket[3] = '\0';
1039
p_bracket(p);
1040
assert(p->next == bracket+3);
1041
p->next = oldnext;
1042
p->end = oldend;
1043
}
1044
1045
/*
1046
- repeat - generate code for a bounded repetition, recursively if needed
1047
*/
1048
static void
1049
repeat(p, start, from, to)
1050
struct parse *p;
1051
sopno start; /* operand from here to end of strip */
1052
int from; /* repeated from this number */
1053
int to; /* to this number of times (maybe INFINITY) */
1054
{
1055
sopno finish = HERE();
1056
# define N 2
1057
# define INF 3
1058
# define REP(f, t) ((f)*8 + (t))
1059
# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1060
sopno copy;
1061
1062
if (p->error != 0) /* head off possible runaway recursion */
1063
return;
1064
1065
assert(from <= to);
1066
1067
switch (REP(MAP(from), MAP(to))) {
1068
case REP(0, 0): /* must be user doing this */
1069
DROP(finish-start); /* drop the operand */
1070
break;
1071
case REP(0, 1): /* as x{1,1}? */
1072
case REP(0, N): /* as x{1,n}? */
1073
case REP(0, INF): /* as x{1,}? */
1074
/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1075
INSERT(OCH_, start); /* offset is wrong... */
1076
repeat(p, start+1, 1, to);
1077
ASTERN(OOR1, start);
1078
AHEAD(start); /* ... fix it */
1079
EMIT(OOR2, 0);
1080
AHEAD(THERE());
1081
ASTERN(O_CH, THERETHERE());
1082
break;
1083
case REP(1, 1): /* trivial case */
1084
/* done */
1085
break;
1086
case REP(1, N): /* as x?x{1,n-1} */
1087
/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1088
INSERT(OCH_, start);
1089
ASTERN(OOR1, start);
1090
AHEAD(start);
1091
EMIT(OOR2, 0); /* offset very wrong... */
1092
AHEAD(THERE()); /* ...so fix it */
1093
ASTERN(O_CH, THERETHERE());
1094
copy = dupl(p, start+1, finish+1);
1095
assert(copy == finish+4);
1096
repeat(p, copy, 1, to-1);
1097
break;
1098
case REP(1, INF): /* as x+ */
1099
INSERT(OPLUS_, start);
1100
ASTERN(O_PLUS, start);
1101
break;
1102
case REP(N, N): /* as xx{m-1,n-1} */
1103
copy = dupl(p, start, finish);
1104
repeat(p, copy, from-1, to-1);
1105
break;
1106
case REP(N, INF): /* as xx{n-1,INF} */
1107
copy = dupl(p, start, finish);
1108
repeat(p, copy, from-1, to);
1109
break;
1110
default: /* "can't happen" */
1111
seterr(p, REG_ASSERT); /* just in case */
1112
break;
1113
}
1114
}
1115
1116
/*
1117
- seterr - set an error condition
1118
*/
1119
static void
1120
seterr(p, e)
1121
struct parse *p;
1122
int e;
1123
{
1124
if (p->error == 0) /* keep earliest error condition */
1125
p->error = e;
1126
p->next = nuls; /* try to bring things to a halt */
1127
p->end = nuls;
1128
}
1129
1130
/*
1131
- allocset - allocate a set of characters for []
1132
*/
1133
static cset *
1134
allocset(p)
1135
struct parse *p;
1136
{
1137
int no = p->g->ncsets++;
1138
int nc;
1139
int nbytes;
1140
cset *cs;
1141
int css = p->g->csetsize;
1142
int i;
1143
1144
if (no >= p->ncsalloc) { /* need another column of space */
1145
p->ncsalloc += CHAR_BIT;
1146
nc = p->ncsalloc;
1147
assert(nc % CHAR_BIT == 0);
1148
nbytes = nc / CHAR_BIT * css;
1149
if (p->g->sets == NULL)
1150
p->g->sets = (cset *)malloc((size_t)nc * sizeof(cset));
1151
else
1152
p->g->sets = (cset *)realloc((char *)p->g->sets,
1153
(size_t)nc * sizeof(cset));
1154
if (p->g->setbits == NULL)
1155
p->g->setbits = (uch *)malloc((size_t)nbytes);
1156
else {
1157
p->g->setbits = (uch *)realloc((char *)p->g->setbits,
1158
(size_t)nbytes);
1159
/* xxx this isn't right if setbits is now NULL */
1160
for (i = 0; i < no; i++)
1161
p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1162
}
1163
if (p->g->sets != NULL && p->g->setbits != NULL)
1164
(void) memset((char *)p->g->setbits + (nbytes - css),
1165
0, (size_t)css);
1166
else {
1167
no = 0;
1168
seterr(p, REG_ESPACE);
1169
/* caller's responsibility not to do set ops */
1170
}
1171
}
1172
1173
assert(p->g->sets != NULL); /* xxx */
1174
cs = &p->g->sets[no];
1175
cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1176
cs->mask = (uch) (1 << ((no) % CHAR_BIT));
1177
cs->hash = 0;
1178
cs->smultis = 0;
1179
cs->multis = NULL;
1180
1181
return(cs);
1182
}
1183
1184
/*
1185
- freeset - free a now-unused set
1186
*/
1187
static void
1188
freeset(p, cs)
1189
struct parse *p;
1190
cset *cs;
1191
{
1192
size_t i;
1193
cset *top = &p->g->sets[p->g->ncsets];
1194
size_t css = (size_t)p->g->csetsize;
1195
1196
for (i = 0; i < css; i++)
1197
CHsub(cs, i);
1198
if (cs == top-1) /* recover only the easy case */
1199
p->g->ncsets--;
1200
}
1201
1202
/*
1203
- freezeset - final processing on a set of characters
1204
*
1205
* The main task here is merging identical sets. This is usually a waste
1206
* of time (although the hash code minimizes the overhead), but can win
1207
* big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
1208
* is done using addition rather than xor -- all ASCII [aA] sets xor to
1209
* the same value!
1210
*/
1211
static int /* set number */
1212
freezeset(p, cs)
1213
struct parse *p;
1214
cset *cs;
1215
{
1216
uch h = cs->hash;
1217
size_t i;
1218
cset *top = &p->g->sets[p->g->ncsets];
1219
cset *cs2;
1220
size_t css = (size_t)p->g->csetsize;
1221
1222
/* look for an earlier one which is the same */
1223
for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1224
if (cs2->hash == h && cs2 != cs) {
1225
/* maybe */
1226
for (i = 0; i < css; i++)
1227
if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1228
break; /* no */
1229
if (i == css)
1230
break; /* yes */
1231
}
1232
1233
if (cs2 < top) { /* found one */
1234
freeset(p, cs);
1235
cs = cs2;
1236
}
1237
1238
return((int)(cs - p->g->sets));
1239
}
1240
1241
/*
1242
- firstch - return first character in a set (which must have at least one)
1243
*/
1244
static int /* character; there is no "none" value */
1245
firstch(p, cs)
1246
struct parse *p;
1247
cset *cs;
1248
{
1249
size_t i;
1250
size_t css = (size_t)p->g->csetsize;
1251
1252
for (i = 0; i < css; i++)
1253
if (CHIN(cs, i))
1254
return((char)i);
1255
assert(never);
1256
return(0); /* arbitrary */
1257
}
1258
1259
/*
1260
- nch - number of characters in a set
1261
*/
1262
static int
1263
nch(p, cs)
1264
struct parse *p;
1265
cset *cs;
1266
{
1267
size_t i;
1268
size_t css = (size_t)p->g->csetsize;
1269
int n = 0;
1270
1271
for (i = 0; i < css; i++)
1272
if (CHIN(cs, i))
1273
n++;
1274
return(n);
1275
}
1276
1277
/*
1278
- mcadd - add a collating element to a cset
1279
*/
1280
static void
1281
mcadd(p, cs, cp)
1282
struct parse *p;
1283
cset *cs;
1284
char *cp;
1285
{
1286
size_t oldend = cs->smultis;
1287
1288
cs->smultis += strlen(cp) + 1;
1289
if (cs->multis == NULL)
1290
cs->multis = malloc(cs->smultis);
1291
else
1292
cs->multis = realloc(cs->multis, cs->smultis);
1293
if (cs->multis == NULL) {
1294
seterr(p, REG_ESPACE);
1295
return;
1296
}
1297
1298
(void) strcpy(cs->multis + oldend - 1, cp);
1299
cs->multis[cs->smultis - 1] = '\0';
1300
}
1301
1302
#if 0
1303
/*
1304
- mcsub - subtract a collating element from a cset
1305
*/
1306
static void
1307
mcsub(cs, cp)
1308
cset *cs;
1309
char *cp;
1310
{
1311
char *fp = mcfind(cs, cp);
1312
size_t len = strlen(fp);
1313
1314
assert(fp != NULL);
1315
(void) memmove(fp, fp + len + 1,
1316
cs->smultis - (fp + len + 1 - cs->multis));
1317
cs->smultis -= len;
1318
1319
if (cs->smultis == 0) {
1320
free(cs->multis);
1321
cs->multis = NULL;
1322
return;
1323
}
1324
1325
cs->multis = realloc(cs->multis, cs->smultis);
1326
assert(cs->multis != NULL);
1327
}
1328
1329
/*
1330
- mcin - is a collating element in a cset?
1331
*/
1332
static int
1333
mcin(cs, cp)
1334
cset *cs;
1335
char *cp;
1336
{
1337
return(mcfind(cs, cp) != NULL);
1338
}
1339
1340
/*
1341
- mcfind - find a collating element in a cset
1342
*/
1343
static char *
1344
mcfind(cs, cp)
1345
cset *cs;
1346
char *cp;
1347
{
1348
char *p;
1349
1350
if (cs->multis == NULL)
1351
return(NULL);
1352
for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
1353
if (strcmp(cp, p) == 0)
1354
return(p);
1355
return(NULL);
1356
}
1357
1358
/*
1359
- mcinvert - invert the list of collating elements in a cset
1360
*
1361
* This would have to know the set of possibilities. Implementation
1362
* is deferred.
1363
*/
1364
static void
1365
mcinvert(p, cs)
1366
struct parse *p;
1367
cset *cs;
1368
{
1369
assert(cs->multis == NULL); /* xxx */
1370
}
1371
1372
/*
1373
- mccase - add case counterparts of the list of collating elements in a cset
1374
*
1375
* This would have to know the set of possibilities. Implementation
1376
* is deferred.
1377
*/
1378
static void
1379
mccase(p, cs)
1380
struct parse *p;
1381
cset *cs;
1382
{
1383
assert(cs->multis == NULL); /* xxx */
1384
}
1385
#endif
1386
1387
/*
1388
- isinsets - is this character in any sets?
1389
*/
1390
static int /* predicate */
1391
isinsets(g, c)
1392
struct re_guts *g;
1393
int c;
1394
{
1395
uch *col;
1396
int i;
1397
int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1398
unsigned uc = (unsigned char)c;
1399
1400
for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1401
if (col[uc] != 0)
1402
return(1);
1403
return(0);
1404
}
1405
1406
/*
1407
- samesets - are these two characters in exactly the same sets?
1408
*/
1409
static int /* predicate */
1410
samesets(g, c1, c2)
1411
struct re_guts *g;
1412
int c1;
1413
int c2;
1414
{
1415
uch *col;
1416
int i;
1417
int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1418
unsigned uc1 = (unsigned char)c1;
1419
unsigned uc2 = (unsigned char)c2;
1420
1421
for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1422
if (col[uc1] != col[uc2])
1423
return(0);
1424
return(1);
1425
}
1426
1427
/*
1428
- categorize - sort out character categories
1429
*/
1430
static void
1431
categorize(p, g)
1432
struct parse *p;
1433
struct re_guts *g;
1434
{
1435
cat_t *cats = g->categories;
1436
int c;
1437
int c2;
1438
cat_t cat;
1439
1440
/* avoid making error situations worse */
1441
if (p->error != 0)
1442
return;
1443
1444
for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1445
if (cats[c] == 0 && isinsets(g, c)) {
1446
cat = (cat_t)g->ncategories++;
1447
cats[c] = cat;
1448
for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1449
if (cats[c2] == 0 && samesets(g, c, c2))
1450
cats[c2] = cat;
1451
}
1452
}
1453
1454
/*
1455
- dupl - emit a duplicate of a bunch of sops
1456
*/
1457
static sopno /* start of duplicate */
1458
dupl(p, start, finish)
1459
struct parse *p;
1460
sopno start; /* from here */
1461
sopno finish; /* to this less one */
1462
{
1463
sopno ret = HERE();
1464
sopno len = finish - start;
1465
1466
assert(finish >= start);
1467
if (len == 0)
1468
return(ret);
1469
enlarge(p, p->ssize + len); /* this many unexpected additions */
1470
assert(p->ssize >= p->slen + len);
1471
(void) memcpy((char *)(p->strip + p->slen),
1472
(char *)(p->strip + start), (size_t)len*sizeof(sop));
1473
p->slen += len;
1474
return(ret);
1475
}
1476
1477
/*
1478
- doemit - emit a strip operator
1479
*
1480
* It might seem better to implement this as a macro with a function as
1481
* hard-case backup, but it's just too big and messy unless there are
1482
* some changes to the data structures. Maybe later.
1483
*/
1484
static void
1485
doemit(p, op, opnd)
1486
struct parse *p;
1487
sop op;
1488
sop opnd;
1489
{
1490
/* avoid making error situations worse */
1491
if (p->error != 0)
1492
return;
1493
1494
/* deal with oversize operands ("can't happen", more or less) */
1495
assert(opnd < 1<<OPSHIFT);
1496
1497
/* deal with undersized strip */
1498
if (p->slen >= p->ssize)
1499
enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */
1500
assert(p->slen < p->ssize);
1501
1502
/* finally, it's all reduced to the easy case */
1503
p->strip[p->slen++] = SOP(op, opnd);
1504
}
1505
1506
/*
1507
- doinsert - insert a sop into the strip
1508
*/
1509
static void
1510
doinsert(p, op, opnd, pos)
1511
struct parse *p;
1512
sop op;
1513
sopno opnd;
1514
sopno pos;
1515
{
1516
sopno sn;
1517
sop s;
1518
int i;
1519
1520
/* avoid making error situations worse */
1521
if (p->error != 0)
1522
return;
1523
1524
sn = HERE();
1525
EMIT(op, opnd); /* do checks, ensure space */
1526
assert(HERE() == sn+1);
1527
s = p->strip[sn];
1528
1529
/* adjust paren pointers */
1530
assert(pos > 0);
1531
for (i = 1; i < NPAREN; i++) {
1532
if (p->pbegin[i] >= pos) {
1533
p->pbegin[i]++;
1534
}
1535
if (p->pend[i] >= pos) {
1536
p->pend[i]++;
1537
}
1538
}
1539
1540
memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1541
(size_t)(HERE()-pos-1)*sizeof(sop));
1542
p->strip[pos] = s;
1543
}
1544
1545
/*
1546
- dofwd - complete a forward reference
1547
*/
1548
static void
1549
dofwd(p, pos, value)
1550
struct parse *p;
1551
sopno pos;
1552
sop value;
1553
{
1554
/* avoid making error situations worse */
1555
if (p->error != 0)
1556
return;
1557
1558
assert(value < 1<<OPSHIFT);
1559
p->strip[pos] = OP(p->strip[pos]) | value;
1560
}
1561
1562
/*
1563
- enlarge - enlarge the strip
1564
*/
1565
static void
1566
enlarge(p, size)
1567
struct parse *p;
1568
sopno size;
1569
{
1570
sop *sp;
1571
1572
if (p->ssize >= size)
1573
return;
1574
1575
sp = (sop *)realloc(p->strip, (size_t)size*sizeof(sop));
1576
if (sp == NULL) {
1577
seterr(p, REG_ESPACE);
1578
return;
1579
}
1580
p->strip = sp;
1581
p->ssize = size;
1582
}
1583
1584
/*
1585
- stripsnug - compact the strip
1586
*/
1587
static void
1588
stripsnug(p, g)
1589
struct parse *p;
1590
struct re_guts *g;
1591
{
1592
g->nstates = p->slen;
1593
g->strip = (sop *)realloc((char *)p->strip, (size_t)p->slen * sizeof(sop));
1594
if (g->strip == NULL) {
1595
seterr(p, REG_ESPACE);
1596
g->strip = p->strip;
1597
}
1598
}
1599
1600
/*
1601
- findmust - fill in must and mlen with longest mandatory literal string
1602
*
1603
* This algorithm could do fancy things like analyzing the operands of |
1604
* for common subsequences. Someday. This code is simple and finds most
1605
* of the interesting cases.
1606
*
1607
* Note that must and mlen got initialized during setup.
1608
*/
1609
static void
1610
findmust(p, g)
1611
struct parse *p;
1612
struct re_guts *g;
1613
{
1614
sop *scan;
1615
sop *start;
1616
sop *newstart;
1617
int newlen;
1618
sop s;
1619
char *cp;
1620
sopno i;
1621
1622
/* avoid making error situations worse */
1623
if (p->error != 0)
1624
return;
1625
1626
/* find the longest OCHAR sequence in strip */
1627
newlen = 0;
1628
scan = g->strip + 1;
1629
do {
1630
s = *scan++;
1631
switch (OP(s)) {
1632
case OCHAR: /* sequence member */
1633
if (newlen == 0) /* new sequence */
1634
newstart = scan - 1;
1635
newlen++;
1636
break;
1637
case OPLUS_: /* things that don't break one */
1638
case OLPAREN:
1639
case ORPAREN:
1640
break;
1641
case OQUEST_: /* things that must be skipped */
1642
case OCH_:
1643
scan--;
1644
do {
1645
scan += OPND(s);
1646
s = *scan;
1647
/* assert() interferes w debug printouts */
1648
if (OP(s) != O_QUEST && OP(s) != O_CH &&
1649
OP(s) != OOR2) {
1650
g->iflags |= BAD;
1651
return;
1652
}
1653
} while (OP(s) != O_QUEST && OP(s) != O_CH);
1654
/* fallthrough */
1655
default: /* things that break a sequence */
1656
if (newlen > g->mlen) { /* ends one */
1657
start = newstart;
1658
g->mlen = newlen;
1659
}
1660
newlen = 0;
1661
break;
1662
}
1663
} while (OP(s) != OEND);
1664
1665
if (g->mlen == 0) /* there isn't one */
1666
return;
1667
1668
/* turn it into a character string */
1669
g->must = malloc((size_t)g->mlen + 1);
1670
if (g->must == NULL) { /* argh; just forget it */
1671
g->mlen = 0;
1672
return;
1673
}
1674
cp = g->must;
1675
scan = start;
1676
for (i = g->mlen; i > 0; i--) {
1677
while (OP(s = *scan++) != OCHAR)
1678
continue;
1679
assert(cp < g->must + g->mlen);
1680
*cp++ = (char)OPND(s);
1681
}
1682
assert(cp == g->must + g->mlen);
1683
*cp++ = '\0'; /* just on general principles */
1684
}
1685
1686
/*
1687
- pluscount - count + nesting
1688
*/
1689
static sopno /* nesting depth */
1690
pluscount(p, g)
1691
struct parse *p;
1692
struct re_guts *g;
1693
{
1694
sop *scan;
1695
sop s;
1696
sopno plusnest = 0;
1697
sopno maxnest = 0;
1698
1699
if (p->error != 0)
1700
return(0); /* there may not be an OEND */
1701
1702
scan = g->strip + 1;
1703
do {
1704
s = *scan++;
1705
switch (OP(s)) {
1706
case OPLUS_:
1707
plusnest++;
1708
break;
1709
case O_PLUS:
1710
if (plusnest > maxnest)
1711
maxnest = plusnest;
1712
plusnest--;
1713
break;
1714
}
1715
} while (OP(s) != OEND);
1716
if (plusnest != 0)
1717
g->iflags |= BAD;
1718
return(maxnest);
1719
}
1720
1721