CoCalc -- regcomp.c

GitHub Repository: srohatgi01/cups
Path: blob/master/vcnet/regex/regcomp.c
¹⁰⁹⁰ views
1
#include <assert.h>
2
#include <sys/types.h>
3
#include <stdio.h>
4
#include <string.h>
5
#include <ctype.h>
6
#include <limits.h>
7
#include <stdlib.h>
8

9
#include "regex.h"
10
#include "regex2.h"
11

12
#ifdef _POSIX2_RE_DUP_MAX
13
#define	DUPMAX	_POSIX2_RE_DUP_MAX
14
#else
15
#define	DUPMAX	255
16
#endif
17
#define	INFINITY	(DUPMAX + 1)
18

19
/*
20
 * parse structure, passed up and down to avoid global variables and
21
 * other clumsinesses
22
 */
23
struct parse {
24
	char *next;		/* next character in RE */
25
	char *end;		/* end of string (-> NUL normally) */
26
	int error;		/* has an error been seen? */
27
	sop *strip;		/* malloced strip */
28
	sopno ssize;		/* malloced strip size (allocated) */
29
	sopno slen;		/* malloced strip length (used) */
30
	int ncsalloc;		/* number of csets allocated */
31
	struct re_guts *g;
32
#	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
33
	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
34
	sopno pend[NPAREN];	/* -> ) ([0] unused) */
35
};
36

37
static char nuls[10];		/* place to point scanner in event of error */
38

39
static void p_ere(struct parse *p, int stop);
40
static void p_ere_exp(struct parse *p);
41
static void p_str(struct parse *p);
42
static void p_bre(struct parse *p, int end1,
43
	int end2);
44
static int p_simp_re(struct parse *p, int starordinary);
45
static int p_count(struct parse *p);
46
static void p_bracket(struct parse *p);
47
static void p_b_term(struct parse *p, cset *cs);
48
static void p_b_cclass(struct parse *p, cset *cs);
49
static void p_b_eclass(struct parse *p, cset *cs);
50
static char p_b_symbol(struct parse *p);
51
static char p_b_coll_elem(struct parse *p, int endc);
52
static char othercase(int ch);
53
static void bothcases(struct parse *p, int ch);
54
static void ordinary(struct parse *p, int ch);
55
static void nonnewline(struct parse *p);
56
static void repeat(struct parse *p, sopno start, int from, int to);
57
static void seterr(struct parse *p, int e);
58
static cset *allocset(struct parse *p);
59
static void freeset(struct parse *p, cset *cs);
60
static int freezeset(struct parse *p, cset *cs);
61
static int firstch(struct parse *p, cset *cs);
62
static int nch(struct parse *p, cset *cs);
63
static void mcadd(struct parse *p, cset *cs,
64
	char *cp);
65
static int isinsets(struct re_guts *g, int c);
66
static int samesets(struct re_guts *g, int c1, int c2);
67
static void categorize(struct parse *p, struct re_guts *g);
68
static sopno dupl(struct parse *p, sopno start, sopno finish);
69
static void doemit(struct parse *p, sop op, sop opnd);
70
static void doinsert(struct parse *p, sop op, sopno opnd, sopno pos);
71
static void dofwd(struct parse *p, sopno pos, sop value);
72
static void enlarge(struct parse *p, sopno size);
73
static void stripsnug(struct parse *p, struct re_guts *g);
74
static void findmust(struct parse *p, struct re_guts *g);
75
static sopno pluscount(struct parse *p, struct re_guts *g);
76

77
/*
78
 * macros for use with parse structure
79
 * BEWARE:  these know that the parse structure is named `p' !!!
80
 */
81
#define	PEEK()	(*p->next)
82
#define	PEEK2()	(*(p->next+1))
83
#define	MORE()	(p->next < p->end)
84
#define	MORE2()	(p->next+1 < p->end)
85
#define	SEE(c)	(MORE() && PEEK() == (c))
86
#define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
87
#define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
88
#define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
89
#define	NEXT()	(p->next++)
90
#define	NEXT2()	(p->next += 2)
91
#define	NEXTn(n)	(p->next += (n))
92
#define	GETNEXT()	(*p->next++)
93
#define	REQUIRE(co, e)	((co) ? (void) 0 : seterr(p, (e)))
94
#define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
95
#define	EMIT(op, sopnd)	doemit(p, (sop)(op), (sop)(sopnd))
96
#define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
97
#define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
98
#define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
99
#define	HERE()		(p->slen)
100
#define	THERE()		(p->slen - 1)
101
#define	THERETHERE()	(p->slen - 2)
102
#define	DROP(n)	(p->slen -= (n))
103

104
#ifndef NDEBUG
105
static int never = 0;		/* for use in asserts; shuts lint up */
106
#else
107
#define	never	0		/* some <assert.h>s have bugs too */
108
#endif
109

110
int				/* 0 success, otherwise REG_something */
111
regcomp(preg, pattern, cflags)
112
regex_t *preg;
113
const char *pattern;
114
int cflags;
115
{
116
	struct parse pa;
117
	struct re_guts *g;
118
	struct parse *p = &pa;
119
	int i;
120
	size_t len;
121

122
	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
123
		return(REG_INVARG);
124

125
	if (cflags&REG_PEND) {
126
		if (preg->re_endp < pattern)
127
			return(REG_INVARG);
128
		len = (size_t) (preg->re_endp - pattern);
129
	} else
130
		len = strlen((char *)pattern);
131

132
	/* do the mallocs early so failure handling is easy */
133
	g = (struct re_guts *)malloc(sizeof(struct re_guts) +
134
							(NC-1)*sizeof(cat_t));
135
	if (g == NULL)
136
		return(REG_ESPACE);
137
	{
138
	  /* Patched for CERT Vulnerability Note VU#695940, Feb 2015. */
139
	  size_t new_ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
140
	  if (new_ssize < len || new_ssize > LONG_MAX / sizeof(sop)) {
141
	    free((char *) g);
142
	    return REG_INVARG;
143
	  }
144
	  p->ssize = (sopno)new_ssize;
145
	}
146
	p->strip = (sop *)malloc((size_t)p->ssize * sizeof(sop));
147
	p->slen = 0;
148
	if (p->strip == NULL) {
149
		free((char *)g);
150
		return(REG_ESPACE);
151
	}
152

153
	/* set things up */
154
	p->g = g;
155
	p->next = (char *)pattern;	/* convenience; we do not modify it */
156
	p->end = p->next + len;
157
	p->error = 0;
158
	p->ncsalloc = 0;
159
	for (i = 0; i < NPAREN; i++) {
160
		p->pbegin[i] = 0;
161
		p->pend[i] = 0;
162
	}
163
	g->csetsize = NC;
164
	g->sets = NULL;
165
	g->setbits = NULL;
166
	g->ncsets = 0;
167
	g->cflags = cflags;
168
	g->iflags = 0;
169
	g->nbol = 0;
170
	g->neol = 0;
171
	g->must = NULL;
172
	g->mlen = 0;
173
	g->nsub = 0;
174
	g->ncategories = 1;	/* category 0 is "everything else" */
175
	g->categories = &g->catspace[-(CHAR_MIN)];
176
	(void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
177
	g->backrefs = 0;
178

179
	/* do it */
180
	EMIT(OEND, 0);
181
	g->firststate = THERE();
182
	if (cflags&REG_EXTENDED)
183
		p_ere(p, OUT);
184
	else if (cflags&REG_NOSPEC)
185
		p_str(p);
186
	else
187
		p_bre(p, OUT, OUT);
188
	EMIT(OEND, 0);
189
	g->laststate = THERE();
190

191
	/* tidy up loose ends and fill things in */
192
	categorize(p, g);
193
	stripsnug(p, g);
194
	findmust(p, g);
195
	g->nplus = pluscount(p, g);
196
	g->magic = MAGIC2;
197
	preg->re_nsub = g->nsub;
198
	preg->re_g = g;
199
	preg->re_magic = MAGIC1;
200
#ifndef REDEBUG
201
	/* not debugging, so can't rely on the assert() in regexec() */
202
	if (g->iflags&BAD)
203
		seterr(p, REG_ASSERT);
204
#endif
205

206
	/* win or lose, we're done */
207
	if (p->error != 0)	/* lose */
208
		regfree(preg);
209
	return(p->error);
210
}
211

212
/*
213
 - p_ere - ERE parser top level, concatenation and alternation
214
 */
215
static void
216
p_ere(p, stop)
217
struct parse *p;
218
int stop;			/* character this ERE should end at */
219
{
220
	char c;
221
	sopno prevback;
222
	sopno prevfwd;
223
	sopno conc;
224
	int first = 1;		/* is this the first alternative? */
225

226
	for (;;) {
227
		/* do a bunch of concatenated expressions */
228
		conc = HERE();
229
		while (MORE() && (c = PEEK()) != '|' && c != stop)
230
			p_ere_exp(p);
231
		REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
232

233
		if (!EAT('|'))
234
			break;		/* NOTE BREAK OUT */
235

236
		if (first) {
237
			INSERT(OCH_, conc);	/* offset is wrong */
238
			prevfwd = conc;
239
			prevback = conc;
240
			first = 0;
241
		}
242
		ASTERN(OOR1, prevback);
243
		prevback = THERE();
244
		AHEAD(prevfwd);			/* fix previous offset */
245
		prevfwd = HERE();
246
		EMIT(OOR2, 0);			/* offset is very wrong */
247
	}
248

249
	if (!first) {		/* tail-end fixups */
250
		AHEAD(prevfwd);
251
		ASTERN(O_CH, prevback);
252
	}
253

254
	assert(!MORE() || SEE(stop));
255
}
256

257
/*
258
 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
259
 */
260
static void
261
p_ere_exp(p)
262
struct parse *p;
263
{
264
	char c;
265
	sopno pos;
266
	int count;
267
	int count2;
268
	sopno subno;
269
	int wascaret = 0;
270

271
	assert(MORE());		/* caller should have ensured this */
272
	c = GETNEXT();
273

274
	pos = HERE();
275
	switch (c) {
276
	case '(':
277
		REQUIRE(MORE(), REG_EPAREN);
278
		p->g->nsub++;
279
		subno = (sopno)p->g->nsub;
280
		if (subno < NPAREN)
281
			p->pbegin[subno] = HERE();
282
		EMIT(OLPAREN, subno);
283
		if (!SEE(')'))
284
			p_ere(p, ')');
285
		if (subno < NPAREN) {
286
			p->pend[subno] = HERE();
287
			assert(p->pend[subno] != 0);
288
		}
289
		EMIT(ORPAREN, subno);
290
		MUSTEAT(')', REG_EPAREN);
291
		break;
292
#ifndef POSIX_MISTAKE
293
	case ')':		/* happens only if no current unmatched ( */
294
		/*
295
		 * You may ask, why the ifndef?  Because I didn't notice
296
		 * this until slightly too late for 1003.2, and none of the
297
		 * other 1003.2 regular-expression reviewers noticed it at
298
		 * all.  So an unmatched ) is legal POSIX, at least until
299
		 * we can get it fixed.
300
		 */
301
		seterr(p, REG_EPAREN);
302
		break;
303
#endif
304
	case '^':
305
		EMIT(OBOL, 0);
306
		p->g->iflags |= USEBOL;
307
		p->g->nbol++;
308
		wascaret = 1;
309
		break;
310
	case '$':
311
		EMIT(OEOL, 0);
312
		p->g->iflags |= USEEOL;
313
		p->g->neol++;
314
		break;
315
	case '|':
316
		seterr(p, REG_EMPTY);
317
		break;
318
	case '*':
319
	case '+':
320
	case '?':
321
		seterr(p, REG_BADRPT);
322
		break;
323
	case '.':
324
		if (p->g->cflags&REG_NEWLINE)
325
			nonnewline(p);
326
		else
327
			EMIT(OANY, 0);
328
		break;
329
	case '[':
330
		p_bracket(p);
331
		break;
332
	case '\\':
333
		REQUIRE(MORE(), REG_EESCAPE);
334
		c = GETNEXT();
335
		ordinary(p, c);
336
		break;
337
	case '{':		/* okay as ordinary except if digit follows */
338
		REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT);
339
		/* FALLTHROUGH */
340
	default:
341
		ordinary(p, c);
342
		break;
343
	}
344

345
	if (!MORE())
346
		return;
347
	c = PEEK();
348
	/* we call { a repetition if followed by a digit */
349
	if (!( c == '*' || c == '+' || c == '?' ||
350
				(c == '{' && MORE2() && isdigit(PEEK2())) ))
351
		return;		/* no repetition, we're done */
352
	NEXT();
353

354
	REQUIRE(!wascaret, REG_BADRPT);
355
	switch (c) {
356
	case '*':	/* implemented as +? */
357
		/* this case does not require the (y|) trick, noKLUDGE */
358
		INSERT(OPLUS_, pos);
359
		ASTERN(O_PLUS, pos);
360
		INSERT(OQUEST_, pos);
361
		ASTERN(O_QUEST, pos);
362
		break;
363
	case '+':
364
		INSERT(OPLUS_, pos);
365
		ASTERN(O_PLUS, pos);
366
		break;
367
	case '?':
368
		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
369
		INSERT(OCH_, pos);		/* offset slightly wrong */
370
		ASTERN(OOR1, pos);		/* this one's right */
371
		AHEAD(pos);			/* fix the OCH_ */
372
		EMIT(OOR2, 0);			/* offset very wrong... */
373
		AHEAD(THERE());			/* ...so fix it */
374
		ASTERN(O_CH, THERETHERE());
375
		break;
376
	case '{':
377
		count = p_count(p);
378
		if (EAT(',')) {
379
			if (isdigit(PEEK())) {
380
				count2 = p_count(p);
381
				REQUIRE(count <= count2, REG_BADBR);
382
			} else		/* single number with comma */
383
				count2 = INFINITY;
384
		} else		/* just a single number */
385
			count2 = count;
386
		repeat(p, pos, count, count2);
387
		if (!EAT('}')) {	/* error heuristics */
388
			while (MORE() && PEEK() != '}')
389
				NEXT();
390
			REQUIRE(MORE(), REG_EBRACE);
391
			seterr(p, REG_BADBR);
392
		}
393
		break;
394
	}
395

396
	if (!MORE())
397
		return;
398
	c = PEEK();
399
	if (!( c == '*' || c == '+' || c == '?' ||
400
				(c == '{' && MORE2() && isdigit(PEEK2())) ) )
401
		return;
402
	seterr(p, REG_BADRPT);
403
}
404

405
/*
406
 - p_str - string (no metacharacters) "parser"
407
 */
408
static void
409
p_str(p)
410
struct parse *p;
411
{
412
	REQUIRE(MORE(), REG_EMPTY);
413
	while (MORE())
414
		ordinary(p, GETNEXT());
415
}
416

417
/*
418
 - p_bre - BRE parser top level, anchoring and concatenation
419
 * Giving end1 as OUT essentially eliminates the end1/end2 check.
420
 *
421
 * This implementation is a bit of a kludge, in that a trailing $ is first
422
 * taken as an ordinary character and then revised to be an anchor.  The
423
 * only undesirable side effect is that '$' gets included as a character
424
 * category in such cases.  This is fairly harmless; not worth fixing.
425
 * The amount of lookahead needed to avoid this kludge is excessive.
426
 */
427
static void
428
p_bre(p, end1, end2)
429
struct parse *p;
430
int end1;		/* first terminating character */
431
int end2;		/* second terminating character */
432
{
433
	sopno start = HERE();
434
	int first = 1;			/* first subexpression? */
435
	int wasdollar = 0;
436

437
	if (EAT('^')) {
438
		EMIT(OBOL, 0);
439
		p->g->iflags |= USEBOL;
440
		p->g->nbol++;
441
	}
442
	while (MORE() && !SEETWO(end1, end2)) {
443
		wasdollar = p_simp_re(p, first);
444
		first = 0;
445
	}
446
	if (wasdollar) {	/* oops, that was a trailing anchor */
447
		DROP(1);
448
		EMIT(OEOL, 0);
449
		p->g->iflags |= USEEOL;
450
		p->g->neol++;
451
	}
452

453
	REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */
454
}
455

456
/*
457
 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
458
 */
459
static int			/* was the simple RE an unbackslashed $? */
460
p_simp_re(p, starordinary)
461
struct parse *p;
462
int starordinary;		/* is a leading * an ordinary character? */
463
{
464
	int c;
465
	int count;
466
	int count2;
467
	sopno pos;
468
	int i;
469
	sopno subno;
470
#	define	BACKSL	(1<<CHAR_BIT)
471

472
	pos = HERE();		/* repetion op, if any, covers from here */
473

474
	assert(MORE());		/* caller should have ensured this */
475
	c = GETNEXT();
476
	if (c == '\\') {
477
		REQUIRE(MORE(), REG_EESCAPE);
478
		c = BACKSL | (unsigned char)GETNEXT();
479
	}
480
	switch (c) {
481
	case '.':
482
		if (p->g->cflags&REG_NEWLINE)
483
			nonnewline(p);
484
		else
485
			EMIT(OANY, 0);
486
		break;
487
	case '[':
488
		p_bracket(p);
489
		break;
490
	case BACKSL|'{':
491
		seterr(p, REG_BADRPT);
492
		break;
493
	case BACKSL|'(':
494
		p->g->nsub++;
495
		subno = (sopno)p->g->nsub;
496
		if (subno < NPAREN)
497
			p->pbegin[subno] = HERE();
498
		EMIT(OLPAREN, subno);
499
		/* the MORE here is an error heuristic */
500
		if (MORE() && !SEETWO('\\', ')'))
501
			p_bre(p, '\\', ')');
502
		if (subno < NPAREN) {
503
			p->pend[subno] = HERE();
504
			assert(p->pend[subno] != 0);
505
		}
506
		EMIT(ORPAREN, subno);
507
		REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
508
		break;
509
	case BACKSL|')':	/* should not get here -- must be user */
510
	case BACKSL|'}':
511
		seterr(p, REG_EPAREN);
512
		break;
513
	case BACKSL|'1':
514
	case BACKSL|'2':
515
	case BACKSL|'3':
516
	case BACKSL|'4':
517
	case BACKSL|'5':
518
	case BACKSL|'6':
519
	case BACKSL|'7':
520
	case BACKSL|'8':
521
	case BACKSL|'9':
522
		i = (c&~BACKSL) - '0';
523
		assert(i < NPAREN);
524
		if (p->pend[i] != 0) {
525
			assert((size_t) i <= p->g->nsub);
526
			EMIT(OBACK_, i);
527
			assert(p->pbegin[i] != 0);
528
			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
529
			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
530
			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
531
			EMIT(O_BACK, i);
532
		} else
533
			seterr(p, REG_ESUBREG);
534
		p->g->backrefs = 1;
535
		break;
536
	case '*':
537
		REQUIRE(starordinary, REG_BADRPT);
538
		/* FALLTHROUGH */
539
	default:
540
		ordinary(p, (char)c);	/* takes off BACKSL, if any */
541
		break;
542
	}
543

544
	if (EAT('*')) {		/* implemented as +? */
545
		/* this case does not require the (y|) trick, noKLUDGE */
546
		INSERT(OPLUS_, pos);
547
		ASTERN(O_PLUS, pos);
548
		INSERT(OQUEST_, pos);
549
		ASTERN(O_QUEST, pos);
550
	} else if (EATTWO('\\', '{')) {
551
		count = p_count(p);
552
		if (EAT(',')) {
553
			if (MORE() && isdigit(PEEK())) {
554
				count2 = p_count(p);
555
				REQUIRE(count <= count2, REG_BADBR);
556
			} else		/* single number with comma */
557
				count2 = INFINITY;
558
		} else		/* just a single number */
559
			count2 = count;
560
		repeat(p, pos, count, count2);
561
		if (!EATTWO('\\', '}')) {	/* error heuristics */
562
			while (MORE() && !SEETWO('\\', '}'))
563
				NEXT();
564
			REQUIRE(MORE(), REG_EBRACE);
565
			seterr(p, REG_BADBR);
566
		}
567
	} else if (c == (unsigned char)'$')	/* $ (but not \$) ends it */
568
		return(1);
569

570
	return(0);
571
}
572

573
/*
574
 - p_count - parse a repetition count
575
 */
576
static int			/* the value */
577
p_count(p)
578
struct parse *p;
579
{
580
	int count = 0;
581
	int ndigits = 0;
582

583
	while (MORE() && isdigit(PEEK()) && count <= DUPMAX) {
584
		count = count*10 + (GETNEXT() - '0');
585
		ndigits++;
586
	}
587

588
	REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
589
	return(count);
590
}
591

592
/*
593
 - p_bracket - parse a bracketed character list
594
 *
595
 * Note a significant property of this code:  if the allocset() did seterr,
596
 * no set operations are done.
597
 */
598
static void
599
p_bracket(p)
600
struct parse *p;
601
{
602
	cset *cs = allocset(p);
603
	int invert = 0;
604

605
	/* Dept of Truly Sickening Special-Case Kludges */
606
	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
607
		EMIT(OBOW, 0);
608
		NEXTn(6);
609
		return;
610
	}
611
	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
612
		EMIT(OEOW, 0);
613
		NEXTn(6);
614
		return;
615
	}
616

617
	if (EAT('^'))
618
		invert++;	/* make note to invert set at end */
619
	if (EAT(']'))
620
		CHadd(cs, ']');
621
	else if (EAT('-'))
622
		CHadd(cs, '-');
623
	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
624
		p_b_term(p, cs);
625
	if (EAT('-'))
626
		CHadd(cs, '-');
627
	MUSTEAT(']', REG_EBRACK);
628

629
	if (p->error != 0)	/* don't mess things up further */
630
		return;
631

632
	if (p->g->cflags&REG_ICASE) {
633
		int i;
634
		int ci;
635

636
		for (i = p->g->csetsize - 1; i >= 0; i--)
637
			if (CHIN(cs, i) && isalpha(i)) {
638
				ci = othercase(i);
639
				if (ci != i)
640
					CHadd(cs, ci);
641
			}
642
		assert(cs->multis == NULL);	/* xxx */
643
#if 0		
644
		if (cs->multis != NULL)
645
			mccase(p, cs);
646
#endif
647
	}
648
	if (invert) {
649
		int i;
650

651
		for (i = p->g->csetsize - 1; i >= 0; i--)
652
			if (CHIN(cs, i))
653
				CHsub(cs, i);
654
			else
655
				CHadd(cs, i);
656
		if (p->g->cflags&REG_NEWLINE)
657
			CHsub(cs, '\n');
658
		assert(cs->multis == NULL);	/* xxx */
659
#if 0
660
		if (cs->multis != NULL)
661
			mcinvert(p, cs);
662
#endif
663
	}
664

665
	assert(cs->multis == NULL);		/* xxx */
666

667
	if (nch(p, cs) == 1) {		/* optimize singleton sets */
668
		ordinary(p, firstch(p, cs));
669
		freeset(p, cs);
670
	} else
671
		EMIT(OANYOF, freezeset(p, cs));
672
}
673

674
/*
675
 - p_b_term - parse one term of a bracketed character list
676
 */
677
static void
678
p_b_term(p, cs)
679
struct parse *p;
680
cset *cs;
681
{
682
	char c;
683
	char start, finish;
684
	int i;
685

686
	/* classify what we've got */
687
	switch ((MORE()) ? PEEK() : '\0') {
688
	case '[':
689
		c = (MORE2()) ? PEEK2() : '\0';
690
		break;
691
	case '-':
692
		seterr(p, REG_ERANGE);
693
		return;			/* NOTE RETURN */
694
		break;
695
	default:
696
		c = '\0';
697
		break;
698
	}
699

700
	switch (c) {
701
	case ':':		/* character class */
702
		NEXT2();
703
		REQUIRE(MORE(), REG_EBRACK);
704
		c = PEEK();
705
		REQUIRE(c != '-' && c != ']', REG_ECTYPE);
706
		p_b_cclass(p, cs);
707
		REQUIRE(MORE(), REG_EBRACK);
708
		REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
709
		break;
710
	case '=':		/* equivalence class */
711
		NEXT2();
712
		REQUIRE(MORE(), REG_EBRACK);
713
		c = PEEK();
714
		REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
715
		p_b_eclass(p, cs);
716
		REQUIRE(MORE(), REG_EBRACK);
717
		REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
718
		break;
719
	default:		/* symbol, ordinary character, or range */
720
/* xxx revision needed for multichar stuff */
721
		start = p_b_symbol(p);
722
		if (SEE('-') && MORE2() && PEEK2() != ']') {
723
			/* range */
724
			NEXT();
725
			if (EAT('-'))
726
				finish = '-';
727
			else
728
				finish = p_b_symbol(p);
729
		} else
730
			finish = start;
731
/* xxx what about signed chars here... */
732
		REQUIRE(start <= finish, REG_ERANGE);
733
		for (i = start; i <= finish; i++)
734
			CHadd(cs, i);
735
		break;
736
	}
737
}
738

739
/* Character-class table. */
740
static struct cclass {
741
	char *name;
742
	char *chars;
743
	char *multis;
744
} cclasses[] = {
745
	{"alnum", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", ""},
746
	{"alpha", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", ""},
747
	{"blank", " \t", ""},
748
	{"cntrl", "\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\25\26\27\30\31\32\33\34\35\36\37\177", ""},
749
	{"digit", "0123456789", ""},
750
	{"graph", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", ""},
751
	{"lower", "abcdefghijklmnopqrstuvwxyz",	""},
752
	{"print", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ", ""},
753
	{"punct", "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",	""},
754
	{"space", "\t\n\v\f\r ", ""},
755
	{"upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ",	""},
756
	{"xdigit", "0123456789ABCDEFabcdef", ""},
757
	{NULL, 0, ""}
758
};
759

760
/*
761
 - p_b_cclass - parse a character-class name and deal with it
762
 */
763
static void
764
p_b_cclass(p, cs)
765
struct parse *p;
766
cset *cs;
767
{
768
	char *sp = p->next;
769
	struct cclass *cp;
770
	size_t len;
771
	char *u;
772
	char c;
773

774
	while (MORE() && isalpha(PEEK()))
775
		NEXT();
776
	len = (size_t)(p->next - sp);
777
	for (cp = cclasses; cp->name != NULL; cp++)
778
		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
779
			break;
780
	if (cp->name == NULL) {
781
		/* oops, didn't find it */
782
		seterr(p, REG_ECTYPE);
783
		return;
784
	}
785

786
	u = cp->chars;
787
	while ((c = *u++) != '\0')
788
		CHadd(cs, c);
789
	for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
790
		MCadd(p, cs, u);
791
}
792

793
/*
794
 - p_b_eclass - parse an equivalence-class name and deal with it
795
 *
796
 * This implementation is incomplete. xxx
797
 */
798
static void
799
p_b_eclass(p, cs)
800
struct parse *p;
801
cset *cs;
802
{
803
	char c;
804

805
	c = p_b_coll_elem(p, '=');
806
	CHadd(cs, c);
807
}
808

809
/*
810
 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
811
 */
812
static char			/* value of symbol */
813
p_b_symbol(p)
814
struct parse *p;
815
{
816
	char value;
817

818
	REQUIRE(MORE(), REG_EBRACK);
819
	if (!EATTWO('[', '.'))
820
		return(GETNEXT());
821

822
	/* collating symbol */
823
	value = p_b_coll_elem(p, '.');
824
	REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
825
	return(value);
826
}
827

828
/* character-name table */
829
static struct cname {
830
	char *name;
831
	char code;
832
} cnames[] = {
833
	{"NUL",	'\0'},
834
	{"SOH",	'\001'},
835
	{"STX",	'\002'},
836
	{"ETX",	'\003'},
837
	{"EOT",	'\004'},
838
	{"ENQ",	'\005'},
839
	{"ACK",	'\006'},
840
	{"BEL",	'\007'},
841
	{"alert",	'\007'},
842
	{"BS",		'\010'},
843
	{"backspace",	'\b'},
844
	{"HT",		'\011'},
845
	{"tab",		'\t'},
846
	{"LF",		'\012'},
847
	{"newline",	'\n'},
848
	{"VT",		'\013'},
849
	{"vertical-tab",	'\v'},
850
	{"FF",		'\014'},
851
	{"form-feed",	'\f'},
852
	{"CR",		'\015'},
853
	{"carriage-return",	'\r'},
854
	{"SO",	'\016'},
855
	{"SI",	'\017'},
856
	{"DLE",	'\020'},
857
	{"DC1",	'\021'},
858
	{"DC2",	'\022'},
859
	{"DC3",	'\023'},
860
	{"DC4",	'\024'},
861
	{"NAK",	'\025'},
862
	{"SYN",	'\026'},
863
	{"ETB",	'\027'},
864
	{"CAN",	'\030'},
865
	{"EM",	'\031'},
866
	{"SUB",	'\032'},
867
	{"ESC",	'\033'},
868
	{"IS4",	'\034'},
869
	{"FS",	'\034'},
870
	{"IS3",	'\035'},
871
	{"GS",	'\035'},
872
	{"IS2",	'\036'},
873
	{"RS",	'\036'},
874
	{"IS1",	'\037'},
875
	{"US",	'\037'},
876
	{"space",		' '},
877
	{"exclamation-mark",	'!'},
878
	{"quotation-mark",	'"'},
879
	{"number-sign",		'#'},
880
	{"dollar-sign",		'$'},
881
	{"percent-sign",		'%'},
882
	{"ampersand",		'&'},
883
	{"apostrophe",		'\''},
884
	{"left-parenthesis",	'('},
885
	{"right-parenthesis",	')'},
886
	{"asterisk",	'*'},
887
	{"plus-sign",	'+'},
888
	{"comma",	','},
889
	{"hyphen",	'-'},
890
	{"hyphen-minus",	'-'},
891
	{"period",	'.'},
892
	{"full-stop",	'.'},
893
	{"slash",	'/'},
894
	{"solidus",	'/'},
895
	{"zero",		'0'},
896
	{"one",		'1'},
897
	{"two",		'2'},
898
	{"three",	'3'},
899
	{"four",		'4'},
900
	{"five",		'5'},
901
	{"six",		'6'},
902
	{"seven",	'7'},
903
	{"eight",	'8'},
904
	{"nine",		'9'},
905
	{"colon",	':'},
906
	{"semicolon",	';'},
907
	{"less-than-sign",	'<'},
908
	{"equals-sign",		'='},
909
	{"greater-than-sign",	'>'},
910
	{"question-mark",	'?'},
911
	{"commercial-at",	'@'},
912
	{"left-square-bracket",	'['},
913
	{"backslash",		'\\'},
914
	{"reverse-solidus",	'\\'},
915
	{"right-square-bracket",	']'},
916
	{"circumflex",		'^'},
917
	{"circumflex-accent",	'^'},
918
	{"underscore",		'_'},
919
	{"low-line",		'_'},
920
	{"grave-accent",		'`'},
921
	{"left-brace",		'{'},
922
	{"left-curly-bracket",	'{'},
923
	{"vertical-line",	'|'},
924
	{"right-brace",		'}'},
925
	{"right-curly-bracket",	'}'},
926
	{"tilde",		'~'},
927
	{"DEL",	'\177'},
928
	{NULL,	0},
929
};
930

931
/*
932
 - p_b_coll_elem - parse a collating-element name and look it up
933
 */
934
static char			/* value of collating element */
935
p_b_coll_elem(p, endc)
936
struct parse *p;
937
int endc;			/* name ended by endc,']' */
938
{
939
	char *sp = p->next;
940
	struct cname *cp;
941
	size_t len;
942

943
	while (MORE() && !SEETWO(endc, ']'))
944
		NEXT();
945
	if (!MORE()) {
946
		seterr(p, REG_EBRACK);
947
		return(0);
948
	}
949
	len = (size_t)(p->next - sp);
950
	for (cp = cnames; cp->name != NULL; cp++)
951
		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
952
			return(cp->code);	/* known name */
953
	if (len == 1)
954
		return(*sp);	/* single character */
955
	seterr(p, REG_ECOLLATE);			/* neither */
956
	return(0);
957
}
958

959
/*
960
 - othercase - return the case counterpart of an alphabetic
961
 */
962
static char			/* if no counterpart, return ch */
963
othercase(ch)
964
int ch;
965
{
966
	assert(isalpha(ch));
967
	if (isupper(ch))
968
		return((char)tolower(ch));
969
	else if (islower(ch))
970
		return((char)toupper(ch));
971
	else			/* peculiar, but could happen */
972
		return((char)ch);
973
}
974

975
/*
976
 - bothcases - emit a dualcase version of a two-case character
977
 *
978
 * Boy, is this implementation ever a kludge...
979
 */
980
static void
981
bothcases(p, ch)
982
struct parse *p;
983
int ch;
984
{
985
	char *oldnext = p->next;
986
	char *oldend = p->end;
987
	char bracket[3];
988

989
	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
990
	p->next = bracket;
991
	p->end = bracket+2;
992
	bracket[0] = (char)ch;
993
	bracket[1] = ']';
994
	bracket[2] = '\0';
995
	p_bracket(p);
996
	assert(p->next == bracket+2);
997
	p->next = oldnext;
998
	p->end = oldend;
999
}
1000

1001
/*
1002
 - ordinary - emit an ordinary character
1003
 */
1004
static void
1005
ordinary(p, ch)
1006
struct parse *p;
1007
int ch;
1008
{
1009
	cat_t *cap = p->g->categories;
1010

1011
	if ((p->g->cflags&REG_ICASE) && isalpha(ch) && othercase(ch) != ch)
1012
		bothcases(p, ch);
1013
	else {
1014
		EMIT(OCHAR, (sopno)ch);
1015
		if (cap[ch] == 0)
1016
			cap[ch] = (cat_t)(p->g->ncategories++);
1017
	}
1018
}
1019

1020
/*
1021
 - nonnewline - emit REG_NEWLINE version of OANY
1022
 *
1023
 * Boy, is this implementation ever a kludge...
1024
 */
1025
static void
1026
nonnewline(p)
1027
struct parse *p;
1028
{
1029
	char *oldnext = p->next;
1030
	char *oldend = p->end;
1031
	char bracket[4];
1032

1033
	p->next = bracket;
1034
	p->end = bracket+3;
1035
	bracket[0] = '^';
1036
	bracket[1] = '\n';
1037
	bracket[2] = ']';
1038
	bracket[3] = '\0';
1039
	p_bracket(p);
1040
	assert(p->next == bracket+3);
1041
	p->next = oldnext;
1042
	p->end = oldend;
1043
}
1044

1045
/*
1046
 - repeat - generate code for a bounded repetition, recursively if needed
1047
 */
1048
static void
1049
repeat(p, start, from, to)
1050
struct parse *p;
1051
sopno start;			/* operand from here to end of strip */
1052
int from;			/* repeated from this number */
1053
int to;				/* to this number of times (maybe INFINITY) */
1054
{
1055
	sopno finish = HERE();
1056
#	define	N	2
1057
#	define	INF	3
1058
#	define	REP(f, t)	((f)*8 + (t))
1059
#	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1060
	sopno copy;
1061

1062
	if (p->error != 0)	/* head off possible runaway recursion */
1063
		return;
1064

1065
	assert(from <= to);
1066

1067
	switch (REP(MAP(from), MAP(to))) {
1068
	case REP(0, 0):			/* must be user doing this */
1069
		DROP(finish-start);	/* drop the operand */
1070
		break;
1071
	case REP(0, 1):			/* as x{1,1}? */
1072
	case REP(0, N):			/* as x{1,n}? */
1073
	case REP(0, INF):		/* as x{1,}? */
1074
		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1075
		INSERT(OCH_, start);		/* offset is wrong... */
1076
		repeat(p, start+1, 1, to);
1077
		ASTERN(OOR1, start);
1078
		AHEAD(start);			/* ... fix it */
1079
		EMIT(OOR2, 0);
1080
		AHEAD(THERE());
1081
		ASTERN(O_CH, THERETHERE());
1082
		break;
1083
	case REP(1, 1):			/* trivial case */
1084
		/* done */
1085
		break;
1086
	case REP(1, N):			/* as x?x{1,n-1} */
1087
		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1088
		INSERT(OCH_, start);
1089
		ASTERN(OOR1, start);
1090
		AHEAD(start);
1091
		EMIT(OOR2, 0);			/* offset very wrong... */
1092
		AHEAD(THERE());			/* ...so fix it */
1093
		ASTERN(O_CH, THERETHERE());
1094
		copy = dupl(p, start+1, finish+1);
1095
		assert(copy == finish+4);
1096
		repeat(p, copy, 1, to-1);
1097
		break;
1098
	case REP(1, INF):		/* as x+ */
1099
		INSERT(OPLUS_, start);
1100
		ASTERN(O_PLUS, start);
1101
		break;
1102
	case REP(N, N):			/* as xx{m-1,n-1} */
1103
		copy = dupl(p, start, finish);
1104
		repeat(p, copy, from-1, to-1);
1105
		break;
1106
	case REP(N, INF):		/* as xx{n-1,INF} */
1107
		copy = dupl(p, start, finish);
1108
		repeat(p, copy, from-1, to);
1109
		break;
1110
	default:			/* "can't happen" */
1111
		seterr(p, REG_ASSERT);	/* just in case */
1112
		break;
1113
	}
1114
}
1115

1116
/*
1117
 - seterr - set an error condition
1118
 */
1119
static void
1120
seterr(p, e)
1121
struct parse *p;
1122
int e;
1123
{
1124
	if (p->error == 0)	/* keep earliest error condition */
1125
		p->error = e;
1126
	p->next = nuls;		/* try to bring things to a halt */
1127
	p->end = nuls;
1128
}
1129

1130
/*
1131
 - allocset - allocate a set of characters for []
1132
 */
1133
static cset *
1134
allocset(p)
1135
struct parse *p;
1136
{
1137
	int no = p->g->ncsets++;
1138
	int nc;
1139
	int nbytes;
1140
	cset *cs;
1141
	int css = p->g->csetsize;
1142
	int i;
1143

1144
	if (no >= p->ncsalloc) {	/* need another column of space */
1145
		p->ncsalloc += CHAR_BIT;
1146
		nc = p->ncsalloc;
1147
		assert(nc % CHAR_BIT == 0);
1148
		nbytes = nc / CHAR_BIT * css;
1149
		if (p->g->sets == NULL)
1150
			p->g->sets = (cset *)malloc((size_t)nc * sizeof(cset));
1151
		else
1152
			p->g->sets = (cset *)realloc((char *)p->g->sets,
1153
							(size_t)nc * sizeof(cset));
1154
		if (p->g->setbits == NULL)
1155
			p->g->setbits = (uch *)malloc((size_t)nbytes);
1156
		else {
1157
			p->g->setbits = (uch *)realloc((char *)p->g->setbits,
1158
							(size_t)nbytes);
1159
			/* xxx this isn't right if setbits is now NULL */
1160
			for (i = 0; i < no; i++)
1161
				p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1162
		}
1163
		if (p->g->sets != NULL && p->g->setbits != NULL)
1164
			(void) memset((char *)p->g->setbits + (nbytes - css),
1165
							0, (size_t)css);
1166
		else {
1167
			no = 0;
1168
			seterr(p, REG_ESPACE);
1169
			/* caller's responsibility not to do set ops */
1170
		}
1171
	}
1172

1173
	assert(p->g->sets != NULL);	/* xxx */
1174
	cs = &p->g->sets[no];
1175
	cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1176
	cs->mask = (uch) (1 << ((no) % CHAR_BIT));
1177
	cs->hash = 0;
1178
	cs->smultis = 0;
1179
	cs->multis = NULL;
1180

1181
	return(cs);
1182
}
1183

1184
/*
1185
 - freeset - free a now-unused set
1186
 */
1187
static void
1188
freeset(p, cs)
1189
struct parse *p;
1190
cset *cs;
1191
{
1192
	size_t i;
1193
	cset *top = &p->g->sets[p->g->ncsets];
1194
	size_t css = (size_t)p->g->csetsize;
1195

1196
	for (i = 0; i < css; i++)
1197
		CHsub(cs, i);
1198
	if (cs == top-1)	/* recover only the easy case */
1199
		p->g->ncsets--;
1200
}
1201

1202
/*
1203
 - freezeset - final processing on a set of characters
1204
 *
1205
 * The main task here is merging identical sets.  This is usually a waste
1206
 * of time (although the hash code minimizes the overhead), but can win
1207
 * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
1208
 * is done using addition rather than xor -- all ASCII [aA] sets xor to
1209
 * the same value!
1210
 */
1211
static int			/* set number */
1212
freezeset(p, cs)
1213
struct parse *p;
1214
cset *cs;
1215
{
1216
	uch h = cs->hash;
1217
	size_t i;
1218
	cset *top = &p->g->sets[p->g->ncsets];
1219
	cset *cs2;
1220
	size_t css = (size_t)p->g->csetsize;
1221

1222
	/* look for an earlier one which is the same */
1223
	for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1224
		if (cs2->hash == h && cs2 != cs) {
1225
			/* maybe */
1226
			for (i = 0; i < css; i++)
1227
				if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1228
					break;		/* no */
1229
			if (i == css)
1230
				break;			/* yes */
1231
		}
1232

1233
	if (cs2 < top) {	/* found one */
1234
		freeset(p, cs);
1235
		cs = cs2;
1236
	}
1237

1238
	return((int)(cs - p->g->sets));
1239
}
1240

1241
/*
1242
 - firstch - return first character in a set (which must have at least one)
1243
 */
1244
static int			/* character; there is no "none" value */
1245
firstch(p, cs)
1246
struct parse *p;
1247
cset *cs;
1248
{
1249
	size_t i;
1250
	size_t css = (size_t)p->g->csetsize;
1251

1252
	for (i = 0; i < css; i++)
1253
		if (CHIN(cs, i))
1254
			return((char)i);
1255
	assert(never);
1256
	return(0);		/* arbitrary */
1257
}
1258

1259
/*
1260
 - nch - number of characters in a set
1261
 */
1262
static int
1263
nch(p, cs)
1264
struct parse *p;
1265
cset *cs;
1266
{
1267
	size_t i;
1268
	size_t css = (size_t)p->g->csetsize;
1269
	int n = 0;
1270

1271
	for (i = 0; i < css; i++)
1272
		if (CHIN(cs, i))
1273
			n++;
1274
	return(n);
1275
}
1276

1277
/*
1278
 - mcadd - add a collating element to a cset
1279
 */
1280
static void
1281
mcadd(p, cs, cp)
1282
struct parse *p;
1283
cset *cs;
1284
char *cp;
1285
{
1286
	size_t oldend = cs->smultis;
1287

1288
	cs->smultis += strlen(cp) + 1;
1289
	if (cs->multis == NULL)
1290
		cs->multis = malloc(cs->smultis);
1291
	else
1292
		cs->multis = realloc(cs->multis, cs->smultis);
1293
	if (cs->multis == NULL) {
1294
		seterr(p, REG_ESPACE);
1295
		return;
1296
	}
1297

1298
	(void) strcpy(cs->multis + oldend - 1, cp);
1299
	cs->multis[cs->smultis - 1] = '\0';
1300
}
1301

1302
#if 0
1303
/*
1304
 - mcsub - subtract a collating element from a cset
1305
 */
1306
static void
1307
mcsub(cs, cp)
1308
cset *cs;
1309
char *cp;
1310
{
1311
	char *fp = mcfind(cs, cp);
1312
	size_t len = strlen(fp);
1313

1314
	assert(fp != NULL);
1315
	(void) memmove(fp, fp + len + 1,
1316
				cs->smultis - (fp + len + 1 - cs->multis));
1317
	cs->smultis -= len;
1318

1319
	if (cs->smultis == 0) {
1320
		free(cs->multis);
1321
		cs->multis = NULL;
1322
		return;
1323
	}
1324

1325
	cs->multis = realloc(cs->multis, cs->smultis);
1326
	assert(cs->multis != NULL);
1327
}
1328

1329
/*
1330
 - mcin - is a collating element in a cset?
1331
 */
1332
static int
1333
mcin(cs, cp)
1334
cset *cs;
1335
char *cp;
1336
{
1337
	return(mcfind(cs, cp) != NULL);
1338
}
1339

1340
/*
1341
 - mcfind - find a collating element in a cset
1342
 */
1343
static char *
1344
mcfind(cs, cp)
1345
cset *cs;
1346
char *cp;
1347
{
1348
	char *p;
1349

1350
	if (cs->multis == NULL)
1351
		return(NULL);
1352
	for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
1353
		if (strcmp(cp, p) == 0)
1354
			return(p);
1355
	return(NULL);
1356
}
1357

1358
/*
1359
 - mcinvert - invert the list of collating elements in a cset
1360
 *
1361
 * This would have to know the set of possibilities.  Implementation
1362
 * is deferred.
1363
 */
1364
static void
1365
mcinvert(p, cs)
1366
struct parse *p;
1367
cset *cs;
1368
{
1369
	assert(cs->multis == NULL);	/* xxx */
1370
}
1371

1372
/*
1373
 - mccase - add case counterparts of the list of collating elements in a cset
1374
 *
1375
 * This would have to know the set of possibilities.  Implementation
1376
 * is deferred.
1377
 */
1378
static void
1379
mccase(p, cs)
1380
struct parse *p;
1381
cset *cs;
1382
{
1383
	assert(cs->multis == NULL);	/* xxx */
1384
}
1385
#endif
1386

1387
/*
1388
 - isinsets - is this character in any sets?
1389
 */
1390
static int			/* predicate */
1391
isinsets(g, c)
1392
struct re_guts *g;
1393
int c;
1394
{
1395
	uch *col;
1396
	int i;
1397
	int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1398
	unsigned uc = (unsigned char)c;
1399

1400
	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1401
		if (col[uc] != 0)
1402
			return(1);
1403
	return(0);
1404
}
1405

1406
/*
1407
 - samesets - are these two characters in exactly the same sets?
1408
 */
1409
static int			/* predicate */
1410
samesets(g, c1, c2)
1411
struct re_guts *g;
1412
int c1;
1413
int c2;
1414
{
1415
	uch *col;
1416
	int i;
1417
	int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1418
	unsigned uc1 = (unsigned char)c1;
1419
	unsigned uc2 = (unsigned char)c2;
1420

1421
	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1422
		if (col[uc1] != col[uc2])
1423
			return(0);
1424
	return(1);
1425
}
1426

1427
/*
1428
 - categorize - sort out character categories
1429
 */
1430
static void
1431
categorize(p, g)
1432
struct parse *p;
1433
struct re_guts *g;
1434
{
1435
	cat_t *cats = g->categories;
1436
	int c;
1437
	int c2;
1438
	cat_t cat;
1439

1440
	/* avoid making error situations worse */
1441
	if (p->error != 0)
1442
		return;
1443

1444
	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1445
		if (cats[c] == 0 && isinsets(g, c)) {
1446
			cat = (cat_t)g->ncategories++;
1447
			cats[c] = cat;
1448
			for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1449
				if (cats[c2] == 0 && samesets(g, c, c2))
1450
					cats[c2] = cat;
1451
		}
1452
}
1453

1454
/*
1455
 - dupl - emit a duplicate of a bunch of sops
1456
 */
1457
static sopno			/* start of duplicate */
1458
dupl(p, start, finish)
1459
struct parse *p;
1460
sopno start;			/* from here */
1461
sopno finish;			/* to this less one */
1462
{
1463
	sopno ret = HERE();
1464
	sopno len = finish - start;
1465

1466
	assert(finish >= start);
1467
	if (len == 0)
1468
		return(ret);
1469
	enlarge(p, p->ssize + len);	/* this many unexpected additions */
1470
	assert(p->ssize >= p->slen + len);
1471
	(void) memcpy((char *)(p->strip + p->slen),
1472
		(char *)(p->strip + start), (size_t)len*sizeof(sop));
1473
	p->slen += len;
1474
	return(ret);
1475
}
1476

1477
/*
1478
 - doemit - emit a strip operator
1479
 *
1480
 * It might seem better to implement this as a macro with a function as
1481
 * hard-case backup, but it's just too big and messy unless there are
1482
 * some changes to the data structures.  Maybe later.
1483
 */
1484
static void
1485
doemit(p, op, opnd)
1486
struct parse *p;
1487
sop op;
1488
sop opnd;
1489
{
1490
	/* avoid making error situations worse */
1491
	if (p->error != 0)
1492
		return;
1493

1494
	/* deal with oversize operands ("can't happen", more or less) */
1495
	assert(opnd < 1<<OPSHIFT);
1496

1497
	/* deal with undersized strip */
1498
	if (p->slen >= p->ssize)
1499
		enlarge(p, (p->ssize+1) / 2 * 3);	/* +50% */
1500
	assert(p->slen < p->ssize);
1501

1502
	/* finally, it's all reduced to the easy case */
1503
	p->strip[p->slen++] = SOP(op, opnd);
1504
}
1505

1506
/*
1507
 - doinsert - insert a sop into the strip
1508
 */
1509
static void
1510
doinsert(p, op, opnd, pos)
1511
struct parse *p;
1512
sop op;
1513
sopno opnd;
1514
sopno pos;
1515
{
1516
	sopno sn;
1517
	sop s;
1518
	int i;
1519

1520
	/* avoid making error situations worse */
1521
	if (p->error != 0)
1522
		return;
1523

1524
	sn = HERE();
1525
	EMIT(op, opnd);		/* do checks, ensure space */
1526
	assert(HERE() == sn+1);
1527
	s = p->strip[sn];
1528

1529
	/* adjust paren pointers */
1530
	assert(pos > 0);
1531
	for (i = 1; i < NPAREN; i++) {
1532
		if (p->pbegin[i] >= pos) {
1533
			p->pbegin[i]++;
1534
		}
1535
		if (p->pend[i] >= pos) {
1536
			p->pend[i]++;
1537
		}
1538
	}
1539

1540
	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1541
				(size_t)(HERE()-pos-1)*sizeof(sop));
1542
	p->strip[pos] = s;
1543
}
1544

1545
/*
1546
 - dofwd - complete a forward reference
1547
 */
1548
static void
1549
dofwd(p, pos, value)
1550
struct parse *p;
1551
sopno pos;
1552
sop value;
1553
{
1554
	/* avoid making error situations worse */
1555
	if (p->error != 0)
1556
		return;
1557

1558
	assert(value < 1<<OPSHIFT);
1559
	p->strip[pos] = OP(p->strip[pos]) | value;
1560
}
1561

1562
/*
1563
 - enlarge - enlarge the strip
1564
 */
1565
static void
1566
enlarge(p, size)
1567
struct parse *p;
1568
sopno size;
1569
{
1570
	sop *sp;
1571

1572
	if (p->ssize >= size)
1573
		return;
1574

1575
	sp = (sop *)realloc(p->strip, (size_t)size*sizeof(sop));
1576
	if (sp == NULL) {
1577
		seterr(p, REG_ESPACE);
1578
		return;
1579
	}
1580
	p->strip = sp;
1581
	p->ssize = size;
1582
}
1583

1584
/*
1585
 - stripsnug - compact the strip
1586
 */
1587
static void
1588
stripsnug(p, g)
1589
struct parse *p;
1590
struct re_guts *g;
1591
{
1592
	g->nstates = p->slen;
1593
	g->strip = (sop *)realloc((char *)p->strip, (size_t)p->slen * sizeof(sop));
1594
	if (g->strip == NULL) {
1595
		seterr(p, REG_ESPACE);
1596
		g->strip = p->strip;
1597
	}
1598
}
1599

1600
/*
1601
 - findmust - fill in must and mlen with longest mandatory literal string
1602
 *
1603
 * This algorithm could do fancy things like analyzing the operands of |
1604
 * for common subsequences.  Someday.  This code is simple and finds most
1605
 * of the interesting cases.
1606
 *
1607
 * Note that must and mlen got initialized during setup.
1608
 */
1609
static void
1610
findmust(p, g)
1611
struct parse *p;
1612
struct re_guts *g;
1613
{
1614
	sop *scan;
1615
	sop *start;
1616
	sop *newstart;
1617
	int newlen;
1618
	sop s;
1619
	char *cp;
1620
	sopno i;
1621

1622
	/* avoid making error situations worse */
1623
	if (p->error != 0)
1624
		return;
1625

1626
	/* find the longest OCHAR sequence in strip */
1627
	newlen = 0;
1628
	scan = g->strip + 1;
1629
	do {
1630
		s = *scan++;
1631
		switch (OP(s)) {
1632
		case OCHAR:		/* sequence member */
1633
			if (newlen == 0)		/* new sequence */
1634
				newstart = scan - 1;
1635
			newlen++;
1636
			break;
1637
		case OPLUS_:		/* things that don't break one */
1638
		case OLPAREN:
1639
		case ORPAREN:
1640
			break;
1641
		case OQUEST_:		/* things that must be skipped */
1642
		case OCH_:
1643
			scan--;
1644
			do {
1645
				scan += OPND(s);
1646
				s = *scan;
1647
				/* assert() interferes w debug printouts */
1648
				if (OP(s) != O_QUEST && OP(s) != O_CH &&
1649
							OP(s) != OOR2) {
1650
					g->iflags |= BAD;
1651
					return;
1652
				}
1653
			} while (OP(s) != O_QUEST && OP(s) != O_CH);
1654
			/* fallthrough */
1655
		default:		/* things that break a sequence */
1656
			if (newlen > g->mlen) {		/* ends one */
1657
				start = newstart;
1658
				g->mlen = newlen;
1659
			}
1660
			newlen = 0;
1661
			break;
1662
		}
1663
	} while (OP(s) != OEND);
1664

1665
	if (g->mlen == 0)		/* there isn't one */
1666
		return;
1667

1668
	/* turn it into a character string */
1669
	g->must = malloc((size_t)g->mlen + 1);
1670
	if (g->must == NULL) {		/* argh; just forget it */
1671
		g->mlen = 0;
1672
		return;
1673
	}
1674
	cp = g->must;
1675
	scan = start;
1676
	for (i = g->mlen; i > 0; i--) {
1677
		while (OP(s = *scan++) != OCHAR)
1678
			continue;
1679
		assert(cp < g->must + g->mlen);
1680
		*cp++ = (char)OPND(s);
1681
	}
1682
	assert(cp == g->must + g->mlen);
1683
	*cp++ = '\0';		/* just on general principles */
1684
}
1685

1686
/*
1687
 - pluscount - count + nesting
1688
 */
1689
static sopno			/* nesting depth */
1690
pluscount(p, g)
1691
struct parse *p;
1692
struct re_guts *g;
1693
{
1694
	sop *scan;
1695
	sop s;
1696
	sopno plusnest = 0;
1697
	sopno maxnest = 0;
1698

1699
	if (p->error != 0)
1700
		return(0);	/* there may not be an OEND */
1701

1702
	scan = g->strip + 1;
1703
	do {
1704
		s = *scan++;
1705
		switch (OP(s)) {
1706
		case OPLUS_:
1707
			plusnest++;
1708
			break;
1709
		case O_PLUS:
1710
			if (plusnest > maxnest)
1711
				maxnest = plusnest;
1712
			plusnest--;
1713
			break;
1714
		}
1715
	} while (OP(s) != OEND);
1716
	if (plusnest != 0)
1717
		g->iflags |= BAD;
1718
	return(maxnest);
1719
}
1720

1721
Product

Resources

Company