Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
att
GitHub Repository: att/ast
Path: blob/master/src/lib/libtksh/tcl/regexp.c
1810 views
1
/*
2
* TclRegComp and TclRegExec -- TclRegSub is elsewhere
3
*
4
* Copyright (c) 1986 by University of Toronto.
5
* Written by Henry Spencer. Not derived from licensed software.
6
*
7
* Permission is granted to anyone to use this software for any
8
* purpose on any computer system, and to redistribute it freely,
9
* subject to the following restrictions:
10
*
11
* 1. The author is not responsible for the consequences of use of
12
* this software, no matter how awful, even if they arise
13
* from defects in it.
14
*
15
* 2. The origin of this software must not be misrepresented, either
16
* by explicit claim or by omission.
17
*
18
* 3. Altered versions must be plainly marked as such, and must not
19
* be misrepresented as being the original software.
20
*
21
* Beware that some of this code is subtly aware of the way operator
22
* precedence is structured in regular expressions. Serious changes in
23
* regular-expression syntax might require a total rethink.
24
*
25
* *** NOTE: this code has been altered slightly for use in Tcl: ***
26
* *** 1. Use ckalloc and ckfree instead of malloc and free. ***
27
* *** 2. Add extra argument to regexp to specify the real ***
28
* *** start of the string separately from the start of the ***
29
* *** current search. This is needed to search for multiple ***
30
* *** matches within a string. ***
31
* *** 3. Names have been changed, e.g. from regcomp to ***
32
* *** TclRegComp, to avoid clashes with other ***
33
* *** regexp implementations used by applications. ***
34
* *** 4. Added errMsg declaration and TclRegError procedure ***
35
* *** 5. Various lint-like things, such as casting arguments ***
36
* *** in procedure calls. ***
37
*
38
* *** NOTE: This code has been altered for use in MT-Sturdy Tcl ***
39
* *** 1. All use of static variables has been changed to access ***
40
* *** fields of a structure. ***
41
* *** 2. This in addition to changes to TclRegError makes the ***
42
* *** code multi-thread safe. ***
43
*
44
* SCCS: @(#) regexp.c 1.12 96/04/02 13:54:57
45
*/
46
47
#include "tclInt.h"
48
#include "tclPort.h"
49
50
/*
51
* The variable below is set to NULL before invoking regexp functions
52
* and checked after those functions. If an error occurred then TclRegError
53
* will set the variable to point to a (static) error message. This
54
* mechanism unfortunately does not support multi-threading, but the
55
* procedures TclRegError and TclGetRegError can be modified to use
56
* thread-specific storage for the variable and thereby make the code
57
* thread-safe.
58
*/
59
60
static char *errMsg = NULL;
61
62
/*
63
* The "internal use only" fields in regexp.h are present to pass info from
64
* compile to execute that permits the execute phase to run lots faster on
65
* simple cases. They are:
66
*
67
* regstart char that must begin a match; '\0' if none obvious
68
* reganch is the match anchored (at beginning-of-line only)?
69
* regmust string (pointer into program) that match must include, or NULL
70
* regmlen length of regmust string
71
*
72
* Regstart and reganch permit very fast decisions on suitable starting points
73
* for a match, cutting down the work a lot. Regmust permits fast rejection
74
* of lines that cannot possibly match. The regmust tests are costly enough
75
* that TclRegComp() supplies a regmust only if the r.e. contains something
76
* potentially expensive (at present, the only such thing detected is * or +
77
* at the start of the r.e., which can involve a lot of backup). Regmlen is
78
* supplied because the test in TclRegExec() needs it and TclRegComp() is
79
* computing it anyway.
80
*/
81
82
/*
83
* Structure for regexp "program". This is essentially a linear encoding
84
* of a nondeterministic finite-state machine (aka syntax charts or
85
* "railroad normal form" in parsing technology). Each node is an opcode
86
* plus a "next" pointer, possibly plus an operand. "Next" pointers of
87
* all nodes except BRANCH implement concatenation; a "next" pointer with
88
* a BRANCH on both ends of it is connecting two alternatives. (Here we
89
* have one of the subtle syntax dependencies: an individual BRANCH (as
90
* opposed to a collection of them) is never concatenated with anything
91
* because of operator precedence.) The operand of some types of node is
92
* a literal string; for others, it is a node leading into a sub-FSM. In
93
* particular, the operand of a BRANCH node is the first node of the branch.
94
* (NB this is *not* a tree structure: the tail of the branch connects
95
* to the thing following the set of BRANCHes.) The opcodes are:
96
*/
97
98
/* definition number opnd? meaning */
99
#define END 0 /* no End of program. */
100
#define BOL 1 /* no Match "" at beginning of line. */
101
#define EOL 2 /* no Match "" at end of line. */
102
#define ANY 3 /* no Match any one character. */
103
#define ANYOF 4 /* str Match any character in this string. */
104
#define ANYBUT 5 /* str Match any character not in this string. */
105
#define BRANCH 6 /* node Match this alternative, or the next... */
106
#define BACK 7 /* no Match "", "next" ptr points backward. */
107
#define EXACTLY 8 /* str Match this string. */
108
#define NOTHING 9 /* no Match empty string. */
109
#define STAR 10 /* node Match this (simple) thing 0 or more times. */
110
#define PLUS 11 /* node Match this (simple) thing 1 or more times. */
111
#define OPEN 20 /* no Mark this point in input as start of #n. */
112
/* OPEN+1 is number 1, etc. */
113
#define CLOSE (OPEN+NSUBEXP) /* no Analogous to OPEN. */
114
115
/*
116
* Opcode notes:
117
*
118
* BRANCH The set of branches constituting a single choice are hooked
119
* together with their "next" pointers, since precedence prevents
120
* anything being concatenated to any individual branch. The
121
* "next" pointer of the last BRANCH in a choice points to the
122
* thing following the whole choice. This is also where the
123
* final "next" pointer of each individual branch points; each
124
* branch starts with the operand node of a BRANCH node.
125
*
126
* BACK Normal "next" pointers all implicitly point forward; BACK
127
* exists to make loop structures possible.
128
*
129
* STAR,PLUS '?', and complex '*' and '+', are implemented as circular
130
* BRANCH structures using BACK. Simple cases (one character
131
* per match) are implemented with STAR and PLUS for speed
132
* and to minimize recursive plunges.
133
*
134
* OPEN,CLOSE ...are numbered at compile time.
135
*/
136
137
/*
138
* A node is one char of opcode followed by two chars of "next" pointer.
139
* "Next" pointers are stored as two 8-bit pieces, high order first. The
140
* value is a positive offset from the opcode of the node containing it.
141
* An operand, if any, simply follows the node. (Note that much of the
142
* code generation knows about this implicit relationship.)
143
*
144
* Using two bytes for the "next" pointer is vast overkill for most things,
145
* but allows patterns to get big without disasters.
146
*/
147
#define OP(p) (*(p))
148
#define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
149
#define OPERAND(p) ((p) + 3)
150
151
/*
152
* See regmagic.h for one further detail of program structure.
153
*/
154
155
156
/*
157
* Utility definitions.
158
*/
159
#ifndef CHARBITS
160
#define UCHARAT(p) ((int)*(unsigned char *)(p))
161
#else
162
#define UCHARAT(p) ((int)*(p)&CHARBITS)
163
#endif
164
165
#define FAIL(m) { TclRegError(m); return(NULL); }
166
#define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
167
#define META "^$.[()|?+*\\"
168
169
/*
170
* Flags to be passed up and down.
171
*/
172
#define HASWIDTH 01 /* Known never to match null string. */
173
#define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
174
#define SPSTART 04 /* Starts with * or +. */
175
#define WORST 0 /* Worst case. */
176
177
/*
178
* Global work variables for TclRegComp().
179
*/
180
struct regcomp_state {
181
char *regparse; /* Input-scan pointer. */
182
int regnpar; /* () count. */
183
char *regcode; /* Code-emit pointer; &regdummy = don't. */
184
long regsize; /* Code size. */
185
};
186
187
static char regdummy;
188
189
/*
190
* The first byte of the regexp internal "program" is actually this magic
191
* number; the start node begins in the second byte.
192
*/
193
#define MAGIC 0234
194
195
196
/*
197
* Forward declarations for TclRegComp()'s friends.
198
*/
199
200
static char * reg _ANSI_ARGS_((int paren, int *flagp,
201
struct regcomp_state *rcstate));
202
static char * regatom _ANSI_ARGS_((int *flagp,
203
struct regcomp_state *rcstate));
204
static char * regbranch _ANSI_ARGS_((int *flagp,
205
struct regcomp_state *rcstate));
206
static void regc _ANSI_ARGS_((int b,
207
struct regcomp_state *rcstate));
208
static void reginsert _ANSI_ARGS_((int op, char *opnd,
209
struct regcomp_state *rcstate));
210
static char * regnext _ANSI_ARGS_((char *p));
211
static char * regnode _ANSI_ARGS_((int op,
212
struct regcomp_state *rcstate));
213
static void regoptail _ANSI_ARGS_((char *p, char *val));
214
static char * regpiece _ANSI_ARGS_((int *flagp,
215
struct regcomp_state *rcstate));
216
static void regtail _ANSI_ARGS_((char *p, char *val));
217
218
#ifdef STRCSPN
219
static int strcspn _ANSI_ARGS_((char *s1, char *s2));
220
#endif
221
222
/*
223
- TclRegComp - compile a regular expression into internal code
224
*
225
* We can't allocate space until we know how big the compiled form will be,
226
* but we can't compile it (and thus know how big it is) until we've got a
227
* place to put the code. So we cheat: we compile it twice, once with code
228
* generation turned off and size counting turned on, and once "for real".
229
* This also means that we don't allocate space until we are sure that the
230
* thing really will compile successfully, and we never have to move the
231
* code and thus invalidate pointers into it. (Note that it has to be in
232
* one piece because free() must be able to free it all.)
233
*
234
* Beware that the optimization-preparation code in here knows about some
235
* of the structure of the compiled regexp.
236
*/
237
regexp *
238
TclRegComp(exp)
239
char *exp;
240
{
241
register regexp *r;
242
register char *scan;
243
register char *longest;
244
register int len;
245
int flags;
246
struct regcomp_state state;
247
struct regcomp_state *rcstate= &state;
248
249
if (exp == NULL)
250
FAIL("NULL argument");
251
252
/* First pass: determine size, legality. */
253
rcstate->regparse = exp;
254
rcstate->regnpar = 1;
255
rcstate->regsize = 0L;
256
rcstate->regcode = &regdummy;
257
regc(MAGIC, rcstate);
258
if (reg(0, &flags, rcstate) == NULL)
259
return(NULL);
260
261
/* Small enough for pointer-storage convention? */
262
if (rcstate->regsize >= 32767L) /* Probably could be 65535L. */
263
FAIL("regexp too big");
264
265
/* Allocate space. */
266
r = (regexp *)ckalloc(sizeof(regexp) + (unsigned)rcstate->regsize);
267
if (r == NULL)
268
FAIL("out of space");
269
270
/* Second pass: emit code. */
271
rcstate->regparse = exp;
272
rcstate->regnpar = 1;
273
rcstate->regcode = r->program;
274
regc(MAGIC, rcstate);
275
if (reg(0, &flags, rcstate) == NULL)
276
return(NULL);
277
278
/* Dig out information for optimizations. */
279
r->regstart = '\0'; /* Worst-case defaults. */
280
r->reganch = 0;
281
r->regmust = NULL;
282
r->regmlen = 0;
283
scan = r->program+1; /* First BRANCH. */
284
if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
285
scan = OPERAND(scan);
286
287
/* Starting-point info. */
288
if (OP(scan) == EXACTLY)
289
r->regstart = *OPERAND(scan);
290
else if (OP(scan) == BOL)
291
r->reganch++;
292
293
/*
294
* If there's something expensive in the r.e., find the
295
* longest literal string that must appear and make it the
296
* regmust. Resolve ties in favor of later strings, since
297
* the regstart check works with the beginning of the r.e.
298
* and avoiding duplication strengthens checking. Not a
299
* strong reason, but sufficient in the absence of others.
300
*/
301
if (flags&SPSTART) {
302
longest = NULL;
303
len = 0;
304
for (; scan != NULL; scan = regnext(scan))
305
if (OP(scan) == EXACTLY && ((int) strlen(OPERAND(scan))) >= len) {
306
longest = OPERAND(scan);
307
len = strlen(OPERAND(scan));
308
}
309
r->regmust = longest;
310
r->regmlen = len;
311
}
312
}
313
314
return(r);
315
}
316
317
/*
318
- reg - regular expression, i.e. main body or parenthesized thing
319
*
320
* Caller must absorb opening parenthesis.
321
*
322
* Combining parenthesis handling with the base level of regular expression
323
* is a trifle forced, but the need to tie the tails of the branches to what
324
* follows makes it hard to avoid.
325
*/
326
static char *
327
reg(paren, flagp, rcstate)
328
int paren; /* Parenthesized? */
329
int *flagp;
330
struct regcomp_state *rcstate;
331
{
332
register char *ret;
333
register char *br;
334
register char *ender;
335
register int parno = 0;
336
int flags;
337
338
*flagp = HASWIDTH; /* Tentatively. */
339
340
/* Make an OPEN node, if parenthesized. */
341
if (paren) {
342
if (rcstate->regnpar >= NSUBEXP)
343
FAIL("too many ()");
344
parno = rcstate->regnpar;
345
rcstate->regnpar++;
346
ret = regnode(OPEN+parno,rcstate);
347
} else
348
ret = NULL;
349
350
/* Pick up the branches, linking them together. */
351
br = regbranch(&flags,rcstate);
352
if (br == NULL)
353
return(NULL);
354
if (ret != NULL)
355
regtail(ret, br); /* OPEN -> first. */
356
else
357
ret = br;
358
if (!(flags&HASWIDTH))
359
*flagp &= ~HASWIDTH;
360
*flagp |= flags&SPSTART;
361
while (*rcstate->regparse == '|') {
362
rcstate->regparse++;
363
br = regbranch(&flags,rcstate);
364
if (br == NULL)
365
return(NULL);
366
regtail(ret, br); /* BRANCH -> BRANCH. */
367
if (!(flags&HASWIDTH))
368
*flagp &= ~HASWIDTH;
369
*flagp |= flags&SPSTART;
370
}
371
372
/* Make a closing node, and hook it on the end. */
373
ender = regnode((paren) ? CLOSE+parno : END,rcstate);
374
regtail(ret, ender);
375
376
/* Hook the tails of the branches to the closing node. */
377
for (br = ret; br != NULL; br = regnext(br))
378
regoptail(br, ender);
379
380
/* Check for proper termination. */
381
if (paren && *rcstate->regparse++ != ')') {
382
FAIL("unmatched ()");
383
} else if (!paren && *rcstate->regparse != '\0') {
384
if (*rcstate->regparse == ')') {
385
FAIL("unmatched ()");
386
} else
387
FAIL("junk on end"); /* "Can't happen". */
388
/* NOTREACHED */
389
}
390
391
return(ret);
392
}
393
394
/*
395
- regbranch - one alternative of an | operator
396
*
397
* Implements the concatenation operator.
398
*/
399
static char *
400
regbranch(flagp, rcstate)
401
int *flagp;
402
struct regcomp_state *rcstate;
403
{
404
register char *ret;
405
register char *chain;
406
register char *latest;
407
int flags;
408
409
*flagp = WORST; /* Tentatively. */
410
411
ret = regnode(BRANCH,rcstate);
412
chain = NULL;
413
while (*rcstate->regparse != '\0' && *rcstate->regparse != '|' &&
414
*rcstate->regparse != ')') {
415
latest = regpiece(&flags, rcstate);
416
if (latest == NULL)
417
return(NULL);
418
*flagp |= flags&HASWIDTH;
419
if (chain == NULL) /* First piece. */
420
*flagp |= flags&SPSTART;
421
else
422
regtail(chain, latest);
423
chain = latest;
424
}
425
if (chain == NULL) /* Loop ran zero times. */
426
(void) regnode(NOTHING,rcstate);
427
428
return(ret);
429
}
430
431
/*
432
- regpiece - something followed by possible [*+?]
433
*
434
* Note that the branching code sequences used for ? and the general cases
435
* of * and + are somewhat optimized: they use the same NOTHING node as
436
* both the endmarker for their branch list and the body of the last branch.
437
* It might seem that this node could be dispensed with entirely, but the
438
* endmarker role is not redundant.
439
*/
440
static char *
441
regpiece(flagp, rcstate)
442
int *flagp;
443
struct regcomp_state *rcstate;
444
{
445
register char *ret;
446
register char op;
447
register char *next;
448
int flags;
449
450
ret = regatom(&flags,rcstate);
451
if (ret == NULL)
452
return(NULL);
453
454
op = *rcstate->regparse;
455
if (!ISMULT(op)) {
456
*flagp = flags;
457
return(ret);
458
}
459
460
if (!(flags&HASWIDTH) && op != '?')
461
FAIL("*+ operand could be empty");
462
*flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
463
464
if (op == '*' && (flags&SIMPLE))
465
reginsert(STAR, ret, rcstate);
466
else if (op == '*') {
467
/* Emit x* as (x&|), where & means "self". */
468
reginsert(BRANCH, ret, rcstate); /* Either x */
469
regoptail(ret, regnode(BACK,rcstate)); /* and loop */
470
regoptail(ret, ret); /* back */
471
regtail(ret, regnode(BRANCH,rcstate)); /* or */
472
regtail(ret, regnode(NOTHING,rcstate)); /* null. */
473
} else if (op == '+' && (flags&SIMPLE))
474
reginsert(PLUS, ret, rcstate);
475
else if (op == '+') {
476
/* Emit x+ as x(&|), where & means "self". */
477
next = regnode(BRANCH,rcstate); /* Either */
478
regtail(ret, next);
479
regtail(regnode(BACK,rcstate), ret); /* loop back */
480
regtail(next, regnode(BRANCH,rcstate)); /* or */
481
regtail(ret, regnode(NOTHING,rcstate)); /* null. */
482
} else if (op == '?') {
483
/* Emit x? as (x|) */
484
reginsert(BRANCH, ret, rcstate); /* Either x */
485
regtail(ret, regnode(BRANCH,rcstate)); /* or */
486
next = regnode(NOTHING,rcstate); /* null. */
487
regtail(ret, next);
488
regoptail(ret, next);
489
}
490
rcstate->regparse++;
491
if (ISMULT(*rcstate->regparse))
492
FAIL("nested *?+");
493
494
return(ret);
495
}
496
497
/*
498
- regatom - the lowest level
499
*
500
* Optimization: gobbles an entire sequence of ordinary characters so that
501
* it can turn them into a single node, which is smaller to store and
502
* faster to run. Backslashed characters are exceptions, each becoming a
503
* separate node; the code is simpler that way and it's not worth fixing.
504
*/
505
static char *
506
regatom(flagp, rcstate)
507
int *flagp;
508
struct regcomp_state *rcstate;
509
{
510
register char *ret;
511
int flags;
512
513
*flagp = WORST; /* Tentatively. */
514
515
switch (*rcstate->regparse++) {
516
case '^':
517
ret = regnode(BOL,rcstate);
518
break;
519
case '$':
520
ret = regnode(EOL,rcstate);
521
break;
522
case '.':
523
ret = regnode(ANY,rcstate);
524
*flagp |= HASWIDTH|SIMPLE;
525
break;
526
case '[': {
527
register int clss;
528
register int classend;
529
530
if (*rcstate->regparse == '^') { /* Complement of range. */
531
ret = regnode(ANYBUT,rcstate);
532
rcstate->regparse++;
533
} else
534
ret = regnode(ANYOF,rcstate);
535
if (*rcstate->regparse == ']' || *rcstate->regparse == '-')
536
regc(*rcstate->regparse++,rcstate);
537
while (*rcstate->regparse != '\0' && *rcstate->regparse != ']') {
538
if (*rcstate->regparse == '-') {
539
rcstate->regparse++;
540
if (*rcstate->regparse == ']' || *rcstate->regparse == '\0')
541
regc('-',rcstate);
542
else {
543
clss = UCHARAT(rcstate->regparse-2)+1;
544
classend = UCHARAT(rcstate->regparse);
545
if (clss > classend+1)
546
FAIL("invalid [] range");
547
for (; clss <= classend; clss++)
548
regc((char)clss,rcstate);
549
rcstate->regparse++;
550
}
551
} else
552
regc(*rcstate->regparse++,rcstate);
553
}
554
regc('\0',rcstate);
555
if (*rcstate->regparse != ']')
556
FAIL("unmatched []");
557
rcstate->regparse++;
558
*flagp |= HASWIDTH|SIMPLE;
559
}
560
break;
561
case '(':
562
ret = reg(1, &flags, rcstate);
563
if (ret == NULL)
564
return(NULL);
565
*flagp |= flags&(HASWIDTH|SPSTART);
566
break;
567
case '\0':
568
case '|':
569
case ')':
570
FAIL("internal urp"); /* Supposed to be caught earlier. */
571
/* NOTREACHED */
572
break;
573
case '?':
574
case '+':
575
case '*':
576
FAIL("?+* follows nothing");
577
/* NOTREACHED */
578
break;
579
case '\\':
580
if (*rcstate->regparse == '\0')
581
FAIL("trailing \\");
582
ret = regnode(EXACTLY,rcstate);
583
regc(*rcstate->regparse++,rcstate);
584
regc('\0',rcstate);
585
*flagp |= HASWIDTH|SIMPLE;
586
break;
587
default: {
588
register int len;
589
register char ender;
590
591
rcstate->regparse--;
592
len = strcspn(rcstate->regparse, META);
593
if (len <= 0)
594
FAIL("internal disaster");
595
ender = *(rcstate->regparse+len);
596
if (len > 1 && ISMULT(ender))
597
len--; /* Back off clear of ?+* operand. */
598
*flagp |= HASWIDTH;
599
if (len == 1)
600
*flagp |= SIMPLE;
601
ret = regnode(EXACTLY,rcstate);
602
while (len > 0) {
603
regc(*rcstate->regparse++,rcstate);
604
len--;
605
}
606
regc('\0',rcstate);
607
}
608
break;
609
}
610
611
return(ret);
612
}
613
614
/*
615
- regnode - emit a node
616
*/
617
static char * /* Location. */
618
regnode(op, rcstate)
619
int op;
620
struct regcomp_state *rcstate;
621
{
622
register char *ret;
623
register char *ptr;
624
625
ret = rcstate->regcode;
626
if (ret == &regdummy) {
627
rcstate->regsize += 3;
628
return(ret);
629
}
630
631
ptr = ret;
632
*ptr++ = (char)op;
633
*ptr++ = '\0'; /* Null "next" pointer. */
634
*ptr++ = '\0';
635
rcstate->regcode = ptr;
636
637
return(ret);
638
}
639
640
/*
641
- regc - emit (if appropriate) a byte of code
642
*/
643
static void
644
regc(b, rcstate)
645
int b;
646
struct regcomp_state *rcstate;
647
{
648
if (rcstate->regcode != &regdummy)
649
*rcstate->regcode++ = (char)b;
650
else
651
rcstate->regsize++;
652
}
653
654
/*
655
- reginsert - insert an operator in front of already-emitted operand
656
*
657
* Means relocating the operand.
658
*/
659
static void
660
reginsert(op, opnd, rcstate)
661
int op;
662
char *opnd;
663
struct regcomp_state *rcstate;
664
{
665
register char *src;
666
register char *dst;
667
register char *place;
668
669
if (rcstate->regcode == &regdummy) {
670
rcstate->regsize += 3;
671
return;
672
}
673
674
src = rcstate->regcode;
675
rcstate->regcode += 3;
676
dst = rcstate->regcode;
677
while (src > opnd)
678
*--dst = *--src;
679
680
place = opnd; /* Op node, where operand used to be. */
681
*place++ = (char)op;
682
*place++ = '\0';
683
*place = '\0';
684
}
685
686
/*
687
- regtail - set the next-pointer at the end of a node chain
688
*/
689
static void
690
regtail(p, val)
691
char *p;
692
char *val;
693
{
694
register char *scan;
695
register char *temp;
696
register int offset;
697
698
if (p == &regdummy)
699
return;
700
701
/* Find last node. */
702
scan = p;
703
for (;;) {
704
temp = regnext(scan);
705
if (temp == NULL)
706
break;
707
scan = temp;
708
}
709
710
if (OP(scan) == BACK)
711
offset = scan - val;
712
else
713
offset = val - scan;
714
*(scan+1) = (char)((offset>>8)&0377);
715
*(scan+2) = (char)(offset&0377);
716
}
717
718
/*
719
- regoptail - regtail on operand of first argument; nop if operandless
720
*/
721
static void
722
regoptail(p, val)
723
char *p;
724
char *val;
725
{
726
/* "Operandless" and "op != BRANCH" are synonymous in practice. */
727
if (p == NULL || p == &regdummy || OP(p) != BRANCH)
728
return;
729
regtail(OPERAND(p), val);
730
}
731
732
/*
733
* TclRegExec and friends
734
*/
735
736
/*
737
* Global work variables for TclRegExec().
738
*/
739
struct regexec_state {
740
char *reginput; /* String-input pointer. */
741
char *regbol; /* Beginning of input, for ^ check. */
742
char **regstartp; /* Pointer to startp array. */
743
char **regendp; /* Ditto for endp. */
744
};
745
746
/*
747
* Forwards.
748
*/
749
static int regtry _ANSI_ARGS_((regexp *prog, char *string,
750
struct regexec_state *restate));
751
static int regmatch _ANSI_ARGS_((char *prog,
752
struct regexec_state *restate));
753
static int regrepeat _ANSI_ARGS_((char *p,
754
struct regexec_state *restate));
755
756
#ifdef DEBUG
757
int regnarrate = 0;
758
void regdump _ANSI_ARGS_((regexp *r));
759
static char *regprop _ANSI_ARGS_((char *op));
760
#endif
761
762
/*
763
- TclRegExec - match a regexp against a string
764
*/
765
int
766
TclRegExec(prog, string, start)
767
register regexp *prog;
768
register char *string;
769
char *start;
770
{
771
register char *s;
772
struct regexec_state state;
773
struct regexec_state *restate= &state;
774
775
/* Be paranoid... */
776
if (prog == NULL || string == NULL) {
777
TclRegError("NULL parameter");
778
return(0);
779
}
780
781
/* Check validity of program. */
782
if (UCHARAT(prog->program) != MAGIC) {
783
TclRegError("corrupted program");
784
return(0);
785
}
786
787
/* If there is a "must appear" string, look for it. */
788
if (prog->regmust != NULL) {
789
s = string;
790
while ((s = strchr(s, prog->regmust[0])) != NULL) {
791
if (strncmp(s, prog->regmust, (size_t) prog->regmlen)
792
== 0)
793
break; /* Found it. */
794
s++;
795
}
796
if (s == NULL) /* Not present. */
797
return(0);
798
}
799
800
/* Mark beginning of line for ^ . */
801
restate->regbol = start;
802
803
/* Simplest case: anchored match need be tried only once. */
804
if (prog->reganch)
805
return(regtry(prog, string, restate));
806
807
/* Messy cases: unanchored match. */
808
s = string;
809
if (prog->regstart != '\0')
810
/* We know what char it must start with. */
811
while ((s = strchr(s, prog->regstart)) != NULL) {
812
if (regtry(prog, s, restate))
813
return(1);
814
s++;
815
}
816
else
817
/* We don't -- general case. */
818
do {
819
if (regtry(prog, s, restate))
820
return(1);
821
} while (*s++ != '\0');
822
823
/* Failure. */
824
return(0);
825
}
826
827
/*
828
- regtry - try match at specific point
829
*/
830
static int /* 0 failure, 1 success */
831
regtry(prog, string, restate)
832
regexp *prog;
833
char *string;
834
struct regexec_state *restate;
835
{
836
register int i;
837
register char **sp;
838
register char **ep;
839
840
restate->reginput = string;
841
restate->regstartp = prog->startp;
842
restate->regendp = prog->endp;
843
844
sp = prog->startp;
845
ep = prog->endp;
846
for (i = NSUBEXP; i > 0; i--) {
847
*sp++ = NULL;
848
*ep++ = NULL;
849
}
850
if (regmatch(prog->program + 1,restate)) {
851
prog->startp[0] = string;
852
prog->endp[0] = restate->reginput;
853
return(1);
854
} else
855
return(0);
856
}
857
858
/*
859
- regmatch - main matching routine
860
*
861
* Conceptually the strategy is simple: check to see whether the current
862
* node matches, call self recursively to see whether the rest matches,
863
* and then act accordingly. In practice we make some effort to avoid
864
* recursion, in particular by going through "ordinary" nodes (that don't
865
* need to know whether the rest of the match failed) by a loop instead of
866
* by recursion.
867
*/
868
static int /* 0 failure, 1 success */
869
regmatch(prog, restate)
870
char *prog;
871
struct regexec_state *restate;
872
{
873
register char *scan; /* Current node. */
874
char *next; /* Next node. */
875
876
scan = prog;
877
#ifdef DEBUG
878
if (scan != NULL && regnarrate)
879
fprintf(stderr, "%s(\n", regprop(scan));
880
#endif
881
while (scan != NULL) {
882
#ifdef DEBUG
883
if (regnarrate)
884
fprintf(stderr, "%s...\n", regprop(scan));
885
#endif
886
next = regnext(scan);
887
888
switch (OP(scan)) {
889
case BOL:
890
if (restate->reginput != restate->regbol) {
891
return 0;
892
}
893
break;
894
case EOL:
895
if (*restate->reginput != '\0') {
896
return 0;
897
}
898
break;
899
case ANY:
900
if (*restate->reginput == '\0') {
901
return 0;
902
}
903
restate->reginput++;
904
break;
905
case EXACTLY: {
906
register int len;
907
register char *opnd;
908
909
opnd = OPERAND(scan);
910
/* Inline the first character, for speed. */
911
if (*opnd != *restate->reginput) {
912
return 0 ;
913
}
914
len = strlen(opnd);
915
if (len > 1 && strncmp(opnd, restate->reginput, (size_t) len)
916
!= 0) {
917
return 0;
918
}
919
restate->reginput += len;
920
break;
921
}
922
case ANYOF:
923
if (*restate->reginput == '\0'
924
|| strchr(OPERAND(scan), *restate->reginput) == NULL) {
925
return 0;
926
}
927
restate->reginput++;
928
break;
929
case ANYBUT:
930
if (*restate->reginput == '\0'
931
|| strchr(OPERAND(scan), *restate->reginput) != NULL) {
932
return 0;
933
}
934
restate->reginput++;
935
break;
936
case NOTHING:
937
break;
938
case BACK:
939
break;
940
case OPEN+1:
941
case OPEN+2:
942
case OPEN+3:
943
case OPEN+4:
944
case OPEN+5:
945
case OPEN+6:
946
case OPEN+7:
947
case OPEN+8:
948
case OPEN+9: {
949
register int no;
950
register char *save;
951
952
doOpen:
953
no = OP(scan) - OPEN;
954
save = restate->reginput;
955
956
if (regmatch(next,restate)) {
957
/*
958
* Don't set startp if some later invocation of the
959
* same parentheses already has.
960
*/
961
if (restate->regstartp[no] == NULL) {
962
restate->regstartp[no] = save;
963
}
964
return 1;
965
} else {
966
return 0;
967
}
968
}
969
case CLOSE+1:
970
case CLOSE+2:
971
case CLOSE+3:
972
case CLOSE+4:
973
case CLOSE+5:
974
case CLOSE+6:
975
case CLOSE+7:
976
case CLOSE+8:
977
case CLOSE+9: {
978
register int no;
979
register char *save;
980
981
doClose:
982
no = OP(scan) - CLOSE;
983
save = restate->reginput;
984
985
if (regmatch(next,restate)) {
986
/*
987
* Don't set endp if some later
988
* invocation of the same parentheses
989
* already has.
990
*/
991
if (restate->regendp[no] == NULL)
992
restate->regendp[no] = save;
993
return 1;
994
} else {
995
return 0;
996
}
997
}
998
case BRANCH: {
999
register char *save;
1000
1001
if (OP(next) != BRANCH) { /* No choice. */
1002
next = OPERAND(scan); /* Avoid recursion. */
1003
} else {
1004
do {
1005
save = restate->reginput;
1006
if (regmatch(OPERAND(scan),restate))
1007
return(1);
1008
restate->reginput = save;
1009
scan = regnext(scan);
1010
} while (scan != NULL && OP(scan) == BRANCH);
1011
return 0;
1012
}
1013
break;
1014
}
1015
case STAR:
1016
case PLUS: {
1017
register char nextch;
1018
register int no;
1019
register char *save;
1020
register int min;
1021
1022
/*
1023
* Lookahead to avoid useless match attempts
1024
* when we know what character comes next.
1025
*/
1026
nextch = '\0';
1027
if (OP(next) == EXACTLY)
1028
nextch = *OPERAND(next);
1029
min = (OP(scan) == STAR) ? 0 : 1;
1030
save = restate->reginput;
1031
no = regrepeat(OPERAND(scan),restate);
1032
while (no >= min) {
1033
/* If it could work, try it. */
1034
if (nextch == '\0' || *restate->reginput == nextch)
1035
if (regmatch(next,restate))
1036
return(1);
1037
/* Couldn't or didn't -- back up. */
1038
no--;
1039
restate->reginput = save + no;
1040
}
1041
return(0);
1042
}
1043
case END:
1044
return(1); /* Success! */
1045
default:
1046
if (OP(scan) > OPEN && OP(scan) < OPEN+NSUBEXP) {
1047
goto doOpen;
1048
} else if (OP(scan) > CLOSE && OP(scan) < CLOSE+NSUBEXP) {
1049
goto doClose;
1050
}
1051
TclRegError("memory corruption");
1052
return 0;
1053
}
1054
1055
scan = next;
1056
}
1057
1058
/*
1059
* We get here only if there's trouble -- normally "case END" is
1060
* the terminating point.
1061
*/
1062
TclRegError("corrupted pointers");
1063
return(0);
1064
}
1065
1066
/*
1067
- regrepeat - repeatedly match something simple, report how many
1068
*/
1069
static int
1070
regrepeat(p, restate)
1071
char *p;
1072
struct regexec_state *restate;
1073
{
1074
register int count = 0;
1075
register char *scan;
1076
register char *opnd;
1077
1078
scan = restate->reginput;
1079
opnd = OPERAND(p);
1080
switch (OP(p)) {
1081
case ANY:
1082
count = strlen(scan);
1083
scan += count;
1084
break;
1085
case EXACTLY:
1086
while (*opnd == *scan) {
1087
count++;
1088
scan++;
1089
}
1090
break;
1091
case ANYOF:
1092
while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1093
count++;
1094
scan++;
1095
}
1096
break;
1097
case ANYBUT:
1098
while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1099
count++;
1100
scan++;
1101
}
1102
break;
1103
default: /* Oh dear. Called inappropriately. */
1104
TclRegError("internal foulup");
1105
count = 0; /* Best compromise. */
1106
break;
1107
}
1108
restate->reginput = scan;
1109
1110
return(count);
1111
}
1112
1113
/*
1114
- regnext - dig the "next" pointer out of a node
1115
*/
1116
static char *
1117
regnext(p)
1118
register char *p;
1119
{
1120
register int offset;
1121
1122
if (p == &regdummy)
1123
return(NULL);
1124
1125
offset = NEXT(p);
1126
if (offset == 0)
1127
return(NULL);
1128
1129
if (OP(p) == BACK)
1130
return(p-offset);
1131
else
1132
return(p+offset);
1133
}
1134
1135
#ifdef DEBUG
1136
1137
static char *regprop();
1138
1139
/*
1140
- regdump - dump a regexp onto stdout in vaguely comprehensible form
1141
*/
1142
void
1143
regdump(r)
1144
regexp *r;
1145
{
1146
register char *s;
1147
register char op = EXACTLY; /* Arbitrary non-END op. */
1148
register char *next;
1149
1150
1151
s = r->program + 1;
1152
while (op != END) { /* While that wasn't END last time... */
1153
op = OP(s);
1154
printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
1155
next = regnext(s);
1156
if (next == NULL) /* Next ptr. */
1157
printf("(0)");
1158
else
1159
printf("(%d)", (s-r->program)+(next-s));
1160
s += 3;
1161
if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1162
/* Literal string, where present. */
1163
while (*s != '\0') {
1164
putchar(*s);
1165
s++;
1166
}
1167
s++;
1168
}
1169
putchar('\n');
1170
}
1171
1172
/* Header fields of interest. */
1173
if (r->regstart != '\0')
1174
printf("start `%c' ", r->regstart);
1175
if (r->reganch)
1176
printf("anchored ");
1177
if (r->regmust != NULL)
1178
printf("must have \"%s\"", r->regmust);
1179
printf("\n");
1180
}
1181
1182
/*
1183
- regprop - printable representation of opcode
1184
*/
1185
static char *
1186
regprop(op)
1187
char *op;
1188
{
1189
register char *p;
1190
static char buf[50];
1191
1192
(void) strcpy(buf, ":");
1193
1194
switch (OP(op)) {
1195
case BOL:
1196
p = "BOL";
1197
break;
1198
case EOL:
1199
p = "EOL";
1200
break;
1201
case ANY:
1202
p = "ANY";
1203
break;
1204
case ANYOF:
1205
p = "ANYOF";
1206
break;
1207
case ANYBUT:
1208
p = "ANYBUT";
1209
break;
1210
case BRANCH:
1211
p = "BRANCH";
1212
break;
1213
case EXACTLY:
1214
p = "EXACTLY";
1215
break;
1216
case NOTHING:
1217
p = "NOTHING";
1218
break;
1219
case BACK:
1220
p = "BACK";
1221
break;
1222
case END:
1223
p = "END";
1224
break;
1225
case OPEN+1:
1226
case OPEN+2:
1227
case OPEN+3:
1228
case OPEN+4:
1229
case OPEN+5:
1230
case OPEN+6:
1231
case OPEN+7:
1232
case OPEN+8:
1233
case OPEN+9:
1234
sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1235
p = NULL;
1236
break;
1237
case CLOSE+1:
1238
case CLOSE+2:
1239
case CLOSE+3:
1240
case CLOSE+4:
1241
case CLOSE+5:
1242
case CLOSE+6:
1243
case CLOSE+7:
1244
case CLOSE+8:
1245
case CLOSE+9:
1246
sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1247
p = NULL;
1248
break;
1249
case STAR:
1250
p = "STAR";
1251
break;
1252
case PLUS:
1253
p = "PLUS";
1254
break;
1255
default:
1256
if (OP(op) > OPEN && OP(op) < OPEN+NSUBEXP) {
1257
sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1258
p = NULL;
1259
break;
1260
} else if (OP(op) > CLOSE && OP(op) < CLOSE+NSUBEXP) {
1261
sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1262
p = NULL;
1263
} else {
1264
TclRegError("corrupted opcode");
1265
}
1266
break;
1267
}
1268
if (p != NULL)
1269
(void) strcat(buf, p);
1270
return(buf);
1271
}
1272
#endif
1273
1274
/*
1275
* The following is provided for those people who do not have strcspn() in
1276
* their C libraries. They should get off their butts and do something
1277
* about it; at least one public-domain implementation of those (highly
1278
* useful) string routines has been published on Usenet.
1279
*/
1280
#ifdef STRCSPN
1281
/*
1282
* strcspn - find length of initial segment of s1 consisting entirely
1283
* of characters not from s2
1284
*/
1285
1286
static int
1287
strcspn(s1, s2)
1288
char *s1;
1289
char *s2;
1290
{
1291
register char *scan1;
1292
register char *scan2;
1293
register int count;
1294
1295
count = 0;
1296
for (scan1 = s1; *scan1 != '\0'; scan1++) {
1297
for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1298
if (*scan1 == *scan2++)
1299
return(count);
1300
count++;
1301
}
1302
return(count);
1303
}
1304
#endif
1305
1306
/*
1307
*----------------------------------------------------------------------
1308
*
1309
* TclRegError --
1310
*
1311
* This procedure is invoked by the regexp code when an error
1312
* occurs. It saves the error message so it can be seen by the
1313
* code that called Spencer's code.
1314
*
1315
* Results:
1316
* None.
1317
*
1318
* Side effects:
1319
* The value of "string" is saved in "errMsg".
1320
*
1321
*----------------------------------------------------------------------
1322
*/
1323
1324
void
1325
TclRegError(string)
1326
char *string; /* Error message. */
1327
{
1328
errMsg = string;
1329
}
1330
1331
char *
1332
TclGetRegError()
1333
{
1334
return errMsg;
1335
}
1336
1337