Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download
7639 views
1
#include <stdlib.h>
2
#include <stdio.h>
3
#include <string.h>
4
#include <setjmp.h>
5
#include <limits.h>
6
7
#include "regex.h"
8
#include "utf.h"
9
10
#define emit regemit
11
#define next regnext
12
#define accept regaccept
13
14
#define nelem(a) (sizeof (a) / sizeof (a)[0])
15
16
#define REPINF 255
17
#define MAXTHREAD 1000
18
#define MAXSUB REG_MAXSUB
19
20
typedef struct Reclass Reclass;
21
typedef struct Renode Renode;
22
typedef struct Reinst Reinst;
23
typedef struct Rethread Rethread;
24
25
struct Reclass {
26
Rune *end;
27
Rune spans[64];
28
};
29
30
struct Reprog {
31
Reinst *start, *end;
32
int flags;
33
unsigned int nsub;
34
Reclass cclass[16];
35
};
36
37
struct cstate {
38
Reprog *prog;
39
Renode *pstart, *pend;
40
41
const char *source;
42
unsigned int ncclass;
43
unsigned int nsub;
44
Renode *sub[MAXSUB];
45
46
int lookahead;
47
Rune yychar;
48
Reclass *yycc;
49
int yymin, yymax;
50
51
const char *error;
52
jmp_buf kaboom;
53
};
54
55
static void die(struct cstate *g, const char *message)
56
{
57
g->error = message;
58
longjmp(g->kaboom, 1);
59
}
60
61
static int canon(Rune c)
62
{
63
Rune u = toupperrune(c);
64
if (c >= 128 && u < 128)
65
return c;
66
return u;
67
}
68
69
/* Scan */
70
71
enum {
72
L_CHAR = 256,
73
L_CCLASS, /* character class */
74
L_NCCLASS, /* negative character class */
75
L_NC, /* "(?:" no capture */
76
L_PLA, /* "(?=" positive lookahead */
77
L_NLA, /* "(?!" negative lookahead */
78
L_WORD, /* "\b" word boundary */
79
L_NWORD, /* "\B" non-word boundary */
80
L_REF, /* "\1" back-reference */
81
L_COUNT, /* {M,N} */
82
};
83
84
static int hex(struct cstate *g, int c)
85
{
86
if (c >= '0' && c <= '9') return c - '0';
87
if (c >= 'a' && c <= 'f') return c - 'a' + 0xA;
88
if (c >= 'A' && c <= 'F') return c - 'A' + 0xA;
89
die(g, "invalid escape sequence");
90
return 0;
91
}
92
93
static int dec(struct cstate *g, int c)
94
{
95
if (c >= '0' && c <= '9') return c - '0';
96
die(g, "invalid quantifier");
97
return 0;
98
}
99
100
#define ESCAPES "BbDdSsWw^$\\.*+?()[]{}|0123456789"
101
102
static int isunicodeletter(int c)
103
{
104
return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || isalpharune(c);
105
}
106
107
static int nextrune(struct cstate *g)
108
{
109
g->source += chartorune(&g->yychar, g->source);
110
if (g->yychar == '\\') {
111
g->source += chartorune(&g->yychar, g->source);
112
switch (g->yychar) {
113
case 0: die(g, "unterminated escape sequence");
114
case 'f': g->yychar = '\f'; return 0;
115
case 'n': g->yychar = '\n'; return 0;
116
case 'r': g->yychar = '\r'; return 0;
117
case 't': g->yychar = '\t'; return 0;
118
case 'v': g->yychar = '\v'; return 0;
119
case 'c':
120
g->yychar = (*g->source++) & 31;
121
return 0;
122
case 'x':
123
g->yychar = hex(g, *g->source++) << 4;
124
g->yychar += hex(g, *g->source++);
125
if (g->yychar == 0) {
126
g->yychar = '0';
127
return 1;
128
}
129
return 0;
130
case 'u':
131
g->yychar = hex(g, *g->source++) << 12;
132
g->yychar += hex(g, *g->source++) << 8;
133
g->yychar += hex(g, *g->source++) << 4;
134
g->yychar += hex(g, *g->source++);
135
if (g->yychar == 0) {
136
g->yychar = '0';
137
return 1;
138
}
139
return 0;
140
}
141
if (strchr(ESCAPES, g->yychar))
142
return 1;
143
if (isunicodeletter(g->yychar) || g->yychar == '_') /* check identity escape */
144
die(g, "invalid escape character");
145
return 0;
146
}
147
return 0;
148
}
149
150
static int lexcount(struct cstate *g)
151
{
152
g->yychar = *g->source++;
153
154
g->yymin = dec(g, g->yychar);
155
g->yychar = *g->source++;
156
while (g->yychar != ',' && g->yychar != '}') {
157
g->yymin = g->yymin * 10 + dec(g, g->yychar);
158
g->yychar = *g->source++;
159
}
160
if (g->yymin >= REPINF)
161
die(g, "numeric overflow");
162
163
if (g->yychar == ',') {
164
g->yychar = *g->source++;
165
if (g->yychar == '}') {
166
g->yymax = REPINF;
167
} else {
168
g->yymax = dec(g, g->yychar);
169
g->yychar = *g->source++;
170
while (g->yychar != '}') {
171
g->yymax = g->yymax * 10 + dec(g, g->yychar);
172
g->yychar = *g->source++;
173
}
174
if (g->yymax >= REPINF)
175
die(g, "numeric overflow");
176
}
177
} else {
178
g->yymax = g->yymin;
179
}
180
181
return L_COUNT;
182
}
183
184
static void newcclass(struct cstate *g)
185
{
186
if (g->ncclass >= nelem(g->prog->cclass))
187
die(g, "too many character classes");
188
g->yycc = g->prog->cclass + g->ncclass++;
189
g->yycc->end = g->yycc->spans;
190
}
191
192
static void addrange(struct cstate *g, Rune a, Rune b)
193
{
194
if (a > b)
195
die(g, "invalid character class range");
196
if (g->yycc->end + 2 == g->yycc->spans + nelem(g->yycc->spans))
197
die(g, "too many character class ranges");
198
*g->yycc->end++ = a;
199
*g->yycc->end++ = b;
200
}
201
202
static void addranges_d(struct cstate *g)
203
{
204
addrange(g, '0', '9');
205
}
206
207
static void addranges_D(struct cstate *g)
208
{
209
addrange(g, 0, '0'-1);
210
addrange(g, '9'+1, 0xFFFF);
211
}
212
213
static void addranges_s(struct cstate *g)
214
{
215
addrange(g, 0x9, 0x9);
216
addrange(g, 0xA, 0xD);
217
addrange(g, 0x20, 0x20);
218
addrange(g, 0xA0, 0xA0);
219
addrange(g, 0x2028, 0x2029);
220
addrange(g, 0xFEFF, 0xFEFF);
221
}
222
223
static void addranges_S(struct cstate *g)
224
{
225
addrange(g, 0, 0x9-1);
226
addrange(g, 0x9+1, 0xA-1);
227
addrange(g, 0xD+1, 0x20-1);
228
addrange(g, 0x20+1, 0xA0-1);
229
addrange(g, 0xA0+1, 0x2028-1);
230
addrange(g, 0x2029+1, 0xFEFF-1);
231
addrange(g, 0xFEFF+1, 0xFFFF);
232
}
233
234
static void addranges_w(struct cstate *g)
235
{
236
addrange(g, '0', '9');
237
addrange(g, 'A', 'Z');
238
addrange(g, '_', '_');
239
addrange(g, 'a', 'z');
240
}
241
242
static void addranges_W(struct cstate *g)
243
{
244
addrange(g, 0, '0'-1);
245
addrange(g, '9'+1, 'A'-1);
246
addrange(g, 'Z'+1, '_'-1);
247
addrange(g, '_'+1, 'a'-1);
248
addrange(g, 'z'+1, 0xFFFF);
249
}
250
251
static int lexclass(struct cstate *g)
252
{
253
int type = L_CCLASS;
254
int quoted, havesave, havedash;
255
Rune save;
256
257
newcclass(g);
258
259
quoted = nextrune(g);
260
if (!quoted && g->yychar == '^') {
261
type = L_NCCLASS;
262
quoted = nextrune(g);
263
}
264
265
havesave = havedash = 0;
266
for (;;) {
267
if (g->yychar == 0)
268
die(g, "unterminated character class");
269
if (!quoted && g->yychar == ']')
270
break;
271
272
if (!quoted && g->yychar == '-') {
273
if (havesave) {
274
if (havedash) {
275
addrange(g, save, '-');
276
havesave = havedash = 0;
277
} else {
278
havedash = 1;
279
}
280
} else {
281
save = '-';
282
havesave = 1;
283
}
284
} else if (quoted && strchr("DSWdsw", g->yychar)) {
285
if (havesave) {
286
addrange(g, save, save);
287
if (havedash)
288
addrange(g, '-', '-');
289
}
290
switch (g->yychar) {
291
case 'd': addranges_d(g); break;
292
case 's': addranges_s(g); break;
293
case 'w': addranges_w(g); break;
294
case 'D': addranges_D(g); break;
295
case 'S': addranges_S(g); break;
296
case 'W': addranges_W(g); break;
297
}
298
havesave = havedash = 0;
299
} else {
300
if (quoted) {
301
if (g->yychar == 'b')
302
g->yychar = '\b';
303
else if (g->yychar == '0')
304
g->yychar = 0;
305
/* else identity escape */
306
}
307
if (havesave) {
308
if (havedash) {
309
addrange(g, save, g->yychar);
310
havesave = havedash = 0;
311
} else {
312
addrange(g, save, save);
313
save = g->yychar;
314
}
315
} else {
316
save = g->yychar;
317
havesave = 1;
318
}
319
}
320
321
quoted = nextrune(g);
322
}
323
324
if (havesave) {
325
addrange(g, save, save);
326
if (havedash)
327
addrange(g, '-', '-');
328
}
329
330
return type;
331
}
332
333
static int lex(struct cstate *g)
334
{
335
int quoted = nextrune(g);
336
if (quoted) {
337
switch (g->yychar) {
338
case 'b': return L_WORD;
339
case 'B': return L_NWORD;
340
case 'd': newcclass(g); addranges_d(g); return L_CCLASS;
341
case 's': newcclass(g); addranges_s(g); return L_CCLASS;
342
case 'w': newcclass(g); addranges_w(g); return L_CCLASS;
343
case 'D': newcclass(g); addranges_d(g); return L_NCCLASS;
344
case 'S': newcclass(g); addranges_s(g); return L_NCCLASS;
345
case 'W': newcclass(g); addranges_w(g); return L_NCCLASS;
346
case '0': g->yychar = 0; return L_CHAR;
347
}
348
if (g->yychar >= '0' && g->yychar <= '9') {
349
g->yychar -= '0';
350
if (*g->source >= '0' && *g->source <= '9')
351
g->yychar = g->yychar * 10 + *g->source++ - '0';
352
return L_REF;
353
}
354
return L_CHAR;
355
}
356
357
switch (g->yychar) {
358
case 0:
359
case '$': case ')': case '*': case '+':
360
case '.': case '?': case '^': case '|':
361
return g->yychar;
362
}
363
364
if (g->yychar == '{')
365
return lexcount(g);
366
if (g->yychar == '[')
367
return lexclass(g);
368
if (g->yychar == '(') {
369
if (g->source[0] == '?') {
370
if (g->source[1] == ':') {
371
g->source += 2;
372
return L_NC;
373
}
374
if (g->source[1] == '=') {
375
g->source += 2;
376
return L_PLA;
377
}
378
if (g->source[1] == '!') {
379
g->source += 2;
380
return L_NLA;
381
}
382
}
383
return '(';
384
}
385
386
return L_CHAR;
387
}
388
389
/* Parse */
390
391
enum {
392
P_CAT, P_ALT, P_REP,
393
P_BOL, P_EOL, P_WORD, P_NWORD,
394
P_PAR, P_PLA, P_NLA,
395
P_ANY, P_CHAR, P_CCLASS, P_NCCLASS,
396
P_REF,
397
};
398
399
struct Renode {
400
unsigned char type;
401
unsigned char ng, m, n;
402
Rune c;
403
Reclass *cc;
404
Renode *x;
405
Renode *y;
406
};
407
408
static Renode *newnode(struct cstate *g, int type)
409
{
410
Renode *node = g->pend++;
411
node->type = type;
412
node->cc = NULL;
413
node->c = 0;
414
node->ng = 0;
415
node->m = 0;
416
node->n = 0;
417
node->x = node->y = NULL;
418
return node;
419
}
420
421
static int empty(Renode *node)
422
{
423
if (!node) return 1;
424
switch (node->type) {
425
default: return 1;
426
case P_CAT: return empty(node->x) && empty(node->y);
427
case P_ALT: return empty(node->x) || empty(node->y);
428
case P_REP: return empty(node->x) || node->m == 0;
429
case P_PAR: return empty(node->x);
430
case P_REF: return empty(node->x);
431
case P_ANY: case P_CHAR: case P_CCLASS: case P_NCCLASS: return 0;
432
}
433
}
434
435
static Renode *newrep(struct cstate *g, Renode *atom, int ng, int min, int max)
436
{
437
Renode *rep = newnode(g, P_REP);
438
if (max == REPINF && empty(atom))
439
die(g, "infinite loop matching the empty string");
440
rep->ng = ng;
441
rep->m = min;
442
rep->n = max;
443
rep->x = atom;
444
return rep;
445
}
446
447
static void next(struct cstate *g)
448
{
449
g->lookahead = lex(g);
450
}
451
452
static int accept(struct cstate *g, int t)
453
{
454
if (g->lookahead == t) {
455
next(g);
456
return 1;
457
}
458
return 0;
459
}
460
461
static Renode *parsealt(struct cstate *g);
462
463
static Renode *parseatom(struct cstate *g)
464
{
465
Renode *atom;
466
if (g->lookahead == L_CHAR) {
467
atom = newnode(g, P_CHAR);
468
atom->c = g->yychar;
469
next(g);
470
return atom;
471
}
472
if (g->lookahead == L_CCLASS) {
473
atom = newnode(g, P_CCLASS);
474
atom->cc = g->yycc;
475
next(g);
476
return atom;
477
}
478
if (g->lookahead == L_NCCLASS) {
479
atom = newnode(g, P_NCCLASS);
480
atom->cc = g->yycc;
481
next(g);
482
return atom;
483
}
484
if (g->lookahead == L_REF) {
485
atom = newnode(g, P_REF);
486
if (g->yychar == 0 || g->yychar > g->nsub || !g->sub[g->yychar])
487
die(g, "invalid back-reference");
488
atom->n = g->yychar;
489
atom->x = g->sub[g->yychar];
490
next(g);
491
return atom;
492
}
493
if (accept(g, '.'))
494
return newnode(g, P_ANY);
495
if (accept(g, '(')) {
496
atom = newnode(g, P_PAR);
497
if (g->nsub == MAXSUB)
498
die(g, "too many captures");
499
atom->n = g->nsub++;
500
atom->x = parsealt(g);
501
g->sub[atom->n] = atom;
502
if (!accept(g, ')'))
503
die(g, "unmatched '('");
504
return atom;
505
}
506
if (accept(g, L_NC)) {
507
atom = parsealt(g);
508
if (!accept(g, ')'))
509
die(g, "unmatched '('");
510
return atom;
511
}
512
if (accept(g, L_PLA)) {
513
atom = newnode(g, P_PLA);
514
atom->x = parsealt(g);
515
if (!accept(g, ')'))
516
die(g, "unmatched '('");
517
return atom;
518
}
519
if (accept(g, L_NLA)) {
520
atom = newnode(g, P_NLA);
521
atom->x = parsealt(g);
522
if (!accept(g, ')'))
523
die(g, "unmatched '('");
524
return atom;
525
}
526
die(g, "syntax error");
527
return NULL;
528
}
529
530
static Renode *parserep(struct cstate *g)
531
{
532
Renode *atom;
533
534
if (accept(g, '^')) return newnode(g, P_BOL);
535
if (accept(g, '$')) return newnode(g, P_EOL);
536
if (accept(g, L_WORD)) return newnode(g, P_WORD);
537
if (accept(g, L_NWORD)) return newnode(g, P_NWORD);
538
539
atom = parseatom(g);
540
if (g->lookahead == L_COUNT) {
541
int min = g->yymin, max = g->yymax;
542
next(g);
543
if (max < min)
544
die(g, "invalid quantifier");
545
return newrep(g, atom, accept(g, '?'), min, max);
546
}
547
if (accept(g, '*')) return newrep(g, atom, accept(g, '?'), 0, REPINF);
548
if (accept(g, '+')) return newrep(g, atom, accept(g, '?'), 1, REPINF);
549
if (accept(g, '?')) return newrep(g, atom, accept(g, '?'), 0, 1);
550
return atom;
551
}
552
553
static Renode *parsecat(struct cstate *g)
554
{
555
Renode *cat, *x;
556
if (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
557
cat = parserep(g);
558
while (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
559
x = cat;
560
cat = newnode(g, P_CAT);
561
cat->x = x;
562
cat->y = parserep(g);
563
}
564
return cat;
565
}
566
return NULL;
567
}
568
569
static Renode *parsealt(struct cstate *g)
570
{
571
Renode *alt, *x;
572
alt = parsecat(g);
573
while (accept(g, '|')) {
574
x = alt;
575
alt = newnode(g, P_ALT);
576
alt->x = x;
577
alt->y = parsecat(g);
578
}
579
return alt;
580
}
581
582
/* Compile */
583
584
enum {
585
I_END, I_JUMP, I_SPLIT, I_PLA, I_NLA,
586
I_ANYNL, I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF,
587
I_BOL, I_EOL, I_WORD, I_NWORD,
588
I_LPAR, I_RPAR
589
};
590
591
struct Reinst {
592
unsigned char opcode;
593
unsigned char n;
594
Rune c;
595
Reclass *cc;
596
Reinst *x;
597
Reinst *y;
598
};
599
600
static unsigned int count(Renode *node)
601
{
602
unsigned int min, max;
603
if (!node) return 0;
604
switch (node->type) {
605
default: return 1;
606
case P_CAT: return count(node->x) + count(node->y);
607
case P_ALT: return count(node->x) + count(node->y) + 2;
608
case P_REP:
609
min = node->m;
610
max = node->n;
611
if (min == max) return count(node->x) * min;
612
if (max < REPINF) return count(node->x) * max + (max - min);
613
return count(node->x) * (min + 1) + 2;
614
case P_PAR: return count(node->x) + 2;
615
case P_PLA: return count(node->x) + 2;
616
case P_NLA: return count(node->x) + 2;
617
}
618
}
619
620
static Reinst *emit(Reprog *prog, int opcode)
621
{
622
Reinst *inst = prog->end++;
623
inst->opcode = opcode;
624
inst->n = 0;
625
inst->c = 0;
626
inst->cc = NULL;
627
inst->x = inst->y = NULL;
628
return inst;
629
}
630
631
static void compile(Reprog *prog, Renode *node)
632
{
633
Reinst *inst, *split, *jump;
634
unsigned int i;
635
636
if (!node)
637
return;
638
639
switch (node->type) {
640
case P_CAT:
641
compile(prog, node->x);
642
compile(prog, node->y);
643
break;
644
645
case P_ALT:
646
split = emit(prog, I_SPLIT);
647
compile(prog, node->x);
648
jump = emit(prog, I_JUMP);
649
compile(prog, node->y);
650
split->x = split + 1;
651
split->y = jump + 1;
652
jump->x = prog->end;
653
break;
654
655
case P_REP:
656
for (i = 0; i < node->m; ++i) {
657
inst = prog->end;
658
compile(prog, node->x);
659
}
660
if (node->m == node->n)
661
break;
662
if (node->n < REPINF) {
663
for (i = node->m; i < node->n; ++i) {
664
split = emit(prog, I_SPLIT);
665
compile(prog, node->x);
666
if (node->ng) {
667
split->y = split + 1;
668
split->x = prog->end;
669
} else {
670
split->x = split + 1;
671
split->y = prog->end;
672
}
673
}
674
} else if (node->m == 0) {
675
split = emit(prog, I_SPLIT);
676
compile(prog, node->x);
677
jump = emit(prog, I_JUMP);
678
if (node->ng) {
679
split->y = split + 1;
680
split->x = prog->end;
681
} else {
682
split->x = split + 1;
683
split->y = prog->end;
684
}
685
jump->x = split;
686
} else {
687
split = emit(prog, I_SPLIT);
688
if (node->ng) {
689
split->y = inst;
690
split->x = prog->end;
691
} else {
692
split->x = inst;
693
split->y = prog->end;
694
}
695
}
696
break;
697
698
case P_BOL: emit(prog, I_BOL); break;
699
case P_EOL: emit(prog, I_EOL); break;
700
case P_WORD: emit(prog, I_WORD); break;
701
case P_NWORD: emit(prog, I_NWORD); break;
702
703
case P_PAR:
704
inst = emit(prog, I_LPAR);
705
inst->n = node->n;
706
compile(prog, node->x);
707
inst = emit(prog, I_RPAR);
708
inst->n = node->n;
709
break;
710
case P_PLA:
711
split = emit(prog, I_PLA);
712
compile(prog, node->x);
713
emit(prog, I_END);
714
split->x = split + 1;
715
split->y = prog->end;
716
break;
717
case P_NLA:
718
split = emit(prog, I_NLA);
719
compile(prog, node->x);
720
emit(prog, I_END);
721
split->x = split + 1;
722
split->y = prog->end;
723
break;
724
725
case P_ANY:
726
emit(prog, I_ANY);
727
break;
728
case P_CHAR:
729
inst = emit(prog, I_CHAR);
730
inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c;
731
break;
732
case P_CCLASS:
733
inst = emit(prog, I_CCLASS);
734
inst->cc = node->cc;
735
break;
736
case P_NCCLASS:
737
inst = emit(prog, I_NCCLASS);
738
inst->cc = node->cc;
739
break;
740
case P_REF:
741
inst = emit(prog, I_REF);
742
inst->n = node->n;
743
break;
744
}
745
}
746
747
#ifdef TEST
748
static void dumpnode(Renode *node)
749
{
750
Rune *p;
751
if (!node) { printf("Empty"); return; }
752
switch (node->type) {
753
case P_CAT: printf("Cat("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
754
case P_ALT: printf("Alt("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
755
case P_REP:
756
printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n);
757
dumpnode(node->x);
758
printf(")");
759
break;
760
case P_BOL: printf("Bol"); break;
761
case P_EOL: printf("Eol"); break;
762
case P_WORD: printf("Word"); break;
763
case P_NWORD: printf("NotWord"); break;
764
case P_PAR: printf("Par(%d,", node->n); dumpnode(node->x); printf(")"); break;
765
case P_PLA: printf("PLA("); dumpnode(node->x); printf(")"); break;
766
case P_NLA: printf("NLA("); dumpnode(node->x); printf(")"); break;
767
case P_ANY: printf("Any"); break;
768
case P_CHAR: printf("Char(%c)", node->c); break;
769
case P_CCLASS:
770
printf("Class(");
771
for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
772
printf(")");
773
break;
774
case P_NCCLASS:
775
printf("NotClass(");
776
for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
777
printf(")");
778
break;
779
case P_REF: printf("Ref(%d)", node->n); break;
780
}
781
}
782
783
static void dumpprog(Reprog *prog)
784
{
785
Reinst *inst;
786
int i;
787
for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) {
788
printf("% 5d: ", i);
789
switch (inst->opcode) {
790
case I_END: puts("end"); break;
791
case I_JUMP: printf("jump %d\n", (int)(inst->x - prog->start)); break;
792
case I_SPLIT: printf("split %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
793
case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
794
case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
795
case I_ANY: puts("any"); break;
796
case I_ANYNL: puts("anynl"); break;
797
case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n", inst->c); break;
798
case I_CCLASS: puts("cclass"); break;
799
case I_NCCLASS: puts("ncclass"); break;
800
case I_REF: printf("ref %d\n", inst->n); break;
801
case I_BOL: puts("bol"); break;
802
case I_EOL: puts("eol"); break;
803
case I_WORD: puts("word"); break;
804
case I_NWORD: puts("nword"); break;
805
case I_LPAR: printf("lpar %d\n", inst->n); break;
806
case I_RPAR: printf("rpar %d\n", inst->n); break;
807
}
808
}
809
}
810
#endif
811
812
Reprog *regcomp(const char *pattern, int cflags, const char **errorp)
813
{
814
struct cstate g;
815
Renode *node;
816
Reinst *split, *jump;
817
int i;
818
819
g.prog = malloc(sizeof (Reprog));
820
g.pstart = g.pend = malloc(sizeof (Renode) * strlen(pattern) * 2);
821
822
if (setjmp(g.kaboom)) {
823
if (errorp) *errorp = g.error;
824
free(g.pstart);
825
free(g.prog);
826
return NULL;
827
}
828
829
g.source = pattern;
830
g.ncclass = 0;
831
g.nsub = 1;
832
for (i = 0; i < MAXSUB; ++i)
833
g.sub[i] = 0;
834
835
g.prog->flags = cflags;
836
837
next(&g);
838
node = parsealt(&g);
839
if (g.lookahead == ')')
840
die(&g, "unmatched ')'");
841
if (g.lookahead != 0)
842
die(&g, "syntax error");
843
844
g.prog->nsub = g.nsub;
845
g.prog->start = g.prog->end = malloc((count(node) + 6) * sizeof (Reinst));
846
847
split = emit(g.prog, I_SPLIT);
848
split->x = split + 3;
849
split->y = split + 1;
850
emit(g.prog, I_ANYNL);
851
jump = emit(g.prog, I_JUMP);
852
jump->x = split;
853
emit(g.prog, I_LPAR);
854
compile(g.prog, node);
855
emit(g.prog, I_RPAR);
856
emit(g.prog, I_END);
857
858
#ifdef TEST
859
dumpnode(node);
860
putchar('\n');
861
dumpprog(g.prog);
862
#endif
863
864
free(g.pstart);
865
866
if (errorp) *errorp = NULL;
867
return g.prog;
868
}
869
870
void regfree(Reprog *prog)
871
{
872
if (prog) {
873
free(prog->start);
874
free(prog);
875
}
876
}
877
878
/* Match */
879
880
static int isnewline(int c)
881
{
882
return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029;
883
}
884
885
static int iswordchar(int c)
886
{
887
return c == '_' ||
888
(c >= 'a' && c <= 'z') ||
889
(c >= 'A' && c <= 'Z') ||
890
(c >= '0' && c <= '9');
891
}
892
893
static int incclass(Reclass *cc, Rune c)
894
{
895
Rune *p;
896
for (p = cc->spans; p < cc->end; p += 2)
897
if (p[0] <= c && c <= p[1])
898
return 1;
899
return 0;
900
}
901
902
static int incclasscanon(Reclass *cc, Rune c)
903
{
904
Rune *p, r;
905
for (p = cc->spans; p < cc->end; p += 2)
906
for (r = p[0]; r <= p[1]; ++r)
907
if (c == canon(r))
908
return 1;
909
return 0;
910
}
911
912
static int strncmpcanon(const char *a, const char *b, unsigned int n)
913
{
914
Rune ra, rb;
915
int c;
916
while (n--) {
917
if (!*a) return -1;
918
if (!*b) return 1;
919
a += chartorune(&ra, a);
920
b += chartorune(&rb, b);
921
c = canon(ra) - canon(rb);
922
if (c)
923
return c;
924
}
925
return 0;
926
}
927
928
struct Rethread {
929
Reinst *pc;
930
const char *sp;
931
Resub sub;
932
};
933
934
static void spawn(Rethread *t, Reinst *pc, const char *sp, Resub *sub)
935
{
936
t->pc = pc;
937
t->sp = sp;
938
memcpy(&t->sub, sub, sizeof t->sub);
939
}
940
941
static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out)
942
{
943
Rethread ready[MAXTHREAD];
944
Resub scratch;
945
Resub sub;
946
Rune c;
947
unsigned int nready;
948
int i;
949
950
/* queue initial thread */
951
spawn(ready + 0, pc, sp, out);
952
nready = 1;
953
954
/* run threads in stack order */
955
while (nready > 0) {
956
--nready;
957
pc = ready[nready].pc;
958
sp = ready[nready].sp;
959
memcpy(&sub, &ready[nready].sub, sizeof sub);
960
for (;;) {
961
switch (pc->opcode) {
962
case I_END:
963
for (i = 0; i < MAXSUB; ++i) {
964
out->sub[i].sp = sub.sub[i].sp;
965
out->sub[i].ep = sub.sub[i].ep;
966
}
967
return 1;
968
case I_JUMP:
969
pc = pc->x;
970
continue;
971
case I_SPLIT:
972
if (nready >= MAXTHREAD) {
973
fprintf(stderr, "regexec: backtrack overflow!\n");
974
return 0;
975
}
976
spawn(&ready[nready++], pc->y, sp, &sub);
977
pc = pc->x;
978
continue;
979
980
case I_PLA:
981
if (!match(pc->x, sp, bol, flags, &sub))
982
goto dead;
983
pc = pc->y;
984
continue;
985
case I_NLA:
986
memcpy(&scratch, &sub, sizeof scratch);
987
if (match(pc->x, sp, bol, flags, &scratch))
988
goto dead;
989
pc = pc->y;
990
continue;
991
992
case I_ANYNL:
993
sp += chartorune(&c, sp);
994
if (c == 0)
995
goto dead;
996
break;
997
case I_ANY:
998
sp += chartorune(&c, sp);
999
if (c == 0)
1000
goto dead;
1001
if (isnewline(c))
1002
goto dead;
1003
break;
1004
case I_CHAR:
1005
sp += chartorune(&c, sp);
1006
if (c == 0)
1007
goto dead;
1008
if (flags & REG_ICASE)
1009
c = canon(c);
1010
if (c != pc->c)
1011
goto dead;
1012
break;
1013
case I_CCLASS:
1014
sp += chartorune(&c, sp);
1015
if (c == 0)
1016
goto dead;
1017
if (flags & REG_ICASE) {
1018
if (!incclasscanon(pc->cc, canon(c)))
1019
goto dead;
1020
} else {
1021
if (!incclass(pc->cc, c))
1022
goto dead;
1023
}
1024
break;
1025
case I_NCCLASS:
1026
sp += chartorune(&c, sp);
1027
if (c == 0)
1028
goto dead;
1029
if (flags & REG_ICASE) {
1030
if (incclasscanon(pc->cc, canon(c)))
1031
goto dead;
1032
} else {
1033
if (incclass(pc->cc, c))
1034
goto dead;
1035
}
1036
break;
1037
case I_REF:
1038
i = sub.sub[pc->n].ep - sub.sub[pc->n].sp;
1039
if (flags & REG_ICASE) {
1040
if (strncmpcanon(sp, sub.sub[pc->n].sp, i))
1041
goto dead;
1042
} else {
1043
if (strncmp(sp, sub.sub[pc->n].sp, i))
1044
goto dead;
1045
}
1046
if (i > 0)
1047
sp += i;
1048
break;
1049
1050
case I_BOL:
1051
if (sp == bol && !(flags & REG_NOTBOL))
1052
break;
1053
if (flags & REG_NEWLINE)
1054
if (sp > bol && isnewline(sp[-1]))
1055
break;
1056
goto dead;
1057
case I_EOL:
1058
if (*sp == 0)
1059
break;
1060
if (flags & REG_NEWLINE)
1061
if (isnewline(*sp))
1062
break;
1063
goto dead;
1064
case I_WORD:
1065
i = sp > bol && iswordchar(sp[-1]);
1066
i ^= iswordchar(sp[0]);
1067
if (i)
1068
break;
1069
goto dead;
1070
case I_NWORD:
1071
i = sp > bol && iswordchar(sp[-1]);
1072
i ^= iswordchar(sp[0]);
1073
if (!i)
1074
break;
1075
goto dead;
1076
1077
case I_LPAR:
1078
sub.sub[pc->n].sp = sp;
1079
break;
1080
case I_RPAR:
1081
sub.sub[pc->n].ep = sp;
1082
break;
1083
default:
1084
goto dead;
1085
}
1086
pc = pc + 1;
1087
}
1088
dead: ;
1089
}
1090
return 0;
1091
}
1092
1093
int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags)
1094
{
1095
Resub scratch;
1096
int i;
1097
1098
if (!sub)
1099
sub = &scratch;
1100
1101
sub->nsub = prog->nsub;
1102
for (i = 0; i < MAXSUB; ++i)
1103
sub->sub[i].sp = sub->sub[i].ep = NULL;
1104
1105
return !match(prog->start, sp, sp, prog->flags | eflags, sub);
1106
}
1107
1108
#ifdef TEST
1109
int main(int argc, char **argv)
1110
{
1111
const char *error;
1112
const char *s;
1113
Reprog *p;
1114
Resub m;
1115
unsigned int i;
1116
1117
if (argc > 1) {
1118
p = regcomp(argv[1], 0, &error);
1119
if (!p) {
1120
fprintf(stderr, "regcomp: %s\n", error);
1121
return 1;
1122
}
1123
1124
if (argc > 2) {
1125
s = argv[2];
1126
printf("nsub = %d\n", p->nsub);
1127
if (!regexec(p, s, &m, 0)) {
1128
for (i = 0; i < m.nsub; ++i) {
1129
int n = m.sub[i].ep - m.sub[i].sp;
1130
if (n > 0)
1131
printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp);
1132
else
1133
printf("match %d: n=0 ''\n", i);
1134
}
1135
} else {
1136
printf("no match\n");
1137
}
1138
}
1139
}
1140
1141
return 0;
1142
}
1143
#endif
1144
1145