Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
att
GitHub Repository: att/ast
Path: blob/master/src/cmd/re/sed1.c
1808 views
1
/***********************************************************************
2
* *
3
* This software is part of the ast package *
4
* Copyright (c) 1995-2012 AT&T Intellectual Property *
5
* and is licensed under the *
6
* Eclipse Public License, Version 1.0 *
7
* by AT&T Intellectual Property *
8
* *
9
* A copy of the License is available at *
10
* http://www.eclipse.org/org/documents/epl-v10.html *
11
* (with md5 checksum b35adb5213ca9657e911e9befb180842) *
12
* *
13
* Information and Software Systems Research *
14
* AT&T Research *
15
* Florham Park NJ *
16
* *
17
* Glenn Fowler <gsf@research.att.com> *
18
* *
19
***********************************************************************/
20
#pragma prototyped
21
22
#include "sed.h"
23
24
#include <ctype.h>
25
26
/*
 * unsigned char flavours of the <string.h> routines.
 * Each expansion is fully parenthesized so that the result can be
 * indexed, dereferenced or compared without the cast binding to the
 * wrong sub-expression.
 */
#define ustrlen(p)	(strlen((char*)(p)))
#define ustrcmp(p, q)	(strcmp((char*)(p), (char*)(q)))
#define ustrcpy(p, q)	((unsigned char*)strcpy((char*)(p), (char*)(q)))
#define ustrchr(p, c)	((unsigned char*)strchr((char*)(p), (c)))
30
31
int blank(Text*);
32
void fixlabels(Text*);
33
void fixbrack(Text*);
34
void ckludge(Text*, int, int, int, Text*);
35
int addr(Text*, Text*);
36
word* instr(unsigned char*);
37
unsigned char *succi(unsigned char*);
38
39
#if DEBUG
40
extern void regdump(regex_t*); /* secret entry into regex pkg */
41
#endif
42
43
static Text rebuf;
44
45
/*
 * adrs[c] is the maximum number of addresses command character c
 * accepts (0, 1 or 2); 3 marks a character that is not a command.
 * One row per 16 character codes.
 */
static const unsigned char adrs[UCHAR_MAX+1] = {
	/* 0x00 */ 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3,	/* <nl> */
	/* 0x10 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* 0x20 */ 3, 2, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,	/* !# */
	/* 0x30 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 3, 1, 3, 3,	/* := */
	/* 0x40 */ 3, 3, 3, 3, 2, 3, 3, 2, 2, 3, 3, 3, 3, 3, 2, 3,	/* DGHN */
	/* 0x50 */ 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,	/* P */
	/* 0x60 */ 3, 1, 2, 2, 2, 3, 3, 2, 2, 1, 3, 3, 2, 3, 2, 3,	/* a-n */
	/* 0x70 */ 2, 1, 2, 2, 2, 3, 3, 2, 2, 2, 3, 2, 3, 0, 3, 3,	/* p-y{} */
	/* 0x80 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* 0x90 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xa0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xb0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xc0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xd0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xe0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xf0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
};
63
64
/*
 * Commands that have the same compilation method share one routine;
 * these aliases let the docom[] table name every command letter.
 */
#define Ec	Tc	/* =  */
#define Dc	Tc	/* D  */
#define Gc	Tc	/* G  */
#define Hc	Tc	/* H  */
#define Nc	Tc	/* N  */
#define Pc	Tc	/* P  */
#define dc	Tc	/* d  */
#define gc	Tc	/* g  */
#define hc	Tc	/* h  */
#define lc	Tc	/* l  */
#define nc	Tc	/* n  */
#define pc	Tc	/* p  */
#define xc	Tc	/* x  */
#define tc	bc	/* t compiles like b */
#define ic	ac	/* i compiles like a */
#define cc	ac	/* c compiles like a */

/* start of the command being compiled; syntax diagnostics quote from here */
unsigned char	*synl;
82
83
/* COMMAND LAYOUT */
84
85
int
86
blank(Text *t)
87
{
88
if(*t->w==' ' || *t->w=='\t' || *t->w=='\r') {
89
t->w++;
90
return 1;
91
} else
92
return 0;
93
}
94
95
word *
96
instr(unsigned char *p) /* get address of command word */
97
{
98
word *q = (word*)p;
99
while((*q & IMASK) != IMASK)
100
q++;
101
return q;
102
}
103
104
unsigned char *
105
succi(unsigned char *p)
106
{
107
word *q = instr(p);
108
if(code(*q) == '{')
109
return (unsigned char*)(q+1);
110
else
111
return p + (*q & LMASK);
112
}
113
114
word
115
pack(int neg, int cmd, word length)
116
{
117
int l = length & LMASK;
118
if(length != l)
119
syntax("<command-list> or <text> too long");
120
return IMASK | neg | cmd << 2*BYTE | l;
121
}
122
123
void
124
putword(Text *s, word n)
125
{
126
assure(s, sizeof(word));
127
*(word*)s->w = n;
128
s->w += sizeof(word);
129
}
130
131
int
132
number(Text *t)
133
{
134
unsigned n = 0;
135
while(isdigit(*t->w)) {
136
if(n > (INT_MAX-9)/10)
137
syntax("number too big");
138
n = n*10 + *t->w++ - '0';
139
}
140
return n;
141
}
142
143
int
144
addr(Text *script, Text *t)
145
{
146
word n;
147
if(reflags & REG_LENIENT)
148
while(*t->w == ' ' || *t->w == '\t' || *t->w == '\r')
149
t->w++;
150
switch(*t->w) {
151
default:
152
return 0;
153
case '$':
154
t->w++;
155
n = DOLLAR;
156
break;
157
case '\\':
158
t->w++;
159
case '/':
160
n = recomp(&rebuf, t, 0) | REGADR;
161
break;
162
case '0': case '1': case '2': case '3': case '4':
163
case '5': case '6': case '7': case '8': case '9':
164
n = number(t);
165
if(n == 0)
166
syntax("address is zero");
167
}
168
putword(script, n);
169
if(reflags & REG_LENIENT)
170
while(*t->w == ' ' || *t->w == '\t' || *t->w == '\r')
171
t->w++;
172
return 1;
173
}
174
175
regex_t *
176
readdr(word x)
177
{
178
return (regex_t*)(rebuf.s + (x&AMASK));
179
}
180
181
/* LABEL HANDLING */
182
183
/* the labels array consists of int values followed by strings.
184
value -1 means unassigned; other values are relative to the
185
beginning of the script
186
187
on the first pass, every script ref to a label becomes the
188
integer offset of that label in the labels array, or -1 if
189
it is a branch to the end of script
190
191
on the second pass (fixlabels), the script ref is replaced
192
by the value from the labels array. */
193
194
Text labels;
195
196
word *
197
lablook(unsigned char *l, Text *labels)
198
{
199
unsigned char *p, *q;
200
word n, m;
201
assure(labels, 1);
202
for(p = labels->s; p < labels->w; ) {
203
q = p + sizeof(word);
204
if(ustrcmp(q, l) == 0)
205
return (word*)p;
206
q += ustrlen(q) + 1;
207
p = (unsigned char*)wordp(q);
208
}
209
n = ustrlen(l);
210
m = (p - labels->s);
211
assure(labels, sizeof(word)+n+1+sizeof(word));
212
p = labels->s + m;
213
*(word*)p = -1;
214
q = p + sizeof(word);
215
ustrcpy(q, l);
216
q += ustrlen(q) + 1;
217
labels->w = (unsigned char*)wordp(q);
218
return (word*)p;
219
}
220
221
/* find pos in label list; assign value i to label if i>=0 */
222
223
word
224
getlab(Text *t, word i)
225
{
226
word *p;
227
unsigned char *u;
228
while(blank(t)); /* not exactly posix */
229
for(u=t->w; *t->w!='\n'; t->w++)
230
if(!isprint(*t->w) || *t->w==' ' || *t->w=='\t' || *t->w=='\r')
231
synwarn("invisible character in name");
232
if(u == t->w)
233
return -1;
234
*t->w = 0;
235
p = lablook(u, &labels);
236
if(*p == -1)
237
*p = i;
238
else if(i != -1)
239
syntax("duplicate label");
240
*t->w = '\n';
241
return (unsigned char*)p - labels.s;
242
}
243
244
void
245
Cc(Text *script, Text *t) /* colon */
246
{
247
if(getlab(t, script->w - sizeof(word) - script->s) == -1)
248
syntax("missing label");
249
}
250
251
void
252
bc(Text *script, Text *t)
253
{
254
word g;
255
g = getlab(t, -1); /* relative pointer to label list */
256
putword(script, g);
257
}
258
259
void
260
fixlabels(Text *script)
261
{
262
unsigned char *p;
263
word *q;
264
for(p=script->s; p<script->w; p=succi(p)) {
265
q = instr(p);
266
switch(code(*q)) {
267
case 't':
268
case 'b':
269
if(q[1] == -1)
270
q[1] = script->w - script->s;
271
else if(*(word*)(labels.s+q[1]) != -1)
272
q[1] = *(word*)(labels.s+q[1]);
273
else
274
error(3, "undefined label: %s",
275
labels.s+q[1]+sizeof(word));
276
}
277
}
278
free(labels.s);
279
}
280
281
/* FILES */
282
283
Text files;

/*
 * r file -- read the file name that runs to the end of the line,
 * enter it in the files table and emit the offset of its entry.
 *
 * The lookup is done in a statement of its own: lablook() may grow
 * and move the table, so files.s must not be read until it has
 * returned.  (Written as one expression, the order in which the two
 * operands of the subtraction are evaluated is unspecified.)
 */
void
rc(Text *script, Text *t)
{
	unsigned char *name;
	word *entry;

	if(!blank(t))
		synwarn("no space before file name");
	while(blank(t))
		;
	name = t->w;
	while(*t->w != '\n')
		t->w++;
	if(name == t->w)
		syntax("missing file name");
	*t->w = 0;		/* terminate the name for the lookup */
	entry = lablook(name, &files);
	putword(script, (unsigned char*)entry - files.s);
	*t->w = '\n';		/* put the newline back */
}
299
300
void
301
wc(Text *script, Text *t)
302
{
303
word *p;
304
rc(script, t);
305
p = (word*)(files.s + ((word*)script->w)[-1]);
306
if(*p != -1)
307
return;
308
*(Sfio_t**)p = sfopen(NiL, (char*)(p+1), "w");
309
if(*(Sfio_t**)p == 0)
310
syntax("can't open file for writing");
311
}
312
313
/* BRACKETS */
314
315
Text brack;

/* Lc() stacks (in brack) the location of the { command word.
   Rc() stuffs into that word the offset of the } sequel
   relative to the command word.
   fixbrack() modifies the offset to be relative to the
   beginning of the instruction, including addresses. */

void	/* { */
Lc(Text *script, Text *t)
{
	word at;

	while(blank(t))
		;
	at = script->w - sizeof(word) - script->s;
	putword(&brack, at);
}
329
330
void /* } */
331
Rc(Text *script, Text *t)
332
{
333
word l;
334
word *p;
335
t = t;
336
if(brack.w == 0 || (brack.w-=sizeof(word)) < brack.s)
337
syntax("unmatched }");
338
l = *(word*)brack.w;
339
p = (word*)(script->s + l);
340
l = script->w - script->s - l;
341
if(l >= LMASK - 3*sizeof(word)) /* fixbrack could add 3 */
342
syntax("{command-list} too long)");
343
*p = (*p&~LMASK) | l;
344
}
345
346
void
347
fixbrack(Text *script)
348
{
349
unsigned char *p;
350
word *q;
351
if(brack.w == 0)
352
return;
353
if(brack.w > brack.s)
354
syntax("unmatched {");
355
for(p=script->s; p<script->w; p=succi(p)) {
356
q = instr(p);
357
if(code(*q) == '{')
358
*q += (unsigned char*)q - p;
359
}
360
free(brack.s);
361
}
362
363
/* EASY COMMANDS */
364
365
void
366
Xc(Text *script, Text *t) /* # */
367
{
368
script = script; /* avoid use/set diagnostics */
369
if(t->s[1]=='n')
370
nflag = 1;
371
while(*t->w != '\n')
372
t->w++;
373
}
374
375
void
376
Ic(Text *script, Text *t) /* ignore */
377
{
378
script = script;
379
t->w--;
380
}
381
382
void
383
Tc(Text *script, Text *t) /* trivial to compile */
384
{
385
script = script;
386
t = t;
387
}
388
389
void
390
xx(Text *script, Text *t)
391
{
392
script = script;
393
t = t;
394
syntax("unknown command");
395
}
396
397
/* MISCELLANY */
398
399
void
400
ac(Text *script, Text *t)
401
{
402
if(*t->w++ != '\\' || *t->w++ != '\n')
403
syntax("\\<newline> missing after command");
404
for(;;) {
405
while(bflag && blank(t)) ;
406
assure(script, 2 + sizeof(word));
407
switch(*t->w) {
408
case 0:
409
error(ERROR_PANIC|4, "bug: missed end of <text>");
410
case '\n':
411
*script->w++ = *t->w;
412
*script->w++ = 0;
413
script->w = (unsigned char*)wordp(script->w);
414
return;
415
case '\\':
416
t->w++;
417
default:
418
*script->w++ = *t->w++;
419
}
420
}
421
}
422
423
void
424
qc(Text *script, Text *t)
425
{
426
sfset(sfstdin, SF_SHARE, 1);
427
script = script;
428
t = t;
429
}
430
431
void
432
sc(Text *script, Text *t)
433
{
434
regex_t* re;
435
word n;
436
int c;
437
n = recomp(&rebuf, t, 1);
438
putword(script, n);
439
re = readdr(n);
440
if(c = regsubcomp(re, (char*)t->w, NiL, 0, 0))
441
badre(re, c);
442
t->w += re->re_npat;
443
script->w = (unsigned char*)wordp(script->w);
444
if(re->re_sub->re_flags & REG_SUB_WRITE)
445
wc(script, t);
446
}
447
448
void
449
yc(Text *script, Text *t)
450
{
451
word i, m, x;
452
int delim;
453
unsigned char *s, *pb, *qb;
454
unsigned char *p, *q, *o, *v, **w;
455
int pc, qc;
456
m = 0;
457
if(mbwide()) {
458
pb = t->w;
459
if((delim = mbchar(pb)) == '\n' || delim=='\\')
460
syntax("missing delimiter");
461
p = pb;
462
while((o=p),(pc = mbchar(p))!=delim) {
463
if(pc=='\n')
464
syntax("missing delimiter");
465
if(pc=='\\') {
466
o = p;
467
pc = mbchar(p);
468
}
469
if((p-o)>1 && pc>m)
470
m = pc;
471
}
472
}
473
if(m) {
474
x = 0;
475
qb = p;
476
while((o=p), (pc = mbchar(p))!=delim) {
477
if(pc=='\\') {
478
o = p;
479
pc = mbchar(p);
480
}
481
x += (p-o)+1;
482
}
483
x = roundof(x, sizeof(word));
484
m++;
485
assure(script, (m+1)*sizeof(unsigned char*)+x);
486
w = (unsigned char**)script->w;
487
*w++ = (unsigned char*)0 + m;
488
script->w += (m+1)*sizeof(unsigned char*);
489
v = (unsigned char*)script->w;
490
script->w += x;
491
for(i=0; i<m; i++)
492
w[i] = 0;
493
p = pb;
494
q = qb;
495
while((pb=p), (pc = mbchar(p))!=delim) {
496
if(pc=='\\') {
497
pb = p;
498
if((qc = mbchar(p))=='n')
499
pc = '\n';
500
else if(qc==delim || qc=='\\')
501
pc = qc;
502
else
503
p = pb-1;
504
}
505
qb = q;
506
if((qc = mbchar(q)) == '\n')
507
syntax("missing delimiter");
508
if(qc==delim)
509
syntax("string lengths differ");
510
if(qc=='\\') {
511
qb = q;
512
if((qc = mbchar(q))=='n')
513
*qb = '\n';
514
else if(qc!=delim && qc!='\\')
515
q = qb-1;
516
}
517
i = (q-qb);
518
if(w[pc]) {
519
if(w[pc][0]!=i || memcmp(&w[pc][1], qb, i))
520
syntax("ambiguous map");
521
synwarn("redundant map");
522
}
523
else {
524
w[pc] = v;
525
*v++ = (unsigned char)i;
526
memcpy(v, qb, i);
527
v += i;
528
}
529
}
530
if(mbchar(q) != delim)
531
syntax("string lengths differ");
532
}
533
else {
534
if((delim = *t->w++) == '\n' || delim=='\\')
535
syntax("missing delimiter");
536
assure(script, sizeof(unsigned char*)+UCHAR_MAX+1);
537
w = (unsigned char**)script->w;
538
*w++ = 0;
539
s = (unsigned char*)w;
540
script->w += sizeof(unsigned char*)+UCHAR_MAX+1;
541
for(i=0; i<UCHAR_MAX+1; i++)
542
s[i] = 0;
543
for(q=t->w; (qc = *q++)!=delim; ) {
544
if(qc == '\n')
545
syntax("missing delimiter");
546
if(qc=='\\' && *q==delim)
547
q++;
548
}
549
for(p=t->w; (pc = *p++) != delim; ) {
550
if(pc=='\\') {
551
if(*p==delim || *p=='\\')
552
pc = *p++;
553
else if(*p=='n') {
554
p++;
555
pc = '\n';
556
}
557
}
558
if((qc = *q++) == '\n')
559
syntax("missing delimiter");
560
if(qc==delim)
561
syntax("string lengths differ");
562
if(qc=='\\') {
563
if(*q==delim || *q=='\\')
564
qc = *q++;
565
else if(*q=='n') {
566
q++;
567
qc = '\n';
568
}
569
}
570
if(s[pc]) {
571
if(s[pc]!=qc)
572
syntax("ambiguous map");
573
synwarn("redundant map");
574
}
575
s[pc] = qc;
576
}
577
if(*q++ != delim)
578
syntax("string lengths differ");
579
for(i=0; i<UCHAR_MAX+1; i++)
580
if(s[i] == 0)
581
s[i] = (unsigned char)i;
582
}
583
t->w = q;
584
}
585
586
void
587
synwarn(char *s)
588
{
589
unsigned char *t = ustrchr(synl, '\n');
590
error(1, "%s: %.*s", s, t-synl, synl);
591
}
592
593
void
594
syntax(char *s)
595
{
596
unsigned char *t = ustrchr(synl, '\n');
597
error(3, "%s: %.*s", s, t-synl, synl);
598
}
599
600
void
601
badre(regex_t* re, int code)
602
{
603
unsigned char *t = ustrchr(synl, '\n');
604
if(code && code!= REG_NOMATCH) {
605
char buf[UCHAR_MAX+1];
606
regerror(code, re, buf, sizeof(buf));
607
error(3, "%s: %.*s", buf, t-synl, synl);
608
}
609
else
610
error(3, "invalid regular expression: %.*s", t-synl, synl);
611
}
612
613
#if DEBUG
614
615
void
616
printscript(Text *script)
617
{
618
unsigned char *s;
619
word *q;
620
for(s=script->s; s<script->w; s = succi(s)) {
621
q = (word*)s;
622
if((*q&IMASK) != IMASK) {
623
if((*q&REGADR) == 0)
624
printf("%d", *q);
625
else
626
regdump((regex_t*)(*q & AMASK));
627
q++;
628
}
629
if((*q&IMASK) != IMASK) {
630
if((*q&REGADR) == 0)
631
printf(",%d", *q);
632
else
633
regdump((regex_t*)(*q & AMASK));
634
q += 2;
635
}
636
if(code(*q) == '\n')
637
continue;
638
printf("%s%c\n", *q&NEG?"!":"", code(*q));
639
}
640
}
641
642
#endif
643
644
#if DEBUG & 2
645
646
/* debugging code 2; execute stub.
647
prints the compiled script (without arguments)
648
then each input line with line numbers */
649
650
void
651
execute(Text *script, Text *y)
652
{
653
if(recno == 1)
654
printscript(script);
655
printf("%d:%s",recno,y->s);
656
}
657
658
#endif
659
660
typedef void (*cmdf)(Text*, Text*);
661
662
static const cmdf docom[128] = {
663
xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,Ic,xx,xx,xx,xx,xx, /* <nl> */
664
xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,
665
xx,Ic,xx,Xc,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx, /* !# */
666
xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,Cc,Ic,xx,Ec,xx,xx, /* :;= */
667
xx,xx,xx,xx,Dc,xx,xx,Gc,Hc,xx,xx,xx,xx,xx,Nc,xx, /* DGHN */
668
Pc,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx, /* P */
669
xx,ac,bc,cc,dc,xx,xx,gc,hc,ic,xx,xx,lc,xx,nc,xx, /* a-n */
670
pc,qc,rc,sc,tc,xx,xx,wc,xc,yc,xx,Lc,xx,Rc,xx,xx /* p-y{} */
671
};
672
673
void
674
compile(Text *script, Text *t)
675
{
676
word loc; /* progam counter */
677
int neg; /* ! in effect */
678
int cmd;
679
int naddr;
680
word *q; /* address of instruction word */
681
t->w = t->s; /* here w is a read pointer */
682
while(*t->w) {
683
assure(script, 4*sizeof(word));
684
loc = script->w - script->s;
685
synl = t->w;
686
naddr = 0;
687
while(blank(t)) ;
688
naddr += addr(script, t);
689
if(naddr && *t->w ==',') {
690
t->w++;
691
naddr += addr(script, t);
692
if(naddr < 2)
693
syntax("missing address");
694
}
695
q = (word*)script->w;
696
if(naddr == 2)
697
*q++ = INACT;
698
script->w = (unsigned char*)(q+1);
699
neg = 0;
700
for(;;) {
701
while(blank(t));
702
cmd = *t->w++;
703
if(neg && docom[ccmapchr(map,cmd)&0x7f]==Ic)
704
syntax("improper !");
705
if(cmd != '!')
706
break;
707
neg = NEG;
708
}
709
if(!neg) {
710
switch(adrs[ccmapchr(map,cmd)]) {
711
case 1:
712
if(naddr <= 1)
713
break;
714
case 0:
715
if(naddr == 0)
716
break;
717
syntax("too many addresses");
718
}
719
}
720
(*docom[ccmapchr(map,cmd)&0x7f])(script, t);
721
while(*t->w == ' ' || *t->w == '\t' || *t->w == '\r')
722
t->w++;
723
switch(*t->w) {
724
case 0:
725
script->w = script->s + loc;
726
break;
727
case ';':
728
case '\n':
729
t->w++;
730
break;
731
default:
732
if(cmd == '{')
733
break;
734
syntax("junk after command");
735
}
736
*q = pack(neg,cmd,script->w-script->s-loc);
737
}
738
fixbrack(script);
739
fixlabels(script);
740
}
741
742