Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
att
GitHub Repository: att/ast
Path: blob/master/src/cmd/html/bb2tok.c
1808 views
1
/***********************************************************************
2
* *
3
* This software is part of the ast package *
4
* Copyright (c) 1996-2011 AT&T Intellectual Property *
5
* and is licensed under the *
6
* Eclipse Public License, Version 1.0 *
7
* by AT&T Intellectual Property *
8
* *
9
* A copy of the License is available at *
10
* http://www.eclipse.org/org/documents/epl-v10.html *
11
* (with md5 checksum b35adb5213ca9657e911e9befb180842) *
12
* *
13
* Information and Software Systems Research *
14
* AT&T Research *
15
* Florham Park NJ *
16
* *
17
* Glenn Fowler <gsf@research.att.com> *
18
* *
19
***********************************************************************/
20
#pragma prototyped
21
/*
22
* Glenn Fowler
23
* AT&T Research
24
*/
25
26
static const char usage[] =
27
"[-?\n@(#)$Id: bb2tok (AT&T Research) 2007-12-19 $\n]"
28
USAGE_LICENSE
29
"[+NAME?bb2tok - convert bb html to tokens]"
30
"[+DESCRIPTION?\bbb2tok\b extracts tokens from input \bhtml\b \afile\as. "
31
"If \afile\a is not specified then the standard input is read. The "
32
"\bhtml\b parse is rudimentary; don't use \bbb2tok\b to detect valid "
33
"\bhtml\b files.]"
34
35
"\n"
36
"\n[ file ... ]\n"
37
"\n"
38
39
"[+SEE ALSO?\bhtml2db\b(1), \bhtml2rtf\b(1)]"
40
;
41
42
#include <ast.h>
43
#include <ctype.h>
44
#include <error.h>
45
46
/*
 * lexical codes
 *
 * each value is also the index of the matching entry in header[]
 * (parse() uses header[LINK] and header[LINE] directly), so the
 * values and the table order must be kept in step
 */

enum
{
	LINK	= 0,
	NAME	= 1,
	HEADER	= 2,
	BODY	= 3,
	QUOTE	= 4,
	CODE	= 5,
	LABEL	= 6,
	LINE	= 7
};
54
55
/*
 * one recognized markup class and the output tag it produces
 *
 * the strings are only ever initialized from literals and read,
 * so they are declared const
 */

typedef struct Header_s
{
	const char*	in;	/* class="..." value compared in parse()	*/
	const char*	out;	/* tag name written by token()			*/
	int		lex;	/* lexical code (LINK ... LINE)			*/
	int		unary;	/* 1 for the "link/" and "line/" entries	*/
} Header_t;
62
63
static const Header_t header[] =
64
{
65
"a", "link/", LINK, 1,
66
"name", "name", NAME, 0,
67
"postdetails", "header", HEADER, 0,
68
"postbody", "body", BODY, 0,
69
"quote", "quote", QUOTE, 0,
70
"code", "code", CODE, 0,
71
"genmed", "label", LABEL, 0,
72
"line", "line/", LINE, 1,
73
};
74
75
typedef struct State_s
76
{
77
Header_t* prev;
78
int push;
79
int keep;
80
int last;
81
unsigned char* lex;
82
} State_t;
83
84
static void
85
token(State_t* state, Sfio_t* op, const char* text, const Header_t* head, int push)
86
{
87
if (!head)
88
{
89
if (state->keep)
90
{
91
if (*state->lex == LABEL && (streq(text, ":") || streq(text, "Code") || streq(text, "wrote")))
92
return;
93
if (state->prev)
94
{
95
sfprintf(op, "%s<%s%s>\n", (!state->push && (state->prev->lex == HEADER || state->prev->lex == CODE && state->last != '\n')) ? "\n" : "", state->push ? "" : "/", state->prev->out);
96
state->prev = 0;
97
}
98
sfputr(op, text, *state->lex == HEADER ? ' ' : '\n');
99
}
100
}
101
else if (push)
102
{
103
if (state->prev)
104
{
105
if (head->lex == LINK && state->prev->lex == NAME && state->push)
106
return;
107
if (head->lex == LINE && state->prev->lex == HEADER && !state->push)
108
return;
109
if (head->lex == HEADER && push && state->prev->lex == HEADER && !state->push)
110
{
111
state->prev = 0;
112
return;
113
}
114
if (state->keep && (state->prev->lex != head->lex || !head->unary && state->push))
115
sfprintf(op, "%s<%s%s>\n", (!state->push && (state->prev->lex == HEADER || state->prev->lex == CODE && state->last != '\n')) ? "\n" : "", state->push ? "" : "/", state->prev->out);
116
if (head->lex == LINE && state->prev->lex == BODY && !state->push)
117
state->keep = 0;
118
}
119
switch (head->lex)
120
{
121
case CODE:
122
state->prev = 0;
123
sfprintf(op, "<%s>", head->out);
124
return;
125
case NAME:
126
state->keep = 1;
127
break;
128
}
129
state->prev = (Header_t*)head;
130
state->push = push;
131
}
132
else
133
{
134
if (state->keep && state->prev)
135
{
136
if (state->prev->lex == head->lex && state->push)
137
{
138
state->prev = 0;
139
return;
140
}
141
sfprintf(op, "%s<%s%s>\n", (!state->push && (state->prev->lex == HEADER || state->prev->lex == CODE && state->last != '\n')) ? "\n" : "", state->push ? "" : "/", state->prev->out);
142
}
143
state->prev = (Header_t*)head;
144
state->push = push;
145
}
146
}
147
148
/*
 * TOKEN	if chars have accumulated between tok and t then
 *		0-terminate them, pass them to token() as text and
 *		reset t to tok
 * PUSH		tell token() that header h is being opened
 * POP		tell token() that header h is being closed
 *
 * TOKEN evaluates tok and t more than once and assigns to t
 */

#define TOKEN(sp,op,tok,t)	do { if ((t) > (tok)) { *(t) = 0; token((sp), (op), (t) = (tok), 0, 0); } } while (0)
#define PUSH(sp,op,h)		token((sp), (op), 0, (h), 1)
#define POP(sp,op,h)		token((sp), (op), 0, (h), 0)
151
152
static void
153
parse(const char* path, Sfio_t* ip, Sfio_t* op)
154
{
155
register int c;
156
register int i;
157
register int k;
158
register int q;
159
register int n;
160
register int x;
161
register int level;
162
register char* e;
163
register char* s;
164
register char* t;
165
const Header_t* h;
166
167
char tag[256];
168
char tok[4 * 1024];
169
unsigned char lex[4 * 1024];
170
const Header_t* block[4 * 1024];
171
172
State_t state;
173
174
state.prev = (Header_t*)&header[*(state.lex = lex) = LINE];
175
state.push = 1;
176
state.keep = 0;
177
t = tok;
178
k = q = n = level = 0;
179
for (;;)
180
{
181
switch (c = sfgetc(ip))
182
{
183
case EOF:
184
TOKEN(&state, op, tok, t);
185
break;
186
case '<':
187
TOKEN(&state, op, tok, t);
188
x = 0;
189
s = tag;
190
for (;;)
191
{
192
switch (c = sfgetc(ip))
193
{
194
case EOF:
195
TOKEN(&state, op, tok, t);
196
return;
197
case '"':
198
if (!q)
199
q = c;
200
else if (q == c)
201
q = 0;
202
goto keep;
203
case '!':
204
if (s != tag)
205
goto keep;
206
x = 1;
207
continue;
208
case '\n':
209
x = 1;
210
continue;
211
case '>':
212
if (!q)
213
break;
214
/*FALLTHROUGH*/
215
default:
216
keep:
217
if (!x && s < &tag[sizeof(tag)-1])
218
*s++ = isupper(c) ? tolower(c) : c;
219
continue;
220
}
221
break;
222
}
223
*s = 0;
224
s = tag;
225
if (!k)
226
{
227
if (s[0] == 'b' && s[1] == 'o' && s[2] == 'd' && s[3] == 'y' && (!s[4] || s[4] == ' '))
228
k = 1;
229
else
230
continue;
231
}
232
if (s[0] == 's' && s[1] == 'p' && s[2] == 'a' && s[3] == 'n' && (!s[4] || s[4] == ' ') && (s += 4) || s[0] == 't' && s[1] == 'd' && (!s[2] || s[2] == ' ') && (s += 2))
233
{
234
h = 0;
235
if (s[0] == ' ' && strneq(s + 1, "class=\"", 7))
236
{
237
for (e = s += 8; *e && *e != '"'; e++);
238
*e = 0;
239
for (i = 0; i < elementsof(header); i++)
240
if (streq(s, header[i].in))
241
{
242
h = &header[i];
243
if (level < elementsof(block))
244
{
245
PUSH(&state, op, h);
246
n++;
247
}
248
break;
249
}
250
}
251
if (level < elementsof(block) && (block[level] = h))
252
*++state.lex = h->lex;
253
level++;
254
}
255
else if (s[0] == '/' && (s[1] == 's' && s[2] == 'p' && s[3] == 'a' && s[4] == 'n' && !s[5] || s[1] == 't' && s[2] == 'd' && !s[3]))
256
{
257
if (level > 0)
258
{
259
level--;
260
if (level < elementsof(block) && (h = block[level]))
261
{
262
POP(&state, op, h);
263
n--;
264
state.lex--;
265
}
266
}
267
}
268
else if (n)
269
{
270
if (s[0] == 'b' && s[1] == 'r' && (!s[2] || s[2] == ' ' || s[2] == '/'))
271
{
272
if ((c = sfgetc(ip)) == '\n')
273
continue;
274
sfungetc(ip, c);
275
}
276
if (s[0] == 'a' && s[1] == ' ')
277
PUSH(&state, op, &header[LINK]);
278
else
279
{
280
c = ' ';
281
goto space;
282
}
283
}
284
continue;
285
case '&':
286
while ((c = sfgetc(ip)) != EOF && isalnum(c));
287
c = ' ';
288
goto space;
289
case ':':
290
case ';':
291
case ',':
292
case '.':
293
if (*state.lex == CODE)
294
goto code;
295
TOKEN(&state, op, tok, t);
296
*t++ = c;
297
TOKEN(&state, op, tok, t);
298
continue;
299
case ' ':
300
case '\t':
301
case '\r':
302
case '\v':
303
space:
304
if (*state.lex == CODE)
305
goto code;
306
TOKEN(&state, op, tok, t);
307
continue;
308
case '\n':
309
if (*state.lex == CODE)
310
goto code;
311
TOKEN(&state, op, tok, t);
312
PUSH(&state, op, &header[LINE]);
313
continue;
314
default:
315
if (*state.lex == CODE)
316
goto code;
317
if (t >= &tok[sizeof(tok) - 1])
318
TOKEN(&state, op, tok, t);
319
*t++ = c;
320
continue;
321
code:
322
sfputc(op, c);
323
state.last = c;
324
continue;
325
}
326
break;
327
}
328
}
329
330
int
331
main(int argc, char** argv)
332
{
333
register char* s;
334
register Sfio_t* ip;
335
336
NoP(argc);
337
error_info.id = "bb2tok";
338
for (;;)
339
{
340
switch (optget(argv, usage))
341
{
342
case '?':
343
error(ERROR_USAGE|4, "%s", opt_info.arg);
344
continue;
345
case ':':
346
error(2, "%s", opt_info.arg);
347
continue;
348
}
349
break;
350
}
351
argv += opt_info.index;
352
if (error_info.errors)
353
error(ERROR_USAGE|4, "%s", optusage(NiL));
354
do
355
{
356
if (!(s = *argv) || streq(s, "-") || streq(s, "/dev/stdin") || streq(s, "/dev/fd/0"))
357
{
358
s = "/dev/stdin";
359
ip = sfstdin;
360
}
361
else if (!(ip = sfopen(NiL, s, "r")))
362
{
363
error(ERROR_SYSTEM|2, "%s: cannot read", s);
364
continue;
365
}
366
parse(s, ip, sfstdout);
367
if (ip != sfstdin)
368
sfclose(ip);
369
} while (*argv && *++argv);
370
return error_info.errors != 0;
371
}
372
373