Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
att
GitHub Repository: att/ast
Path: blob/master/src/cmd/html/html2db.c
1808 views
1
/***********************************************************************
* *
* This software is part of the ast package *
* Copyright (c) 1996-2011 AT&T Intellectual Property *
* and is licensed under the *
* Eclipse Public License, Version 1.0 *
* by AT&T Intellectual Property *
* *
* A copy of the License is available at *
* http://www.eclipse.org/org/documents/epl-v10.html *
* (with md5 checksum b35adb5213ca9657e911e9befb180842) *
* *
* Information and Software Systems Research *
* AT&T Research *
* Florham Park NJ *
* *
* Glenn Fowler <[email protected]> *
* *
***********************************************************************/
20
#pragma prototyped
21
/*
 * Glenn Fowler
 * AT&T Research
 *
 * html2db - extract flat file database from html tables
 */
27
28
static const char usage[] =
29
"[-?\n@(#)$Id: html2db (AT&T Research) 1998-11-10 $\n]"
30
USAGE_LICENSE
31
"[+NAME?html2db - extract flat file database from html tables]"
32
"[+DESCRIPTION?\bhtml2db\b extracts a flat file database from tables in the"
33
" input \bhtml\b \afile\as. If \afile\a is not specified then the"
34
" standard input is read. The \bhtml\b parse is rudimentary; don't use"
35
" \bhtml2db\b to detect valid \bhtml\b files.]"
36
37
"\n"
38
"\n[ file ... ]\n"
39
"\n"
40
41
"[+SEE ALSO?\bhtml2rtf\b(1)]"
42
;
43
#include <ast.h>
44
#include <ctype.h>
45
#include <error.h>
46
47
static void
48
flatten(const char* path, Sfio_t* ip, Sfio_t* op)
49
{
50
register int c;
51
register int q;
52
register int p;
53
register int b;
54
register char* s;
55
56
char tag[256];
57
58
b = p = 0;
59
for (;;)
60
{
61
switch (c = sfgetc(ip))
62
{
63
case EOF:
64
break;
65
case '<':
66
q = 0;
67
s = tag;
68
for (;;)
69
{
70
switch (c = sfgetc(ip))
71
{
72
case EOF:
73
return;
74
case '>':
75
sfungetc(ip, c);
76
break;
77
default:
78
if (isspace(c))
79
break;
80
if (s < &tag[sizeof(tag)-1])
81
*s++ = islower(c) ? toupper(c) : c;
82
continue;
83
}
84
break;
85
}
86
*s = 0;
87
q = 0;
88
for (;;)
89
{
90
switch (c = sfgetc(ip))
91
{
92
case EOF:
93
return;
94
case '\'':
95
case '"':
96
if (q == c)
97
q = 0;
98
else if (q == 0)
99
q = c;
100
continue;
101
case '>':
102
if (q == 0)
103
break;
104
continue;
105
default:
106
continue;
107
}
108
break;
109
}
110
s = tag;
111
if (s[0] == 'T' && s[1] == 'D' && s[2] == 0)
112
p = 1;
113
else if (s[0] == '/' && s[1] == 'T')
114
{
115
if (s[2] == 'D' && s[3] == 0)
116
{
117
b = p = 0;
118
sfputc(op, ';');
119
}
120
else if (s[2] == 'R' && s[3] == 0)
121
sfputc(op, '\n');
122
}
123
continue;
124
default:
125
if (p)
126
{
127
if (isspace(c))
128
{
129
if (b)
130
continue;
131
b = 1;
132
c = ' ';
133
}
134
else
135
b = 0;
136
sfputc(op, c);
137
}
138
continue;
139
}
140
break;
141
}
142
}
143
144
int
145
main(int argc, char** argv)
146
{
147
register char* s;
148
register Sfio_t* ip;
149
150
NoP(argc);
151
error_info.id = "html2db";
152
for (;;)
153
{
154
switch (optget(argv, usage))
155
{
156
case '?':
157
error(ERROR_USAGE|4, "%s", opt_info.arg);
158
continue;
159
case ':':
160
error(2, "%s", opt_info.arg);
161
continue;
162
}
163
break;
164
}
165
argv += opt_info.index;
166
if (error_info.errors)
167
error(ERROR_USAGE|4, "%s", optusage(NiL));
168
do
169
{
170
if (!(s = *argv) || streq(s, "-") || streq(s, "/dev/stdin") || streq(s, "/dev/fd/0"))
171
{
172
s = "/dev/stdin";
173
ip = sfstdin;
174
}
175
else if (!(ip = sfopen(NiL, s, "r")))
176
{
177
error(ERROR_SYSTEM|2, "%s: cannot read", s);
178
continue;
179
}
180
flatten(s, ip, sfstdout);
181
if (ip != sfstdin)
182
sfclose(ip);
183
} while (*argv++);
184
return error_info.errors != 0;
185
}
186
187