CoCalc -- HTMLparser.c

GitHub Repository: wine-mirror/wine
Path: blob/master/libs/xml2/HTMLparser.c
⁸⁵⁷⁷ views
1
/*
2
 * HTMLparser.c : an HTML 4.0 non-verifying parser
3
 *
4
 * See Copyright for the status of this software.
5
 *
6
 * daniel@veillard.com
7
 */
8

9
#define IN_LIBXML
10
#include "libxml.h"
11
#ifdef LIBXML_HTML_ENABLED
12

13
#include <string.h>
14
#include <ctype.h>
15
#include <stdlib.h>
16

17
#include <libxml/HTMLparser.h>
18
#include <libxml/xmlmemory.h>
19
#include <libxml/tree.h>
20
#include <libxml/parser.h>
21
#include <libxml/parserInternals.h>
22
#include <libxml/xmlerror.h>
23
#include <libxml/HTMLtree.h>
24
#include <libxml/entities.h>
25
#include <libxml/encoding.h>
26
#include <libxml/xmlIO.h>
27
#include <libxml/uri.h>
28

29
#include "private/buf.h"
30
#include "private/enc.h"
31
#include "private/error.h"
32
#include "private/html.h"
33
#include "private/io.h"
34
#include "private/parser.h"
35
#include "private/tree.h"
36

37
#define HTML_MAX_NAMELEN 1000
38
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
39
#define HTML_PARSER_BUFFER_SIZE 100
40

41
static int htmlOmittedDefaultValue = 1;
42

43
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
44
			     xmlChar end, xmlChar  end2, xmlChar end3);
45
static void htmlParseComment(htmlParserCtxtPtr ctxt);
46

47
/************************************************************************
48
 *									*
49
 *		Some factorized error routines				*
50
 *									*
51
 ************************************************************************/
52

53
/**
54
 * htmlErrMemory:
55
 * @ctxt:  an HTML parser context
56
 * @extra:  extra information
57
 *
58
 * Handle a redefinition of attribute error
59
 */
60
static void
61
htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
62
{
63
    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
64
        (ctxt->instate == XML_PARSER_EOF))
65
	return;
66
    if (ctxt != NULL) {
67
        ctxt->errNo = XML_ERR_NO_MEMORY;
68
        ctxt->instate = XML_PARSER_EOF;
69
        ctxt->disableSAX = 1;
70
    }
71
    if (extra)
72
        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
73
                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
74
                        NULL, NULL, 0, 0,
75
                        "Memory allocation failed : %s\n", extra);
76
    else
77
        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
78
                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
79
                        NULL, NULL, 0, 0, "Memory allocation failed\n");
80
}
81

82
/**
83
 * htmlParseErr:
84
 * @ctxt:  an HTML parser context
85
 * @error:  the error number
86
 * @msg:  the error message
87
 * @str1:  string infor
88
 * @str2:  string infor
89
 *
90
 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
91
 */
92
static void LIBXML_ATTR_FORMAT(3,0)
93
htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
94
             const char *msg, const xmlChar *str1, const xmlChar *str2)
95
{
96
    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
97
        (ctxt->instate == XML_PARSER_EOF))
98
	return;
99
    if (ctxt != NULL)
100
	ctxt->errNo = error;
101
    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
102
                    XML_ERR_ERROR, NULL, 0,
103
		    (const char *) str1, (const char *) str2,
104
		    NULL, 0, 0,
105
		    msg, str1, str2);
106
    if (ctxt != NULL)
107
	ctxt->wellFormed = 0;
108
}
109

110
/**
111
 * htmlParseErrInt:
112
 * @ctxt:  an HTML parser context
113
 * @error:  the error number
114
 * @msg:  the error message
115
 * @val:  integer info
116
 *
117
 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
118
 */
119
static void LIBXML_ATTR_FORMAT(3,0)
120
htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
121
             const char *msg, int val)
122
{
123
    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
124
        (ctxt->instate == XML_PARSER_EOF))
125
	return;
126
    if (ctxt != NULL)
127
	ctxt->errNo = error;
128
    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
129
                    XML_ERR_ERROR, NULL, 0, NULL, NULL,
130
		    NULL, val, 0, msg, val);
131
    if (ctxt != NULL)
132
	ctxt->wellFormed = 0;
133
}
134

135
/************************************************************************
136
 *									*
137
 *	Parser stacks related functions and macros		*
138
 *									*
139
 ************************************************************************/
140

141
/**
142
 * htmlnamePush:
143
 * @ctxt:  an HTML parser context
144
 * @value:  the element name
145
 *
146
 * Pushes a new element name on top of the name stack
147
 *
148
 * Returns -1 in case of error, the index in the stack otherwise
149
 */
150
static int
151
htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
152
{
153
    if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
154
        ctxt->html = 3;
155
    if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
156
        ctxt->html = 10;
157
    if (ctxt->nameNr >= ctxt->nameMax) {
158
        size_t newSize = ctxt->nameMax * 2;
159
        const xmlChar **tmp;
160

161
        tmp = xmlRealloc((xmlChar **) ctxt->nameTab,
162
                         newSize * sizeof(ctxt->nameTab[0]));
163
        if (tmp == NULL) {
164
            htmlErrMemory(ctxt, NULL);
165
            return (-1);
166
        }
167
        ctxt->nameTab = tmp;
168
        ctxt->nameMax = newSize;
169
    }
170
    ctxt->nameTab[ctxt->nameNr] = value;
171
    ctxt->name = value;
172
    return (ctxt->nameNr++);
173
}
174
/**
175
 * htmlnamePop:
176
 * @ctxt: an HTML parser context
177
 *
178
 * Pops the top element name from the name stack
179
 *
180
 * Returns the name just removed
181
 */
182
static const xmlChar *
183
htmlnamePop(htmlParserCtxtPtr ctxt)
184
{
185
    const xmlChar *ret;
186

187
    if (ctxt->nameNr <= 0)
188
        return (NULL);
189
    ctxt->nameNr--;
190
    if (ctxt->nameNr < 0)
191
        return (NULL);
192
    if (ctxt->nameNr > 0)
193
        ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
194
    else
195
        ctxt->name = NULL;
196
    ret = ctxt->nameTab[ctxt->nameNr];
197
    ctxt->nameTab[ctxt->nameNr] = NULL;
198
    return (ret);
199
}
200

201
/**
202
 * htmlNodeInfoPush:
203
 * @ctxt:  an HTML parser context
204
 * @value:  the node info
205
 *
206
 * Pushes a new element name on top of the node info stack
207
 *
208
 * Returns 0 in case of error, the index in the stack otherwise
209
 */
210
static int
211
htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
212
{
213
    if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
214
        if (ctxt->nodeInfoMax == 0)
215
                ctxt->nodeInfoMax = 5;
216
        ctxt->nodeInfoMax *= 2;
217
        ctxt->nodeInfoTab = (htmlParserNodeInfo *)
218
                         xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
219
                                    ctxt->nodeInfoMax *
220
                                    sizeof(ctxt->nodeInfoTab[0]));
221
        if (ctxt->nodeInfoTab == NULL) {
222
            htmlErrMemory(ctxt, NULL);
223
            return (0);
224
        }
225
    }
226
    ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
227
    ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
228
    return (ctxt->nodeInfoNr++);
229
}
230

231
/**
232
 * htmlNodeInfoPop:
233
 * @ctxt:  an HTML parser context
234
 *
235
 * Pops the top element name from the node info stack
236
 *
237
 * Returns 0 in case of error, the pointer to NodeInfo otherwise
238
 */
239
static htmlParserNodeInfo *
240
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
241
{
242
    if (ctxt->nodeInfoNr <= 0)
243
        return (NULL);
244
    ctxt->nodeInfoNr--;
245
    if (ctxt->nodeInfoNr < 0)
246
        return (NULL);
247
    if (ctxt->nodeInfoNr > 0)
248
        ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
249
    else
250
        ctxt->nodeInfo = NULL;
251
    return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
252
}
253

254
/*
255
 * Macros for accessing the content. Those should be used only by the parser,
256
 * and not exported.
257
 *
258
 * Dirty macros, i.e. one need to make assumption on the context to use them
259
 *
260
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
261
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
262
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
263
 *           in UNICODE mode. This should be used internally by the parser
264
 *           only to compare to ASCII values otherwise it would break when
265
 *           running with UTF-8 encoding.
266
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
267
 *           to compare on ASCII based substring.
268
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
269
 *           it should be used only to compare on ASCII based substring.
270
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
271
 *           strings without newlines within the parser.
272
 *
273
 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
274
 *
275
 *   NEXT    Skip to the next character, this does the proper decoding
276
 *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
277
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
278
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
279
 */
280

281
/*
 * All macros below expect a variable named `ctxt` (the parser context)
 * to be in scope at the point of use.
 *
 * Statement-like macros are wrapped in do { } while (0) and
 * expression-like macros are fully parenthesized, so they behave as a
 * single statement/expression wherever they are expanded (in particular
 * inside an unbraced if/else).
 */

/* Current byte, converted to upper case. */
#define UPPER (toupper(*ctxt->input->cur))

/* Advance by val bytes and val columns. Note: val is evaluated twice. */
#define SKIP(val) \
    (ctxt->input->cur += (val), ctxt->input->col += (val))

/* Byte at offset val from the current position. */
#define NXT(val) (ctxt->input->cur[(val)])

/* Byte at offset val from the current position, in upper case. */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

/*
 * Shrink the input buffer once more than 2 * INPUT_CHUNK bytes have been
 * consumed and fewer than 2 * INPUT_CHUNK bytes remain ahead.
 */
#define SHRINK do {							\
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&	\
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))	\
        xmlParserShrink(ctxt);						\
  } while (0)

/*
 * Outside of progressive mode, request more input when fewer than
 * INPUT_CHUNK bytes remain ahead.
 */
#define GROW do {							\
    if ((ctxt->progressive == 0) &&					\
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))		\
        xmlParserGrow(ctxt);						\
  } while (0)

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR (*ctxt->input->cur)
#define NEXT xmlNextChar(ctxt)

/* Current byte, or -1 while a token is pending in ctxt->token. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

/*
 * Consume the current character, which is l bytes long: update the
 * line/column counters, clear any pending token and advance.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l);				\
  } while (0)

/* Decode the current character; its byte length is stored in l. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))

/*
 * Append character v, whose encoded length is l, to buffer b at index i
 * and advance i accordingly.
 */
#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (v);					\
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));			\
  } while (0)
328

329
/**
330
 * htmlFindEncoding:
331
 * @the HTML parser context
332
 *
333
 * Ty to find and encoding in the current data available in the input
334
 * buffer this is needed to try to switch to the proper encoding when
335
 * one face a character error.
336
 * That's an heuristic, since it's operating outside of parsing it could
337
 * try to use a meta which had been commented out, that's the reason it
338
 * should only be used in case of error, not as a default.
339
 *
340
 * Returns an encoding string or NULL if not found, the string need to
341
 *   be freed
342
 */
343
static xmlChar *
344
htmlFindEncoding(xmlParserCtxtPtr ctxt) {
345
    const xmlChar *start, *cur, *end;
346

347
    if ((ctxt == NULL) || (ctxt->input == NULL) ||
348
        (ctxt->input->flags & XML_INPUT_HAS_ENCODING))
349
        return(NULL);
350
    if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
351
        return(NULL);
352

353
    start = ctxt->input->cur;
354
    end = ctxt->input->end;
355
    /* we also expect the input buffer to be zero terminated */
356
    if (*end != 0)
357
        return(NULL);
358

359
    cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
360
    if (cur == NULL)
361
        return(NULL);
362
    cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
363
    if (cur == NULL)
364
        return(NULL);
365
    cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
366
    if (cur == NULL)
367
        return(NULL);
368
    cur += 8;
369
    start = cur;
370
    while (((*cur >= 'A') && (*cur <= 'Z')) ||
371
           ((*cur >= 'a') && (*cur <= 'z')) ||
372
           ((*cur >= '0') && (*cur <= '9')) ||
373
           (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
374
           cur++;
375
    if (cur == start)
376
        return(NULL);
377
    return(xmlStrndup(start, cur - start));
378
}
379

380
/**
381
 * htmlCurrentChar:
382
 * @ctxt:  the HTML parser context
383
 * @len:  pointer to the length of the char read
384
 *
385
 * The current char value, if using UTF-8 this may actually span multiple
386
 * bytes in the input buffer. Implement the end of line normalization:
387
 * 2.11 End-of-Line Handling
388
 * If the encoding is unspecified, in the case we find an ISO-Latin-1
389
 * char, then the encoding converter is plugged in automatically.
390
 *
391
 * Returns the current char value and its length
392
 */
393

394
static int
395
htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
396
    const unsigned char *cur;
397
    unsigned char c;
398
    unsigned int val;
399

400
    if (ctxt->instate == XML_PARSER_EOF)
401
	return(0);
402

403
    if (ctxt->token != 0) {
404
	*len = 0;
405
	return(ctxt->token);
406
    }
407

408
    if (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK) {
409
        xmlParserGrow(ctxt);
410
        if (ctxt->instate == XML_PARSER_EOF)
411
            return(0);
412
    }
413

414
    if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) {
415
        xmlChar * guess;
416
        xmlCharEncodingHandlerPtr handler;
417

418
        /*
419
         * Assume it's a fixed length encoding (1) with
420
         * a compatible encoding for the ASCII set, since
421
         * HTML constructs only use < 128 chars
422
         */
423
        if (*ctxt->input->cur < 0x80) {
424
            *len = 1;
425
            if ((*ctxt->input->cur == 0) &&
426
                (ctxt->input->cur < ctxt->input->end)) {
427
                htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
428
                                "Char 0x%X out of allowed range\n", 0);
429
                return(' ');
430
            }
431
            return(*ctxt->input->cur);
432
        }
433

434
        /*
435
         * Humm this is bad, do an automatic flow conversion
436
         */
437
        guess = htmlFindEncoding(ctxt);
438
        if (guess == NULL) {
439
            xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
440
        } else {
441
            handler = xmlFindCharEncodingHandler((const char *) guess);
442
            if (handler != NULL) {
443
                /*
444
                 * Don't use UTF-8 encoder which isn't required and
445
                 * can produce invalid UTF-8.
446
                 */
447
                if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
448
                    xmlSwitchToEncoding(ctxt, handler);
449
            } else {
450
                htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
451
                             "Unsupported encoding %s", guess, NULL);
452
            }
453
            xmlFree(guess);
454
        }
455
        ctxt->input->flags |= XML_INPUT_HAS_ENCODING;
456
    }
457

458
    /*
459
     * We are supposed to handle UTF8, check it's valid
460
     * From rfc2044: encoding of the Unicode values on UTF-8:
461
     *
462
     * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
463
     * 0000 0000-0000 007F   0xxxxxxx
464
     * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
465
     * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
466
     *
467
     * Check for the 0x110000 limit too
468
     */
469
    cur = ctxt->input->cur;
470
    c = *cur;
471
    if (c & 0x80) {
472
        size_t avail;
473

474
        if ((c & 0x40) == 0)
475
            goto encoding_error;
476

477
        avail = ctxt->input->end - ctxt->input->cur;
478

479
        if ((avail < 2) || ((cur[1] & 0xc0) != 0x80))
480
            goto encoding_error;
481
        if ((c & 0xe0) == 0xe0) {
482
            if ((avail < 3) || ((cur[2] & 0xc0) != 0x80))
483
                goto encoding_error;
484
            if ((c & 0xf0) == 0xf0) {
485
                if (((c & 0xf8) != 0xf0) ||
486
                    (avail < 4) || ((cur[3] & 0xc0) != 0x80))
487
                    goto encoding_error;
488
                /* 4-byte code */
489
                *len = 4;
490
                val = (cur[0] & 0x7) << 18;
491
                val |= (cur[1] & 0x3f) << 12;
492
                val |= (cur[2] & 0x3f) << 6;
493
                val |= cur[3] & 0x3f;
494
                if (val < 0x10000)
495
                    goto encoding_error;
496
            } else {
497
              /* 3-byte code */
498
                *len = 3;
499
                val = (cur[0] & 0xf) << 12;
500
                val |= (cur[1] & 0x3f) << 6;
501
                val |= cur[2] & 0x3f;
502
                if (val < 0x800)
503
                    goto encoding_error;
504
            }
505
        } else {
506
          /* 2-byte code */
507
            *len = 2;
508
            val = (cur[0] & 0x1f) << 6;
509
            val |= cur[1] & 0x3f;
510
            if (val < 0x80)
511
                goto encoding_error;
512
        }
513
        if (!IS_CHAR(val)) {
514
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
515
                            "Char 0x%X out of allowed range\n", val);
516
        }
517
        return(val);
518
    } else {
519
        if ((*ctxt->input->cur == 0) &&
520
            (ctxt->input->cur < ctxt->input->end)) {
521
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
522
                            "Char 0x%X out of allowed range\n", 0);
523
            *len = 1;
524
            return(' ');
525
        }
526
        /* 1-byte code */
527
        *len = 1;
528
        return(*ctxt->input->cur);
529
    }
530

531
encoding_error:
532
    {
533
        char buffer[150];
534

535
	if (ctxt->input->end - ctxt->input->cur >= 4) {
536
	    snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
537
			    ctxt->input->cur[0], ctxt->input->cur[1],
538
			    ctxt->input->cur[2], ctxt->input->cur[3]);
539
	} else {
540
	    snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
541
	}
542
	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
543
		     "Input is not proper UTF-8, indicate encoding !\n",
544
		     BAD_CAST buffer, NULL);
545
    }
546

547
    if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0)
548
        xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
549
    *len = 1;
550
    return(*ctxt->input->cur);
551
}
552

553
/**
554
 * htmlSkipBlankChars:
555
 * @ctxt:  the HTML parser context
556
 *
557
 * skip all blanks character found at that point in the input streams.
558
 *
559
 * Returns the number of space chars skipped
560
 */
561

562
static int
563
htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
564
    int res = 0;
565

566
    while (IS_BLANK_CH(*(ctxt->input->cur))) {
567
        if (*(ctxt->input->cur) == '\n') {
568
            ctxt->input->line++; ctxt->input->col = 1;
569
        } else ctxt->input->col++;
570
        ctxt->input->cur++;
571
        if (*ctxt->input->cur == 0)
572
            xmlParserGrow(ctxt);
573
	if (res < INT_MAX)
574
	    res++;
575
    }
576
    return(res);
577
}
578

579

580

581
/************************************************************************
582
 *									*
583
 *	The list of HTML elements and their properties		*
584
 *									*
585
 ************************************************************************/
586

587
/*
588
 *  Start Tag: 1 means the start tag can be omitted
589
 *  End Tag:   1 means the end tag can be omitted
590
 *             2 means it's forbidden (empty elements)
591
 *             3 means the tag is stylistic and should be closed easily
592
 *  Depr:      this element is deprecated
593
 *  DTD:       1 means that this element is valid only in the Loose DTD
594
 *             2 means that this element is valid only in the Frameset DTD
595
 *
596
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
597
	, subElements , impliedsubelt , Attributes, userdata
598
 */
599

600
/* Definitions and a couple of vars for HTML Elements */
601

602
/*
 * Groups of element names, written as comma-separated string lists so
 * they can be spliced into array initializers. Each NB_xxx macro is the
 * number of strings in the matching list; composite counts are
 * parenthesized so they can be used safely inside larger expressions.
 *
 * PCDATA and MODIFIER expand to nothing.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
/* Used in place of a sub-element list in the element table entries. */
#define EMPTY NULL
625

626

627
static const char* const html_flow[] = { FLOW, NULL } ;
628
static const char* const html_inline[] = { INLINE, NULL } ;
629

630
/* placeholders: elts with content but no subelements */
631
static const char* const html_pcdata[] = { NULL } ;
632
#define html_cdata html_pcdata
633

634

635
/* ... and for HTML Attributes */
636

637
/*
 * Groups of attribute names, written as comma-separated string lists so
 * they can be spliced into array initializers. Each NB_xxx macro is the
 * number of strings in the matching list.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1
649

650
static const char* const html_attrs[] = { ATTRS, NULL } ;
651
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
652
static const char* const core_attrs[] = { COREATTRS, NULL } ;
653
static const char* const i18n_attrs[] = { I18N, NULL } ;
654

655

656
/* Other declarations that should go inline ... */
657
static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
658
	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
659
	"tabindex", "onfocus", "onblur", NULL } ;
660
static const char* const target_attr[] = { "target", NULL } ;
661
static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
662
static const char* const alt_attr[] = { "alt", NULL } ;
663
static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
664
static const char* const href_attrs[] = { "href", NULL } ;
665
static const char* const clear_attrs[] = { "clear", NULL } ;
666
static const char* const inline_p[] = { INLINE, "p", NULL } ;
667

668
static const char* const flow_param[] = { FLOW, "param", NULL } ;
669
static const char* const applet_attrs[] = { COREATTRS , "codebase",
670
		"archive", "alt", "name", "height", "width", "align",
671
		"hspace", "vspace", NULL } ;
672
static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
673
	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
674
static const char* const basefont_attrs[] =
675
	{ "id", "size", "color", "face", NULL } ;
676
static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
677
static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
678
static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
679
static const char* const body_depr[] = { "background", "bgcolor", "text",
680
	"link", "vlink", "alink", NULL } ;
681
static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
682
	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
683

684

685
/* NULL-terminated name lists (table columns, edits, definition lists). */
static const char *const col_attrs[] = {
    ATTRS, "span", "width", CELLHALIGN, CELLVALIGN,
    NULL
};
static const char *const col_elt[] = { "col", NULL };
static const char *const edit_attrs[] = {
    ATTRS, "datetime", "cite",
    NULL
};
static const char *const compact_attrs[] = { ATTRS, "compact", NULL };
static const char *const dl_contents[] = { "dt", "dd", NULL };
static const char *const compact_attr[] = { "compact", NULL };
static const char *const label_attr[] = { "label", NULL };
692
/*
 * Flow elements plus "legend". Like every other list in this file the
 * array must end with NULL so that code walking it can find the end.
 */
static const char* const fieldset_contents[] = { FLOW, "legend", NULL } ;
693
static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
694
static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
695
static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
696
static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
697
static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
698
static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
699
static const char* const head_attrs[] = { I18N, "profile", NULL } ;
700
static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
701
static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
702
static const char* const version_attr[] = { "version", NULL } ;
703
static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
704
static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
705
static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
706
static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
707
static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
708
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
709
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
710
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
711
static const char* const align_attr[] = { "align", NULL } ;
712
static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
713
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
714
static const char* const name_attr[] = { "name", NULL } ;
715
static const char* const action_attr[] = { "action", NULL } ;
716
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
717
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
718
static const char* const content_attr[] = { "content", NULL } ;
719
static const char* const type_attr[] = { "type", NULL } ;
720
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
721
static const char* const object_contents[] = { FLOW, "param", NULL } ;
722
static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
723
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
724
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
725
static const char* const option_elt[] = { "option", NULL } ;
726
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
727
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
728
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
729
static const char* const width_attr[] = { "width", NULL } ;
730
static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
731
static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
732
static const char* const language_attr[] = { "language", NULL } ;
733
static const char* const select_content[] = { "optgroup", "option", NULL } ;
734
static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
735
static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
736
static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
737
static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
738
static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
739
static const char* const tr_elt[] = { "tr", NULL } ;
740
static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
741
static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
742
static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
743
static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
744
static const char* const tr_contents[] = { "th", "td", NULL } ;
745
static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
746
static const char* const li_elt[] = { "li", NULL } ;
747
static const char* const ul_depr[] = { "type", "compact", NULL} ;
748
static const char* const dir_attr[] = { "dir", NULL} ;
749

750
#define DECL (const char**)
751

752
static const htmlElemDesc
753
html40ElementTable[] = {
754
{ "a",		0, 0, 0, 0, 0, 0, 1, "anchor ",
755
	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
756
},
757
{ "abbr",	0, 0, 0, 0, 0, 0, 1, "abbreviated form",
758
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
759
},
760
{ "acronym",	0, 0, 0, 0, 0, 0, 1, "",
761
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
762
},
763
{ "address",	0, 0, 0, 0, 0, 0, 0, "information on author ",
764
	DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
765
},
766
{ "applet",	0, 0, 0, 0, 1, 1, 2, "java applet ",
767
	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
768
},
769
{ "area",	0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
770
	EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
771
},
772
{ "b",		0, 3, 0, 0, 0, 0, 1, "bold text style",
773
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
774
},
775
{ "base",	0, 2, 2, 1, 0, 0, 0, "document base uri ",
776
	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
777
},
778
{ "basefont",	0, 2, 2, 1, 1, 1, 1, "base font size " ,
779
	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
780
},
781
{ "bdo",	0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
782
	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
783
},
784
{ "big",	0, 3, 0, 0, 0, 0, 1, "large text style",
785
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
786
},
787
{ "blockquote",	0, 0, 0, 0, 0, 0, 0, "long quotation ",
788
	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
789
},
790
{ "body",	1, 1, 0, 0, 0, 0, 0, "document body ",
791
	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
792
},
793
{ "br",		0, 2, 2, 1, 0, 0, 1, "forced line break ",
794
	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
795
},
796
{ "button",	0, 0, 0, 0, 0, 0, 2, "push button ",
797
	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
798
},
799
{ "caption",	0, 0, 0, 0, 0, 0, 0, "table caption ",
800
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
801
},
802
{ "center",	0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
803
	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
804
},
805
{ "cite",	0, 0, 0, 0, 0, 0, 1, "citation",
806
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
807
},
808
{ "code",	0, 0, 0, 0, 0, 0, 1, "computer code fragment",
809
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
810
},
811
{ "col",	0, 2, 2, 1, 0, 0, 0, "table column ",
812
	EMPTY , NULL , DECL col_attrs , NULL, NULL
813
},
814
{ "colgroup",	0, 1, 0, 0, 0, 0, 0, "table column group ",
815
	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
816
},
817
{ "dd",		0, 1, 0, 0, 0, 0, 0, "definition description ",
818
	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
819
},
820
{ "del",	0, 0, 0, 0, 0, 0, 2, "deleted text ",
821
	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
822
},
823
{ "dfn",	0, 0, 0, 0, 0, 0, 1, "instance definition",
824
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
825
},
826
{ "dir",	0, 0, 0, 0, 1, 1, 0, "directory list",
827
	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
828
},
829
{ "div",	0, 0, 0, 0, 0, 0, 0, "generic language/style container",
830
	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
831
},
832
{ "dl",		0, 0, 0, 0, 0, 0, 0, "definition list ",
833
	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
834
},
835
{ "dt",		0, 1, 0, 0, 0, 0, 0, "definition term ",
836
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
837
},
838
{ "em",		0, 3, 0, 0, 0, 0, 1, "emphasis",
839
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
840
},
841
{ "embed",	0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
842
	EMPTY, NULL, DECL embed_attrs, NULL, NULL
843
},
844
{ "fieldset",	0, 0, 0, 0, 0, 0, 0, "form control group ",
845
	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
846
},
847
{ "font",	0, 3, 0, 0, 1, 1, 1, "local change to font ",
848
	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
849
},
850
{ "form",	0, 0, 0, 0, 0, 0, 0, "interactive form ",
851
	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
852
},
853
{ "frame",	0, 2, 2, 1, 0, 2, 0, "subwindow " ,
854
	EMPTY, NULL, NULL, DECL frame_attrs, NULL
855
},
856
{ "frameset",	0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
857
	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
858
},
859
{ "h1",		0, 0, 0, 0, 0, 0, 0, "heading ",
860
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
861
},
862
{ "h2",		0, 0, 0, 0, 0, 0, 0, "heading ",
863
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
864
},
865
{ "h3",		0, 0, 0, 0, 0, 0, 0, "heading ",
866
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
867
},
868
{ "h4",		0, 0, 0, 0, 0, 0, 0, "heading ",
869
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
870
},
871
{ "h5",		0, 0, 0, 0, 0, 0, 0, "heading ",
872
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
873
},
874
{ "h6",		0, 0, 0, 0, 0, 0, 0, "heading ",
875
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
876
},
877
{ "head",	1, 1, 0, 0, 0, 0, 0, "document head ",
878
	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
879
},
880
{ "hr",		0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
881
	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
882
},
883
{ "html",	1, 1, 0, 0, 0, 0, 0, "document root element ",
884
	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
885
},
886
{ "i",		0, 3, 0, 0, 0, 0, 1, "italic text style",
887
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
888
},
889
{ "iframe",	0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
890
	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
891
},
892
{ "img",	0, 2, 2, 1, 0, 0, 1, "embedded image ",
893
	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
894
},
895
{ "input",	0, 2, 2, 1, 0, 0, 1, "form control ",
896
	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
897
},
898
{ "ins",	0, 0, 0, 0, 0, 0, 2, "inserted text",
899
	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
900
},
901
{ "isindex",	0, 2, 2, 1, 1, 1, 0, "single line prompt ",
902
	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
903
},
904
{ "kbd",	0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
905
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
906
},
907
{ "label",	0, 0, 0, 0, 0, 0, 1, "form field label text ",
908
	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
909
},
910
{ "legend",	0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
911
	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
912
},
913
{ "li",		0, 1, 1, 0, 0, 0, 0, "list item ",
914
	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
915
},
916
{ "link",	0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
917
	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
918
},
919
{ "map",	0, 0, 0, 0, 0, 0, 2, "client-side image map ",
920
	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
921
},
922
{ "menu",	0, 0, 0, 0, 1, 1, 0, "menu list ",
923
	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
924
},
925
{ "meta",	0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
926
	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
927
},
928
{ "noframes",	0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
929
	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
930
},
931
{ "noscript",	0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
932
	DECL html_flow, "div", DECL html_attrs, NULL, NULL
933
},
934
{ "object",	0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
935
	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
936
},
937
{ "ol",		0, 0, 0, 0, 0, 0, 0, "ordered list ",
938
	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
939
},
940
{ "optgroup",	0, 0, 0, 0, 0, 0, 0, "option group ",
941
	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
942
},
943
{ "option",	0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
944
	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
945
},
946
{ "p",		0, 1, 0, 0, 0, 0, 0, "paragraph ",
947
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
948
},
949
{ "param",	0, 2, 2, 1, 0, 0, 0, "named property value ",
950
	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
951
},
952
{ "pre",	0, 0, 0, 0, 0, 0, 0, "preformatted text ",
953
	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
954
},
955
{ "q",		0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
956
	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
957
},
958
{ "s",		0, 3, 0, 0, 1, 1, 1, "strike-through text style",
959
	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
960
},
961
{ "samp",	0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
962
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
963
},
964
{ "script",	0, 0, 0, 0, 0, 0, 2, "script statements ",
965
	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
966
},
967
{ "select",	0, 0, 0, 0, 0, 0, 1, "option selector ",
968
	DECL select_content, NULL, DECL select_attrs, NULL, NULL
969
},
970
{ "small",	0, 3, 0, 0, 0, 0, 1, "small text style",
971
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
972
},
973
{ "span",	0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
974
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
975
},
976
{ "strike",	0, 3, 0, 0, 1, 1, 1, "strike-through text",
977
	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
978
},
979
{ "strong",	0, 3, 0, 0, 0, 0, 1, "strong emphasis",
980
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
981
},
982
{ "style",	0, 0, 0, 0, 0, 0, 0, "style info ",
983
	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
984
},
985
{ "sub",	0, 3, 0, 0, 0, 0, 1, "subscript",
986
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
987
},
988
{ "sup",	0, 3, 0, 0, 0, 0, 1, "superscript ",
989
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
990
},
991
{ "table",	0, 0, 0, 0, 0, 0, 0, "",
992
	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
993
},
994
{ "tbody",	1, 0, 0, 0, 0, 0, 0, "table body ",
995
	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
996
},
997
{ "td",		0, 0, 0, 0, 0, 0, 0, "table data cell",
998
	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
999
},
1000
{ "textarea",	0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1001
	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1002
},
1003
{ "tfoot",	0, 1, 0, 0, 0, 0, 0, "table footer ",
1004
	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1005
},
1006
{ "th",		0, 1, 0, 0, 0, 0, 0, "table header cell",
1007
	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1008
},
1009
{ "thead",	0, 1, 0, 0, 0, 0, 0, "table header ",
1010
	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1011
},
1012
{ "title",	0, 0, 0, 0, 0, 0, 0, "document title ",
1013
	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1014
},
1015
{ "tr",		0, 0, 0, 0, 0, 0, 0, "table row ",
1016
	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1017
},
1018
{ "tt",		0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1019
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1020
},
1021
{ "u",		0, 3, 0, 0, 1, 1, 1, "underlined text style",
1022
	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1023
},
1024
{ "ul",		0, 0, 0, 0, 0, 0, 0, "unordered list ",
1025
	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1026
},
1027
{ "var",	0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1028
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1029
}
1030
};
1031

1032
/*
 * One auto-close rule: seeing the start tag @newTag implies the end of a
 * currently open @oldTag element.
 */
typedef struct {
    const char *oldTag;     /* name of the open element that gets closed */
    const char *newTag;     /* name of the start tag that closes it */
} htmlStartCloseEntry;

/*
 * Start tags that imply the end of the current element.
 *
 * The rules are grouped by oldTag below. The array MUST stay sorted by
 * (oldTag, newTag) in strcmp() order: htmlCheckAutoClose() searches it
 * with bsearch() using htmlCompareStartClose().
 */
static const htmlStartCloseEntry htmlStartClose[] = {
    /* a */
    { "a", "a" }, { "a", "fieldset" }, { "a", "table" }, { "a", "td" },
    { "a", "th" },
    /* address */
    { "address", "dd" }, { "address", "dl" }, { "address", "dt" },
    { "address", "form" }, { "address", "li" }, { "address", "ul" },
    /* b */
    { "b", "center" }, { "b", "p" }, { "b", "td" }, { "b", "th" },
    /* big */
    { "big", "p" },
    /* caption */
    { "caption", "col" }, { "caption", "colgroup" }, { "caption", "tbody" },
    { "caption", "tfoot" }, { "caption", "thead" }, { "caption", "tr" },
    /* col */
    { "col", "col" }, { "col", "colgroup" }, { "col", "tbody" },
    { "col", "tfoot" }, { "col", "thead" }, { "col", "tr" },
    /* colgroup */
    { "colgroup", "colgroup" }, { "colgroup", "tbody" },
    { "colgroup", "tfoot" }, { "colgroup", "thead" }, { "colgroup", "tr" },
    /* dd */
    { "dd", "dt" },
    /* dir */
    { "dir", "dd" }, { "dir", "dl" }, { "dir", "dt" }, { "dir", "form" },
    { "dir", "ul" },
    /* dl */
    { "dl", "form" }, { "dl", "li" },
    /* dt */
    { "dt", "dd" }, { "dt", "dl" },
    /* font */
    { "font", "center" }, { "font", "td" }, { "font", "th" },
    /* form */
    { "form", "form" },
    /* h1 .. h6 */
    { "h1", "fieldset" }, { "h1", "form" }, { "h1", "li" }, { "h1", "p" },
    { "h1", "table" },
    { "h2", "fieldset" }, { "h2", "form" }, { "h2", "li" }, { "h2", "p" },
    { "h2", "table" },
    { "h3", "fieldset" }, { "h3", "form" }, { "h3", "li" }, { "h3", "p" },
    { "h3", "table" },
    { "h4", "fieldset" }, { "h4", "form" }, { "h4", "li" }, { "h4", "p" },
    { "h4", "table" },
    { "h5", "fieldset" }, { "h5", "form" }, { "h5", "li" }, { "h5", "p" },
    { "h5", "table" },
    { "h6", "fieldset" }, { "h6", "form" }, { "h6", "li" }, { "h6", "p" },
    { "h6", "table" },
    /* head */
    { "head", "a" }, { "head", "abbr" }, { "head", "acronym" },
    { "head", "address" }, { "head", "b" }, { "head", "bdo" },
    { "head", "big" }, { "head", "blockquote" }, { "head", "body" },
    { "head", "br" }, { "head", "center" }, { "head", "cite" },
    { "head", "code" }, { "head", "dd" }, { "head", "dfn" },
    { "head", "dir" }, { "head", "div" }, { "head", "dl" },
    { "head", "dt" }, { "head", "em" }, { "head", "fieldset" },
    { "head", "font" }, { "head", "form" }, { "head", "frameset" },
    { "head", "h1" }, { "head", "h2" }, { "head", "h3" },
    { "head", "h4" }, { "head", "h5" }, { "head", "h6" },
    { "head", "hr" }, { "head", "i" }, { "head", "iframe" },
    { "head", "img" }, { "head", "kbd" }, { "head", "li" },
    { "head", "listing" }, { "head", "map" }, { "head", "menu" },
    { "head", "ol" }, { "head", "p" }, { "head", "pre" },
    { "head", "q" }, { "head", "s" }, { "head", "samp" },
    { "head", "small" }, { "head", "span" }, { "head", "strike" },
    { "head", "strong" }, { "head", "sub" }, { "head", "sup" },
    { "head", "table" }, { "head", "tt" }, { "head", "u" },
    { "head", "ul" }, { "head", "var" }, { "head", "xmp" },
    /* hr */
    { "hr", "form" },
    /* i */
    { "i", "center" }, { "i", "p" }, { "i", "td" }, { "i", "th" },
    /* legend */
    { "legend", "fieldset" },
    /* li */
    { "li", "li" },
    /* link */
    { "link", "body" }, { "link", "frameset" },
    /* listing */
    { "listing", "dd" }, { "listing", "dl" }, { "listing", "dt" },
    { "listing", "fieldset" }, { "listing", "form" }, { "listing", "li" },
    { "listing", "table" }, { "listing", "ul" },
    /* menu */
    { "menu", "dd" }, { "menu", "dl" }, { "menu", "dt" },
    { "menu", "form" }, { "menu", "ul" },
    /* ol */
    { "ol", "form" },
    /* option */
    { "option", "optgroup" }, { "option", "option" },
    /* p */
    { "p", "address" }, { "p", "blockquote" }, { "p", "body" },
    { "p", "caption" }, { "p", "center" }, { "p", "col" },
    { "p", "colgroup" }, { "p", "dd" }, { "p", "dir" },
    { "p", "div" }, { "p", "dl" }, { "p", "dt" },
    { "p", "fieldset" }, { "p", "form" }, { "p", "frameset" },
    { "p", "h1" }, { "p", "h2" }, { "p", "h3" },
    { "p", "h4" }, { "p", "h5" }, { "p", "h6" },
    { "p", "head" }, { "p", "hr" }, { "p", "li" },
    { "p", "listing" }, { "p", "menu" }, { "p", "ol" },
    { "p", "p" }, { "p", "pre" }, { "p", "table" },
    { "p", "tbody" }, { "p", "td" }, { "p", "tfoot" },
    { "p", "th" }, { "p", "title" }, { "p", "tr" },
    { "p", "ul" }, { "p", "xmp" },
    /* pre */
    { "pre", "dd" }, { "pre", "dl" }, { "pre", "dt" },
    { "pre", "fieldset" }, { "pre", "form" }, { "pre", "li" },
    { "pre", "table" }, { "pre", "ul" },
    /* s */
    { "s", "p" },
    /* script */
    { "script", "noscript" },
    /* small */
    { "small", "p" },
    /* span */
    { "span", "td" }, { "span", "th" },
    /* strike */
    { "strike", "p" },
    /* style */
    { "style", "body" }, { "style", "frameset" },
    /* tbody */
    { "tbody", "tbody" }, { "tbody", "tfoot" },
    /* td */
    { "td", "tbody" }, { "td", "td" }, { "td", "tfoot" }, { "td", "th" },
    { "td", "tr" },
    /* tfoot */
    { "tfoot", "tbody" },
    /* th */
    { "th", "tbody" }, { "th", "td" }, { "th", "tfoot" }, { "th", "th" },
    { "th", "tr" },
    /* thead */
    { "thead", "tbody" }, { "thead", "tfoot" },
    /* title */
    { "title", "body" }, { "title", "frameset" },
    /* tr */
    { "tr", "tbody" }, { "tr", "tfoot" }, { "tr", "tr" },
    /* tt */
    { "tt", "p" },
    /* u */
    { "u", "p" }, { "u", "td" }, { "u", "th" },
    /* ul */
    { "ul", "address" }, { "ul", "form" }, { "ul", "menu" },
    { "ul", "pre" },
    /* xmp */
    { "xmp", "dd" }, { "xmp", "dl" }, { "xmp", "dt" },
    { "xmp", "fieldset" }, { "xmp", "form" }, { "xmp", "li" },
    { "xmp", "table" }, { "xmp", "ul" }
};
1291

1292
/*
 * HTML elements that are not supposed to hold character data directly.
 * When text shows up while one of them is the current element,
 * htmlCheckParagraph() implies a p element first.
 *
 * The list is NULL-terminated.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html", "head",
    NULL
};
1304

1305
/*
 * The HTML attributes whose content is of type %Script;
 *
 * The array has no NULL terminator; htmlIsScriptAttribute() walks it using
 * its sizeof-derived length.
 * NOTE: when adding an attribute, check htmlIsScriptAttribute() since it
 *       assumes every name starts with 'on'.
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse events */
    "onclick", "ondblclick", "onmousedown", "onmouseup",
    "onmouseover", "onmousemove", "onmouseout",
    /* keyboard events */
    "onkeypress", "onkeydown", "onkeyup",
    /* document and form events */
    "onload", "onunload", "onfocus", "onblur",
    "onsubmit", "onreset", "onchange", "onselect"
};
1330

1331
/*
 * Priorities used by the HTML parser to cope with broken pages: an extra
 * end tag may only close open elements whose priority is lower than or
 * equal to its own (see htmlAutoCloseOnClose()).
 */

/* Association of an element name with its end tag priority. */
typedef struct {
    const char *name;       /* element name, NULL in the final entry */
    int priority;           /* end tag priority of that element */
} elementPriority;

/*
 * Scanned linearly by htmlGetEndPriority(); the last entry, whose name is
 * NULL, both terminates the scan and carries the default priority.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 }, { "th",    160 },
    { "tr",    170 },
    { "thead", 180 }, { "tbody", 180 }, { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 }, { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
1358

1359
/************************************************************************
1360
 *									*
1361
 *	functions to handle HTML specific data			*
1362
 *									*
1363
 ************************************************************************/
1364

1365
/**
 * htmlInitAutoClose:
 *
 * DEPRECATED: This is a no-op.
 *
 * The function body is empty; it takes no argument and returns nothing.
 */
void
htmlInitAutoClose(void)
{
    /* intentionally empty */
}
1373

1374
static int
1375
htmlCompareTags(const void *key, const void *member) {
1376
    const xmlChar *tag = (const xmlChar *) key;
1377
    const htmlElemDesc *desc = (const htmlElemDesc *) member;
1378

1379
    return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1380
}
1381

1382
/**
1383
 * htmlTagLookup:
1384
 * @tag:  The tag name in lowercase
1385
 *
1386
 * Lookup the HTML tag in the ElementTable
1387
 *
1388
 * Returns the related htmlElemDescPtr or NULL if not found.
1389
 */
1390
const htmlElemDesc *
1391
htmlTagLookup(const xmlChar *tag) {
1392
    if (tag == NULL)
1393
        return(NULL);
1394

1395
    return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1396
                sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1397
                sizeof(htmlElemDesc), htmlCompareTags));
1398
}
1399

1400
/**
1401
 * htmlGetEndPriority:
1402
 * @name: The name of the element to look up the priority for.
1403
 *
1404
 * Return value: The "endtag" priority.
1405
 **/
1406
static int
1407
htmlGetEndPriority (const xmlChar *name) {
1408
    int i = 0;
1409

1410
    while ((htmlEndPriority[i].name != NULL) &&
1411
	   (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1412
	i++;
1413

1414
    return(htmlEndPriority[i].priority);
1415
}
1416

1417

1418
static int
1419
htmlCompareStartClose(const void *vkey, const void *member) {
1420
    const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1421
    const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1422
    int ret;
1423

1424
    ret = strcmp(key->oldTag, entry->oldTag);
1425
    if (ret == 0)
1426
        ret = strcmp(key->newTag, entry->newTag);
1427

1428
    return(ret);
1429
}
1430

1431
/**
1432
 * htmlCheckAutoClose:
1433
 * @newtag:  The new tag name
1434
 * @oldtag:  The old tag name
1435
 *
1436
 * Checks whether the new tag is one of the registered valid tags for
1437
 * closing old.
1438
 *
1439
 * Returns 0 if no, 1 if yes.
1440
 */
1441
static int
1442
htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1443
{
1444
    htmlStartCloseEntry key;
1445
    void *res;
1446

1447
    key.oldTag = (const char *) oldtag;
1448
    key.newTag = (const char *) newtag;
1449
    res = bsearch(&key, htmlStartClose,
1450
            sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1451
            sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1452
    return(res != NULL);
1453
}
1454

1455
/**
1456
 * htmlAutoCloseOnClose:
1457
 * @ctxt:  an HTML parser context
1458
 * @newtag:  The new tag name
1459
 * @force:  force the tag closure
1460
 *
1461
 * The HTML DTD allows an ending tag to implicitly close other tags.
1462
 */
1463
static void
1464
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1465
{
1466
    const htmlElemDesc *info;
1467
    int i, priority;
1468

1469
    priority = htmlGetEndPriority(newtag);
1470

1471
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1472

1473
        if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1474
            break;
1475
        /*
1476
         * A misplaced endtag can only close elements with lower
1477
         * or equal priority, so if we find an element with higher
1478
         * priority before we find an element with
1479
         * matching name, we just ignore this endtag
1480
         */
1481
        if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1482
            return;
1483
    }
1484
    if (i < 0)
1485
        return;
1486

1487
    while (!xmlStrEqual(newtag, ctxt->name)) {
1488
        info = htmlTagLookup(ctxt->name);
1489
        if ((info != NULL) && (info->endTag == 3)) {
1490
            htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1491
	                 "Opening and ending tag mismatch: %s and %s\n",
1492
			 newtag, ctxt->name);
1493
        }
1494
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1495
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1496
	htmlnamePop(ctxt);
1497
    }
1498
}
1499

1500
/**
1501
 * htmlAutoCloseOnEnd:
1502
 * @ctxt:  an HTML parser context
1503
 *
1504
 * Close all remaining tags at the end of the stream
1505
 */
1506
static void
1507
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1508
{
1509
    int i;
1510

1511
    if (ctxt->nameNr == 0)
1512
        return;
1513
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1514
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1515
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1516
	htmlnamePop(ctxt);
1517
    }
1518
}
1519

1520
/**
1521
 * htmlAutoClose:
1522
 * @ctxt:  an HTML parser context
1523
 * @newtag:  The new tag name or NULL
1524
 *
1525
 * The HTML DTD allows a tag to implicitly close other tags.
1526
 * The list is kept in htmlStartClose array. This function is
1527
 * called when a new tag has been detected and generates the
1528
 * appropriates closes if possible/needed.
1529
 * If newtag is NULL this mean we are at the end of the resource
1530
 * and we should check
1531
 */
1532
static void
1533
htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1534
{
1535
    if (newtag == NULL)
1536
        return;
1537

1538
    while ((ctxt->name != NULL) &&
1539
           (htmlCheckAutoClose(newtag, ctxt->name))) {
1540
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1541
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1542
	htmlnamePop(ctxt);
1543
    }
1544
}
1545

1546
/**
1547
 * htmlAutoCloseTag:
1548
 * @doc:  the HTML document
1549
 * @name:  The tag name
1550
 * @elem:  the HTML element
1551
 *
1552
 * The HTML DTD allows a tag to implicitly close other tags.
1553
 * The list is kept in htmlStartClose array. This function checks
1554
 * if the element or one of it's children would autoclose the
1555
 * given tag.
1556
 *
1557
 * Returns 1 if autoclose, 0 otherwise
1558
 */
1559
int
1560
htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1561
    htmlNodePtr child;
1562

1563
    if (elem == NULL) return(1);
1564
    if (xmlStrEqual(name, elem->name)) return(0);
1565
    if (htmlCheckAutoClose(elem->name, name)) return(1);
1566
    child = elem->children;
1567
    while (child != NULL) {
1568
        if (htmlAutoCloseTag(doc, name, child)) return(1);
1569
	child = child->next;
1570
    }
1571
    return(0);
1572
}
1573

1574
/**
1575
 * htmlIsAutoClosed:
1576
 * @doc:  the HTML document
1577
 * @elem:  the HTML element
1578
 *
1579
 * The HTML DTD allows a tag to implicitly close other tags.
1580
 * The list is kept in htmlStartClose array. This function checks
1581
 * if a tag is autoclosed by one of it's child
1582
 *
1583
 * Returns 1 if autoclosed, 0 otherwise
1584
 */
1585
int
1586
htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1587
    htmlNodePtr child;
1588

1589
    if (elem == NULL) return(1);
1590
    child = elem->children;
1591
    while (child != NULL) {
1592
	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1593
	child = child->next;
1594
    }
1595
    return(0);
1596
}
1597

1598
/**
1599
 * htmlCheckImplied:
1600
 * @ctxt:  an HTML parser context
1601
 * @newtag:  The new tag name
1602
 *
1603
 * The HTML DTD allows a tag to exists only implicitly
1604
 * called when a new tag has been detected and generates the
1605
 * appropriates implicit tags if missing
1606
 */
1607
static void
1608
htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1609
    int i;
1610

1611
    if (ctxt->options & HTML_PARSE_NOIMPLIED)
1612
        return;
1613
    if (!htmlOmittedDefaultValue)
1614
	return;
1615
    if (xmlStrEqual(newtag, BAD_CAST"html"))
1616
	return;
1617
    if (ctxt->nameNr <= 0) {
1618
	htmlnamePush(ctxt, BAD_CAST"html");
1619
	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1620
	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1621
    }
1622
    if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1623
        return;
1624
    if ((ctxt->nameNr <= 1) &&
1625
        ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1626
	 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1627
	 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1628
	 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1629
	 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1630
	 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1631
        if (ctxt->html >= 3) {
1632
            /* we already saw or generated an <head> before */
1633
            return;
1634
        }
1635
        /*
1636
         * dropped OBJECT ... i you put it first BODY will be
1637
         * assumed !
1638
         */
1639
        htmlnamePush(ctxt, BAD_CAST"head");
1640
        if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1641
            ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1642
    } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1643
	       (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1644
	       (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1645
        if (ctxt->html >= 10) {
1646
            /* we already saw or generated a <body> before */
1647
            return;
1648
        }
1649
	for (i = 0;i < ctxt->nameNr;i++) {
1650
	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1651
		return;
1652
	    }
1653
	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1654
		return;
1655
	    }
1656
	}
1657

1658
	htmlnamePush(ctxt, BAD_CAST"body");
1659
	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1660
	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1661
    }
1662
}
1663

1664
/**
1665
 * htmlCheckParagraph
1666
 * @ctxt:  an HTML parser context
1667
 *
1668
 * Check whether a p element need to be implied before inserting
1669
 * characters in the current element.
1670
 *
1671
 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1672
 *         in case of error.
1673
 */
1674

1675
static int
1676
htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1677
    const xmlChar *tag;
1678
    int i;
1679

1680
    if (ctxt == NULL)
1681
	return(-1);
1682
    tag = ctxt->name;
1683
    if (tag == NULL) {
1684
	htmlAutoClose(ctxt, BAD_CAST"p");
1685
	htmlCheckImplied(ctxt, BAD_CAST"p");
1686
	htmlnamePush(ctxt, BAD_CAST"p");
1687
	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1688
	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1689
	return(1);
1690
    }
1691
    if (!htmlOmittedDefaultValue)
1692
	return(0);
1693
    for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1694
	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1695
	    htmlAutoClose(ctxt, BAD_CAST"p");
1696
	    htmlCheckImplied(ctxt, BAD_CAST"p");
1697
	    htmlnamePush(ctxt, BAD_CAST"p");
1698
	    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1699
		ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1700
	    return(1);
1701
	}
1702
    }
1703
    return(0);
1704
}
1705

1706
/**
1707
 * htmlIsScriptAttribute:
1708
 * @name:  an attribute name
1709
 *
1710
 * Check if an attribute is of content type Script
1711
 *
1712
 * Returns 1 is the attribute is a script 0 otherwise
1713
 */
1714
int
1715
htmlIsScriptAttribute(const xmlChar *name) {
1716
    unsigned int i;
1717

1718
    if (name == NULL)
1719
      return(0);
1720
    /*
1721
     * all script attributes start with 'on'
1722
     */
1723
    if ((name[0] != 'o') || (name[1] != 'n'))
1724
      return(0);
1725
    for (i = 0;
1726
	 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1727
	 i++) {
1728
	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1729
	    return(1);
1730
    }
1731
    return(0);
1732
}
1733

1734
/************************************************************************
1735
 *									*
1736
 *	The list of HTML predefined entities			*
1737
 *									*
1738
 ************************************************************************/
1739

1740

1741
/*
 * html40EntitiesTable:
 *
 * The predefined HTML entities.  Each entry is
 *     { value, name, description string }
 * where value is the code point the entity stands for and name is the
 * entity name without the leading '&' and trailing ';'.
 *
 * The entries are listed in ascending order of value.  That order must
 * be preserved when adding entries: htmlEntityValueLookup() stops
 * scanning as soon as it meets an entry whose value is greater than the
 * one it is looking for.
 */
static const htmlEntityDesc  html40EntitiesTable[] = {
1742
/*
1743
 * the 4 absolute ones, plus apostrophe.
1744
 */
1745
{ 34,	"quot",	"quotation mark = APL quote, U+0022 ISOnum" },
1746
{ 38,	"amp",	"ampersand, U+0026 ISOnum" },
1747
{ 39,	"apos",	"single quote" },
1748
{ 60,	"lt",	"less-than sign, U+003C ISOnum" },
1749
{ 62,	"gt",	"greater-than sign, U+003E ISOnum" },
1750

1751
/*
1752
 * A bunch still in the 128-255 range
1753
 * Replacing them really depends on the charset used.
1754
 */
1755
{ 160,	"nbsp",	"no-break space = non-breaking space, U+00A0 ISOnum" },
1756
{ 161,	"iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1757
{ 162,	"cent",	"cent sign, U+00A2 ISOnum" },
1758
{ 163,	"pound","pound sign, U+00A3 ISOnum" },
1759
{ 164,	"curren","currency sign, U+00A4 ISOnum" },
1760
{ 165,	"yen",	"yen sign = yuan sign, U+00A5 ISOnum" },
1761
{ 166,	"brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1762
{ 167,	"sect",	"section sign, U+00A7 ISOnum" },
1763
{ 168,	"uml",	"diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1764
{ 169,	"copy",	"copyright sign, U+00A9 ISOnum" },
1765
{ 170,	"ordf",	"feminine ordinal indicator, U+00AA ISOnum" },
1766
{ 171,	"laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1767
{ 172,	"not",	"not sign, U+00AC ISOnum" },
1768
{ 173,	"shy",	"soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1769
{ 174,	"reg",	"registered sign = registered trade mark sign, U+00AE ISOnum" },
1770
{ 175,	"macr",	"macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1771
{ 176,	"deg",	"degree sign, U+00B0 ISOnum" },
1772
{ 177,	"plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1773
{ 178,	"sup2",	"superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1774
{ 179,	"sup3",	"superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1775
{ 180,	"acute","acute accent = spacing acute, U+00B4 ISOdia" },
1776
{ 181,	"micro","micro sign, U+00B5 ISOnum" },
1777
{ 182,	"para",	"pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1778
{ 183,	"middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1779
{ 184,	"cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1780
{ 185,	"sup1",	"superscript one = superscript digit one, U+00B9 ISOnum" },
1781
{ 186,	"ordm",	"masculine ordinal indicator, U+00BA ISOnum" },
1782
{ 187,	"raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1783
{ 188,	"frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1784
{ 189,	"frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1785
{ 190,	"frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1786
{ 191,	"iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1787
{ 192,	"Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1788
{ 193,	"Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1789
{ 194,	"Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1790
{ 195,	"Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1791
{ 196,	"Auml",	"latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1792
{ 197,	"Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1793
{ 198,	"AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1794
{ 199,	"Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1795
{ 200,	"Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1796
{ 201,	"Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1797
{ 202,	"Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1798
{ 203,	"Euml",	"latin capital letter E with diaeresis, U+00CB ISOlat1" },
1799
{ 204,	"Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1800
{ 205,	"Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1801
{ 206,	"Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1802
{ 207,	"Iuml",	"latin capital letter I with diaeresis, U+00CF ISOlat1" },
1803
{ 208,	"ETH",	"latin capital letter ETH, U+00D0 ISOlat1" },
1804
{ 209,	"Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1805
{ 210,	"Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1806
{ 211,	"Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1807
{ 212,	"Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1808
{ 213,	"Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1809
{ 214,	"Ouml",	"latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1810
{ 215,	"times","multiplication sign, U+00D7 ISOnum" },
1811
{ 216,	"Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1812
{ 217,	"Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1813
{ 218,	"Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1814
{ 219,	"Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1815
{ 220,	"Uuml",	"latin capital letter U with diaeresis, U+00DC ISOlat1" },
1816
{ 221,	"Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1817
{ 222,	"THORN","latin capital letter THORN, U+00DE ISOlat1" },
1818
{ 223,	"szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1819
{ 224,	"agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1820
{ 225,	"aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1821
{ 226,	"acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1822
{ 227,	"atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1823
{ 228,	"auml",	"latin small letter a with diaeresis, U+00E4 ISOlat1" },
1824
{ 229,	"aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1825
{ 230,	"aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1826
{ 231,	"ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1827
{ 232,	"egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1828
{ 233,	"eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1829
{ 234,	"ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1830
{ 235,	"euml",	"latin small letter e with diaeresis, U+00EB ISOlat1" },
1831
{ 236,	"igrave","latin small letter i with grave, U+00EC ISOlat1" },
1832
{ 237,	"iacute","latin small letter i with acute, U+00ED ISOlat1" },
1833
{ 238,	"icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1834
{ 239,	"iuml",	"latin small letter i with diaeresis, U+00EF ISOlat1" },
1835
{ 240,	"eth",	"latin small letter eth, U+00F0 ISOlat1" },
1836
{ 241,	"ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1837
{ 242,	"ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1838
{ 243,	"oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1839
{ 244,	"ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1840
{ 245,	"otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1841
{ 246,	"ouml",	"latin small letter o with diaeresis, U+00F6 ISOlat1" },
1842
{ 247,	"divide","division sign, U+00F7 ISOnum" },
1843
{ 248,	"oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1844
{ 249,	"ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1845
{ 250,	"uacute","latin small letter u with acute, U+00FA ISOlat1" },
1846
{ 251,	"ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1847
{ 252,	"uuml",	"latin small letter u with diaeresis, U+00FC ISOlat1" },
1848
{ 253,	"yacute","latin small letter y with acute, U+00FD ISOlat1" },
1849
{ 254,	"thorn","latin small letter thorn with, U+00FE ISOlat1" },
1850
{ 255,	"yuml",	"latin small letter y with diaeresis, U+00FF ISOlat1" },
1851

1852
{ 338,	"OElig","latin capital ligature OE, U+0152 ISOlat2" },
1853
{ 339,	"oelig","latin small ligature oe, U+0153 ISOlat2" },
1854
{ 352,	"Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1855
{ 353,	"scaron","latin small letter s with caron, U+0161 ISOlat2" },
1856
{ 376,	"Yuml",	"latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1857

1858
/*
1859
 * Anything below should really be kept as entity references
1860
 */
1861
{ 402,	"fnof",	"latin small f with hook = function = florin, U+0192 ISOtech" },
1862

1863
{ 710,	"circ",	"modifier letter circumflex accent, U+02C6 ISOpub" },
1864
{ 732,	"tilde","small tilde, U+02DC ISOdia" },
1865

1866
{ 913,	"Alpha","greek capital letter alpha, U+0391" },
1867
{ 914,	"Beta",	"greek capital letter beta, U+0392" },
1868
{ 915,	"Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1869
{ 916,	"Delta","greek capital letter delta, U+0394 ISOgrk3" },
1870
{ 917,	"Epsilon","greek capital letter epsilon, U+0395" },
1871
{ 918,	"Zeta",	"greek capital letter zeta, U+0396" },
1872
{ 919,	"Eta",	"greek capital letter eta, U+0397" },
1873
{ 920,	"Theta","greek capital letter theta, U+0398 ISOgrk3" },
1874
{ 921,	"Iota",	"greek capital letter iota, U+0399" },
1875
{ 922,	"Kappa","greek capital letter kappa, U+039A" },
1876
{ 923,	"Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1877
{ 924,	"Mu",	"greek capital letter mu, U+039C" },
1878
{ 925,	"Nu",	"greek capital letter nu, U+039D" },
1879
{ 926,	"Xi",	"greek capital letter xi, U+039E ISOgrk3" },
1880
{ 927,	"Omicron","greek capital letter omicron, U+039F" },
1881
{ 928,	"Pi",	"greek capital letter pi, U+03A0 ISOgrk3" },
1882
{ 929,	"Rho",	"greek capital letter rho, U+03A1" },
1883
{ 931,	"Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1884
{ 932,	"Tau",	"greek capital letter tau, U+03A4" },
1885
{ 933,	"Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1886
{ 934,	"Phi",	"greek capital letter phi, U+03A6 ISOgrk3" },
1887
{ 935,	"Chi",	"greek capital letter chi, U+03A7" },
1888
{ 936,	"Psi",	"greek capital letter psi, U+03A8 ISOgrk3" },
1889
{ 937,	"Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1890

1891
{ 945,	"alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1892
{ 946,	"beta",	"greek small letter beta, U+03B2 ISOgrk3" },
1893
{ 947,	"gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1894
{ 948,	"delta","greek small letter delta, U+03B4 ISOgrk3" },
1895
{ 949,	"epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1896
{ 950,	"zeta",	"greek small letter zeta, U+03B6 ISOgrk3" },
1897
{ 951,	"eta",	"greek small letter eta, U+03B7 ISOgrk3" },
1898
{ 952,	"theta","greek small letter theta, U+03B8 ISOgrk3" },
1899
{ 953,	"iota",	"greek small letter iota, U+03B9 ISOgrk3" },
1900
{ 954,	"kappa","greek small letter kappa, U+03BA ISOgrk3" },
1901
{ 955,	"lambda","greek small letter lambda, U+03BB ISOgrk3" },
1902
{ 956,	"mu",	"greek small letter mu, U+03BC ISOgrk3" },
1903
{ 957,	"nu",	"greek small letter nu, U+03BD ISOgrk3" },
1904
{ 958,	"xi",	"greek small letter xi, U+03BE ISOgrk3" },
1905
{ 959,	"omicron","greek small letter omicron, U+03BF NEW" },
1906
{ 960,	"pi",	"greek small letter pi, U+03C0 ISOgrk3" },
1907
{ 961,	"rho",	"greek small letter rho, U+03C1 ISOgrk3" },
1908
{ 962,	"sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1909
{ 963,	"sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1910
{ 964,	"tau",	"greek small letter tau, U+03C4 ISOgrk3" },
1911
{ 965,	"upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1912
{ 966,	"phi",	"greek small letter phi, U+03C6 ISOgrk3" },
1913
{ 967,	"chi",	"greek small letter chi, U+03C7 ISOgrk3" },
1914
{ 968,	"psi",	"greek small letter psi, U+03C8 ISOgrk3" },
1915
{ 969,	"omega","greek small letter omega, U+03C9 ISOgrk3" },
1916
{ 977,	"thetasym","greek small letter theta symbol, U+03D1 NEW" },
1917
{ 978,	"upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1918
{ 982,	"piv",	"greek pi symbol, U+03D6 ISOgrk3" },
1919

1920
{ 8194,	"ensp",	"en space, U+2002 ISOpub" },
1921
{ 8195,	"emsp",	"em space, U+2003 ISOpub" },
1922
{ 8201,	"thinsp","thin space, U+2009 ISOpub" },
1923
{ 8204,	"zwnj",	"zero width non-joiner, U+200C NEW RFC 2070" },
1924
{ 8205,	"zwj",	"zero width joiner, U+200D NEW RFC 2070" },
1925
{ 8206,	"lrm",	"left-to-right mark, U+200E NEW RFC 2070" },
1926
{ 8207,	"rlm",	"right-to-left mark, U+200F NEW RFC 2070" },
1927
{ 8211,	"ndash","en dash, U+2013 ISOpub" },
1928
{ 8212,	"mdash","em dash, U+2014 ISOpub" },
1929
{ 8216,	"lsquo","left single quotation mark, U+2018 ISOnum" },
1930
{ 8217,	"rsquo","right single quotation mark, U+2019 ISOnum" },
1931
{ 8218,	"sbquo","single low-9 quotation mark, U+201A NEW" },
1932
{ 8220,	"ldquo","left double quotation mark, U+201C ISOnum" },
1933
{ 8221,	"rdquo","right double quotation mark, U+201D ISOnum" },
1934
{ 8222,	"bdquo","double low-9 quotation mark, U+201E NEW" },
1935
{ 8224,	"dagger","dagger, U+2020 ISOpub" },
1936
{ 8225,	"Dagger","double dagger, U+2021 ISOpub" },
1937

1938
{ 8226,	"bull",	"bullet = black small circle, U+2022 ISOpub" },
1939
{ 8230,	"hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1940

1941
{ 8240,	"permil","per mille sign, U+2030 ISOtech" },
1942

1943
{ 8242,	"prime","prime = minutes = feet, U+2032 ISOtech" },
1944
{ 8243,	"Prime","double prime = seconds = inches, U+2033 ISOtech" },
1945

1946
{ 8249,	"lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1947
{ 8250,	"rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1948

1949
{ 8254,	"oline","overline = spacing overscore, U+203E NEW" },
1950
{ 8260,	"frasl","fraction slash, U+2044 NEW" },
1951

1952
{ 8364,	"euro",	"euro sign, U+20AC NEW" },
1953

1954
{ 8465,	"image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1955
{ 8472,	"weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1956
{ 8476,	"real",	"blackletter capital R = real part symbol, U+211C ISOamso" },
1957
{ 8482,	"trade","trade mark sign, U+2122 ISOnum" },
1958
{ 8501,	"alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1959
{ 8592,	"larr",	"leftwards arrow, U+2190 ISOnum" },
1960
{ 8593,	"uarr",	"upwards arrow, U+2191 ISOnum" },
1961
{ 8594,	"rarr",	"rightwards arrow, U+2192 ISOnum" },
1962
{ 8595,	"darr",	"downwards arrow, U+2193 ISOnum" },
1963
{ 8596,	"harr",	"left right arrow, U+2194 ISOamsa" },
1964
{ 8629,	"crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1965
{ 8656,	"lArr",	"leftwards double arrow, U+21D0 ISOtech" },
1966
{ 8657,	"uArr",	"upwards double arrow, U+21D1 ISOamsa" },
1967
{ 8658,	"rArr",	"rightwards double arrow, U+21D2 ISOtech" },
1968
{ 8659,	"dArr",	"downwards double arrow, U+21D3 ISOamsa" },
1969
{ 8660,	"hArr",	"left right double arrow, U+21D4 ISOamsa" },
1970

1971
{ 8704,	"forall","for all, U+2200 ISOtech" },
1972
{ 8706,	"part",	"partial differential, U+2202 ISOtech" },
1973
{ 8707,	"exist","there exists, U+2203 ISOtech" },
1974
{ 8709,	"empty","empty set = null set = diameter, U+2205 ISOamso" },
1975
{ 8711,	"nabla","nabla = backward difference, U+2207 ISOtech" },
1976
{ 8712,	"isin",	"element of, U+2208 ISOtech" },
1977
{ 8713,	"notin","not an element of, U+2209 ISOtech" },
1978
{ 8715,	"ni",	"contains as member, U+220B ISOtech" },
1979
{ 8719,	"prod",	"n-ary product = product sign, U+220F ISOamsb" },
1980
{ 8721,	"sum",	"n-ary summation, U+2211 ISOamsb" },
1981
{ 8722,	"minus","minus sign, U+2212 ISOtech" },
1982
{ 8727,	"lowast","asterisk operator, U+2217 ISOtech" },
1983
{ 8730,	"radic","square root = radical sign, U+221A ISOtech" },
1984
{ 8733,	"prop",	"proportional to, U+221D ISOtech" },
1985
{ 8734,	"infin","infinity, U+221E ISOtech" },
1986
{ 8736,	"ang",	"angle, U+2220 ISOamso" },
1987
{ 8743,	"and",	"logical and = wedge, U+2227 ISOtech" },
1988
{ 8744,	"or",	"logical or = vee, U+2228 ISOtech" },
1989
{ 8745,	"cap",	"intersection = cap, U+2229 ISOtech" },
1990
{ 8746,	"cup",	"union = cup, U+222A ISOtech" },
1991
{ 8747,	"int",	"integral, U+222B ISOtech" },
1992
{ 8756,	"there4","therefore, U+2234 ISOtech" },
1993
{ 8764,	"sim",	"tilde operator = varies with = similar to, U+223C ISOtech" },
1994
{ 8773,	"cong",	"approximately equal to, U+2245 ISOtech" },
1995
{ 8776,	"asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1996
{ 8800,	"ne",	"not equal to, U+2260 ISOtech" },
1997
{ 8801,	"equiv","identical to, U+2261 ISOtech" },
1998
{ 8804,	"le",	"less-than or equal to, U+2264 ISOtech" },
1999
{ 8805,	"ge",	"greater-than or equal to, U+2265 ISOtech" },
2000
{ 8834,	"sub",	"subset of, U+2282 ISOtech" },
2001
{ 8835,	"sup",	"superset of, U+2283 ISOtech" },
2002
{ 8836,	"nsub",	"not a subset of, U+2284 ISOamsn" },
2003
{ 8838,	"sube",	"subset of or equal to, U+2286 ISOtech" },
2004
{ 8839,	"supe",	"superset of or equal to, U+2287 ISOtech" },
2005
{ 8853,	"oplus","circled plus = direct sum, U+2295 ISOamsb" },
2006
{ 8855,	"otimes","circled times = vector product, U+2297 ISOamsb" },
2007
{ 8869,	"perp",	"up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
2008
{ 8901,	"sdot",	"dot operator, U+22C5 ISOamsb" },
2009
{ 8968,	"lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
2010
{ 8969,	"rceil","right ceiling, U+2309 ISOamsc" },
2011
{ 8970,	"lfloor","left floor = apl downstile, U+230A ISOamsc" },
2012
{ 8971,	"rfloor","right floor, U+230B ISOamsc" },
2013
{ 9001,	"lang",	"left-pointing angle bracket = bra, U+2329 ISOtech" },
2014
{ 9002,	"rang",	"right-pointing angle bracket = ket, U+232A ISOtech" },
2015
{ 9674,	"loz",	"lozenge, U+25CA ISOpub" },
2016

2017
{ 9824,	"spades","black spade suit, U+2660 ISOpub" },
2018
{ 9827,	"clubs","black club suit = shamrock, U+2663 ISOpub" },
2019
{ 9829,	"hearts","black heart suit = valentine, U+2665 ISOpub" },
2020
{ 9830,	"diams","black diamond suit, U+2666 ISOpub" },
2021

2022
};
2023

2024
/************************************************************************
2025
 *									*
2026
 *		Commodity functions to handle entities			*
2027
 *									*
2028
 ************************************************************************/
2029

2030
/*
 * growBuffer:
 * @buffer:  name of the variable holding the buffer to enlarge
 *
 * Macro used to grow the current buffer: it doubles the companion
 * variable <buffer>_size and reallocates @buffer to that size with
 * xmlRealloc().
 *
 * If the reallocation fails, the error is reported with htmlErrMemory(),
 * the old buffer is released with xmlFree() and the function in which
 * the macro is expanded returns NULL.
 *
 * The expansion site must therefore have both `ctxt` and
 * `<buffer>_size` in scope.  The expansion is a plain brace block, not
 * a do { } while (0) statement.
 *
 * NOTE(review): the doubling of <buffer>_size is not checked for
 * overflow here -- confirm that the call sites bound the size.
 */
2033
#define growBuffer(buffer) {						\
2034
    xmlChar *tmp;							\
2035
    buffer##_size *= 2;							\
2036
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size); 		\
2037
    if (tmp == NULL) {							\
2038
	htmlErrMemory(ctxt, "growing buffer\n");			\
2039
	xmlFree(buffer);						\
2040
	return(NULL);							\
2041
    }									\
2042
    buffer = tmp;							\
2043
}
2044

2045
/**
2046
 * htmlEntityLookup:
2047
 * @name: the entity name
2048
 *
2049
 * Lookup the given entity in EntitiesTable
2050
 *
2051
 * TODO: the linear scan is really ugly, an hash table is really needed.
2052
 *
2053
 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2054
 */
2055
const htmlEntityDesc *
2056
htmlEntityLookup(const xmlChar *name) {
2057
    unsigned int i;
2058

2059
    for (i = 0;i < (sizeof(html40EntitiesTable)/
2060
                    sizeof(html40EntitiesTable[0]));i++) {
2061
        if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
2062
            return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2063
	}
2064
    }
2065
    return(NULL);
2066
}
2067

2068
/**
2069
 * htmlEntityValueLookup:
2070
 * @value: the entity's unicode value
2071
 *
2072
 * Lookup the given entity in EntitiesTable
2073
 *
2074
 * TODO: the linear scan is really ugly, an hash table is really needed.
2075
 *
2076
 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2077
 */
2078
const htmlEntityDesc *
2079
htmlEntityValueLookup(unsigned int value) {
2080
    unsigned int i;
2081

2082
    for (i = 0;i < (sizeof(html40EntitiesTable)/
2083
                    sizeof(html40EntitiesTable[0]));i++) {
2084
        if (html40EntitiesTable[i].value >= value) {
2085
	    if (html40EntitiesTable[i].value > value)
2086
		break;
2087
            return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2088
	}
2089
    }
2090
    return(NULL);
2091
}
2092

2093
/**
2094
 * UTF8ToHtml:
2095
 * @out:  a pointer to an array of bytes to store the result
2096
 * @outlen:  the length of @out
2097
 * @in:  a pointer to an array of UTF-8 chars
2098
 * @inlen:  the length of @in
2099
 *
2100
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2101
 * plus HTML entities block of chars out.
2102
 *
2103
 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2104
 * The value of @inlen after return is the number of octets consumed
2105
 *     as the return value is positive, else unpredictable.
2106
 * The value of @outlen after return is the number of octets consumed.
2107
 */
2108
int
2109
UTF8ToHtml(unsigned char* out, int *outlen,
2110
              const unsigned char* in, int *inlen) {
2111
    const unsigned char* processed = in;
2112
    const unsigned char* outend;
2113
    const unsigned char* outstart = out;
2114
    const unsigned char* instart = in;
2115
    const unsigned char* inend;
2116
    unsigned int c, d;
2117
    int trailing;
2118

2119
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2120
    if (in == NULL) {
2121
        /*
2122
	 * initialization nothing to do
2123
	 */
2124
	*outlen = 0;
2125
	*inlen = 0;
2126
	return(0);
2127
    }
2128
    inend = in + (*inlen);
2129
    outend = out + (*outlen);
2130
    while (in < inend) {
2131
	d = *in++;
2132
	if      (d < 0x80)  { c= d; trailing= 0; }
2133
	else if (d < 0xC0) {
2134
	    /* trailing byte in leading position */
2135
	    *outlen = out - outstart;
2136
	    *inlen = processed - instart;
2137
	    return(-2);
2138
        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2139
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2140
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2141
	else {
2142
	    /* no chance for this in Ascii */
2143
	    *outlen = out - outstart;
2144
	    *inlen = processed - instart;
2145
	    return(-2);
2146
	}
2147

2148
	if (inend - in < trailing) {
2149
	    break;
2150
	}
2151

2152
	for ( ; trailing; trailing--) {
2153
	    if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2154
		break;
2155
	    c <<= 6;
2156
	    c |= d & 0x3F;
2157
	}
2158

2159
	/* assertion: c is a single UTF-4 value */
2160
	if (c < 0x80) {
2161
	    if (out + 1 >= outend)
2162
		break;
2163
	    *out++ = c;
2164
	} else {
2165
	    int len;
2166
	    const htmlEntityDesc * ent;
2167
	    const char *cp;
2168
	    char nbuf[16];
2169

2170
	    /*
2171
	     * Try to lookup a predefined HTML entity for it
2172
	     */
2173

2174
	    ent = htmlEntityValueLookup(c);
2175
	    if (ent == NULL) {
2176
	      snprintf(nbuf, sizeof(nbuf), "#%u", c);
2177
	      cp = nbuf;
2178
	    }
2179
	    else
2180
	      cp = ent->name;
2181
	    len = strlen(cp);
2182
	    if (out + 2 + len >= outend)
2183
		break;
2184
	    *out++ = '&';
2185
	    memcpy(out, cp, len);
2186
	    out += len;
2187
	    *out++ = ';';
2188
	}
2189
	processed = in;
2190
    }
2191
    *outlen = out - outstart;
2192
    *inlen = processed - instart;
2193
    return(0);
2194
}
2195

2196
/**
2197
 * htmlEncodeEntities:
2198
 * @out:  a pointer to an array of bytes to store the result
2199
 * @outlen:  the length of @out
2200
 * @in:  a pointer to an array of UTF-8 chars
2201
 * @inlen:  the length of @in
2202
 * @quoteChar: the quote character to escape (' or ") or zero.
2203
 *
2204
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2205
 * plus HTML entities block of chars out.
2206
 *
2207
 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2208
 * The value of @inlen after return is the number of octets consumed
2209
 *     as the return value is positive, else unpredictable.
2210
 * The value of @outlen after return is the number of octets consumed.
2211
 */
2212
int
2213
htmlEncodeEntities(unsigned char* out, int *outlen,
2214
		   const unsigned char* in, int *inlen, int quoteChar) {
2215
    const unsigned char* processed = in;
2216
    const unsigned char* outend;
2217
    const unsigned char* outstart = out;
2218
    const unsigned char* instart = in;
2219
    const unsigned char* inend;
2220
    unsigned int c, d;
2221
    int trailing;
2222

2223
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2224
        return(-1);
2225
    outend = out + (*outlen);
2226
    inend = in + (*inlen);
2227
    while (in < inend) {
2228
	d = *in++;
2229
	if      (d < 0x80)  { c= d; trailing= 0; }
2230
	else if (d < 0xC0) {
2231
	    /* trailing byte in leading position */
2232
	    *outlen = out - outstart;
2233
	    *inlen = processed - instart;
2234
	    return(-2);
2235
        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2236
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2237
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2238
	else {
2239
	    /* no chance for this in Ascii */
2240
	    *outlen = out - outstart;
2241
	    *inlen = processed - instart;
2242
	    return(-2);
2243
	}
2244

2245
	if (inend - in < trailing)
2246
	    break;
2247

2248
	while (trailing--) {
2249
	    if (((d= *in++) & 0xC0) != 0x80) {
2250
		*outlen = out - outstart;
2251
		*inlen = processed - instart;
2252
		return(-2);
2253
	    }
2254
	    c <<= 6;
2255
	    c |= d & 0x3F;
2256
	}
2257

2258
	/* assertion: c is a single UTF-4 value */
2259
	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2260
	    (c != '&') && (c != '<') && (c != '>')) {
2261
	    if (out >= outend)
2262
		break;
2263
	    *out++ = c;
2264
	} else {
2265
	    const htmlEntityDesc * ent;
2266
	    const char *cp;
2267
	    char nbuf[16];
2268
	    int len;
2269

2270
	    /*
2271
	     * Try to lookup a predefined HTML entity for it
2272
	     */
2273
	    ent = htmlEntityValueLookup(c);
2274
	    if (ent == NULL) {
2275
		snprintf(nbuf, sizeof(nbuf), "#%u", c);
2276
		cp = nbuf;
2277
	    }
2278
	    else
2279
		cp = ent->name;
2280
	    len = strlen(cp);
2281
	    if (outend - out < len + 2)
2282
		break;
2283
	    *out++ = '&';
2284
	    memcpy(out, cp, len);
2285
	    out += len;
2286
	    *out++ = ';';
2287
	}
2288
	processed = in;
2289
    }
2290
    *outlen = out - outstart;
2291
    *inlen = processed - instart;
2292
    return(0);
2293
}
2294

2295
/************************************************************************
2296
 *									*
2297
 *		Commodity functions to handle streams			*
2298
 *									*
2299
 ************************************************************************/
2300

2301
#ifdef LIBXML_PUSH_ENABLED
2302
/**
2303
 * htmlNewInputStream:
2304
 * @ctxt:  an HTML parser context
2305
 *
2306
 * Create a new input stream structure
2307
 * Returns the new input stream or NULL
2308
 */
2309
static htmlParserInputPtr
2310
htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2311
    htmlParserInputPtr input;
2312

2313
    input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2314
    if (input == NULL) {
2315
        htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2316
	return(NULL);
2317
    }
2318
    memset(input, 0, sizeof(htmlParserInput));
2319
    input->filename = NULL;
2320
    input->directory = NULL;
2321
    input->base = NULL;
2322
    input->cur = NULL;
2323
    input->buf = NULL;
2324
    input->line = 1;
2325
    input->col = 1;
2326
    input->buf = NULL;
2327
    input->free = NULL;
2328
    input->version = NULL;
2329
    input->consumed = 0;
2330
    input->length = 0;
2331
    return(input);
2332
}
2333
#endif
2334

2335

2336
/************************************************************************
2337
 *									*
2338
 *		Commodity functions, cleanup needed ?			*
2339
 *									*
2340
 ************************************************************************/
2341
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility
 *
 * Both the strings and the pointers to them are const, so the whole
 * table is read-only.
 */
static const char *const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
2355

2356
/**
2357
 * areBlanks:
2358
 * @ctxt:  an HTML parser context
2359
 * @str:  a xmlChar *
2360
 * @len:  the size of @str
2361
 *
2362
 * Is this a sequence of blank chars that one can ignore ?
2363
 *
2364
 * Returns 1 if ignorable 0 otherwise.
2365
 */
2366

2367
static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2368
    unsigned int i;
2369
    int j;
2370
    xmlNodePtr lastChild;
2371
    xmlDtdPtr dtd;
2372

2373
    for (j = 0;j < len;j++)
2374
        if (!(IS_BLANK_CH(str[j]))) return(0);
2375

2376
    if (CUR == 0) return(1);
2377
    if (CUR != '<') return(0);
2378
    if (ctxt->name == NULL)
2379
	return(1);
2380
    if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2381
	return(1);
2382
    if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2383
	return(1);
2384

2385
    /* Only strip CDATA children of the body tag for strict HTML DTDs */
2386
    if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2387
        dtd = xmlGetIntSubset(ctxt->myDoc);
2388
        if (dtd != NULL && dtd->ExternalID != NULL) {
2389
            if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2390
                    !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2391
                return(1);
2392
        }
2393
    }
2394

2395
    if (ctxt->node == NULL) return(0);
2396
    lastChild = xmlGetLastChild(ctxt->node);
2397
    while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2398
	lastChild = lastChild->prev;
2399
    if (lastChild == NULL) {
2400
        if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2401
            (ctxt->node->content != NULL)) return(0);
2402
	/* keep ws in constructs like ...<b> </b>...
2403
	   for all tags "b" allowing PCDATA */
2404
	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2405
	    if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2406
		return(0);
2407
	    }
2408
	}
2409
    } else if (xmlNodeIsText(lastChild)) {
2410
        return(0);
2411
    } else {
2412
	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2413
	   for all tags "p" allowing PCDATA */
2414
	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2415
	    if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2416
		return(0);
2417
	    }
2418
	}
2419
    }
2420
    return(1);
2421
}
2422

2423
/**
2424
 * htmlNewDocNoDtD:
2425
 * @URI:  URI for the dtd, or NULL
2426
 * @ExternalID:  the external ID of the DTD, or NULL
2427
 *
2428
 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2429
 * are NULL
2430
 *
2431
 * Returns a new document, do not initialize the DTD if not provided
2432
 */
2433
htmlDocPtr
2434
htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2435
    xmlDocPtr cur;
2436

2437
    /*
2438
     * Allocate a new document and fill the fields.
2439
     */
2440
    cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2441
    if (cur == NULL) {
2442
	htmlErrMemory(NULL, "HTML document creation failed\n");
2443
	return(NULL);
2444
    }
2445
    memset(cur, 0, sizeof(xmlDoc));
2446

2447
    cur->type = XML_HTML_DOCUMENT_NODE;
2448
    cur->version = NULL;
2449
    cur->intSubset = NULL;
2450
    cur->doc = cur;
2451
    cur->name = NULL;
2452
    cur->children = NULL;
2453
    cur->extSubset = NULL;
2454
    cur->oldNs = NULL;
2455
    cur->encoding = NULL;
2456
    cur->standalone = 1;
2457
    cur->compression = 0;
2458
    cur->ids = NULL;
2459
    cur->refs = NULL;
2460
    cur->_private = NULL;
2461
    cur->charset = XML_CHAR_ENCODING_UTF8;
2462
    cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2463
    if ((ExternalID != NULL) ||
2464
	(URI != NULL))
2465
	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2466
    if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2467
	xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2468
    return(cur);
2469
}
2470

2471
/**
2472
 * htmlNewDoc:
2473
 * @URI:  URI for the dtd, or NULL
2474
 * @ExternalID:  the external ID of the DTD, or NULL
2475
 *
2476
 * Creates a new HTML document
2477
 *
2478
 * Returns a new document
2479
 */
2480
htmlDocPtr
2481
htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2482
    if ((URI == NULL) && (ExternalID == NULL))
2483
	return(htmlNewDocNoDtD(
2484
		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2485
		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2486

2487
    return(htmlNewDocNoDtD(URI, ExternalID));
2488
}
2489

2490

2491
/************************************************************************
2492
 *									*
2493
 *			The parser itself				*
2494
 *	Relates to http://www.w3.org/TR/html40				*
2495
 *									*
2496
 ************************************************************************/
2497

2498
/************************************************************************
2499
 *									*
2500
 *			The parser itself				*
2501
 *									*
2502
 ************************************************************************/
2503

2504
static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2505

2506
static void
2507
htmlSkipBogusComment(htmlParserCtxtPtr ctxt) {
2508
    int c;
2509

2510
    htmlParseErr(ctxt, XML_HTML_INCORRECTLY_OPENED_COMMENT,
2511
                 "Incorrectly opened comment\n", NULL, NULL);
2512

2513
    do {
2514
        c = CUR;
2515
        if (c == 0)
2516
            break;
2517
        NEXT;
2518
    } while (c != '>');
2519
}
2520

2521
/**
2522
 * htmlParseHTMLName:
2523
 * @ctxt:  an HTML parser context
2524
 *
2525
 * parse an HTML tag or attribute name, note that we convert it to lowercase
2526
 * since HTML names are not case-sensitive.
2527
 *
2528
 * Returns the Tag Name parsed or NULL
2529
 */
2530

2531
static const xmlChar *
2532
htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2533
    const xmlChar *ret;
2534
    int i = 0;
2535
    xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2536

2537
    if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2538
        (CUR != ':') && (CUR != '.')) return(NULL);
2539

2540
    while ((i < HTML_PARSER_BUFFER_SIZE) &&
2541
           ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2542
	   (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2543
           (CUR == '.'))) {
2544
	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2545
        else loc[i] = CUR;
2546
	i++;
2547

2548
	NEXT;
2549
    }
2550

2551
    ret = xmlDictLookup(ctxt->dict, loc, i);
2552
    if (ret == NULL)
2553
        htmlErrMemory(ctxt, NULL);
2554

2555
    return(ret);
2556
}
2557

2558

2559
/**
2560
 * htmlParseHTMLName_nonInvasive:
2561
 * @ctxt:  an HTML parser context
2562
 *
2563
 * parse an HTML tag or attribute name, note that we convert it to lowercase
2564
 * since HTML names are not case-sensitive, this doesn't consume the data
2565
 * from the stream, it's a look-ahead
2566
 *
2567
 * Returns the Tag Name parsed or NULL
2568
 */
2569

2570
static const xmlChar *
2571
htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2572
    int i = 0;
2573
    xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2574

2575
    if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2576
        (NXT(1) != ':')) return(NULL);
2577

2578
    while ((i < HTML_PARSER_BUFFER_SIZE) &&
2579
           ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2580
	   (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2581
	if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2582
        else loc[i] = NXT(1+i);
2583
	i++;
2584
    }
2585

2586
    return(xmlDictLookup(ctxt->dict, loc, i));
2587
}
2588

2589

2590
/**
2591
 * htmlParseName:
2592
 * @ctxt:  an HTML parser context
2593
 *
2594
 * parse an HTML name, this routine is case sensitive.
2595
 *
2596
 * Returns the Name parsed or NULL
2597
 */
2598

2599
static const xmlChar *
2600
htmlParseName(htmlParserCtxtPtr ctxt) {
2601
    const xmlChar *in;
2602
    const xmlChar *ret;
2603
    int count = 0;
2604

2605
    GROW;
2606

2607
    /*
2608
     * Accelerator for simple ASCII names
2609
     */
2610
    in = ctxt->input->cur;
2611
    if (((*in >= 0x61) && (*in <= 0x7A)) ||
2612
	((*in >= 0x41) && (*in <= 0x5A)) ||
2613
	(*in == '_') || (*in == ':')) {
2614
	in++;
2615
	while (((*in >= 0x61) && (*in <= 0x7A)) ||
2616
	       ((*in >= 0x41) && (*in <= 0x5A)) ||
2617
	       ((*in >= 0x30) && (*in <= 0x39)) ||
2618
	       (*in == '_') || (*in == '-') ||
2619
	       (*in == ':') || (*in == '.'))
2620
	    in++;
2621

2622
	if (in == ctxt->input->end)
2623
	    return(NULL);
2624

2625
	if ((*in > 0) && (*in < 0x80)) {
2626
	    count = in - ctxt->input->cur;
2627
	    ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2628
	    ctxt->input->cur = in;
2629
	    ctxt->input->col += count;
2630
	    return(ret);
2631
	}
2632
    }
2633
    return(htmlParseNameComplex(ctxt));
2634
}
2635

2636
static const xmlChar *
2637
htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2638
    int len = 0, l;
2639
    int c;
2640
    int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
2641
                    XML_MAX_TEXT_LENGTH :
2642
                    XML_MAX_NAME_LENGTH;
2643
    const xmlChar *base = ctxt->input->base;
2644

2645
    /*
2646
     * Handler for more complex cases
2647
     */
2648
    c = CUR_CHAR(l);
2649
    if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2650
	(!IS_LETTER(c) && (c != '_') &&
2651
         (c != ':'))) {
2652
	return(NULL);
2653
    }
2654

2655
    while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2656
	   ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2657
            (c == '.') || (c == '-') ||
2658
	    (c == '_') || (c == ':') ||
2659
	    (IS_COMBINING(c)) ||
2660
	    (IS_EXTENDER(c)))) {
2661
	len += l;
2662
        if (len > maxLength) {
2663
            htmlParseErr(ctxt, XML_ERR_NAME_TOO_LONG, "name too long", NULL, NULL);
2664
            return(NULL);
2665
        }
2666
	NEXTL(l);
2667
	c = CUR_CHAR(l);
2668
	if (ctxt->input->base != base) {
2669
	    /*
2670
	     * We changed encoding from an unknown encoding
2671
	     * Input buffer changed location, so we better start again
2672
	     */
2673
	    return(htmlParseNameComplex(ctxt));
2674
	}
2675
    }
2676
    if (ctxt->instate == XML_PARSER_EOF)
2677
        return(NULL);
2678

2679
    if (ctxt->input->cur - ctxt->input->base < len) {
2680
        /* Sanity check */
2681
	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2682
                     "unexpected change of input buffer", NULL, NULL);
2683
        return (NULL);
2684
    }
2685

2686
    return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2687
}
2688

2689

2690
/**
2691
 * htmlParseHTMLAttribute:
2692
 * @ctxt:  an HTML parser context
2693
 * @stop:  a char stop value
2694
 *
2695
 * parse an HTML attribute value till the stop (quote), if
2696
 * stop is 0 then it stops at the first space
2697
 *
2698
 * Returns the attribute parsed or NULL
2699
 */
2700

2701
static xmlChar *
2702
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2703
    xmlChar *buffer = NULL;
2704
    int buffer_size = 0;
2705
    int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
2706
                    XML_MAX_HUGE_LENGTH :
2707
                    XML_MAX_TEXT_LENGTH;
2708
    xmlChar *out = NULL;
2709
    const xmlChar *name = NULL;
2710
    const xmlChar *cur = NULL;
2711
    const htmlEntityDesc * ent;
2712

2713
    /*
2714
     * allocate a translation buffer.
2715
     */
2716
    buffer_size = HTML_PARSER_BUFFER_SIZE;
2717
    buffer = (xmlChar *) xmlMallocAtomic(buffer_size);
2718
    if (buffer == NULL) {
2719
	htmlErrMemory(ctxt, "buffer allocation failed\n");
2720
	return(NULL);
2721
    }
2722
    out = buffer;
2723

2724
    /*
2725
     * Ok loop until we reach one of the ending chars
2726
     */
2727
    while ((CUR != 0) && (CUR != stop)) {
2728
	if ((stop == 0) && (CUR == '>')) break;
2729
	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2730
        if (CUR == '&') {
2731
	    if (NXT(1) == '#') {
2732
		unsigned int c;
2733
		int bits;
2734

2735
		c = htmlParseCharRef(ctxt);
2736
		if      (c <    0x80)
2737
		        { *out++  = c;                bits= -6; }
2738
		else if (c <   0x800)
2739
		        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2740
		else if (c < 0x10000)
2741
		        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2742
		else
2743
		        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2744

2745
		for ( ; bits >= 0; bits-= 6) {
2746
		    *out++  = ((c >> bits) & 0x3F) | 0x80;
2747
		}
2748

2749
		if (out - buffer > buffer_size - 100) {
2750
			int indx = out - buffer;
2751

2752
			growBuffer(buffer);
2753
			out = &buffer[indx];
2754
		}
2755
	    } else {
2756
		ent = htmlParseEntityRef(ctxt, &name);
2757
		if (name == NULL) {
2758
		    *out++ = '&';
2759
		    if (out - buffer > buffer_size - 100) {
2760
			int indx = out - buffer;
2761

2762
			growBuffer(buffer);
2763
			out = &buffer[indx];
2764
		    }
2765
		} else if (ent == NULL) {
2766
		    *out++ = '&';
2767
		    cur = name;
2768
		    while (*cur != 0) {
2769
			if (out - buffer > buffer_size - 100) {
2770
			    int indx = out - buffer;
2771

2772
			    growBuffer(buffer);
2773
			    out = &buffer[indx];
2774
			}
2775
			*out++ = *cur++;
2776
		    }
2777
		} else {
2778
		    unsigned int c;
2779
		    int bits;
2780

2781
		    if (out - buffer > buffer_size - 100) {
2782
			int indx = out - buffer;
2783

2784
			growBuffer(buffer);
2785
			out = &buffer[indx];
2786
		    }
2787
		    c = ent->value;
2788
		    if      (c <    0x80)
2789
			{ *out++  = c;                bits= -6; }
2790
		    else if (c <   0x800)
2791
			{ *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2792
		    else if (c < 0x10000)
2793
			{ *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2794
		    else
2795
			{ *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2796

2797
		    for ( ; bits >= 0; bits-= 6) {
2798
			*out++  = ((c >> bits) & 0x3F) | 0x80;
2799
		    }
2800
		}
2801
	    }
2802
	} else {
2803
	    unsigned int c;
2804
	    int bits, l;
2805

2806
	    if (out - buffer > buffer_size - 100) {
2807
		int indx = out - buffer;
2808

2809
		growBuffer(buffer);
2810
		out = &buffer[indx];
2811
	    }
2812
	    c = CUR_CHAR(l);
2813
            if (ctxt->instate == XML_PARSER_EOF) {
2814
                xmlFree(buffer);
2815
                return(NULL);
2816
            }
2817
	    if      (c <    0x80)
2818
		    { *out++  = c;                bits= -6; }
2819
	    else if (c <   0x800)
2820
		    { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2821
	    else if (c < 0x10000)
2822
		    { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2823
	    else
2824
		    { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2825

2826
	    for ( ; bits >= 0; bits-= 6) {
2827
		*out++  = ((c >> bits) & 0x3F) | 0x80;
2828
	    }
2829
	    NEXTL(l);
2830
	}
2831
        if (out - buffer > maxLength) {
2832
            htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2833
                         "attribute value too long\n", NULL, NULL);
2834
            xmlFree(buffer);
2835
            return(NULL);
2836
        }
2837
    }
2838
    *out = 0;
2839
    return(buffer);
2840
}
2841

2842
/**
2843
 * htmlParseEntityRef:
2844
 * @ctxt:  an HTML parser context
2845
 * @str:  location to store the entity name
2846
 *
2847
 * DEPRECATED: Internal function, don't use.
2848
 *
2849
 * parse an HTML ENTITY references
2850
 *
2851
 * [68] EntityRef ::= '&' Name ';'
2852
 *
2853
 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2854
 *         if non-NULL *str will have to be freed by the caller.
2855
 */
2856
const htmlEntityDesc *
2857
htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2858
    const xmlChar *name;
2859
    const htmlEntityDesc * ent = NULL;
2860

2861
    if (str != NULL) *str = NULL;
2862
    if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2863

2864
    if (CUR == '&') {
2865
        NEXT;
2866
        name = htmlParseName(ctxt);
2867
	if (name == NULL) {
2868
	    htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2869
	                 "htmlParseEntityRef: no name\n", NULL, NULL);
2870
	} else {
2871
	    GROW;
2872
	    if (CUR == ';') {
2873
	        if (str != NULL)
2874
		    *str = name;
2875

2876
		/*
2877
		 * Lookup the entity in the table.
2878
		 */
2879
		ent = htmlEntityLookup(name);
2880
		if (ent != NULL) /* OK that's ugly !!! */
2881
		    NEXT;
2882
	    } else {
2883
		htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2884
		             "htmlParseEntityRef: expecting ';'\n",
2885
			     NULL, NULL);
2886
	        if (str != NULL)
2887
		    *str = name;
2888
	    }
2889
	}
2890
    }
2891
    return(ent);
2892
}
2893

2894
/**
2895
 * htmlParseAttValue:
2896
 * @ctxt:  an HTML parser context
2897
 *
2898
 * parse a value for an attribute
2899
 * Note: the parser won't do substitution of entities here, this
2900
 * will be handled later in xmlStringGetNodeList, unless it was
2901
 * asked for ctxt->replaceEntities != 0
2902
 *
2903
 * Returns the AttValue parsed or NULL.
2904
 */
2905

2906
static xmlChar *
2907
htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2908
    xmlChar *ret = NULL;
2909

2910
    if (CUR == '"') {
2911
        NEXT;
2912
	ret = htmlParseHTMLAttribute(ctxt, '"');
2913
        if (CUR != '"') {
2914
	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2915
	                 "AttValue: \" expected\n", NULL, NULL);
2916
	} else
2917
	    NEXT;
2918
    } else if (CUR == '\'') {
2919
        NEXT;
2920
	ret = htmlParseHTMLAttribute(ctxt, '\'');
2921
        if (CUR != '\'') {
2922
	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2923
	                 "AttValue: ' expected\n", NULL, NULL);
2924
	} else
2925
	    NEXT;
2926
    } else {
2927
        /*
2928
	 * That's an HTMLism, the attribute value may not be quoted
2929
	 */
2930
	ret = htmlParseHTMLAttribute(ctxt, 0);
2931
	if (ret == NULL) {
2932
	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2933
	                 "AttValue: no value found\n", NULL, NULL);
2934
	}
2935
    }
2936
    return(ret);
2937
}
2938

2939
/**
2940
 * htmlParseSystemLiteral:
2941
 * @ctxt:  an HTML parser context
2942
 *
2943
 * parse an HTML Literal
2944
 *
2945
 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2946
 *
2947
 * Returns the SystemLiteral parsed or NULL
2948
 */
2949

2950
static xmlChar *
2951
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2952
    size_t len = 0, startPosition = 0;
2953
    int err = 0;
2954
    int quote;
2955
    xmlChar *ret = NULL;
2956

2957
    if ((CUR != '"') && (CUR != '\'')) {
2958
	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2959
	             "SystemLiteral \" or ' expected\n", NULL, NULL);
2960
        return(NULL);
2961
    }
2962
    quote = CUR;
2963
    NEXT;
2964

2965
    if (CUR_PTR < BASE_PTR)
2966
        return(ret);
2967
    startPosition = CUR_PTR - BASE_PTR;
2968

2969
    while ((CUR != 0) && (CUR != quote)) {
2970
        /* TODO: Handle UTF-8 */
2971
        if (!IS_CHAR_CH(CUR)) {
2972
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2973
                            "Invalid char in SystemLiteral 0x%X\n", CUR);
2974
            err = 1;
2975
        }
2976
        NEXT;
2977
        len++;
2978
    }
2979
    if (CUR != quote) {
2980
        htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2981
                     "Unfinished SystemLiteral\n", NULL, NULL);
2982
    } else {
2983
        if (err == 0)
2984
            ret = xmlStrndup((BASE_PTR+startPosition), len);
2985
        NEXT;
2986
    }
2987

2988
    return(ret);
2989
}
2990

2991
/**
2992
 * htmlParsePubidLiteral:
2993
 * @ctxt:  an HTML parser context
2994
 *
2995
 * parse an HTML public literal
2996
 *
2997
 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2998
 *
2999
 * Returns the PubidLiteral parsed or NULL.
3000
 */
3001

3002
static xmlChar *
3003
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
3004
    size_t len = 0, startPosition = 0;
3005
    int err = 0;
3006
    int quote;
3007
    xmlChar *ret = NULL;
3008

3009
    if ((CUR != '"') && (CUR != '\'')) {
3010
	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
3011
	             "PubidLiteral \" or ' expected\n", NULL, NULL);
3012
        return(NULL);
3013
    }
3014
    quote = CUR;
3015
    NEXT;
3016

3017
    /*
3018
     * Name ::= (Letter | '_') (NameChar)*
3019
     */
3020
    if (CUR_PTR < BASE_PTR)
3021
        return(ret);
3022
    startPosition = CUR_PTR - BASE_PTR;
3023

3024
    while ((CUR != 0) && (CUR != quote)) {
3025
        if (!IS_PUBIDCHAR_CH(CUR)) {
3026
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3027
                            "Invalid char in PubidLiteral 0x%X\n", CUR);
3028
            err = 1;
3029
        }
3030
        len++;
3031
        NEXT;
3032
    }
3033

3034
    if (CUR != quote) {
3035
        htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
3036
                     "Unfinished PubidLiteral\n", NULL, NULL);
3037
    } else {
3038
        if (err == 0)
3039
            ret = xmlStrndup((BASE_PTR + startPosition), len);
3040
        NEXT;
3041
    }
3042

3043
    return(ret);
3044
}
3045

3046
/**
3047
 * htmlParseScript:
3048
 * @ctxt:  an HTML parser context
3049
 *
3050
 * parse the content of an HTML SCRIPT or STYLE element
3051
 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
3052
 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
3053
 * http://www.w3.org/TR/html4/types.html#type-script
3054
 * http://www.w3.org/TR/html4/types.html#h-6.15
3055
 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
3056
 *
3057
 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
3058
 * element and the value of intrinsic event attributes. User agents must
3059
 * not evaluate script data as HTML markup but instead must pass it on as
3060
 * data to a script engine.
3061
 * NOTES:
3062
 * - The content is passed like CDATA
3063
 * - the attributes for style and scripting "onXXX" are also described
3064
 *   as CDATA but SGML allows entities references in attributes so their
3065
 *   processing is identical as other attributes
3066
 */
3067
static void
3068
htmlParseScript(htmlParserCtxtPtr ctxt) {
3069
    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3070
    int nbchar = 0;
3071
    int cur,l;
3072

3073
    cur = CUR_CHAR(l);
3074
    while (cur != 0) {
3075
	if ((cur == '<') && (NXT(1) == '/')) {
3076
            /*
3077
             * One should break here, the specification is clear:
3078
             * Authors should therefore escape "</" within the content.
3079
             * Escape mechanisms are specific to each scripting or
3080
             * style sheet language.
3081
             *
3082
             * In recovery mode, only break if end tag match the
3083
             * current tag, effectively ignoring all tags inside the
3084
             * script/style block and treating the entire block as
3085
             * CDATA.
3086
             */
3087
            if (ctxt->recovery) {
3088
                if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
3089
				   xmlStrlen(ctxt->name)) == 0)
3090
                {
3091
                    break; /* while */
3092
                } else {
3093
		    htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3094
				 "Element %s embeds close tag\n",
3095
		                 ctxt->name, NULL);
3096
		}
3097
            } else {
3098
                if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
3099
                    ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
3100
                {
3101
                    break; /* while */
3102
                }
3103
            }
3104
	}
3105
        if (IS_CHAR(cur)) {
3106
	    COPY_BUF(l,buf,nbchar,cur);
3107
        } else {
3108
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3109
                            "Invalid char in CDATA 0x%X\n", cur);
3110
        }
3111
	NEXTL(l);
3112
	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3113
            buf[nbchar] = 0;
3114
	    if (ctxt->sax->cdataBlock!= NULL) {
3115
		/*
3116
		 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3117
		 */
3118
		ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3119
	    } else if (ctxt->sax->characters != NULL) {
3120
		ctxt->sax->characters(ctxt->userData, buf, nbchar);
3121
	    }
3122
	    nbchar = 0;
3123
            SHRINK;
3124
	}
3125
	cur = CUR_CHAR(l);
3126
    }
3127

3128
    if (ctxt->instate == XML_PARSER_EOF)
3129
        return;
3130

3131
    if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3132
        buf[nbchar] = 0;
3133
	if (ctxt->sax->cdataBlock!= NULL) {
3134
	    /*
3135
	     * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3136
	     */
3137
	    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3138
	} else if (ctxt->sax->characters != NULL) {
3139
	    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3140
	}
3141
    }
3142
}
3143

3144

3145
/**
3146
 * htmlParseCharDataInternal:
3147
 * @ctxt:  an HTML parser context
3148
 * @readahead: optional read ahead character in ascii range
3149
 *
3150
 * parse a CharData section.
3151
 * if we are within a CDATA section ']]>' marks an end of section.
3152
 *
3153
 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3154
 */
3155

3156
static void
3157
htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3158
    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3159
    int nbchar = 0;
3160
    int cur, l;
3161

3162
    if (readahead)
3163
        buf[nbchar++] = readahead;
3164

3165
    cur = CUR_CHAR(l);
3166
    while (((cur != '<') || (ctxt->token == '<')) &&
3167
           ((cur != '&') || (ctxt->token == '&')) &&
3168
	   (cur != 0)) {
3169
	if (!(IS_CHAR(cur))) {
3170
	    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3171
	                "Invalid char in CDATA 0x%X\n", cur);
3172
	} else {
3173
	    COPY_BUF(l,buf,nbchar,cur);
3174
	}
3175
	NEXTL(l);
3176
	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3177
            buf[nbchar] = 0;
3178

3179
	    /*
3180
	     * Ok the segment is to be consumed as chars.
3181
	     */
3182
	    if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3183
		if (areBlanks(ctxt, buf, nbchar)) {
3184
		    if (ctxt->keepBlanks) {
3185
			if (ctxt->sax->characters != NULL)
3186
			    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3187
		    } else {
3188
			if (ctxt->sax->ignorableWhitespace != NULL)
3189
			    ctxt->sax->ignorableWhitespace(ctxt->userData,
3190
			                                   buf, nbchar);
3191
		    }
3192
		} else {
3193
		    htmlCheckParagraph(ctxt);
3194
		    if (ctxt->sax->characters != NULL)
3195
			ctxt->sax->characters(ctxt->userData, buf, nbchar);
3196
		}
3197
	    }
3198
	    nbchar = 0;
3199
            SHRINK;
3200
	}
3201
	cur = CUR_CHAR(l);
3202
    }
3203
    if (ctxt->instate == XML_PARSER_EOF)
3204
        return;
3205
    if (nbchar != 0) {
3206
        buf[nbchar] = 0;
3207

3208
	/*
3209
	 * Ok the segment is to be consumed as chars.
3210
	 */
3211
	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3212
	    if (areBlanks(ctxt, buf, nbchar)) {
3213
		if (ctxt->keepBlanks) {
3214
		    if (ctxt->sax->characters != NULL)
3215
			ctxt->sax->characters(ctxt->userData, buf, nbchar);
3216
		} else {
3217
		    if (ctxt->sax->ignorableWhitespace != NULL)
3218
			ctxt->sax->ignorableWhitespace(ctxt->userData,
3219
			                               buf, nbchar);
3220
		}
3221
	    } else {
3222
		htmlCheckParagraph(ctxt);
3223
		if (ctxt->sax->characters != NULL)
3224
		    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3225
	    }
3226
	}
3227
    }
3228
}
3229

3230
/**
3231
 * htmlParseCharData:
3232
 * @ctxt:  an HTML parser context
3233
 *
3234
 * parse a CharData section.
3235
 * if we are within a CDATA section ']]>' marks an end of section.
3236
 *
3237
 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3238
 */
3239

3240
static void
3241
htmlParseCharData(htmlParserCtxtPtr ctxt) {
3242
    htmlParseCharDataInternal(ctxt, 0);
3243
}
3244

3245
/**
3246
 * htmlParseExternalID:
3247
 * @ctxt:  an HTML parser context
3248
 * @publicID:  a xmlChar** receiving PubidLiteral
3249
 *
3250
 * Parse an External ID or a Public ID
3251
 *
3252
 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3253
 *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
3254
 *
3255
 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3256
 *
3257
 * Returns the function returns SystemLiteral and in the second
3258
 *                case publicID receives PubidLiteral, is strict is off
3259
 *                it is possible to return NULL and have publicID set.
3260
 */
3261

3262
static xmlChar *
3263
htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3264
    xmlChar *URI = NULL;
3265

3266
    if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3267
         (UPP(2) == 'S') && (UPP(3) == 'T') &&
3268
	 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3269
        SKIP(6);
3270
	if (!IS_BLANK_CH(CUR)) {
3271
	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3272
	                 "Space required after 'SYSTEM'\n", NULL, NULL);
3273
	}
3274
        SKIP_BLANKS;
3275
	URI = htmlParseSystemLiteral(ctxt);
3276
	if (URI == NULL) {
3277
	    htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3278
	                 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3279
        }
3280
    } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3281
	       (UPP(2) == 'B') && (UPP(3) == 'L') &&
3282
	       (UPP(4) == 'I') && (UPP(5) == 'C')) {
3283
        SKIP(6);
3284
	if (!IS_BLANK_CH(CUR)) {
3285
	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3286
	                 "Space required after 'PUBLIC'\n", NULL, NULL);
3287
	}
3288
        SKIP_BLANKS;
3289
	*publicID = htmlParsePubidLiteral(ctxt);
3290
	if (*publicID == NULL) {
3291
	    htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3292
	                 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3293
			 NULL, NULL);
3294
	}
3295
        SKIP_BLANKS;
3296
        if ((CUR == '"') || (CUR == '\'')) {
3297
	    URI = htmlParseSystemLiteral(ctxt);
3298
	}
3299
    }
3300
    return(URI);
3301
}
3302

3303
/**
3304
 * htmlParsePI:
3305
 * @ctxt:  an HTML parser context
3306
 *
3307
 * parse an XML Processing Instruction.
3308
 *
3309
 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3310
 */
3311
static void
3312
htmlParsePI(htmlParserCtxtPtr ctxt) {
3313
    xmlChar *buf = NULL;
3314
    int len = 0;
3315
    int size = HTML_PARSER_BUFFER_SIZE;
3316
    int cur, l;
3317
    int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
3318
                    XML_MAX_HUGE_LENGTH :
3319
                    XML_MAX_TEXT_LENGTH;
3320
    const xmlChar *target;
3321
    xmlParserInputState state;
3322

3323
    if ((RAW == '<') && (NXT(1) == '?')) {
3324
	state = ctxt->instate;
3325
        ctxt->instate = XML_PARSER_PI;
3326
	/*
3327
	 * this is a Processing Instruction.
3328
	 */
3329
	SKIP(2);
3330

3331
	/*
3332
	 * Parse the target name and check for special support like
3333
	 * namespace.
3334
	 */
3335
        target = htmlParseName(ctxt);
3336
	if (target != NULL) {
3337
	    if (RAW == '>') {
3338
		SKIP(1);
3339

3340
		/*
3341
		 * SAX: PI detected.
3342
		 */
3343
		if ((ctxt->sax) && (!ctxt->disableSAX) &&
3344
		    (ctxt->sax->processingInstruction != NULL))
3345
		    ctxt->sax->processingInstruction(ctxt->userData,
3346
		                                     target, NULL);
3347
		ctxt->instate = state;
3348
		return;
3349
	    }
3350
	    buf = (xmlChar *) xmlMallocAtomic(size);
3351
	    if (buf == NULL) {
3352
		htmlErrMemory(ctxt, NULL);
3353
		ctxt->instate = state;
3354
		return;
3355
	    }
3356
	    cur = CUR;
3357
	    if (!IS_BLANK(cur)) {
3358
		htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3359
			  "ParsePI: PI %s space expected\n", target, NULL);
3360
	    }
3361
            SKIP_BLANKS;
3362
	    cur = CUR_CHAR(l);
3363
	    while ((cur != 0) && (cur != '>')) {
3364
		if (len + 5 >= size) {
3365
		    xmlChar *tmp;
3366

3367
		    size *= 2;
3368
		    tmp = (xmlChar *) xmlRealloc(buf, size);
3369
		    if (tmp == NULL) {
3370
			htmlErrMemory(ctxt, NULL);
3371
			xmlFree(buf);
3372
			ctxt->instate = state;
3373
			return;
3374
		    }
3375
		    buf = tmp;
3376
		}
3377
                if (IS_CHAR(cur)) {
3378
		    COPY_BUF(l,buf,len,cur);
3379
                } else {
3380
                    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3381
                                    "Invalid char in processing instruction "
3382
                                    "0x%X\n", cur);
3383
                }
3384
                if (len > maxLength) {
3385
                    htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3386
                                 "PI %s too long", target, NULL);
3387
                    xmlFree(buf);
3388
                    ctxt->instate = state;
3389
                    return;
3390
                }
3391
		NEXTL(l);
3392
		cur = CUR_CHAR(l);
3393
	    }
3394
	    buf[len] = 0;
3395
            if (ctxt->instate == XML_PARSER_EOF) {
3396
                xmlFree(buf);
3397
                return;
3398
            }
3399
	    if (cur != '>') {
3400
		htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3401
		      "ParsePI: PI %s never end ...\n", target, NULL);
3402
	    } else {
3403
		SKIP(1);
3404

3405
		/*
3406
		 * SAX: PI detected.
3407
		 */
3408
		if ((ctxt->sax) && (!ctxt->disableSAX) &&
3409
		    (ctxt->sax->processingInstruction != NULL))
3410
		    ctxt->sax->processingInstruction(ctxt->userData,
3411
		                                     target, buf);
3412
	    }
3413
	    xmlFree(buf);
3414
	} else {
3415
	    htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3416
                         "PI is not started correctly", NULL, NULL);
3417
	}
3418
	ctxt->instate = state;
3419
    }
3420
}
3421

3422
/**
3423
 * htmlParseComment:
3424
 * @ctxt:  an HTML parser context
3425
 *
3426
 * Parse an XML (SGML) comment <!-- .... -->
3427
 *
3428
 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3429
 */
3430
static void
3431
htmlParseComment(htmlParserCtxtPtr ctxt) {
3432
    xmlChar *buf = NULL;
3433
    int len;
3434
    int size = HTML_PARSER_BUFFER_SIZE;
3435
    int q, ql;
3436
    int r, rl;
3437
    int cur, l;
3438
    int next, nl;
3439
    int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
3440
                    XML_MAX_HUGE_LENGTH :
3441
                    XML_MAX_TEXT_LENGTH;
3442
    xmlParserInputState state;
3443

3444
    /*
3445
     * Check that there is a comment right here.
3446
     */
3447
    if ((RAW != '<') || (NXT(1) != '!') ||
3448
        (NXT(2) != '-') || (NXT(3) != '-')) return;
3449

3450
    state = ctxt->instate;
3451
    ctxt->instate = XML_PARSER_COMMENT;
3452
    SKIP(4);
3453
    buf = (xmlChar *) xmlMallocAtomic(size);
3454
    if (buf == NULL) {
3455
        htmlErrMemory(ctxt, "buffer allocation failed\n");
3456
	ctxt->instate = state;
3457
	return;
3458
    }
3459
    len = 0;
3460
    buf[len] = 0;
3461
    q = CUR_CHAR(ql);
3462
    if (q == 0)
3463
        goto unfinished;
3464
    if (q == '>') {
3465
        htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3466
        cur = '>';
3467
        goto finished;
3468
    }
3469
    NEXTL(ql);
3470
    r = CUR_CHAR(rl);
3471
    if (r == 0)
3472
        goto unfinished;
3473
    if (q == '-' && r == '>') {
3474
        htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3475
        cur = '>';
3476
        goto finished;
3477
    }
3478
    NEXTL(rl);
3479
    cur = CUR_CHAR(l);
3480
    while ((cur != 0) &&
3481
           ((cur != '>') ||
3482
	    (r != '-') || (q != '-'))) {
3483
	NEXTL(l);
3484
	next = CUR_CHAR(nl);
3485

3486
	if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
3487
	  htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3488
		       "Comment incorrectly closed by '--!>'", NULL, NULL);
3489
	  cur = '>';
3490
	  break;
3491
	}
3492

3493
	if (len + 5 >= size) {
3494
	    xmlChar *tmp;
3495

3496
	    size *= 2;
3497
	    tmp = (xmlChar *) xmlRealloc(buf, size);
3498
	    if (tmp == NULL) {
3499
	        xmlFree(buf);
3500
	        htmlErrMemory(ctxt, "growing buffer failed\n");
3501
		ctxt->instate = state;
3502
		return;
3503
	    }
3504
	    buf = tmp;
3505
	}
3506
        if (IS_CHAR(q)) {
3507
	    COPY_BUF(ql,buf,len,q);
3508
        } else {
3509
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3510
                            "Invalid char in comment 0x%X\n", q);
3511
        }
3512
        if (len > maxLength) {
3513
            htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3514
                         "comment too long", NULL, NULL);
3515
            xmlFree(buf);
3516
            ctxt->instate = state;
3517
            return;
3518
        }
3519

3520
	q = r;
3521
	ql = rl;
3522
	r = cur;
3523
	rl = l;
3524
	cur = next;
3525
	l = nl;
3526
    }
3527
finished:
3528
    buf[len] = 0;
3529
    if (ctxt->instate == XML_PARSER_EOF) {
3530
        xmlFree(buf);
3531
        return;
3532
    }
3533
    if (cur == '>') {
3534
        NEXT;
3535
	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3536
	    (!ctxt->disableSAX))
3537
	    ctxt->sax->comment(ctxt->userData, buf);
3538
	xmlFree(buf);
3539
	ctxt->instate = state;
3540
	return;
3541
    }
3542

3543
unfinished:
3544
    htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3545
		 "Comment not terminated \n<!--%.50s\n", buf, NULL);
3546
    xmlFree(buf);
3547
}
3548

3549
/**
3550
 * htmlParseCharRef:
3551
 * @ctxt:  an HTML parser context
3552
 *
3553
 * DEPRECATED: Internal function, don't use.
3554
 *
3555
 * parse Reference declarations
3556
 *
3557
 * [66] CharRef ::= '&#' [0-9]+ ';' |
3558
 *                  '&#x' [0-9a-fA-F]+ ';'
3559
 *
3560
 * Returns the value parsed (as an int)
3561
 */
3562
int
3563
htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3564
    int val = 0;
3565

3566
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
3567
	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3568
		     "htmlParseCharRef: context error\n",
3569
		     NULL, NULL);
3570
        return(0);
3571
    }
3572
    if ((CUR == '&') && (NXT(1) == '#') &&
3573
        ((NXT(2) == 'x') || NXT(2) == 'X')) {
3574
	SKIP(3);
3575
	while (CUR != ';') {
3576
	    if ((CUR >= '0') && (CUR <= '9')) {
3577
                if (val < 0x110000)
3578
	            val = val * 16 + (CUR - '0');
3579
            } else if ((CUR >= 'a') && (CUR <= 'f')) {
3580
                if (val < 0x110000)
3581
	            val = val * 16 + (CUR - 'a') + 10;
3582
            } else if ((CUR >= 'A') && (CUR <= 'F')) {
3583
                if (val < 0x110000)
3584
	            val = val * 16 + (CUR - 'A') + 10;
3585
            } else {
3586
	        htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3587
		             "htmlParseCharRef: missing semicolon\n",
3588
			     NULL, NULL);
3589
		break;
3590
	    }
3591
	    NEXT;
3592
	}
3593
	if (CUR == ';')
3594
	    NEXT;
3595
    } else if  ((CUR == '&') && (NXT(1) == '#')) {
3596
	SKIP(2);
3597
	while (CUR != ';') {
3598
	    if ((CUR >= '0') && (CUR <= '9')) {
3599
                if (val < 0x110000)
3600
	            val = val * 10 + (CUR - '0');
3601
            } else {
3602
	        htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3603
		             "htmlParseCharRef: missing semicolon\n",
3604
			     NULL, NULL);
3605
		break;
3606
	    }
3607
	    NEXT;
3608
	}
3609
	if (CUR == ';')
3610
	    NEXT;
3611
    } else {
3612
	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3613
	             "htmlParseCharRef: invalid value\n", NULL, NULL);
3614
    }
3615
    /*
3616
     * Check the value IS_CHAR ...
3617
     */
3618
    if (IS_CHAR(val)) {
3619
        return(val);
3620
    } else if (val >= 0x110000) {
3621
	htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3622
		     "htmlParseCharRef: value too large\n", NULL, NULL);
3623
    } else {
3624
	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3625
			"htmlParseCharRef: invalid xmlChar value %d\n",
3626
			val);
3627
    }
3628
    return(0);
3629
}
3630

3631

3632
/**
3633
 * htmlParseDocTypeDecl:
3634
 * @ctxt:  an HTML parser context
3635
 *
3636
 * parse a DOCTYPE declaration
3637
 *
3638
 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3639
 *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3640
 */
3641

3642
static void
3643
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3644
    const xmlChar *name;
3645
    xmlChar *ExternalID = NULL;
3646
    xmlChar *URI = NULL;
3647

3648
    /*
3649
     * We know that '<!DOCTYPE' has been detected.
3650
     */
3651
    SKIP(9);
3652

3653
    SKIP_BLANKS;
3654

3655
    /*
3656
     * Parse the DOCTYPE name.
3657
     */
3658
    name = htmlParseName(ctxt);
3659
    if (name == NULL) {
3660
	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3661
	             "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3662
		     NULL, NULL);
3663
    }
3664
    /*
3665
     * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3666
     */
3667

3668
    SKIP_BLANKS;
3669

3670
    /*
3671
     * Check for SystemID and ExternalID
3672
     */
3673
    URI = htmlParseExternalID(ctxt, &ExternalID);
3674
    SKIP_BLANKS;
3675

3676
    /*
3677
     * We should be at the end of the DOCTYPE declaration.
3678
     */
3679
    if (CUR != '>') {
3680
	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3681
	             "DOCTYPE improperly terminated\n", NULL, NULL);
3682
        /* Ignore bogus content */
3683
        while ((CUR != 0) && (CUR != '>') &&
3684
               (ctxt->instate != XML_PARSER_EOF))
3685
            NEXT;
3686
    }
3687
    if (CUR == '>')
3688
        NEXT;
3689

3690
    /*
3691
     * Create or update the document accordingly to the DOCTYPE
3692
     */
3693
    if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3694
	(!ctxt->disableSAX))
3695
	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3696

3697
    /*
3698
     * Cleanup, since we don't use all those identifiers
3699
     */
3700
    if (URI != NULL) xmlFree(URI);
3701
    if (ExternalID != NULL) xmlFree(ExternalID);
3702
}
3703

3704
/**
3705
 * htmlParseAttribute:
3706
 * @ctxt:  an HTML parser context
3707
 * @value:  a xmlChar ** used to store the value of the attribute
3708
 *
3709
 * parse an attribute
3710
 *
3711
 * [41] Attribute ::= Name Eq AttValue
3712
 *
3713
 * [25] Eq ::= S? '=' S?
3714
 *
3715
 * With namespace:
3716
 *
3717
 * [NS 11] Attribute ::= QName Eq AttValue
3718
 *
3719
 * Also the case QName == xmlns:??? is handled independently as a namespace
3720
 * definition.
3721
 *
3722
 * Returns the attribute name, and the value in *value.
3723
 */
3724

3725
static const xmlChar *
3726
htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3727
    const xmlChar *name;
3728
    xmlChar *val = NULL;
3729

3730
    *value = NULL;
3731
    name = htmlParseHTMLName(ctxt);
3732
    if (name == NULL) {
3733
	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3734
	             "error parsing attribute name\n", NULL, NULL);
3735
        return(NULL);
3736
    }
3737

3738
    /*
3739
     * read the value
3740
     */
3741
    SKIP_BLANKS;
3742
    if (CUR == '=') {
3743
        NEXT;
3744
	SKIP_BLANKS;
3745
	val = htmlParseAttValue(ctxt);
3746
    }
3747

3748
    *value = val;
3749
    return(name);
3750
}
3751

3752
/**
3753
 * htmlCheckEncoding:
3754
 * @ctxt:  an HTML parser context
3755
 * @attvalue: the attribute value
3756
 *
3757
 * Checks an http-equiv attribute from a Meta tag to detect
3758
 * the encoding
3759
 * If a new encoding is detected the parser is switched to decode
3760
 * it and pass UTF8
3761
 */
3762
static void
3763
htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3764
    const xmlChar *encoding;
3765

3766
    if (!attvalue)
3767
	return;
3768

3769
    encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3770
    if (encoding != NULL) {
3771
	encoding += 7;
3772
    }
3773
    /*
3774
     * skip blank
3775
     */
3776
    if (encoding && IS_BLANK_CH(*encoding))
3777
	encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3778
    if (encoding && *encoding == '=') {
3779
	encoding ++;
3780
	xmlSetDeclaredEncoding(ctxt, xmlStrdup(encoding));
3781
    }
3782
}
3783

3784
/**
3785
 * htmlCheckMeta:
3786
 * @ctxt:  an HTML parser context
3787
 * @atts:  the attributes values
3788
 *
3789
 * Checks an attributes from a Meta tag
3790
 */
3791
static void
3792
htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3793
    int i;
3794
    const xmlChar *att, *value;
3795
    int http = 0;
3796
    const xmlChar *content = NULL;
3797

3798
    if ((ctxt == NULL) || (atts == NULL))
3799
	return;
3800

3801
    i = 0;
3802
    att = atts[i++];
3803
    while (att != NULL) {
3804
	value = atts[i++];
3805
	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3806
	 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3807
	    http = 1;
3808
	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3809
	    xmlSetDeclaredEncoding(ctxt, xmlStrdup(value));
3810
	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3811
	    content = value;
3812
	att = atts[i++];
3813
    }
3814
    if ((http) && (content != NULL))
3815
	htmlCheckEncoding(ctxt, content);
3816

3817
}
3818

3819
/**
3820
 * htmlParseStartTag:
3821
 * @ctxt:  an HTML parser context
3822
 *
3823
 * parse a start of tag either for rule element or
3824
 * EmptyElement. In both case we don't parse the tag closing chars.
3825
 *
3826
 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3827
 *
3828
 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3829
 *
3830
 * With namespace:
3831
 *
3832
 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3833
 *
3834
 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3835
 *
3836
 * Returns 0 in case of success, -1 in case of error and 1 if discarded
3837
 */
3838

3839
static int
3840
htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3841
    const xmlChar *name;
3842
    const xmlChar *attname;
3843
    xmlChar *attvalue;
3844
    const xmlChar **atts;
3845
    int nbatts = 0;
3846
    int maxatts;
3847
    int meta = 0;
3848
    int i;
3849
    int discardtag = 0;
3850

3851
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
3852
	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3853
		     "htmlParseStartTag: context error\n", NULL, NULL);
3854
	return -1;
3855
    }
3856
    if (ctxt->instate == XML_PARSER_EOF)
3857
        return(-1);
3858
    if (CUR != '<') return -1;
3859
    NEXT;
3860

3861
    atts = ctxt->atts;
3862
    maxatts = ctxt->maxatts;
3863

3864
    GROW;
3865
    name = htmlParseHTMLName(ctxt);
3866
    if (name == NULL) {
3867
	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3868
	             "htmlParseStartTag: invalid element name\n",
3869
		     NULL, NULL);
3870
	/* Dump the bogus tag like browsers do */
3871
	while ((CUR != 0) && (CUR != '>') &&
3872
               (ctxt->instate != XML_PARSER_EOF))
3873
	    NEXT;
3874
        return -1;
3875
    }
3876
    if (xmlStrEqual(name, BAD_CAST"meta"))
3877
	meta = 1;
3878

3879
    /*
3880
     * Check for auto-closure of HTML elements.
3881
     */
3882
    htmlAutoClose(ctxt, name);
3883

3884
    /*
3885
     * Check for implied HTML elements.
3886
     */
3887
    htmlCheckImplied(ctxt, name);
3888

3889
    /*
3890
     * Avoid html at any level > 0, head at any level != 1
3891
     * or any attempt to recurse body
3892
     */
3893
    if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3894
	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3895
	             "htmlParseStartTag: misplaced <html> tag\n",
3896
		     name, NULL);
3897
	discardtag = 1;
3898
	ctxt->depth++;
3899
    }
3900
    if ((ctxt->nameNr != 1) &&
3901
	(xmlStrEqual(name, BAD_CAST"head"))) {
3902
	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3903
	             "htmlParseStartTag: misplaced <head> tag\n",
3904
		     name, NULL);
3905
	discardtag = 1;
3906
	ctxt->depth++;
3907
    }
3908
    if (xmlStrEqual(name, BAD_CAST"body")) {
3909
	int indx;
3910
	for (indx = 0;indx < ctxt->nameNr;indx++) {
3911
	    if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3912
		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3913
		             "htmlParseStartTag: misplaced <body> tag\n",
3914
			     name, NULL);
3915
		discardtag = 1;
3916
		ctxt->depth++;
3917
	    }
3918
	}
3919
    }
3920

3921
    /*
3922
     * Now parse the attributes, it ends up with the ending
3923
     *
3924
     * (S Attribute)* S?
3925
     */
3926
    SKIP_BLANKS;
3927
    while ((CUR != 0) &&
3928
           (CUR != '>') &&
3929
	   ((CUR != '/') || (NXT(1) != '>')) &&
3930
           (ctxt->instate != XML_PARSER_EOF)) {
3931
	GROW;
3932
	attname = htmlParseAttribute(ctxt, &attvalue);
3933
        if (attname != NULL) {
3934

3935
	    /*
3936
	     * Well formedness requires at most one declaration of an attribute
3937
	     */
3938
	    for (i = 0; i < nbatts;i += 2) {
3939
	        if (xmlStrEqual(atts[i], attname)) {
3940
		    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3941
		                 "Attribute %s redefined\n", attname, NULL);
3942
		    if (attvalue != NULL)
3943
			xmlFree(attvalue);
3944
		    goto failed;
3945
		}
3946
	    }
3947

3948
	    /*
3949
	     * Add the pair to atts
3950
	     */
3951
	    if (atts == NULL) {
3952
	        maxatts = 22; /* allow for 10 attrs by default */
3953
	        atts = (const xmlChar **)
3954
		       xmlMalloc(maxatts * sizeof(xmlChar *));
3955
		if (atts == NULL) {
3956
		    htmlErrMemory(ctxt, NULL);
3957
		    if (attvalue != NULL)
3958
			xmlFree(attvalue);
3959
		    goto failed;
3960
		}
3961
		ctxt->atts = atts;
3962
		ctxt->maxatts = maxatts;
3963
	    } else if (nbatts + 4 > maxatts) {
3964
	        const xmlChar **n;
3965

3966
	        maxatts *= 2;
3967
	        n = (const xmlChar **) xmlRealloc((void *) atts,
3968
					     maxatts * sizeof(const xmlChar *));
3969
		if (n == NULL) {
3970
		    htmlErrMemory(ctxt, NULL);
3971
		    if (attvalue != NULL)
3972
			xmlFree(attvalue);
3973
		    goto failed;
3974
		}
3975
		atts = n;
3976
		ctxt->atts = atts;
3977
		ctxt->maxatts = maxatts;
3978
	    }
3979
	    atts[nbatts++] = attname;
3980
	    atts[nbatts++] = attvalue;
3981
	    atts[nbatts] = NULL;
3982
	    atts[nbatts + 1] = NULL;
3983
	}
3984
	else {
3985
	    if (attvalue != NULL)
3986
	        xmlFree(attvalue);
3987
	    /* Dump the bogus attribute string up to the next blank or
3988
	     * the end of the tag. */
3989
	    while ((CUR != 0) &&
3990
	           !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3991
		   ((CUR != '/') || (NXT(1) != '>')) &&
3992
                   (ctxt->instate != XML_PARSER_EOF))
3993
		NEXT;
3994
	}
3995

3996
failed:
3997
	SKIP_BLANKS;
3998
    }
3999

4000
    /*
4001
     * Handle specific association to the META tag
4002
     */
4003
    if (meta && (nbatts != 0))
4004
	htmlCheckMeta(ctxt, atts);
4005

4006
    /*
4007
     * SAX: Start of Element !
4008
     */
4009
    if (!discardtag) {
4010
	htmlnamePush(ctxt, name);
4011
	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
4012
	    if (nbatts != 0)
4013
		ctxt->sax->startElement(ctxt->userData, name, atts);
4014
	    else
4015
		ctxt->sax->startElement(ctxt->userData, name, NULL);
4016
	}
4017
    }
4018

4019
    if (atts != NULL) {
4020
        for (i = 1;i < nbatts;i += 2) {
4021
	    if (atts[i] != NULL)
4022
		xmlFree((xmlChar *) atts[i]);
4023
	}
4024
    }
4025

4026
    return(discardtag);
4027
}
4028

4029
/**
4030
 * htmlParseEndTag:
4031
 * @ctxt:  an HTML parser context
4032
 *
4033
 * parse an end of tag
4034
 *
4035
 * [42] ETag ::= '</' Name S? '>'
4036
 *
4037
 * With namespace
4038
 *
4039
 * [NS 9] ETag ::= '</' QName S? '>'
4040
 *
4041
 * Returns 1 if the current level should be closed.
4042
 */
4043

4044
static int
4045
htmlParseEndTag(htmlParserCtxtPtr ctxt)
4046
{
4047
    const xmlChar *name;
4048
    const xmlChar *oldname;
4049
    int i, ret;
4050

4051
    if ((CUR != '<') || (NXT(1) != '/')) {
4052
        htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
4053
	             "htmlParseEndTag: '</' not found\n", NULL, NULL);
4054
        return (0);
4055
    }
4056
    SKIP(2);
4057

4058
    name = htmlParseHTMLName(ctxt);
4059
    if (name == NULL)
4060
        return (0);
4061
    /*
4062
     * We should definitely be at the ending "S? '>'" part
4063
     */
4064
    SKIP_BLANKS;
4065
    if (CUR != '>') {
4066
        htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4067
	             "End tag : expected '>'\n", NULL, NULL);
4068
        /* Skip to next '>' */
4069
        while ((CUR != 0) && (CUR != '>'))
4070
            NEXT;
4071
    }
4072
    if (CUR == '>')
4073
        NEXT;
4074

4075
    /*
4076
     * if we ignored misplaced tags in htmlParseStartTag don't pop them
4077
     * out now.
4078
     */
4079
    if ((ctxt->depth > 0) &&
4080
        (xmlStrEqual(name, BAD_CAST "html") ||
4081
         xmlStrEqual(name, BAD_CAST "body") ||
4082
	 xmlStrEqual(name, BAD_CAST "head"))) {
4083
	ctxt->depth--;
4084
	return (0);
4085
    }
4086

4087
    /*
4088
     * If the name read is not one of the element in the parsing stack
4089
     * then return, it's just an error.
4090
     */
4091
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4092
        if (xmlStrEqual(name, ctxt->nameTab[i]))
4093
            break;
4094
    }
4095
    if (i < 0) {
4096
        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4097
	             "Unexpected end tag : %s\n", name, NULL);
4098
        return (0);
4099
    }
4100

4101

4102
    /*
4103
     * Check for auto-closure of HTML elements.
4104
     */
4105

4106
    htmlAutoCloseOnClose(ctxt, name);
4107

4108
    /*
4109
     * Well formedness constraints, opening and closing must match.
4110
     * With the exception that the autoclose may have popped stuff out
4111
     * of the stack.
4112
     */
4113
    if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4114
        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4115
                     "Opening and ending tag mismatch: %s and %s\n",
4116
                     name, ctxt->name);
4117
    }
4118

4119
    /*
4120
     * SAX: End of Tag
4121
     */
4122
    oldname = ctxt->name;
4123
    if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4124
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4125
            ctxt->sax->endElement(ctxt->userData, name);
4126
	htmlNodeInfoPop(ctxt);
4127
        htmlnamePop(ctxt);
4128
        ret = 1;
4129
    } else {
4130
        ret = 0;
4131
    }
4132

4133
    return (ret);
4134
}
4135

4136

4137
/**
4138
 * htmlParseReference:
4139
 * @ctxt:  an HTML parser context
4140
 *
4141
 * parse and handle entity references in content,
4142
 * this will end-up in a call to character() since this is either a
4143
 * CharRef, or a predefined entity.
4144
 */
4145
static void
4146
htmlParseReference(htmlParserCtxtPtr ctxt) {
4147
    const htmlEntityDesc * ent;
4148
    xmlChar out[6];
4149
    const xmlChar *name;
4150
    if (CUR != '&') return;
4151

4152
    if (NXT(1) == '#') {
4153
	unsigned int c;
4154
	int bits, i = 0;
4155

4156
	c = htmlParseCharRef(ctxt);
4157
	if (c == 0)
4158
	    return;
4159

4160
        if      (c <    0x80) { out[i++]= c;                bits= -6; }
4161
        else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4162
        else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4163
        else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4164

4165
        for ( ; bits >= 0; bits-= 6) {
4166
            out[i++]= ((c >> bits) & 0x3F) | 0x80;
4167
        }
4168
	out[i] = 0;
4169

4170
	htmlCheckParagraph(ctxt);
4171
	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4172
	    ctxt->sax->characters(ctxt->userData, out, i);
4173
    } else {
4174
	ent = htmlParseEntityRef(ctxt, &name);
4175
	if (name == NULL) {
4176
	    htmlCheckParagraph(ctxt);
4177
	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4178
	        ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4179
	    return;
4180
	}
4181
	if ((ent == NULL) || !(ent->value > 0)) {
4182
	    htmlCheckParagraph(ctxt);
4183
	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4184
		ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4185
		ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4186
		/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4187
	    }
4188
	} else {
4189
	    unsigned int c;
4190
	    int bits, i = 0;
4191

4192
	    c = ent->value;
4193
	    if      (c <    0x80)
4194
	            { out[i++]= c;                bits= -6; }
4195
	    else if (c <   0x800)
4196
	            { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4197
	    else if (c < 0x10000)
4198
	            { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4199
	    else
4200
	            { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4201

4202
	    for ( ; bits >= 0; bits-= 6) {
4203
		out[i++]= ((c >> bits) & 0x3F) | 0x80;
4204
	    }
4205
	    out[i] = 0;
4206

4207
	    htmlCheckParagraph(ctxt);
4208
	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4209
		ctxt->sax->characters(ctxt->userData, out, i);
4210
	}
4211
    }
4212
}
4213

4214
/**
4215
 * htmlParseContent:
4216
 * @ctxt:  an HTML parser context
4217
 *
4218
 * Parse a content: comment, sub-element, reference or text.
4219
 * Kept for compatibility with old code
4220
 */
4221

4222
static void
4223
htmlParseContent(htmlParserCtxtPtr ctxt) {
4224
    xmlChar *currentNode;
4225
    int depth;
4226
    const xmlChar *name;
4227

4228
    currentNode = xmlStrdup(ctxt->name);
4229
    depth = ctxt->nameNr;
4230
    while (1) {
4231
        GROW;
4232

4233
        if (ctxt->instate == XML_PARSER_EOF)
4234
            break;
4235

4236
	/*
4237
	 * Our tag or one of it's parent or children is ending.
4238
	 */
4239
        if ((CUR == '<') && (NXT(1) == '/')) {
4240
	    if (htmlParseEndTag(ctxt) &&
4241
		((currentNode != NULL) || (ctxt->nameNr == 0))) {
4242
		if (currentNode != NULL)
4243
		    xmlFree(currentNode);
4244
		return;
4245
	    }
4246
	    continue; /* while */
4247
        }
4248

4249
	else if ((CUR == '<') &&
4250
	         ((IS_ASCII_LETTER(NXT(1))) ||
4251
		  (NXT(1) == '_') || (NXT(1) == ':'))) {
4252
	    name = htmlParseHTMLName_nonInvasive(ctxt);
4253
	    if (name == NULL) {
4254
	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4255
			 "htmlParseStartTag: invalid element name\n",
4256
			 NULL, NULL);
4257
	        /* Dump the bogus tag like browsers do */
4258
                while ((CUR != 0) && (CUR != '>'))
4259
	            NEXT;
4260

4261
	        if (currentNode != NULL)
4262
	            xmlFree(currentNode);
4263
	        return;
4264
	    }
4265

4266
	    if (ctxt->name != NULL) {
4267
	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4268
	            htmlAutoClose(ctxt, name);
4269
	            continue;
4270
	        }
4271
	    }
4272
	}
4273

4274
	/*
4275
	 * Has this node been popped out during parsing of
4276
	 * the next element
4277
	 */
4278
        if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4279
	    (!xmlStrEqual(currentNode, ctxt->name)))
4280
	     {
4281
	    if (currentNode != NULL) xmlFree(currentNode);
4282
	    return;
4283
	}
4284

4285
	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4286
	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4287
	    /*
4288
	     * Handle SCRIPT/STYLE separately
4289
	     */
4290
	    htmlParseScript(ctxt);
4291
	}
4292

4293
        else if ((CUR == '<') && (NXT(1) == '!')) {
4294
            /*
4295
             * Sometimes DOCTYPE arrives in the middle of the document
4296
             */
4297
            if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4298
                (UPP(4) == 'C') && (UPP(5) == 'T') &&
4299
                (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4300
                (UPP(8) == 'E')) {
4301
                htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4302
                             "Misplaced DOCTYPE declaration\n",
4303
                             BAD_CAST "DOCTYPE" , NULL);
4304
                htmlParseDocTypeDecl(ctxt);
4305
            }
4306
            /*
4307
             * First case :  a comment
4308
             */
4309
            else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4310
                htmlParseComment(ctxt);
4311
            }
4312
            else {
4313
                htmlSkipBogusComment(ctxt);
4314
            }
4315
        }
4316

4317
        /*
4318
         * Second case : a Processing Instruction.
4319
         */
4320
        else if ((CUR == '<') && (NXT(1) == '?')) {
4321
            htmlParsePI(ctxt);
4322
        }
4323

4324
        /*
4325
         * Third case :  a sub-element.
4326
         */
4327
        else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4328
            htmlParseElement(ctxt);
4329
        }
4330
        else if (CUR == '<') {
4331
            if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4332
                (ctxt->sax->characters != NULL))
4333
                ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4334
            NEXT;
4335
        }
4336

4337
        /*
4338
         * Fourth case : a reference. If if has not been resolved,
4339
         *    parsing returns it's Name, create the node
4340
         */
4341
        else if (CUR == '&') {
4342
            htmlParseReference(ctxt);
4343
        }
4344

4345
        /*
4346
         * Fifth case : end of the resource
4347
         */
4348
        else if (CUR == 0) {
4349
            htmlAutoCloseOnEnd(ctxt);
4350
            break;
4351
        }
4352

4353
        /*
4354
         * Last case, text. Note that References are handled directly.
4355
         */
4356
        else {
4357
            htmlParseCharData(ctxt);
4358
        }
4359

4360
        SHRINK;
4361
        GROW;
4362
    }
4363
    if (currentNode != NULL) xmlFree(currentNode);
4364
}
4365

4366
/**
4367
 * htmlParseElement:
4368
 * @ctxt:  an HTML parser context
4369
 *
4370
 * DEPRECATED: Internal function, don't use.
4371
 *
4372
 * parse an HTML element, this is highly recursive
4373
 * this is kept for compatibility with previous code versions
4374
 *
4375
 * [39] element ::= EmptyElemTag | STag content ETag
4376
 *
4377
 * [41] Attribute ::= Name Eq AttValue
4378
 */
4379

4380
void
4381
htmlParseElement(htmlParserCtxtPtr ctxt) {
4382
    const xmlChar *name;
4383
    xmlChar *currentNode = NULL;
4384
    const htmlElemDesc * info;
4385
    htmlParserNodeInfo node_info;
4386
    int failed;
4387
    int depth;
4388
    const xmlChar *oldptr;
4389

4390
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4391
	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4392
		     "htmlParseElement: context error\n", NULL, NULL);
4393
	return;
4394
    }
4395

4396
    if (ctxt->instate == XML_PARSER_EOF)
4397
        return;
4398

4399
    /* Capture start position */
4400
    if (ctxt->record_info) {
4401
        node_info.begin_pos = ctxt->input->consumed +
4402
                          (CUR_PTR - ctxt->input->base);
4403
	node_info.begin_line = ctxt->input->line;
4404
    }
4405

4406
    failed = htmlParseStartTag(ctxt);
4407
    name = ctxt->name;
4408
    if ((failed == -1) || (name == NULL)) {
4409
	if (CUR == '>')
4410
	    NEXT;
4411
        return;
4412
    }
4413

4414
    /*
4415
     * Lookup the info for that element.
4416
     */
4417
    info = htmlTagLookup(name);
4418
    if (info == NULL) {
4419
	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4420
	             "Tag %s invalid\n", name, NULL);
4421
    }
4422

4423
    /*
4424
     * Check for an Empty Element labeled the XML/SGML way
4425
     */
4426
    if ((CUR == '/') && (NXT(1) == '>')) {
4427
        SKIP(2);
4428
	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4429
	    ctxt->sax->endElement(ctxt->userData, name);
4430
	htmlnamePop(ctxt);
4431
	return;
4432
    }
4433

4434
    if (CUR == '>') {
4435
        NEXT;
4436
    } else {
4437
	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4438
	             "Couldn't find end of Start Tag %s\n", name, NULL);
4439

4440
	/*
4441
	 * end of parsing of this node.
4442
	 */
4443
	if (xmlStrEqual(name, ctxt->name)) {
4444
	    nodePop(ctxt);
4445
	    htmlnamePop(ctxt);
4446
	}
4447

4448
	/*
4449
	 * Capture end position and add node
4450
	 */
4451
	if (ctxt->record_info) {
4452
	   node_info.end_pos = ctxt->input->consumed +
4453
			      (CUR_PTR - ctxt->input->base);
4454
	   node_info.end_line = ctxt->input->line;
4455
	   node_info.node = ctxt->node;
4456
	   xmlParserAddNodeInfo(ctxt, &node_info);
4457
	}
4458
	return;
4459
    }
4460

4461
    /*
4462
     * Check for an Empty Element from DTD definition
4463
     */
4464
    if ((info != NULL) && (info->empty)) {
4465
	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4466
	    ctxt->sax->endElement(ctxt->userData, name);
4467
	htmlnamePop(ctxt);
4468
	return;
4469
    }
4470

4471
    /*
4472
     * Parse the content of the element:
4473
     */
4474
    currentNode = xmlStrdup(ctxt->name);
4475
    depth = ctxt->nameNr;
4476
    while (CUR != 0) {
4477
	oldptr = ctxt->input->cur;
4478
	htmlParseContent(ctxt);
4479
	if (oldptr==ctxt->input->cur) break;
4480
	if (ctxt->nameNr < depth) break;
4481
    }
4482

4483
    /*
4484
     * Capture end position and add node
4485
     */
4486
    if ( currentNode != NULL && ctxt->record_info ) {
4487
       node_info.end_pos = ctxt->input->consumed +
4488
                          (CUR_PTR - ctxt->input->base);
4489
       node_info.end_line = ctxt->input->line;
4490
       node_info.node = ctxt->node;
4491
       xmlParserAddNodeInfo(ctxt, &node_info);
4492
    }
4493
    if (CUR == 0) {
4494
	htmlAutoCloseOnEnd(ctxt);
4495
    }
4496

4497
    if (currentNode != NULL)
4498
	xmlFree(currentNode);
4499
}
4500

4501
static void
4502
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4503
    /*
4504
     * Capture end position and add node
4505
     */
4506
    if ( ctxt->node != NULL && ctxt->record_info ) {
4507
       ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4508
                                (CUR_PTR - ctxt->input->base);
4509
       ctxt->nodeInfo->end_line = ctxt->input->line;
4510
       ctxt->nodeInfo->node = ctxt->node;
4511
       xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4512
       htmlNodeInfoPop(ctxt);
4513
    }
4514
    if (CUR == 0) {
4515
       htmlAutoCloseOnEnd(ctxt);
4516
    }
4517
}
4518

4519
/**
4520
 * htmlParseElementInternal:
4521
 * @ctxt:  an HTML parser context
4522
 *
4523
 * parse an HTML element, new version, non recursive
4524
 *
4525
 * [39] element ::= EmptyElemTag | STag content ETag
4526
 *
4527
 * [41] Attribute ::= Name Eq AttValue
4528
 */
4529

4530
static void
4531
htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4532
    const xmlChar *name;
4533
    const htmlElemDesc * info;
4534
    htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4535
    int failed;
4536

4537
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4538
	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4539
		     "htmlParseElementInternal: context error\n", NULL, NULL);
4540
	return;
4541
    }
4542

4543
    if (ctxt->instate == XML_PARSER_EOF)
4544
        return;
4545

4546
    /* Capture start position */
4547
    if (ctxt->record_info) {
4548
        node_info.begin_pos = ctxt->input->consumed +
4549
                          (CUR_PTR - ctxt->input->base);
4550
	node_info.begin_line = ctxt->input->line;
4551
    }
4552

4553
    failed = htmlParseStartTag(ctxt);
4554
    name = ctxt->name;
4555
    if ((failed == -1) || (name == NULL)) {
4556
	if (CUR == '>')
4557
	    NEXT;
4558
        return;
4559
    }
4560

4561
    /*
4562
     * Lookup the info for that element.
4563
     */
4564
    info = htmlTagLookup(name);
4565
    if (info == NULL) {
4566
	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4567
	             "Tag %s invalid\n", name, NULL);
4568
    }
4569

4570
    /*
4571
     * Check for an Empty Element labeled the XML/SGML way
4572
     */
4573
    if ((CUR == '/') && (NXT(1) == '>')) {
4574
        SKIP(2);
4575
	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4576
	    ctxt->sax->endElement(ctxt->userData, name);
4577
	htmlnamePop(ctxt);
4578
	return;
4579
    }
4580

4581
    if (CUR == '>') {
4582
        NEXT;
4583
    } else {
4584
	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4585
	             "Couldn't find end of Start Tag %s\n", name, NULL);
4586

4587
	/*
4588
	 * end of parsing of this node.
4589
	 */
4590
	if (xmlStrEqual(name, ctxt->name)) {
4591
	    nodePop(ctxt);
4592
	    htmlnamePop(ctxt);
4593
	}
4594

4595
        if (ctxt->record_info)
4596
            htmlNodeInfoPush(ctxt, &node_info);
4597
        htmlParserFinishElementParsing(ctxt);
4598
	return;
4599
    }
4600

4601
    /*
4602
     * Check for an Empty Element from DTD definition
4603
     */
4604
    if ((info != NULL) && (info->empty)) {
4605
	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4606
	    ctxt->sax->endElement(ctxt->userData, name);
4607
	htmlnamePop(ctxt);
4608
	return;
4609
    }
4610

4611
    if (ctxt->record_info)
4612
        htmlNodeInfoPush(ctxt, &node_info);
4613
}
4614

4615
/**
4616
 * htmlParseContentInternal:
4617
 * @ctxt:  an HTML parser context
4618
 *
4619
 * Parse a content: comment, sub-element, reference or text.
4620
 * New version for non recursive htmlParseElementInternal
4621
 */
4622

4623
static void
4624
htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4625
    xmlChar *currentNode;
4626
    int depth;
4627
    const xmlChar *name;
4628

4629
    depth = ctxt->nameNr;
4630
    if (depth <= 0) {
4631
        currentNode = NULL;
4632
    } else {
4633
        currentNode = xmlStrdup(ctxt->name);
4634
        if (currentNode == NULL) {
4635
            htmlErrMemory(ctxt, NULL);
4636
            return;
4637
        }
4638
    }
4639
    while (1) {
4640
        GROW;
4641

4642
        if (ctxt->instate == XML_PARSER_EOF)
4643
            break;
4644

4645
	/*
4646
	 * Our tag or one of it's parent or children is ending.
4647
	 */
4648
        if ((CUR == '<') && (NXT(1) == '/')) {
4649
	    if (htmlParseEndTag(ctxt) &&
4650
		((currentNode != NULL) || (ctxt->nameNr == 0))) {
4651
		if (currentNode != NULL)
4652
		    xmlFree(currentNode);
4653

4654
	        depth = ctxt->nameNr;
4655
                if (depth <= 0) {
4656
                    currentNode = NULL;
4657
                } else {
4658
                    currentNode = xmlStrdup(ctxt->name);
4659
                    if (currentNode == NULL) {
4660
                        htmlErrMemory(ctxt, NULL);
4661
                        break;
4662
                    }
4663
                }
4664
	    }
4665
	    continue; /* while */
4666
        }
4667

4668
	else if ((CUR == '<') &&
4669
	         ((IS_ASCII_LETTER(NXT(1))) ||
4670
		  (NXT(1) == '_') || (NXT(1) == ':'))) {
4671
	    name = htmlParseHTMLName_nonInvasive(ctxt);
4672
	    if (name == NULL) {
4673
	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4674
			 "htmlParseStartTag: invalid element name\n",
4675
			 NULL, NULL);
4676
	        /* Dump the bogus tag like browsers do */
4677
	        while ((CUR == 0) && (CUR != '>'))
4678
	            NEXT;
4679

4680
	        htmlParserFinishElementParsing(ctxt);
4681
	        if (currentNode != NULL)
4682
	            xmlFree(currentNode);
4683

4684
	        currentNode = xmlStrdup(ctxt->name);
4685
                if (currentNode == NULL) {
4686
                    htmlErrMemory(ctxt, NULL);
4687
                    break;
4688
                }
4689
	        depth = ctxt->nameNr;
4690
	        continue;
4691
	    }
4692

4693
	    if (ctxt->name != NULL) {
4694
	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4695
	            htmlAutoClose(ctxt, name);
4696
	            continue;
4697
	        }
4698
	    }
4699
	}
4700

4701
	/*
4702
	 * Has this node been popped out during parsing of
4703
	 * the next element
4704
	 */
4705
        if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4706
	    (!xmlStrEqual(currentNode, ctxt->name)))
4707
	     {
4708
	    htmlParserFinishElementParsing(ctxt);
4709
	    if (currentNode != NULL) xmlFree(currentNode);
4710

4711
	    currentNode = xmlStrdup(ctxt->name);
4712
            if (currentNode == NULL) {
4713
                htmlErrMemory(ctxt, NULL);
4714
                break;
4715
            }
4716
	    depth = ctxt->nameNr;
4717
	    continue;
4718
	}
4719

4720
	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4721
	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4722
	    /*
4723
	     * Handle SCRIPT/STYLE separately
4724
	     */
4725
	    htmlParseScript(ctxt);
4726
	}
4727

4728
        else if ((CUR == '<') && (NXT(1) == '!')) {
4729
            /*
4730
             * Sometimes DOCTYPE arrives in the middle of the document
4731
             */
4732
            if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4733
                (UPP(4) == 'C') && (UPP(5) == 'T') &&
4734
                (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4735
                (UPP(8) == 'E')) {
4736
                htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4737
                             "Misplaced DOCTYPE declaration\n",
4738
                             BAD_CAST "DOCTYPE" , NULL);
4739
                htmlParseDocTypeDecl(ctxt);
4740
            }
4741
            /*
4742
             * First case :  a comment
4743
             */
4744
            else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4745
                htmlParseComment(ctxt);
4746
            }
4747
            else {
4748
                htmlSkipBogusComment(ctxt);
4749
            }
4750
        }
4751

4752
        /*
4753
         * Second case : a Processing Instruction.
4754
         */
4755
        else if ((CUR == '<') && (NXT(1) == '?')) {
4756
            htmlParsePI(ctxt);
4757
        }
4758

4759
        /*
4760
         * Third case :  a sub-element.
4761
         */
4762
        else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4763
            htmlParseElementInternal(ctxt);
4764
            if (currentNode != NULL) xmlFree(currentNode);
4765

4766
            currentNode = xmlStrdup(ctxt->name);
4767
            if (currentNode == NULL) {
4768
                htmlErrMemory(ctxt, NULL);
4769
                break;
4770
            }
4771
            depth = ctxt->nameNr;
4772
        }
4773
        else if (CUR == '<') {
4774
            if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4775
                (ctxt->sax->characters != NULL))
4776
                ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4777
            NEXT;
4778
        }
4779

4780
        /*
4781
         * Fourth case : a reference. If if has not been resolved,
4782
         *    parsing returns it's Name, create the node
4783
         */
4784
        else if (CUR == '&') {
4785
            htmlParseReference(ctxt);
4786
        }
4787

4788
        /*
4789
         * Fifth case : end of the resource
4790
         */
4791
        else if (CUR == 0) {
4792
            htmlAutoCloseOnEnd(ctxt);
4793
            break;
4794
        }
4795

4796
        /*
4797
         * Last case, text. Note that References are handled directly.
4798
         */
4799
        else {
4800
            htmlParseCharData(ctxt);
4801
        }
4802

4803
        SHRINK;
4804
        GROW;
4805
    }
4806
    if (currentNode != NULL) xmlFree(currentNode);
4807
}
4808

4809
/**
4810
 * htmlParseContent:
4811
 * @ctxt:  an HTML parser context
4812
 *
4813
 * Parse a content: comment, sub-element, reference or text.
4814
 * This is the entry point when called from parser.c
4815
 */
4816

4817
void
4818
__htmlParseContent(void *ctxt) {
4819
    if (ctxt != NULL)
4820
	htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4821
}
4822

4823
/**
4824
 * htmlParseDocument:
4825
 * @ctxt:  an HTML parser context
4826
 *
4827
 * parse an HTML document (and build a tree if using the standard SAX
4828
 * interface).
4829
 *
4830
 * Returns 0, -1 in case of error. the parser context is augmented
4831
 *                as a result of the parsing.
4832
 */
4833

4834
int
4835
htmlParseDocument(htmlParserCtxtPtr ctxt) {
4836
    xmlDtdPtr dtd;
4837

4838
    xmlInitParser();
4839

4840
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4841
	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4842
		     "htmlParseDocument: context error\n", NULL, NULL);
4843
	return(XML_ERR_INTERNAL_ERROR);
4844
    }
4845

4846
    /*
4847
     * SAX: beginning of the document processing.
4848
     */
4849
    if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4850
        ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4851

4852
    xmlDetectEncoding(ctxt);
4853

4854
    /*
4855
     * This is wrong but matches long-standing behavior. In most cases,
4856
     * a document starting with an XML declaration will specify UTF-8.
4857
     */
4858
    if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
4859
        (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0))
4860
        xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
4861

4862
    /*
4863
     * Wipe out everything which is before the first '<'
4864
     */
4865
    SKIP_BLANKS;
4866
    if (CUR == 0) {
4867
	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4868
	             "Document is empty\n", NULL, NULL);
4869
    }
4870

4871
    if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4872
	ctxt->sax->startDocument(ctxt->userData);
4873

4874

4875
    /*
4876
     * Parse possible comments and PIs before any content
4877
     */
4878
    while (((CUR == '<') && (NXT(1) == '!') &&
4879
            (NXT(2) == '-') && (NXT(3) == '-')) ||
4880
	   ((CUR == '<') && (NXT(1) == '?'))) {
4881
        htmlParseComment(ctxt);
4882
        htmlParsePI(ctxt);
4883
	SKIP_BLANKS;
4884
    }
4885

4886

4887
    /*
4888
     * Then possibly doc type declaration(s) and more Misc
4889
     * (doctypedecl Misc*)?
4890
     */
4891
    if ((CUR == '<') && (NXT(1) == '!') &&
4892
	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4893
	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4894
	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4895
	(UPP(8) == 'E')) {
4896
	htmlParseDocTypeDecl(ctxt);
4897
    }
4898
    SKIP_BLANKS;
4899

4900
    /*
4901
     * Parse possible comments and PIs before any content
4902
     */
4903
    while (((CUR == '<') && (NXT(1) == '!') &&
4904
            (NXT(2) == '-') && (NXT(3) == '-')) ||
4905
	   ((CUR == '<') && (NXT(1) == '?'))) {
4906
        htmlParseComment(ctxt);
4907
        htmlParsePI(ctxt);
4908
	SKIP_BLANKS;
4909
    }
4910

4911
    /*
4912
     * Time to start parsing the tree itself
4913
     */
4914
    htmlParseContentInternal(ctxt);
4915

4916
    /*
4917
     * autoclose
4918
     */
4919
    if (CUR == 0)
4920
	htmlAutoCloseOnEnd(ctxt);
4921

4922

4923
    /*
4924
     * SAX: end of the document processing.
4925
     */
4926
    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4927
        ctxt->sax->endDocument(ctxt->userData);
4928

4929
    if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4930
	dtd = xmlGetIntSubset(ctxt->myDoc);
4931
	if (dtd == NULL)
4932
	    ctxt->myDoc->intSubset =
4933
		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4934
		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4935
		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4936
    }
4937
    if (! ctxt->wellFormed) return(-1);
4938
    return(0);
4939
}
4940

4941

4942
/************************************************************************
4943
 *									*
4944
 *			Parser contexts handling			*
4945
 *									*
4946
 ************************************************************************/
4947

4948
/**
4949
 * htmlInitParserCtxt:
4950
 * @ctxt:  an HTML parser context
4951
 * @sax:  SAX handler
4952
 * @userData:  user data
4953
 *
4954
 * Initialize a parser context
4955
 *
4956
 * Returns 0 in case of success and -1 in case of error
4957
 */
4958

4959
static int
4960
htmlInitParserCtxt(htmlParserCtxtPtr ctxt, const htmlSAXHandler *sax,
4961
                   void *userData)
4962
{
4963
    if (ctxt == NULL) return(-1);
4964
    memset(ctxt, 0, sizeof(htmlParserCtxt));
4965

4966
    ctxt->dict = xmlDictCreate();
4967
    if (ctxt->dict == NULL) {
4968
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4969
	return(-1);
4970
    }
4971

4972
    if (ctxt->sax == NULL)
4973
        ctxt->sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4974
    if (ctxt->sax == NULL) {
4975
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4976
	return(-1);
4977
    }
4978
    if (sax == NULL) {
4979
        memset(ctxt->sax, 0, sizeof(htmlSAXHandler));
4980
        xmlSAX2InitHtmlDefaultSAXHandler(ctxt->sax);
4981
        ctxt->userData = ctxt;
4982
    } else {
4983
        memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4984
        ctxt->userData = userData ? userData : ctxt;
4985
    }
4986

4987
    /* Allocate the Input stack */
4988
    ctxt->inputTab = (htmlParserInputPtr *)
4989
                      xmlMalloc(5 * sizeof(htmlParserInputPtr));
4990
    if (ctxt->inputTab == NULL) {
4991
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4992
	ctxt->inputNr = 0;
4993
	ctxt->inputMax = 0;
4994
	ctxt->input = NULL;
4995
	return(-1);
4996
    }
4997
    ctxt->inputNr = 0;
4998
    ctxt->inputMax = 5;
4999
    ctxt->input = NULL;
5000
    ctxt->version = NULL;
5001
    ctxt->encoding = NULL;
5002
    ctxt->standalone = -1;
5003
    ctxt->instate = XML_PARSER_START;
5004

5005
    /* Allocate the Node stack */
5006
    ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
5007
    if (ctxt->nodeTab == NULL) {
5008
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5009
	ctxt->nodeNr = 0;
5010
	ctxt->nodeMax = 0;
5011
	ctxt->node = NULL;
5012
	ctxt->inputNr = 0;
5013
	ctxt->inputMax = 0;
5014
	ctxt->input = NULL;
5015
	return(-1);
5016
    }
5017
    ctxt->nodeNr = 0;
5018
    ctxt->nodeMax = 10;
5019
    ctxt->node = NULL;
5020

5021
    /* Allocate the Name stack */
5022
    ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
5023
    if (ctxt->nameTab == NULL) {
5024
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5025
	ctxt->nameNr = 0;
5026
	ctxt->nameMax = 0;
5027
	ctxt->name = NULL;
5028
	ctxt->nodeNr = 0;
5029
	ctxt->nodeMax = 0;
5030
	ctxt->node = NULL;
5031
	ctxt->inputNr = 0;
5032
	ctxt->inputMax = 0;
5033
	ctxt->input = NULL;
5034
	return(-1);
5035
    }
5036
    ctxt->nameNr = 0;
5037
    ctxt->nameMax = 10;
5038
    ctxt->name = NULL;
5039

5040
    ctxt->nodeInfoTab = NULL;
5041
    ctxt->nodeInfoNr  = 0;
5042
    ctxt->nodeInfoMax = 0;
5043

5044
    ctxt->myDoc = NULL;
5045
    ctxt->wellFormed = 1;
5046
    ctxt->replaceEntities = 0;
5047
    ctxt->linenumbers = xmlLineNumbersDefaultValue;
5048
    ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
5049
    ctxt->html = 1;
5050
    ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
5051
    ctxt->vctxt.userData = ctxt;
5052
    ctxt->vctxt.error = xmlParserValidityError;
5053
    ctxt->vctxt.warning = xmlParserValidityWarning;
5054
    ctxt->record_info = 0;
5055
    ctxt->validate = 0;
5056
    ctxt->checkIndex = 0;
5057
    ctxt->catalogs = NULL;
5058
    xmlInitNodeInfoSeq(&ctxt->node_seq);
5059
    return(0);
5060
}
5061

5062
/**
5063
 * htmlFreeParserCtxt:
5064
 * @ctxt:  an HTML parser context
5065
 *
5066
 * Free all the memory used by a parser context. However the parsed
5067
 * document in ctxt->myDoc is not freed.
5068
 */
5069

5070
void
5071
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
5072
{
5073
    xmlFreeParserCtxt(ctxt);
5074
}
5075

5076
/**
5077
 * htmlNewParserCtxt:
5078
 *
5079
 * Allocate and initialize a new parser context.
5080
 *
5081
 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5082
 */
5083

5084
htmlParserCtxtPtr
5085
htmlNewParserCtxt(void)
5086
{
5087
    return(htmlNewSAXParserCtxt(NULL, NULL));
5088
}
5089

5090
/**
5091
 * htmlNewSAXParserCtxt:
5092
 * @sax:  SAX handler
5093
 * @userData:  user data
5094
 *
5095
 * Allocate and initialize a new SAX parser context. If userData is NULL,
5096
 * the parser context will be passed as user data.
5097
 *
5098
 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5099
 */
5100

5101
htmlParserCtxtPtr
5102
htmlNewSAXParserCtxt(const htmlSAXHandler *sax, void *userData)
5103
{
5104
    xmlParserCtxtPtr ctxt;
5105

5106
    ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5107
    if (ctxt == NULL) {
5108
        htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
5109
	return(NULL);
5110
    }
5111
    memset(ctxt, 0, sizeof(xmlParserCtxt));
5112
    if (htmlInitParserCtxt(ctxt, sax, userData) < 0) {
5113
        htmlFreeParserCtxt(ctxt);
5114
	return(NULL);
5115
    }
5116
    return(ctxt);
5117
}
5118

5119
/**
5120
 * htmlCreateMemoryParserCtxt:
5121
 * @buffer:  a pointer to a char array
5122
 * @size:  the size of the array
5123
 *
5124
 * Create a parser context for an HTML in-memory document.
5125
 *
5126
 * Returns the new parser context or NULL
5127
 */
5128
htmlParserCtxtPtr
5129
htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5130
    xmlParserCtxtPtr ctxt;
5131
    xmlParserInputPtr input;
5132
    xmlParserInputBufferPtr buf;
5133

5134
    if (buffer == NULL)
5135
	return(NULL);
5136
    if (size <= 0)
5137
	return(NULL);
5138

5139
    ctxt = htmlNewParserCtxt();
5140
    if (ctxt == NULL)
5141
	return(NULL);
5142

5143
    buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5144
    if (buf == NULL) {
5145
	xmlFreeParserCtxt(ctxt);
5146
        return(NULL);
5147
    }
5148

5149
    input = xmlNewInputStream(ctxt);
5150
    if (input == NULL) {
5151
	xmlFreeParserInputBuffer(buf);
5152
	xmlFreeParserCtxt(ctxt);
5153
	return(NULL);
5154
    }
5155

5156
    input->filename = NULL;
5157
    input->buf = buf;
5158
    xmlBufResetInput(buf->buffer, input);
5159

5160
    inputPush(ctxt, input);
5161
    return(ctxt);
5162
}
5163

5164
/**
5165
 * htmlCreateDocParserCtxt:
5166
 * @str:  a pointer to an array of xmlChar
5167
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
5168
 *
5169
 * Create a parser context for an HTML document.
5170
 *
5171
 * TODO: check the need to add encoding handling there
5172
 *
5173
 * Returns the new parser context or NULL
5174
 */
5175
static htmlParserCtxtPtr
5176
htmlCreateDocParserCtxt(const xmlChar *str, const char *encoding) {
5177
    xmlParserCtxtPtr ctxt;
5178
    xmlParserInputPtr input;
5179
    xmlParserInputBufferPtr buf;
5180

5181
    if (str == NULL)
5182
	return(NULL);
5183

5184
    ctxt = htmlNewParserCtxt();
5185
    if (ctxt == NULL)
5186
	return(NULL);
5187

5188
    buf = xmlParserInputBufferCreateString(str);
5189
    if (buf == NULL) {
5190
	xmlFreeParserCtxt(ctxt);
5191
        return(NULL);
5192
    }
5193

5194
    input = xmlNewInputStream(ctxt);
5195
    if (input == NULL) {
5196
	xmlFreeParserInputBuffer(buf);
5197
	xmlFreeParserCtxt(ctxt);
5198
	return(NULL);
5199
    }
5200

5201
    input->filename = NULL;
5202
    input->buf = buf;
5203
    xmlBufResetInput(buf->buffer, input);
5204

5205
    inputPush(ctxt, input);
5206

5207
    if (encoding != NULL) {
5208
	xmlCharEncoding enc;
5209
	xmlCharEncodingHandlerPtr handler;
5210

5211
	enc = xmlParseCharEncoding(encoding);
5212
	/*
5213
	 * registered set of known encodings
5214
	 */
5215
	if (enc != XML_CHAR_ENCODING_ERROR) {
5216
	    xmlSwitchEncoding(ctxt, enc);
5217
	    if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5218
		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5219
		             "Unsupported encoding %s\n",
5220
			     (const xmlChar *) encoding, NULL);
5221
	    }
5222
	} else {
5223
	    /*
5224
	     * fallback for unknown encodings
5225
	     */
5226
	    handler = xmlFindCharEncodingHandler((const char *) encoding);
5227
	    if (handler != NULL) {
5228
		xmlSwitchToEncoding(ctxt, handler);
5229
	    } else {
5230
		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5231
		             "Unsupported encoding %s\n",
5232
			     (const xmlChar *) encoding, NULL);
5233
	    }
5234
	}
5235
    }
5236

5237
    return(ctxt);
5238
}
5239

5240
#ifdef LIBXML_PUSH_ENABLED
5241
/************************************************************************
5242
 *									*
5243
 *	Progressive parsing interfaces				*
5244
 *									*
5245
 ************************************************************************/
5246

5247
/**
5248
 * htmlParseLookupSequence:
5249
 * @ctxt:  an HTML parser context
5250
 * @first:  the first char to lookup
5251
 * @next:  the next char to lookup or zero
5252
 * @third:  the next char to lookup or zero
5253
 * @ignoreattrval: skip over attribute values
5254
 *
5255
 * Try to find if a sequence (first, next, third) or  just (first next) or
5256
 * (first) is available in the input stream.
5257
 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5258
 * to avoid rescanning sequences of bytes, it DOES change the state of the
5259
 * parser, do not use liberally.
5260
 * This is basically similar to xmlParseLookupSequence()
5261
 *
5262
 * Returns the index to the current parsing point if the full sequence
5263
 *      is available, -1 otherwise.
5264
 */
5265
static int
5266
htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5267
                        xmlChar next, xmlChar third, int ignoreattrval)
5268
{
5269
    size_t base, len;
5270
    htmlParserInputPtr in;
5271
    const xmlChar *buf;
5272
    int quote;
5273

5274
    in = ctxt->input;
5275
    if (in == NULL)
5276
        return (-1);
5277

5278
    base = ctxt->checkIndex;
5279
    quote = ctxt->endCheckState;
5280

5281
    buf = in->cur;
5282
    len = in->end - in->cur;
5283

5284
    /* take into account the sequence length */
5285
    if (third)
5286
        len -= 2;
5287
    else if (next)
5288
        len--;
5289
    for (; base < len; base++) {
5290
        if (base >= INT_MAX / 2) {
5291
            ctxt->checkIndex = 0;
5292
            ctxt->endCheckState = 0;
5293
            return (base - 2);
5294
        }
5295
        if (ignoreattrval) {
5296
            if (quote) {
5297
                if (buf[base] == quote)
5298
                    quote = 0;
5299
                continue;
5300
            }
5301
            if (buf[base] == '"' || buf[base] == '\'') {
5302
                quote = buf[base];
5303
                continue;
5304
            }
5305
        }
5306
        if (buf[base] == first) {
5307
            if (third != 0) {
5308
                if ((buf[base + 1] != next) || (buf[base + 2] != third))
5309
                    continue;
5310
            } else if (next != 0) {
5311
                if (buf[base + 1] != next)
5312
                    continue;
5313
            }
5314
            ctxt->checkIndex = 0;
5315
            ctxt->endCheckState = 0;
5316
            return (base);
5317
        }
5318
    }
5319
    ctxt->checkIndex = base;
5320
    ctxt->endCheckState = quote;
5321
    return (-1);
5322
}
5323

5324
/**
5325
 * htmlParseLookupCommentEnd:
5326
 * @ctxt: an HTML parser context
5327
 *
5328
 * Try to find a comment end tag in the input stream
5329
 * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5330
 * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5331
 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5332
 * to avoid rescanning sequences of bytes, it DOES change the state of the
5333
 * parser, do not use liberally.
5334
 * This wraps to htmlParseLookupSequence()
5335
 *
5336
 * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5337
 */
5338
static int
5339
htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5340
{
5341
    int mark = 0;
5342
    int offset;
5343

5344
    while (1) {
5345
	mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5346
	if (mark < 0)
5347
            break;
5348
        if ((NXT(mark+2) == '>') ||
5349
	    ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5350
            ctxt->checkIndex = 0;
5351
	    break;
5352
	}
5353
        offset = (NXT(mark+2) == '!') ? 3 : 2;
5354
        if (mark + offset >= ctxt->input->end - ctxt->input->cur) {
5355
	    ctxt->checkIndex = mark;
5356
            return(-1);
5357
        }
5358
	ctxt->checkIndex = mark + 1;
5359
    }
5360
    return mark;
5361
}
5362

5363

5364
/**
5365
 * htmlParseTryOrFinish:
5366
 * @ctxt:  an HTML parser context
5367
 * @terminate:  last chunk indicator
5368
 *
5369
 * Try to progress on parsing
5370
 *
5371
 * Returns zero if no parsing was possible
5372
 */
5373
static int
5374
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5375
    int ret = 0;
5376
    htmlParserInputPtr in;
5377
    ptrdiff_t avail = 0;
5378
    xmlChar cur, next;
5379

5380
    htmlParserNodeInfo node_info;
5381

5382
    while (1) {
5383

5384
	in = ctxt->input;
5385
	if (in == NULL) break;
5386
	avail = in->end - in->cur;
5387
	if ((avail == 0) && (terminate)) {
5388
	    htmlAutoCloseOnEnd(ctxt);
5389
	    if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5390
		/*
5391
		 * SAX: end of the document processing.
5392
		 */
5393
		ctxt->instate = XML_PARSER_EOF;
5394
		if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5395
		    ctxt->sax->endDocument(ctxt->userData);
5396
	    }
5397
	}
5398
        if (avail < 1)
5399
	    goto done;
5400
        /*
5401
         * This is done to make progress and avoid an infinite loop
5402
         * if a parsing attempt was aborted by hitting a NUL byte. After
5403
         * changing htmlCurrentChar, this probably isn't necessary anymore.
5404
         * We should consider removing this check.
5405
         */
5406
	cur = in->cur[0];
5407
	if (cur == 0) {
5408
	    SKIP(1);
5409
	    continue;
5410
	}
5411

5412
        switch (ctxt->instate) {
5413
            case XML_PARSER_EOF:
5414
	        /*
5415
		 * Document parsing is done !
5416
		 */
5417
	        goto done;
5418
            case XML_PARSER_START:
5419
                /*
5420
                 * This is wrong but matches long-standing behavior. In most
5421
                 * cases, a document starting with an XML declaration will
5422
                 * specify UTF-8.
5423
                 */
5424
                if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
5425
                    (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0)) {
5426
                    xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
5427
                }
5428

5429
	        /*
5430
		 * Very first chars read from the document flow.
5431
		 */
5432
		cur = in->cur[0];
5433
		if (IS_BLANK_CH(cur)) {
5434
		    SKIP_BLANKS;
5435
                    avail = in->end - in->cur;
5436
		}
5437
		if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5438
		    ctxt->sax->setDocumentLocator(ctxt->userData,
5439
						  &xmlDefaultSAXLocator);
5440
		if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5441
	            (!ctxt->disableSAX))
5442
		    ctxt->sax->startDocument(ctxt->userData);
5443
                if (ctxt->instate == XML_PARSER_EOF)
5444
                    goto done;
5445

5446
		cur = in->cur[0];
5447
		next = in->cur[1];
5448
		if ((cur == '<') && (next == '!') &&
5449
		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
5450
		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
5451
		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5452
		    (UPP(8) == 'E')) {
5453
		    if ((!terminate) &&
5454
		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5455
			goto done;
5456
		    htmlParseDocTypeDecl(ctxt);
5457
                    if (ctxt->instate == XML_PARSER_EOF)
5458
                        goto done;
5459
		    ctxt->instate = XML_PARSER_PROLOG;
5460
                } else {
5461
		    ctxt->instate = XML_PARSER_MISC;
5462
		}
5463
		break;
5464
            case XML_PARSER_MISC:
5465
		SKIP_BLANKS;
5466
                avail = in->end - in->cur;
5467
		/*
5468
		 * no chars in buffer
5469
		 */
5470
		if (avail < 1)
5471
		    goto done;
5472
		/*
5473
		 * not enough chars in buffer
5474
		 */
5475
		if (avail < 2) {
5476
		    if (!terminate)
5477
			goto done;
5478
		    else
5479
			next = ' ';
5480
		} else {
5481
		    next = in->cur[1];
5482
		}
5483
		cur = in->cur[0];
5484
	        if ((cur == '<') && (next == '!') &&
5485
		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5486
		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5487
			goto done;
5488
		    htmlParseComment(ctxt);
5489
                    if (ctxt->instate == XML_PARSER_EOF)
5490
                        goto done;
5491
		    ctxt->instate = XML_PARSER_MISC;
5492
	        } else if ((cur == '<') && (next == '?')) {
5493
		    if ((!terminate) &&
5494
		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5495
			goto done;
5496
		    htmlParsePI(ctxt);
5497
                    if (ctxt->instate == XML_PARSER_EOF)
5498
                        goto done;
5499
		    ctxt->instate = XML_PARSER_MISC;
5500
		} else if ((cur == '<') && (next == '!') &&
5501
		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
5502
		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
5503
		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5504
		    (UPP(8) == 'E')) {
5505
		    if ((!terminate) &&
5506
		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5507
			goto done;
5508
		    htmlParseDocTypeDecl(ctxt);
5509
                    if (ctxt->instate == XML_PARSER_EOF)
5510
                        goto done;
5511
		    ctxt->instate = XML_PARSER_PROLOG;
5512
		} else if ((cur == '<') && (next == '!') &&
5513
		           (avail < 9)) {
5514
		    goto done;
5515
		} else {
5516
		    ctxt->instate = XML_PARSER_CONTENT;
5517
		}
5518
		break;
5519
            case XML_PARSER_PROLOG:
5520
		SKIP_BLANKS;
5521
                avail = in->end - in->cur;
5522
		if (avail < 2)
5523
		    goto done;
5524
		cur = in->cur[0];
5525
		next = in->cur[1];
5526
		if ((cur == '<') && (next == '!') &&
5527
		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5528
		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5529
			goto done;
5530
		    htmlParseComment(ctxt);
5531
                    if (ctxt->instate == XML_PARSER_EOF)
5532
                        goto done;
5533
		    ctxt->instate = XML_PARSER_PROLOG;
5534
	        } else if ((cur == '<') && (next == '?')) {
5535
		    if ((!terminate) &&
5536
		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5537
			goto done;
5538
		    htmlParsePI(ctxt);
5539
                    if (ctxt->instate == XML_PARSER_EOF)
5540
                        goto done;
5541
		    ctxt->instate = XML_PARSER_PROLOG;
5542
		} else if ((cur == '<') && (next == '!') &&
5543
		           (avail < 4)) {
5544
		    goto done;
5545
		} else {
5546
		    ctxt->instate = XML_PARSER_CONTENT;
5547
		}
5548
		break;
5549
            case XML_PARSER_EPILOG:
5550
                avail = in->end - in->cur;
5551
		if (avail < 1)
5552
		    goto done;
5553
		cur = in->cur[0];
5554
		if (IS_BLANK_CH(cur)) {
5555
		    htmlParseCharData(ctxt);
5556
		    goto done;
5557
		}
5558
		if (avail < 2)
5559
		    goto done;
5560
		next = in->cur[1];
5561
	        if ((cur == '<') && (next == '!') &&
5562
		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5563
		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5564
			goto done;
5565
		    htmlParseComment(ctxt);
5566
                    if (ctxt->instate == XML_PARSER_EOF)
5567
                        goto done;
5568
		    ctxt->instate = XML_PARSER_EPILOG;
5569
	        } else if ((cur == '<') && (next == '?')) {
5570
		    if ((!terminate) &&
5571
		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5572
			goto done;
5573
		    htmlParsePI(ctxt);
5574
                    if (ctxt->instate == XML_PARSER_EOF)
5575
                        goto done;
5576
		    ctxt->instate = XML_PARSER_EPILOG;
5577
		} else if ((cur == '<') && (next == '!') &&
5578
		           (avail < 4)) {
5579
		    goto done;
5580
		} else {
5581
		    ctxt->errNo = XML_ERR_DOCUMENT_END;
5582
		    ctxt->wellFormed = 0;
5583
		    ctxt->instate = XML_PARSER_EOF;
5584
		    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5585
			ctxt->sax->endDocument(ctxt->userData);
5586
		    goto done;
5587
		}
5588
		break;
5589
            case XML_PARSER_START_TAG: {
5590
	        const xmlChar *name;
5591
		int failed;
5592
		const htmlElemDesc * info;
5593

5594
		/*
5595
		 * no chars in buffer
5596
		 */
5597
		if (avail < 1)
5598
		    goto done;
5599
		/*
5600
		 * not enough chars in buffer
5601
		 */
5602
		if (avail < 2) {
5603
		    if (!terminate)
5604
			goto done;
5605
		    else
5606
			next = ' ';
5607
		} else {
5608
		    next = in->cur[1];
5609
		}
5610
		cur = in->cur[0];
5611
	        if (cur != '<') {
5612
		    ctxt->instate = XML_PARSER_CONTENT;
5613
		    break;
5614
		}
5615
		if (next == '/') {
5616
		    ctxt->instate = XML_PARSER_END_TAG;
5617
		    ctxt->checkIndex = 0;
5618
		    break;
5619
		}
5620
		if ((!terminate) &&
5621
		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5622
		    goto done;
5623

5624
                /* Capture start position */
5625
	        if (ctxt->record_info) {
5626
	             node_info.begin_pos = ctxt->input->consumed +
5627
	                                (CUR_PTR - ctxt->input->base);
5628
	             node_info.begin_line = ctxt->input->line;
5629
	        }
5630

5631

5632
		failed = htmlParseStartTag(ctxt);
5633
		name = ctxt->name;
5634
		if ((failed == -1) ||
5635
		    (name == NULL)) {
5636
		    if (CUR == '>')
5637
			NEXT;
5638
		    break;
5639
		}
5640

5641
		/*
5642
		 * Lookup the info for that element.
5643
		 */
5644
		info = htmlTagLookup(name);
5645
		if (info == NULL) {
5646
		    htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5647
		                 "Tag %s invalid\n", name, NULL);
5648
		}
5649

5650
		/*
5651
		 * Check for an Empty Element labeled the XML/SGML way
5652
		 */
5653
		if ((CUR == '/') && (NXT(1) == '>')) {
5654
		    SKIP(2);
5655
		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5656
			ctxt->sax->endElement(ctxt->userData, name);
5657
		    htmlnamePop(ctxt);
5658
                    if (ctxt->instate == XML_PARSER_EOF)
5659
                        goto done;
5660
		    ctxt->instate = XML_PARSER_CONTENT;
5661
		    break;
5662
		}
5663

5664
		if (CUR == '>') {
5665
		    NEXT;
5666
		} else {
5667
		    htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5668
		                 "Couldn't find end of Start Tag %s\n",
5669
				 name, NULL);
5670

5671
		    /*
5672
		     * end of parsing of this node.
5673
		     */
5674
		    if (xmlStrEqual(name, ctxt->name)) {
5675
			nodePop(ctxt);
5676
			htmlnamePop(ctxt);
5677
		    }
5678

5679
		    if (ctxt->record_info)
5680
		        htmlNodeInfoPush(ctxt, &node_info);
5681

5682
                    if (ctxt->instate == XML_PARSER_EOF)
5683
                        goto done;
5684
		    ctxt->instate = XML_PARSER_CONTENT;
5685
		    break;
5686
		}
5687

5688
		/*
5689
		 * Check for an Empty Element from DTD definition
5690
		 */
5691
		if ((info != NULL) && (info->empty)) {
5692
		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5693
			ctxt->sax->endElement(ctxt->userData, name);
5694
		    htmlnamePop(ctxt);
5695
		}
5696

5697
                if (ctxt->record_info)
5698
	            htmlNodeInfoPush(ctxt, &node_info);
5699

5700
                if (ctxt->instate == XML_PARSER_EOF)
5701
                    goto done;
5702
		ctxt->instate = XML_PARSER_CONTENT;
5703
                break;
5704
	    }
5705
            case XML_PARSER_CONTENT: {
5706
		xmlChar chr[2] = { 0, 0 };
5707

5708
                /*
5709
		 * Handle preparsed entities and charRef
5710
		 */
5711
		if (ctxt->token != 0) {
5712
		    chr[0] = ctxt->token;
5713
		    htmlCheckParagraph(ctxt);
5714
		    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5715
			ctxt->sax->characters(ctxt->userData, chr, 1);
5716
		    ctxt->token = 0;
5717
		    ctxt->checkIndex = 0;
5718
		}
5719
		if ((avail == 1) && (terminate)) {
5720
		    cur = in->cur[0];
5721
		    if ((cur != '<') && (cur != '&')) {
5722
			if (ctxt->sax != NULL) {
5723
                            chr[0] = cur;
5724
			    if (IS_BLANK_CH(cur)) {
5725
				if (ctxt->keepBlanks) {
5726
				    if (ctxt->sax->characters != NULL)
5727
					ctxt->sax->characters(
5728
						ctxt->userData, chr, 1);
5729
				} else {
5730
				    if (ctxt->sax->ignorableWhitespace != NULL)
5731
					ctxt->sax->ignorableWhitespace(
5732
						ctxt->userData, chr, 1);
5733
				}
5734
			    } else {
5735
				htmlCheckParagraph(ctxt);
5736
				if (ctxt->sax->characters != NULL)
5737
				    ctxt->sax->characters(
5738
					    ctxt->userData, chr, 1);
5739
			    }
5740
			}
5741
			ctxt->token = 0;
5742
			ctxt->checkIndex = 0;
5743
			in->cur++;
5744
			break;
5745
		    }
5746
		}
5747
		if (avail < 2)
5748
		    goto done;
5749
		cur = in->cur[0];
5750
		next = in->cur[1];
5751
		if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5752
		    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5753
		    /*
5754
		     * Handle SCRIPT/STYLE separately
5755
		     */
5756
		    if (!terminate) {
5757
		        int idx;
5758
			xmlChar val;
5759

5760
			idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5761
			if (idx < 0)
5762
			    goto done;
5763
		        val = in->cur[idx + 2];
5764
			if (val == 0) { /* bad cut of input */
5765
                            /*
5766
                             * FIXME: htmlParseScript checks for additional
5767
                             * characters after '</'.
5768
                             */
5769
                            ctxt->checkIndex = idx;
5770
			    goto done;
5771
                        }
5772
		    }
5773
		    htmlParseScript(ctxt);
5774
                    if (ctxt->instate == XML_PARSER_EOF)
5775
                        goto done;
5776
		    if ((cur == '<') && (next == '/')) {
5777
			ctxt->instate = XML_PARSER_END_TAG;
5778
			ctxt->checkIndex = 0;
5779
			break;
5780
		    }
5781
		} else if ((cur == '<') && (next == '!')) {
5782
                    if (avail < 4)
5783
                        goto done;
5784
                    /*
5785
                     * Sometimes DOCTYPE arrives in the middle of the document
5786
                     */
5787
                    if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
5788
                        (UPP(4) == 'C') && (UPP(5) == 'T') &&
5789
                        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5790
                        (UPP(8) == 'E')) {
5791
                        if ((!terminate) &&
5792
                            (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5793
                            goto done;
5794
                        htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5795
                                     "Misplaced DOCTYPE declaration\n",
5796
                                     BAD_CAST "DOCTYPE" , NULL);
5797
                        htmlParseDocTypeDecl(ctxt);
5798
                    } else if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
5799
                        if ((!terminate) &&
5800
                            (htmlParseLookupCommentEnd(ctxt) < 0))
5801
                            goto done;
5802
                        htmlParseComment(ctxt);
5803
                        if (ctxt->instate == XML_PARSER_EOF)
5804
                            goto done;
5805
                        ctxt->instate = XML_PARSER_CONTENT;
5806
                    } else {
5807
                        if ((!terminate) &&
5808
                            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5809
                            goto done;
5810
                        htmlSkipBogusComment(ctxt);
5811
                    }
5812
                } else if ((cur == '<') && (next == '?')) {
5813
                    if ((!terminate) &&
5814
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5815
                        goto done;
5816
                    htmlParsePI(ctxt);
5817
                    if (ctxt->instate == XML_PARSER_EOF)
5818
                        goto done;
5819
                    ctxt->instate = XML_PARSER_CONTENT;
5820
                } else if ((cur == '<') && (next == '/')) {
5821
                    ctxt->instate = XML_PARSER_END_TAG;
5822
                    ctxt->checkIndex = 0;
5823
                    break;
5824
                } else if ((cur == '<') && IS_ASCII_LETTER(next)) {
5825
                    if ((!terminate) && (next == 0))
5826
                        goto done;
5827
                    ctxt->instate = XML_PARSER_START_TAG;
5828
                    ctxt->checkIndex = 0;
5829
                    break;
5830
                } else if (cur == '<') {
5831
                    if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
5832
                        (ctxt->sax->characters != NULL))
5833
                        ctxt->sax->characters(ctxt->userData,
5834
                                              BAD_CAST "<", 1);
5835
                    NEXT;
5836
                } else {
5837
                    /*
5838
                     * check that the text sequence is complete
5839
                     * before handing out the data to the parser
5840
                     * to avoid problems with erroneous end of
5841
                     * data detection.
5842
                     */
5843
                    if ((!terminate) &&
5844
                        (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
5845
                        goto done;
5846
                    ctxt->checkIndex = 0;
5847
                    while ((ctxt->instate != XML_PARSER_EOF) &&
5848
                           (cur != '<') && (in->cur < in->end)) {
5849
                        if (cur == '&') {
5850
                            htmlParseReference(ctxt);
5851
                        } else {
5852
                            htmlParseCharData(ctxt);
5853
                        }
5854
                        cur = in->cur[0];
5855
                    }
5856
		}
5857

5858
		break;
5859
	    }
5860
            case XML_PARSER_END_TAG:
5861
		if (avail < 2)
5862
		    goto done;
5863
		if ((!terminate) &&
5864
		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5865
		    goto done;
5866
		htmlParseEndTag(ctxt);
5867
                if (ctxt->instate == XML_PARSER_EOF)
5868
                    goto done;
5869
		if (ctxt->nameNr == 0) {
5870
		    ctxt->instate = XML_PARSER_EPILOG;
5871
		} else {
5872
		    ctxt->instate = XML_PARSER_CONTENT;
5873
		}
5874
		ctxt->checkIndex = 0;
5875
	        break;
5876
	    default:
5877
		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5878
			     "HPP: internal error\n", NULL, NULL);
5879
		ctxt->instate = XML_PARSER_EOF;
5880
		break;
5881
	}
5882
    }
5883
done:
5884
    if ((avail == 0) && (terminate)) {
5885
	htmlAutoCloseOnEnd(ctxt);
5886
	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5887
	    /*
5888
	     * SAX: end of the document processing.
5889
	     */
5890
	    ctxt->instate = XML_PARSER_EOF;
5891
	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5892
		ctxt->sax->endDocument(ctxt->userData);
5893
	}
5894
    }
5895
    if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
5896
	((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5897
	 (ctxt->instate == XML_PARSER_EPILOG))) {
5898
	xmlDtdPtr dtd;
5899
	dtd = xmlGetIntSubset(ctxt->myDoc);
5900
	if (dtd == NULL)
5901
	    ctxt->myDoc->intSubset =
5902
		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5903
		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5904
		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5905
    }
5906
    return(ret);
5907
}
5908

5909
/**
5910
 * htmlParseChunk:
5911
 * @ctxt:  an HTML parser context
5912
 * @chunk:  an char array
5913
 * @size:  the size in byte of the chunk
5914
 * @terminate:  last chunk indicator
5915
 *
5916
 * Parse a Chunk of memory
5917
 *
5918
 * Returns zero if no error, the xmlParserErrors otherwise.
5919
 */
5920
int
5921
htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5922
              int terminate) {
5923
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
5924
	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5925
		     "htmlParseChunk: context error\n", NULL, NULL);
5926
	return(XML_ERR_INTERNAL_ERROR);
5927
    }
5928
    if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5929
        (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
5930
	size_t pos = ctxt->input->cur - ctxt->input->base;
5931
	int res;
5932

5933
	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5934
        xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos);
5935
	if (res < 0) {
5936
            htmlParseErr(ctxt, ctxt->input->buf->error,
5937
                         "xmlParserInputBufferPush failed", NULL, NULL);
5938
            xmlHaltParser(ctxt);
5939
	    return (ctxt->errNo);
5940
	}
5941
    }
5942
    htmlParseTryOrFinish(ctxt, terminate);
5943
    if (terminate) {
5944
	if ((ctxt->instate != XML_PARSER_EOF) &&
5945
	    (ctxt->instate != XML_PARSER_EPILOG) &&
5946
	    (ctxt->instate != XML_PARSER_MISC)) {
5947
	    ctxt->errNo = XML_ERR_DOCUMENT_END;
5948
	    ctxt->wellFormed = 0;
5949
	}
5950
	if (ctxt->instate != XML_PARSER_EOF) {
5951
	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5952
		ctxt->sax->endDocument(ctxt->userData);
5953
	}
5954
	ctxt->instate = XML_PARSER_EOF;
5955
    }
5956
    return((xmlParserErrors) ctxt->errNo);
5957
}
5958

5959
/************************************************************************
5960
 *									*
5961
 *			User entry points				*
5962
 *									*
5963
 ************************************************************************/
5964

5965
/**
5966
 * htmlCreatePushParserCtxt:
5967
 * @sax:  a SAX handler
5968
 * @user_data:  The user data returned on SAX callbacks
5969
 * @chunk:  a pointer to an array of chars
5970
 * @size:  number of chars in the array
5971
 * @filename:  an optional file name or URI
5972
 * @enc:  an optional encoding
5973
 *
5974
 * Create a parser context for using the HTML parser in push mode
5975
 * The value of @filename is used for fetching external entities
5976
 * and error/warning reports.
5977
 *
5978
 * Returns the new parser context or NULL
5979
 */
5980
htmlParserCtxtPtr
5981
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5982
                         const char *chunk, int size, const char *filename,
5983
			 xmlCharEncoding enc) {
5984
    htmlParserCtxtPtr ctxt;
5985
    htmlParserInputPtr inputStream;
5986
    xmlParserInputBufferPtr buf;
5987

5988
    xmlInitParser();
5989

5990
    buf = xmlAllocParserInputBuffer(XML_CHAR_ENCODING_NONE);
5991
    if (buf == NULL) return(NULL);
5992

5993
    ctxt = htmlNewSAXParserCtxt(sax, user_data);
5994
    if (ctxt == NULL) {
5995
	xmlFreeParserInputBuffer(buf);
5996
	return(NULL);
5997
    }
5998
    if (filename == NULL) {
5999
	ctxt->directory = NULL;
6000
    } else {
6001
        ctxt->directory = xmlParserGetDirectory(filename);
6002
    }
6003

6004
    inputStream = htmlNewInputStream(ctxt);
6005
    if (inputStream == NULL) {
6006
	xmlFreeParserCtxt(ctxt);
6007
	xmlFreeParserInputBuffer(buf);
6008
	return(NULL);
6009
    }
6010

6011
    if (filename == NULL)
6012
	inputStream->filename = NULL;
6013
    else
6014
	inputStream->filename = (char *)
6015
	    xmlCanonicPath((const xmlChar *) filename);
6016
    inputStream->buf = buf;
6017
    xmlBufResetInput(buf->buffer, inputStream);
6018

6019
    inputPush(ctxt, inputStream);
6020

6021
    if (enc != XML_CHAR_ENCODING_NONE)
6022
        xmlSwitchEncoding(ctxt, enc);
6023

6024
    if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6025
        (ctxt->input->buf != NULL))  {
6026
	size_t pos = ctxt->input->cur - ctxt->input->base;
6027
        int res;
6028

6029
	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6030
        xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos);
6031
        if (res < 0) {
6032
            htmlParseErr(ctxt, ctxt->input->buf->error,
6033
                         "xmlParserInputBufferPush failed\n", NULL, NULL);
6034
            xmlHaltParser(ctxt);
6035
        }
6036
    }
6037
    ctxt->progressive = 1;
6038

6039
    return(ctxt);
6040
}
6041
#endif /* LIBXML_PUSH_ENABLED */
6042

6043
/**
6044
 * htmlSAXParseDoc:
6045
 * @cur:  a pointer to an array of xmlChar
6046
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6047
 * @sax:  the SAX handler block
6048
 * @userData: if using SAX, this pointer will be provided on callbacks.
6049
 *
6050
 * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadDoc.
6051
 *
6052
 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6053
 * to handle parse events. If sax is NULL, fallback to the default DOM
6054
 * behavior and return a tree.
6055
 *
6056
 * Returns the resulting document tree unless SAX is NULL or the document is
6057
 *     not well formed.
6058
 */
6059

6060
htmlDocPtr
6061
htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6062
                htmlSAXHandlerPtr sax, void *userData) {
6063
    htmlDocPtr ret;
6064
    htmlParserCtxtPtr ctxt;
6065

6066
    xmlInitParser();
6067

6068
    if (cur == NULL) return(NULL);
6069

6070

6071
    ctxt = htmlCreateDocParserCtxt(cur, encoding);
6072
    if (ctxt == NULL) return(NULL);
6073
    if (sax != NULL) {
6074
        if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6075
        ctxt->sax = sax;
6076
        ctxt->userData = userData;
6077
    }
6078

6079
    htmlParseDocument(ctxt);
6080
    ret = ctxt->myDoc;
6081
    if (sax != NULL) {
6082
	ctxt->sax = NULL;
6083
	ctxt->userData = NULL;
6084
    }
6085
    htmlFreeParserCtxt(ctxt);
6086

6087
    return(ret);
6088
}
6089

6090
/**
6091
 * htmlParseDoc:
6092
 * @cur:  a pointer to an array of xmlChar
6093
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6094
 *
6095
 * parse an HTML in-memory document and build a tree.
6096
 *
6097
 * Returns the resulting document tree
6098
 */
6099

6100
htmlDocPtr
6101
htmlParseDoc(const xmlChar *cur, const char *encoding) {
6102
    return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6103
}
6104

6105

6106
/**
6107
 * htmlCreateFileParserCtxt:
6108
 * @filename:  the filename
6109
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6110
 *
6111
 * Create a parser context for a file content.
6112
 * Automatic support for ZLIB/Compress compressed document is provided
6113
 * by default if found at compile-time.
6114
 *
6115
 * Returns the new parser context or NULL
6116
 */
6117
htmlParserCtxtPtr
6118
htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6119
{
6120
    htmlParserCtxtPtr ctxt;
6121
    htmlParserInputPtr inputStream;
6122
    char *canonicFilename;
6123

6124
    if (filename == NULL)
6125
        return(NULL);
6126

6127
    ctxt = htmlNewParserCtxt();
6128
    if (ctxt == NULL) {
6129
	return(NULL);
6130
    }
6131
    canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6132
    if (canonicFilename == NULL) {
6133
	xmlFreeParserCtxt(ctxt);
6134
	return(NULL);
6135
    }
6136

6137
    inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6138
    xmlFree(canonicFilename);
6139
    if (inputStream == NULL) {
6140
	xmlFreeParserCtxt(ctxt);
6141
	return(NULL);
6142
    }
6143

6144
    inputPush(ctxt, inputStream);
6145

6146
    /* set encoding */
6147
    if (encoding) {
6148
        xmlCharEncodingHandlerPtr hdlr;
6149

6150
        hdlr = xmlFindCharEncodingHandler(encoding);
6151
        if (hdlr != NULL) {
6152
            xmlSwitchToEncoding(ctxt, hdlr);
6153
        }
6154
    }
6155

6156
    return(ctxt);
6157
}
6158

6159
/**
6160
 * htmlSAXParseFile:
6161
 * @filename:  the filename
6162
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6163
 * @sax:  the SAX handler block
6164
 * @userData: if using SAX, this pointer will be provided on callbacks.
6165
 *
6166
 * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadFile.
6167
 *
6168
 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6169
 * compressed document is provided by default if found at compile-time.
6170
 * It use the given SAX function block to handle the parsing callback.
6171
 * If sax is NULL, fallback to the default DOM tree building routines.
6172
 *
6173
 * Returns the resulting document tree unless SAX is NULL or the document is
6174
 *     not well formed.
6175
 */
6176

6177
htmlDocPtr
6178
htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6179
                 void *userData) {
6180
    htmlDocPtr ret;
6181
    htmlParserCtxtPtr ctxt;
6182
    htmlSAXHandlerPtr oldsax = NULL;
6183

6184
    xmlInitParser();
6185

6186
    ctxt = htmlCreateFileParserCtxt(filename, encoding);
6187
    if (ctxt == NULL) return(NULL);
6188
    if (sax != NULL) {
6189
	oldsax = ctxt->sax;
6190
        ctxt->sax = sax;
6191
        ctxt->userData = userData;
6192
    }
6193

6194
    htmlParseDocument(ctxt);
6195

6196
    ret = ctxt->myDoc;
6197
    if (sax != NULL) {
6198
        ctxt->sax = oldsax;
6199
        ctxt->userData = NULL;
6200
    }
6201
    htmlFreeParserCtxt(ctxt);
6202

6203
    return(ret);
6204
}
6205

6206
/**
6207
 * htmlParseFile:
6208
 * @filename:  the filename
6209
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6210
 *
6211
 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6212
 * compressed document is provided by default if found at compile-time.
6213
 *
6214
 * Returns the resulting document tree
6215
 */
6216

6217
htmlDocPtr
6218
htmlParseFile(const char *filename, const char *encoding) {
6219
    return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6220
}
6221

6222
/**
6223
 * htmlHandleOmittedElem:
6224
 * @val:  int 0 or 1
6225
 *
6226
 * Set and return the previous value for handling HTML omitted tags.
6227
 *
6228
 * Returns the last value for 0 for no handling, 1 for auto insertion.
6229
 */
6230

6231
int
6232
htmlHandleOmittedElem(int val) {
6233
    int old = htmlOmittedDefaultValue;
6234

6235
    htmlOmittedDefaultValue = val;
6236
    return(old);
6237
}
6238

6239
/**
6240
 * htmlElementAllowedHere:
6241
 * @parent: HTML parent element
6242
 * @elt: HTML element
6243
 *
6244
 * Checks whether an HTML element may be a direct child of a parent element.
6245
 * Note - doesn't check for deprecated elements
6246
 *
6247
 * Returns 1 if allowed; 0 otherwise.
6248
 */
6249
int
6250
htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6251
  const char** p ;
6252

6253
  if ( ! elt || ! parent || ! parent->subelts )
6254
	return 0 ;
6255

6256
  for ( p = parent->subelts; *p; ++p )
6257
    if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6258
      return 1 ;
6259

6260
  return 0 ;
6261
}
6262
/**
6263
 * htmlElementStatusHere:
6264
 * @parent: HTML parent element
6265
 * @elt: HTML element
6266
 *
6267
 * Checks whether an HTML element may be a direct child of a parent element.
6268
 * and if so whether it is valid or deprecated.
6269
 *
6270
 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6271
 */
6272
htmlStatus
6273
htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6274
  if ( ! parent || ! elt )
6275
    return HTML_INVALID ;
6276
  if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6277
    return HTML_INVALID ;
6278

6279
  return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6280
}
6281
/**
6282
 * htmlAttrAllowed:
6283
 * @elt: HTML element
6284
 * @attr: HTML attribute
6285
 * @legacy: whether to allow deprecated attributes
6286
 *
6287
 * Checks whether an attribute is valid for an element
6288
 * Has full knowledge of Required and Deprecated attributes
6289
 *
6290
 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6291
 */
6292
htmlStatus
6293
htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6294
  const char** p ;
6295

6296
  if ( !elt || ! attr )
6297
	return HTML_INVALID ;
6298

6299
  if ( elt->attrs_req )
6300
    for ( p = elt->attrs_req; *p; ++p)
6301
      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6302
        return HTML_REQUIRED ;
6303

6304
  if ( elt->attrs_opt )
6305
    for ( p = elt->attrs_opt; *p; ++p)
6306
      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6307
        return HTML_VALID ;
6308

6309
  if ( legacy && elt->attrs_depr )
6310
    for ( p = elt->attrs_depr; *p; ++p)
6311
      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6312
        return HTML_DEPRECATED ;
6313

6314
  return HTML_INVALID ;
6315
}
6316
/**
6317
 * htmlNodeStatus:
6318
 * @node: an htmlNodePtr in a tree
6319
 * @legacy: whether to allow deprecated elements (YES is faster here
6320
 *	for Element nodes)
6321
 *
6322
 * Checks whether the tree node is valid.  Experimental (the author
6323
 *     only uses the HTML enhancements in a SAX parser)
6324
 *
6325
 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6326
 *	legacy allowed) or htmlElementStatusHere (otherwise).
6327
 *	for Attribute nodes, a return from htmlAttrAllowed
6328
 *	for other nodes, HTML_NA (no checks performed)
6329
 */
6330
htmlStatus
6331
htmlNodeStatus(const htmlNodePtr node, int legacy) {
6332
  if ( ! node )
6333
    return HTML_INVALID ;
6334

6335
  switch ( node->type ) {
6336
    case XML_ELEMENT_NODE:
6337
      return legacy
6338
	? ( htmlElementAllowedHere (
6339
		htmlTagLookup(node->parent->name) , node->name
6340
		) ? HTML_VALID : HTML_INVALID )
6341
	: htmlElementStatusHere(
6342
		htmlTagLookup(node->parent->name) ,
6343
		htmlTagLookup(node->name) )
6344
	;
6345
    case XML_ATTRIBUTE_NODE:
6346
      return htmlAttrAllowed(
6347
	htmlTagLookup(node->parent->name) , node->name, legacy) ;
6348
    default: return HTML_NA ;
6349
  }
6350
}
6351
/************************************************************************
6352
 *									*
6353
 *	New set (2.6.0) of simpler and more flexible APIs		*
6354
 *									*
6355
 ************************************************************************/
6356
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is used; a NULL dict means the string is always freed.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement (safe in unbraced if/else); it must be followed by
 * a semicolon. @str is evaluated more than once.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
6367

6368
/**
6369
 * htmlCtxtReset:
6370
 * @ctxt: an HTML parser context
6371
 *
6372
 * Reset a parser context
6373
 */
6374
void
6375
htmlCtxtReset(htmlParserCtxtPtr ctxt)
6376
{
6377
    xmlParserInputPtr input;
6378
    xmlDictPtr dict;
6379

6380
    if (ctxt == NULL)
6381
        return;
6382

6383
    xmlInitParser();
6384
    dict = ctxt->dict;
6385

6386
    while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6387
        xmlFreeInputStream(input);
6388
    }
6389
    ctxt->inputNr = 0;
6390
    ctxt->input = NULL;
6391

6392
    ctxt->spaceNr = 0;
6393
    if (ctxt->spaceTab != NULL) {
6394
	ctxt->spaceTab[0] = -1;
6395
	ctxt->space = &ctxt->spaceTab[0];
6396
    } else {
6397
	ctxt->space = NULL;
6398
    }
6399

6400

6401
    ctxt->nodeNr = 0;
6402
    ctxt->node = NULL;
6403

6404
    ctxt->nameNr = 0;
6405
    ctxt->name = NULL;
6406

6407
    ctxt->nsNr = 0;
6408

6409
    DICT_FREE(ctxt->version);
6410
    ctxt->version = NULL;
6411
    DICT_FREE(ctxt->encoding);
6412
    ctxt->encoding = NULL;
6413
    DICT_FREE(ctxt->directory);
6414
    ctxt->directory = NULL;
6415
    DICT_FREE(ctxt->extSubURI);
6416
    ctxt->extSubURI = NULL;
6417
    DICT_FREE(ctxt->extSubSystem);
6418
    ctxt->extSubSystem = NULL;
6419
    if (ctxt->myDoc != NULL)
6420
        xmlFreeDoc(ctxt->myDoc);
6421
    ctxt->myDoc = NULL;
6422

6423
    ctxt->standalone = -1;
6424
    ctxt->hasExternalSubset = 0;
6425
    ctxt->hasPErefs = 0;
6426
    ctxt->html = 1;
6427
    ctxt->external = 0;
6428
    ctxt->instate = XML_PARSER_START;
6429
    ctxt->token = 0;
6430

6431
    ctxt->wellFormed = 1;
6432
    ctxt->nsWellFormed = 1;
6433
    ctxt->disableSAX = 0;
6434
    ctxt->valid = 1;
6435
    ctxt->vctxt.userData = ctxt;
6436
    ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
6437
    ctxt->vctxt.error = xmlParserValidityError;
6438
    ctxt->vctxt.warning = xmlParserValidityWarning;
6439
    ctxt->record_info = 0;
6440
    ctxt->checkIndex = 0;
6441
    ctxt->endCheckState = 0;
6442
    ctxt->inSubset = 0;
6443
    ctxt->errNo = XML_ERR_OK;
6444
    ctxt->depth = 0;
6445
    ctxt->catalogs = NULL;
6446
    xmlInitNodeInfoSeq(&ctxt->node_seq);
6447

6448
    if (ctxt->attsDefault != NULL) {
6449
        xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6450
        ctxt->attsDefault = NULL;
6451
    }
6452
    if (ctxt->attsSpecial != NULL) {
6453
        xmlHashFree(ctxt->attsSpecial, NULL);
6454
        ctxt->attsSpecial = NULL;
6455
    }
6456

6457
    ctxt->nbErrors = 0;
6458
    ctxt->nbWarnings = 0;
6459
    if (ctxt->lastError.code != XML_ERR_OK)
6460
        xmlResetError(&ctxt->lastError);
6461
}
6462

6463
/**
6464
 * htmlCtxtUseOptions:
6465
 * @ctxt: an HTML parser context
6466
 * @options:  a combination of htmlParserOption(s)
6467
 *
6468
 * Applies the options to the parser context
6469
 *
6470
 * Returns 0 in case of success, the set of unknown or unimplemented options
6471
 *         in case of error.
6472
 */
6473
int
6474
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6475
{
6476
    if (ctxt == NULL)
6477
        return(-1);
6478

6479
    if (options & HTML_PARSE_NOWARNING) {
6480
        ctxt->sax->warning = NULL;
6481
        ctxt->vctxt.warning = NULL;
6482
        options -= XML_PARSE_NOWARNING;
6483
	ctxt->options |= XML_PARSE_NOWARNING;
6484
    }
6485
    if (options & HTML_PARSE_NOERROR) {
6486
        ctxt->sax->error = NULL;
6487
        ctxt->vctxt.error = NULL;
6488
        ctxt->sax->fatalError = NULL;
6489
        options -= XML_PARSE_NOERROR;
6490
	ctxt->options |= XML_PARSE_NOERROR;
6491
    }
6492
    if (options & HTML_PARSE_PEDANTIC) {
6493
        ctxt->pedantic = 1;
6494
        options -= XML_PARSE_PEDANTIC;
6495
	ctxt->options |= XML_PARSE_PEDANTIC;
6496
    } else
6497
        ctxt->pedantic = 0;
6498
    if (options & XML_PARSE_NOBLANKS) {
6499
        ctxt->keepBlanks = 0;
6500
        ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6501
        options -= XML_PARSE_NOBLANKS;
6502
	ctxt->options |= XML_PARSE_NOBLANKS;
6503
    } else
6504
        ctxt->keepBlanks = 1;
6505
    if (options & HTML_PARSE_RECOVER) {
6506
        ctxt->recovery = 1;
6507
	options -= HTML_PARSE_RECOVER;
6508
    } else
6509
        ctxt->recovery = 0;
6510
    if (options & HTML_PARSE_COMPACT) {
6511
	ctxt->options |= HTML_PARSE_COMPACT;
6512
        options -= HTML_PARSE_COMPACT;
6513
    }
6514
    if (options & XML_PARSE_HUGE) {
6515
	ctxt->options |= XML_PARSE_HUGE;
6516
        options -= XML_PARSE_HUGE;
6517
    }
6518
    if (options & HTML_PARSE_NODEFDTD) {
6519
	ctxt->options |= HTML_PARSE_NODEFDTD;
6520
        options -= HTML_PARSE_NODEFDTD;
6521
    }
6522
    if (options & HTML_PARSE_IGNORE_ENC) {
6523
	ctxt->options |= HTML_PARSE_IGNORE_ENC;
6524
        options -= HTML_PARSE_IGNORE_ENC;
6525
    }
6526
    if (options & HTML_PARSE_NOIMPLIED) {
6527
        ctxt->options |= HTML_PARSE_NOIMPLIED;
6528
        options -= HTML_PARSE_NOIMPLIED;
6529
    }
6530
    ctxt->dictNames = 0;
6531
    ctxt->linenumbers = 1;
6532
    return (options);
6533
}
6534

6535
/**
6536
 * htmlDoRead:
6537
 * @ctxt:  an HTML parser context
6538
 * @URL:  the base URL to use for the document
6539
 * @encoding:  the document encoding, or NULL
6540
 * @options:  a combination of htmlParserOption(s)
6541
 * @reuse:  keep the context for reuse
6542
 *
6543
 * Common front-end for the htmlRead functions
6544
 *
6545
 * Returns the resulting document tree or NULL
6546
 */
6547
static htmlDocPtr
6548
htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6549
          int options, int reuse)
6550
{
6551
    htmlDocPtr ret;
6552

6553
    htmlCtxtUseOptions(ctxt, options);
6554
    ctxt->html = 1;
6555
    if (encoding != NULL) {
6556
        xmlCharEncodingHandlerPtr hdlr;
6557

6558
	hdlr = xmlFindCharEncodingHandler(encoding);
6559
	if (hdlr != NULL) {
6560
	    xmlSwitchToEncoding(ctxt, hdlr);
6561
        }
6562
    }
6563
    if ((URL != NULL) && (ctxt->input != NULL) &&
6564
        (ctxt->input->filename == NULL))
6565
        ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6566
    htmlParseDocument(ctxt);
6567
    ret = ctxt->myDoc;
6568
    ctxt->myDoc = NULL;
6569
    if (!reuse) {
6570
        if ((ctxt->dictNames) &&
6571
	    (ret != NULL) &&
6572
	    (ret->dict == ctxt->dict))
6573
	    ctxt->dict = NULL;
6574
	xmlFreeParserCtxt(ctxt);
6575
    }
6576
    return (ret);
6577
}
6578

6579
/**
6580
 * htmlReadDoc:
6581
 * @cur:  a pointer to a zero terminated string
6582
 * @URL:  the base URL to use for the document
6583
 * @encoding:  the document encoding, or NULL
6584
 * @options:  a combination of htmlParserOption(s)
6585
 *
6586
 * parse an XML in-memory document and build a tree.
6587
 *
6588
 * Returns the resulting document tree
6589
 */
6590
htmlDocPtr
6591
htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6592
{
6593
    htmlParserCtxtPtr ctxt;
6594

6595
    if (cur == NULL)
6596
        return (NULL);
6597

6598
    xmlInitParser();
6599
    ctxt = htmlCreateDocParserCtxt(cur, NULL);
6600
    if (ctxt == NULL)
6601
        return (NULL);
6602
    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6603
}
6604

6605
/**
6606
 * htmlReadFile:
6607
 * @filename:  a file or URL
6608
 * @encoding:  the document encoding, or NULL
6609
 * @options:  a combination of htmlParserOption(s)
6610
 *
6611
 * parse an XML file from the filesystem or the network.
6612
 *
6613
 * Returns the resulting document tree
6614
 */
6615
htmlDocPtr
6616
htmlReadFile(const char *filename, const char *encoding, int options)
6617
{
6618
    htmlParserCtxtPtr ctxt;
6619

6620
    xmlInitParser();
6621
    ctxt = htmlCreateFileParserCtxt(filename, encoding);
6622
    if (ctxt == NULL)
6623
        return (NULL);
6624
    return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6625
}
6626

6627
/**
6628
 * htmlReadMemory:
6629
 * @buffer:  a pointer to a char array
6630
 * @size:  the size of the array
6631
 * @URL:  the base URL to use for the document
6632
 * @encoding:  the document encoding, or NULL
6633
 * @options:  a combination of htmlParserOption(s)
6634
 *
6635
 * parse an XML in-memory document and build a tree.
6636
 *
6637
 * Returns the resulting document tree
6638
 */
6639
htmlDocPtr
6640
htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6641
{
6642
    htmlParserCtxtPtr ctxt;
6643

6644
    xmlInitParser();
6645
    ctxt = htmlCreateMemoryParserCtxt(buffer, size);
6646
    if (ctxt == NULL)
6647
        return (NULL);
6648
    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6649
}
6650

6651
/**
6652
 * htmlReadFd:
6653
 * @fd:  an open file descriptor
6654
 * @URL:  the base URL to use for the document
6655
 * @encoding:  the document encoding, or NULL
6656
 * @options:  a combination of htmlParserOption(s)
6657
 *
6658
 * parse an HTML from a file descriptor and build a tree.
6659
 * NOTE that the file descriptor will not be closed when the
6660
 *      reader is closed or reset.
6661
 *
6662
 * Returns the resulting document tree
6663
 */
6664
htmlDocPtr
6665
htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6666
{
6667
    htmlParserCtxtPtr ctxt;
6668
    xmlParserInputBufferPtr input;
6669
    htmlParserInputPtr stream;
6670

6671
    if (fd < 0)
6672
        return (NULL);
6673

6674
    xmlInitParser();
6675
    input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6676
    if (input == NULL)
6677
        return (NULL);
6678
    input->closecallback = NULL;
6679
    ctxt = htmlNewParserCtxt();
6680
    if (ctxt == NULL) {
6681
        xmlFreeParserInputBuffer(input);
6682
        return (NULL);
6683
    }
6684
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6685
    if (stream == NULL) {
6686
        xmlFreeParserInputBuffer(input);
6687
	htmlFreeParserCtxt(ctxt);
6688
        return (NULL);
6689
    }
6690
    inputPush(ctxt, stream);
6691
    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6692
}
6693

6694
/**
6695
 * htmlReadIO:
6696
 * @ioread:  an I/O read function
6697
 * @ioclose:  an I/O close function
6698
 * @ioctx:  an I/O handler
6699
 * @URL:  the base URL to use for the document
6700
 * @encoding:  the document encoding, or NULL
6701
 * @options:  a combination of htmlParserOption(s)
6702
 *
6703
 * parse an HTML document from I/O functions and source and build a tree.
6704
 *
6705
 * Returns the resulting document tree
6706
 */
6707
htmlDocPtr
6708
htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6709
          void *ioctx, const char *URL, const char *encoding, int options)
6710
{
6711
    htmlParserCtxtPtr ctxt;
6712
    xmlParserInputBufferPtr input;
6713
    xmlParserInputPtr stream;
6714

6715
    if (ioread == NULL)
6716
        return (NULL);
6717
    xmlInitParser();
6718

6719
    input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6720
                                         XML_CHAR_ENCODING_NONE);
6721
    if (input == NULL) {
6722
        if (ioclose != NULL)
6723
            ioclose(ioctx);
6724
        return (NULL);
6725
    }
6726
    ctxt = htmlNewParserCtxt();
6727
    if (ctxt == NULL) {
6728
        xmlFreeParserInputBuffer(input);
6729
        return (NULL);
6730
    }
6731
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6732
    if (stream == NULL) {
6733
        xmlFreeParserInputBuffer(input);
6734
	xmlFreeParserCtxt(ctxt);
6735
        return (NULL);
6736
    }
6737
    inputPush(ctxt, stream);
6738
    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6739
}
6740

6741
/**
6742
 * htmlCtxtReadDoc:
6743
 * @ctxt:  an HTML parser context
6744
 * @str:  a pointer to a zero terminated string
6745
 * @URL:  the base URL to use for the document
6746
 * @encoding:  the document encoding, or NULL
6747
 * @options:  a combination of htmlParserOption(s)
6748
 *
6749
 * parse an XML in-memory document and build a tree.
6750
 * This reuses the existing @ctxt parser context
6751
 *
6752
 * Returns the resulting document tree
6753
 */
6754
htmlDocPtr
6755
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar *str,
6756
               const char *URL, const char *encoding, int options)
6757
{
6758
    xmlParserInputBufferPtr input;
6759
    xmlParserInputPtr stream;
6760

6761
    if (ctxt == NULL)
6762
        return (NULL);
6763
    if (str == NULL)
6764
        return (NULL);
6765
    xmlInitParser();
6766

6767
    htmlCtxtReset(ctxt);
6768

6769
    input = xmlParserInputBufferCreateString(str);
6770
    if (input == NULL) {
6771
	return(NULL);
6772
    }
6773

6774
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6775
    if (stream == NULL) {
6776
	xmlFreeParserInputBuffer(input);
6777
	return(NULL);
6778
    }
6779

6780
    inputPush(ctxt, stream);
6781
    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6782
}
6783

6784
/**
6785
 * htmlCtxtReadFile:
6786
 * @ctxt:  an HTML parser context
6787
 * @filename:  a file or URL
6788
 * @encoding:  the document encoding, or NULL
6789
 * @options:  a combination of htmlParserOption(s)
6790
 *
6791
 * parse an XML file from the filesystem or the network.
6792
 * This reuses the existing @ctxt parser context
6793
 *
6794
 * Returns the resulting document tree
6795
 */
6796
htmlDocPtr
6797
htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6798
                const char *encoding, int options)
6799
{
6800
    xmlParserInputPtr stream;
6801

6802
    if (filename == NULL)
6803
        return (NULL);
6804
    if (ctxt == NULL)
6805
        return (NULL);
6806
    xmlInitParser();
6807

6808
    htmlCtxtReset(ctxt);
6809

6810
    stream = xmlLoadExternalEntity(filename, NULL, ctxt);
6811
    if (stream == NULL) {
6812
        return (NULL);
6813
    }
6814
    inputPush(ctxt, stream);
6815
    return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6816
}
6817

6818
/**
6819
 * htmlCtxtReadMemory:
6820
 * @ctxt:  an HTML parser context
6821
 * @buffer:  a pointer to a char array
6822
 * @size:  the size of the array
6823
 * @URL:  the base URL to use for the document
6824
 * @encoding:  the document encoding, or NULL
6825
 * @options:  a combination of htmlParserOption(s)
6826
 *
6827
 * parse an XML in-memory document and build a tree.
6828
 * This reuses the existing @ctxt parser context
6829
 *
6830
 * Returns the resulting document tree
6831
 */
6832
htmlDocPtr
6833
htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6834
                  const char *URL, const char *encoding, int options)
6835
{
6836
    xmlParserInputBufferPtr input;
6837
    xmlParserInputPtr stream;
6838

6839
    if (ctxt == NULL)
6840
        return (NULL);
6841
    if (buffer == NULL)
6842
        return (NULL);
6843
    xmlInitParser();
6844

6845
    htmlCtxtReset(ctxt);
6846

6847
    input = xmlParserInputBufferCreateStatic(buffer, size,
6848
                                             XML_CHAR_ENCODING_NONE);
6849
    if (input == NULL) {
6850
	return(NULL);
6851
    }
6852

6853
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6854
    if (stream == NULL) {
6855
	xmlFreeParserInputBuffer(input);
6856
	return(NULL);
6857
    }
6858

6859
    inputPush(ctxt, stream);
6860
    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6861
}
6862

6863
/**
6864
 * htmlCtxtReadFd:
6865
 * @ctxt:  an HTML parser context
6866
 * @fd:  an open file descriptor
6867
 * @URL:  the base URL to use for the document
6868
 * @encoding:  the document encoding, or NULL
6869
 * @options:  a combination of htmlParserOption(s)
6870
 *
6871
 * parse an XML from a file descriptor and build a tree.
6872
 * This reuses the existing @ctxt parser context
6873
 *
6874
 * Returns the resulting document tree
6875
 */
6876
htmlDocPtr
6877
htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6878
              const char *URL, const char *encoding, int options)
6879
{
6880
    xmlParserInputBufferPtr input;
6881
    xmlParserInputPtr stream;
6882

6883
    if (fd < 0)
6884
        return (NULL);
6885
    if (ctxt == NULL)
6886
        return (NULL);
6887
    xmlInitParser();
6888

6889
    htmlCtxtReset(ctxt);
6890

6891

6892
    input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6893
    if (input == NULL)
6894
        return (NULL);
6895
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6896
    if (stream == NULL) {
6897
        xmlFreeParserInputBuffer(input);
6898
        return (NULL);
6899
    }
6900
    inputPush(ctxt, stream);
6901
    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6902
}
6903

6904
/**
6905
 * htmlCtxtReadIO:
6906
 * @ctxt:  an HTML parser context
6907
 * @ioread:  an I/O read function
6908
 * @ioclose:  an I/O close function
6909
 * @ioctx:  an I/O handler
6910
 * @URL:  the base URL to use for the document
6911
 * @encoding:  the document encoding, or NULL
6912
 * @options:  a combination of htmlParserOption(s)
6913
 *
6914
 * parse an HTML document from I/O functions and source and build a tree.
6915
 * This reuses the existing @ctxt parser context
6916
 *
6917
 * Returns the resulting document tree
6918
 */
6919
htmlDocPtr
6920
htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6921
              xmlInputCloseCallback ioclose, void *ioctx,
6922
	      const char *URL,
6923
              const char *encoding, int options)
6924
{
6925
    xmlParserInputBufferPtr input;
6926
    xmlParserInputPtr stream;
6927

6928
    if (ioread == NULL)
6929
        return (NULL);
6930
    if (ctxt == NULL)
6931
        return (NULL);
6932
    xmlInitParser();
6933

6934
    htmlCtxtReset(ctxt);
6935

6936
    input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6937
                                         XML_CHAR_ENCODING_NONE);
6938
    if (input == NULL) {
6939
        if (ioclose != NULL)
6940
            ioclose(ioctx);
6941
        return (NULL);
6942
    }
6943
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6944
    if (stream == NULL) {
6945
        xmlFreeParserInputBuffer(input);
6946
        return (NULL);
6947
    }
6948
    inputPush(ctxt, stream);
6949
    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6950
}
6951

6952
#endif /* LIBXML_HTML_ENABLED */
6953

6954
Product

Resources

Company