CoCalc -- HTMLtree.c

GitHub Repository: wine-mirror/wine
Path: blob/master/libs/xml2/HTMLtree.c
⁴³⁹³ views
1
/*
2
 * HTMLtree.c : implementation of access function for an HTML tree.
3
 *
4
 * See Copyright for the status of this software.
5
 *
6
 * daniel@veillard.com
7
 */
8

9

10
#define IN_LIBXML
11
#include "libxml.h"
12
#ifdef LIBXML_HTML_ENABLED
13

14
#include <string.h> /* for memset() only ! */
15
#include <ctype.h>
16
#include <stdlib.h>
17

18
#include <libxml/xmlmemory.h>
19
#include <libxml/HTMLparser.h>
20
#include <libxml/HTMLtree.h>
21
#include <libxml/entities.h>
22
#include <libxml/xmlerror.h>
23
#include <libxml/parserInternals.h>
24
#include <libxml/uri.h>
25

26
#include "private/buf.h"
27
#include "private/error.h"
28
#include "private/io.h"
29
#include "private/save.h"
30

31
/************************************************************************
32
 *									*
33
 *		Getting/Setting encoding meta tags			*
34
 *									*
35
 ************************************************************************/
36

37
/**
38
 * htmlGetMetaEncoding:
39
 * @doc:  the document
40
 *
41
 * Encoding definition lookup in the Meta tags
42
 *
43
 * Returns the current encoding as flagged in the HTML source
44
 */
45
const xmlChar *
46
htmlGetMetaEncoding(htmlDocPtr doc) {
47
    htmlNodePtr cur;
48
    const xmlChar *content;
49
    const xmlChar *encoding;
50

51
    if (doc == NULL)
52
	return(NULL);
53
    cur = doc->children;
54

55
    /*
56
     * Search the html
57
     */
58
    while (cur != NULL) {
59
	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
60
	    if (xmlStrEqual(cur->name, BAD_CAST"html"))
61
		break;
62
	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
63
		goto found_head;
64
	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
65
		goto found_meta;
66
	}
67
	cur = cur->next;
68
    }
69
    if (cur == NULL)
70
	return(NULL);
71
    cur = cur->children;
72

73
    /*
74
     * Search the head
75
     */
76
    while (cur != NULL) {
77
	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
78
	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
79
		break;
80
	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
81
		goto found_meta;
82
	}
83
	cur = cur->next;
84
    }
85
    if (cur == NULL)
86
	return(NULL);
87
found_head:
88
    cur = cur->children;
89

90
    /*
91
     * Search the meta elements
92
     */
93
found_meta:
94
    while (cur != NULL) {
95
	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
96
	    if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
97
		xmlAttrPtr attr = cur->properties;
98
		int http;
99
		const xmlChar *value;
100

101
		content = NULL;
102
		http = 0;
103
		while (attr != NULL) {
104
		    if ((attr->children != NULL) &&
105
		        (attr->children->type == XML_TEXT_NODE) &&
106
		        (attr->children->next == NULL)) {
107
			value = attr->children->content;
108
			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
109
			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
110
			    http = 1;
111
			else if ((value != NULL)
112
			 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
113
			    content = value;
114
			if ((http != 0) && (content != NULL))
115
			    goto found_content;
116
		    }
117
		    attr = attr->next;
118
		}
119
	    }
120
	}
121
	cur = cur->next;
122
    }
123
    return(NULL);
124

125
found_content:
126
    encoding = xmlStrstr(content, BAD_CAST"charset=");
127
    if (encoding == NULL)
128
	encoding = xmlStrstr(content, BAD_CAST"Charset=");
129
    if (encoding == NULL)
130
	encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
131
    if (encoding != NULL) {
132
	encoding += 8;
133
    } else {
134
	encoding = xmlStrstr(content, BAD_CAST"charset =");
135
	if (encoding == NULL)
136
	    encoding = xmlStrstr(content, BAD_CAST"Charset =");
137
	if (encoding == NULL)
138
	    encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
139
	if (encoding != NULL)
140
	    encoding += 9;
141
    }
142
    if (encoding != NULL) {
143
	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
144
    }
145
    return(encoding);
146
}
147

148
/**
149
 * htmlSetMetaEncoding:
150
 * @doc:  the document
151
 * @encoding:  the encoding string
152
 *
153
 * Sets the current encoding in the Meta tags
154
 * NOTE: this will not change the document content encoding, just
155
 * the META flag associated.
156
 *
157
 * Returns 0 in case of success and -1 in case of error
158
 */
159
int
160
htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
161
    htmlNodePtr cur, meta = NULL, head = NULL;
162
    const xmlChar *content = NULL;
163
    char newcontent[100];
164

165
    newcontent[0] = 0;
166

167
    if (doc == NULL)
168
	return(-1);
169

170
    /* html isn't a real encoding it's just libxml2 way to get entities */
171
    if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
172
        return(-1);
173

174
    if (encoding != NULL) {
175
	snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
176
                (char *)encoding);
177
	newcontent[sizeof(newcontent) - 1] = 0;
178
    }
179

180
    cur = doc->children;
181

182
    /*
183
     * Search the html
184
     */
185
    while (cur != NULL) {
186
	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
187
	    if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
188
		break;
189
	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
190
		goto found_head;
191
	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
192
		goto found_meta;
193
	}
194
	cur = cur->next;
195
    }
196
    if (cur == NULL)
197
	return(-1);
198
    cur = cur->children;
199

200
    /*
201
     * Search the head
202
     */
203
    while (cur != NULL) {
204
	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
205
	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
206
		break;
207
	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
208
                head = cur->parent;
209
		goto found_meta;
210
            }
211
	}
212
	cur = cur->next;
213
    }
214
    if (cur == NULL)
215
	return(-1);
216
found_head:
217
    head = cur;
218
    if (cur->children == NULL)
219
        goto create;
220
    cur = cur->children;
221

222
found_meta:
223
    /*
224
     * Search and update all the remaining the meta elements carrying
225
     * encoding information
226
     */
227
    while (cur != NULL) {
228
	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
229
	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
230
		xmlAttrPtr attr = cur->properties;
231
		int http;
232
		const xmlChar *value;
233

234
		content = NULL;
235
		http = 0;
236
		while (attr != NULL) {
237
		    if ((attr->children != NULL) &&
238
		        (attr->children->type == XML_TEXT_NODE) &&
239
		        (attr->children->next == NULL)) {
240
			value = attr->children->content;
241
			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
242
			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
243
			    http = 1;
244
			else
245
                        {
246
                           if ((value != NULL) &&
247
                               (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
248
			       content = value;
249
                        }
250
		        if ((http != 0) && (content != NULL))
251
			    break;
252
		    }
253
		    attr = attr->next;
254
		}
255
		if ((http != 0) && (content != NULL)) {
256
		    meta = cur;
257
		    break;
258
		}
259

260
	    }
261
	}
262
	cur = cur->next;
263
    }
264
create:
265
    if (meta == NULL) {
266
        if ((encoding != NULL) && (head != NULL)) {
267
            /*
268
             * Create a new Meta element with the right attributes
269
             */
270

271
            meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
272
            if (head->children == NULL)
273
                xmlAddChild(head, meta);
274
            else
275
                xmlAddPrevSibling(head->children, meta);
276
            xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
277
            xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
278
        }
279
    } else {
280
        /* remove the meta tag if NULL is passed */
281
        if (encoding == NULL) {
282
            xmlUnlinkNode(meta);
283
            xmlFreeNode(meta);
284
        }
285
        /* change the document only if there is a real encoding change */
286
        else if (xmlStrcasestr(content, encoding) == NULL) {
287
            xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
288
        }
289
    }
290

291

292
    return(0);
293
}
294

295
/**
 * htmlBooleanAttrs:
 *
 * These are the HTML attributes which will be output
 * in minimized form, i.e. <option selected="selected"> will be
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 *
 * The list is terminated by a NULL entry.
 */
static const char* const htmlBooleanAttrs[] = {
  "checked", "compact", "declare", "defer", "disabled", "ismap",
  "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",
  "selected", NULL
};
308

309

310
/**
311
 * htmlIsBooleanAttr:
312
 * @name:  the name of the attribute to check
313
 *
314
 * Determine if a given attribute is a boolean attribute.
315
 *
316
 * returns: false if the attribute is not boolean, true otherwise.
317
 */
318
int
319
htmlIsBooleanAttr(const xmlChar *name)
320
{
321
    int i = 0;
322

323
    while (htmlBooleanAttrs[i] != NULL) {
324
        if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
325
            return 1;
326
        i++;
327
    }
328
    return 0;
329
}
330

331
#ifdef LIBXML_OUTPUT_ENABLED
332
/************************************************************************
333
 *									*
334
 *			Output error handlers				*
335
 *									*
336
 ************************************************************************/
337
/**
338
 * htmlSaveErrMemory:
339
 * @extra:  extra information
340
 *
341
 * Handle an out of memory condition
342
 */
343
static void
344
htmlSaveErrMemory(const char *extra)
345
{
346
    __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
347
}
348

349
/**
350
 * htmlSaveErr:
351
 * @code:  the error number
352
 * @node:  the location of the error.
353
 * @extra:  extra information
354
 *
355
 * Handle an out of memory condition
356
 */
357
static void
358
htmlSaveErr(int code, xmlNodePtr node, const char *extra)
359
{
360
    const char *msg = NULL;
361

362
    switch(code) {
363
        case XML_SAVE_NOT_UTF8:
364
	    msg = "string is not in UTF-8\n";
365
	    break;
366
	case XML_SAVE_CHAR_INVALID:
367
	    msg = "invalid character value\n";
368
	    break;
369
	case XML_SAVE_UNKNOWN_ENCODING:
370
	    msg = "unknown encoding %s\n";
371
	    break;
372
	case XML_SAVE_NO_DOCTYPE:
373
	    msg = "HTML has no DOCTYPE\n";
374
	    break;
375
	default:
376
	    msg = "unexpected error number\n";
377
    }
378
    __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
379
}
380

381
/************************************************************************
382
 *									*
383
 *		Dumping HTML tree content to a simple buffer		*
384
 *									*
385
 ************************************************************************/
386

387
/**
388
 * htmlBufNodeDumpFormat:
389
 * @buf:  the xmlBufPtr output
390
 * @doc:  the document
391
 * @cur:  the current node
392
 * @format:  should formatting spaces been added
393
 *
394
 * Dump an HTML node, recursive behaviour,children are printed too.
395
 *
396
 * Returns the number of byte written or -1 in case of error
397
 */
398
static size_t
399
htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
400
	           int format) {
401
    size_t use;
402
    int ret;
403
    xmlOutputBufferPtr outbuf;
404

405
    if (cur == NULL) {
406
	return (-1);
407
    }
408
    if (buf == NULL) {
409
	return (-1);
410
    }
411
    outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
412
    if (outbuf == NULL) {
413
        htmlSaveErrMemory("allocating HTML output buffer");
414
	return (-1);
415
    }
416
    memset(outbuf, 0, sizeof(xmlOutputBuffer));
417
    outbuf->buffer = buf;
418
    outbuf->encoder = NULL;
419
    outbuf->writecallback = NULL;
420
    outbuf->closecallback = NULL;
421
    outbuf->context = NULL;
422
    outbuf->written = 0;
423

424
    use = xmlBufUse(buf);
425
    htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
426
    xmlFree(outbuf);
427
    ret = xmlBufUse(buf) - use;
428
    return (ret);
429
}
430

431
/**
432
 * htmlNodeDump:
433
 * @buf:  the HTML buffer output
434
 * @doc:  the document
435
 * @cur:  the current node
436
 *
437
 * Dump an HTML node, recursive behaviour,children are printed too,
438
 * and formatting returns are added.
439
 *
440
 * Returns the number of byte written or -1 in case of error
441
 */
442
int
443
htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
444
    xmlBufPtr buffer;
445
    size_t ret;
446

447
    if ((buf == NULL) || (cur == NULL))
448
        return(-1);
449

450
    xmlInitParser();
451
    buffer = xmlBufFromBuffer(buf);
452
    if (buffer == NULL)
453
        return(-1);
454

455
    ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
456

457
    xmlBufBackToBuffer(buffer);
458

459
    if (ret > INT_MAX)
460
        return(-1);
461
    return((int) ret);
462
}
463

464
/**
465
 * htmlNodeDumpFileFormat:
466
 * @out:  the FILE pointer
467
 * @doc:  the document
468
 * @cur:  the current node
469
 * @encoding: the document encoding
470
 * @format:  should formatting spaces been added
471
 *
472
 * Dump an HTML node, recursive behaviour,children are printed too.
473
 *
474
 * TODO: if encoding == NULL try to save in the doc encoding
475
 *
476
 * returns: the number of byte written or -1 in case of failure.
477
 */
478
int
479
htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
480
	               xmlNodePtr cur, const char *encoding, int format) {
481
    xmlOutputBufferPtr buf;
482
    xmlCharEncodingHandlerPtr handler = NULL;
483
    int ret;
484

485
    xmlInitParser();
486

487
    if (encoding != NULL) {
488
	xmlCharEncoding enc;
489

490
	enc = xmlParseCharEncoding(encoding);
491
	if (enc != XML_CHAR_ENCODING_UTF8) {
492
	    handler = xmlFindCharEncodingHandler(encoding);
493
	    if (handler == NULL)
494
		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
495
	}
496
    } else {
497
        /*
498
         * Fallback to HTML or ASCII when the encoding is unspecified
499
         */
500
        if (handler == NULL)
501
            handler = xmlFindCharEncodingHandler("HTML");
502
        if (handler == NULL)
503
            handler = xmlFindCharEncodingHandler("ascii");
504
    }
505

506
    /*
507
     * save the content to a temp buffer.
508
     */
509
    buf = xmlOutputBufferCreateFile(out, handler);
510
    if (buf == NULL) return(0);
511

512
    htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format);
513

514
    ret = xmlOutputBufferClose(buf);
515
    return(ret);
516
}
517

518
/**
519
 * htmlNodeDumpFile:
520
 * @out:  the FILE pointer
521
 * @doc:  the document
522
 * @cur:  the current node
523
 *
524
 * Dump an HTML node, recursive behaviour,children are printed too,
525
 * and formatting returns are added.
526
 */
527
void
528
htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
529
    htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
530
}
531

532
/**
533
 * htmlDocDumpMemoryFormat:
534
 * @cur:  the document
535
 * @mem:  OUT: the memory pointer
536
 * @size:  OUT: the memory length
537
 * @format:  should formatting spaces been added
538
 *
539
 * Dump an HTML document in memory and return the xmlChar * and it's size.
540
 * It's up to the caller to free the memory.
541
 */
542
void
543
htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
544
    xmlOutputBufferPtr buf;
545
    xmlCharEncodingHandlerPtr handler = NULL;
546
    const char *encoding;
547

548
    xmlInitParser();
549

550
    if ((mem == NULL) || (size == NULL))
551
        return;
552
    if (cur == NULL) {
553
	*mem = NULL;
554
	*size = 0;
555
	return;
556
    }
557

558
    encoding = (const char *) htmlGetMetaEncoding(cur);
559

560
    if (encoding != NULL) {
561
	xmlCharEncoding enc;
562

563
	enc = xmlParseCharEncoding(encoding);
564
	if (enc != XML_CHAR_ENCODING_UTF8) {
565
	    handler = xmlFindCharEncodingHandler(encoding);
566
	    if (handler == NULL)
567
                htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
568

569
	}
570
    } else {
571
        /*
572
         * Fallback to HTML or ASCII when the encoding is unspecified
573
         */
574
        if (handler == NULL)
575
            handler = xmlFindCharEncodingHandler("HTML");
576
        if (handler == NULL)
577
            handler = xmlFindCharEncodingHandler("ascii");
578
    }
579

580
    buf = xmlAllocOutputBufferInternal(handler);
581
    if (buf == NULL) {
582
	*mem = NULL;
583
	*size = 0;
584
	return;
585
    }
586

587
    htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
588

589
    xmlOutputBufferFlush(buf);
590
    if (buf->conv != NULL) {
591
	*size = xmlBufUse(buf->conv);
592
	*mem = xmlStrndup(xmlBufContent(buf->conv), *size);
593
    } else {
594
	*size = xmlBufUse(buf->buffer);
595
	*mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
596
    }
597
    (void)xmlOutputBufferClose(buf);
598
}
599

600
/**
601
 * htmlDocDumpMemory:
602
 * @cur:  the document
603
 * @mem:  OUT: the memory pointer
604
 * @size:  OUT: the memory length
605
 *
606
 * Dump an HTML document in memory and return the xmlChar * and it's size.
607
 * It's up to the caller to free the memory.
608
 */
609
void
610
htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
611
	htmlDocDumpMemoryFormat(cur, mem, size, 1);
612
}
613

614

615
/************************************************************************
616
 *									*
617
 *		Dumping HTML tree content to an I/O output buffer	*
618
 *									*
619
 ************************************************************************/
620

621
/**
622
 * htmlDtdDumpOutput:
623
 * @buf:  the HTML buffer output
624
 * @doc:  the document
625
 * @encoding:  the encoding string
626
 *
627
 * TODO: check whether encoding is needed
628
 *
629
 * Dump the HTML document DTD, if any.
630
 */
631
static void
632
htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
633
	          const char *encoding ATTRIBUTE_UNUSED) {
634
    xmlDtdPtr cur = doc->intSubset;
635

636
    if (cur == NULL) {
637
	htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
638
	return;
639
    }
640
    xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
641
    xmlOutputBufferWriteString(buf, (const char *)cur->name);
642
    if (cur->ExternalID != NULL) {
643
	xmlOutputBufferWriteString(buf, " PUBLIC ");
644
	xmlBufWriteQuotedString(buf->buffer, cur->ExternalID);
645
	if (cur->SystemID != NULL) {
646
	    xmlOutputBufferWriteString(buf, " ");
647
	    xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
648
	}
649
    } else if (cur->SystemID != NULL &&
650
	       xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
651
	xmlOutputBufferWriteString(buf, " SYSTEM ");
652
	xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
653
    }
654
    xmlOutputBufferWriteString(buf, ">\n");
655
}
656

657
/**
658
 * htmlAttrDumpOutput:
659
 * @buf:  the HTML buffer output
660
 * @doc:  the document
661
 * @cur:  the attribute pointer
662
 *
663
 * Dump an HTML attribute
664
 */
665
static void
666
htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
667
    xmlChar *value;
668

669
    /*
670
     * The html output method should not escape a & character
671
     * occurring in an attribute value immediately followed by
672
     * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
673
     * This is implemented in xmlEncodeEntitiesReentrant
674
     */
675

676
    if (cur == NULL) {
677
	return;
678
    }
679
    xmlOutputBufferWriteString(buf, " ");
680
    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
681
        xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
682
	xmlOutputBufferWriteString(buf, ":");
683
    }
684
    xmlOutputBufferWriteString(buf, (const char *)cur->name);
685
    if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
686
	value = xmlNodeListGetString(doc, cur->children, 0);
687
	if (value) {
688
	    xmlOutputBufferWriteString(buf, "=");
689
	    if ((cur->ns == NULL) && (cur->parent != NULL) &&
690
		(cur->parent->ns == NULL) &&
691
		((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
692
	         (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
693
		 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
694
		 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
695
		  (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
696
		xmlChar *escaped;
697
		xmlChar *tmp = value;
698

699
		while (IS_BLANK_CH(*tmp)) tmp++;
700

701
		/*
702
                 * Angle brackets are technically illegal in URIs, but they're
703
                 * used in server side includes, for example. Curly brackets
704
                 * are illegal as well and often used in templates.
705
                 * Don't escape non-whitespace, printable ASCII chars for
706
                 * improved interoperability. Only escape space, control
707
                 * and non-ASCII chars.
708
		 */
709
		escaped = xmlURIEscapeStr(tmp,
710
                        BAD_CAST "\"#$%&+,/:;<=>?@[\\]^`{|}");
711
		if (escaped != NULL) {
712
		    xmlBufWriteQuotedString(buf->buffer, escaped);
713
		    xmlFree(escaped);
714
		} else {
715
		    xmlBufWriteQuotedString(buf->buffer, value);
716
		}
717
	    } else {
718
		xmlBufWriteQuotedString(buf->buffer, value);
719
	    }
720
	    xmlFree(value);
721
	} else  {
722
	    xmlOutputBufferWriteString(buf, "=\"\"");
723
	}
724
    }
725
}
726

727
/**
728
 * htmlNodeDumpFormatOutput:
729
 * @buf:  the HTML buffer output
730
 * @doc:  the document
731
 * @cur:  the current node
732
 * @encoding:  the encoding string (unused)
733
 * @format:  should formatting spaces been added
734
 *
735
 * Dump an HTML node, recursive behaviour,children are printed too.
736
 */
737
void
738
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
739
	                 xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED,
740
                         int format) {
741
    xmlNodePtr root, parent;
742
    xmlAttrPtr attr;
743
    const htmlElemDesc * info;
744

745
    xmlInitParser();
746

747
    if ((cur == NULL) || (buf == NULL)) {
748
	return;
749
    }
750

751
    root = cur;
752
    parent = cur->parent;
753
    while (1) {
754
        switch (cur->type) {
755
        case XML_HTML_DOCUMENT_NODE:
756
        case XML_DOCUMENT_NODE:
757
            if (((xmlDocPtr) cur)->intSubset != NULL) {
758
                htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
759
            }
760
            if (cur->children != NULL) {
761
                /* Always validate cur->parent when descending. */
762
                if (cur->parent == parent) {
763
                    parent = cur;
764
                    cur = cur->children;
765
                    continue;
766
                }
767
            } else {
768
                xmlOutputBufferWriteString(buf, "\n");
769
            }
770
            break;
771

772
        case XML_ELEMENT_NODE:
773
            /*
774
             * Some users like lxml are known to pass nodes with a corrupted
775
             * tree structure. Fall back to a recursive call to handle this
776
             * case.
777
             */
778
            if ((cur->parent != parent) && (cur->children != NULL)) {
779
                htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
780
                break;
781
            }
782

783
            /*
784
             * Get specific HTML info for that node.
785
             */
786
            if (cur->ns == NULL)
787
                info = htmlTagLookup(cur->name);
788
            else
789
                info = NULL;
790

791
            xmlOutputBufferWriteString(buf, "<");
792
            if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
793
                xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
794
                xmlOutputBufferWriteString(buf, ":");
795
            }
796
            xmlOutputBufferWriteString(buf, (const char *)cur->name);
797
            if (cur->nsDef)
798
                xmlNsListDumpOutput(buf, cur->nsDef);
799
            attr = cur->properties;
800
            while (attr != NULL) {
801
                htmlAttrDumpOutput(buf, doc, attr);
802
                attr = attr->next;
803
            }
804

805
            if ((info != NULL) && (info->empty)) {
806
                xmlOutputBufferWriteString(buf, ">");
807
            } else if (cur->children == NULL) {
808
                if ((info != NULL) && (info->saveEndTag != 0) &&
809
                    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
810
                    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
811
                    xmlOutputBufferWriteString(buf, ">");
812
                } else {
813
                    xmlOutputBufferWriteString(buf, "></");
814
                    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
815
                        xmlOutputBufferWriteString(buf,
816
                                (const char *)cur->ns->prefix);
817
                        xmlOutputBufferWriteString(buf, ":");
818
                    }
819
                    xmlOutputBufferWriteString(buf, (const char *)cur->name);
820
                    xmlOutputBufferWriteString(buf, ">");
821
                }
822
            } else {
823
                xmlOutputBufferWriteString(buf, ">");
824
                if ((format) && (info != NULL) && (!info->isinline) &&
825
                    (cur->children->type != HTML_TEXT_NODE) &&
826
                    (cur->children->type != HTML_ENTITY_REF_NODE) &&
827
                    (cur->children != cur->last) &&
828
                    (cur->name != NULL) &&
829
                    (cur->name[0] != 'p')) /* p, pre, param */
830
                    xmlOutputBufferWriteString(buf, "\n");
831
                parent = cur;
832
                cur = cur->children;
833
                continue;
834
            }
835

836
            if ((format) && (cur->next != NULL) &&
837
                (info != NULL) && (!info->isinline)) {
838
                if ((cur->next->type != HTML_TEXT_NODE) &&
839
                    (cur->next->type != HTML_ENTITY_REF_NODE) &&
840
                    (parent != NULL) &&
841
                    (parent->name != NULL) &&
842
                    (parent->name[0] != 'p')) /* p, pre, param */
843
                    xmlOutputBufferWriteString(buf, "\n");
844
            }
845

846
            break;
847

848
        case XML_ATTRIBUTE_NODE:
849
            htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur);
850
            break;
851

852
        case HTML_TEXT_NODE:
853
            if (cur->content == NULL)
854
                break;
855
            if (((cur->name == (const xmlChar *)xmlStringText) ||
856
                 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
857
                ((parent == NULL) ||
858
                 ((xmlStrcasecmp(parent->name, BAD_CAST "script")) &&
859
                  (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) {
860
                xmlChar *buffer;
861

862
                buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
863
                if (buffer != NULL) {
864
                    xmlOutputBufferWriteString(buf, (const char *)buffer);
865
                    xmlFree(buffer);
866
                }
867
            } else {
868
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
869
            }
870
            break;
871

872
        case HTML_COMMENT_NODE:
873
            if (cur->content != NULL) {
874
                xmlOutputBufferWriteString(buf, "<!--");
875
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
876
                xmlOutputBufferWriteString(buf, "-->");
877
            }
878
            break;
879

880
        case HTML_PI_NODE:
881
            if (cur->name != NULL) {
882
                xmlOutputBufferWriteString(buf, "<?");
883
                xmlOutputBufferWriteString(buf, (const char *)cur->name);
884
                if (cur->content != NULL) {
885
                    xmlOutputBufferWriteString(buf, " ");
886
                    xmlOutputBufferWriteString(buf,
887
                            (const char *)cur->content);
888
                }
889
                xmlOutputBufferWriteString(buf, ">");
890
            }
891
            break;
892

893
        case HTML_ENTITY_REF_NODE:
894
            xmlOutputBufferWriteString(buf, "&");
895
            xmlOutputBufferWriteString(buf, (const char *)cur->name);
896
            xmlOutputBufferWriteString(buf, ";");
897
            break;
898

899
        case HTML_PRESERVE_NODE:
900
            if (cur->content != NULL) {
901
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
902
            }
903
            break;
904

905
        default:
906
            break;
907
        }
908

909
        while (1) {
910
            if (cur == root)
911
                return;
912
            if (cur->next != NULL) {
913
                cur = cur->next;
914
                break;
915
            }
916

917
            cur = parent;
918
            /* cur->parent was validated when descending. */
919
            parent = cur->parent;
920

921
            if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
922
                (cur->type == XML_DOCUMENT_NODE)) {
923
                xmlOutputBufferWriteString(buf, "\n");
924
            } else {
925
                if ((format) && (cur->ns == NULL))
926
                    info = htmlTagLookup(cur->name);
927
                else
928
                    info = NULL;
929

930
                if ((format) && (info != NULL) && (!info->isinline) &&
931
                    (cur->last->type != HTML_TEXT_NODE) &&
932
                    (cur->last->type != HTML_ENTITY_REF_NODE) &&
933
                    (cur->children != cur->last) &&
934
                    (cur->name != NULL) &&
935
                    (cur->name[0] != 'p')) /* p, pre, param */
936
                    xmlOutputBufferWriteString(buf, "\n");
937

938
                xmlOutputBufferWriteString(buf, "</");
939
                if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
940
                    xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
941
                    xmlOutputBufferWriteString(buf, ":");
942
                }
943
                xmlOutputBufferWriteString(buf, (const char *)cur->name);
944
                xmlOutputBufferWriteString(buf, ">");
945

946
                if ((format) && (info != NULL) && (!info->isinline) &&
947
                    (cur->next != NULL)) {
948
                    if ((cur->next->type != HTML_TEXT_NODE) &&
949
                        (cur->next->type != HTML_ENTITY_REF_NODE) &&
950
                        (parent != NULL) &&
951
                        (parent->name != NULL) &&
952
                        (parent->name[0] != 'p')) /* p, pre, param */
953
                        xmlOutputBufferWriteString(buf, "\n");
954
                }
955
            }
956
        }
957
    }
958
}
959

960
/**
961
 * htmlNodeDumpOutput:
962
 * @buf:  the HTML buffer output
963
 * @doc:  the document
964
 * @cur:  the current node
965
 * @encoding:  the encoding string (unused)
966
 *
967
 * Dump an HTML node, recursive behaviour,children are printed too,
968
 * and formatting returns/spaces are added.
969
 */
970
void
971
htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
972
	           xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) {
973
    htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1);
974
}
975

976
/**
977
 * htmlDocContentDumpFormatOutput:
978
 * @buf:  the HTML buffer output
979
 * @cur:  the document
980
 * @encoding:  the encoding string (unused)
981
 * @format:  should formatting spaces been added
982
 *
983
 * Dump an HTML document.
984
 */
985
void
986
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
987
	                       const char *encoding ATTRIBUTE_UNUSED,
988
                               int format) {
989
    int type = 0;
990
    if (cur) {
991
        type = cur->type;
992
        cur->type = XML_HTML_DOCUMENT_NODE;
993
    }
994
    htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format);
995
    if (cur)
996
        cur->type = (xmlElementType) type;
997
}
998

999
/**
1000
 * htmlDocContentDumpOutput:
1001
 * @buf:  the HTML buffer output
1002
 * @cur:  the document
1003
 * @encoding:  the encoding string (unused)
1004
 *
1005
 * Dump an HTML document. Formatting return/spaces are added.
1006
 */
1007
void
1008
htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1009
	                 const char *encoding ATTRIBUTE_UNUSED) {
1010
    htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1);
1011
}
1012

1013
/************************************************************************
 *									*
 *		Saving functions front-ends				*
 *									*
 ************************************************************************/
1018

1019
/**
1020
 * htmlDocDump:
1021
 * @f:  the FILE*
1022
 * @cur:  the document
1023
 *
1024
 * Dump an HTML document to an open FILE.
1025
 *
1026
 * returns: the number of byte written or -1 in case of failure.
1027
 */
1028
int
1029
htmlDocDump(FILE *f, xmlDocPtr cur) {
1030
    xmlOutputBufferPtr buf;
1031
    xmlCharEncodingHandlerPtr handler = NULL;
1032
    const char *encoding;
1033
    int ret;
1034

1035
    xmlInitParser();
1036

1037
    if ((cur == NULL) || (f == NULL)) {
1038
	return(-1);
1039
    }
1040

1041
    encoding = (const char *) htmlGetMetaEncoding(cur);
1042

1043
    if (encoding != NULL) {
1044
	xmlCharEncoding enc;
1045

1046
	enc = xmlParseCharEncoding(encoding);
1047
	if (enc != XML_CHAR_ENCODING_UTF8) {
1048
	    handler = xmlFindCharEncodingHandler(encoding);
1049
	    if (handler == NULL)
1050
		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1051
	}
1052
    } else {
1053
        /*
1054
         * Fallback to HTML or ASCII when the encoding is unspecified
1055
         */
1056
        if (handler == NULL)
1057
            handler = xmlFindCharEncodingHandler("HTML");
1058
        if (handler == NULL)
1059
            handler = xmlFindCharEncodingHandler("ascii");
1060
    }
1061

1062
    buf = xmlOutputBufferCreateFile(f, handler);
1063
    if (buf == NULL) return(-1);
1064
    htmlDocContentDumpOutput(buf, cur, NULL);
1065

1066
    ret = xmlOutputBufferClose(buf);
1067
    return(ret);
1068
}
1069

1070
/**
1071
 * htmlSaveFile:
1072
 * @filename:  the filename (or URL)
1073
 * @cur:  the document
1074
 *
1075
 * Dump an HTML document to a file. If @filename is "-" the stdout file is
1076
 * used.
1077
 * returns: the number of byte written or -1 in case of failure.
1078
 */
1079
int
1080
htmlSaveFile(const char *filename, xmlDocPtr cur) {
1081
    xmlOutputBufferPtr buf;
1082
    xmlCharEncodingHandlerPtr handler = NULL;
1083
    const char *encoding;
1084
    int ret;
1085

1086
    if ((cur == NULL) || (filename == NULL))
1087
        return(-1);
1088

1089
    xmlInitParser();
1090

1091
    encoding = (const char *) htmlGetMetaEncoding(cur);
1092

1093
    if (encoding != NULL) {
1094
	xmlCharEncoding enc;
1095

1096
	enc = xmlParseCharEncoding(encoding);
1097
	if (enc != XML_CHAR_ENCODING_UTF8) {
1098
	    handler = xmlFindCharEncodingHandler(encoding);
1099
	    if (handler == NULL)
1100
		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1101
	}
1102
    } else {
1103
        /*
1104
         * Fallback to HTML or ASCII when the encoding is unspecified
1105
         */
1106
        if (handler == NULL)
1107
            handler = xmlFindCharEncodingHandler("HTML");
1108
        if (handler == NULL)
1109
            handler = xmlFindCharEncodingHandler("ascii");
1110
    }
1111

1112
    /*
1113
     * save the content to a temp buffer.
1114
     */
1115
    buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1116
    if (buf == NULL) return(0);
1117

1118
    htmlDocContentDumpOutput(buf, cur, NULL);
1119

1120
    ret = xmlOutputBufferClose(buf);
1121
    return(ret);
1122
}
1123

1124
/**
1125
 * htmlSaveFileFormat:
1126
 * @filename:  the filename
1127
 * @cur:  the document
1128
 * @format:  should formatting spaces been added
1129
 * @encoding: the document encoding
1130
 *
1131
 * Dump an HTML document to a file using a given encoding.
1132
 *
1133
 * returns: the number of byte written or -1 in case of failure.
1134
 */
1135
int
1136
htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1137
	           const char *encoding, int format) {
1138
    xmlOutputBufferPtr buf;
1139
    xmlCharEncodingHandlerPtr handler = NULL;
1140
    int ret;
1141

1142
    if ((cur == NULL) || (filename == NULL))
1143
        return(-1);
1144

1145
    xmlInitParser();
1146

1147
    if (encoding != NULL) {
1148
	xmlCharEncoding enc;
1149

1150
	enc = xmlParseCharEncoding(encoding);
1151
	if (enc != XML_CHAR_ENCODING_UTF8) {
1152
	    handler = xmlFindCharEncodingHandler(encoding);
1153
	    if (handler == NULL)
1154
		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1155
	}
1156
        htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1157
    } else {
1158
	htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1159

1160
        /*
1161
         * Fallback to HTML or ASCII when the encoding is unspecified
1162
         */
1163
        if (handler == NULL)
1164
            handler = xmlFindCharEncodingHandler("HTML");
1165
        if (handler == NULL)
1166
            handler = xmlFindCharEncodingHandler("ascii");
1167
    }
1168

1169
    /*
1170
     * save the content to a temp buffer.
1171
     */
1172
    buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1173
    if (buf == NULL) return(0);
1174

1175
    htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1176

1177
    ret = xmlOutputBufferClose(buf);
1178
    return(ret);
1179
}
1180

1181
/**
1182
 * htmlSaveFileEnc:
1183
 * @filename:  the filename
1184
 * @cur:  the document
1185
 * @encoding: the document encoding
1186
 *
1187
 * Dump an HTML document to a file using a given encoding
1188
 * and formatting returns/spaces are added.
1189
 *
1190
 * returns: the number of byte written or -1 in case of failure.
1191
 */
1192
int
1193
htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1194
    return(htmlSaveFileFormat(filename, cur, encoding, 1));
1195
}
1196

1197
#endif /* LIBXML_OUTPUT_ENABLED */
1198

1199
#endif /* LIBXML_HTML_ENABLED */
1200

1201
Product

Resources

Company