Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
wine-mirror
GitHub Repository: wine-mirror/wine
Path: blob/master/libs/xml2/HTMLtree.c
4393 views
1
/*
2
* HTMLtree.c : implementation of access function for an HTML tree.
3
*
4
* See Copyright for the status of this software.
5
*
6
* [email protected]
7
*/
8
9
10
#define IN_LIBXML
11
#include "libxml.h"
12
#ifdef LIBXML_HTML_ENABLED
13
14
#include <string.h> /* for memset() only ! */
15
#include <ctype.h>
16
#include <stdlib.h>
17
18
#include <libxml/xmlmemory.h>
19
#include <libxml/HTMLparser.h>
20
#include <libxml/HTMLtree.h>
21
#include <libxml/entities.h>
22
#include <libxml/xmlerror.h>
23
#include <libxml/parserInternals.h>
24
#include <libxml/uri.h>
25
26
#include "private/buf.h"
27
#include "private/error.h"
28
#include "private/io.h"
29
#include "private/save.h"
30
31
/************************************************************************
32
* *
33
* Getting/Setting encoding meta tags *
34
* *
35
************************************************************************/
36
37
/**
38
* htmlGetMetaEncoding:
39
* @doc: the document
40
*
41
* Encoding definition lookup in the Meta tags
42
*
43
* Returns the current encoding as flagged in the HTML source
44
*/
45
const xmlChar *
46
htmlGetMetaEncoding(htmlDocPtr doc) {
47
htmlNodePtr cur;
48
const xmlChar *content;
49
const xmlChar *encoding;
50
51
if (doc == NULL)
52
return(NULL);
53
cur = doc->children;
54
55
/*
56
* Search the html
57
*/
58
while (cur != NULL) {
59
if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
60
if (xmlStrEqual(cur->name, BAD_CAST"html"))
61
break;
62
if (xmlStrEqual(cur->name, BAD_CAST"head"))
63
goto found_head;
64
if (xmlStrEqual(cur->name, BAD_CAST"meta"))
65
goto found_meta;
66
}
67
cur = cur->next;
68
}
69
if (cur == NULL)
70
return(NULL);
71
cur = cur->children;
72
73
/*
74
* Search the head
75
*/
76
while (cur != NULL) {
77
if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
78
if (xmlStrEqual(cur->name, BAD_CAST"head"))
79
break;
80
if (xmlStrEqual(cur->name, BAD_CAST"meta"))
81
goto found_meta;
82
}
83
cur = cur->next;
84
}
85
if (cur == NULL)
86
return(NULL);
87
found_head:
88
cur = cur->children;
89
90
/*
91
* Search the meta elements
92
*/
93
found_meta:
94
while (cur != NULL) {
95
if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
96
if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
97
xmlAttrPtr attr = cur->properties;
98
int http;
99
const xmlChar *value;
100
101
content = NULL;
102
http = 0;
103
while (attr != NULL) {
104
if ((attr->children != NULL) &&
105
(attr->children->type == XML_TEXT_NODE) &&
106
(attr->children->next == NULL)) {
107
value = attr->children->content;
108
if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
109
&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
110
http = 1;
111
else if ((value != NULL)
112
&& (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
113
content = value;
114
if ((http != 0) && (content != NULL))
115
goto found_content;
116
}
117
attr = attr->next;
118
}
119
}
120
}
121
cur = cur->next;
122
}
123
return(NULL);
124
125
found_content:
126
encoding = xmlStrstr(content, BAD_CAST"charset=");
127
if (encoding == NULL)
128
encoding = xmlStrstr(content, BAD_CAST"Charset=");
129
if (encoding == NULL)
130
encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
131
if (encoding != NULL) {
132
encoding += 8;
133
} else {
134
encoding = xmlStrstr(content, BAD_CAST"charset =");
135
if (encoding == NULL)
136
encoding = xmlStrstr(content, BAD_CAST"Charset =");
137
if (encoding == NULL)
138
encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
139
if (encoding != NULL)
140
encoding += 9;
141
}
142
if (encoding != NULL) {
143
while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
144
}
145
return(encoding);
146
}
147
148
/**
149
* htmlSetMetaEncoding:
150
* @doc: the document
151
* @encoding: the encoding string
152
*
153
* Sets the current encoding in the Meta tags
154
* NOTE: this will not change the document content encoding, just
155
* the META flag associated.
156
*
157
* Returns 0 in case of success and -1 in case of error
158
*/
159
int
160
htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
161
htmlNodePtr cur, meta = NULL, head = NULL;
162
const xmlChar *content = NULL;
163
char newcontent[100];
164
165
newcontent[0] = 0;
166
167
if (doc == NULL)
168
return(-1);
169
170
/* html isn't a real encoding it's just libxml2 way to get entities */
171
if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
172
return(-1);
173
174
if (encoding != NULL) {
175
snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
176
(char *)encoding);
177
newcontent[sizeof(newcontent) - 1] = 0;
178
}
179
180
cur = doc->children;
181
182
/*
183
* Search the html
184
*/
185
while (cur != NULL) {
186
if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
187
if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
188
break;
189
if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
190
goto found_head;
191
if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
192
goto found_meta;
193
}
194
cur = cur->next;
195
}
196
if (cur == NULL)
197
return(-1);
198
cur = cur->children;
199
200
/*
201
* Search the head
202
*/
203
while (cur != NULL) {
204
if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
205
if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
206
break;
207
if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
208
head = cur->parent;
209
goto found_meta;
210
}
211
}
212
cur = cur->next;
213
}
214
if (cur == NULL)
215
return(-1);
216
found_head:
217
head = cur;
218
if (cur->children == NULL)
219
goto create;
220
cur = cur->children;
221
222
found_meta:
223
/*
224
* Search and update all the remaining the meta elements carrying
225
* encoding information
226
*/
227
while (cur != NULL) {
228
if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
229
if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
230
xmlAttrPtr attr = cur->properties;
231
int http;
232
const xmlChar *value;
233
234
content = NULL;
235
http = 0;
236
while (attr != NULL) {
237
if ((attr->children != NULL) &&
238
(attr->children->type == XML_TEXT_NODE) &&
239
(attr->children->next == NULL)) {
240
value = attr->children->content;
241
if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
242
&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
243
http = 1;
244
else
245
{
246
if ((value != NULL) &&
247
(!xmlStrcasecmp(attr->name, BAD_CAST"content")))
248
content = value;
249
}
250
if ((http != 0) && (content != NULL))
251
break;
252
}
253
attr = attr->next;
254
}
255
if ((http != 0) && (content != NULL)) {
256
meta = cur;
257
break;
258
}
259
260
}
261
}
262
cur = cur->next;
263
}
264
create:
265
if (meta == NULL) {
266
if ((encoding != NULL) && (head != NULL)) {
267
/*
268
* Create a new Meta element with the right attributes
269
*/
270
271
meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
272
if (head->children == NULL)
273
xmlAddChild(head, meta);
274
else
275
xmlAddPrevSibling(head->children, meta);
276
xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
277
xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
278
}
279
} else {
280
/* remove the meta tag if NULL is passed */
281
if (encoding == NULL) {
282
xmlUnlinkNode(meta);
283
xmlFreeNode(meta);
284
}
285
/* change the document only if there is a real encoding change */
286
else if (xmlStrcasestr(content, encoding) == NULL) {
287
xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
288
}
289
}
290
291
292
return(0);
293
}
294
295
/**
 * htmlBooleanAttrs:
 *
 * NULL-terminated list of the HTML attributes which are output in
 * minimized form, i.e. <option selected="selected"> is written as
 * <option selected>, as per XSLT 1.0 16.2 "HTML Output Method".
 */
static const char* const htmlBooleanAttrs[] = {
    "checked",
    "compact",
    "declare",
    "defer",
    "disabled",
    "ismap",
    "multiple",
    "nohref",
    "noresize",
    "noshade",
    "nowrap",
    "readonly",
    "selected",
    NULL
};
308
309
310
/**
311
* htmlIsBooleanAttr:
312
* @name: the name of the attribute to check
313
*
314
* Determine if a given attribute is a boolean attribute.
315
*
316
* returns: false if the attribute is not boolean, true otherwise.
317
*/
318
int
319
htmlIsBooleanAttr(const xmlChar *name)
320
{
321
int i = 0;
322
323
while (htmlBooleanAttrs[i] != NULL) {
324
if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
325
return 1;
326
i++;
327
}
328
return 0;
329
}
330
331
#ifdef LIBXML_OUTPUT_ENABLED
332
/************************************************************************
333
* *
334
* Output error handlers *
335
* *
336
************************************************************************/
337
/**
338
* htmlSaveErrMemory:
339
* @extra: extra information
340
*
341
* Handle an out of memory condition
342
*/
343
static void
344
htmlSaveErrMemory(const char *extra)
345
{
346
__xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
347
}
348
349
/**
350
* htmlSaveErr:
351
* @code: the error number
352
* @node: the location of the error.
353
* @extra: extra information
354
*
355
* Handle an out of memory condition
356
*/
357
static void
358
htmlSaveErr(int code, xmlNodePtr node, const char *extra)
359
{
360
const char *msg = NULL;
361
362
switch(code) {
363
case XML_SAVE_NOT_UTF8:
364
msg = "string is not in UTF-8\n";
365
break;
366
case XML_SAVE_CHAR_INVALID:
367
msg = "invalid character value\n";
368
break;
369
case XML_SAVE_UNKNOWN_ENCODING:
370
msg = "unknown encoding %s\n";
371
break;
372
case XML_SAVE_NO_DOCTYPE:
373
msg = "HTML has no DOCTYPE\n";
374
break;
375
default:
376
msg = "unexpected error number\n";
377
}
378
__xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
379
}
380
381
/************************************************************************
382
* *
383
* Dumping HTML tree content to a simple buffer *
384
* *
385
************************************************************************/
386
387
/**
388
* htmlBufNodeDumpFormat:
389
* @buf: the xmlBufPtr output
390
* @doc: the document
391
* @cur: the current node
392
* @format: should formatting spaces been added
393
*
394
* Dump an HTML node, recursive behaviour,children are printed too.
395
*
396
* Returns the number of byte written or -1 in case of error
397
*/
398
static size_t
399
htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
400
int format) {
401
size_t use;
402
int ret;
403
xmlOutputBufferPtr outbuf;
404
405
if (cur == NULL) {
406
return (-1);
407
}
408
if (buf == NULL) {
409
return (-1);
410
}
411
outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
412
if (outbuf == NULL) {
413
htmlSaveErrMemory("allocating HTML output buffer");
414
return (-1);
415
}
416
memset(outbuf, 0, sizeof(xmlOutputBuffer));
417
outbuf->buffer = buf;
418
outbuf->encoder = NULL;
419
outbuf->writecallback = NULL;
420
outbuf->closecallback = NULL;
421
outbuf->context = NULL;
422
outbuf->written = 0;
423
424
use = xmlBufUse(buf);
425
htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
426
xmlFree(outbuf);
427
ret = xmlBufUse(buf) - use;
428
return (ret);
429
}
430
431
/**
432
* htmlNodeDump:
433
* @buf: the HTML buffer output
434
* @doc: the document
435
* @cur: the current node
436
*
437
* Dump an HTML node, recursive behaviour,children are printed too,
438
* and formatting returns are added.
439
*
440
* Returns the number of byte written or -1 in case of error
441
*/
442
int
443
htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
444
xmlBufPtr buffer;
445
size_t ret;
446
447
if ((buf == NULL) || (cur == NULL))
448
return(-1);
449
450
xmlInitParser();
451
buffer = xmlBufFromBuffer(buf);
452
if (buffer == NULL)
453
return(-1);
454
455
ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
456
457
xmlBufBackToBuffer(buffer);
458
459
if (ret > INT_MAX)
460
return(-1);
461
return((int) ret);
462
}
463
464
/**
465
* htmlNodeDumpFileFormat:
466
* @out: the FILE pointer
467
* @doc: the document
468
* @cur: the current node
469
* @encoding: the document encoding
470
* @format: should formatting spaces been added
471
*
472
* Dump an HTML node, recursive behaviour,children are printed too.
473
*
474
* TODO: if encoding == NULL try to save in the doc encoding
475
*
476
* returns: the number of byte written or -1 in case of failure.
477
*/
478
int
479
htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
480
xmlNodePtr cur, const char *encoding, int format) {
481
xmlOutputBufferPtr buf;
482
xmlCharEncodingHandlerPtr handler = NULL;
483
int ret;
484
485
xmlInitParser();
486
487
if (encoding != NULL) {
488
xmlCharEncoding enc;
489
490
enc = xmlParseCharEncoding(encoding);
491
if (enc != XML_CHAR_ENCODING_UTF8) {
492
handler = xmlFindCharEncodingHandler(encoding);
493
if (handler == NULL)
494
htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
495
}
496
} else {
497
/*
498
* Fallback to HTML or ASCII when the encoding is unspecified
499
*/
500
if (handler == NULL)
501
handler = xmlFindCharEncodingHandler("HTML");
502
if (handler == NULL)
503
handler = xmlFindCharEncodingHandler("ascii");
504
}
505
506
/*
507
* save the content to a temp buffer.
508
*/
509
buf = xmlOutputBufferCreateFile(out, handler);
510
if (buf == NULL) return(0);
511
512
htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format);
513
514
ret = xmlOutputBufferClose(buf);
515
return(ret);
516
}
517
518
/**
519
* htmlNodeDumpFile:
520
* @out: the FILE pointer
521
* @doc: the document
522
* @cur: the current node
523
*
524
* Dump an HTML node, recursive behaviour,children are printed too,
525
* and formatting returns are added.
526
*/
527
void
528
htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
529
htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
530
}
531
532
/**
533
* htmlDocDumpMemoryFormat:
534
* @cur: the document
535
* @mem: OUT: the memory pointer
536
* @size: OUT: the memory length
537
* @format: should formatting spaces been added
538
*
539
* Dump an HTML document in memory and return the xmlChar * and it's size.
540
* It's up to the caller to free the memory.
541
*/
542
void
543
htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
544
xmlOutputBufferPtr buf;
545
xmlCharEncodingHandlerPtr handler = NULL;
546
const char *encoding;
547
548
xmlInitParser();
549
550
if ((mem == NULL) || (size == NULL))
551
return;
552
if (cur == NULL) {
553
*mem = NULL;
554
*size = 0;
555
return;
556
}
557
558
encoding = (const char *) htmlGetMetaEncoding(cur);
559
560
if (encoding != NULL) {
561
xmlCharEncoding enc;
562
563
enc = xmlParseCharEncoding(encoding);
564
if (enc != XML_CHAR_ENCODING_UTF8) {
565
handler = xmlFindCharEncodingHandler(encoding);
566
if (handler == NULL)
567
htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
568
569
}
570
} else {
571
/*
572
* Fallback to HTML or ASCII when the encoding is unspecified
573
*/
574
if (handler == NULL)
575
handler = xmlFindCharEncodingHandler("HTML");
576
if (handler == NULL)
577
handler = xmlFindCharEncodingHandler("ascii");
578
}
579
580
buf = xmlAllocOutputBufferInternal(handler);
581
if (buf == NULL) {
582
*mem = NULL;
583
*size = 0;
584
return;
585
}
586
587
htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
588
589
xmlOutputBufferFlush(buf);
590
if (buf->conv != NULL) {
591
*size = xmlBufUse(buf->conv);
592
*mem = xmlStrndup(xmlBufContent(buf->conv), *size);
593
} else {
594
*size = xmlBufUse(buf->buffer);
595
*mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
596
}
597
(void)xmlOutputBufferClose(buf);
598
}
599
600
/**
601
* htmlDocDumpMemory:
602
* @cur: the document
603
* @mem: OUT: the memory pointer
604
* @size: OUT: the memory length
605
*
606
* Dump an HTML document in memory and return the xmlChar * and it's size.
607
* It's up to the caller to free the memory.
608
*/
609
void
610
htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
611
htmlDocDumpMemoryFormat(cur, mem, size, 1);
612
}
613
614
615
/************************************************************************
616
* *
617
* Dumping HTML tree content to an I/O output buffer *
618
* *
619
************************************************************************/
620
621
/**
622
* htmlDtdDumpOutput:
623
* @buf: the HTML buffer output
624
* @doc: the document
625
* @encoding: the encoding string
626
*
627
* TODO: check whether encoding is needed
628
*
629
* Dump the HTML document DTD, if any.
630
*/
631
static void
632
htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
633
const char *encoding ATTRIBUTE_UNUSED) {
634
xmlDtdPtr cur = doc->intSubset;
635
636
if (cur == NULL) {
637
htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
638
return;
639
}
640
xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
641
xmlOutputBufferWriteString(buf, (const char *)cur->name);
642
if (cur->ExternalID != NULL) {
643
xmlOutputBufferWriteString(buf, " PUBLIC ");
644
xmlBufWriteQuotedString(buf->buffer, cur->ExternalID);
645
if (cur->SystemID != NULL) {
646
xmlOutputBufferWriteString(buf, " ");
647
xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
648
}
649
} else if (cur->SystemID != NULL &&
650
xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
651
xmlOutputBufferWriteString(buf, " SYSTEM ");
652
xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
653
}
654
xmlOutputBufferWriteString(buf, ">\n");
655
}
656
657
/**
658
* htmlAttrDumpOutput:
659
* @buf: the HTML buffer output
660
* @doc: the document
661
* @cur: the attribute pointer
662
*
663
* Dump an HTML attribute
664
*/
665
static void
666
htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
667
xmlChar *value;
668
669
/*
670
* The html output method should not escape a & character
671
* occurring in an attribute value immediately followed by
672
* a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
673
* This is implemented in xmlEncodeEntitiesReentrant
674
*/
675
676
if (cur == NULL) {
677
return;
678
}
679
xmlOutputBufferWriteString(buf, " ");
680
if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
681
xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
682
xmlOutputBufferWriteString(buf, ":");
683
}
684
xmlOutputBufferWriteString(buf, (const char *)cur->name);
685
if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
686
value = xmlNodeListGetString(doc, cur->children, 0);
687
if (value) {
688
xmlOutputBufferWriteString(buf, "=");
689
if ((cur->ns == NULL) && (cur->parent != NULL) &&
690
(cur->parent->ns == NULL) &&
691
((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
692
(!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
693
(!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
694
((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
695
(!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
696
xmlChar *escaped;
697
xmlChar *tmp = value;
698
699
while (IS_BLANK_CH(*tmp)) tmp++;
700
701
/*
702
* Angle brackets are technically illegal in URIs, but they're
703
* used in server side includes, for example. Curly brackets
704
* are illegal as well and often used in templates.
705
* Don't escape non-whitespace, printable ASCII chars for
706
* improved interoperability. Only escape space, control
707
* and non-ASCII chars.
708
*/
709
escaped = xmlURIEscapeStr(tmp,
710
BAD_CAST "\"#$%&+,/:;<=>?@[\\]^`{|}");
711
if (escaped != NULL) {
712
xmlBufWriteQuotedString(buf->buffer, escaped);
713
xmlFree(escaped);
714
} else {
715
xmlBufWriteQuotedString(buf->buffer, value);
716
}
717
} else {
718
xmlBufWriteQuotedString(buf->buffer, value);
719
}
720
xmlFree(value);
721
} else {
722
xmlOutputBufferWriteString(buf, "=\"\"");
723
}
724
}
725
}
726
727
/**
728
* htmlNodeDumpFormatOutput:
729
* @buf: the HTML buffer output
730
* @doc: the document
731
* @cur: the current node
732
* @encoding: the encoding string (unused)
733
* @format: should formatting spaces been added
734
*
735
* Dump an HTML node, recursive behaviour,children are printed too.
736
*/
737
void
738
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
739
xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED,
740
int format) {
741
xmlNodePtr root, parent;
742
xmlAttrPtr attr;
743
const htmlElemDesc * info;
744
745
xmlInitParser();
746
747
if ((cur == NULL) || (buf == NULL)) {
748
return;
749
}
750
751
root = cur;
752
parent = cur->parent;
753
while (1) {
754
switch (cur->type) {
755
case XML_HTML_DOCUMENT_NODE:
756
case XML_DOCUMENT_NODE:
757
if (((xmlDocPtr) cur)->intSubset != NULL) {
758
htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
759
}
760
if (cur->children != NULL) {
761
/* Always validate cur->parent when descending. */
762
if (cur->parent == parent) {
763
parent = cur;
764
cur = cur->children;
765
continue;
766
}
767
} else {
768
xmlOutputBufferWriteString(buf, "\n");
769
}
770
break;
771
772
case XML_ELEMENT_NODE:
773
/*
774
* Some users like lxml are known to pass nodes with a corrupted
775
* tree structure. Fall back to a recursive call to handle this
776
* case.
777
*/
778
if ((cur->parent != parent) && (cur->children != NULL)) {
779
htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
780
break;
781
}
782
783
/*
784
* Get specific HTML info for that node.
785
*/
786
if (cur->ns == NULL)
787
info = htmlTagLookup(cur->name);
788
else
789
info = NULL;
790
791
xmlOutputBufferWriteString(buf, "<");
792
if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
793
xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
794
xmlOutputBufferWriteString(buf, ":");
795
}
796
xmlOutputBufferWriteString(buf, (const char *)cur->name);
797
if (cur->nsDef)
798
xmlNsListDumpOutput(buf, cur->nsDef);
799
attr = cur->properties;
800
while (attr != NULL) {
801
htmlAttrDumpOutput(buf, doc, attr);
802
attr = attr->next;
803
}
804
805
if ((info != NULL) && (info->empty)) {
806
xmlOutputBufferWriteString(buf, ">");
807
} else if (cur->children == NULL) {
808
if ((info != NULL) && (info->saveEndTag != 0) &&
809
(xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
810
(xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
811
xmlOutputBufferWriteString(buf, ">");
812
} else {
813
xmlOutputBufferWriteString(buf, "></");
814
if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
815
xmlOutputBufferWriteString(buf,
816
(const char *)cur->ns->prefix);
817
xmlOutputBufferWriteString(buf, ":");
818
}
819
xmlOutputBufferWriteString(buf, (const char *)cur->name);
820
xmlOutputBufferWriteString(buf, ">");
821
}
822
} else {
823
xmlOutputBufferWriteString(buf, ">");
824
if ((format) && (info != NULL) && (!info->isinline) &&
825
(cur->children->type != HTML_TEXT_NODE) &&
826
(cur->children->type != HTML_ENTITY_REF_NODE) &&
827
(cur->children != cur->last) &&
828
(cur->name != NULL) &&
829
(cur->name[0] != 'p')) /* p, pre, param */
830
xmlOutputBufferWriteString(buf, "\n");
831
parent = cur;
832
cur = cur->children;
833
continue;
834
}
835
836
if ((format) && (cur->next != NULL) &&
837
(info != NULL) && (!info->isinline)) {
838
if ((cur->next->type != HTML_TEXT_NODE) &&
839
(cur->next->type != HTML_ENTITY_REF_NODE) &&
840
(parent != NULL) &&
841
(parent->name != NULL) &&
842
(parent->name[0] != 'p')) /* p, pre, param */
843
xmlOutputBufferWriteString(buf, "\n");
844
}
845
846
break;
847
848
case XML_ATTRIBUTE_NODE:
849
htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur);
850
break;
851
852
case HTML_TEXT_NODE:
853
if (cur->content == NULL)
854
break;
855
if (((cur->name == (const xmlChar *)xmlStringText) ||
856
(cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
857
((parent == NULL) ||
858
((xmlStrcasecmp(parent->name, BAD_CAST "script")) &&
859
(xmlStrcasecmp(parent->name, BAD_CAST "style"))))) {
860
xmlChar *buffer;
861
862
buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
863
if (buffer != NULL) {
864
xmlOutputBufferWriteString(buf, (const char *)buffer);
865
xmlFree(buffer);
866
}
867
} else {
868
xmlOutputBufferWriteString(buf, (const char *)cur->content);
869
}
870
break;
871
872
case HTML_COMMENT_NODE:
873
if (cur->content != NULL) {
874
xmlOutputBufferWriteString(buf, "<!--");
875
xmlOutputBufferWriteString(buf, (const char *)cur->content);
876
xmlOutputBufferWriteString(buf, "-->");
877
}
878
break;
879
880
case HTML_PI_NODE:
881
if (cur->name != NULL) {
882
xmlOutputBufferWriteString(buf, "<?");
883
xmlOutputBufferWriteString(buf, (const char *)cur->name);
884
if (cur->content != NULL) {
885
xmlOutputBufferWriteString(buf, " ");
886
xmlOutputBufferWriteString(buf,
887
(const char *)cur->content);
888
}
889
xmlOutputBufferWriteString(buf, ">");
890
}
891
break;
892
893
case HTML_ENTITY_REF_NODE:
894
xmlOutputBufferWriteString(buf, "&");
895
xmlOutputBufferWriteString(buf, (const char *)cur->name);
896
xmlOutputBufferWriteString(buf, ";");
897
break;
898
899
case HTML_PRESERVE_NODE:
900
if (cur->content != NULL) {
901
xmlOutputBufferWriteString(buf, (const char *)cur->content);
902
}
903
break;
904
905
default:
906
break;
907
}
908
909
while (1) {
910
if (cur == root)
911
return;
912
if (cur->next != NULL) {
913
cur = cur->next;
914
break;
915
}
916
917
cur = parent;
918
/* cur->parent was validated when descending. */
919
parent = cur->parent;
920
921
if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
922
(cur->type == XML_DOCUMENT_NODE)) {
923
xmlOutputBufferWriteString(buf, "\n");
924
} else {
925
if ((format) && (cur->ns == NULL))
926
info = htmlTagLookup(cur->name);
927
else
928
info = NULL;
929
930
if ((format) && (info != NULL) && (!info->isinline) &&
931
(cur->last->type != HTML_TEXT_NODE) &&
932
(cur->last->type != HTML_ENTITY_REF_NODE) &&
933
(cur->children != cur->last) &&
934
(cur->name != NULL) &&
935
(cur->name[0] != 'p')) /* p, pre, param */
936
xmlOutputBufferWriteString(buf, "\n");
937
938
xmlOutputBufferWriteString(buf, "</");
939
if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
940
xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
941
xmlOutputBufferWriteString(buf, ":");
942
}
943
xmlOutputBufferWriteString(buf, (const char *)cur->name);
944
xmlOutputBufferWriteString(buf, ">");
945
946
if ((format) && (info != NULL) && (!info->isinline) &&
947
(cur->next != NULL)) {
948
if ((cur->next->type != HTML_TEXT_NODE) &&
949
(cur->next->type != HTML_ENTITY_REF_NODE) &&
950
(parent != NULL) &&
951
(parent->name != NULL) &&
952
(parent->name[0] != 'p')) /* p, pre, param */
953
xmlOutputBufferWriteString(buf, "\n");
954
}
955
}
956
}
957
}
958
}
959
960
/**
961
* htmlNodeDumpOutput:
962
* @buf: the HTML buffer output
963
* @doc: the document
964
* @cur: the current node
965
* @encoding: the encoding string (unused)
966
*
967
* Dump an HTML node, recursive behaviour,children are printed too,
968
* and formatting returns/spaces are added.
969
*/
970
void
971
htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
972
xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) {
973
htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1);
974
}
975
976
/**
977
* htmlDocContentDumpFormatOutput:
978
* @buf: the HTML buffer output
979
* @cur: the document
980
* @encoding: the encoding string (unused)
981
* @format: should formatting spaces been added
982
*
983
* Dump an HTML document.
984
*/
985
void
986
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
987
const char *encoding ATTRIBUTE_UNUSED,
988
int format) {
989
int type = 0;
990
if (cur) {
991
type = cur->type;
992
cur->type = XML_HTML_DOCUMENT_NODE;
993
}
994
htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format);
995
if (cur)
996
cur->type = (xmlElementType) type;
997
}
998
999
/**
1000
* htmlDocContentDumpOutput:
1001
* @buf: the HTML buffer output
1002
* @cur: the document
1003
* @encoding: the encoding string (unused)
1004
*
1005
* Dump an HTML document. Formatting return/spaces are added.
1006
*/
1007
void
1008
htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1009
const char *encoding ATTRIBUTE_UNUSED) {
1010
htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1);
1011
}
1012
1013
/************************************************************************
1014
* *
1015
* Saving functions front-ends *
1016
* *
1017
************************************************************************/
1018
1019
/**
1020
* htmlDocDump:
1021
* @f: the FILE*
1022
* @cur: the document
1023
*
1024
* Dump an HTML document to an open FILE.
1025
*
1026
* returns: the number of byte written or -1 in case of failure.
1027
*/
1028
int
1029
htmlDocDump(FILE *f, xmlDocPtr cur) {
1030
xmlOutputBufferPtr buf;
1031
xmlCharEncodingHandlerPtr handler = NULL;
1032
const char *encoding;
1033
int ret;
1034
1035
xmlInitParser();
1036
1037
if ((cur == NULL) || (f == NULL)) {
1038
return(-1);
1039
}
1040
1041
encoding = (const char *) htmlGetMetaEncoding(cur);
1042
1043
if (encoding != NULL) {
1044
xmlCharEncoding enc;
1045
1046
enc = xmlParseCharEncoding(encoding);
1047
if (enc != XML_CHAR_ENCODING_UTF8) {
1048
handler = xmlFindCharEncodingHandler(encoding);
1049
if (handler == NULL)
1050
htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1051
}
1052
} else {
1053
/*
1054
* Fallback to HTML or ASCII when the encoding is unspecified
1055
*/
1056
if (handler == NULL)
1057
handler = xmlFindCharEncodingHandler("HTML");
1058
if (handler == NULL)
1059
handler = xmlFindCharEncodingHandler("ascii");
1060
}
1061
1062
buf = xmlOutputBufferCreateFile(f, handler);
1063
if (buf == NULL) return(-1);
1064
htmlDocContentDumpOutput(buf, cur, NULL);
1065
1066
ret = xmlOutputBufferClose(buf);
1067
return(ret);
1068
}
1069
1070
/**
1071
* htmlSaveFile:
1072
* @filename: the filename (or URL)
1073
* @cur: the document
1074
*
1075
* Dump an HTML document to a file. If @filename is "-" the stdout file is
1076
* used.
1077
* returns: the number of byte written or -1 in case of failure.
1078
*/
1079
int
1080
htmlSaveFile(const char *filename, xmlDocPtr cur) {
1081
xmlOutputBufferPtr buf;
1082
xmlCharEncodingHandlerPtr handler = NULL;
1083
const char *encoding;
1084
int ret;
1085
1086
if ((cur == NULL) || (filename == NULL))
1087
return(-1);
1088
1089
xmlInitParser();
1090
1091
encoding = (const char *) htmlGetMetaEncoding(cur);
1092
1093
if (encoding != NULL) {
1094
xmlCharEncoding enc;
1095
1096
enc = xmlParseCharEncoding(encoding);
1097
if (enc != XML_CHAR_ENCODING_UTF8) {
1098
handler = xmlFindCharEncodingHandler(encoding);
1099
if (handler == NULL)
1100
htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1101
}
1102
} else {
1103
/*
1104
* Fallback to HTML or ASCII when the encoding is unspecified
1105
*/
1106
if (handler == NULL)
1107
handler = xmlFindCharEncodingHandler("HTML");
1108
if (handler == NULL)
1109
handler = xmlFindCharEncodingHandler("ascii");
1110
}
1111
1112
/*
1113
* save the content to a temp buffer.
1114
*/
1115
buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1116
if (buf == NULL) return(0);
1117
1118
htmlDocContentDumpOutput(buf, cur, NULL);
1119
1120
ret = xmlOutputBufferClose(buf);
1121
return(ret);
1122
}
1123
1124
/**
1125
* htmlSaveFileFormat:
1126
* @filename: the filename
1127
* @cur: the document
1128
* @format: should formatting spaces been added
1129
* @encoding: the document encoding
1130
*
1131
* Dump an HTML document to a file using a given encoding.
1132
*
1133
* returns: the number of byte written or -1 in case of failure.
1134
*/
1135
int
1136
htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1137
const char *encoding, int format) {
1138
xmlOutputBufferPtr buf;
1139
xmlCharEncodingHandlerPtr handler = NULL;
1140
int ret;
1141
1142
if ((cur == NULL) || (filename == NULL))
1143
return(-1);
1144
1145
xmlInitParser();
1146
1147
if (encoding != NULL) {
1148
xmlCharEncoding enc;
1149
1150
enc = xmlParseCharEncoding(encoding);
1151
if (enc != XML_CHAR_ENCODING_UTF8) {
1152
handler = xmlFindCharEncodingHandler(encoding);
1153
if (handler == NULL)
1154
htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1155
}
1156
htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1157
} else {
1158
htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1159
1160
/*
1161
* Fallback to HTML or ASCII when the encoding is unspecified
1162
*/
1163
if (handler == NULL)
1164
handler = xmlFindCharEncodingHandler("HTML");
1165
if (handler == NULL)
1166
handler = xmlFindCharEncodingHandler("ascii");
1167
}
1168
1169
/*
1170
* save the content to a temp buffer.
1171
*/
1172
buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1173
if (buf == NULL) return(0);
1174
1175
htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1176
1177
ret = xmlOutputBufferClose(buf);
1178
return(ret);
1179
}
1180
1181
/**
1182
* htmlSaveFileEnc:
1183
* @filename: the filename
1184
* @cur: the document
1185
* @encoding: the document encoding
1186
*
1187
* Dump an HTML document to a file using a given encoding
1188
* and formatting returns/spaces are added.
1189
*
1190
* returns: the number of byte written or -1 in case of failure.
1191
*/
1192
int
1193
htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1194
return(htmlSaveFileFormat(filename, cur, encoding, 1));
1195
}
1196
1197
#endif /* LIBXML_OUTPUT_ENABLED */
1198
1199
#endif /* LIBXML_HTML_ENABLED */
1200
1201