Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
wiseplat
GitHub Repository: wiseplat/python-code
Path: blob/master/ invest-robot-contest_TinkoffBotTwitch-main/venv/lib/python3.8/site-packages/pandas/io/xml.py
7824 views
1
"""
2
:mod:`pandas.io.xml` is a module for reading XML.
3
"""
4
5
from __future__ import annotations
6
7
import io
8
from typing import Sequence
9
10
from pandas._typing import (
11
CompressionOptions,
12
FilePath,
13
ReadBuffer,
14
StorageOptions,
15
XMLParsers,
16
)
17
from pandas.compat._optional import import_optional_dependency
18
from pandas.errors import (
19
AbstractMethodError,
20
ParserError,
21
)
22
from pandas.util._decorators import (
23
deprecate_nonkeyword_arguments,
24
doc,
25
)
26
27
from pandas.core.dtypes.common import is_list_like
28
29
from pandas.core.frame import DataFrame
30
from pandas.core.shared_docs import _shared_docs
31
32
from pandas.io.common import (
33
file_exists,
34
get_handle,
35
is_fsspec_url,
36
is_url,
37
stringify_path,
38
)
39
from pandas.io.parsers import TextParser
40
41
42
@doc(decompression_options=_shared_docs["decompression_options"] % "path_or_buffer")
class _XMLFrameParser:
    """
    Abstract base for the parsers that turn an XML document into row dicts.

    The constructor only stores its arguments; all parsing work is left to
    the methods that subclasses override.

    Parameters
    ----------
    path_or_buffer : str, path object or file-like object
        XML source. Any valid string path is acceptable. The string could be
        a URL. Valid URL schemes include http, ftp, s3, and file.

    xpath : str
        The XPath expression selecting the set of nodes to migrate to a
        `DataFrame`. `etree` supports limited XPath.

    namespaces : dict or None
        The namespaces defined in the XML document (`xmlns:namespace='URI'`)
        as a dict with key being the namespace prefix and value the URI.

    elems_only : bool
        Parse only the child elements at the specified `xpath`.

    attrs_only : bool
        Parse only the attributes at the specified `xpath`.

    names : list or None
        Column names for the DataFrame of parsed XML data.

    encoding : str or None
        Encoding of the XML object or document.

    stylesheet : str, path object, file-like object or None
        URL, file, file-like object, or a raw string containing XSLT.
        `etree` does not support XSLT but the argument is retained for
        consistency.

    {decompression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    storage_options : dict, optional
        Extra options that make sense for a particular storage connection,
        e.g. host, port, username, password, etc.

    See Also
    --------
    pandas.io.xml._EtreeFrameParser
    pandas.io.xml._LxmlFrameParser

    Notes
    -----
    To subclass this class effectively you must override the following
    methods:

    * :func:`parse_data`
    * :func:`_parse_nodes`
    * :func:`_parse_doc`
    * :func:`_validate_names`
    * :func:`_validate_path`

    See each method's respective documentation for details on their
    functionality.
    """

    def __init__(
        self,
        path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
        xpath: str,
        namespaces: dict[str, str] | None,
        elems_only: bool,
        attrs_only: bool,
        names: Sequence[str] | None,
        encoding: str | None,
        stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
        compression: CompressionOptions,
        storage_options: StorageOptions,
    ):
        # Where the documents come from and how they are opened.
        self.path_or_buffer = path_or_buffer
        self.stylesheet = stylesheet
        self.encoding = encoding
        self.compression = compression
        self.storage_options = storage_options

        # Which nodes to select and how to label them.
        self.xpath = xpath
        self.namespaces = namespaces
        self.elems_only = elems_only
        self.attrs_only = attrs_only
        self.names = names

        # Initialised to None here; nothing in this class assigns it again.
        self.is_style = None

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        Entry point of a parser: implementations are expected to call the
        other internal methods to validate xpath and names, then parse and
        return the selected nodes.
        """

        raise AbstractMethodError(self)

    def _parse_nodes(self) -> list[dict[str, str | None]]:
        """
        Parse xml nodes.

        Implementations parse the children and attributes of the elements
        matched by xpath: only elements, only attributes, or both, while
        optionally renaming node names.

        Raises
        ------
        ValueError
            * If only elements and only attributes are both specified.

        Notes
        -----
        Namespace URIs will be removed from returned node names. Also,
        elements with missing children or attributes compared to siblings
        will have the missing keys filled with None values.
        """

        raise AbstractMethodError(self)

    def _validate_path(self) -> None:
        """
        Validate xpath.

        Implementations check for syntax, evaluation, or empty node results.

        Raises
        ------
        SyntaxError
            * If xpath is not supported or there are issues with namespaces.

        ValueError
            * If xpath does not return any nodes.
        """

        raise AbstractMethodError(self)

    def _validate_names(self) -> None:
        """
        Validate names.

        Implementations check that names is list-like and aligns with the
        number of parsed nodes.

        Raises
        ------
        ValueError
            * If names has fewer entries than there are nodes.
        """
        raise AbstractMethodError(self)

    def _parse_doc(self, raw_doc) -> bytes:
        """
        Build tree from path_or_buffer.

        Implementations parse the XML object into a tree, either from a
        string/bytes or from a file location, and return it serialised as
        bytes.
        """
        raise AbstractMethodError(self)
200
201
202
class _EtreeFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into DataFrames with the Python
    standard library XML module: `xml.etree.ElementTree`.
    """

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse the document and return one dict per node matched by xpath.

        Raises
        ------
        ValueError
            * If a stylesheet was supplied; XSLT requires the lxml parser.
        """
        from xml.etree.ElementTree import XML

        if self.stylesheet is not None:
            raise ValueError(
                "To use stylesheet, you need lxml installed and selected as parser."
            )

        self.xml_doc = XML(self._parse_doc(self.path_or_buffer))

        self._validate_path()
        self._validate_names()

        return self._parse_nodes()

    @staticmethod
    def _own_text(el) -> dict[str, str]:
        """Map the element's tag to its stripped text, or {} if it has none."""
        if el.text and not el.text.isspace():
            return {el.tag: el.text.strip()}
        return {}

    def _child_text(self, el) -> dict[str, str | None]:
        """Map each direct child to its stripped text, keyed by name or tag."""
        children = el.findall("*")
        # With ``names`` the children are labelled positionally (zip stops at
        # the shorter sequence); otherwise each child is keyed by its own tag.
        labels = self.names if self.names else [ch.tag for ch in children]
        return {
            label: (ch.text.strip() if ch.text else None)
            for label, ch in zip(labels, children)
        }

    def _parse_nodes(self) -> list[dict[str, str | None]]:
        """
        Build one dict per element matched by xpath.

        Raises
        ------
        ValueError
            * If both ``elems_only`` and ``attrs_only`` are set.

        Notes
        -----
        Namespace URIs are removed from the keys, every dict is padded with
        None so that all rows share the same keys, and the keys are finally
        replaced positionally by ``names`` when given.
        """
        elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
        dicts: list[dict[str, str | None]]

        if self.elems_only and self.attrs_only:
            raise ValueError("Either element or attributes can be parsed not both.")
        elif self.elems_only:
            if self.names:
                dicts = [{**self._own_text(el), **self._child_text(el)} for el in elems]
            else:
                # Without names only the children are reported here; the
                # matched element's own text is not included.
                dicts = [self._child_text(el) for el in elems]
        elif self.attrs_only:
            dicts = [
                {k: (v.strip() if v else None) for k, v in el.attrib.items()}
                for el in elems
            ]
        else:
            dicts = [
                {**el.attrib, **self._own_text(el), **self._child_text(el)}
                for el in elems
            ]

        # Drop the "{uri}" prefix that ElementTree puts on namespaced names.
        dicts = [
            {(k.split("}")[1] if "}" in k else k): v for k, v in d.items()}
            for d in dicts
        ]

        # Union of keys in first-seen order; rows missing a key get None.
        keys = list(dict.fromkeys(k for d in dicts for k in d))
        dicts = [{k: d.get(k) for k in keys} for d in dicts]

        if self.names:
            dicts = [dict(zip(self.names, d.values())) for d in dicts]

        return dicts

    def _validate_path(self) -> None:
        """
        Check that xpath is usable and matches at least one node.

        Raises
        ------
        SyntaxError
            * If the expression is unsupported by etree or uses an
              undeclared namespace prefix.

        ValueError
            * If xpath does not return any nodes.

        Notes
        -----
        `etree` supports limited XPath. If user attempts a more complex
        expression syntax error will raise.
        """

        msg = (
            "xpath does not return any nodes. "
            "If document uses namespaces denoted with "
            "xmlns, be sure to define namespaces and "
            "use them in xpath."
        )
        try:
            elems = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
            if elems is None:
                raise ValueError(msg)

            # NOTE(review): ``attrib`` is always a dict, never None, so this
            # second check cannot fire. It is kept unchanged because
            # tightening it would start rejecting text-only rows that
            # currently parse.
            if elems is not None and elems.find("*") is None and elems.attrib is None:
                raise ValueError(msg)

        except (KeyError, SyntaxError) as err:
            raise SyntaxError(
                "You have used an incorrect or unsupported XPath "
                "expression for etree library or you used an "
                "undeclared namespace prefix."
            ) from err

    def _validate_names(self) -> None:
        """
        Check that names is list-like and covers the first node's children.

        Raises
        ------
        ValueError
            * If names has fewer entries than the first matched node has
              child elements.

        TypeError
            * If names is not list-like.
        """
        if self.names:
            parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
            # Compare against None explicitly: truth-testing an Element is
            # deprecated and is False for an element without children.
            children = parent.findall("*") if parent is not None else []

            if is_list_like(self.names):
                if len(self.names) < len(children):
                    raise ValueError(
                        "names does not match length of child elements in xpath."
                    )
            else:
                raise TypeError(
                    f"{type(self.names).__name__} is not a valid type for names"
                )

    def _parse_doc(self, raw_doc) -> bytes:
        """
        Read ``raw_doc`` and return the serialised root element as bytes.

        The source is resolved with ``get_data_from_filepath`` and wrapped by
        ``preprocess_data`` before being handed to the etree parser.
        """
        from xml.etree.ElementTree import (
            XMLParser,
            parse,
            tostring,
        )

        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with preprocess_data(handle_data) as xml_data:
            curr_parser = XMLParser(encoding=self.encoding)
            r = parse(xml_data, parser=curr_parser)

        return tostring(r.getroot())
370
371
372
class _LxmlFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into DataFrames with third-party
    full-featured XML library, `lxml`, that supports
    XPath 1.0 and XSLT 1.0.
    """

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        This method will call the other internal methods to
        validate xpath, names, optionally parse and run XSLT,
        and parse original or transformed XML and return specific nodes.
        """
        from lxml.etree import XML

        self.xml_doc = XML(self._parse_doc(self.path_or_buffer))

        if self.stylesheet is not None:
            # From here on xpath is evaluated against the transformed tree.
            self.xsl_doc = XML(self._parse_doc(self.stylesheet))
            self.xml_doc = XML(self._transform_doc())

        self._validate_path()
        self._validate_names()

        return self._parse_nodes()

    @staticmethod
    def _own_text(el) -> dict[str, str]:
        """Map the element's tag to its stripped text, or {} if it has none."""
        if el.text and not el.text.isspace():
            return {el.tag: el.text.strip()}
        return {}

    def _child_text(self, el) -> dict[str, str | None]:
        """Map each direct child to its stripped text, keyed by name or tag."""
        children = el.xpath("*")
        # With ``names`` the children are labelled positionally (zip stops at
        # the shorter sequence); otherwise each child is keyed by its own tag.
        labels = self.names if self.names else [ch.tag for ch in children]
        return {
            label: (ch.text.strip() if ch.text else None)
            for label, ch in zip(labels, children)
        }

    def _parse_nodes(self) -> list[dict[str, str | None]]:
        """
        Build one dict per element matched by xpath.

        Raises
        ------
        ValueError
            * If both ``elems_only`` and ``attrs_only`` are set.

        Notes
        -----
        Namespace URIs are removed from the keys, every dict is padded with
        None so that all rows share the same keys, and the keys are finally
        replaced positionally by ``names`` when given.
        """
        elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
        dicts: list[dict[str, str | None]]

        if self.elems_only and self.attrs_only:
            raise ValueError("Either element or attributes can be parsed not both.")

        elif self.elems_only:
            if self.names:
                dicts = [{**self._own_text(el), **self._child_text(el)} for el in elems]
            else:
                # Without names only the children are reported here; the
                # matched element's own text is not included.
                dicts = [self._child_text(el) for el in elems]

        elif self.attrs_only:
            dicts = [el.attrib for el in elems]

        else:
            dicts = [
                {**el.attrib, **self._own_text(el), **self._child_text(el)}
                for el in elems
            ]

        # Drop the "{uri}" prefix from namespaced names. This is done for
        # every row unconditionally: deciding from the first key of the first
        # row raised IndexError whenever that row was empty.
        dicts = [
            {(k.split("}")[1] if "}" in k else k): v for k, v in d.items()}
            for d in dicts
        ]

        # Union of keys in first-seen order; rows missing a key get None.
        keys = list(dict.fromkeys(k for d in dicts for k in d))
        dicts = [{k: d.get(k) for k in keys} for d in dicts]

        if self.names:
            dicts = [dict(zip(self.names, d.values())) for d in dicts]

        return dicts

    def _validate_path(self) -> None:
        """
        Check that xpath matches nodes that carry children or attributes.

        Raises
        ------
        ValueError
            * If xpath returns no nodes, or the returned nodes have neither
              child elements nor attributes.
        """

        msg = (
            "xpath does not return any nodes. "
            "Be sure row level nodes are in xpath. "
            "If document uses namespaces denoted with "
            "xmlns, be sure to define namespaces and "
            "use them in xpath."
        )

        elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
        children = self.xml_doc.xpath(self.xpath + "/*", namespaces=self.namespaces)
        attrs = self.xml_doc.xpath(self.xpath + "/@*", namespaces=self.namespaces)

        if elems == []:
            raise ValueError(msg)

        if elems != [] and attrs == [] and children == []:
            raise ValueError(msg)

    def _validate_names(self) -> None:
        """
        Validate names.

        This method will check if names is list-like and aligns with the
        number of child elements of the first node matched by xpath.

        Raises
        ------
        ValueError
            * If names has fewer entries than the first matched node has
              child elements.

        TypeError
            * If names is not list-like.
        """
        if self.names:
            children = self.xml_doc.xpath(
                self.xpath + "[1]/*", namespaces=self.namespaces
            )

            if is_list_like(self.names):
                if len(self.names) < len(children):
                    raise ValueError(
                        "names does not match length of child elements in xpath."
                    )
            else:
                raise TypeError(
                    f"{type(self.names).__name__} is not a valid type for names"
                )

    def _parse_doc(self, raw_doc) -> bytes:
        """
        Read ``raw_doc`` and return the parsed document serialised as bytes.

        The source is resolved with ``get_data_from_filepath`` and wrapped by
        ``preprocess_data`` before being handed to the lxml parser.

        Raises
        ------
        TypeError
            * If the data is a StringIO and encoding is None.
        """
        from lxml.etree import (
            XMLParser,
            fromstring,
            parse,
            tostring,
        )

        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with preprocess_data(handle_data) as xml_data:
            curr_parser = XMLParser(encoding=self.encoding)

            if isinstance(xml_data, io.StringIO):
                if self.encoding is None:
                    raise TypeError(
                        "Can not pass encoding None when input is StringIO."
                    )

                # Text input is encoded to bytes before being parsed.
                doc = fromstring(
                    xml_data.getvalue().encode(self.encoding), parser=curr_parser
                )
            else:
                doc = parse(xml_data, parser=curr_parser)

        return tostring(doc)

    def _transform_doc(self) -> bytes:
        """
        Transform original tree using stylesheet.

        This method will transform original xml using XSLT script into
        an ideally flatter xml document for easier parsing and migration
        to Data Frame.
        """
        from lxml.etree import XSLT

        transformer = XSLT(self.xsl_doc)
        new_doc = transformer(self.xml_doc)

        return bytes(new_doc)
576
577
578
def get_data_from_filepath(
    filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str],
    encoding: str | None,
    compression: CompressionOptions,
    storage_options: StorageOptions,
) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]:
    """
    Extract raw XML data.

    Three kinds of input are handled:

    1. a location (URL, fsspec URL or existing file path), whose content is
       read and returned
    2. a file-like object (e.g. open file object, StringIO), returned as is
    3. an XML string or bytes, returned as is

    A string is treated as XML content when it starts with "<"; any other
    string that is not a recognised location is also returned unchanged.
    """
    source = filepath_or_buffer
    if not isinstance(source, bytes):
        source = stringify_path(source)

    # Only plain strings that do not look like markup can name a location.
    if not isinstance(source, str) or source.startswith(("<?xml", "<")):
        return source

    if not (is_url(source) or is_fsspec_url(source) or file_exists(source)):
        return source

    with get_handle(
        source,
        "r",
        encoding=encoding,
        compression=compression,
        storage_options=storage_options,
    ) as handle_obj:
        opened = handle_obj.handle
        # Read everything while the handle is still open.
        if hasattr(opened, "read"):
            return opened.read()
        return opened
624
625
626
def preprocess_data(data) -> io.StringIO | io.BytesIO:
    """
    Convert extracted raw data.

    A ``str`` is wrapped in a ``StringIO`` and ``bytes`` in a ``BytesIO``;
    anything else (e.g. a file object or an existing StringIO/BytesIO) is
    handed back untouched.
    """
    if isinstance(data, str):
        return io.StringIO(data)

    if isinstance(data, bytes):
        return io.BytesIO(data)

    return data
642
643
644
def _data_to_frame(data, **kwargs) -> DataFrame:
    """
    Convert parsed data to Data Frame.

    This method will bind xml dictionary data of keys and values
    into named columns of Data Frame using the built-in TextParser
    class that builds the Data Frame and infers specific dtypes.

    Parameters
    ----------
    data : iterable of dict
        One dict per row. The keys of the first dict are used as the column
        names, the values of every dict as that row's cells.
    **kwargs
        Forwarded unchanged to ``TextParser``.

    Raises
    ------
    ParserError
        * If ``TextParser`` cannot parse the rows. The original error is
          attached as the cause.
    """

    tags = next(iter(data))
    nodes = [list(d.values()) for d in data]

    try:
        with TextParser(nodes, names=tags, **kwargs) as tp:
            return tp.read()
    except ParserError as err:
        # Chain the low-level error so its detail stays in the traceback.
        raise ParserError(
            "XML document may be too complex for import. "
            "Try to flatten document and use distinct "
            "element and attribute names."
        ) from err
665
666
667
def _parse(
    path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
    xpath: str,
    namespaces: dict[str, str] | None,
    elems_only: bool,
    attrs_only: bool,
    names: Sequence[str] | None,
    encoding: str | None,
    parser: XMLParsers,
    stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
    compression: CompressionOptions,
    storage_options: StorageOptions,
    **kwargs,
) -> DataFrame:
    """
    Call internal parsers.

    Selects ``_LxmlFrameParser`` or ``_EtreeFrameParser`` according to
    ``parser``, runs it, and converts the resulting row dicts to a
    DataFrame. Extra keyword arguments are passed on to ``_data_to_frame``.

    Raises
    ------
    ImportError
        * If lxml is selected as parser but is not installed.

    ValueError
        * If parser is not lxml or etree.
    """

    # Decide on the implementation first, then build it once.
    if parser == "lxml":
        if import_optional_dependency("lxml.etree", errors="ignore") is None:
            raise ImportError("lxml not found, please install or use the etree parser.")
        parser_cls = _LxmlFrameParser
    elif parser == "etree":
        parser_cls = _EtreeFrameParser
    else:
        raise ValueError("Values for parser can only be lxml or etree.")

    frame_parser = parser_cls(
        path_or_buffer,
        xpath,
        namespaces,
        elems_only,
        attrs_only,
        names,
        encoding,
        stylesheet,
        compression,
        storage_options,
    )

    return _data_to_frame(data=frame_parser.parse_data(), **kwargs)
736
737
738
@deprecate_nonkeyword_arguments(
    version=None, allowed_args=["path_or_buffer"], stacklevel=2
)
@doc(
    storage_options=_shared_docs["storage_options"],
    decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
)
def read_xml(
    path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
    xpath: str = "./*",
    namespaces: dict[str, str] | None = None,
    elems_only: bool = False,
    attrs_only: bool = False,
    names: Sequence[str] | None = None,
    # encoding can not be None for lxml and StringIO input
    encoding: str | None = "utf-8",
    parser: XMLParsers = "lxml",
    stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None,
    compression: CompressionOptions = "infer",
    storage_options: StorageOptions = None,
) -> DataFrame:
    r"""
    Read XML document into a ``DataFrame`` object.

    .. versionadded:: 1.3.0

    Parameters
    ----------
    path_or_buffer : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a ``read()`` function. The string can be any valid XML
        string or a path. The string can further be a URL. Valid URL schemes
        include http, ftp, s3, and file.

    xpath : str, optional, default './\*'
        The XPath to parse required set of nodes for migration to DataFrame.
        XPath should return a collection of elements and not a single
        element. Note: The ``etree`` parser supports limited XPath
        expressions. For more complex XPath, use ``lxml`` which requires
        installation.

    namespaces : dict, optional
        The namespaces defined in XML document as dicts with key being
        namespace prefix and value the URI. There is no need to include all
        namespaces in XML, only the ones used in ``xpath`` expression.
        Note: if XML document uses default namespace denoted as
        `xmlns='<URI>'` without a prefix, you must assign any temporary
        namespace prefix such as 'doc' to the URI in order to parse
        underlying nodes and/or attributes. For example, ::

            namespaces = {{"doc": "https://example.com"}}

    elems_only : bool, optional, default False
        Parse only the child elements at the specified ``xpath``. By default,
        all child elements and non-empty text nodes are returned.

    attrs_only : bool, optional, default False
        Parse only the attributes at the specified ``xpath``.
        By default, all attributes are returned.

    names : list-like, optional
        Column names for DataFrame of parsed XML data. Use this parameter to
        rename original element names and distinguish same named elements.

    encoding : str, optional, default 'utf-8'
        Encoding of XML document.

    parser : {{'lxml','etree'}}, default 'lxml'
        Parser module to use for retrieval of data. Only 'lxml' and
        'etree' are supported. With 'lxml' more complex XPath searches
        and ability to use XSLT stylesheet are supported.

    stylesheet : str, path object or file-like object
        A URL, file-like object, or a raw string containing an XSLT script.
        This stylesheet should flatten complex, deeply nested XML documents
        for easier parsing. To use this feature you must have ``lxml`` module
        installed and specify 'lxml' as ``parser``. The ``xpath`` must
        reference nodes of transformed XML document generated after XSLT
        transformation and not the original XML document. Only XSLT 1.0
        scripts and not later versions are currently supported.

    {decompression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    {storage_options}

    Returns
    -------
    df
        A DataFrame.

    See Also
    --------
    read_json : Convert a JSON string to pandas object.
    read_html : Read HTML tables into a list of DataFrame objects.

    Notes
    -----
    This method is best designed to import shallow XML documents in
    following format which is the ideal fit for the two-dimensions of a
    ``DataFrame`` (row by column). ::

            <root>
                <row>
                  <column1>data</column1>
                  <column2>data</column2>
                  <column3>data</column3>
                  ...
               </row>
               <row>
                  ...
               </row>
               ...
            </root>

    As a file format, XML documents can be designed any way including
    layout of elements and attributes as long as it conforms to W3C
    specifications. Therefore, this method is a convenience handler for
    a specific flatter design and not all possible XML structures.

    However, for more complex XML documents, ``stylesheet`` allows you to
    temporarily redesign original document with XSLT (a special purpose
    language) for a flatter version for migration to a DataFrame.

    This function will *always* return a single :class:`DataFrame` or raise
    exceptions due to issues with XML document, ``xpath``, or other
    parameters.

    Examples
    --------
    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <data xmlns="http://example.com">
    ...  <row>
    ...    <shape>square</shape>
    ...    <degrees>360</degrees>
    ...    <sides>4.0</sides>
    ...  </row>
    ...  <row>
    ...    <shape>circle</shape>
    ...    <degrees>360</degrees>
    ...    <sides/>
    ...  </row>
    ...  <row>
    ...    <shape>triangle</shape>
    ...    <degrees>180</degrees>
    ...    <sides>3.0</sides>
    ...  </row>
    ... </data>'''

    >>> df = pd.read_xml(xml)
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0

    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <data>
    ...   <row shape="square" degrees="360" sides="4.0"/>
    ...   <row shape="circle" degrees="360"/>
    ...   <row shape="triangle" degrees="180" sides="3.0"/>
    ... </data>'''

    >>> df = pd.read_xml(xml, xpath=".//row")
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0

    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <doc:data xmlns:doc="https://example.com">
    ...   <doc:row>
    ...     <doc:shape>square</doc:shape>
    ...     <doc:degrees>360</doc:degrees>
    ...     <doc:sides>4.0</doc:sides>
    ...   </doc:row>
    ...   <doc:row>
    ...     <doc:shape>circle</doc:shape>
    ...     <doc:degrees>360</doc:degrees>
    ...     <doc:sides/>
    ...   </doc:row>
    ...   <doc:row>
    ...     <doc:shape>triangle</doc:shape>
    ...     <doc:degrees>180</doc:degrees>
    ...     <doc:sides>3.0</doc:sides>
    ...   </doc:row>
    ... </doc:data>'''

    >>> df = pd.read_xml(xml,
    ...                  xpath="//doc:row",
    ...                  namespaces={{"doc": "https://example.com"}})
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0
    """

    # Pure delegation: every argument is forwarded by keyword to _parse.
    return _parse(
        path_or_buffer=path_or_buffer,
        xpath=xpath,
        namespaces=namespaces,
        elems_only=elems_only,
        attrs_only=attrs_only,
        names=names,
        encoding=encoding,
        parser=parser,
        stylesheet=stylesheet,
        compression=compression,
        storage_options=storage_options,
    )
951
952