CoCalc -- xml.py

GitHub Repository: wiseplat/python-code
Path: blob/master/ invest-robot-contest_TinkoffBotTwitch-main/venv/lib/python3.8/site-packages/pandas/io/xml.py
7824 views
1
"""
2
:mod:`pandas.io.xml` is a module for reading XML.
3
"""
4

5
from __future__ import annotations
6

7
import io
8
from typing import Sequence
9

10
from pandas._typing import (
11
    CompressionOptions,
12
    FilePath,
13
    ReadBuffer,
14
    StorageOptions,
15
    XMLParsers,
16
)
17
from pandas.compat._optional import import_optional_dependency
18
from pandas.errors import (
19
    AbstractMethodError,
20
    ParserError,
21
)
22
from pandas.util._decorators import (
23
    deprecate_nonkeyword_arguments,
24
    doc,
25
)
26

27
from pandas.core.dtypes.common import is_list_like
28

29
from pandas.core.frame import DataFrame
30
from pandas.core.shared_docs import _shared_docs
31

32
from pandas.io.common import (
33
    file_exists,
34
    get_handle,
35
    is_fsspec_url,
36
    is_url,
37
    stringify_path,
38
)
39
from pandas.io.parsers import TextParser
40

41

42
@doc(decompression_options=_shared_docs["decompression_options"] % "path_or_buffer")
class _XMLFrameParser:
    """
    Internal base class to parse XML into DataFrames.

    Parameters
    ----------
    path_or_buffer : a valid XML str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file.

    xpath : str
        The XPath expression to parse required set of nodes for
        migration to `Data Frame`. `etree` supports limited XPath.

    namespaces : dict
        The namespaces defined in XML document (`xmlns:namespace='URI'`)
        as dicts with key being namespace and value the URI.

    elems_only : bool
        Parse only the child elements at the specified `xpath`.

    attrs_only : bool
        Parse only the attributes at the specified `xpath`.

    names : list
        Column names for Data Frame of parsed XML data.

    encoding : str
        Encoding of xml object or document.

    stylesheet : str or file-like
        URL, file, file-like object, or a raw string containing XSLT,
        `etree` does not support XSLT but retained for consistency.

    {decompression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    storage_options : dict, optional
        Extra options that make sense for a particular storage connection,
        e.g. host, port, username, password, etc.,

    See Also
    --------
    pandas.io.xml._EtreeFrameParser
    pandas.io.xml._LxmlFrameParser

    Notes
    -----
    To subclass this class effectively you must override the following methods:
        * :func:`parse_data`
        * :func:`_parse_nodes`
        * :func:`_parse_doc`
        * :func:`_validate_names`
        * :func:`_validate_path`


    See each method's respective documentation for details on their
    functionality.
    """

    def __init__(
        self,
        path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
        xpath: str,
        namespaces: dict[str, str] | None,
        elems_only: bool,
        attrs_only: bool,
        names: Sequence[str] | None,
        encoding: str | None,
        stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
        compression: CompressionOptions,
        storage_options: StorageOptions,
    ) -> None:
        # The constructor only records its arguments; all I/O and validation
        # is deferred to ``parse_data`` of the concrete subclass.
        self.path_or_buffer = path_or_buffer
        self.xpath = xpath
        self.namespaces = namespaces
        self.elems_only = elems_only
        self.attrs_only = attrs_only
        self.names = names
        self.encoding = encoding
        self.stylesheet = stylesheet
        self.is_style = None
        self.compression = compression
        self.storage_options = storage_options

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        This method will call the other internal methods to
        validate xpath, names, parse and return specific nodes.
        """

        raise AbstractMethodError(self)

    def _parse_nodes(self) -> list[dict[str, str | None]]:
        """
        Parse xml nodes.

        This method will parse the children and attributes of elements
        in xpath, conditionally for only elements, only attributes
        or both while optionally renaming node names.

        Raises
        ------
        ValueError
            * If only elements and only attributes are specified.

        Notes
        -----
        Namespace URIs will be removed from return node values. Also,
        elements with missing children or attributes compared to siblings
        will have optional keys filled with None values.
        """

        raise AbstractMethodError(self)

    def _validate_path(self) -> None:
        """
        Validate xpath.

        This method checks for syntax, evaluation, or empty nodes return.

        Raises
        ------
        SyntaxError
            * If xpath is not supported or issues with namespaces.

        ValueError
            * If xpath does not return any nodes.
        """

        raise AbstractMethodError(self)

    def _validate_names(self) -> None:
        """
        Validate names.

        This method will check if names is a list-like and aligns
        with length of parse nodes.

        Raises
        ------
        ValueError
            * If names is shorter than the number of child nodes.

        TypeError
            * If names is not list-like.
        """
        raise AbstractMethodError(self)

    def _parse_doc(self, raw_doc) -> bytes:
        """
        Build tree from path_or_buffer.

        This method will parse XML object into tree
        either from string/bytes or file location and
        return the serialized document as bytes.
        """
        raise AbstractMethodError(self)
200

201

202
class _EtreeFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into DataFrames with the Python
    standard library XML module: `xml.etree.ElementTree`.
    """

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        Builds the element tree, validates ``xpath`` and ``names`` and
        returns one dict per matched node.

        Raises
        ------
        ValueError
            * If a stylesheet was supplied, since `etree` has no XSLT support.
        """
        from xml.etree.ElementTree import XML

        if self.stylesheet is not None:
            raise ValueError(
                "To use stylesheet, you need lxml installed and selected as parser."
            )

        self.xml_doc = XML(self._parse_doc(self.path_or_buffer))

        self._validate_path()
        self._validate_names()

        return self._parse_nodes()

    def _parse_nodes(self) -> list[dict[str, str | None]]:
        """
        Convert every node matched by ``xpath`` into a dict of column values.

        Raises
        ------
        ValueError
            * If both ``elems_only`` and ``attrs_only`` are set.
        """
        elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)

        if self.elems_only and self.attrs_only:
            raise ValueError("Either element or attributes can be parsed not both.")

        dicts: list[dict[str, str | None]] = []
        for el in elems:
            row: dict[str, str | None] = {}

            if self.attrs_only:
                row.update(
                    {k: v.strip() if v else None for k, v in el.attrib.items()}
                )
            else:
                if not self.elems_only:
                    row.update(el.attrib)

                # The node's own non-blank text becomes a column, except for
                # ``elems_only`` without ``names`` where only children are kept.
                keep_own_text = not self.elems_only or self.names
                if keep_own_text and el.text and not el.text.isspace():
                    row[el.tag] = el.text.strip()

                children = el.findall("*")
                if self.names:
                    labels = [nm for nm, _ in zip(self.names, children)]
                else:
                    labels = [ch.tag for ch in children]
                for label, ch in zip(labels, children):
                    row[label] = ch.text.strip() if ch.text else None

            dicts.append(row)

        # Drop the "{uri}" namespace prefix from tag and attribute names.
        dicts = [
            {k.split("}")[1] if "}" in k else k: v for k, v in d.items()} for d in dicts
        ]

        # Give every row the same keys, in first-seen order, padding with None.
        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]

        if self.names:
            dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts]

        return dicts

    def _validate_path(self) -> None:
        """
        Validate xpath.

        Raises
        ------
        SyntaxError
            * If xpath is not supported by `etree` or uses an undeclared
              namespace prefix.

        ValueError
            * If xpath matches no nodes, or none of the matched nodes has
              a child element or an attribute.

        Notes
        -----
        `etree` supports limited XPath. If user attempts a more complex
        expression syntax error will raise.
        """

        msg = (
            "xpath does not return any nodes. "
            "If document uses namespaces denoted with "
            "xmlns, be sure to define namespaces and "
            "use them in xpath."
        )
        try:
            elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
        except (KeyError, SyntaxError) as err:
            raise SyntaxError(
                "You have used an incorrect or unsupported XPath "
                "expression for etree library or you used an "
                "undeclared namespace prefix."
            ) from err

        if not elems:
            raise ValueError(msg)

        # ``attrib`` is always a dict (never None), so test for emptiness.
        # Checking all matches mirrors the lxml parser's validation.
        if not any(el.attrib or el.find("*") is not None for el in elems):
            raise ValueError(msg)

    def _validate_names(self) -> None:
        """
        Validate names.

        Raises
        ------
        TypeError
            * If names is not list-like.

        ValueError
            * If names is shorter than the number of child elements of
              the first node matched by xpath.
        """
        if not self.names:
            return

        if not is_list_like(self.names):
            raise TypeError(
                f"{type(self.names).__name__} is not a valid type for names"
            )

        parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
        # Compare against None: the truth value of an Element depends on its
        # child count and testing it is deprecated.
        children = parent.findall("*") if parent is not None else []

        if len(self.names) < len(children):
            raise ValueError(
                "names does not match length of child elements in xpath."
            )

    def _parse_doc(self, raw_doc) -> bytes:
        """
        Read ``raw_doc`` and return the root element serialized as bytes.
        """
        from xml.etree.ElementTree import (
            XMLParser,
            parse,
            tostring,
        )

        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with preprocess_data(handle_data) as xml_data:
            curr_parser = XMLParser(encoding=self.encoding)
            r = parse(xml_data, parser=curr_parser)

        return tostring(r.getroot())
370

371

372
class _LxmlFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into DataFrames with third-party
    full-featured XML library, `lxml`, that supports
    XPath 1.0 and XSLT 1.0.
    """

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        This method will call the other internal methods to
        validate xpath, names, optionally parse and run XSLT,
        and parse original or transformed XML and return specific nodes.
        """
        from lxml.etree import XML

        self.xml_doc = XML(self._parse_doc(self.path_or_buffer))

        if self.stylesheet is not None:
            self.xsl_doc = XML(self._parse_doc(self.stylesheet))
            self.xml_doc = XML(self._transform_doc())

        self._validate_path()
        self._validate_names()

        return self._parse_nodes()

    def _parse_nodes(self) -> list[dict[str, str | None]]:
        """
        Convert every node matched by ``xpath`` into a dict of column values.

        Raises
        ------
        ValueError
            * If both ``elems_only`` and ``attrs_only`` are set.
        """
        elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)

        if self.elems_only and self.attrs_only:
            raise ValueError("Either element or attributes can be parsed not both.")

        dicts: list[dict[str, str | None]] = []
        for el in elems:
            row: dict[str, str | None] = {}

            if self.attrs_only:
                # Build plain dicts with stripped values, matching the etree
                # parser, instead of returning lxml attribute proxies.
                row.update(
                    {k: v.strip() if v else None for k, v in el.attrib.items()}
                )
            else:
                if not self.elems_only:
                    row.update(el.attrib.items())

                # The node's own non-blank text becomes a column, except for
                # ``elems_only`` without ``names`` where only children are kept.
                keep_own_text = not self.elems_only or self.names
                if keep_own_text and el.text and not el.text.isspace():
                    row[el.tag] = el.text.strip()

                children = el.xpath("*")
                if self.names:
                    labels = [nm for nm, _ in zip(self.names, children)]
                else:
                    labels = [ch.tag for ch in children]
                for label, ch in zip(labels, children):
                    row[label] = ch.text.strip() if ch.text else None

            dicts.append(row)

        # Always drop the "{uri}" namespace prefix. Probing only the first key
        # of the first row fails on an empty first row and misses prefixes
        # that first appear later.
        dicts = [
            {k.split("}")[1] if "}" in k else k: v for k, v in d.items()} for d in dicts
        ]

        # Give every row the same keys, in first-seen order, padding with None.
        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]

        if self.names:
            dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts]

        return dicts

    def _validate_path(self) -> None:
        """
        Validate xpath.

        Raises
        ------
        ValueError
            * If xpath matches no nodes, or the matched nodes have neither
              child elements nor attributes.
        """

        msg = (
            "xpath does not return any nodes. "
            "Be sure row level nodes are in xpath. "
            "If document uses namespaces denoted with "
            "xmlns, be sure to define namespaces and "
            "use them in xpath."
        )

        elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
        children = self.xml_doc.xpath(self.xpath + "/*", namespaces=self.namespaces)
        attrs = self.xml_doc.xpath(self.xpath + "/@*", namespaces=self.namespaces)

        if elems == [] or (attrs == [] and children == []):
            raise ValueError(msg)

    def _validate_names(self) -> None:
        """
        Validate names.

        This method will check if names is a list and aligns with
        length of parse nodes.

        Raises
        ------
        TypeError
            * If names is not list-like.

        ValueError
            * If names is shorter than the number of child elements of
              the first node matched by xpath.
        """
        if not self.names:
            return

        children = self.xml_doc.xpath(
            self.xpath + "[1]/*", namespaces=self.namespaces
        )

        if not is_list_like(self.names):
            raise TypeError(
                f"{type(self.names).__name__} is not a valid type for names"
            )

        if len(self.names) < len(children):
            raise ValueError(
                "names does not match length of child elements in xpath."
            )

    def _parse_doc(self, raw_doc) -> bytes:
        """
        Read ``raw_doc`` and return the parsed document serialized as bytes.

        Raises
        ------
        TypeError
            * If the input is text (StringIO) and ``encoding`` is None.
        """
        from lxml.etree import (
            XMLParser,
            fromstring,
            parse,
            tostring,
        )

        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with preprocess_data(handle_data) as xml_data:
            curr_parser = XMLParser(encoding=self.encoding)

            if isinstance(xml_data, io.StringIO):
                if self.encoding is None:
                    raise TypeError(
                        "Can not pass encoding None when input is StringIO."
                    )

                # Text input is encoded to bytes before being handed to lxml.
                tree = fromstring(
                    xml_data.getvalue().encode(self.encoding), parser=curr_parser
                )
            else:
                tree = parse(xml_data, parser=curr_parser)

        return tostring(tree)

    def _transform_doc(self) -> bytes:
        """
        Transform original tree using stylesheet.

        This method will transform original xml using XSLT script into
        an ideally flatter xml document for easier parsing and migration
        to Data Frame.
        """
        from lxml.etree import XSLT

        transformer = XSLT(self.xsl_doc)
        new_doc = transformer(self.xml_doc)

        return bytes(new_doc)
576

577

578
def get_data_from_filepath(
    filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str],
    encoding: str | None,
    compression: CompressionOptions,
    storage_options: StorageOptions,
) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]:
    """
    Extract raw XML data.

    The method accepts three input types:
        1. filepath (string-like)
        2. file-like object (e.g. open file object, StringIO)
        3. XML string or bytes

    For (1) the location is opened in text mode and its content is read
    and returned. Input types (2) and (3) are returned unchanged, as is a
    string that is neither XML markup nor an existing location.
    """
    if not isinstance(filepath_or_buffer, bytes):
        filepath_or_buffer = stringify_path(filepath_or_buffer)

    # Only a string that does not start like XML markup can be a location.
    if (
        isinstance(filepath_or_buffer, str)
        and not filepath_or_buffer.startswith("<")
        and (
            is_url(filepath_or_buffer)
            or is_fsspec_url(filepath_or_buffer)
            or file_exists(filepath_or_buffer)
        )
    ):
        with get_handle(
            filepath_or_buffer,
            "r",
            encoding=encoding,
            compression=compression,
            storage_options=storage_options,
        ) as handle_obj:
            filepath_or_buffer = (
                # error: Incompatible types in assignment (expression has type
                # "Union[str, IO[str]]", variable has type "Union[Union[str,
                # PathLike[str]], bytes, ReadBuffer[bytes], ReadBuffer[str]]")
                handle_obj.handle.read()  # type: ignore[assignment]
                if hasattr(handle_obj.handle, "read")
                else handle_obj.handle
            )

    return filepath_or_buffer
624

625

626
def preprocess_data(data) -> io.StringIO | io.BytesIO:
    """
    Convert extracted raw data.

    A ``str`` is wrapped in ``io.StringIO`` and ``bytes`` in ``io.BytesIO``
    so that callers can treat every input as a readable buffer. Any other
    object (e.g. an already open file object or buffer) is returned as is.
    """

    if isinstance(data, str):
        data = io.StringIO(data)

    elif isinstance(data, bytes):
        data = io.BytesIO(data)

    return data
642

643

644
def _data_to_frame(data, **kwargs) -> DataFrame:
    """
    Convert parsed data to Data Frame.

    This method will bind xml dictionary data of keys and values
    into named columns of Data Frame using the built-in TextParser
    class that builds the Data Frame and infers specific dtypes.

    Raises
    ------
    ParserError
        * If TextParser cannot build a frame from the parsed rows.
    """

    # Column labels come from the first row's keys; each row contributes
    # its values.
    tags = next(iter(data))
    nodes = [list(d.values()) for d in data]

    try:
        with TextParser(nodes, names=tags, **kwargs) as tp:
            return tp.read()
    except ParserError as err:
        # Keep the original parser failure attached as the cause.
        raise ParserError(
            "XML document may be too complex for import. "
            "Try to flatten document and use distinct "
            "element and attribute names."
        ) from err
665

666

667
def _parse(
    path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
    xpath: str,
    namespaces: dict[str, str] | None,
    elems_only: bool,
    attrs_only: bool,
    names: Sequence[str] | None,
    encoding: str | None,
    parser: XMLParsers,
    stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
    compression: CompressionOptions,
    storage_options: StorageOptions,
    **kwargs,
) -> DataFrame:
    """
    Call internal parsers.

    This method will conditionally call internal parsers:
    LxmlFrameParser and/or EtreeParser.

    Any extra keyword arguments are forwarded to ``_data_to_frame``.

    Raises
    ------
    ImportError
        * If lxml is not installed if selected as parser.

    ValueError
        * If parser is not lxml or etree.
    """

    parser_cls: type[_EtreeFrameParser] | type[_LxmlFrameParser]

    if parser == "lxml":
        # errors="ignore" returns None when lxml is missing so that the
        # message below can point users at the etree fallback.
        lxml = import_optional_dependency("lxml.etree", errors="ignore")

        if lxml is None:
            raise ImportError("lxml not found, please install or use the etree parser.")

        parser_cls = _LxmlFrameParser

    elif parser == "etree":
        parser_cls = _EtreeFrameParser

    else:
        raise ValueError("Values for parser can only be lxml or etree.")

    # Both parser classes share the constructor signature of the base class.
    p = parser_cls(
        path_or_buffer,
        xpath,
        namespaces,
        elems_only,
        attrs_only,
        names,
        encoding,
        stylesheet,
        compression,
        storage_options,
    )

    data_dicts = p.parse_data()

    return _data_to_frame(data=data_dicts, **kwargs)
736

737

738
@deprecate_nonkeyword_arguments(
    version=None, allowed_args=["path_or_buffer"], stacklevel=2
)
@doc(
    storage_options=_shared_docs["storage_options"],
    decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
)
def read_xml(
    path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
    xpath: str = "./*",
    namespaces: dict[str, str] | None = None,
    elems_only: bool = False,
    attrs_only: bool = False,
    names: Sequence[str] | None = None,
    # encoding can not be None for lxml and StringIO input
    encoding: str | None = "utf-8",
    parser: XMLParsers = "lxml",
    stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None,
    compression: CompressionOptions = "infer",
    storage_options: StorageOptions = None,
) -> DataFrame:
    r"""
    Read XML document into a ``DataFrame`` object.

    .. versionadded:: 1.3.0

    Parameters
    ----------
    path_or_buffer : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a ``read()`` function. The string can be any valid XML
        string or a path. The string can further be a URL. Valid URL schemes
        include http, ftp, s3, and file.

    xpath : str, optional, default './\*'
        The XPath to parse required set of nodes for migration to DataFrame.
        XPath should return a collection of elements and not a single
        element. Note: The ``etree`` parser supports limited XPath
        expressions. For more complex XPath, use ``lxml`` which requires
        installation.

    namespaces : dict, optional
        The namespaces defined in XML document as dicts with key being
        namespace prefix and value the URI. There is no need to include all
        namespaces in XML, only the ones used in ``xpath`` expression.
        Note: if XML document uses default namespace denoted as
        `xmlns='<URI>'` without a prefix, you must assign any temporary
        namespace prefix such as 'doc' to the URI in order to parse
        underlying nodes and/or attributes. For example, ::

            namespaces = {{"doc": "https://example.com"}}

    elems_only : bool, optional, default False
        Parse only the child elements at the specified ``xpath``. By default,
        all child elements and non-empty text nodes are returned.

    attrs_only : bool, optional, default False
        Parse only the attributes at the specified ``xpath``.
        By default, all attributes are returned.

    names : list-like, optional
        Column names for DataFrame of parsed XML data. Use this parameter to
        rename original element names and distinguish same named elements.

    encoding : str, optional, default 'utf-8'
        Encoding of XML document.

    parser : {{'lxml','etree'}}, default 'lxml'
        Parser module to use for retrieval of data. Only 'lxml' and
        'etree' are supported. With 'lxml' more complex XPath searches
        and ability to use XSLT stylesheet are supported.

    stylesheet : str, path object or file-like object
        A URL, file-like object, or a raw string containing an XSLT script.
        This stylesheet should flatten complex, deeply nested XML documents
        for easier parsing. To use this feature you must have ``lxml`` module
        installed and specify 'lxml' as ``parser``. The ``xpath`` must
        reference nodes of transformed XML document generated after XSLT
        transformation and not the original XML document. Only XSLT 1.0
        scripts and not later versions are currently supported.

    {decompression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    {storage_options}

    Returns
    -------
    df
        A DataFrame.

    See Also
    --------
    read_json : Convert a JSON string to pandas object.
    read_html : Read HTML tables into a list of DataFrame objects.

    Notes
    -----
    This method is best designed to import shallow XML documents in
    following format which is the ideal fit for the two-dimensions of a
    ``DataFrame`` (row by column). ::

            <root>
                <row>
                  <column1>data</column1>
                  <column2>data</column2>
                  <column3>data</column3>
                  ...
               </row>
               <row>
                  ...
               </row>
               ...
            </root>

    As a file format, XML documents can be designed any way including
    layout of elements and attributes as long as it conforms to W3C
    specifications. Therefore, this method is a convenience handler for
    a specific flatter design and not all possible XML structures.

    However, for more complex XML documents, ``stylesheet`` allows you to
    temporarily redesign original document with XSLT (a special purpose
    language) for a flatter version for migration to a DataFrame.

    This function will *always* return a single :class:`DataFrame` or raise
    exceptions due to issues with XML document, ``xpath``, or other
    parameters.

    Examples
    --------
    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <data xmlns="http://example.com">
    ...  <row>
    ...    <shape>square</shape>
    ...    <degrees>360</degrees>
    ...    <sides>4.0</sides>
    ...  </row>
    ...  <row>
    ...    <shape>circle</shape>
    ...    <degrees>360</degrees>
    ...    <sides/>
    ...  </row>
    ...  <row>
    ...    <shape>triangle</shape>
    ...    <degrees>180</degrees>
    ...    <sides>3.0</sides>
    ...  </row>
    ... </data>'''

    >>> df = pd.read_xml(xml)
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0

    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <data>
    ...   <row shape="square" degrees="360" sides="4.0"/>
    ...   <row shape="circle" degrees="360"/>
    ...   <row shape="triangle" degrees="180" sides="3.0"/>
    ... </data>'''

    >>> df = pd.read_xml(xml, xpath=".//row")
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0

    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <doc:data xmlns:doc="https://example.com">
    ...   <doc:row>
    ...     <doc:shape>square</doc:shape>
    ...     <doc:degrees>360</doc:degrees>
    ...     <doc:sides>4.0</doc:sides>
    ...   </doc:row>
    ...   <doc:row>
    ...     <doc:shape>circle</doc:shape>
    ...     <doc:degrees>360</doc:degrees>
    ...     <doc:sides/>
    ...   </doc:row>
    ...   <doc:row>
    ...     <doc:shape>triangle</doc:shape>
    ...     <doc:degrees>180</doc:degrees>
    ...     <doc:sides>3.0</doc:sides>
    ...   </doc:row>
    ... </doc:data>'''

    >>> df = pd.read_xml(xml,
    ...                  xpath="//doc:row",
    ...                  namespaces={{"doc": "https://example.com"}})
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0
    """

    # Thin public wrapper: all parsing work is delegated to ``_parse``.
    return _parse(
        path_or_buffer=path_or_buffer,
        xpath=xpath,
        namespaces=namespaces,
        elems_only=elems_only,
        attrs_only=attrs_only,
        names=names,
        encoding=encoding,
        parser=parser,
        stylesheet=stylesheet,
        compression=compression,
        storage_options=storage_options,
    )
951

952
Product

Resources

Company