Path: blob/master/ invest-robot-contest_TinkoffBotTwitch-main/venv/lib/python3.8/site-packages/pandas/io/xml.py
7824 views
"""1:mod:`pandas.io.xml` is a module for reading XML.2"""34from __future__ import annotations56import io7from typing import Sequence89from pandas._typing import (10CompressionOptions,11FilePath,12ReadBuffer,13StorageOptions,14XMLParsers,15)16from pandas.compat._optional import import_optional_dependency17from pandas.errors import (18AbstractMethodError,19ParserError,20)21from pandas.util._decorators import (22deprecate_nonkeyword_arguments,23doc,24)2526from pandas.core.dtypes.common import is_list_like2728from pandas.core.frame import DataFrame29from pandas.core.shared_docs import _shared_docs3031from pandas.io.common import (32file_exists,33get_handle,34is_fsspec_url,35is_url,36stringify_path,37)38from pandas.io.parsers import TextParser394041@doc(decompression_options=_shared_docs["decompression_options"] % "path_or_buffer")42class _XMLFrameParser:43"""44Internal subclass to parse XML into DataFrames.4546Parameters47----------48path_or_buffer : a valid JSON str, path object or file-like object49Any valid string path is acceptable. The string could be a URL. Valid50URL schemes include http, ftp, s3, and file.5152xpath : str or regex53The XPath expression to parse required set of nodes for54migration to `Data Frame`. `etree` supports limited XPath.5556namespacess : dict57The namespaces defined in XML document (`xmlns:namespace='URI')58as dicts with key being namespace and value the URI.5960elems_only : bool61Parse only the child elements at the specified `xpath`.6263attrs_only : bool64Parse only the attributes at the specified `xpath`.6566names : list67Column names for Data Frame of parsed XML data.6869encoding : str70Encoding of xml object or document.7172stylesheet : str or file-like73URL, file, file-like object, or a raw string containing XSLT,74`etree` does not support XSLT but retained for consistency.7576{decompression_options}7778.. versionchanged:: 1.4.0 Zstandard support.7980storage_options : dict, optional81Extra options that make sense for a particular storage connection,82e.g. host, port, username, password, etc.,8384See also85--------86pandas.io.xml._EtreeFrameParser87pandas.io.xml._LxmlFrameParser8889Notes90-----91To subclass this class effectively you must override the following methods:`92* :func:`parse_data`93* :func:`_parse_nodes`94* :func:`_parse_doc`95* :func:`_validate_names`96* :func:`_validate_path`979899See each method's respective documentation for details on their100functionality.101"""102103def __init__(104self,105path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],106xpath: str,107namespaces: dict[str, str] | None,108elems_only: bool,109attrs_only: bool,110names: Sequence[str] | None,111encoding: str | None,112stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,113compression: CompressionOptions,114storage_options: StorageOptions,115):116self.path_or_buffer = path_or_buffer117self.xpath = xpath118self.namespaces = namespaces119self.elems_only = elems_only120self.attrs_only = attrs_only121self.names = names122self.encoding = encoding123self.stylesheet = stylesheet124self.is_style = None125self.compression = compression126self.storage_options = storage_options127128def parse_data(self) -> list[dict[str, str | None]]:129"""130Parse xml data.131132This method will call the other internal methods to133validate xpath, names, parse and return specific nodes.134"""135136raise AbstractMethodError(self)137138def _parse_nodes(self) -> list[dict[str, str | None]]:139"""140Parse xml nodes.141142This method will parse the children and attributes of elements143in xpath, conditionally for only elements, only attributes144or both while optionally renaming node names.145146Raises147------148ValueError149* If only elements and only attributes are specified.150151Notes152-----153Namespace URIs will be removed from return node values.Also,154elements with missing children or attributes compared to siblings155will have optional keys filled withi None values.156"""157158raise AbstractMethodError(self)159160def _validate_path(self) -> None:161"""162Validate xpath.163164This method checks for syntax, evaluation, or empty nodes return.165166Raises167------168SyntaxError169* If xpah is not supported or issues with namespaces.170171ValueError172* If xpah does not return any nodes.173"""174175raise AbstractMethodError(self)176177def _validate_names(self) -> None:178"""179Validate names.180181This method will check if names is a list-like and aligns182with length of parse nodes.183184Raises185------186ValueError187* If value is not a list and less then length of nodes.188"""189raise AbstractMethodError(self)190191def _parse_doc(self, raw_doc) -> bytes:192"""193Build tree from path_or_buffer.194195This method will parse XML object into tree196either from string/bytes or file location.197"""198raise AbstractMethodError(self)199200201class _EtreeFrameParser(_XMLFrameParser):202"""203Internal class to parse XML into DataFrames with the Python204standard library XML module: `xml.etree.ElementTree`.205"""206207def parse_data(self) -> list[dict[str, str | None]]:208from xml.etree.ElementTree import XML209210if self.stylesheet is not None:211raise ValueError(212"To use stylesheet, you need lxml installed and selected as parser."213)214215self.xml_doc = XML(self._parse_doc(self.path_or_buffer))216217self._validate_path()218self._validate_names()219220return self._parse_nodes()221222def _parse_nodes(self) -> list[dict[str, str | None]]:223elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)224dicts: list[dict[str, str | None]]225226if self.elems_only and self.attrs_only:227raise ValueError("Either element or attributes can be parsed not both.")228elif self.elems_only:229if self.names:230dicts = [231{232**(233{el.tag: el.text.strip()}234if el.text and not el.text.isspace()235else {}236),237**{238nm: ch.text.strip() if ch.text else None239for nm, ch in zip(self.names, el.findall("*"))240},241}242for el in elems243]244else:245dicts = [246{247ch.tag: ch.text.strip() if ch.text else None248for ch in el.findall("*")249}250for el in elems251]252253elif self.attrs_only:254dicts = [255{k: v.strip() if v else None for k, v in el.attrib.items()}256for el in elems257]258259else:260if self.names:261dicts = [262{263**el.attrib,264**(265{el.tag: el.text.strip()}266if el.text and not el.text.isspace()267else {}268),269**{270nm: ch.text.strip() if ch.text else None271for nm, ch in zip(self.names, el.findall("*"))272},273}274for el in elems275]276277else:278dicts = [279{280**el.attrib,281**(282{el.tag: el.text.strip()}283if el.text and not el.text.isspace()284else {}285),286**{287ch.tag: ch.text.strip() if ch.text else None288for ch in el.findall("*")289},290}291for el in elems292]293294dicts = [295{k.split("}")[1] if "}" in k else k: v for k, v in d.items()} for d in dicts296]297298keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))299dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]300301if self.names:302dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts]303304return dicts305306def _validate_path(self) -> None:307"""308Notes309-----310`etree` supports limited XPath. If user attempts a more complex311expression syntax error will raise.312"""313314msg = (315"xpath does not return any nodes. "316"If document uses namespaces denoted with "317"xmlns, be sure to define namespaces and "318"use them in xpath."319)320try:321elems = self.xml_doc.find(self.xpath, namespaces=self.namespaces)322if elems is None:323raise ValueError(msg)324325if elems is not None and elems.find("*") is None and elems.attrib is None:326raise ValueError(msg)327328except (KeyError, SyntaxError):329raise SyntaxError(330"You have used an incorrect or unsupported XPath "331"expression for etree library or you used an "332"undeclared namespace prefix."333)334335def _validate_names(self) -> None:336if self.names:337parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces)338children = parent.findall("*") if parent else []339340if is_list_like(self.names):341if len(self.names) < len(children):342raise ValueError(343"names does not match length of child elements in xpath."344)345else:346raise TypeError(347f"{type(self.names).__name__} is not a valid type for names"348)349350def _parse_doc(self, raw_doc) -> bytes:351from xml.etree.ElementTree import (352XMLParser,353parse,354tostring,355)356357handle_data = get_data_from_filepath(358filepath_or_buffer=raw_doc,359encoding=self.encoding,360compression=self.compression,361storage_options=self.storage_options,362)363364with preprocess_data(handle_data) as xml_data:365curr_parser = XMLParser(encoding=self.encoding)366r = parse(xml_data, parser=curr_parser)367368return tostring(r.getroot())369370371class _LxmlFrameParser(_XMLFrameParser):372"""373Internal class to parse XML into DataFrames with third-party374full-featured XML library, `lxml`, that supports375XPath 1.0 and XSLT 1.0.376"""377378def parse_data(self) -> list[dict[str, str | None]]:379"""380Parse xml data.381382This method will call the other internal methods to383validate xpath, names, optionally parse and run XSLT,384and parse original or transformed XML and return specific nodes.385"""386from lxml.etree import XML387388self.xml_doc = XML(self._parse_doc(self.path_or_buffer))389390if self.stylesheet is not None:391self.xsl_doc = XML(self._parse_doc(self.stylesheet))392self.xml_doc = XML(self._transform_doc())393394self._validate_path()395self._validate_names()396397return self._parse_nodes()398399def _parse_nodes(self) -> list[dict[str, str | None]]:400elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)401dicts: list[dict[str, str | None]]402403if self.elems_only and self.attrs_only:404raise ValueError("Either element or attributes can be parsed not both.")405406elif self.elems_only:407if self.names:408dicts = [409{410**(411{el.tag: el.text.strip()}412if el.text and not el.text.isspace()413else {}414),415**{416nm: ch.text.strip() if ch.text else None417for nm, ch in zip(self.names, el.xpath("*"))418},419}420for el in elems421]422else:423dicts = [424{425ch.tag: ch.text.strip() if ch.text else None426for ch in el.xpath("*")427}428for el in elems429]430431elif self.attrs_only:432dicts = [el.attrib for el in elems]433434else:435if self.names:436dicts = [437{438**el.attrib,439**(440{el.tag: el.text.strip()}441if el.text and not el.text.isspace()442else {}443),444**{445nm: ch.text.strip() if ch.text else None446for nm, ch in zip(self.names, el.xpath("*"))447},448}449for el in elems450]451else:452dicts = [453{454**el.attrib,455**(456{el.tag: el.text.strip()}457if el.text and not el.text.isspace()458else {}459),460**{461ch.tag: ch.text.strip() if ch.text else None462for ch in el.xpath("*")463},464}465for el in elems466]467468if self.namespaces or "}" in list(dicts[0].keys())[0]:469dicts = [470{k.split("}")[1] if "}" in k else k: v for k, v in d.items()}471for d in dicts472]473474keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))475dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]476477if self.names:478dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts]479480return dicts481482def _validate_path(self) -> None:483484msg = (485"xpath does not return any nodes. "486"Be sure row level nodes are in xpath. "487"If document uses namespaces denoted with "488"xmlns, be sure to define namespaces and "489"use them in xpath."490)491492elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)493children = self.xml_doc.xpath(self.xpath + "/*", namespaces=self.namespaces)494attrs = self.xml_doc.xpath(self.xpath + "/@*", namespaces=self.namespaces)495496if elems == []:497raise ValueError(msg)498499if elems != [] and attrs == [] and children == []:500raise ValueError(msg)501502def _validate_names(self) -> None:503"""504Validate names.505506This method will check if names is a list and aligns with507length of parse nodes.508509Raises510------511ValueError512* If value is not a list and less then length of nodes.513"""514if self.names:515children = self.xml_doc.xpath(516self.xpath + "[1]/*", namespaces=self.namespaces517)518519if is_list_like(self.names):520if len(self.names) < len(children):521raise ValueError(522"names does not match length of child elements in xpath."523)524else:525raise TypeError(526f"{type(self.names).__name__} is not a valid type for names"527)528529def _parse_doc(self, raw_doc) -> bytes:530from lxml.etree import (531XMLParser,532fromstring,533parse,534tostring,535)536537handle_data = get_data_from_filepath(538filepath_or_buffer=raw_doc,539encoding=self.encoding,540compression=self.compression,541storage_options=self.storage_options,542)543544with preprocess_data(handle_data) as xml_data:545curr_parser = XMLParser(encoding=self.encoding)546547if isinstance(xml_data, io.StringIO):548if self.encoding is None:549raise TypeError(550"Can not pass encoding None when input is StringIO."551)552553doc = fromstring(554xml_data.getvalue().encode(self.encoding), parser=curr_parser555)556else:557doc = parse(xml_data, parser=curr_parser)558559return tostring(doc)560561def _transform_doc(self) -> bytes:562"""563Transform original tree using stylesheet.564565This method will transform original xml using XSLT script into566am ideally flatter xml document for easier parsing and migration567to Data Frame.568"""569from lxml.etree import XSLT570571transformer = XSLT(self.xsl_doc)572new_doc = transformer(self.xml_doc)573574return bytes(new_doc)575576577def get_data_from_filepath(578filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str],579encoding: str | None,580compression: CompressionOptions,581storage_options: StorageOptions,582) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]:583"""584Extract raw XML data.585586The method accepts three input types:5871. filepath (string-like)5882. file-like object (e.g. open file object, StringIO)5893. XML string or bytes590591This method turns (1) into (2) to simplify the rest of the processing.592It returns input types (2) and (3) unchanged.593"""594if not isinstance(filepath_or_buffer, bytes):595filepath_or_buffer = stringify_path(filepath_or_buffer)596597if (598isinstance(filepath_or_buffer, str)599and not filepath_or_buffer.startswith(("<?xml", "<"))600) and (601not isinstance(filepath_or_buffer, str)602or is_url(filepath_or_buffer)603or is_fsspec_url(filepath_or_buffer)604or file_exists(filepath_or_buffer)605):606with get_handle(607filepath_or_buffer,608"r",609encoding=encoding,610compression=compression,611storage_options=storage_options,612) as handle_obj:613filepath_or_buffer = (614# error: Incompatible types in assignment (expression has type615# "Union[str, IO[str]]", variable has type "Union[Union[str,616# PathLike[str]], bytes, ReadBuffer[bytes], ReadBuffer[str]]")617handle_obj.handle.read() # type: ignore[assignment]618if hasattr(handle_obj.handle, "read")619else handle_obj.handle620)621622return filepath_or_buffer623624625def preprocess_data(data) -> io.StringIO | io.BytesIO:626"""627Convert extracted raw data.628629This method will return underlying data of extracted XML content.630The data either has a `read` attribute (e.g. a file object or a631StringIO/BytesIO) or is a string or bytes that is an XML document.632"""633634if isinstance(data, str):635data = io.StringIO(data)636637elif isinstance(data, bytes):638data = io.BytesIO(data)639640return data641642643def _data_to_frame(data, **kwargs) -> DataFrame:644"""645Convert parsed data to Data Frame.646647This method will bind xml dictionary data of keys and values648into named columns of Data Frame using the built-in TextParser649class that build Data Frame and infers specific dtypes.650"""651652tags = next(iter(data))653nodes = [list(d.values()) for d in data]654655try:656with TextParser(nodes, names=tags, **kwargs) as tp:657return tp.read()658except ParserError:659raise ParserError(660"XML document may be too complex for import. "661"Try to flatten document and use distinct "662"element and attribute names."663)664665666def _parse(667path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],668xpath: str,669namespaces: dict[str, str] | None,670elems_only: bool,671attrs_only: bool,672names: Sequence[str] | None,673encoding: str | None,674parser: XMLParsers,675stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,676compression: CompressionOptions,677storage_options: StorageOptions,678**kwargs,679) -> DataFrame:680"""681Call internal parsers.682683This method will conditionally call internal parsers:684LxmlFrameParser and/or EtreeParser.685686Raises687------688ImportError689* If lxml is not installed if selected as parser.690691ValueError692* If parser is not lxml or etree.693"""694695p: _EtreeFrameParser | _LxmlFrameParser696697if parser == "lxml":698lxml = import_optional_dependency("lxml.etree", errors="ignore")699700if lxml is not None:701p = _LxmlFrameParser(702path_or_buffer,703xpath,704namespaces,705elems_only,706attrs_only,707names,708encoding,709stylesheet,710compression,711storage_options,712)713else:714raise ImportError("lxml not found, please install or use the etree parser.")715716elif parser == "etree":717p = _EtreeFrameParser(718path_or_buffer,719xpath,720namespaces,721elems_only,722attrs_only,723names,724encoding,725stylesheet,726compression,727storage_options,728)729else:730raise ValueError("Values for parser can only be lxml or etree.")731732data_dicts = p.parse_data()733734return _data_to_frame(data=data_dicts, **kwargs)735736737@deprecate_nonkeyword_arguments(738version=None, allowed_args=["path_or_buffer"], stacklevel=2739)740@doc(741storage_options=_shared_docs["storage_options"],742decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",743)744def read_xml(745path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],746xpath: str = "./*",747namespaces: dict[str, str] | None = None,748elems_only: bool = False,749attrs_only: bool = False,750names: Sequence[str] | None = None,751# encoding can not be None for lxml and StringIO input752encoding: str | None = "utf-8",753parser: XMLParsers = "lxml",754stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None,755compression: CompressionOptions = "infer",756storage_options: StorageOptions = None,757) -> DataFrame:758r"""759Read XML document into a ``DataFrame`` object.760761.. versionadded:: 1.3.0762763Parameters764----------765path_or_buffer : str, path object, or file-like object766String, path object (implementing ``os.PathLike[str]``), or file-like767object implementing a ``read()`` function. The string can be any valid XML768string or a path. The string can further be a URL. Valid URL schemes769include http, ftp, s3, and file.770771xpath : str, optional, default './\*'772The XPath to parse required set of nodes for migration to DataFrame.773XPath should return a collection of elements and not a single774element. Note: The ``etree`` parser supports limited XPath775expressions. For more complex XPath, use ``lxml`` which requires776installation.777778namespaces : dict, optional779The namespaces defined in XML document as dicts with key being780namespace prefix and value the URI. There is no need to include all781namespaces in XML, only the ones used in ``xpath`` expression.782Note: if XML document uses default namespace denoted as783`xmlns='<URI>'` without a prefix, you must assign any temporary784namespace prefix such as 'doc' to the URI in order to parse785underlying nodes and/or attributes. For example, ::786787namespaces = {{"doc": "https://example.com"}}788789elems_only : bool, optional, default False790Parse only the child elements at the specified ``xpath``. By default,791all child elements and non-empty text nodes are returned.792793attrs_only : bool, optional, default False794Parse only the attributes at the specified ``xpath``.795By default, all attributes are returned.796797names : list-like, optional798Column names for DataFrame of parsed XML data. Use this parameter to799rename original element names and distinguish same named elements.800801encoding : str, optional, default 'utf-8'802Encoding of XML document.803804parser : {{'lxml','etree'}}, default 'lxml'805Parser module to use for retrieval of data. Only 'lxml' and806'etree' are supported. With 'lxml' more complex XPath searches807and ability to use XSLT stylesheet are supported.808809stylesheet : str, path object or file-like object810A URL, file-like object, or a raw string containing an XSLT script.811This stylesheet should flatten complex, deeply nested XML documents812for easier parsing. To use this feature you must have ``lxml`` module813installed and specify 'lxml' as ``parser``. The ``xpath`` must814reference nodes of transformed XML document generated after XSLT815transformation and not the original XML document. Only XSLT 1.0816scripts and not later versions is currently supported.817818{decompression_options}819820.. versionchanged:: 1.4.0 Zstandard support.821822{storage_options}823824Returns825-------826df827A DataFrame.828829See Also830--------831read_json : Convert a JSON string to pandas object.832read_html : Read HTML tables into a list of DataFrame objects.833834Notes835-----836This method is best designed to import shallow XML documents in837following format which is the ideal fit for the two-dimensions of a838``DataFrame`` (row by column). ::839840<root>841<row>842<column1>data</column1>843<column2>data</column2>844<column3>data</column3>845...846</row>847<row>848...849</row>850...851</root>852853As a file format, XML documents can be designed any way including854layout of elements and attributes as long as it conforms to W3C855specifications. Therefore, this method is a convenience handler for856a specific flatter design and not all possible XML structures.857858However, for more complex XML documents, ``stylesheet`` allows you to859temporarily redesign original document with XSLT (a special purpose860language) for a flatter version for migration to a DataFrame.861862This function will *always* return a single :class:`DataFrame` or raise863exceptions due to issues with XML document, ``xpath``, or other864parameters.865866Examples867--------868>>> xml = '''<?xml version='1.0' encoding='utf-8'?>869... <data xmlns="http://example.com">870... <row>871... <shape>square</shape>872... <degrees>360</degrees>873... <sides>4.0</sides>874... </row>875... <row>876... <shape>circle</shape>877... <degrees>360</degrees>878... <sides/>879... </row>880... <row>881... <shape>triangle</shape>882... <degrees>180</degrees>883... <sides>3.0</sides>884... </row>885... </data>'''886887>>> df = pd.read_xml(xml)888>>> df889shape degrees sides8900 square 360 4.08911 circle 360 NaN8922 triangle 180 3.0893894>>> xml = '''<?xml version='1.0' encoding='utf-8'?>895... <data>896... <row shape="square" degrees="360" sides="4.0"/>897... <row shape="circle" degrees="360"/>898... <row shape="triangle" degrees="180" sides="3.0"/>899... </data>'''900901>>> df = pd.read_xml(xml, xpath=".//row")902>>> df903shape degrees sides9040 square 360 4.09051 circle 360 NaN9062 triangle 180 3.0907908>>> xml = '''<?xml version='1.0' encoding='utf-8'?>909... <doc:data xmlns:doc="https://example.com">910... <doc:row>911... <doc:shape>square</doc:shape>912... <doc:degrees>360</doc:degrees>913... <doc:sides>4.0</doc:sides>914... </doc:row>915... <doc:row>916... <doc:shape>circle</doc:shape>917... <doc:degrees>360</doc:degrees>918... <doc:sides/>919... </doc:row>920... <doc:row>921... <doc:shape>triangle</doc:shape>922... <doc:degrees>180</doc:degrees>923... <doc:sides>3.0</doc:sides>924... </doc:row>925... </doc:data>'''926927>>> df = pd.read_xml(xml,928... xpath="//doc:row",929... namespaces={{"doc": "https://example.com"}})930>>> df931shape degrees sides9320 square 360 4.09331 circle 360 NaN9342 triangle 180 3.0935"""936937return _parse(938path_or_buffer=path_or_buffer,939xpath=xpath,940namespaces=namespaces,941elems_only=elems_only,942attrs_only=attrs_only,943names=names,944encoding=encoding,945parser=parser,946stylesheet=stylesheet,947compression=compression,948storage_options=storage_options,949)950951952