Path: blob/master/ invest-robot-contest_TinkoffBotTwitch-main/venv/lib/python3.8/site-packages/pandas/io/html.py
7813 views
"""
:mod:`pandas.io.html` is a module containing functionality for dealing with
HTML IO.

"""

from __future__ import annotations

from collections import abc
import numbers
import re
from typing import (
    Pattern,
    Sequence,
    cast,
)

from pandas._typing import (
    FilePath,
    ReadBuffer,
)
from pandas.compat._optional import import_optional_dependency
from pandas.errors import (
    AbstractMethodError,
    EmptyDataError,
)
from pandas.util._decorators import deprecate_nonkeyword_arguments

from pandas.core.dtypes.common import is_list_like

from pandas.core.construction import create_series_with_explicit_dtype
from pandas.core.frame import DataFrame

from pandas.io.common import (
    file_exists,
    get_handle,
    is_url,
    stringify_path,
    urlopen,
    validate_header_arg,
)
from pandas.io.formats.printing import pprint_thing
from pandas.io.parsers import TextParser

# Module-level flags recording which optional parser backends are available.
# Populated lazily by _importers() on first use.
_IMPORTS = False
_HAS_BS4 = False
_HAS_LXML = False
_HAS_HTML5LIB = False


def _importers() -> None:
    # import things we need
    # but make this done on a first use basis

    global _IMPORTS
    if _IMPORTS:
        return

    global _HAS_BS4, _HAS_LXML, _HAS_HTML5LIB
    bs4 = import_optional_dependency("bs4", errors="ignore")
    _HAS_BS4 = bs4 is not None

    lxml = import_optional_dependency("lxml.etree", errors="ignore")
    _HAS_LXML = lxml is not None

    html5lib = import_optional_dependency("html5lib", errors="ignore")
    _HAS_HTML5LIB = html5lib is not None

    _IMPORTS = True


#############
# READ HTML #
#############
_RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}")


def _remove_whitespace(s: str, regex: Pattern = _RE_WHITESPACE) -> str:
    """
    Replace extra whitespace inside of a string with a single space.

    Parameters
    ----------
    s : str or unicode
        The string from which to remove extra whitespace.
    regex : re.Pattern
        The regular expression to use to remove extra whitespace.

    Returns
    -------
    subd : str or unicode
        `s` with all extra whitespace replaced with a single space.
    """
    return regex.sub(" ", s.strip())


def _get_skiprows(skiprows: int | Sequence[int] | slice | None) -> int | Sequence[int]:
    """
    Get an iterator given an integer, slice or container.

    Parameters
    ----------
    skiprows : int, slice, container
        The iterator to use to skip rows; can also be a slice.

    Raises
    ------
    TypeError
        * If `skiprows` is not a slice, integer, or Container

    Returns
    -------
    it : iterable
        A proper iterator to use to skip rows of a DataFrame.
    """
    if isinstance(skiprows, slice):
        start, step = skiprows.start or 0, skiprows.step or 1
        return list(range(start, skiprows.stop, step))
    elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows):
        return cast("int | Sequence[int]", skiprows)
    elif skiprows is None:
        return 0
    raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows")


def _read(
    obj: bytes | FilePath | ReadBuffer[str] | ReadBuffer[bytes], encoding: str | None
) -> str | bytes:
    """
    Try to read from a url, file or string.

    Parameters
    ----------
    obj : str, unicode, path object, or file-like object

    Returns
    -------
    raw_text : str
    """
    text: str | bytes
    if (
        is_url(obj)
        or hasattr(obj, "read")
        or (isinstance(obj, str) and file_exists(obj))
    ):
        # error: Argument 1 to "get_handle" has incompatible type "Union[str, bytes,
        # Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap]]";
        # expected "Union[PathLike[str], Union[str, Union[IO[Any], RawIOBase,
        # BufferedIOBase, TextIOBase, TextIOWrapper, mmap]]]"
        with get_handle(
            obj, "r", encoding=encoding  # type: ignore[arg-type]
        ) as handles:
            text = handles.handle.read()
    elif isinstance(obj, (str, bytes)):
        # A raw blob of HTML (or bytes) passed directly.
        text = obj
    else:
        raise TypeError(f"Cannot read object of type '{type(obj).__name__}'")
    return text
class _HtmlFrameParser:
    """
    Base class for parsers that parse HTML into DataFrames.

    Parameters
    ----------
    io : str or file-like
        This can be either a string of raw HTML, a valid URL using the HTTP,
        FTP, or FILE protocols or a file-like object.

    match : str or regex
        The text to match in the document.

    attrs : dict
        List of HTML <table> element attributes to match.

    encoding : str
        Encoding to be used by parser

    displayed_only : bool
        Whether or not items with "display:none" should be ignored

    Attributes
    ----------
    io : str or file-like
        raw HTML, URL, or file-like object

    match : regex
        The text to match in the raw HTML

    attrs : dict-like
        A dictionary of valid table attributes to use to search for table
        elements.

    encoding : str
        Encoding to be used by parser

    displayed_only : bool
        Whether or not items with "display:none" should be ignored

    Notes
    -----
    To subclass this class effectively you must override the following methods:
        * :func:`_build_doc`
        * :func:`_attr_getter`
        * :func:`_text_getter`
        * :func:`_parse_td`
        * :func:`_parse_thead_tr`
        * :func:`_parse_tbody_tr`
        * :func:`_parse_tfoot_tr`
        * :func:`_parse_tables`
        * :func:`_equals_tag`

    See each method's respective documentation for details on their
    functionality.
    """

    def __init__(
        self,
        io: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
        match: str | Pattern,
        attrs: dict[str, str] | None,
        encoding: str,
        displayed_only: bool,
    ):
        self.io = io
        self.match = match
        self.attrs = attrs
        self.encoding = encoding
        self.displayed_only = displayed_only

    def parse_tables(self):
        """
        Parse and return all tables from the DOM.

        Returns
        -------
        list of parsed (header, body, footer) tuples from tables.
        """
        tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
        return (self._parse_thead_tbody_tfoot(table) for table in tables)

    def _attr_getter(self, obj, attr):
        """
        Return the attribute value of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        attr : str or unicode
            The attribute, such as "colspan"

        Returns
        -------
        str or unicode
            The attribute value.
        """
        # Both lxml and BeautifulSoup have the same implementation:
        return obj.get(attr)

    def _text_getter(self, obj):
        """
        Return the text of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        Returns
        -------
        text : str or unicode
            The text from an individual DOM node.
        """
        raise AbstractMethodError(self)

    def _parse_td(self, obj):
        """
        Return the td elements from a row element.

        Parameters
        ----------
        obj : node-like
            A DOM <tr> node.

        Returns
        -------
        list of node-like
            These are the elements of each row, i.e., the columns.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tr(self, table):
        """
        Return the list of thead row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains zero or more thead elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tbody_tr(self, table):
        """
        Return the list of tbody row elements from the parsed table element.

        HTML5 table bodies consist of either 0 or more <tbody> elements (which
        only contain <tr> elements) or 0 or more <tr> elements. This method
        checks for both structures.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tfoot_tr(self, table):
        """
        Return the list of tfoot row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tables(self, doc, match, attrs):
        """
        Return all tables from the parsed DOM.

        Parameters
        ----------
        doc : the DOM from which to parse the table element.

        match : str or regular expression
            The text to search for in the DOM tree.

        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

        Raises
        ------
        ValueError : `match` does not match any text in the document.

        Returns
        -------
        list of node-like
            HTML <table> elements to be parsed into raw data.
        """
        raise AbstractMethodError(self)

    def _equals_tag(self, obj, tag):
        """
        Return whether an individual DOM node matches a tag

        Parameters
        ----------
        obj : node-like
            A DOM node.

        tag : str
            Tag name to be checked for equality.

        Returns
        -------
        boolean
            Whether `obj`'s tag name is `tag`
        """
        raise AbstractMethodError(self)

    def _build_doc(self):
        """
        Return a tree-like object that can be used to iterate over the DOM.

        Returns
        -------
        node-like
            The DOM from which to parse the table element.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tbody_tfoot(self, table_html):
        """
        Given a table, return parsed header, body, and foot.

        Parameters
        ----------
        table_html : node-like

        Returns
        -------
        tuple of (header, body, footer), each a list of list-of-text rows.

        Notes
        -----
        Header and body are lists-of-lists. Top level list is a list of
        rows. Each row is a list of str text.

        Logic: Use <thead>, <tbody>, <tfoot> elements to identify
               header, body, and footer, otherwise:
               - Put all rows into body
               - Move rows from top of body to header only if
                 all elements inside row are <th>
               - Move rows from bottom of body to footer only if
                 all elements inside row are <th>
        """
        header_rows = self._parse_thead_tr(table_html)
        body_rows = self._parse_tbody_tr(table_html)
        footer_rows = self._parse_tfoot_tr(table_html)

        def row_is_all_th(row):
            return all(self._equals_tag(t, "th") for t in self._parse_td(row))

        if not header_rows:
            # The table has no <thead>. Move the top all-<th> rows from
            # body_rows to header_rows. (This is a common case because many
            # tables in the wild have no <thead> or <tfoot>
            while body_rows and row_is_all_th(body_rows[0]):
                header_rows.append(body_rows.pop(0))

        header = self._expand_colspan_rowspan(header_rows)
        body = self._expand_colspan_rowspan(body_rows)
        footer = self._expand_colspan_rowspan(footer_rows)

        return header, body, footer

    def _expand_colspan_rowspan(self, rows):
        """
        Given a list of <tr>s, return a list of text rows.

        Parameters
        ----------
        rows : list of node-like
            List of <tr>s

        Returns
        -------
        list of list
            Each returned row is a list of str text.

        Notes
        -----
        Any cell with ``rowspan`` or ``colspan`` will have its contents copied
        to subsequent cells.
        """
        all_texts = []  # list of rows, each a list of str
        remainder: list[tuple[int, str, int]] = []  # list of (index, text, nrows)

        for tr in rows:
            texts = []  # the output for this row
            next_remainder = []

            index = 0
            tds = self._parse_td(tr)
            for td in tds:
                # Append texts from previous rows with rowspan>1 that come
                # before this <td>
                while remainder and remainder[0][0] <= index:
                    prev_i, prev_text, prev_rowspan = remainder.pop(0)
                    texts.append(prev_text)
                    if prev_rowspan > 1:
                        next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
                    index += 1

                # Append the text from this <td>, colspan times
                text = _remove_whitespace(self._text_getter(td))
                rowspan = int(self._attr_getter(td, "rowspan") or 1)
                colspan = int(self._attr_getter(td, "colspan") or 1)

                for _ in range(colspan):
                    texts.append(text)
                    if rowspan > 1:
                        next_remainder.append((index, text, rowspan - 1))
                    index += 1

            # Append texts from previous rows at the final position
            for prev_i, prev_text, prev_rowspan in remainder:
                texts.append(prev_text)
                if prev_rowspan > 1:
                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))

            all_texts.append(texts)
            remainder = next_remainder

        # Append rows that only appear because the previous row had non-1
        # rowspan
        while remainder:
            next_remainder = []
            texts = []
            for prev_i, prev_text, prev_rowspan in remainder:
                texts.append(prev_text)
                if prev_rowspan > 1:
                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
            all_texts.append(texts)
            remainder = next_remainder

        return all_texts

    def _handle_hidden_tables(self, tbl_list, attr_name):
        """
        Return list of tables, potentially removing hidden elements

        Parameters
        ----------
        tbl_list : list of node-like
            Type of list elements will vary depending upon parser used
        attr_name : str
            Name of the accessor for retrieving HTML attributes

        Returns
        -------
        list of node-like
            Return type matches `tbl_list`
        """
        if not self.displayed_only:
            return tbl_list

        # Drop tables whose own style attribute marks them display:none;
        # spaces are stripped so "display: none" also matches.
        return [
            x
            for x in tbl_list
            if "display:none"
            not in getattr(x, attr_name).get("style", "").replace(" ", "")
        ]
class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
    """
    HTML to DataFrame parser that uses BeautifulSoup under the hood.

    See Also
    --------
    pandas.io.html._HtmlFrameParser
    pandas.io.html._LxmlFrameParser

    Notes
    -----
    Documentation strings for this class are in the base class
    :class:`pandas.io.html._HtmlFrameParser`.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        from bs4 import SoupStrainer

        # Restrict parsing to <table> elements only.
        self._strainer = SoupStrainer("table")

    def _parse_tables(self, doc, match, attrs):
        element_name = self._strainer.name
        tables = doc.find_all(element_name, attrs=attrs)

        if not tables:
            raise ValueError("No tables found")

        result = []
        unique_tables = set()
        tables = self._handle_hidden_tables(tables, "attrs")

        for table in tables:
            if self.displayed_only:
                # Remove hidden descendants in place before matching text.
                for elem in table.find_all(style=re.compile(r"display:\s*none")):
                    elem.decompose()

            if table not in unique_tables and table.find(text=match) is not None:
                result.append(table)
            unique_tables.add(table)

        if not result:
            raise ValueError(f"No tables found matching pattern {repr(match.pattern)}")
        return result

    def _text_getter(self, obj):
        return obj.text

    def _equals_tag(self, obj, tag):
        return obj.name == tag

    def _parse_td(self, row):
        return row.find_all(("td", "th"), recursive=False)

    def _parse_thead_tr(self, table):
        return table.select("thead tr")

    def _parse_tbody_tr(self, table):
        from_tbody = table.select("tbody tr")
        from_root = table.find_all("tr", recursive=False)
        # HTML spec: at most one of these lists has content
        return from_tbody + from_root

    def _parse_tfoot_tr(self, table):
        return table.select("tfoot tr")

    def _setup_build_doc(self):
        raw_text = _read(self.io, self.encoding)
        if not raw_text:
            raise ValueError(f"No text parsed from document: {self.io}")
        return raw_text

    def _build_doc(self):
        from bs4 import BeautifulSoup

        bdoc = self._setup_build_doc()
        if isinstance(bdoc, bytes) and self.encoding is not None:
            # Decode ourselves so BeautifulSoup does not second-guess the
            # caller-supplied encoding.
            udoc = bdoc.decode(self.encoding)
            from_encoding = None
        else:
            udoc = bdoc
            from_encoding = self.encoding
        return BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)


def _build_xpath_expr(attrs) -> str:
    """
    Build an xpath expression to simulate bs4's ability to pass in kwargs to
    search for attributes when using the lxml parser.

    Parameters
    ----------
    attrs : dict
        A dict of HTML attributes. These are NOT checked for validity.

    Returns
    -------
    expr : unicode
        An XPath expression that checks for the given HTML attributes.
    """
    # give class attribute as class_ because class is a python keyword
    if "class_" in attrs:
        attrs["class"] = attrs.pop("class_")

    s = " and ".join([f"@{k}={repr(v)}" for k, v in attrs.items()])
    return f"[{s}]"


# EXSLT regular-expression namespace used by the lxml parser's XPath queries.
_re_namespace = {"re": "http://exslt.org/regular-expressions"}


class _LxmlFrameParser(_HtmlFrameParser):
    """
    HTML to DataFrame parser that uses lxml under the hood.

    Warning
    -------
    This parser can only handle HTTP, FTP, and FILE urls.

    See Also
    --------
    _HtmlFrameParser
    _BeautifulSoupLxmlFrameParser

    Notes
    -----
    Documentation strings for this class are in the base class
    :class:`_HtmlFrameParser`.
    """

    def _text_getter(self, obj):
        return obj.text_content()

    def _parse_td(self, row):
        # Look for direct children only: the "row" element here may be a
        # <thead> or <tfoot> (see _parse_thead_tr).
        return row.xpath("./td|./th")

    def _parse_tables(self, doc, match, kwargs):
        pattern = match.pattern

        # 1. check all descendants for the given pattern and only search tables
        # 2. go up the tree until we find a table
        xpath_expr = f"//table//*[re:test(text(), {repr(pattern)})]/ancestor::table"

        # if any table attributes were given build an xpath expression to
        # search for them
        if kwargs:
            xpath_expr += _build_xpath_expr(kwargs)

        tables = doc.xpath(xpath_expr, namespaces=_re_namespace)

        tables = self._handle_hidden_tables(tables, "attrib")
        if self.displayed_only:
            for table in tables:
                # lxml utilizes XPATH 1.0 which does not have regex
                # support. As a result, we find all elements with a style
                # attribute and iterate them to check for display:none
                for elem in table.xpath(".//*[@style]"):
                    if "display:none" in elem.attrib.get("style", "").replace(" ", ""):
                        elem.getparent().remove(elem)

        if not tables:
            raise ValueError(f"No tables found matching regex {repr(pattern)}")
        return tables

    def _equals_tag(self, obj, tag):
        return obj.tag == tag

    def _build_doc(self):
        """
        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.etree import XMLSyntaxError
        from lxml.html import (
            HTMLParser,
            fromstring,
            parse,
        )

        parser = HTMLParser(recover=True, encoding=self.encoding)

        try:
            if is_url(self.io):
                with urlopen(self.io) as f:
                    r = parse(f, parser=parser)
            else:
                # try to parse the input in the simplest way
                r = parse(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        except (UnicodeDecodeError, OSError) as e:
            # if the input is a blob of html goop
            if not is_url(self.io):
                r = fromstring(self.io, parser=parser)

                try:
                    r = r.getroot()
                except AttributeError:
                    pass
            else:
                raise e
        else:
            if not hasattr(r, "text_content"):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
        return r

    def _parse_thead_tr(self, table):
        rows = []

        for thead in table.xpath(".//thead"):
            rows.extend(thead.xpath("./tr"))

            # HACK: lxml does not clean up the clearly-erroneous
            # <thead><th>foo</th><th>bar</th></thead>. (Missing <tr>). Add
            # the <thead> and _pretend_ it's a <tr>; _parse_td() will find its
            # children as though it's a <tr>.
            #
            # Better solution would be to use html5lib.
            elements_at_root = thead.xpath("./td|./th")
            if elements_at_root:
                rows.append(thead)

        return rows

    def _parse_tbody_tr(self, table):
        from_tbody = table.xpath(".//tbody//tr")
        from_root = table.xpath("./tr")
        # HTML spec: at most one of these lists has content
        return from_tbody + from_root

    def _parse_tfoot_tr(self, table):
        return table.xpath(".//tfoot//tr")
def _expand_elements(body):
    # Pad short ("ragged") rows with empty strings so that every row in
    # `body` has the same number of cells as the longest row. Mutates in place.
    data = [len(elem) for elem in body]
    lens = create_series_with_explicit_dtype(data, dtype_if_empty=object)
    lens_max = lens.max()
    not_max = lens[lens != lens_max]

    empty = [""]
    for ind, length in not_max.items():
        body[ind] += empty * (lens_max - length)


def _data_to_frame(**kwargs):
    # Convert one parsed (header, body, footer) tuple into a DataFrame via
    # TextParser. `kwargs` carries the user's read_html() options through.
    head, body, foot = kwargs.pop("data")
    header = kwargs.pop("header")
    kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"])
    if head:
        body = head + body

        # Infer header when there is a <thead> or top <th>-only rows
        if header is None:
            if len(head) == 1:
                header = 0
            else:
                # ignore all-empty-text rows
                header = [i for i, row in enumerate(head) if any(text for text in row)]

    if foot:
        body += foot

    # fill out elements of body that are "ragged"
    _expand_elements(body)
    with TextParser(body, header=header, **kwargs) as tp:
        return tp.read()


# Registry mapping user-facing flavor names to parser classes. `None` maps to
# the default (lxml); "bs4" and "html5lib" are synonyms for the same backend.
_valid_parsers = {
    "lxml": _LxmlFrameParser,
    None: _LxmlFrameParser,
    "html5lib": _BeautifulSoupHtml5LibFrameParser,
    "bs4": _BeautifulSoupHtml5LibFrameParser,
}
def _parser_dispatch(flavor: str | None) -> type[_HtmlFrameParser]:
    """
    Choose the parser based on the input flavor.

    Parameters
    ----------
    flavor : str
        The type of parser to use. This must be a valid backend.

    Returns
    -------
    cls : _HtmlFrameParser subclass
        The parser class based on the requested input flavor.

    Raises
    ------
    ValueError
        * If `flavor` is not a valid backend.
    ImportError
        * If you do not have the requested `flavor`
    """
    valid_parsers = list(_valid_parsers.keys())
    if flavor not in valid_parsers:
        raise ValueError(
            f"{repr(flavor)} is not a valid flavor, valid flavors are {valid_parsers}"
        )

    if flavor in ("bs4", "html5lib"):
        if not _HAS_HTML5LIB:
            raise ImportError("html5lib not found, please install it")
        if not _HAS_BS4:
            raise ImportError("BeautifulSoup4 (bs4) not found, please install it")
        # Although we call this above, we want to raise here right before use.
        bs4 = import_optional_dependency("bs4")  # noqa:F841

    else:
        if not _HAS_LXML:
            raise ImportError("lxml not found, please install it")
    return _valid_parsers[flavor]


def _print_as_set(s) -> str:
    # Render an iterable as "{a, b, c}" for error messages.
    arg = ", ".join([pprint_thing(el) for el in s])
    return f"{{{arg}}}"


def _validate_flavor(flavor):
    # Normalize `flavor` (None, str, or iterable of str) to a tuple of names
    # and validate that at least one is a recognized backend.
    if flavor is None:
        flavor = "lxml", "bs4"
    elif isinstance(flavor, str):
        flavor = (flavor,)
    elif isinstance(flavor, abc.Iterable):
        if not all(isinstance(flav, str) for flav in flavor):
            raise TypeError(
                f"Object of type {repr(type(flavor).__name__)} "
                f"is not an iterable of strings"
            )
    else:
        # NOTE(review): the str case was handled above, so the conditional
        # here always takes the str(flavor) arm — looks like dead code; verify.
        msg = repr(flavor) if isinstance(flavor, str) else str(flavor)
        msg += " is not a valid flavor"
        raise ValueError(msg)

    flavor = tuple(flavor)
    valid_flavors = set(_valid_parsers)
    flavor_set = set(flavor)

    if not flavor_set & valid_flavors:
        raise ValueError(
            f"{_print_as_set(flavor_set)} is not a valid set of flavors, valid "
            f"flavors are {_print_as_set(valid_flavors)}"
        )
    return flavor


def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
    # Try each requested flavor in order; first one that parses wins.
    flavor = _validate_flavor(flavor)
    compiled_match = re.compile(match)  # you can pass a compiled regex here

    retained = None
    for flav in flavor:
        parser = _parser_dispatch(flav)
        p = parser(io, compiled_match, attrs, encoding, displayed_only)

        try:
            tables = p.parse_tables()
        except ValueError as caught:
            # if `io` is an io-like object, check if it's seekable
            # and try to rewind it before trying the next parser
            if hasattr(io, "seekable") and io.seekable():
                io.seek(0)
            elif hasattr(io, "seekable") and not io.seekable():
                # if we couldn't rewind it, let the user know
                raise ValueError(
                    f"The flavor {flav} failed to parse your input. "
                    "Since you passed a non-rewindable file "
                    "object, we can't rewind it to try "
                    "another parser. Try read_html() with a different flavor."
                ) from caught

            retained = caught
        else:
            break
    else:
        # Every flavor failed: re-raise the last error.
        assert retained is not None  # for mypy
        raise retained

    ret = []
    for table in tables:
        try:
            ret.append(_data_to_frame(data=table, **kwargs))
        except EmptyDataError:  # empty table
            continue
    return ret
@deprecate_nonkeyword_arguments(version="2.0")
def read_html(
    io: FilePath | ReadBuffer[str],
    match: str | Pattern = ".+",
    flavor: str | None = None,
    header: int | Sequence[int] | None = None,
    index_col: int | Sequence[int] | None = None,
    skiprows: int | Sequence[int] | slice | None = None,
    attrs: dict[str, str] | None = None,
    parse_dates: bool = False,
    thousands: str | None = ",",
    encoding: str | None = None,
    decimal: str = ".",
    converters: dict | None = None,
    na_values=None,
    keep_default_na: bool = True,
    displayed_only: bool = True,
) -> list[DataFrame]:
    r"""
    Read HTML tables into a ``list`` of ``DataFrame`` objects.

    Parameters
    ----------
    io : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a string ``read()`` function.
        The string can represent a URL or the HTML itself. Note that
        lxml only accepts the http, ftp and file url protocols. If you have a
        URL that starts with ``'https'`` you might try removing the ``'s'``.

    match : str or compiled regular expression, optional
        The set of tables containing text matching this regex or string will be
        returned. Unless the HTML is extremely simple you will probably need to
        pass a non-empty string here. Defaults to '.+' (match any non-empty
        string). The default value will return all tables contained on a page.
        This value is converted to a regular expression so that there is
        consistent behavior between Beautiful Soup and lxml.

    flavor : str, optional
        The parsing engine to use. 'bs4' and 'html5lib' are synonymous with
        each other, they are both there for backwards compatibility. The
        default of ``None`` tries to use ``lxml`` to parse and if that fails it
        falls back on ``bs4`` + ``html5lib``.

    header : int or list-like, optional
        The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
        make the columns headers.

    index_col : int or list-like, optional
        The column (or list of columns) to use to create the index.

    skiprows : int, list-like or slice, optional
        Number of rows to skip after parsing the column integer. 0-based. If a
        sequence of integers or a slice is given, will skip the rows indexed by
        that sequence. Note that a single element sequence means 'skip the nth
        row' whereas an integer means 'skip n rows'.

    attrs : dict, optional
        This is a dictionary of attributes that you can pass to use to identify
        the table in the HTML. These are not checked for validity before being
        passed to lxml or Beautiful Soup. However, these attributes must be
        valid HTML table attributes to work correctly. For example, ::

            attrs = {'id': 'table'}

        is a valid attribute dictionary because the 'id' HTML tag attribute is
        a valid HTML attribute for *any* HTML tag as per `this document
        <https://html.spec.whatwg.org/multipage/dom.html#global-attributes>`__. ::

            attrs = {'asdf': 'table'}

        is *not* a valid attribute dictionary because 'asdf' is not a valid
        HTML attribute even if it is a valid XML attribute.  Valid HTML 4.01
        table attributes can be found `here
        <http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2>`__. A
        working draft of the HTML 5 spec can be found `here
        <https://html.spec.whatwg.org/multipage/tables.html>`__. It contains the
        latest information on table attributes for the modern web.

    parse_dates : bool, optional
        See :func:`~read_csv` for more details.

    thousands : str, optional
        Separator to use to parse thousands. Defaults to ``','``.

    encoding : str, optional
        The encoding used to decode the web page. Defaults to ``None``.``None``
        preserves the previous encoding behavior, which depends on the
        underlying parser library (e.g., the parser library will try to use
        the encoding provided by the document).

    decimal : str, default '.'
        Character to recognize as decimal point (e.g. use ',' for European
        data).

    converters : dict, default None
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the cell (not column) content, and return the
        transformed content.

    na_values : iterable, default None
        Custom NA values.

    keep_default_na : bool, default True
        If na_values are specified and keep_default_na is False the default NaN
        values are overridden, otherwise they're appended to.

    displayed_only : bool, default True
        Whether elements with "display: none" should be parsed.

    Returns
    -------
    dfs
        A list of DataFrames.

    See Also
    --------
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Notes
    -----
    Before using this function you should read the :ref:`gotchas about the
    HTML parsing libraries <io.html.gotchas>`.

    Expect to do some cleanup after you call this function. For example, you
    might need to manually assign column names if the column names are
    converted to NaN when you pass the `header=0` argument. We try to assume as
    little as possible about the structure of the table and push the
    idiosyncrasies of the HTML contained in the table to the user.

    This function searches for ``<table>`` elements and only for ``<tr>``
    and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>``
    element in the table. ``<td>`` stands for "table data". This function
    attempts to properly handle ``colspan`` and ``rowspan`` attributes.
    If the function has a ``<thead>`` argument, it is used to construct
    the header, otherwise the function attempts to find the header within
    the body (by putting rows with only ``<th>`` elements into the header).

    Similar to :func:`~read_csv` the `header` argument is applied
    **after** `skiprows` is applied.

    This function will *always* return a list of :class:`DataFrame` *or*
    it will fail, e.g., it will *not* return an empty list.

    Examples
    --------
    See the :ref:`read_html documentation in the IO section of the docs
    <io.read_html>` for some examples of reading in HTML tables.
    """
    _importers()

    # Type check here. We don't want to parse only to fail because of an
    # invalid value of an integer skiprows.
    if isinstance(skiprows, numbers.Integral) and skiprows < 0:
        raise ValueError(
            "cannot skip rows starting from the end of the "
            "data (you passed a negative value)"
        )
    validate_header_arg(header)

    io = stringify_path(io)

    return _parse(
        flavor=flavor,
        io=io,
        match=match,
        header=header,
        index_col=index_col,
        skiprows=skiprows,
        parse_dates=parse_dates,
        thousands=thousands,
        attrs=attrs,
        encoding=encoding,
        decimal=decimal,
        converters=converters,
        na_values=na_values,
        keep_default_na=keep_default_na,
        displayed_only=displayed_only,
    )