# Source: wiseplat/python-code, blob/master/invest-robot-contest_TinkoffBotTwitch-main/venv/lib/python3.8/site-packages/pandas/io/html.py
"""
:mod:`pandas.io.html` is a module containing functionality for dealing with
HTML IO.

"""

from __future__ import annotations

from collections import abc
import numbers
import re
from typing import (
    Pattern,
    Sequence,
    cast,
)

from pandas._typing import (
    FilePath,
    ReadBuffer,
)
from pandas.compat._optional import import_optional_dependency
from pandas.errors import (
    AbstractMethodError,
    EmptyDataError,
)
from pandas.util._decorators import deprecate_nonkeyword_arguments

from pandas.core.dtypes.common import is_list_like

from pandas.core.construction import create_series_with_explicit_dtype
from pandas.core.frame import DataFrame

from pandas.io.common import (
    file_exists,
    get_handle,
    is_url,
    stringify_path,
    urlopen,
    validate_header_arg,
)
from pandas.io.formats.printing import pprint_thing
from pandas.io.parsers import TextParser

_IMPORTS = False
_HAS_BS4 = False
_HAS_LXML = False
_HAS_HTML5LIB = False


def _importers() -> None:
    # import things we need
    # but make this done on a first use basis

    global _IMPORTS
    if _IMPORTS:
        return

    global _HAS_BS4, _HAS_LXML, _HAS_HTML5LIB
    bs4 = import_optional_dependency("bs4", errors="ignore")
    _HAS_BS4 = bs4 is not None

    lxml = import_optional_dependency("lxml.etree", errors="ignore")
    _HAS_LXML = lxml is not None

    html5lib = import_optional_dependency("html5lib", errors="ignore")
    _HAS_HTML5LIB = html5lib is not None

    _IMPORTS = True


#############
# READ HTML #
#############
_RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}")


def _remove_whitespace(s: str, regex: Pattern = _RE_WHITESPACE) -> str:
    """
    Replace extra whitespace inside of a string with a single space.

    Parameters
    ----------
    s : str or unicode
        The string from which to remove extra whitespace.
    regex : re.Pattern
        The regular expression to use to remove extra whitespace.

    Returns
    -------
    subd : str or unicode
        `s` with all extra whitespace replaced with a single space.
    """
    return regex.sub(" ", s.strip())
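# Illustrative sketch (not part of the original module): how the helper above
# behaves with the default pattern, kept as comments so importing this file
# stays side-effect free.
#
#   >>> _remove_whitespace(" a   b\r\n\nc ")
#   'a b c'
#
# Leading/trailing whitespace is stripped first; then runs of newlines or of
# two or more whitespace characters each collapse to a single space.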


def _get_skiprows(skiprows: int | Sequence[int] | slice | None) -> int | Sequence[int]:
    """
    Get an iterator given an integer, slice or container.

    Parameters
    ----------
    skiprows : int, slice, container
        The iterator to use to skip rows; can also be a slice.

    Raises
    ------
    TypeError
        * If `skiprows` is not a slice, integer, or Container

    Returns
    -------
    it : iterable
        A proper iterator to use to skip rows of a DataFrame.
    """
    if isinstance(skiprows, slice):
        start, step = skiprows.start or 0, skiprows.step or 1
        return list(range(start, skiprows.stop, step))
    elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows):
        return cast("int | Sequence[int]", skiprows)
    elif skiprows is None:
        return 0
    raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows")
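# Illustrative sketch (not part of the original module): what the normalizer
# above returns for each accepted input shape.
#
#   >>> _get_skiprows(slice(0, 5, 2))   # slice -> explicit list of indices
#   [0, 2, 4]
#   >>> _get_skiprows(3)                # integers pass through unchanged
#   3
#   >>> _get_skiprows(None)             # None means "skip nothing"
#   0
#
# Note a slice needs a stop value; slice(None) would make range() fail here.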


def _read(
    obj: bytes | FilePath | ReadBuffer[str] | ReadBuffer[bytes], encoding: str | None
) -> str | bytes:
    """
    Try to read from a url, file or string.

    Parameters
    ----------
    obj : str, unicode, path object, or file-like object

    Returns
    -------
    raw_text : str
    """
    text: str | bytes
    if (
        is_url(obj)
        or hasattr(obj, "read")
        or (isinstance(obj, str) and file_exists(obj))
    ):
        # error: Argument 1 to "get_handle" has incompatible type "Union[str, bytes,
        # Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap]]";
        # expected "Union[PathLike[str], Union[str, Union[IO[Any], RawIOBase,
        # BufferedIOBase, TextIOBase, TextIOWrapper, mmap]]]"
        with get_handle(
            obj, "r", encoding=encoding  # type: ignore[arg-type]
        ) as handles:
            text = handles.handle.read()
    elif isinstance(obj, (str, bytes)):
        text = obj
    else:
        raise TypeError(f"Cannot read object of type '{type(obj).__name__}'")
    return text
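# Illustrative sketch (not part of the original module): `_read` accepts a URL,
# an existing file path, a file-like object, or raw HTML, and returns the text.
#
#   >>> _read("<table><tr><td>1</td></tr></table>", encoding=None)
#   '<table><tr><td>1</td></tr></table>'
#
# A literal string that is neither a URL nor an existing path falls through to
# the `isinstance(obj, (str, bytes))` branch and is returned unchanged.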


class _HtmlFrameParser:
    """
    Base class for parsers that parse HTML into DataFrames.

    Parameters
    ----------
    io : str or file-like
        This can be either a string of raw HTML, a valid URL using the HTTP,
        FTP, or FILE protocols or a file-like object.

    match : str or regex
        The text to match in the document.

    attrs : dict
        List of HTML <table> element attributes to match.

    encoding : str
        Encoding to be used by parser

    displayed_only : bool
        Whether or not items with "display:none" should be ignored

    Attributes
    ----------
    io : str or file-like
        raw HTML, URL, or file-like object

    match : regex
        The text to match in the raw HTML

    attrs : dict-like
        A dictionary of valid table attributes to use to search for table
        elements.

    encoding : str
        Encoding to be used by parser

    displayed_only : bool
        Whether or not items with "display:none" should be ignored

    Notes
    -----
    To subclass this class effectively you must override the following methods:
        * :func:`_build_doc`
        * :func:`_attr_getter`
        * :func:`_text_getter`
        * :func:`_parse_td`
        * :func:`_parse_thead_tr`
        * :func:`_parse_tbody_tr`
        * :func:`_parse_tfoot_tr`
        * :func:`_parse_tables`
        * :func:`_equals_tag`
    See each method's respective documentation for details on their
    functionality.
    """

    def __init__(
        self,
        io: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
        match: str | Pattern,
        attrs: dict[str, str] | None,
        encoding: str,
        displayed_only: bool,
    ):
        self.io = io
        self.match = match
        self.attrs = attrs
        self.encoding = encoding
        self.displayed_only = displayed_only

    def parse_tables(self):
        """
        Parse and return all tables from the DOM.

        Returns
        -------
        list of parsed (header, body, footer) tuples from tables.
        """
        tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
        return (self._parse_thead_tbody_tfoot(table) for table in tables)

    def _attr_getter(self, obj, attr):
        """
        Return the attribute value of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        attr : str or unicode
            The attribute, such as "colspan"

        Returns
        -------
        str or unicode
            The attribute value.
        """
        # Both lxml and BeautifulSoup have the same implementation:
        return obj.get(attr)

    def _text_getter(self, obj):
        """
        Return the text of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        Returns
        -------
        text : str or unicode
            The text from an individual DOM node.
        """
        raise AbstractMethodError(self)

    def _parse_td(self, obj):
        """
        Return the td elements from a row element.

        Parameters
        ----------
        obj : node-like
            A DOM <tr> node.

        Returns
        -------
        list of node-like
            These are the elements of each row, i.e., the columns.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tr(self, table):
        """
        Return the list of thead row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains zero or more thead elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tbody_tr(self, table):
        """
        Return the list of tbody row elements from the parsed table element.

        HTML5 table bodies consist of either 0 or more <tbody> elements (which
        only contain <tr> elements) or 0 or more <tr> elements. This method
        checks for both structures.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tfoot_tr(self, table):
        """
        Return the list of tfoot row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tables(self, doc, match, attrs):
        """
        Return all tables from the parsed DOM.

        Parameters
        ----------
        doc : the DOM from which to parse the table element.

        match : str or regular expression
            The text to search for in the DOM tree.

        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

        Raises
        ------
        ValueError : `match` does not match any text in the document.

        Returns
        -------
        list of node-like
            HTML <table> elements to be parsed into raw data.
        """
        raise AbstractMethodError(self)

    def _equals_tag(self, obj, tag):
        """
        Return whether an individual DOM node matches a tag

        Parameters
        ----------
        obj : node-like
            A DOM node.

        tag : str
            Tag name to be checked for equality.

        Returns
        -------
        boolean
            Whether `obj`'s tag name is `tag`
        """
        raise AbstractMethodError(self)

    def _build_doc(self):
        """
        Return a tree-like object that can be used to iterate over the DOM.

        Returns
        -------
        node-like
            The DOM from which to parse the table element.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tbody_tfoot(self, table_html):
        """
        Given a table, return parsed header, body, and foot.

        Parameters
        ----------
        table_html : node-like

        Returns
        -------
        tuple of (header, body, footer), each a list of list-of-text rows.

        Notes
        -----
        Header and body are lists-of-lists. Top level list is a list of
        rows. Each row is a list of str text.

        Logic: Use <thead>, <tbody>, <tfoot> elements to identify
               header, body, and footer, otherwise:
          - Put all rows into body
          - Move rows from top of body to header only if
            all elements inside row are <th>
          - Move rows from bottom of body to footer only if
            all elements inside row are <th>
        """
        header_rows = self._parse_thead_tr(table_html)
        body_rows = self._parse_tbody_tr(table_html)
        footer_rows = self._parse_tfoot_tr(table_html)

        def row_is_all_th(row):
            return all(self._equals_tag(t, "th") for t in self._parse_td(row))

        if not header_rows:
            # The table has no <thead>. Move the top all-<th> rows from
            # body_rows to header_rows. (This is a common case because many
            # tables in the wild have no <thead> or <tfoot>.)
            while body_rows and row_is_all_th(body_rows[0]):
                header_rows.append(body_rows.pop(0))

        header = self._expand_colspan_rowspan(header_rows)
        body = self._expand_colspan_rowspan(body_rows)
        footer = self._expand_colspan_rowspan(footer_rows)

        return header, body, footer
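
    # Illustrative sketch (not part of the original module): given a table
    # with no <thead>, e.g.
    #
    #   <table>
    #     <tr><th>a</th><th>b</th></tr>
    #     <tr><td>1</td><td>2</td></tr>
    #   </table>
    #
    # the method above returns (assuming a concrete subclass):
    #   header == [["a", "b"]]   # the leading all-<th> row is promoted
    #   body   == [["1", "2"]]
    #   footer == []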

    def _expand_colspan_rowspan(self, rows):
        """
        Given a list of <tr>s, return a list of text rows.

        Parameters
        ----------
        rows : list of node-like
            List of <tr>s

        Returns
        -------
        list of list
            Each returned row is a list of str text.

        Notes
        -----
        Any cell with ``rowspan`` or ``colspan`` will have its contents copied
        to subsequent cells.
        """
        all_texts = []  # list of rows, each a list of str
        remainder: list[tuple[int, str, int]] = []  # list of (index, text, nrows)

        for tr in rows:
            texts = []  # the output for this row
            next_remainder = []

            index = 0
            tds = self._parse_td(tr)
            for td in tds:
                # Append texts from previous rows with rowspan>1 that come
                # before this <td>
                while remainder and remainder[0][0] <= index:
                    prev_i, prev_text, prev_rowspan = remainder.pop(0)
                    texts.append(prev_text)
                    if prev_rowspan > 1:
                        next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
                    index += 1

                # Append the text from this <td>, colspan times
                text = _remove_whitespace(self._text_getter(td))
                rowspan = int(self._attr_getter(td, "rowspan") or 1)
                colspan = int(self._attr_getter(td, "colspan") or 1)

                for _ in range(colspan):
                    texts.append(text)
                    if rowspan > 1:
                        next_remainder.append((index, text, rowspan - 1))
                    index += 1

            # Append texts from previous rows at the final position
            for prev_i, prev_text, prev_rowspan in remainder:
                texts.append(prev_text)
                if prev_rowspan > 1:
                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))

            all_texts.append(texts)
            remainder = next_remainder

        # Append rows that only appear because the previous row had non-1
        # rowspan
        while remainder:
            next_remainder = []
            texts = []
            for prev_i, prev_text, prev_rowspan in remainder:
                texts.append(prev_text)
                if prev_rowspan > 1:
                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
            all_texts.append(texts)
            remainder = next_remainder

        return all_texts
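
    # Illustrative sketch (not part of the original module): how spans expand.
    # For rows parsed from
    #
    #   <tr><td rowspan="2">a</td><td colspan="2">b</td></tr>
    #   <tr><td>c</td><td>d</td></tr>
    #
    # the method above yields
    #
    #   [["a", "b", "b"],
    #    ["a", "c", "d"]]
    #
    # i.e. "b" is copied colspan times within its row, and "a" is carried over
    # into the next row via the `remainder` bookkeeping.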

    def _handle_hidden_tables(self, tbl_list, attr_name):
        """
        Return list of tables, potentially removing hidden elements

        Parameters
        ----------
        tbl_list : list of node-like
            Type of list elements will vary depending upon parser used
        attr_name : str
            Name of the accessor for retrieving HTML attributes

        Returns
        -------
        list of node-like
            Return type matches `tbl_list`
        """
        if not self.displayed_only:
            return tbl_list

        return [
            x
            for x in tbl_list
            if "display:none"
            not in getattr(x, attr_name).get("style", "").replace(" ", "")
        ]


class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
    """
    HTML to DataFrame parser that uses BeautifulSoup under the hood.

    See Also
    --------
    pandas.io.html._HtmlFrameParser
    pandas.io.html._LxmlFrameParser

    Notes
    -----
    Documentation strings for this class are in the base class
    :class:`pandas.io.html._HtmlFrameParser`.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        from bs4 import SoupStrainer

        self._strainer = SoupStrainer("table")

    def _parse_tables(self, doc, match, attrs):
        element_name = self._strainer.name
        tables = doc.find_all(element_name, attrs=attrs)

        if not tables:
            raise ValueError("No tables found")

        result = []
        unique_tables = set()
        tables = self._handle_hidden_tables(tables, "attrs")

        for table in tables:
            if self.displayed_only:
                for elem in table.find_all(style=re.compile(r"display:\s*none")):
                    elem.decompose()

            if table not in unique_tables and table.find(text=match) is not None:
                result.append(table)
            unique_tables.add(table)

        if not result:
            raise ValueError(f"No tables found matching pattern {repr(match.pattern)}")
        return result

    def _text_getter(self, obj):
        return obj.text

    def _equals_tag(self, obj, tag):
        return obj.name == tag

    def _parse_td(self, row):
        return row.find_all(("td", "th"), recursive=False)

    def _parse_thead_tr(self, table):
        return table.select("thead tr")

    def _parse_tbody_tr(self, table):
        from_tbody = table.select("tbody tr")
        from_root = table.find_all("tr", recursive=False)
        # HTML spec: at most one of these lists has content
        return from_tbody + from_root

    def _parse_tfoot_tr(self, table):
        return table.select("tfoot tr")

    def _setup_build_doc(self):
        raw_text = _read(self.io, self.encoding)
        if not raw_text:
            raise ValueError(f"No text parsed from document: {self.io}")
        return raw_text

    def _build_doc(self):
        from bs4 import BeautifulSoup

        bdoc = self._setup_build_doc()
        if isinstance(bdoc, bytes) and self.encoding is not None:
            udoc = bdoc.decode(self.encoding)
            from_encoding = None
        else:
            udoc = bdoc
            from_encoding = self.encoding
        return BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)


def _build_xpath_expr(attrs) -> str:
    """
    Build an xpath expression to simulate bs4's ability to pass in kwargs to
    search for attributes when using the lxml parser.

    Parameters
    ----------
    attrs : dict
        A dict of HTML attributes. These are NOT checked for validity.

    Returns
    -------
    expr : unicode
        An XPath expression that checks for the given HTML attributes.
    """
    # give class attribute as class_ because class is a python keyword
    if "class_" in attrs:
        attrs["class"] = attrs.pop("class_")

    s = " and ".join([f"@{k}={repr(v)}" for k, v in attrs.items()])
    return f"[{s}]"


_re_namespace = {"re": "http://exslt.org/regular-expressions"}


class _LxmlFrameParser(_HtmlFrameParser):
    """
    HTML to DataFrame parser that uses lxml under the hood.

    Warning
    -------
    This parser can only handle HTTP, FTP, and FILE urls.

    See Also
    --------
    _HtmlFrameParser
    _BeautifulSoupHtml5LibFrameParser

    Notes
    -----
    Documentation strings for this class are in the base class
    :class:`_HtmlFrameParser`.
    """

    def _text_getter(self, obj):
        return obj.text_content()

    def _parse_td(self, row):
        # Look for direct children only: the "row" element here may be a
        # <thead> or <tfoot> (see _parse_thead_tr).
        return row.xpath("./td|./th")

    def _parse_tables(self, doc, match, kwargs):
        pattern = match.pattern

        # 1. check all descendants for the given pattern and only search tables
        # 2. go up the tree until we find a table
        xpath_expr = f"//table//*[re:test(text(), {repr(pattern)})]/ancestor::table"

        # if any table attributes were given build an xpath expression to
        # search for them
        if kwargs:
            xpath_expr += _build_xpath_expr(kwargs)

        tables = doc.xpath(xpath_expr, namespaces=_re_namespace)

        tables = self._handle_hidden_tables(tables, "attrib")
        if self.displayed_only:
            for table in tables:
                # lxml utilizes XPATH 1.0 which does not have regex
                # support. As a result, we find all elements with a style
                # attribute and iterate them to check for display:none
                for elem in table.xpath(".//*[@style]"):
                    if "display:none" in elem.attrib.get("style", "").replace(" ", ""):
                        elem.getparent().remove(elem)

        if not tables:
            raise ValueError(f"No tables found matching regex {repr(pattern)}")
        return tables

    def _equals_tag(self, obj, tag):
        return obj.tag == tag

    def _build_doc(self):
        """
        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.etree import XMLSyntaxError
        from lxml.html import (
            HTMLParser,
            fromstring,
            parse,
        )

        parser = HTMLParser(recover=True, encoding=self.encoding)

        try:
            if is_url(self.io):
                with urlopen(self.io) as f:
                    r = parse(f, parser=parser)
            else:
                # try to parse the input in the simplest way
                r = parse(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        except (UnicodeDecodeError, OSError) as e:
            # if the input is a blob of html goop
            if not is_url(self.io):
                r = fromstring(self.io, parser=parser)

                try:
                    r = r.getroot()
                except AttributeError:
                    pass
            else:
                raise e
        else:
            if not hasattr(r, "text_content"):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
        return r

    def _parse_thead_tr(self, table):
        rows = []

        for thead in table.xpath(".//thead"):
            rows.extend(thead.xpath("./tr"))

            # HACK: lxml does not clean up the clearly-erroneous
            # <thead><th>foo</th><th>bar</th></thead>. (Missing <tr>). Add
            # the <thead> and _pretend_ it's a <tr>; _parse_td() will find its
            # children as though it's a <tr>.
            #
            # Better solution would be to use html5lib.
            elements_at_root = thead.xpath("./td|./th")
            if elements_at_root:
                rows.append(thead)

        return rows

    def _parse_tbody_tr(self, table):
        from_tbody = table.xpath(".//tbody//tr")
        from_root = table.xpath("./tr")
        # HTML spec: at most one of these lists has content
        return from_tbody + from_root

    def _parse_tfoot_tr(self, table):
        return table.xpath(".//tfoot//tr")


def _expand_elements(body):
    data = [len(elem) for elem in body]
    lens = create_series_with_explicit_dtype(data, dtype_if_empty=object)
    lens_max = lens.max()
    not_max = lens[lens != lens_max]

    empty = [""]
    for ind, length in not_max.items():
        body[ind] += empty * (lens_max - length)
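# Illustrative sketch (not part of the original module): padding ragged rows
# in place so TextParser sees a rectangular block.
#
#   >>> rows = [["a", "b", "c"], ["d"], ["e", "f"]]
#   >>> _expand_elements(rows)
#   >>> rows
#   [['a', 'b', 'c'], ['d', '', ''], ['e', 'f', '']]
#
# Short rows are right-padded with empty strings up to the longest row.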


def _data_to_frame(**kwargs):
    head, body, foot = kwargs.pop("data")
    header = kwargs.pop("header")
    kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"])
    if head:
        body = head + body

        # Infer header when there is a <thead> or top <th>-only rows
        if header is None:
            if len(head) == 1:
                header = 0
            else:
                # ignore all-empty-text rows
                header = [i for i, row in enumerate(head) if any(text for text in row)]

    if foot:
        body += foot

    # fill out elements of body that are "ragged"
    _expand_elements(body)
    with TextParser(body, header=header, **kwargs) as tp:
        return tp.read()


_valid_parsers = {
    "lxml": _LxmlFrameParser,
    None: _LxmlFrameParser,
    "html5lib": _BeautifulSoupHtml5LibFrameParser,
    "bs4": _BeautifulSoupHtml5LibFrameParser,
}


def _parser_dispatch(flavor: str | None) -> type[_HtmlFrameParser]:
    """
    Choose the parser based on the input flavor.

    Parameters
    ----------
    flavor : str
        The type of parser to use. This must be a valid backend.

    Returns
    -------
    cls : _HtmlFrameParser subclass
        The parser class based on the requested input flavor.

    Raises
    ------
    ValueError
        * If `flavor` is not a valid backend.
    ImportError
        * If you do not have the requested `flavor`
    """
    valid_parsers = list(_valid_parsers.keys())
    if flavor not in valid_parsers:
        raise ValueError(
            f"{repr(flavor)} is not a valid flavor, valid flavors are {valid_parsers}"
        )

    if flavor in ("bs4", "html5lib"):
        if not _HAS_HTML5LIB:
            raise ImportError("html5lib not found, please install it")
        if not _HAS_BS4:
            raise ImportError("BeautifulSoup4 (bs4) not found, please install it")
        # Although we call this above, we want to raise here right before use.
        bs4 = import_optional_dependency("bs4")  # noqa:F841

    else:
        if not _HAS_LXML:
            raise ImportError("lxml not found, please install it")
    return _valid_parsers[flavor]
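# Illustrative sketch (not part of the original module): dispatch is a plain
# dict lookup guarded by the import checks above (this assumes `_importers()`
# has already populated the _HAS_* flags, as read_html() does).
#
#   >>> _parser_dispatch("lxml")   # doctest: +SKIP
#   <class 'pandas.io.html._LxmlFrameParser'>
#   >>> _parser_dispatch("fake")   # doctest: +SKIP
#   ValueError: 'fake' is not a valid flavor, ...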


def _print_as_set(s) -> str:
    arg = ", ".join([pprint_thing(el) for el in s])
    return f"{{{arg}}}"


def _validate_flavor(flavor):
    if flavor is None:
        flavor = "lxml", "bs4"
    elif isinstance(flavor, str):
        flavor = (flavor,)
    elif isinstance(flavor, abc.Iterable):
        if not all(isinstance(flav, str) for flav in flavor):
            raise TypeError(
                f"Object of type {repr(type(flavor).__name__)} "
                f"is not an iterable of strings"
            )
    else:
        msg = repr(flavor) if isinstance(flavor, str) else str(flavor)
        msg += " is not a valid flavor"
        raise ValueError(msg)

    flavor = tuple(flavor)
    valid_flavors = set(_valid_parsers)
    flavor_set = set(flavor)

    if not flavor_set & valid_flavors:
        raise ValueError(
            f"{_print_as_set(flavor_set)} is not a valid set of flavors, valid "
            f"flavors are {_print_as_set(valid_flavors)}"
        )
    return flavor
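# Illustrative sketch (not part of the original module): every accepted input
# shape normalizes to a tuple of known flavor names.
#
#   >>> _validate_flavor(None)
#   ('lxml', 'bs4')
#   >>> _validate_flavor("bs4")
#   ('bs4',)
#   >>> _validate_flavor(["lxml", "bs4"])
#   ('lxml', 'bs4')
#
# _parse then tries the parsers in this order, falling through on ValueError.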


def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
    flavor = _validate_flavor(flavor)
    compiled_match = re.compile(match)  # you can pass a compiled regex here

    retained = None
    for flav in flavor:
        parser = _parser_dispatch(flav)
        p = parser(io, compiled_match, attrs, encoding, displayed_only)

        try:
            tables = p.parse_tables()
        except ValueError as caught:
            # if `io` is an io-like object, check if it's seekable
            # and try to rewind it before trying the next parser
            if hasattr(io, "seekable") and io.seekable():
                io.seek(0)
            elif hasattr(io, "seekable") and not io.seekable():
                # if we couldn't rewind it, let the user know
                raise ValueError(
                    f"The flavor {flav} failed to parse your input. "
                    "Since you passed a non-rewindable file "
                    "object, we can't rewind it to try "
                    "another parser. Try read_html() with a different flavor."
                ) from caught

            retained = caught
        else:
            break
    else:
        assert retained is not None  # for mypy
        raise retained

    ret = []
    for table in tables:
        try:
            ret.append(_data_to_frame(data=table, **kwargs))
        except EmptyDataError:  # empty table
            continue
    return ret


@deprecate_nonkeyword_arguments(version="2.0")
def read_html(
    io: FilePath | ReadBuffer[str],
    match: str | Pattern = ".+",
    flavor: str | None = None,
    header: int | Sequence[int] | None = None,
    index_col: int | Sequence[int] | None = None,
    skiprows: int | Sequence[int] | slice | None = None,
    attrs: dict[str, str] | None = None,
    parse_dates: bool = False,
    thousands: str | None = ",",
    encoding: str | None = None,
    decimal: str = ".",
    converters: dict | None = None,
    na_values=None,
    keep_default_na: bool = True,
    displayed_only: bool = True,
) -> list[DataFrame]:
    r"""
    Read HTML tables into a ``list`` of ``DataFrame`` objects.

    Parameters
    ----------
    io : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a string ``read()`` function.
        The string can represent a URL or the HTML itself. Note that
        lxml only accepts the http, ftp and file url protocols. If you have a
        URL that starts with ``'https'`` you might try removing the ``'s'``.

    match : str or compiled regular expression, optional
        The set of tables containing text matching this regex or string will be
        returned. Unless the HTML is extremely simple you will probably need to
        pass a non-empty string here. Defaults to '.+' (match any non-empty
        string). The default value will return all tables contained on a page.
        This value is converted to a regular expression so that there is
        consistent behavior between Beautiful Soup and lxml.

    flavor : str, optional
        The parsing engine to use. 'bs4' and 'html5lib' are synonymous with
        each other, they are both there for backwards compatibility. The
        default of ``None`` tries to use ``lxml`` to parse and if that fails it
        falls back on ``bs4`` + ``html5lib``.

    header : int or list-like, optional
        The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
        make the columns headers.

    index_col : int or list-like, optional
        The column (or list of columns) to use to create the index.

    skiprows : int, list-like or slice, optional
        Number of rows to skip after parsing the column names. 0-based. If a
        sequence of integers or a slice is given, will skip the rows indexed by
        that sequence. Note that a single element sequence means 'skip the nth
        row' whereas an integer means 'skip n rows'.

    attrs : dict, optional
        This is a dictionary of attributes that you can pass to use to identify
        the table in the HTML. These are not checked for validity before being
        passed to lxml or Beautiful Soup. However, these attributes must be
        valid HTML table attributes to work correctly. For example, ::

            attrs = {'id': 'table'}

        is a valid attribute dictionary because the 'id' HTML tag attribute is
        a valid HTML attribute for *any* HTML tag as per `this document
        <https://html.spec.whatwg.org/multipage/dom.html#global-attributes>`__. ::

            attrs = {'asdf': 'table'}

        is *not* a valid attribute dictionary because 'asdf' is not a valid
        HTML attribute even if it is a valid XML attribute. Valid HTML 4.01
        table attributes can be found `here
        <http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2>`__. A
        working draft of the HTML 5 spec can be found `here
        <https://html.spec.whatwg.org/multipage/tables.html>`__. It contains the
        latest information on table attributes for the modern web.

    parse_dates : bool, optional
        See :func:`~read_csv` for more details.

    thousands : str, optional
        Separator to use to parse thousands. Defaults to ``','``.

    encoding : str, optional
        The encoding used to decode the web page. Defaults to ``None``.
        ``None`` preserves the previous encoding behavior, which depends on the
        underlying parser library (e.g., the parser library will try to use
        the encoding provided by the document).

    decimal : str, default '.'
        Character to recognize as decimal point (e.g. use ',' for European
        data).

    converters : dict, default None
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the cell (not column) content, and return the
        transformed content.

    na_values : iterable, default None
        Custom NA values.

    keep_default_na : bool, default True
        If na_values are specified and keep_default_na is False the default NaN
        values are overridden, otherwise they're appended to.

    displayed_only : bool, default True
        Whether elements with "display: none" should be parsed.

    Returns
    -------
    dfs
        A list of DataFrames.

    See Also
    --------
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Notes
    -----
    Before using this function you should read the :ref:`gotchas about the
    HTML parsing libraries <io.html.gotchas>`.

    Expect to do some cleanup after you call this function. For example, you
    might need to manually assign column names if the column names are
    converted to NaN when you pass the `header=0` argument. We try to assume as
    little as possible about the structure of the table and push the
    idiosyncrasies of the HTML contained in the table to the user.

    This function searches for ``<table>`` elements and only for ``<tr>``
    and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>``
    element in the table. ``<td>`` stands for "table data". This function
    attempts to properly handle ``colspan`` and ``rowspan`` attributes.
    If the table has a ``<thead>``, it is used to construct
    the header, otherwise the function attempts to find the header within
    the body (by putting rows with only ``<th>`` elements into the header).

    Similar to :func:`~read_csv` the `header` argument is applied
    **after** `skiprows` is applied.

    This function will *always* return a list of :class:`DataFrame` *or*
    it will fail, e.g., it will *not* return an empty list.

    Examples
    --------
    See the :ref:`read_html documentation in the IO section of the docs
    <io.read_html>` for some examples of reading in HTML tables.
    """
    _importers()

    # Type check here. We don't want to parse only to fail because of an
    # invalid value of an integer skiprows.
    if isinstance(skiprows, numbers.Integral) and skiprows < 0:
        raise ValueError(
            "cannot skip rows starting from the end of the "
            "data (you passed a negative value)"
        )
    validate_header_arg(header)

    io = stringify_path(io)

    return _parse(
        flavor=flavor,
        io=io,
        match=match,
        header=header,
        index_col=index_col,
        skiprows=skiprows,
        parse_dates=parse_dates,
        thousands=thousands,
        attrs=attrs,
        encoding=encoding,
        decimal=decimal,
        converters=converters,
        na_values=na_values,
        keep_default_na=keep_default_na,
        displayed_only=displayed_only,
    )
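

if __name__ == "__main__":
    # Illustrative usage sketch, not part of the original module. Assumes at
    # least one optional backend is installed (lxml, or bs4 + html5lib).
    html = """
    <table id="demo">
      <tr><th>name</th><th>value</th></tr>
      <tr><td>a</td><td>1</td></tr>
      <tr><td>b</td><td>2</td></tr>
    </table>
    """
    # The leading all-<th> row is promoted to the header, and `attrs` narrows
    # the search to tables whose id attribute is "demo".
    (df,) = read_html(html, attrs={"id": "demo"})
    print(df)
    # Expected shape of the output:
    #   name  value
    # 0    a      1
    # 1    b      2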