# Source: wiseplat/python-code, blob/master/invest-robot-contest_TinkoffBotTwitch-main/venv/lib/python3.8/site-packages/pandas/io/html.py
"""
:mod:`pandas.io.html` is a module containing functionality for dealing with
HTML IO.

"""

from __future__ import annotations

from collections import abc
import numbers
import re
from typing import (
    Pattern,
    Sequence,
    cast,
)

from pandas._typing import (
    FilePath,
    ReadBuffer,
)
from pandas.compat._optional import import_optional_dependency
from pandas.errors import (
    AbstractMethodError,
    EmptyDataError,
)
from pandas.util._decorators import deprecate_nonkeyword_arguments

from pandas.core.dtypes.common import is_list_like

from pandas.core.construction import create_series_with_explicit_dtype
from pandas.core.frame import DataFrame

from pandas.io.common import (
    file_exists,
    get_handle,
    is_url,
    stringify_path,
    urlopen,
    validate_header_arg,
)
from pandas.io.formats.printing import pprint_thing
from pandas.io.parsers import TextParser

_IMPORTS = False
_HAS_BS4 = False
_HAS_LXML = False
_HAS_HTML5LIB = False


def _importers() -> None:
    # import things we need
    # but make this done on a first use basis

    global _IMPORTS
    if _IMPORTS:
        return

    global _HAS_BS4, _HAS_LXML, _HAS_HTML5LIB
    bs4 = import_optional_dependency("bs4", errors="ignore")
    _HAS_BS4 = bs4 is not None

    lxml = import_optional_dependency("lxml.etree", errors="ignore")
    _HAS_LXML = lxml is not None

    html5lib = import_optional_dependency("html5lib", errors="ignore")
    _HAS_HTML5LIB = html5lib is not None

    _IMPORTS = True


#############
# READ HTML #
#############
_RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}")


def _remove_whitespace(s: str, regex: Pattern = _RE_WHITESPACE) -> str:
    """
    Replace extra whitespace inside of a string with a single space.

    Parameters
    ----------
    s : str or unicode
        The string from which to remove extra whitespace.
    regex : re.Pattern
        The regular expression to use to remove extra whitespace.

    Returns
    -------
    subd : str or unicode
        `s` with all extra whitespace replaced with a single space.
    """
    return regex.sub(" ", s.strip())
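# Illustrative sketch (not part of the original module): how the helper above
# behaves with the default pattern, kept as comments so importing this file
# stays side-effect free.
#
#   >>> _remove_whitespace(" a   b\r\n\nc ")
#   'a b c'
#
# Leading/trailing whitespace is stripped first; then runs of newlines or of
# two or more whitespace characters each collapse to a single space.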


def _get_skiprows(skiprows: int | Sequence[int] | slice | None) -> int | Sequence[int]:
    """
    Get an iterator given an integer, slice or container.

    Parameters
    ----------
    skiprows : int, slice, container
        The iterator to use to skip rows; can also be a slice.

    Raises
    ------
    TypeError
        * If `skiprows` is not a slice, integer, or Container

    Returns
    -------
    it : iterable
        A proper iterator to use to skip rows of a DataFrame.
    """
    if isinstance(skiprows, slice):
        start, step = skiprows.start or 0, skiprows.step or 1
        return list(range(start, skiprows.stop, step))
    elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows):
        return cast("int | Sequence[int]", skiprows)
    elif skiprows is None:
        return 0
    raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows")
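# Illustrative sketch (not part of the original module): what the normalizer
# above returns for each accepted input shape.
#
#   >>> _get_skiprows(slice(0, 5, 2))   # slice -> explicit list of indices
#   [0, 2, 4]
#   >>> _get_skiprows(3)                # integers pass through unchanged
#   3
#   >>> _get_skiprows(None)             # None means "skip nothing"
#   0
#
# Note a slice needs a stop value; slice(None) would make range() fail here.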


def _read(
    obj: bytes | FilePath | ReadBuffer[str] | ReadBuffer[bytes], encoding: str | None
) -> str | bytes:
    """
    Try to read from a url, file or string.

    Parameters
    ----------
    obj : str, unicode, path object, or file-like object

    Returns
    -------
    raw_text : str
    """
    text: str | bytes
    if (
        is_url(obj)
        or hasattr(obj, "read")
        or (isinstance(obj, str) and file_exists(obj))
    ):
        # error: Argument 1 to "get_handle" has incompatible type "Union[str, bytes,
        # Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap]]";
        # expected "Union[PathLike[str], Union[str, Union[IO[Any], RawIOBase,
        # BufferedIOBase, TextIOBase, TextIOWrapper, mmap]]]"
        with get_handle(
            obj, "r", encoding=encoding  # type: ignore[arg-type]
        ) as handles:
            text = handles.handle.read()
    elif isinstance(obj, (str, bytes)):
        text = obj
    else:
        raise TypeError(f"Cannot read object of type '{type(obj).__name__}'")
    return text
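# Illustrative sketch (not part of the original module): `_read` accepts a URL,
# an existing file path, a file-like object, or raw HTML, and returns the text.
#
#   >>> _read("<table><tr><td>1</td></tr></table>", encoding=None)
#   '<table><tr><td>1</td></tr></table>'
#
# A literal string that is neither a URL nor an existing path falls through to
# the `isinstance(obj, (str, bytes))` branch and is returned unchanged.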


class _HtmlFrameParser:
    """
    Base class for parsers that parse HTML into DataFrames.

    Parameters
    ----------
    io : str or file-like
        This can be either a string of raw HTML, a valid URL using the HTTP,
        FTP, or FILE protocols or a file-like object.

    match : str or regex
        The text to match in the document.

    attrs : dict
        List of HTML <table> element attributes to match.

    encoding : str
        Encoding to be used by parser

    displayed_only : bool
        Whether or not items with "display:none" should be ignored

    Attributes
    ----------
    io : str or file-like
        raw HTML, URL, or file-like object

    match : regex
        The text to match in the raw HTML

    attrs : dict-like
        A dictionary of valid table attributes to use to search for table
        elements.

    encoding : str
        Encoding to be used by parser

    displayed_only : bool
        Whether or not items with "display:none" should be ignored

    Notes
    -----
    To subclass this class effectively you must override the following methods:
        * :func:`_build_doc`
        * :func:`_attr_getter`
        * :func:`_text_getter`
        * :func:`_parse_td`
        * :func:`_parse_thead_tr`
        * :func:`_parse_tbody_tr`
        * :func:`_parse_tfoot_tr`
        * :func:`_parse_tables`
        * :func:`_equals_tag`
    See each method's respective documentation for details on their
    functionality.
    """

    def __init__(
        self,
        io: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
        match: str | Pattern,
        attrs: dict[str, str] | None,
        encoding: str,
        displayed_only: bool,
    ):
        self.io = io
        self.match = match
        self.attrs = attrs
        self.encoding = encoding
        self.displayed_only = displayed_only

    def parse_tables(self):
        """
        Parse and return all tables from the DOM.

        Returns
        -------
        list of parsed (header, body, footer) tuples from tables.
        """
        tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
        return (self._parse_thead_tbody_tfoot(table) for table in tables)

    def _attr_getter(self, obj, attr):
        """
        Return the attribute value of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        attr : str or unicode
            The attribute, such as "colspan"

        Returns
        -------
        str or unicode
            The attribute value.
        """
        # Both lxml and BeautifulSoup have the same implementation:
        return obj.get(attr)

    def _text_getter(self, obj):
        """
        Return the text of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        Returns
        -------
        text : str or unicode
            The text from an individual DOM node.
        """
        raise AbstractMethodError(self)

    def _parse_td(self, obj):
        """
        Return the td elements from a row element.

        Parameters
        ----------
        obj : node-like
            A DOM <tr> node.

        Returns
        -------
        list of node-like
            These are the elements of each row, i.e., the columns.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tr(self, table):
        """
        Return the list of thead row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains zero or more thead elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tbody_tr(self, table):
        """
        Return the list of tbody row elements from the parsed table element.

        HTML5 table bodies consist of either 0 or more <tbody> elements (which
        only contain <tr> elements) or 0 or more <tr> elements. This method
        checks for both structures.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tfoot_tr(self, table):
        """
        Return the list of tfoot row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tables(self, doc, match, attrs):
        """
        Return all tables from the parsed DOM.

        Parameters
        ----------
        doc : the DOM from which to parse the table element.

        match : str or regular expression
            The text to search for in the DOM tree.

        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

        Raises
        ------
        ValueError : `match` does not match any text in the document.

        Returns
        -------
        list of node-like
            HTML <table> elements to be parsed into raw data.
        """
        raise AbstractMethodError(self)

    def _equals_tag(self, obj, tag):
        """
        Return whether an individual DOM node matches a tag

        Parameters
        ----------
        obj : node-like
            A DOM node.

        tag : str
            Tag name to be checked for equality.

        Returns
        -------
        boolean
            Whether `obj`'s tag name is `tag`
        """
        raise AbstractMethodError(self)

    def _build_doc(self):
        """
        Return a tree-like object that can be used to iterate over the DOM.

        Returns
        -------
        node-like
            The DOM from which to parse the table element.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tbody_tfoot(self, table_html):
        """
        Given a table, return parsed header, body, and foot.

        Parameters
        ----------
        table_html : node-like

        Returns
        -------
        tuple of (header, body, footer), each a list of list-of-text rows.

        Notes
        -----
        Header and body are lists-of-lists. Top level list is a list of
        rows. Each row is a list of str text.

        Logic: Use <thead>, <tbody>, <tfoot> elements to identify
               header, body, and footer, otherwise:
          - Put all rows into body
          - Move rows from top of body to header only if
            all elements inside row are <th>
          - Move rows from bottom of body to footer only if
            all elements inside row are <th>
        """
        header_rows = self._parse_thead_tr(table_html)
        body_rows = self._parse_tbody_tr(table_html)
        footer_rows = self._parse_tfoot_tr(table_html)

        def row_is_all_th(row):
            return all(self._equals_tag(t, "th") for t in self._parse_td(row))

        if not header_rows:
            # The table has no <thead>. Move the top all-<th> rows from
            # body_rows to header_rows. (This is a common case because many
            # tables in the wild have no <thead> or <tfoot>.)
            while body_rows and row_is_all_th(body_rows[0]):
                header_rows.append(body_rows.pop(0))

        header = self._expand_colspan_rowspan(header_rows)
        body = self._expand_colspan_rowspan(body_rows)
        footer = self._expand_colspan_rowspan(footer_rows)

        return header, body, footer
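
    # Illustrative sketch (not part of the original module): given a table
    # with no <thead>, e.g.
    #
    #   <table>
    #     <tr><th>a</th><th>b</th></tr>
    #     <tr><td>1</td><td>2</td></tr>
    #   </table>
    #
    # the method above returns (assuming a concrete subclass):
    #   header == [["a", "b"]]   # the leading all-<th> row is promoted
    #   body   == [["1", "2"]]
    #   footer == []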

    def _expand_colspan_rowspan(self, rows):
        """
        Given a list of <tr>s, return a list of text rows.

        Parameters
        ----------
        rows : list of node-like
            List of <tr>s

        Returns
        -------
        list of list
            Each returned row is a list of str text.

        Notes
        -----
        Any cell with ``rowspan`` or ``colspan`` will have its contents copied
        to subsequent cells.
        """
        all_texts = []  # list of rows, each a list of str
        remainder: list[tuple[int, str, int]] = []  # list of (index, text, nrows)

        for tr in rows:
            texts = []  # the output for this row
            next_remainder = []

            index = 0
            tds = self._parse_td(tr)
            for td in tds:
                # Append texts from previous rows with rowspan>1 that come
                # before this <td>
                while remainder and remainder[0][0] <= index:
                    prev_i, prev_text, prev_rowspan = remainder.pop(0)
                    texts.append(prev_text)
                    if prev_rowspan > 1:
                        next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
                    index += 1

                # Append the text from this <td>, colspan times
                text = _remove_whitespace(self._text_getter(td))
                rowspan = int(self._attr_getter(td, "rowspan") or 1)
                colspan = int(self._attr_getter(td, "colspan") or 1)

                for _ in range(colspan):
                    texts.append(text)
                    if rowspan > 1:
                        next_remainder.append((index, text, rowspan - 1))
                    index += 1

            # Append texts from previous rows at the final position
            for prev_i, prev_text, prev_rowspan in remainder:
                texts.append(prev_text)
                if prev_rowspan > 1:
                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))

            all_texts.append(texts)
            remainder = next_remainder

        # Append rows that only appear because the previous row had non-1
        # rowspan
        while remainder:
            next_remainder = []
            texts = []
            for prev_i, prev_text, prev_rowspan in remainder:
                texts.append(prev_text)
                if prev_rowspan > 1:
                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
            all_texts.append(texts)
            remainder = next_remainder

        return all_texts
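
    # Illustrative sketch (not part of the original module): how spans expand.
    # For rows parsed from
    #
    #   <tr><td rowspan="2">a</td><td colspan="2">b</td></tr>
    #   <tr><td>c</td><td>d</td></tr>
    #
    # the method above yields
    #
    #   [["a", "b", "b"],
    #    ["a", "c", "d"]]
    #
    # i.e. "b" is copied colspan times within its row, and "a" is carried over
    # into the next row via the `remainder` bookkeeping.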

    def _handle_hidden_tables(self, tbl_list, attr_name):
        """
        Return list of tables, potentially removing hidden elements

        Parameters
        ----------
        tbl_list : list of node-like
            Type of list elements will vary depending upon parser used
        attr_name : str
            Name of the accessor for retrieving HTML attributes

        Returns
        -------
        list of node-like
            Return type matches `tbl_list`
        """
        if not self.displayed_only:
            return tbl_list

        return [
            x
            for x in tbl_list
            if "display:none"
            not in getattr(x, attr_name).get("style", "").replace(" ", "")
        ]


class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
    """
    HTML to DataFrame parser that uses BeautifulSoup under the hood.

    See Also
    --------
    pandas.io.html._HtmlFrameParser
    pandas.io.html._LxmlFrameParser

    Notes
    -----
    Documentation strings for this class are in the base class
    :class:`pandas.io.html._HtmlFrameParser`.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        from bs4 import SoupStrainer

        self._strainer = SoupStrainer("table")

    def _parse_tables(self, doc, match, attrs):
        element_name = self._strainer.name
        tables = doc.find_all(element_name, attrs=attrs)

        if not tables:
            raise ValueError("No tables found")

        result = []
        unique_tables = set()
        tables = self._handle_hidden_tables(tables, "attrs")

        for table in tables:
            if self.displayed_only:
                for elem in table.find_all(style=re.compile(r"display:\s*none")):
                    elem.decompose()

            if table not in unique_tables and table.find(text=match) is not None:
                result.append(table)
            unique_tables.add(table)

        if not result:
            raise ValueError(f"No tables found matching pattern {repr(match.pattern)}")
        return result

    def _text_getter(self, obj):
        return obj.text

    def _equals_tag(self, obj, tag):
        return obj.name == tag

    def _parse_td(self, row):
        return row.find_all(("td", "th"), recursive=False)

    def _parse_thead_tr(self, table):
        return table.select("thead tr")

    def _parse_tbody_tr(self, table):
        from_tbody = table.select("tbody tr")
        from_root = table.find_all("tr", recursive=False)
        # HTML spec: at most one of these lists has content
        return from_tbody + from_root

    def _parse_tfoot_tr(self, table):
        return table.select("tfoot tr")

    def _setup_build_doc(self):
        raw_text = _read(self.io, self.encoding)
        if not raw_text:
            raise ValueError(f"No text parsed from document: {self.io}")
        return raw_text

    def _build_doc(self):
        from bs4 import BeautifulSoup

        bdoc = self._setup_build_doc()
        if isinstance(bdoc, bytes) and self.encoding is not None:
            udoc = bdoc.decode(self.encoding)
            from_encoding = None
        else:
            udoc = bdoc
            from_encoding = self.encoding
        return BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)


def _build_xpath_expr(attrs) -> str:
    """
    Build an xpath expression to simulate bs4's ability to pass in kwargs to
    search for attributes when using the lxml parser.

    Parameters
    ----------
    attrs : dict
        A dict of HTML attributes. These are NOT checked for validity.

    Returns
    -------
    expr : unicode
        An XPath expression that checks for the given HTML attributes.
    """
    # give class attribute as class_ because class is a python keyword
    if "class_" in attrs:
        attrs["class"] = attrs.pop("class_")

    s = " and ".join([f"@{k}={repr(v)}" for k, v in attrs.items()])
    return f"[{s}]"


_re_namespace = {"re": "http://exslt.org/regular-expressions"}


class _LxmlFrameParser(_HtmlFrameParser):
    """
    HTML to DataFrame parser that uses lxml under the hood.

    Warning
    -------
    This parser can only handle HTTP, FTP, and FILE urls.

    See Also
    --------
    _HtmlFrameParser
    _BeautifulSoupHtml5LibFrameParser

    Notes
    -----
    Documentation strings for this class are in the base class
    :class:`_HtmlFrameParser`.
    """

    def _text_getter(self, obj):
        return obj.text_content()

    def _parse_td(self, row):
        # Look for direct children only: the "row" element here may be a
        # <thead> or <tfoot> (see _parse_thead_tr).
        return row.xpath("./td|./th")

    def _parse_tables(self, doc, match, kwargs):
        pattern = match.pattern

        # 1. check all descendants for the given pattern and only search tables
        # 2. go up the tree until we find a table
        xpath_expr = f"//table//*[re:test(text(), {repr(pattern)})]/ancestor::table"

        # if any table attributes were given build an xpath expression to
        # search for them
        if kwargs:
            xpath_expr += _build_xpath_expr(kwargs)

        tables = doc.xpath(xpath_expr, namespaces=_re_namespace)

        tables = self._handle_hidden_tables(tables, "attrib")
        if self.displayed_only:
            for table in tables:
                # lxml utilizes XPATH 1.0 which does not have regex
                # support. As a result, we find all elements with a style
                # attribute and iterate them to check for display:none
                for elem in table.xpath(".//*[@style]"):
                    if "display:none" in elem.attrib.get("style", "").replace(" ", ""):
                        elem.getparent().remove(elem)

        if not tables:
            raise ValueError(f"No tables found matching regex {repr(pattern)}")
        return tables

    def _equals_tag(self, obj, tag):
        return obj.tag == tag

    def _build_doc(self):
        """
        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.etree import XMLSyntaxError
        from lxml.html import (
            HTMLParser,
            fromstring,
            parse,
        )

        parser = HTMLParser(recover=True, encoding=self.encoding)

        try:
            if is_url(self.io):
                with urlopen(self.io) as f:
                    r = parse(f, parser=parser)
            else:
                # try to parse the input in the simplest way
                r = parse(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        except (UnicodeDecodeError, OSError) as e:
            # if the input is a blob of html goop
            if not is_url(self.io):
                r = fromstring(self.io, parser=parser)

                try:
                    r = r.getroot()
                except AttributeError:
                    pass
            else:
                raise e
        else:
            if not hasattr(r, "text_content"):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
        return r

    def _parse_thead_tr(self, table):
        rows = []

        for thead in table.xpath(".//thead"):
            rows.extend(thead.xpath("./tr"))

            # HACK: lxml does not clean up the clearly-erroneous
            # <thead><th>foo</th><th>bar</th></thead>. (Missing <tr>). Add
            # the <thead> and _pretend_ it's a <tr>; _parse_td() will find its
            # children as though it's a <tr>.
            #
            # Better solution would be to use html5lib.
            elements_at_root = thead.xpath("./td|./th")
            if elements_at_root:
                rows.append(thead)

        return rows

    def _parse_tbody_tr(self, table):
        from_tbody = table.xpath(".//tbody//tr")
        from_root = table.xpath("./tr")
        # HTML spec: at most one of these lists has content
        return from_tbody + from_root

    def _parse_tfoot_tr(self, table):
        return table.xpath(".//tfoot//tr")


def _expand_elements(body):
    data = [len(elem) for elem in body]
    lens = create_series_with_explicit_dtype(data, dtype_if_empty=object)
    lens_max = lens.max()
    not_max = lens[lens != lens_max]

    empty = [""]
    for ind, length in not_max.items():
        body[ind] += empty * (lens_max - length)
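# Illustrative sketch (not part of the original module): padding ragged rows
# in place so TextParser sees a rectangular block.
#
#   >>> rows = [["a", "b", "c"], ["d"], ["e", "f"]]
#   >>> _expand_elements(rows)
#   >>> rows
#   [['a', 'b', 'c'], ['d', '', ''], ['e', 'f', '']]
#
# Short rows are right-padded with empty strings up to the longest row.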


def _data_to_frame(**kwargs):
    head, body, foot = kwargs.pop("data")
    header = kwargs.pop("header")
    kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"])
    if head:
        body = head + body

        # Infer header when there is a <thead> or top <th>-only rows
        if header is None:
            if len(head) == 1:
                header = 0
            else:
                # ignore all-empty-text rows
                header = [i for i, row in enumerate(head) if any(text for text in row)]

    if foot:
        body += foot

    # fill out elements of body that are "ragged"
    _expand_elements(body)
    with TextParser(body, header=header, **kwargs) as tp:
        return tp.read()


_valid_parsers = {
    "lxml": _LxmlFrameParser,
    None: _LxmlFrameParser,
    "html5lib": _BeautifulSoupHtml5LibFrameParser,
    "bs4": _BeautifulSoupHtml5LibFrameParser,
}


def _parser_dispatch(flavor: str | None) -> type[_HtmlFrameParser]:
    """
    Choose the parser based on the input flavor.

    Parameters
    ----------
    flavor : str
        The type of parser to use. This must be a valid backend.

    Returns
    -------
    cls : _HtmlFrameParser subclass
        The parser class based on the requested input flavor.

    Raises
    ------
    ValueError
        * If `flavor` is not a valid backend.
    ImportError
        * If you do not have the requested `flavor`
    """
    valid_parsers = list(_valid_parsers.keys())
    if flavor not in valid_parsers:
        raise ValueError(
            f"{repr(flavor)} is not a valid flavor, valid flavors are {valid_parsers}"
        )

    if flavor in ("bs4", "html5lib"):
        if not _HAS_HTML5LIB:
            raise ImportError("html5lib not found, please install it")
        if not _HAS_BS4:
            raise ImportError("BeautifulSoup4 (bs4) not found, please install it")
        # Although we call this above, we want to raise here right before use.
        bs4 = import_optional_dependency("bs4")  # noqa:F841

    else:
        if not _HAS_LXML:
            raise ImportError("lxml not found, please install it")
    return _valid_parsers[flavor]
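# Illustrative sketch (not part of the original module): dispatch is a plain
# dict lookup guarded by the import checks above (this assumes `_importers()`
# has already populated the _HAS_* flags, as read_html() does).
#
#   >>> _parser_dispatch("lxml")   # doctest: +SKIP
#   <class 'pandas.io.html._LxmlFrameParser'>
#   >>> _parser_dispatch("fake")   # doctest: +SKIP
#   ValueError: 'fake' is not a valid flavor, ...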


def _print_as_set(s) -> str:
    arg = ", ".join([pprint_thing(el) for el in s])
    return f"{{{arg}}}"


def _validate_flavor(flavor):
    if flavor is None:
        flavor = "lxml", "bs4"
    elif isinstance(flavor, str):
        flavor = (flavor,)
    elif isinstance(flavor, abc.Iterable):
        if not all(isinstance(flav, str) for flav in flavor):
            raise TypeError(
                f"Object of type {repr(type(flavor).__name__)} "
                f"is not an iterable of strings"
            )
    else:
        msg = repr(flavor) if isinstance(flavor, str) else str(flavor)
        msg += " is not a valid flavor"
        raise ValueError(msg)

    flavor = tuple(flavor)
    valid_flavors = set(_valid_parsers)
    flavor_set = set(flavor)

    if not flavor_set & valid_flavors:
        raise ValueError(
            f"{_print_as_set(flavor_set)} is not a valid set of flavors, valid "
            f"flavors are {_print_as_set(valid_flavors)}"
        )
    return flavor
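# Illustrative sketch (not part of the original module): every accepted input
# shape normalizes to a tuple of known flavor names.
#
#   >>> _validate_flavor(None)
#   ('lxml', 'bs4')
#   >>> _validate_flavor("bs4")
#   ('bs4',)
#   >>> _validate_flavor(["lxml", "bs4"])
#   ('lxml', 'bs4')
#
# _parse then tries the parsers in this order, falling through on ValueError.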


def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
    flavor = _validate_flavor(flavor)
    compiled_match = re.compile(match)  # you can pass a compiled regex here

    retained = None
    for flav in flavor:
        parser = _parser_dispatch(flav)
        p = parser(io, compiled_match, attrs, encoding, displayed_only)

        try:
            tables = p.parse_tables()
        except ValueError as caught:
            # if `io` is an io-like object, check if it's seekable
            # and try to rewind it before trying the next parser
            if hasattr(io, "seekable") and io.seekable():
                io.seek(0)
            elif hasattr(io, "seekable") and not io.seekable():
                # if we couldn't rewind it, let the user know
                raise ValueError(
                    f"The flavor {flav} failed to parse your input. "
                    "Since you passed a non-rewindable file "
                    "object, we can't rewind it to try "
                    "another parser. Try read_html() with a different flavor."
                ) from caught

            retained = caught
        else:
            break
    else:
        assert retained is not None  # for mypy
        raise retained

    ret = []
    for table in tables:
        try:
            ret.append(_data_to_frame(data=table, **kwargs))
        except EmptyDataError:  # empty table
            continue
    return ret


@deprecate_nonkeyword_arguments(version="2.0")
def read_html(
    io: FilePath | ReadBuffer[str],
    match: str | Pattern = ".+",
    flavor: str | None = None,
    header: int | Sequence[int] | None = None,
    index_col: int | Sequence[int] | None = None,
    skiprows: int | Sequence[int] | slice | None = None,
    attrs: dict[str, str] | None = None,
    parse_dates: bool = False,
    thousands: str | None = ",",
    encoding: str | None = None,
    decimal: str = ".",
    converters: dict | None = None,
    na_values=None,
    keep_default_na: bool = True,
    displayed_only: bool = True,
) -> list[DataFrame]:
    r"""
    Read HTML tables into a ``list`` of ``DataFrame`` objects.

    Parameters
    ----------
    io : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a string ``read()`` function.
        The string can represent a URL or the HTML itself. Note that
        lxml only accepts the http, ftp and file url protocols. If you have a
        URL that starts with ``'https'`` you might try removing the ``'s'``.

    match : str or compiled regular expression, optional
        The set of tables containing text matching this regex or string will be
        returned. Unless the HTML is extremely simple you will probably need to
        pass a non-empty string here. Defaults to '.+' (match any non-empty
        string). The default value will return all tables contained on a page.
        This value is converted to a regular expression so that there is
        consistent behavior between Beautiful Soup and lxml.

    flavor : str, optional
        The parsing engine to use. 'bs4' and 'html5lib' are synonymous with
        each other, they are both there for backwards compatibility. The
        default of ``None`` tries to use ``lxml`` to parse and if that fails it
        falls back on ``bs4`` + ``html5lib``.

    header : int or list-like, optional
        The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
        make the columns headers.

    index_col : int or list-like, optional
        The column (or list of columns) to use to create the index.

    skiprows : int, list-like or slice, optional
        Number of rows to skip after parsing the column names. 0-based. If a
        sequence of integers or a slice is given, will skip the rows indexed by
        that sequence. Note that a single element sequence means 'skip the nth
        row' whereas an integer means 'skip n rows'.

    attrs : dict, optional
        This is a dictionary of attributes that you can pass to use to identify
        the table in the HTML. These are not checked for validity before being
        passed to lxml or Beautiful Soup. However, these attributes must be
        valid HTML table attributes to work correctly. For example, ::

            attrs = {'id': 'table'}

        is a valid attribute dictionary because the 'id' HTML tag attribute is
        a valid HTML attribute for *any* HTML tag as per `this document
        <https://html.spec.whatwg.org/multipage/dom.html#global-attributes>`__. ::

            attrs = {'asdf': 'table'}

        is *not* a valid attribute dictionary because 'asdf' is not a valid
        HTML attribute even if it is a valid XML attribute. Valid HTML 4.01
        table attributes can be found `here
        <http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2>`__. A
        working draft of the HTML 5 spec can be found `here
        <https://html.spec.whatwg.org/multipage/tables.html>`__. It contains the
        latest information on table attributes for the modern web.

    parse_dates : bool, optional
        See :func:`~read_csv` for more details.

    thousands : str, optional
        Separator to use to parse thousands. Defaults to ``','``.

    encoding : str, optional
        The encoding used to decode the web page. Defaults to ``None``.
        ``None`` preserves the previous encoding behavior, which depends on the
        underlying parser library (e.g., the parser library will try to use
        the encoding provided by the document).

    decimal : str, default '.'
        Character to recognize as decimal point (e.g. use ',' for European
        data).

    converters : dict, default None
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the cell (not column) content, and return the
        transformed content.

    na_values : iterable, default None
        Custom NA values.

    keep_default_na : bool, default True
        If na_values are specified and keep_default_na is False the default NaN
        values are overridden, otherwise they're appended to.

    displayed_only : bool, default True
        Whether elements with "display: none" should be parsed.

    Returns
    -------
    dfs
        A list of DataFrames.

    See Also
    --------
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Notes
    -----
    Before using this function you should read the :ref:`gotchas about the
    HTML parsing libraries <io.html.gotchas>`.

    Expect to do some cleanup after you call this function. For example, you
    might need to manually assign column names if the column names are
    converted to NaN when you pass the `header=0` argument. We try to assume as
    little as possible about the structure of the table and push the
    idiosyncrasies of the HTML contained in the table to the user.

    This function searches for ``<table>`` elements and only for ``<tr>``
    and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>``
    element in the table. ``<td>`` stands for "table data". This function
    attempts to properly handle ``colspan`` and ``rowspan`` attributes.
    If the table has a ``<thead>``, it is used to construct
    the header, otherwise the function attempts to find the header within
    the body (by putting rows with only ``<th>`` elements into the header).

    Similar to :func:`~read_csv` the `header` argument is applied
    **after** `skiprows` is applied.

    This function will *always* return a list of :class:`DataFrame` *or*
    it will fail, e.g., it will *not* return an empty list.

    Examples
    --------
    See the :ref:`read_html documentation in the IO section of the docs
    <io.read_html>` for some examples of reading in HTML tables.
    """
    _importers()

    # Type check here. We don't want to parse only to fail because of an
    # invalid value of an integer skiprows.
    if isinstance(skiprows, numbers.Integral) and skiprows < 0:
        raise ValueError(
            "cannot skip rows starting from the end of the "
            "data (you passed a negative value)"
        )
    validate_header_arg(header)

    io = stringify_path(io)

    return _parse(
        flavor=flavor,
        io=io,
        match=match,
        header=header,
        index_col=index_col,
        skiprows=skiprows,
        parse_dates=parse_dates,
        thousands=thousands,
        attrs=attrs,
        encoding=encoding,
        decimal=decimal,
        converters=converters,
        na_values=na_values,
        keep_default_na=keep_default_na,
        displayed_only=displayed_only,
    )
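

if __name__ == "__main__":
    # Illustrative usage sketch, not part of the original module. Assumes at
    # least one optional backend is installed (lxml, or bs4 + html5lib).
    html = """
    <table id="demo">
      <tr><th>name</th><th>value</th></tr>
      <tr><td>a</td><td>1</td></tr>
      <tr><td>b</td><td>2</td></tr>
    </table>
    """
    # The leading all-<th> row is promoted to the header, and `attrs` narrows
    # the search to tables whose id attribute is "demo".
    (df,) = read_html(html, attrs={"id": "demo"})
    print(df)
    # Expected shape of the output:
    #   name  value
    # 0    a      1
    # 1    b      2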