@@ -0,0 +1,1318 @@
+from __future__ import annotations
+
+from loguru import logger
+
+"""
+:mod:`pandas.io.html` is a module containing functionality for dealing with
+HTML IO.
+
+Modified from ``pandas.io.html``: the parsers accept an extra ``custom_args``
+dict with the following optional keys:
+
+* ``colspan_single``: section name(s) among "header", "body", "footer" in
+  which a cell's ``colspan`` fills the spanned columns with ``''`` instead of
+  repeating the cell text.
+* ``rowspan_single``: section name(s) among "header", "body", "footer" in
+  which a cell's ``rowspan`` is not propagated to subsequent rows.
+* ``number_strip``: bool; whether to strip whitespace inside numeric cell
+  values.
+"""
+
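+# A minimal usage sketch of the ``custom_args`` extension (the HTML string is
+# hypothetical, purely illustrative):
+#
+#     from io import StringIO
+#     html = "<table><tr><td colspan='2'>1 234</td></tr></table>"
+#     dfs = read_html_zhch(
+#         StringIO(html),
+#         custom_args={"colspan_single": ["body"], "number_strip": True},
+#     )
+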
+from collections import abc
+import numbers
+import re
+from re import Pattern
+from typing import (
+    TYPE_CHECKING,
+    Literal,
+    cast,
+)
+import warnings
+
+from pandas._libs import lib
+from pandas.compat._optional import import_optional_dependency
+from pandas.errors import (
+    AbstractMethodError,
+    EmptyDataError,
+)
+from pandas.util._decorators import doc
+from pandas.util._exceptions import find_stack_level
+from pandas.util._validators import check_dtype_backend
+
+from pandas.core.dtypes.common import is_list_like
+
+from pandas import isna
+from pandas.core.indexes.base import Index
+from pandas.core.indexes.multi import MultiIndex
+from pandas.core.series import Series
+from pandas.core.shared_docs import _shared_docs
+
+from pandas.io.common import (
+    file_exists,
+    get_handle,
+    is_file_like,
+    is_fsspec_url,
+    is_url,
+    stringify_path,
+    validate_header_arg,
+)
+from pandas.io.formats.printing import pprint_thing
+from pandas.io.parsers import TextParser
+
+if TYPE_CHECKING:
+    from collections.abc import (
+        Iterable,
+        Sequence,
+    )
+
+    from pandas._typing import (
+        BaseBuffer,
+        DtypeBackend,
+        FilePath,
+        HTMLFlavors,
+        ReadBuffer,
+        StorageOptions,
+    )
+
+    from pandas import DataFrame
+
+#############
+# READ HTML #
+#############
+_RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}")
+
+
+def _remove_whitespace(s: str, regex: Pattern = _RE_WHITESPACE) -> str:
+    """
+    Replace extra whitespace inside of a string with a single space.
+
+    Parameters
+    ----------
+    s : str or unicode
+        The string from which to remove extra whitespace.
+    regex : re.Pattern
+        The regular expression to use to remove extra whitespace.
+
+    Returns
+    -------
+    subd : str or unicode
+        `s` with all extra whitespace replaced with a single space.
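+
+    Examples
+    --------
+    A doctest-style sketch:
+
+    >>> _remove_whitespace("  foo   bar  ")
+    'foo bar'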
+ """
+    return regex.sub(" ", s.strip())
+
+
+def _get_skiprows(skiprows: int | Sequence[int] | slice | None) -> int | Sequence[int]:
+    """
+    Get an iterator given an integer, slice or container.
+
+    Parameters
+    ----------
+    skiprows : int, slice, container
+        The iterator to use to skip rows; can also be a slice.
+
+    Raises
+    ------
+    TypeError
+        * If `skiprows` is not a slice, integer, or Container
+
+    Returns
+    -------
+    it : iterable
+        A proper iterator to use to skip rows of a DataFrame.
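+
+    Examples
+    --------
+    >>> _get_skiprows(slice(1, 5, 2))
+    [1, 3]
+    >>> _get_skiprows(None)
+    0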
+ """
+    if isinstance(skiprows, slice):
+        start, step = skiprows.start or 0, skiprows.step or 1
+        return list(range(start, skiprows.stop, step))
+    elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows):
+        return cast("int | Sequence[int]", skiprows)
+    elif skiprows is None:
+        return 0
+    raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows")
+
+
+def _read(
+    obj: FilePath | BaseBuffer,
+    encoding: str | None,
+    storage_options: StorageOptions | None,
+) -> str | bytes:
+    """
+    Try to read from a url, file or string.
+
+    Parameters
+    ----------
+    obj : str, unicode, path object, or file-like object
+
+    Returns
+    -------
+    raw_text : str
+    """
+    text: str | bytes
+    if (
+        is_url(obj)
+        or hasattr(obj, "read")
+        or (isinstance(obj, str) and file_exists(obj))
+    ):
+        with get_handle(
+            obj, "r", encoding=encoding, storage_options=storage_options
+        ) as handles:
+            text = handles.handle.read()
+    elif isinstance(obj, (str, bytes)):
+        text = obj
+    else:
+        raise TypeError(f"Cannot read object of type '{type(obj).__name__}'")
+    return text
+
+
+class _HtmlFrameParser:
+    """
+    Base class for parsers that parse HTML into DataFrames.
+
+    Parameters
+    ----------
+    io : str or file-like
+        This can be either a string of raw HTML, a valid URL using the HTTP,
+        FTP, or FILE protocols or a file-like object.
+
+    match : str or regex
+        The text to match in the document.
+
+    attrs : dict
+        List of HTML <table> element attributes to match.
+
+    encoding : str
+        Encoding to be used by parser
+
+    displayed_only : bool
+        Whether or not items with "display:none" should be ignored
+
+    extract_links : {None, "all", "header", "body", "footer"}
+        Table elements in the specified section(s) with <a> tags will have their
+        href extracted.
+
+        .. versionadded:: 1.5.0
+
+    Attributes
+    ----------
+    io : str or file-like
+        raw HTML, URL, or file-like object
+
+    match : regex
+        The text to match in the raw HTML
+
+    attrs : dict-like
+        A dictionary of valid table attributes to use to search for table
+        elements.
+
+    encoding : str
+        Encoding to be used by parser
+
+    displayed_only : bool
+        Whether or not items with "display:none" should be ignored
+
+    extract_links : {None, "all", "header", "body", "footer"}
+        Table elements in the specified section(s) with <a> tags will have their
+        href extracted.
+
+        .. versionadded:: 1.5.0
+
+    Notes
+    -----
+    To subclass this class effectively you must override the following methods:
+    * :func:`_build_doc`
+    * :func:`_attr_getter`
+    * :func:`_href_getter`
+    * :func:`_text_getter`
+    * :func:`_parse_td`
+    * :func:`_parse_thead_tr`
+    * :func:`_parse_tbody_tr`
+    * :func:`_parse_tfoot_tr`
+    * :func:`_parse_tables`
+    * :func:`_equals_tag`
+
+    See each method's respective documentation for details on their
+    functionality.
+    """
+
+    def __init__(
+        self,
+        io: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
+        match: str | Pattern,
+        attrs: dict[str, str] | None,
+        encoding: str,
+        displayed_only: bool,
+        extract_links: Literal[None, "header", "footer", "body", "all"],
+        storage_options: StorageOptions = None,
+    ) -> None:
+        self.io = io
+        self.match = match
+        self.attrs = attrs
+        self.encoding = encoding
+        self.displayed_only = displayed_only
+        self.extract_links = extract_links
+        self.storage_options = storage_options
+
+ def parse_tables(self, custom_args):
+        """
+        Parse and return all tables from the DOM.
+
+        Parameters
+        ----------
+        custom_args : dict
+            Optional behaviour flags (``colspan_single``, ``rowspan_single``,
+            ``number_strip``) forwarded to ``_parse_thead_tbody_tfoot``.
+
+        Returns
+        -------
+        list of parsed (header, body, footer) tuples from tables.
+        """
+        tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
+        return (self._parse_thead_tbody_tfoot(table, custom_args) for table in tables)
+
+    def _attr_getter(self, obj, attr):
+        """
+        Return the attribute value of an individual DOM node.
+
+        Parameters
+        ----------
+        obj : node-like
+            A DOM node.
+
+        attr : str or unicode
+            The attribute, such as "colspan"
+
+        Returns
+        -------
+        str or unicode
+            The attribute value.
+        """
+        # Both lxml and BeautifulSoup have the same implementation:
+        return obj.get(attr)
+
+    def _href_getter(self, obj) -> str | None:
+        """
+        Return a href if the DOM node contains a child <a> or None.
+
+        Parameters
+        ----------
+        obj : node-like
+            A DOM node.
+
+        Returns
+        -------
+        href : str or unicode
+            The href from the <a> child of the DOM node.
+        """
+        raise AbstractMethodError(self)
+
+    def _text_getter(self, obj):
+        """
+        Return the text of an individual DOM node.
+
+        Parameters
+        ----------
+        obj : node-like
+            A DOM node.
+
+        Returns
+        -------
+        text : str or unicode
+            The text from an individual DOM node.
+        """
+        raise AbstractMethodError(self)
+
+    def _parse_td(self, obj):
+        """
+        Return the td elements from a row element.
+
+        Parameters
+        ----------
+        obj : node-like
+            A DOM <tr> node.
+
+        Returns
+        -------
+        list of node-like
+            These are the elements of each row, i.e., the columns.
+        """
+        raise AbstractMethodError(self)
+
+    def _parse_thead_tr(self, table):
+        """
+        Return the list of thead row elements from the parsed table element.
+
+        Parameters
+        ----------
+        table : a table element that contains zero or more thead elements.
+
+        Returns
+        -------
+        list of node-like
+            These are the <tr> row elements of a table.
+        """
+        raise AbstractMethodError(self)
+
+    def _parse_tbody_tr(self, table):
+        """
+        Return the list of tbody row elements from the parsed table element.
+
+        HTML5 table bodies consist of either 0 or more <tbody> elements (which
+        only contain <tr> elements) or 0 or more <tr> elements. This method
+        checks for both structures.
+
+        Parameters
+        ----------
+        table : a table element that contains row elements.
+
+        Returns
+        -------
+        list of node-like
+            These are the <tr> row elements of a table.
+        """
+        raise AbstractMethodError(self)
+
+    def _parse_tfoot_tr(self, table):
+        """
+        Return the list of tfoot row elements from the parsed table element.
+
+        Parameters
+        ----------
+        table : a table element that contains row elements.
+
+        Returns
+        -------
+        list of node-like
+            These are the <tr> row elements of a table.
+        """
+        raise AbstractMethodError(self)
+
+    def _parse_tables(self, document, match, attrs):
+        """
+        Return all tables from the parsed DOM.
+
+        Parameters
+        ----------
+        document : the DOM from which to parse the table element.
+
+        match : str or regular expression
+            The text to search for in the DOM tree.
+
+        attrs : dict
+            A dictionary of table attributes that can be used to disambiguate
+            multiple tables on a page.
+
+        Raises
+        ------
+        ValueError : `match` does not match any text in the document.
+
+        Returns
+        -------
+        list of node-like
+            HTML <table> elements to be parsed into raw data.
+        """
+        raise AbstractMethodError(self)
+
+    def _equals_tag(self, obj, tag) -> bool:
+        """
+        Return whether an individual DOM node matches a tag.
+
+        Parameters
+        ----------
+        obj : node-like
+            A DOM node.
+
+        tag : str
+            Tag name to be checked for equality.
+
+        Returns
+        -------
+        boolean
+            Whether `obj`'s tag name is `tag`
+        """
+        raise AbstractMethodError(self)
+
+    def _build_doc(self):
+        """
+        Return a tree-like object that can be used to iterate over the DOM.
+
+        Returns
+        -------
+        node-like
+            The DOM from which to parse the table element.
+        """
+        raise AbstractMethodError(self)
+
+ def _parse_thead_tbody_tfoot(self, table_html, custom_args):
+        """
+        Given a table, return parsed header, body, and foot.
+
+        Parameters
+        ----------
+        table_html : node-like
+        custom_args : dict
+            Behaviour flags forwarded to ``_expand_colspan_rowspan``.
+
+        Returns
+        -------
+        tuple of (header, body, footer), each a list of list-of-text rows.
+
+        Notes
+        -----
+        Header and body are lists-of-lists. Top level list is a list of
+        rows. Each row is a list of str text.
+
+        Logic: Use <thead>, <tbody>, <tfoot> elements to identify
+        header, body, and footer, otherwise:
+          - Put all rows into body
+          - Move rows from top of body to header only if
+            all elements inside row are <th>
+          - Move rows from bottom of body to footer only if
+            all elements inside row are <th>
+        """
+        header_rows = self._parse_thead_tr(table_html)
+        body_rows = self._parse_tbody_tr(table_html)
+        footer_rows = self._parse_tfoot_tr(table_html)
+
+        def row_is_all_th(row):
+            return all(self._equals_tag(t, "th") for t in self._parse_td(row))
+
+        if not header_rows:
+            # The table has no <thead>. Move the top all-<th> rows from
+            # body_rows to header_rows. (This is a common case because many
+            # tables in the wild have no <thead> or <tfoot>.)
+            while body_rows and row_is_all_th(body_rows[0]):
+                header_rows.append(body_rows.pop(0))
+
+        header = self._expand_colspan_rowspan(
+            header_rows, section="header", custom_args=custom_args
+        )
+        body = self._expand_colspan_rowspan(
+            body_rows, section="body", custom_args=custom_args
+        )
+        footer = self._expand_colspan_rowspan(
+            footer_rows, section="footer", custom_args=custom_args
+        )
+
+        return header, body, footer
+
+ def _expand_colspan_rowspan(
+        self, rows, section: Literal["header", "footer", "body"], custom_args
+    ):
+        """
+        Given a list of <tr>s, return a list of text rows.
+
+        Parameters
+        ----------
+        rows : list of node-like
+            List of <tr>s
+        section : the section that the rows belong to (header, body or footer).
+        custom_args : dict
+            If ``section`` is listed in ``custom_args["colspan_single"]``, the
+            extra columns spanned by a ``colspan`` cell are filled with ``''``
+            instead of the cell text; if listed in
+            ``custom_args["rowspan_single"]``, a ``rowspan`` cell is not copied
+            to subsequent rows.
+
+        Returns
+        -------
+        list of list
+            Each returned row is a list of str text, or tuple (text, link)
+            if extract_links is not None.
+
+        Notes
+        -----
+        Any cell with ``rowspan`` or ``colspan`` will have its contents copied
+        to subsequent cells, unless suppressed via ``custom_args``.
+        """
+        # Default to "repeat everywhere" when the flags are absent or None.
+        colspan_single = custom_args.get('colspan_single') or ()
+        rowspan_single = custom_args.get('rowspan_single') or ()
+
+        all_texts = []  # list of rows, each a list of str
+        text: str | tuple
+        remainder: list[
+            tuple[int, str | tuple, int]
+        ] = []  # list of (index, text, nrows)
+
+        for tr in rows:
+            texts = []  # the output for this row
+            next_remainder = []
+
+            index = 0
+            tds = self._parse_td(tr)
+            for td in tds:
+                # Append texts from previous rows with rowspan>1 that come
+                # before this <td>
+                while remainder and remainder[0][0] <= index:
+                    prev_i, prev_text, prev_rowspan = remainder.pop(0)
+                    texts.append(prev_text)
+                    if prev_rowspan > 1:
+                        next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
+                    index += 1
+
+                # Append the text from this <td>, colspan times
+                text = _remove_whitespace(self._text_getter(td))
+                if self.extract_links in ("all", section):
+                    href = self._href_getter(td)
+                    text = (text, href)
+                rowspan = int(self._attr_getter(td, "rowspan") or 1)
+                colspan = int(self._attr_getter(td, "colspan") or 1)
+
+                for i in range(colspan):
+                    if i == 0:
+                        texts.append(text)
+                    elif section in colspan_single:
+                        # Keep the text in the first spanned column only.
+                        texts.append('')
+                    else:
+                        texts.append(text)
+
+                if rowspan > 1 and section not in rowspan_single:
+                    next_remainder.append((index, text, rowspan - 1))
+                index += 1
+
+            # Append texts from previous rows at the final position
+            for prev_i, prev_text, prev_rowspan in remainder:
+                texts.append(prev_text)
+                if prev_rowspan > 1:
+                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
+
+            all_texts.append(texts)
+            remainder = next_remainder
+
+        # Append rows that only appear because the previous row had non-1
+        # rowspan
+        while remainder:
+            next_remainder = []
+            texts = []
+            for prev_i, prev_text, prev_rowspan in remainder:
+                texts.append(prev_text)
+                if prev_rowspan > 1:
+                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
+            all_texts.append(texts)
+            remainder = next_remainder
+
+        return all_texts
+
+ def _handle_hidden_tables(self, tbl_list, attr_name: str):
+        """
+        Return list of tables, potentially removing hidden elements
+
+        Parameters
+        ----------
+        tbl_list : list of node-like
+            Type of list elements will vary depending upon parser used
+        attr_name : str
+            Name of the accessor for retrieving HTML attributes
+
+        Returns
+        -------
+        list of node-like
+            Return type matches `tbl_list`
+        """
+        if not self.displayed_only:
+            return tbl_list
+
+        return [
+            x
+            for x in tbl_list
+            if "display:none"
+            not in getattr(x, attr_name).get("style", "").replace(" ", "")
+        ]
+
+
+class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
+    """
+    HTML to DataFrame parser that uses BeautifulSoup under the hood.
+
+    See Also
+    --------
+    pandas.io.html._HtmlFrameParser
+    pandas.io.html._LxmlFrameParser
+
+    Notes
+    -----
+    Documentation strings for this class are in the base class
+    :class:`pandas.io.html._HtmlFrameParser`.
+    """
+
+    def _parse_tables(self, document, match, attrs):
+        element_name = "table"
+        tables = document.find_all(element_name, attrs=attrs)
+        if not tables:
+            raise ValueError("No tables found")
+
+        result = []
+        unique_tables = set()
+        tables = self._handle_hidden_tables(tables, "attrs")
+
+        for table in tables:
+            if self.displayed_only:
+                for elem in table.find_all("style"):
+                    elem.decompose()
+
+                for elem in table.find_all(style=re.compile(r"display:\s*none")):
+                    elem.decompose()
+
+            if table not in unique_tables and table.find(string=match) is not None:
+                result.append(table)
+                unique_tables.add(table)
+        if not result:
+            raise ValueError(f"No tables found matching pattern {repr(match.pattern)}")
+        return result
+
+    def _href_getter(self, obj) -> str | None:
+        a = obj.find("a", href=True)
+        return None if not a else a["href"]
+
+    def _text_getter(self, obj):
+        return obj.text
+
+    def _equals_tag(self, obj, tag) -> bool:
+        return obj.name == tag
+
+    def _parse_td(self, row):
+        return row.find_all(("td", "th"), recursive=False)
+
+    def _parse_thead_tr(self, table):
+        return table.select("thead tr")
+
+    def _parse_tbody_tr(self, table):
+        from_tbody = table.select("tbody tr")
+        from_root = table.find_all("tr", recursive=False)
+        # HTML spec: at most one of these lists has content
+        return from_tbody + from_root
+
+    def _parse_tfoot_tr(self, table):
+        return table.select("tfoot tr")
+
+    def _setup_build_doc(self):
+        raw_text = _read(self.io, self.encoding, self.storage_options)
+        if not raw_text:
+            raise ValueError(f"No text parsed from document: {self.io}")
+        return raw_text
+
+    def _build_doc(self):
+        from bs4 import BeautifulSoup
+
+        bdoc = self._setup_build_doc()
+        if isinstance(bdoc, bytes) and self.encoding is not None:
+            udoc = bdoc.decode(self.encoding)
+            from_encoding = None
+        else:
+            udoc = bdoc
+            from_encoding = self.encoding
+
+        soup = BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)
+
+        for br in soup.find_all("br"):
+            br.replace_with("\n" + br.text)
+
+        return soup
+
+
+def _build_xpath_expr(attrs) -> str:
+    """
+    Build an xpath expression to simulate bs4's ability to pass in kwargs to
+    search for attributes when using the lxml parser.
+
+    Parameters
+    ----------
+    attrs : dict
+        A dict of HTML attributes. These are NOT checked for validity.
+
+    Returns
+    -------
+    expr : unicode
+        An XPath expression that checks for the given HTML attributes.
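+
+    Examples
+    --------
+    >>> _build_xpath_expr({"id": "table", "class_": "wide"})
+    "[@id='table' and @class='wide']"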
+ """
+    # give class attribute as class_ because class is a python keyword
+    if "class_" in attrs:
+        attrs["class"] = attrs.pop("class_")
+
+    s = " and ".join([f"@{k}={repr(v)}" for k, v in attrs.items()])
+ return f"[{s}]"
+
+
+_re_namespace = {"re": "http://exslt.org/regular-expressions"}
+
+
+class _LxmlFrameParser(_HtmlFrameParser):
+    """
+    HTML to DataFrame parser that uses lxml under the hood.
+
+    Warning
+    -------
+    This parser can only handle HTTP, FTP, and FILE urls.
+
+    See Also
+    --------
+    _HtmlFrameParser
+    _BeautifulSoupHtml5LibFrameParser
+
+    Notes
+    -----
+    Documentation strings for this class are in the base class
+    :class:`_HtmlFrameParser`.
+    """
+
+    def _href_getter(self, obj) -> str | None:
+        href = obj.xpath(".//a/@href")
+        return None if not href else href[0]
+
+    def _text_getter(self, obj):
+        return obj.text_content()
+
+    def _parse_td(self, row):
+        # Look for direct children only: the "row" element here may be a
+        # <thead> or <tfoot> (see _parse_thead_tr).
+        return row.xpath("./td|./th")
+
+    def _parse_tables(self, document, match, kwargs):
+        pattern = match.pattern
+
+        # 1. check all descendants for the given pattern and only search tables
+        # GH 49929
+        xpath_expr = f"//table[.//text()[re:test(., {repr(pattern)})]]"
+
+        # if any table attributes were given build an xpath expression to
+        # search for them
+        if kwargs:
+            xpath_expr += _build_xpath_expr(kwargs)
+
+        tables = document.xpath(xpath_expr, namespaces=_re_namespace)
+
+        tables = self._handle_hidden_tables(tables, "attrib")
+        if self.displayed_only:
+            for table in tables:
+                # lxml utilizes XPATH 1.0 which does not have regex
+                # support. As a result, we find all elements with a style
+                # attribute and iterate them to check for display:none
+                for elem in table.xpath(".//style"):
+                    elem.drop_tree()
+                for elem in table.xpath(".//*[@style]"):
+                    if "display:none" in elem.attrib.get("style", "").replace(" ", ""):
+                        elem.drop_tree()
+        if not tables:
+            raise ValueError(f"No tables found matching regex {repr(pattern)}")
+        return tables
+
+    def _equals_tag(self, obj, tag) -> bool:
+        return obj.tag == tag
+
+    def _build_doc(self):
+        """
+        Raises
+        ------
+        ValueError
+            * If a URL that lxml cannot parse is passed.
+
+        Exception
+            * Any other ``Exception`` thrown. For example, trying to parse a
+              URL that is syntactically correct on a machine with no internet
+              connection will fail.
+
+        See Also
+        --------
+        pandas.io.html._HtmlFrameParser._build_doc
+        """
+        from lxml.etree import XMLSyntaxError
+        from lxml.html import (
+            HTMLParser,
+            fromstring,
+            parse,
+        )
+
+        parser = HTMLParser(recover=True, encoding=self.encoding)
+
+        try:
+            if is_url(self.io):
+                with get_handle(
+                    self.io, "r", storage_options=self.storage_options
+                ) as f:
+                    r = parse(f.handle, parser=parser)
+            else:
+                # try to parse the input in the simplest way
+                r = parse(self.io, parser=parser)
+            try:
+                r = r.getroot()
+            except AttributeError:
+                pass
+        except (UnicodeDecodeError, OSError) as e:
+            # if the input is a blob of html goop
+            if not is_url(self.io):
+                r = fromstring(self.io, parser=parser)
+
+                try:
+                    r = r.getroot()
+                except AttributeError:
+                    pass
+            else:
+                raise e
+        else:
+            if not hasattr(r, "text_content"):
+                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
+
+        for br in r.xpath("*//br"):
+            br.tail = "\n" + (br.tail or "")
+
+        return r
+
+    def _parse_thead_tr(self, table):
+        rows = []
+
+        for thead in table.xpath(".//thead"):
+            rows.extend(thead.xpath("./tr"))
+
+            # HACK: lxml does not clean up the clearly-erroneous
+            # <thead><th>foo</th><th>bar</th></thead>. (Missing <tr>). Add
+            # the <thead> and _pretend_ it's a <tr>; _parse_td() will find its
+            # children as though it's a <tr>.
+            #
+            # Better solution would be to use html5lib.
+            elements_at_root = thead.xpath("./td|./th")
+            if elements_at_root:
+                rows.append(thead)
+
+        return rows
+
+    def _parse_tbody_tr(self, table):
+        from_tbody = table.xpath(".//tbody//tr")
+        from_root = table.xpath("./tr")
+        # HTML spec: at most one of these lists has content
+        return from_tbody + from_root
+
+    def _parse_tfoot_tr(self, table):
+        return table.xpath(".//tfoot//tr")
+
+
+def _expand_elements(body) -> None:
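+    """
+    Pad ragged rows in place so that every row has the same length.
+
+    A doctest-style sketch:
+
+    >>> rows = [["a", "b"], ["c"]]
+    >>> _expand_elements(rows)
+    >>> rows
+    [['a', 'b'], ['c', '']]
+    """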
+    data = [len(elem) for elem in body]
+    lens = Series(data)
+    lens_max = lens.max()
+    not_max = lens[lens != lens_max]
+
+    empty = [""]
+    for ind, length in not_max.items():
+        body[ind] += empty * (lens_max - length)
+
+
+def _data_to_frame(**kwargs):
+
+    def clean_number_string(cell_value):
+        # Collapse whitespace inside numeric strings, e.g. "1 234.56" ->
+        # "1234.56"; values containing other characters are returned unchanged.
+        allowed_chars = set("0123456789.,\n\t+- ")
+        if cell_value is None or cell_value == '':
+            return cell_value
+        if any(char not in allowed_chars for char in cell_value):
+            return cell_value
+
+        # Remove all whitespace characters
+        cleaned = ''.join(char for char in cell_value if char not in " \t\n")
+        # Drop thousands separators
+        cleaned = cleaned.replace(',', '')
+        return cleaned
+
+    def convert_to_float(cleaned_value):
+        # Currently unused helper: try to convert a cleaned string to a float.
+        try:
+            number = float(cleaned_value)
+            return number
+        except ValueError:
+            # If the conversion fails, log it and return None
+            logger.error(f"Cannot convert '{cleaned_value}' to float")
+            return None
+
+    head, body, foot = kwargs.pop("data")
+    header = kwargs.pop("header")
+    kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"])
+    # Pop the custom flags so they are not forwarded to TextParser below.
+    number_strip = kwargs.pop('number_strip', None)
+    kwargs.pop('colspan_single', None)
+    kwargs.pop('rowspan_single', None)
+    if head:
+        body = head + body
+
+        # Infer header when there is a <thead> or top <th>-only rows
+        if header is None:
+            if len(head) == 1:
+                header = 0
+            else:
+                # ignore all-empty-text rows
+                header = [i for i, row in enumerate(head) if any(text for text in row)]
+
+    if foot:
+        body += foot
+
+    # fill out elements of body that are "ragged"
+    _expand_elements(body)
+    # If number_strip is True, strip whitespace before, after and inside numbers
+    if number_strip:
+        for i in range(len(body)):
+            for j in range(len(body[i])):
+                body[i][j] = clean_number_string(body[i][j])
+    with TextParser(body, header=header, **kwargs) as tp:
+        return tp.read()
+
+
+_valid_parsers = {
+    "lxml": _LxmlFrameParser,
+    None: _LxmlFrameParser,
+    "html5lib": _BeautifulSoupHtml5LibFrameParser,
+    "bs4": _BeautifulSoupHtml5LibFrameParser,
+}
+
+
+def _parser_dispatch(flavor: HTMLFlavors | None) -> type[_HtmlFrameParser]:
+    """
+    Choose the parser based on the input flavor.
+
+    Parameters
+    ----------
+    flavor : {{"lxml", "html5lib", "bs4"}} or None
+        The type of parser to use. This must be a valid backend.
+
+    Returns
+    -------
+    cls : _HtmlFrameParser subclass
+        The parser class based on the requested input flavor.
+
+    Raises
+    ------
+    ValueError
+        * If `flavor` is not a valid backend.
+    ImportError
+        * If you do not have the requested `flavor`
+    """
+    valid_parsers = list(_valid_parsers.keys())
+    if flavor not in valid_parsers:
+        raise ValueError(
+            f"{repr(flavor)} is not a valid flavor, valid flavors are {valid_parsers}"
+        )
+
+    if flavor in ("bs4", "html5lib"):
+        import_optional_dependency("html5lib")
+        import_optional_dependency("bs4")
+    else:
+        import_optional_dependency("lxml.etree")
+    return _valid_parsers[flavor]
+
+
+def _print_as_set(s) -> str:
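+    # e.g. _print_as_set({"bs4"}) -> "{bs4}"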
+    arg = ", ".join([pprint_thing(el) for el in s])
+    return f"{{{arg}}}"
+
+
+def _validate_flavor(flavor):
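+    """
+    Normalise ``flavor`` to a tuple of valid parser names.
+
+    A doctest-style sketch:
+
+    >>> _validate_flavor(None)
+    ('lxml', 'bs4')
+    >>> _validate_flavor("bs4")
+    ('bs4',)
+    """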
+    if flavor is None:
+        flavor = "lxml", "bs4"
+    elif isinstance(flavor, str):
+        flavor = (flavor,)
+    elif isinstance(flavor, abc.Iterable):
+        if not all(isinstance(flav, str) for flav in flavor):
+            raise TypeError(
+                f"Object of type {repr(type(flavor).__name__)} "
+                f"is not an iterable of strings"
+            )
+    else:
+        msg = repr(flavor) if isinstance(flavor, str) else str(flavor)
+        msg += " is not a valid flavor"
+        raise ValueError(msg)
+
+    flavor = tuple(flavor)
+    valid_flavors = set(_valid_parsers)
+    flavor_set = set(flavor)
+
+    if not flavor_set & valid_flavors:
+        raise ValueError(
+            f"{_print_as_set(flavor_set)} is not a valid set of flavors, valid "
+            f"flavors are {_print_as_set(valid_flavors)}"
+        )
+    return flavor
+
+
+def _parse(
+    flavor,
+    io,
+    match,
+    attrs,
+    encoding,
+    displayed_only,
+    extract_links,
+    storage_options,
+    custom_args,
+    **kwargs,
+):
+    flavor = _validate_flavor(flavor)
+    compiled_match = re.compile(match)  # you can pass a compiled regex here
+
+    retained = None
+    for flav in flavor:
+        parser = _parser_dispatch(flav)
+        p = parser(
+            io,
+            compiled_match,
+            attrs,
+            encoding,
+            displayed_only,
+            extract_links,
+            storage_options,
+        )
+
+        try:
+            tables = p.parse_tables(custom_args)
+        except ValueError as caught:
+            # if `io` is an io-like object, check if it's seekable
+            # and try to rewind it before trying the next parser
+            if hasattr(io, "seekable") and io.seekable():
+                io.seek(0)
+            elif hasattr(io, "seekable") and not io.seekable():
+                # if we couldn't rewind it, let the user know
+                raise ValueError(
+                    f"The flavor {flav} failed to parse your input. "
+                    "Since you passed a non-rewindable file "
+                    "object, we can't rewind it to try "
+                    "another parser. Try read_html() with a different flavor."
+                ) from caught
+
+            retained = caught
+        else:
+            break
+    else:
+        assert retained is not None  # for mypy
+        raise retained
+
+    ret = []
+    for table in tables:
+        try:
+            df = _data_to_frame(
+                data=table,
+                colspan_single=custom_args.get('colspan_single'),
+                rowspan_single=custom_args.get('rowspan_single'),
+                number_strip=custom_args.get('number_strip'),
+                **kwargs,
+            )
+            # Cast MultiIndex header to an Index of tuples when extracting header
+            # links and replace nan with None (therefore can't use mi.to_flat_index()).
+            # This maintains consistency of selection (e.g. df.columns.str[1])
+            if extract_links in ("all", "header") and isinstance(
+                df.columns, MultiIndex
+            ):
+                df.columns = Index(
+                    ((col[0], None if isna(col[1]) else col[1]) for col in df.columns),
+                    tupleize_cols=False,
+                )
+
+            ret.append(df)
+        except EmptyDataError:  # empty table
+            continue
+    return ret
+
+
+@doc(storage_options=_shared_docs["storage_options"])
+def read_html_zhch(
+    io: FilePath | ReadBuffer[str],
+    # The bare * below makes every following parameter keyword-only.
+    *,
+    match: str | Pattern = ".+",
+    flavor: HTMLFlavors | Sequence[HTMLFlavors] | None = None,
+    header: int | Sequence[int] | None = None,
+    index_col: int | Sequence[int] | None = None,
+    skiprows: int | Sequence[int] | slice | None = None,
+    attrs: dict[str, str] | None = None,
+    parse_dates: bool = False,
+    thousands: str | None = ",",
+    encoding: str | None = None,
+    decimal: str = ".",
+    converters: dict | None = None,
+    na_values: Iterable[object] | None = None,
+    keep_default_na: bool = True,
+    displayed_only: bool = True,
+    extract_links: Literal[None, "header", "footer", "body", "all"] = None,
+    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
+    storage_options: StorageOptions = None,
+    custom_args: dict | None = None,
+) -> list[DataFrame]:
+ r"""
+    Read HTML tables into a ``list`` of ``DataFrame`` objects.
+
+    Parameters
+    ----------
+    io : str, path object, or file-like object
+        String, path object (implementing ``os.PathLike[str]``), or file-like
+        object implementing a string ``read()`` function.
+        The string can represent a URL or the HTML itself. Note that
+        lxml only accepts the http, ftp and file url protocols. If you have a
+        URL that starts with ``'https'`` you might try removing the ``'s'``.
+
+        .. deprecated:: 2.1.0
+            Passing html literal strings is deprecated.
+            Wrap literal string/bytes input in ``io.StringIO``/``io.BytesIO`` instead.
+
+    match : str or compiled regular expression, optional
+        The set of tables containing text matching this regex or string will be
+        returned. Unless the HTML is extremely simple you will probably need to
+        pass a non-empty string here. Defaults to '.+' (match any non-empty
+        string). The default value will return all tables contained on a page.
+        This value is converted to a regular expression so that there is
+        consistent behavior between Beautiful Soup and lxml.
+
+    flavor : {{"lxml", "html5lib", "bs4"}} or list-like, optional
+        The parsing engine (or list of parsing engines) to use. 'bs4' and
+        'html5lib' are synonymous with each other, they are both there for
+        backwards compatibility. The default of ``None`` tries to use ``lxml``
+        to parse and if that fails it falls back on ``bs4`` + ``html5lib``.
+
+    header : int or list-like, optional
+        The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
+        make the columns headers.
+
+    index_col : int or list-like, optional
+        The column (or list of columns) to use to create the index.
+
+    skiprows : int, list-like or slice, optional
+        Number of rows to skip after parsing the column integer. 0-based. If a
+        sequence of integers or a slice is given, will skip the rows indexed by
+        that sequence. Note that a single element sequence means 'skip the nth
+        row' whereas an integer means 'skip n rows'.
+
+    attrs : dict, optional
+        This is a dictionary of attributes that you can pass to use to identify
+        the table in the HTML. These are not checked for validity before being
+        passed to lxml or Beautiful Soup. However, these attributes must be
+        valid HTML table attributes to work correctly. For example, ::
+
+            attrs = {{'id': 'table'}}
+
+        is a valid attribute dictionary because the 'id' HTML tag attribute is
+        a valid HTML attribute for *any* HTML tag as per `this document
+        <https://html.spec.whatwg.org/multipage/dom.html#global-attributes>`__. ::
+
+            attrs = {{'asdf': 'table'}}
+
+        is *not* a valid attribute dictionary because 'asdf' is not a valid
+        HTML attribute even if it is a valid XML attribute. Valid HTML 4.01
+        table attributes can be found `here
+        <http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2>`__. A
+        working draft of the HTML 5 spec can be found `here
+        <https://html.spec.whatwg.org/multipage/tables.html>`__. It contains the
+        latest information on table attributes for the modern web.
+
+    parse_dates : bool, optional
+        See :func:`~read_csv` for more details.
+
+    thousands : str, optional
+        Separator to use to parse thousands. Defaults to ``','``.
+
+    encoding : str, optional
+        The encoding used to decode the web page. Defaults to ``None``. ``None``
+        preserves the previous encoding behavior, which depends on the
+        underlying parser library (e.g., the parser library will try to use
+        the encoding provided by the document).
+
+    decimal : str, default '.'
+        Character to recognize as decimal point (e.g. use ',' for European
+        data).
+
+    converters : dict, default None
+        Dict of functions for converting values in certain columns. Keys can
+        either be integers or column labels, values are functions that take one
+        input argument, the cell (not column) content, and return the
+        transformed content.
+
+    na_values : iterable, default None
+        Custom NA values.
+
+    keep_default_na : bool, default True
+        If na_values are specified and keep_default_na is False the default NaN
+        values are overridden, otherwise they're appended to.
+
+    displayed_only : bool, default True
+        Whether elements with "display: none" should be parsed.
+
+    extract_links : {{None, "all", "header", "body", "footer"}}
+        Table elements in the specified section(s) with <a> tags will have their
+        href extracted.
+
+        .. versionadded:: 1.5.0
+
+    dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
+        Back-end data type applied to the resultant :class:`DataFrame`
+        (still experimental). Behaviour is as follows:
+
+        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
+          (default).
+        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
+          DataFrame.
+
+        .. versionadded:: 2.0
+
+    {storage_options}
+
+        .. versionadded:: 2.1.0
+
+    custom_args : dict, optional
+        Extra behaviour flags for this modified reader: ``colspan_single`` and
+        ``rowspan_single`` (each a section name or collection of section names
+        among ``"header"``, ``"body"``, ``"footer"``) suppress the duplication
+        of ``colspan``/``rowspan`` cell text in the given sections, and
+        ``number_strip`` (bool) strips whitespace inside numeric cell values.
+
+    Returns
+    -------
+    dfs
+        A list of DataFrames.
+
+    See Also
+    --------
+    read_csv : Read a comma-separated values (csv) file into DataFrame.
+
+    Notes
+    -----
+    Before using this function you should read the :ref:`gotchas about the
+    HTML parsing libraries <io.html.gotchas>`.
+
+    Expect to do some cleanup after you call this function. For example, you
+    might need to manually assign column names if the column names are
+    converted to NaN when you pass the `header=0` argument. We try to assume as
+    little as possible about the structure of the table and push the
+    idiosyncrasies of the HTML contained in the table to the user.
+
+    This function searches for ``<table>`` elements and only for ``<tr>``
+    and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>``
+    element in the table. ``<td>`` stands for "table data". This function
+    attempts to properly handle ``colspan`` and ``rowspan`` attributes.
+    If the table has a ``<thead>``, it is used to construct
+    the header, otherwise the function attempts to find the header within
+    the body (by putting rows with only ``<th>`` elements into the header).
+
+    Similar to :func:`~read_csv` the `header` argument is applied
+    **after** `skiprows` is applied.
+
+    This function will *always* return a list of :class:`DataFrame` *or*
+    it will fail, e.g., it will *not* return an empty list.
+
+    Examples
+    --------
+    See the :ref:`read_html documentation in the IO section of the docs
+    <io.read_html>` for some examples of reading in HTML tables.
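+
+    A minimal sketch of the ``custom_args`` extension (hypothetical HTML,
+    for illustration only)::
+
+        from io import StringIO
+
+        html = "<table><tr><td colspan='2'>1 234</td></tr></table>"
+        dfs = read_html_zhch(
+            StringIO(html),
+            custom_args={"colspan_single": ["body"], "number_strip": True},
+        )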
+ """
+    # Type check here. We don't want to parse only to fail because of an
+    # invalid value of an integer skiprows.
+    if isinstance(skiprows, numbers.Integral) and skiprows < 0:
+        raise ValueError(
+            "cannot skip rows starting from the end of the "
+            "data (you passed a negative value)"
+        )
+    if extract_links not in [None, "header", "footer", "body", "all"]:
+        raise ValueError(
+            "`extract_links` must be one of "
+            '{None, "header", "footer", "body", "all"}, got '
+            f'"{extract_links}"'
+        )
+
+    validate_header_arg(header)
+    check_dtype_backend(dtype_backend)
+
+    # Avoid the mutable-default pitfall: ``custom_args`` defaults to None.
+    if custom_args is None:
+        custom_args = {}
+
+    io = stringify_path(io)
+
+    if isinstance(io, str) and not any(
+        [
+            is_file_like(io),
+            file_exists(io),
+            is_url(io),
+            is_fsspec_url(io),
+        ]
+    ):
+        warnings.warn(
+            "Passing literal html to 'read_html' is deprecated and "
+            "will be removed in a future version. To read from a "
+            "literal string, wrap it in a 'StringIO' object.",
+            FutureWarning,
+            stacklevel=find_stack_level(),
+        )
+
+    return _parse(
+        flavor=flavor,
+        io=io,
+        match=match,
+        header=header,
+        index_col=index_col,
+        skiprows=skiprows,
+        parse_dates=parse_dates,
+        thousands=thousands,
+        attrs=attrs,
+        encoding=encoding,
+        decimal=decimal,
+        converters=converters,
+        na_values=na_values,
+        keep_default_na=keep_default_na,
+        displayed_only=displayed_only,
+        extract_links=extract_links,
+        dtype_backend=dtype_backend,
+        storage_options=storage_options,
+        custom_args=custom_args,
+    )