1 сар өмнө · b5202f629c
--- a/zhch/download_models.py
+++ b/zhch/download_models.py
@@ -1,63 +0,0 @@
 
				-import json
			
 
				-import os
			
 
				-
			
 
				-import requests
			
 
				-from modelscope import snapshot_download
			
 
				-
			
 
				-from dotenv import load_dotenv; load_dotenv()
			
 
				-print(f"os.environ['CUDA_VISIBLE_DEVICES']: {os.environ['CUDA_VISIBLE_DEVICES']}") 
			
 
				-print(f"os.environ['HF_ENDPOINT']: {os.environ['HF_ENDPOINT']}") 
			
 
				-
			
 
				-def download_json(url):
			
 
				-    # 下载JSON文件
			
 
				-    response = requests.get(url)
			
 
				-    response.raise_for_status()  # 检查请求是否成功
			
 
				-    return response.json()
			
 
				-
			
 
				-
			
 
				-def download_and_modify_json(url, local_filename, modifications):
			
 
				-    if os.path.exists(local_filename):
			
 
				-        data = json.load(open(local_filename))
			
 
				-        config_version = data.get('config_version', '0.0.0')
			
 
				-        if config_version < '1.0.0':
			
 
				-            data = download_json(url)
			
 
				-    else:
			
 
				-        data = download_json(url)
			
 
				-
			
 
				-    # 修改内容
			
 
				-    for key, value in modifications.items():
			
 
				-        data[key] = value
			
 
				-
			
 
				-    # 保存修改后的内容
			
 
				-    with open(local_filename, 'w', encoding='utf-8') as f:
			
 
				-        json.dump(data, f, ensure_ascii=False, indent=4)
			
 
				-
			
 
				-
			
 
				-if __name__ == '__main__':
			
 
				-    mineru_patterns = [
			
 
				-        "models/Layout/LayoutLMv3/*",
			
 
				-        "models/Layout/YOLO/*",
			
 
				-        "models/MFD/YOLO/*",
			
 
				-        "models/MFR/unimernet_small/*",
			
 
				-        "models/TabRec/TableMaster/*",
			
 
				-        "models/TabRec/StructEqTable/*",
			
 
				-    ]
			
 
				-    model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_file_pattern=mineru_patterns)
			
 
				-    layoutreader_model_dir = snapshot_download('ppaanngggg/layoutreader')
			
 
				-    model_dir = model_dir + '/models'
			
 
				-    print(f'model_dir is: {model_dir}')
			
 
				-    print(f'layoutreader_model_dir is: {layoutreader_model_dir}')
			
 
				-
			
 
				-    json_url = 'https://gitee.com/myhloli/MinerU/raw/master/magic-pdf.template.json'
			
 
				-    config_file_name = 'magic-pdf.json'
			
 
				-    # home_dir = os.path.expanduser('~')
			
 
				-    home_dir = os.path.dirname(os.path.abspath(__file__))
			
 
				-    config_file = os.path.join(home_dir, config_file_name)
			
 
				-
			
 
				-    json_mods = {
			
 
				-        'models-dir': model_dir,
			
 
				-        'layoutreader-model-dir': layoutreader_model_dir,
			
 
				-    }
			
 
				-
			
 
				-    download_and_modify_json(json_url, config_file, json_mods)
			
 
				-    print(f'The configuration file has been configured successfully, the path is: {config_file}')
			
--- a/zhch/html_zhch.py
+++ b/zhch/html_zhch.py
@@ -1,1318 +0,0 @@
 
				-from __future__ import annotations
			
 
				-from typing import Sequence, Iterable
			
 
				-# from pandas._typing import FilePath, BaseBuffer, StorageOptions
			
 
				-from pandas._typing import *
			
 
				-from loguru import logger
			
 
				-
			
 
				-"""
			
 
				-对pandas.io.html的修改
			
 
				-增加属性 **kwargs
			
 
				-colspan_single = kwargs.get('colspan_single') # 单个单元格的colspan=Literal[None, "header", "footer", "body"]
			
 
				-rowspan_single = kwargs.get('rowspan_single') # 单个单元格的rowspan=Literal[None, "header", "footer", "body"]
			
 
				-number_strip = kwargs.get('number_strip') # 是否去除单个cell中数字中空格 = bool
			
 
				-"""
			
 
				-
			
 
				-"""
			
 
				-:mod:`pandas.io.html` is a module containing functionality for dealing with
			
 
				-HTML IO.
			
 
				-
			
 
				-"""
			
 
				-
			
 
				-from collections import abc
			
 
				-import numbers
			
 
				-import re
			
 
				-from re import Pattern
			
 
				-from typing import (
			
 
				-    TYPE_CHECKING,
			
 
				-    Literal,
			
 
				-    cast,
			
 
				-)
			
 
				-import warnings
			
 
				-
			
 
				-from pandas._libs import lib
			
 
				-from pandas.compat._optional import import_optional_dependency
			
 
				-from pandas.errors import (
			
 
				-    AbstractMethodError,
			
 
				-    EmptyDataError,
			
 
				-)
			
 
				-from pandas.util._decorators import doc
			
 
				-from pandas.util._exceptions import find_stack_level
			
 
				-from pandas.util._validators import check_dtype_backend
			
 
				-
			
 
				-from pandas.core.dtypes.common import is_list_like
			
 
				-
			
 
				-from pandas import isna
			
 
				-from pandas.core.indexes.base import Index
			
 
				-from pandas.core.indexes.multi import MultiIndex
			
 
				-from pandas.core.series import Series
			
 
				-from pandas.core.shared_docs import _shared_docs
			
 
				-
			
 
				-from pandas.io.common import (
			
 
				-    file_exists,
			
 
				-    get_handle,
			
 
				-    is_file_like,
			
 
				-    is_fsspec_url,
			
 
				-    is_url,
			
 
				-    stringify_path,
			
 
				-    validate_header_arg,
			
 
				-)
			
 
				-from pandas.io.formats.printing import pprint_thing
			
 
				-from pandas.io.parsers import TextParser
			
 
				-
			
 
				-if TYPE_CHECKING:
			
 
				-    from collections.abc import (
			
 
				-        Iterable,
			
 
				-        Sequence,
			
 
				-    )
			
 
				-
			
 
				-    from pandas._typing import (
			
 
				-        BaseBuffer,
			
 
				-        DtypeBackend,
			
 
				-        FilePath,
			
 
				-        HTMLFlavors,
			
 
				-        ReadBuffer,
			
 
				-        StorageOptions,
			
 
				-    )
			
 
				-
			
 
				-    from pandas import DataFrame
			
 
				-
			
 
				-#############
			
 
				-# READ HTML #
			
 
				-#############
			
 
				-_RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}")
			
 
				-
			
 
				-
			
 
				-def _remove_whitespace(s: str, regex: Pattern = _RE_WHITESPACE) -> str:
			
 
				-    """
			
 
				-    Replace extra whitespace inside of a string with a single space.
			
 
				-
			
 
				-    Parameters
			
 
				-    ----------
			
 
				-    s : str or unicode
			
 
				-        The string from which to remove extra whitespace.
			
 
				-    regex : re.Pattern
			
 
				-        The regular expression to use to remove extra whitespace.
			
 
				-
			
 
				-    Returns
			
 
				-    -------
			
 
				-    subd : str or unicode
			
 
				-        `s` with all extra whitespace replaced with a single space.
			
 
				-    """
			
 
				-    return regex.sub(" ", s.strip())
			
 
				-
			
 
				-
			
 
				-def _get_skiprows(skiprows: int | Sequence[int] | slice | None) -> int | Sequence[int]:
			
 
				-    """
			
 
				-    Get an iterator given an integer, slice or container.
			
 
				-
			
 
				-    Parameters
			
 
				-    ----------
			
 
				-    skiprows : int, slice, container
			
 
				-        The iterator to use to skip rows; can also be a slice.
			
 
				-
			
 
				-    Raises
			
 
				-    ------
			
 
				-    TypeError
			
 
				-        * If `skiprows` is not a slice, integer, or Container
			
 
				-
			
 
				-    Returns
			
 
				-    -------
			
 
				-    it : iterable
			
 
				-        A proper iterator to use to skip rows of a DataFrame.
			
 
				-    """
			
 
				-    if isinstance(skiprows, slice):
			
 
				-        start, step = skiprows.start or 0, skiprows.step or 1
			
 
				-        return list(range(start, skiprows.stop, step))
			
 
				-    elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows):
			
 
				-        return cast("int | Sequence[int]", skiprows)
			
 
				-    elif skiprows is None:
			
 
				-        return 0
			
 
				-    raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows")
			
 
				-
			
 
				-
			
 
				-def _read(
			
 
				-    obj: FilePath | BaseBuffer,
			
 
				-    encoding: str | None,
			
 
				-    storage_options: StorageOptions | None,
			
 
				-) -> str | bytes:
			
 
				-    """
			
 
				-    Try to read from a url, file or string.
			
 
				-
			
 
				-    Parameters
			
 
				-    ----------
			
 
				-    obj : str, unicode, path object, or file-like object
			
 
				-
			
 
				-    Returns
			
 
				-    -------
			
 
				-    raw_text : str
			
 
				-    """
			
 
				-    text: str | bytes
			
 
				-    if (
			
 
				-        is_url(obj)
			
 
				-        or hasattr(obj, "read")
			
 
				-        or (isinstance(obj, str) and file_exists(obj))
			
 
				-    ):
			
 
				-        with get_handle(
			
 
				-            obj, "r", encoding=encoding, storage_options=storage_options
			
 
				-        ) as handles:
			
 
				-            text = handles.handle.read()
			
 
				-    elif isinstance(obj, (str, bytes)):
			
 
				-        text = obj
			
 
				-    else:
			
 
				-        raise TypeError(f"Cannot read object of type '{type(obj).__name__}'")
			
 
				-    return text
			
 
				-
			
 
				-
			
 
				-class _HtmlFrameParser:
			
 
				-    """
			
 
				-    Base class for parsers that parse HTML into DataFrames.
			
 
				-
			
 
				-    Parameters
			
 
				-    ----------
			
 
				-    io : str or file-like
			
 
				-        This can be either a string of raw HTML, a valid URL using the HTTP,
			
 
				-        FTP, or FILE protocols or a file-like object.
			
 
				-
			
 
				-    match : str or regex
			
 
				-        The text to match in the document.
			
 
				-
			
 
				-    attrs : dict
			
 
				-        List of HTML <table> element attributes to match.
			
 
				-
			
 
				-    encoding : str
			
 
				-        Encoding to be used by parser
			
 
				-
			
 
				-    displayed_only : bool
			
 
				-        Whether or not items with "display:none" should be ignored
			
 
				-
			
 
				-    extract_links : {None, "all", "header", "body", "footer"}
			
 
				-        Table elements in the specified section(s) with <a> tags will have their
			
 
				-        href extracted.
			
 
				-
			
 
				-        .. versionadded:: 1.5.0
			
 
				-
			
 
				-    Attributes
			
 
				-    ----------
			
 
				-    io : str or file-like
			
 
				-        raw HTML, URL, or file-like object
			
 
				-
			
 
				-    match : regex
			
 
				-        The text to match in the raw HTML
			
 
				-
			
 
				-    attrs : dict-like
			
 
				-        A dictionary of valid table attributes to use to search for table
			
 
				-        elements.
			
 
				-
			
 
				-    encoding : str
			
 
				-        Encoding to be used by parser
			
 
				-
			
 
				-    displayed_only : bool
			
 
				-        Whether or not items with "display:none" should be ignored
			
 
				-
			
 
				-    extract_links : {None, "all", "header", "body", "footer"}
			
 
				-        Table elements in the specified section(s) with <a> tags will have their
			
 
				-        href extracted.
			
 
				-
			
 
				-        .. versionadded:: 1.5.0
			
 
				-
			
 
				-    Notes
			
 
				-    -----
			
 
				-    To subclass this class effectively you must override the following methods:
			
 
				-        * :func:`_build_doc`
			
 
				-        * :func:`_attr_getter`
			
 
				-        * :func:`_href_getter`
			
 
				-        * :func:`_text_getter`
			
 
				-        * :func:`_parse_td`
			
 
				-        * :func:`_parse_thead_tr`
			
 
				-        * :func:`_parse_tbody_tr`
			
 
				-        * :func:`_parse_tfoot_tr`
			
 
				-        * :func:`_parse_tables`
			
 
				-        * :func:`_equals_tag`
			
 
				-    See each method's respective documentation for details on their
			
 
				-    functionality.
			
 
				-    """
			
 
				-
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        io: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
			
 
				-        match: str | Pattern,
			
 
				-        attrs: dict[str, str] | None,
			
 
				-        encoding: str,
			
 
				-        displayed_only: bool,
			
 
				-        extract_links: Literal[None, "header", "footer", "body", "all"],
			
 
				-        storage_options: StorageOptions = None,
			
 
				-    ) -> None:
			
 
				-        self.io = io
			
 
				-        self.match = match
			
 
				-        self.attrs = attrs
			
 
				-        self.encoding = encoding
			
 
				-        self.displayed_only = displayed_only
			
 
				-        self.extract_links = extract_links
			
 
				-        self.storage_options = storage_options
			
 
				-
			
 
				-    def parse_tables(self, custom_args):
			
 
				-        """
			
 
				-        Parse and return all tables from the DOM.
			
 
				-
			
 
				-        Returns
			
 
				-        -------
			
 
				-        list of parsed (header, body, footer) tuples from tables.
			
 
				-        """
			
 
				-        tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
			
 
				-        return (self._parse_thead_tbody_tfoot(table, custom_args) for table in tables)
			
 
				-
			
 
				-    def _attr_getter(self, obj, attr):
			
 
				-        """
			
 
				-        Return the attribute value of an individual DOM node.
			
 
				-
			
 
				-        Parameters
			
 
				-        ----------
			
 
				-        obj : node-like
			
 
				-            A DOM node.
			
 
				-
			
 
				-        attr : str or unicode
			
 
				-            The attribute, such as "colspan"
			
 
				-
			
 
				-        Returns
			
 
				-        -------
			
 
				-        str or unicode
			
 
				-            The attribute value.
			
 
				-        """
			
 
				-        # Both lxml and BeautifulSoup have the same implementation:
			
 
				-        return obj.get(attr)
			
 
				-
			
 
				-    def _href_getter(self, obj) -> str | None:
			
 
				-        """
			
 
				-        Return a href if the DOM node contains a child <a> or None.
			
 
				-
			
 
				-        Parameters
			
 
				-        ----------
			
 
				-        obj : node-like
			
 
				-            A DOM node.
			
 
				-
			
 
				-        Returns
			
 
				-        -------
			
 
				-        href : str or unicode
			
 
				-            The href from the <a> child of the DOM node.
			
 
				-        """
			
 
				-        raise AbstractMethodError(self)
			
 
				-
			
 
				-    def _text_getter(self, obj):
			
 
				-        """
			
 
				-        Return the text of an individual DOM node.
			
 
				-
			
 
				-        Parameters
			
 
				-        ----------
			
 
				-        obj : node-like
			
 
				-            A DOM node.
			
 
				-
			
 
				-        Returns
			
 
				-        -------
			
 
				-        text : str or unicode
			
 
				-            The text from an individual DOM node.
			
 
				-        """
			
 
				-        raise AbstractMethodError(self)
			
 
				-
			
 
				-    def _parse_td(self, obj):
			
 
				-        """
			
 
				-        Return the td elements from a row element.
			
 
				-
			
 
				-        Parameters
			
 
				-        ----------
			
 
				-        obj : node-like
			
 
				-            A DOM <tr> node.
			
 
				-
			
 
				-        Returns
			
 
				-        -------
			
 
				-        list of node-like
			
 
				-            These are the elements of each row, i.e., the columns.
			
 
				-        """
			
 
				-        raise AbstractMethodError(self)
			
 
				-
			
 
				-    def _parse_thead_tr(self, table):
			
 
				-        """
			
 
				-        Return the list of thead row elements from the parsed table element.
			
 
				-
			
 
				-        Parameters
			
 
				-        ----------
			
 
				-        table : a table element that contains zero or more thead elements.
			
 
				-
			
 
				-        Returns
			
 
				-        -------
			
 
				-        list of node-like
			
 
				-            These are the <tr> row elements of a table.
			
 
				-        """
			
 
				-        raise AbstractMethodError(self)
			
 
				-
			
 
				-    def _parse_tbody_tr(self, table):
			
 
				-        """
			
 
				-        Return the list of tbody row elements from the parsed table element.
			
 
				-
			
 
				-        HTML5 table bodies consist of either 0 or more <tbody> elements (which
			
 
				-        only contain <tr> elements) or 0 or more <tr> elements. This method
			
 
				-        checks for both structures.
			
 
				-
			
 
				-        Parameters
			
 
				-        ----------
			
 
				-        table : a table element that contains row elements.
			
 
				-
			
 
				-        Returns
			
 
				-        -------
			
 
				-        list of node-like
			
 
				-            These are the <tr> row elements of a table.
			
 
				-        """
			
 
				-        raise AbstractMethodError(self)
			
 
				-
			
 
				-    def _parse_tfoot_tr(self, table):
			
 
				-        """
			
 
				-        Return the list of tfoot row elements from the parsed table element.
			
 
				-
			
 
				-        Parameters
			
 
				-        ----------
			
 
				-        table : a table element that contains row elements.
			
 
				-
			
 
				-        Returns
			
 
				-        -------
			
 
				-        list of node-like
			
 
				-            These are the <tr> row elements of a table.
			
 
				-        """
			
 
				-        raise AbstractMethodError(self)
			
 
				-
			
 
				-    def _parse_tables(self, document, match, attrs):
			
 
				-        """
			
 
				-        Return all tables from the parsed DOM.
			
 
				-
			
 
				-        Parameters
			
 
				-        ----------
			
 
				-        document : the DOM from which to parse the table element.
			
 
				-
			
 
				-        match : str or regular expression
			
 
				-            The text to search for in the DOM tree.
			
 
				-
			
 
				-        attrs : dict
			
 
				-            A dictionary of table attributes that can be used to disambiguate
			
 
				-            multiple tables on a page.
			
 
				-
			
 
				-        Raises
			
 
				-        ------
			
 
				-        ValueError : `match` does not match any text in the document.
			
 
				-
			
 
				-        Returns
			
 
				-        -------
			
 
				-        list of node-like
			
 
				-            HTML <table> elements to be parsed into raw data.
			
 
				-        """
			
 
				-        raise AbstractMethodError(self)
			
 
				-
			
 
				-    def _equals_tag(self, obj, tag) -> bool:
			
 
				-        """
			
 
				-        Return whether an individual DOM node matches a tag
			
 
				-
			
 
				-        Parameters
			
 
				-        ----------
			
 
				-        obj : node-like
			
 
				-            A DOM node.
			
 
				-
			
 
				-        tag : str
			
 
				-            Tag name to be checked for equality.
			
 
				-
			
 
				-        Returns
			
 
				-        -------
			
 
				-        boolean
			
 
				-            Whether `obj`'s tag name is `tag`
			
 
				-        """
			
 
				-        raise AbstractMethodError(self)
			
 
				-
			
 
				-    def _build_doc(self):
			
 
				-        """
			
 
				-        Return a tree-like object that can be used to iterate over the DOM.
			
 
				-
			
 
				-        Returns
			
 
				-        -------
			
 
				-        node-like
			
 
				-            The DOM from which to parse the table element.
			
 
				-        """
			
 
				-        raise AbstractMethodError(self)
			
 
				-
			
 
				-    def _parse_thead_tbody_tfoot(self, table_html, custom_args):
			
 
				-        """
			
 
				-        Given a table, return parsed header, body, and foot.
			
 
				-
			
 
				-        Parameters
			
 
				-        ----------
			
 
				-        table_html : node-like
			
 
				-
			
 
				-        Returns
			
 
				-        -------
			
 
				-        tuple of (header, body, footer), each a list of list-of-text rows.
			
 
				-
			
 
				-        Notes
			
 
				-        -----
			
 
				-        Header and body are lists-of-lists. Top level list is a list of
			
 
				-        rows. Each row is a list of str text.
			
 
				-
			
 
				-        Logic: Use <thead>, <tbody>, <tfoot> elements to identify
			
 
				-               header, body, and footer, otherwise:
			
 
				-               - Put all rows into body
			
 
				-               - Move rows from top of body to header only if
			
 
				-                 all elements inside row are <th>
			
 
				-               - Move rows from bottom of body to footer only if
			
 
				-                 all elements inside row are <th>
			
 
				-        """
			
 
				-        header_rows = self._parse_thead_tr(table_html)
			
 
				-        body_rows = self._parse_tbody_tr(table_html)
			
 
				-        footer_rows = self._parse_tfoot_tr(table_html)
			
 
				-
			
 
				-        def row_is_all_th(row):
			
 
				-            return all(self._equals_tag(t, "th") for t in self._parse_td(row))
			
 
				-
			
 
				-        if not header_rows:
			
 
				-            # The table has no <thead>. Move the top all-<th> rows from
			
 
				-            # body_rows to header_rows. (This is a common case because many
			
 
				-            # tables in the wild have no <thead> or <tfoot>
			
 
				-            while body_rows and row_is_all_th(body_rows[0]):
			
 
				-                header_rows.append(body_rows.pop(0))
			
 
				-
			
 
				-        header = self._expand_colspan_rowspan(header_rows, section="header", custom_args=custom_args)
			
 
				-        body = self._expand_colspan_rowspan(body_rows, section="body", custom_args=custom_args)
			
 
				-        footer = self._expand_colspan_rowspan(footer_rows, section="footer", custom_args=custom_args)
			
 
				-
			
 
				-        return header, body, footer
			
 
				-
			
 
				-    def _expand_colspan_rowspan(
			
 
				-        self, rows, section: Literal["header", "footer", "body"], custom_args
			
 
				-    ):
			
 
				-        """
			
 
				-        Given a list of <tr>s, return a list of text rows.
			
 
				-
			
 
				-        Parameters
			
 
				-        ----------
			
 
				-        rows : list of node-like
			
 
				-            List of <tr>s
			
 
				-        section : the section that the rows belong to (header, body or footer).
			
 
				-
			
 
				-        Returns
			
 
				-        -------
			
 
				-        list of list
			
 
				-            Each returned row is a list of str text, or tuple (text, link)
			
 
				-            if extract_links is not None.
			
 
				-
			
 
				-        Notes
			
 
				-        -----
			
 
				-        Any cell with ``rowspan`` or ``colspan`` will have its contents copied
			
 
				-        to subsequent cells.
			
 
				-        """
			
 
				-        colspan_single = custom_args.get('colspan_single')
			
 
				-        rowspan_single = custom_args.get('rowspan_single')
			
 
				-        number_strip = custom_args.get('number_strip')
			
 
				-
			
 
				-        all_texts = []  # list of rows, each a list of str
			
 
				-        text: str | tuple
			
 
				-        remainder: list[
			
 
				-            tuple[int, str | tuple, int]
			
 
				-        ] = []  # list of (index, text, nrows)
			
 
				-
			
 
				-        for tr in rows:
			
 
				-            texts = []  # the output for this row
			
 
				-            next_remainder = []
			
 
				-
			
 
				-            index = 0
			
 
				-            tds = self._parse_td(tr)
			
 
				-            for td in tds:
			
 
				-                # Append texts from previous rows with rowspan>1 that come
			
 
				-                # before this <td>
			
 
				-                while remainder and remainder[0][0] <= index:
			
 
				-                    prev_i, prev_text, prev_rowspan = remainder.pop(0)
			
 
				-                    texts.append(prev_text)
			
 
				-                    if prev_rowspan > 1:
			
 
				-                        next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
			
 
				-                    index += 1
			
 
				-
			
 
				-                # Append the text from this <td>, colspan times
			
 
				-                text = _remove_whitespace(self._text_getter(td))
			
 
				-                if self.extract_links in ("all", section):
			
 
				-                    href = self._href_getter(td)
			
 
				-                    text = (text, href)
			
 
				-                rowspan = int(self._attr_getter(td, "rowspan") or 1)
			
 
				-                colspan = int(self._attr_getter(td, "colspan") or 1)
			
 
				-
			
 
				-                for i in range(colspan):
			
 
				-                    if i == 0:
			
 
				-                        texts.append(text)
			
 
				-                    elif section in colspan_single:
			
 
				-                        texts.append('')
			
 
				-                    else:
			
 
				-                        texts.append(text)
			
 
				-                    
			
 
				-                    if rowspan > 1 and section not in rowspan_single:
			
 
				-                        next_remainder.append((index, text, rowspan - 1))
			
 
				-                    index += 1
			
 
				-
			
 
				-            # Append texts from previous rows at the final position
			
 
				-            for prev_i, prev_text, prev_rowspan in remainder:
			
 
				-                texts.append(prev_text)
			
 
				-                if prev_rowspan > 1:
			
 
				-                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
			
 
				-
			
 
				-            all_texts.append(texts)
			
 
				-            remainder = next_remainder
			
 
				-
			
 
				-        # Append rows that only appear because the previous row had non-1
			
 
				-        # rowspan
			
 
				-        while remainder:
			
 
				-            next_remainder = []
			
 
				-            texts = []
			
 
				-            for prev_i, prev_text, prev_rowspan in remainder:
			
 
				-                texts.append(prev_text)
			
 
				-                if prev_rowspan > 1:
			
 
				-                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
			
 
				-            all_texts.append(texts)
			
 
				-            remainder = next_remainder
			
 
				-
			
 
				-        return all_texts
			
 
				-
			
 
				-    def _handle_hidden_tables(self, tbl_list, attr_name: str):
			
 
				-        """
			
 
				-        Return list of tables, potentially removing hidden elements
			
 
				-
			
 
				-        Parameters
			
 
				-        ----------
			
 
				-        tbl_list : list of node-like
			
 
				-            Type of list elements will vary depending upon parser used
			
 
				-        attr_name : str
			
 
				-            Name of the accessor for retrieving HTML attributes
			
 
				-
			
 
				-        Returns
			
 
				-        -------
			
 
				-        list of node-like
			
 
				-            Return type matches `tbl_list`
			
 
				-        """
			
 
				-        if not self.displayed_only:
			
 
				-            return tbl_list
			
 
				-
			
 
				-        return [
			
 
				-            x
			
 
				-            for x in tbl_list
			
 
				-            if "display:none"
			
 
				-            not in getattr(x, attr_name).get("style", "").replace(" ", "")
			
 
				-        ]
			
 
				-
			
 
				-
			
 
				-class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
			
 
				-    """
			
 
				-    HTML to DataFrame parser that uses BeautifulSoup under the hood.
			
 
				-
			
 
				-    See Also
			
 
				-    --------
			
 
				-    pandas.io.html._HtmlFrameParser
			
 
				-    pandas.io.html._LxmlFrameParser
			
 
				-
			
 
				-    Notes
			
 
				-    -----
			
 
				-    Documentation strings for this class are in the base class
			
 
				-    :class:`pandas.io.html._HtmlFrameParser`.
			
 
				-    """
			
 
				-
			
 
				-    def _parse_tables(self, document, match, attrs):
			
 
				-        element_name = "table"
			
 
				-        tables = document.find_all(element_name, attrs=attrs)
			
 
				-        if not tables:
			
 
				-            raise ValueError("No tables found")
			
 
				-
			
 
				-        result = []
			
 
				-        unique_tables = set()
			
 
				-        tables = self._handle_hidden_tables(tables, "attrs")
			
 
				-
			
 
				-        for table in tables:
			
 
				-            if self.displayed_only:
			
 
				-                for elem in table.find_all("style"):
			
 
				-                    elem.decompose()
			
 
				-
			
 
				-                for elem in table.find_all(style=re.compile(r"display:\s*none")):
			
 
				-                    elem.decompose()
			
 
				-
			
 
				-            if table not in unique_tables and table.find(string=match) is not None:
			
 
				-                result.append(table)
			
 
				-            unique_tables.add(table)
			
 
				-        if not result:
			
 
				-            raise ValueError(f"No tables found matching pattern {repr(match.pattern)}")
			
 
				-        return result
			
 
				-
			
 
				-    def _href_getter(self, obj) -> str | None:
			
 
				-        a = obj.find("a", href=True)
			
 
				-        return None if not a else a["href"]
			
 
				-
			
 
				-    def _text_getter(self, obj):
			
 
				-        return obj.text
			
 
				-
			
 
				-    def _equals_tag(self, obj, tag) -> bool:
			
 
				-        return obj.name == tag
			
 
				-
			
 
				-    def _parse_td(self, row):
			
 
				-        return row.find_all(("td", "th"), recursive=False)
			
 
				-
			
 
				-    def _parse_thead_tr(self, table):
			
 
				-        return table.select("thead tr")
			
 
				-
			
 
				-    def _parse_tbody_tr(self, table):
			
 
				-        from_tbody = table.select("tbody tr")
			
 
				-        from_root = table.find_all("tr", recursive=False)
			
 
				-        # HTML spec: at most one of these lists has content
			
 
				-        return from_tbody + from_root
			
 
				-
			
 
				-    def _parse_tfoot_tr(self, table):
			
 
				-        return table.select("tfoot tr")
			
 
				-
			
 
				-    def _setup_build_doc(self):
			
 
				-        raw_text = _read(self.io, self.encoding, self.storage_options)
			
 
				-        if not raw_text:
			
 
				-            raise ValueError(f"No text parsed from document: {self.io}")
			
 
				-        return raw_text
			
 
				-
			
 
				-    def _build_doc(self):
			
 
				-        from bs4 import BeautifulSoup
			
 
				-
			
 
				-        bdoc = self._setup_build_doc()
			
 
				-        if isinstance(bdoc, bytes) and self.encoding is not None:
			
 
				-            udoc = bdoc.decode(self.encoding)
			
 
				-            from_encoding = None
			
 
				-        else:
			
 
				-            udoc = bdoc
			
 
				-            from_encoding = self.encoding
			
 
				-
			
 
				-        soup = BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)
			
 
				-
			
 
				-        for br in soup.find_all("br"):
			
 
				-            br.replace_with("\n" + br.text)
			
 
				-
			
 
				-        return soup
			
 
				-
			
 
				-
			
 
				-def _build_xpath_expr(attrs) -> str:
			
 
				-    """
			
 
				-    Build an xpath expression to simulate bs4's ability to pass in kwargs to
			
 
				-    search for attributes when using the lxml parser.
			
 
				-
			
 
				-    Parameters
			
 
				-    ----------
			
 
				-    attrs : dict
			
 
				-        A dict of HTML attributes. These are NOT checked for validity.
			
 
				-
			
 
				-    Returns
			
 
				-    -------
			
 
				-    expr : unicode
			
 
				-        An XPath expression that checks for the given HTML attributes.
			
 
				-    """
			
 
				-    # give class attribute as class_ because class is a python keyword
			
 
				-    if "class_" in attrs:
			
 
				-        attrs["class"] = attrs.pop("class_")
			
 
				-
			
 
				-    s = " and ".join([f"@{k}={repr(v)}" for k, v in attrs.items()])
			
 
				-    return f"[{s}]"
			
 
				-
			
 
				-
			
 
				-_re_namespace = {"re": "http://exslt.org/regular-expressions"}
			
 
				-
			
 
				-
			
 
				-class _LxmlFrameParser(_HtmlFrameParser):
			
 
				-    """
			
 
				-    HTML to DataFrame parser that uses lxml under the hood.
			
 
				-
			
 
				-    Warning
			
 
				-    -------
			
 
				-    This parser can only handle HTTP, FTP, and FILE urls.
			
 
				-
			
 
				-    See Also
			
 
				-    --------
			
 
				-    _HtmlFrameParser
			
 
				-    _BeautifulSoupLxmlFrameParser
			
 
				-
			
 
				-    Notes
			
 
				-    -----
			
 
				-    Documentation strings for this class are in the base class
			
 
				-    :class:`_HtmlFrameParser`.
			
 
				-    """
			
 
				-
			
 
				-    def _href_getter(self, obj) -> str | None:
			
 
				-        href = obj.xpath(".//a/@href")
			
 
				-        return None if not href else href[0]
			
 
				-
			
 
				-    def _text_getter(self, obj):
			
 
				-        return obj.text_content()
			
 
				-
			
 
				-    def _parse_td(self, row):
			
 
				-        # Look for direct children only: the "row" element here may be a
			
 
				-        # <thead> or <tfoot> (see _parse_thead_tr).
			
 
				-        return row.xpath("./td|./th")
			
 
				-
			
 
				-    def _parse_tables(self, document, match, kwargs):
			
 
				-        pattern = match.pattern
			
 
				-
			
 
				-        # 1. check all descendants for the given pattern and only search tables
			
 
				-        # GH 49929
			
 
				-        xpath_expr = f"//table[.//text()[re:test(., {repr(pattern)})]]"
			
 
				-
			
 
				-        # if any table attributes were given build an xpath expression to
			
 
				-        # search for them
			
 
				-        if kwargs:
			
 
				-            xpath_expr += _build_xpath_expr(kwargs)
			
 
				-
			
 
				-        tables = document.xpath(xpath_expr, namespaces=_re_namespace)
			
 
				-
			
 
				-        tables = self._handle_hidden_tables(tables, "attrib")
			
 
				-        if self.displayed_only:
			
 
				-            for table in tables:
			
 
				-                # lxml utilizes XPATH 1.0 which does not have regex
			
 
				-                # support. As a result, we find all elements with a style
			
 
				-                # attribute and iterate them to check for display:none
			
 
				-                for elem in table.xpath(".//style"):
			
 
				-                    elem.drop_tree()
			
 
				-                for elem in table.xpath(".//*[@style]"):
			
 
				-                    if "display:none" in elem.attrib.get("style", "").replace(" ", ""):
			
 
				-                        elem.drop_tree()
			
 
				-        if not tables:
			
 
				-            raise ValueError(f"No tables found matching regex {repr(pattern)}")
			
 
				-        return tables
			
 
				-
			
 
				-    def _equals_tag(self, obj, tag) -> bool:
			
 
				-        return obj.tag == tag
			
 
				-
			
 
				-    def _build_doc(self):
			
 
				-        """
			
 
				-        Raises
			
 
				-        ------
			
 
				-        ValueError
			
 
				-            * If a URL that lxml cannot parse is passed.
			
 
				-
			
 
				-        Exception
			
 
				-            * Any other ``Exception`` thrown. For example, trying to parse a
			
 
				-              URL that is syntactically correct on a machine with no internet
			
 
				-              connection will fail.
			
 
				-
			
 
				-        See Also
			
 
				-        --------
			
 
				-        pandas.io.html._HtmlFrameParser._build_doc
			
 
				-        """
			
 
				-        from lxml.etree import XMLSyntaxError
			
 
				-        from lxml.html import (
			
 
				-            HTMLParser,
			
 
				-            fromstring,
			
 
				-            parse,
			
 
				-        )
			
 
				-
			
 
				-        parser = HTMLParser(recover=True, encoding=self.encoding)
			
 
				-
			
 
				-        try:
			
 
				-            if is_url(self.io):
			
 
				-                with get_handle(
			
 
				-                    self.io, "r", storage_options=self.storage_options
			
 
				-                ) as f:
			
 
				-                    r = parse(f.handle, parser=parser)
			
 
				-            else:
			
 
				-                # try to parse the input in the simplest way
			
 
				-                r = parse(self.io, parser=parser)
			
 
				-            try:
			
 
				-                r = r.getroot()
			
 
				-            except AttributeError:
			
 
				-                pass
			
 
				-        except (UnicodeDecodeError, OSError) as e:
			
 
				-            # if the input is a blob of html goop
			
 
				-            if not is_url(self.io):
			
 
				-                r = fromstring(self.io, parser=parser)
			
 
				-
			
 
				-                try:
			
 
				-                    r = r.getroot()
			
 
				-                except AttributeError:
			
 
				-                    pass
			
 
				-            else:
			
 
				-                raise e
			
 
				-        else:
			
 
				-            if not hasattr(r, "text_content"):
			
 
				-                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
			
 
				-
			
 
				-        for br in r.xpath("*//br"):
			
 
				-            br.tail = "\n" + (br.tail or "")
			
 
				-
			
 
				-        return r
			
 
				-
			
 
				-    def _parse_thead_tr(self, table):
			
 
				-        rows = []
			
 
				-
			
 
				-        for thead in table.xpath(".//thead"):
			
 
				-            rows.extend(thead.xpath("./tr"))
			
 
				-
			
 
				-            # HACK: lxml does not clean up the clearly-erroneous
			
 
				-            # <thead><th>foo</th><th>bar</th></thead>. (Missing <tr>). Add
			
 
				-            # the <thead> and _pretend_ it's a <tr>; _parse_td() will find its
			
 
				-            # children as though it's a <tr>.
			
 
				-            #
			
 
				-            # Better solution would be to use html5lib.
			
 
				-            elements_at_root = thead.xpath("./td|./th")
			
 
				-            if elements_at_root:
			
 
				-                rows.append(thead)
			
 
				-
			
 
				-        return rows
			
 
				-
			
 
				-    def _parse_tbody_tr(self, table):
			
 
				-        from_tbody = table.xpath(".//tbody//tr")
			
 
				-        from_root = table.xpath("./tr")
			
 
				-        # HTML spec: at most one of these lists has content
			
 
				-        return from_tbody + from_root
			
 
				-
			
 
				-    def _parse_tfoot_tr(self, table):
			
 
				-        return table.xpath(".//tfoot//tr")
			
 
				-
			
 
				-
			
 
				-def _expand_elements(body) -> None:
			
 
				-    data = [len(elem) for elem in body]
			
 
				-    lens = Series(data)
			
 
				-    lens_max = lens.max()
			
 
				-    not_max = lens[lens != lens_max]
			
 
				-
			
 
				-    empty = [""]
			
 
				-    for ind, length in not_max.items():
			
 
				-        body[ind] += empty * (lens_max - length)
			
 
				-
			
 
				-
			
 
				-def _data_to_frame(**kwargs):
			
 
				-
			
 
				-    def clean_number_string(cell_value):
			
 
				-        # 正则表达式，用于匹配数字之间的空格
			
 
				-        # PATTERN = re.compile(r'(\d)\s+(\d)')
			
 
				-        # 检查是否只包含允许的字符
			
 
				-        allowed_chars = set("0123456789.,\n\t+- ")
			
 
				-        if cell_value is None or cell_value == '':
			
 
				-            return cell_value
			
 
				-        if any(char not in allowed_chars for char in cell_value):
			
 
				-            return cell_value
			
 
				-    
			
 
				-        # 去除所有空白字符
			
 
				-        cleaned = ''.join(char for char in cell_value if char not in " \t\n")
			
 
				-        # 将逗号移除
			
 
				-        cleaned = cleaned.replace(',', '')
			
 
				-        return cleaned
			
 
				-
			
 
				-    def convert_to_float(cleaned_value):
			
 
				-        try:
			
 
				-            # 尝试将清理后的字符串转换为浮点数
			
 
				-            number = float(cleaned_value)
			
 
				-            return number
			
 
				-        except ValueError:
			
 
				-            # 如果转换失败，返回原始值或适当的错误信息
			
 
				-            logger.error(f"无法将 '{cleaned_value}' 转换为浮点数")
			
 
				-            # 返回空值
			
 
				-            return None
			
 
				-
			
 
				-    head, body, foot = kwargs.pop("data")
			
 
				-    header = kwargs.pop("header")
			
 
				-    kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"])
			
 
				-    number_strip = kwargs.get('number_strip')
			
 
				-    if head:
			
 
				-        body = head + body
			
 
				-
			
 
				-        # Infer header when there is a <thead> or top <th>-only rows
			
 
				-        if header is None:
			
 
				-            if len(head) == 1:
			
 
				-                header = 0
			
 
				-            else:
			
 
				-                # ignore all-empty-text rows
			
 
				-                header = [i for i, row in enumerate(head) if any(text for text in row)]
			
 
				-
			
 
				-    if foot:
			
 
				-        body += foot
			
 
				-
			
 
				-    # fill out elements of body that are "ragged"
			
 
				-    _expand_elements(body)
			
 
				-    # 如果 number_strip 为 True，则去除文本中的数字前后以及中间的空格
			
 
				-    if number_strip:
			
 
				-        for i in range(len(body)):
			
 
				-            for j in range(len(body[i])):
			
 
				-                body[i][j] = clean_number_string(body[i][j])
			
 
				-    with TextParser(body, header=header, **kwargs) as tp:
			
 
				-        return tp.read()
			
 
				-
			
 
				-_valid_parsers = {
			
 
				-    "lxml": _LxmlFrameParser,
			
 
				-    None: _LxmlFrameParser,
			
 
				-    "html5lib": _BeautifulSoupHtml5LibFrameParser,
			
 
				-    "bs4": _BeautifulSoupHtml5LibFrameParser,
			
 
				-}
			
 
				-
			
 
				-
			
 
				-def _parser_dispatch(flavor: HTMLFlavors | None) -> type[_HtmlFrameParser]:
			
 
				-    """
			
 
				-    Choose the parser based on the input flavor.
			
 
				-
			
 
				-    Parameters
			
 
				-    ----------
			
 
				-    flavor : {{"lxml", "html5lib", "bs4"}} or None
			
 
				-        The type of parser to use. This must be a valid backend.
			
 
				-
			
 
				-    Returns
			
 
				-    -------
			
 
				-    cls : _HtmlFrameParser subclass
			
 
				-        The parser class based on the requested input flavor.
			
 
				-
			
 
				-    Raises
			
 
				-    ------
			
 
				-    ValueError
			
 
				-        * If `flavor` is not a valid backend.
			
 
				-    ImportError
			
 
				-        * If you do not have the requested `flavor`
			
 
				-    """
			
 
				-    valid_parsers = list(_valid_parsers.keys())
			
 
				-    if flavor not in valid_parsers:
			
 
				-        raise ValueError(
			
 
				-            f"{repr(flavor)} is not a valid flavor, valid flavors are {valid_parsers}"
			
 
				-        )
			
 
				-
			
 
				-    if flavor in ("bs4", "html5lib"):
			
 
				-        import_optional_dependency("html5lib")
			
 
				-        import_optional_dependency("bs4")
			
 
				-    else:
			
 
				-        import_optional_dependency("lxml.etree")
			
 
				-    return _valid_parsers[flavor]
			
 
				-
			
 
				-
			
 
				-def _print_as_set(s) -> str:
			
 
				-    arg = ", ".join([pprint_thing(el) for el in s])
			
 
				-    return f"{{{arg}}}"
			
 
				-
			
 
				-
			
 
				-def _validate_flavor(flavor):
			
 
				-    if flavor is None:
			
 
				-        flavor = "lxml", "bs4"
			
 
				-    elif isinstance(flavor, str):
			
 
				-        flavor = (flavor,)
			
 
				-    elif isinstance(flavor, abc.Iterable):
			
 
				-        if not all(isinstance(flav, str) for flav in flavor):
			
 
				-            raise TypeError(
			
 
				-                f"Object of type {repr(type(flavor).__name__)} "
			
 
				-                f"is not an iterable of strings"
			
 
				-            )
			
 
				-    else:
			
 
				-        msg = repr(flavor) if isinstance(flavor, str) else str(flavor)
			
 
				-        msg += " is not a valid flavor"
			
 
				-        raise ValueError(msg)
			
 
				-
			
 
				-    flavor = tuple(flavor)
			
 
				-    valid_flavors = set(_valid_parsers)
			
 
				-    flavor_set = set(flavor)
			
 
				-
			
 
				-    if not flavor_set & valid_flavors:
			
 
				-        raise ValueError(
			
 
				-            f"{_print_as_set(flavor_set)} is not a valid set of flavors, valid "
			
 
				-            f"flavors are {_print_as_set(valid_flavors)}"
			
 
				-        )
			
 
				-    return flavor
			
 
				-
			
 
				-
			
 
				-def _parse(
			
 
				-    flavor,
			
 
				-    io,
			
 
				-    match,
			
 
				-    attrs,
			
 
				-    encoding,
			
 
				-    displayed_only,
			
 
				-    extract_links,
			
 
				-    storage_options,
			
 
				-    custom_args,
			
 
				-    **kwargs,
			
 
				-):
			
 
				-    flavor = _validate_flavor(flavor)
			
 
				-    compiled_match = re.compile(match)  # you can pass a compiled regex here
			
 
				-
			
 
				-    retained = None
			
 
				-    for flav in flavor:
			
 
				-        parser = _parser_dispatch(flav)
			
 
				-        p = parser(
			
 
				-            io,
			
 
				-            compiled_match,
			
 
				-            attrs,
			
 
				-            encoding,
			
 
				-            displayed_only,
			
 
				-            extract_links,
			
 
				-            storage_options,
			
 
				-        )
			
 
				-
			
 
				-        try:
			
 
				-            tables = p.parse_tables(custom_args)
			
 
				-        except ValueError as caught:
			
 
				-            # if `io` is an io-like object, check if it's seekable
			
 
				-            # and try to rewind it before trying the next parser
			
 
				-            if hasattr(io, "seekable") and io.seekable():
			
 
				-                io.seek(0)
			
 
				-            elif hasattr(io, "seekable") and not io.seekable():
			
 
				-                # if we couldn't rewind it, let the user know
			
 
				-                raise ValueError(
			
 
				-                    f"The flavor {flav} failed to parse your input. "
			
 
				-                    "Since you passed a non-rewindable file "
			
 
				-                    "object, we can't rewind it to try "
			
 
				-                    "another parser. Try read_html() with a different flavor."
			
 
				-                ) from caught
			
 
				-
			
 
				-            retained = caught
			
 
				-        else:
			
 
				-            break
			
 
				-    else:
			
 
				-        assert retained is not None  # for mypy
			
 
				-        raise retained
			
 
				-
			
 
				-    ret = []
			
 
				-    for table in tables:
			
 
				-        try:
			
 
				-            df = _data_to_frame(data=table, colspan_single=custom_args.get('colspan_single'), rowspan_single=custom_args.get('rowspan_single'), number_strip=custom_args.get('number_strip'), **kwargs)
			
 
				-            # Cast MultiIndex header to an Index of tuples when extracting header
			
 
				-            # links and replace nan with None (therefore can't use mi.to_flat_index()).
			
 
				-            # This maintains consistency of selection (e.g. df.columns.str[1])
			
 
				-            if extract_links in ("all", "header") and isinstance(
			
 
				-                df.columns, MultiIndex
			
 
				-            ):
			
 
				-                df.columns = Index(
			
 
				-                    ((col[0], None if isna(col[1]) else col[1]) for col in df.columns),
			
 
				-                    tupleize_cols=False,
			
 
				-                )
			
 
				-
			
 
				-            ret.append(df)
			
 
				-        except EmptyDataError:  # empty table
			
 
				-            continue
			
 
				-    return ret
			
 
				-
			
 
				-
			
 
				-@doc(storage_options=_shared_docs["storage_options"])
			
 
				-def read_html_zhch(
			
 
				-    io: FilePath | ReadBuffer[str],
			
 
				-    # 解释*含义：这里的*表示参数列表的开始，意味着后续的所有参数都必须是关键字参数（即必须通过参数名来传递）。
			
 
				-    *,
			
 
				-    match: str | Pattern = ".+",
			
 
				-    flavor: HTMLFlavors | Sequence[HTMLFlavors] | None = None,
			
 
				-    header: int | Sequence[int] | None = None,
			
 
				-    index_col: int | Sequence[int] | None = None,
			
 
				-    skiprows: int | Sequence[int] | slice | None = None,
			
 
				-    attrs: dict[str, str] | None = None,
			
 
				-    parse_dates: bool = False,
			
 
				-    thousands: str | None = ",",
			
 
				-    encoding: str | None = None,
			
 
				-    decimal: str = ".",
			
 
				-    converters: dict | None = None,
			
 
				-    na_values: Iterable[object] | None = None,
			
 
				-    keep_default_na: bool = True,
			
 
				-    displayed_only: bool = True,
			
 
				-    extract_links: Literal[None, "header", "footer", "body", "all"] = None,
			
 
				-    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
			
 
				-    storage_options: StorageOptions = None,
			
 
				-    custom_args: dict = {} ,
			
 
				-) -> list[DataFrame]:
			
 
				-    r"""
			
 
				-    Read HTML tables into a ``list`` of ``DataFrame`` objects.
			
 
				-
			
 
				-    Parameters
			
 
				-    ----------
			
 
				-    io : str, path object, or file-like object
			
 
				-        String, path object (implementing ``os.PathLike[str]``), or file-like
			
 
				-        object implementing a string ``read()`` function.
			
 
				-        The string can represent a URL or the HTML itself. Note that
			
 
				-        lxml only accepts the http, ftp and file url protocols. If you have a
			
 
				-        URL that starts with ``'https'`` you might try removing the ``'s'``.
			
 
				-
			
 
				-        .. deprecated:: 2.1.0
			
 
				-            Passing html literal strings is deprecated.
			
 
				-            Wrap literal string/bytes input in ``io.StringIO``/``io.BytesIO`` instead.
			
 
				-
			
 
				-    match : str or compiled regular expression, optional
			
 
				-        The set of tables containing text matching this regex or string will be
			
 
				-        returned. Unless the HTML is extremely simple you will probably need to
			
 
				-        pass a non-empty string here. Defaults to '.+' (match any non-empty
			
 
				-        string). The default value will return all tables contained on a page.
			
 
				-        This value is converted to a regular expression so that there is
			
 
				-        consistent behavior between Beautiful Soup and lxml.
			
 
				-
			
 
				-    flavor : {{"lxml", "html5lib", "bs4"}} or list-like, optional
			
 
				-        The parsing engine (or list of parsing engines) to use. 'bs4' and
			
 
				-        'html5lib' are synonymous with each other, they are both there for
			
 
				-        backwards compatibility. The default of ``None`` tries to use ``lxml``
			
 
				-        to parse and if that fails it falls back on ``bs4`` + ``html5lib``.
			
 
				-
			
 
				-    header : int or list-like, optional
			
 
				-        The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
			
 
				-        make the columns headers.
			
 
				-
			
 
				-    index_col : int or list-like, optional
			
 
				-        The column (or list of columns) to use to create the index.
			
 
				-
			
 
				-    skiprows : int, list-like or slice, optional
			
 
				-        Number of rows to skip after parsing the column integer. 0-based. If a
			
 
				-        sequence of integers or a slice is given, will skip the rows indexed by
			
 
				-        that sequence.  Note that a single element sequence means 'skip the nth
			
 
				-        row' whereas an integer means 'skip n rows'.
			
 
				-
			
 
				-    attrs : dict, optional
			
 
				-        This is a dictionary of attributes that you can pass to use to identify
			
 
				-        the table in the HTML. These are not checked for validity before being
			
 
				-        passed to lxml or Beautiful Soup. However, these attributes must be
			
 
				-        valid HTML table attributes to work correctly. For example, ::
			
 
				-
			
 
				-            attrs = {{'id': 'table'}}
			
 
				-
			
 
				-        is a valid attribute dictionary because the 'id' HTML tag attribute is
			
 
				-        a valid HTML attribute for *any* HTML tag as per `this document
			
 
				-        <https://html.spec.whatwg.org/multipage/dom.html#global-attributes>`__. ::
			
 
				-
			
 
				-            attrs = {{'asdf': 'table'}}
			
 
				-
			
 
				-        is *not* a valid attribute dictionary because 'asdf' is not a valid
			
 
				-        HTML attribute even if it is a valid XML attribute.  Valid HTML 4.01
			
 
				-        table attributes can be found `here
			
 
				-        <http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2>`__. A
			
 
				-        working draft of the HTML 5 spec can be found `here
			
 
				-        <https://html.spec.whatwg.org/multipage/tables.html>`__. It contains the
			
 
				-        latest information on table attributes for the modern web.
			
 
				-
			
 
				-    parse_dates : bool, optional
			
 
				-        See :func:`~read_csv` for more details.
			
 
				-
			
 
				-    thousands : str, optional
			
 
				-        Separator to use to parse thousands. Defaults to ``','``.
			
 
				-
			
 
				-    encoding : str, optional
			
 
				-        The encoding used to decode the web page. Defaults to ``None``.``None``
			
 
				-        preserves the previous encoding behavior, which depends on the
			
 
				-        underlying parser library (e.g., the parser library will try to use
			
 
				-        the encoding provided by the document).
			
 
				-
			
 
				-    decimal : str, default '.'
			
 
				-        Character to recognize as decimal point (e.g. use ',' for European
			
 
				-        data).
			
 
				-
			
 
				-    converters : dict, default None
			
 
				-        Dict of functions for converting values in certain columns. Keys can
			
 
				-        either be integers or column labels, values are functions that take one
			
 
				-        input argument, the cell (not column) content, and return the
			
 
				-        transformed content.
			
 
				-
			
 
				-    na_values : iterable, default None
			
 
				-        Custom NA values.
			
 
				-
			
 
				-    keep_default_na : bool, default True
			
 
				-        If na_values are specified and keep_default_na is False the default NaN
			
 
				-        values are overridden, otherwise they're appended to.
			
 
				-
			
 
				-    displayed_only : bool, default True
			
 
				-        Whether elements with "display: none" should be parsed.
			
 
				-
			
 
				-    extract_links : {{None, "all", "header", "body", "footer"}}
			
 
				-        Table elements in the specified section(s) with <a> tags will have their
			
 
				-        href extracted.
			
 
				-
			
 
				-        .. versionadded:: 1.5.0
			
 
				-
			
 
				-    dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
			
 
				-        Back-end data type applied to the resultant :class:`DataFrame`
			
 
				-        (still experimental). Behaviour is as follows:
			
 
				-
			
 
				-        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
			
 
				-          (default).
			
 
				-        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
			
 
				-          DataFrame.
			
 
				-
			
 
				-        .. versionadded:: 2.0
			
 
				-
			
 
				-    {storage_options}
			
 
				-
			
 
				-        .. versionadded:: 2.1.0
			
 
				-
			
 
				-    Returns
			
 
				-    -------
			
 
				-    dfs
			
 
				-        A list of DataFrames.
			
 
				-
			
 
				-    See Also
			
 
				-    --------
			
 
				-    read_csv : Read a comma-separated values (csv) file into DataFrame.
			
 
				-
			
 
				-    Notes
			
 
				-    -----
			
 
				-    Before using this function you should read the :ref:`gotchas about the
			
 
				-    HTML parsing libraries <io.html.gotchas>`.
			
 
				-
			
 
				-    Expect to do some cleanup after you call this function. For example, you
			
 
				-    might need to manually assign column names if the column names are
			
 
				-    converted to NaN when you pass the `header=0` argument. We try to assume as
			
 
				-    little as possible about the structure of the table and push the
			
 
				-    idiosyncrasies of the HTML contained in the table to the user.
			
 
				-
			
 
				-    This function searches for ``<table>`` elements and only for ``<tr>``
			
 
				-    and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>``
			
 
				-    element in the table. ``<td>`` stands for "table data". This function
			
 
				-    attempts to properly handle ``colspan`` and ``rowspan`` attributes.
			
 
				-    If the function has a ``<thead>`` argument, it is used to construct
			
 
				-    the header, otherwise the function attempts to find the header within
			
 
				-    the body (by putting rows with only ``<th>`` elements into the header).
			
 
				-
			
 
				-    Similar to :func:`~read_csv` the `header` argument is applied
			
 
				-    **after** `skiprows` is applied.
			
 
				-
			
 
				-    This function will *always* return a list of :class:`DataFrame` *or*
			
 
				-    it will fail, e.g., it will *not* return an empty list.
			
 
				-
			
 
				-    Examples
			
 
				-    --------
			
 
				-    See the :ref:`read_html documentation in the IO section of the docs
			
 
				-    <io.read_html>` for some examples of reading in HTML tables.
			
 
				-    """
			
 
				-    # Type check here. We don't want to parse only to fail because of an
			
 
				-    # invalid value of an integer skiprows.
			
 
				-    if isinstance(skiprows, numbers.Integral) and skiprows < 0:
			
 
				-        raise ValueError(
			
 
				-            "cannot skip rows starting from the end of the "
			
 
				-            "data (you passed a negative value)"
			
 
				-        )
			
 
				-    if extract_links not in [None, "header", "footer", "body", "all"]:
			
 
				-        raise ValueError(
			
 
				-            "`extract_links` must be one of "
			
 
				-            '{None, "header", "footer", "body", "all"}, got '
			
 
				-            f'"{extract_links}"'
			
 
				-        )
			
 
				-
			
 
				-    validate_header_arg(header)
			
 
				-    check_dtype_backend(dtype_backend)
			
 
				-
			
 
				-    io = stringify_path(io)
			
 
				-
			
 
				-    if isinstance(io, str) and not any(
			
 
				-        [
			
 
				-            is_file_like(io),
			
 
				-            file_exists(io),
			
 
				-            is_url(io),
			
 
				-            is_fsspec_url(io),
			
 
				-        ]
			
 
				-    ):
			
 
				-        warnings.warn(
			
 
				-            "Passing literal html to 'read_html' is deprecated and "
			
 
				-            "will be removed in a future version. To read from a "
			
 
				-            "literal string, wrap it in a 'StringIO' object.",
			
 
				-            FutureWarning,
			
 
				-            stacklevel=find_stack_level(),
			
 
				-        )
			
 
				-
			
 
				-    return _parse(
			
 
				-        flavor=flavor,
			
 
				-        io=io,
			
 
				-        match=match,
			
 
				-        header=header,
			
 
				-        index_col=index_col,
			
 
				-        skiprows=skiprows,
			
 
				-        parse_dates=parse_dates,
			
 
				-        thousands=thousands,
			
 
				-        attrs=attrs,
			
 
				-        encoding=encoding,
			
 
				-        decimal=decimal,
			
 
				-        converters=converters,
			
 
				-        na_values=na_values,
			
 
				-        keep_default_na=keep_default_na,
			
 
				-        displayed_only=displayed_only,
			
 
				-        extract_links=extract_links,
			
 
				-        dtype_backend=dtype_backend,
			
 
				-        storage_options=storage_options,
			
 
				-        custom_args=custom_args,
			
 
				-    )