# zhengchun / ocr_platform
							"""
OCR 工具包

整合了文档处理相关的工具函数，包括：
- PDF 处理工具
- JSON/Markdown/HTML 格式化工具
- 文件处理工具
- 数字标准化工具
"""

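# PDFUtils and extract_pdf_pages are imported lazily so that the MinerU import check is not triggered in a PaddleX environment
# from .pdf_utils import PDFUtils  # removed; now imported lazily
# from .pdf_extractor import extract_pdf_pages  # removed; now imported lazily (it depends on PDFUtils)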
from .json_formatters import JSONFormatters
from .markdown_generator import MarkdownGenerator
from .html_generator import HTMLGenerator
from .visualization_utils import VisualizationUtils
from .output_formatter_v2 import OutputFormatterV2, save_mineru_format
from .normalize_financial_numbers import (
    normalize_financial_numbers,
    normalize_json_table,
    normalize_markdown_table,
    normalize_json_file
)
from .file_utils import (
    get_input_files,
    collect_pid_files,
    get_image_files_from_dir,
    get_image_files_from_list,
    get_image_files_from_csv,
    convert_pdf_to_images,
    split_files,
    create_temp_file_list,
    parse_page_range
)
from .log_utils import setup_logging
from .device_utils import get_device, get_device_name
from .image_utils import (
    img_decode,
    check_img,
    alpha_to_color,
    preprocess_image,
    bbox_to_points,
    points_to_bbox,
    rotate_image_and_coordinates
)
from .html_utils import (
    find_image_in_multiple_locations,
    process_html_images,
    process_markdown_images,
    process_all_images_in_content,
    convert_html_table_to_markdown,
    parse_html_tables
)
from .number_utils import (
    parse_number,
    normalize_text_number
)

# Public API of the package. 'PDFUtils', 'extract_pdf_pages' and 'BBoxExtractor'
# are not imported at module load time; they are resolved on demand by the
# module-level __getattr__ defined in this file.
__all__ = [
    # PDF utilities
    'PDFUtils',
    'extract_pdf_pages',
    # JSON formatting
    'JSONFormatters',
    # Markdown generation
    'MarkdownGenerator',
    # HTML generation
    'HTMLGenerator',
    # Visualization
    'VisualizationUtils',
    # Output formatting
    'OutputFormatterV2',
    'save_mineru_format',
    # Number normalization
    'normalize_financial_numbers',
    'normalize_json_table',
    'normalize_markdown_table',
    'normalize_json_file',
    # File utilities
    'get_input_files',
    'collect_pid_files',
    'get_image_files_from_dir',
    'get_image_files_from_list',
    'get_image_files_from_csv',
    'convert_pdf_to_images',
    'split_files',
    'create_temp_file_list',
    'parse_page_range',
    # Logging utilities
    'setup_logging',
    # BBox utilities
    'BBoxExtractor',
    # Device utilities
    'get_device',
    'get_device_name',
    # Image processing utilities
    'img_decode',
    'check_img',
    'alpha_to_color',
    'preprocess_image',
    'bbox_to_points',
    'points_to_bbox',
    'rotate_image_and_coordinates',
    # HTML/Markdown processing utilities
    'find_image_in_multiple_locations',
    'process_html_images',
    'process_markdown_images',
    'process_all_images_in_content',
    'convert_html_table_to_markdown',
    'parse_html_tables',
    # Number parsing utilities
    'parse_number',
    'normalize_text_number',
]


def __getattr__(name: str):
    """
    Module-level attribute hook that resolves the lazily exported names.

    ``PDFUtils`` and ``extract_pdf_pages`` are imported only when they are
    actually requested, so the MinerU import check is not triggered by merely
    importing this package (e.g. in a PaddleX environment where MinerU is
    unavailable). ``BBoxExtractor`` is likewise imported only on demand.

    Args:
        name: Attribute name being looked up on this module.

    Returns:
        The requested class or function, imported from its submodule.

    Raises:
        AttributeError: If ``name`` is not one of the lazily exported names.
    """
    if name == 'PDFUtils':
        from .pdf_utils import PDFUtils as resolved
    elif name == 'extract_pdf_pages':
        from .pdf_extractor import extract_pdf_pages as resolved
    elif name == 'BBoxExtractor':
        # Deferred import: bbox_utils is loaded only when BBoxExtractor is used.
        from .bbox_utils import BBoxExtractor as resolved
    else:
        raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
    return resolved

# Package metadata.
__version__ = "1.0.0"
__author__ = "zhch158"