| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139 |
- """
- OCR 工具包
- 整合了文档处理相关的工具函数,包括:
- - PDF 处理工具
- - JSON/Markdown/HTML 格式化工具
- - 文件处理工具
- - 数字标准化工具
- """
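# PDFUtils (from .pdf_utils) and extract_pdf_pages (from .pdf_extractor) are
# deliberately NOT imported eagerly here. They are loaded lazily so that
# importing this package in a PaddleX environment does not trigger the MinerU
# import check. extract_pdf_pages is deferred as well because it depends on
# PDFUtils.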
- from .json_formatters import JSONFormatters
- from .markdown_generator import MarkdownGenerator
- from .html_generator import HTMLGenerator
- from .visualization_utils import VisualizationUtils
- from .output_formatter_v2 import OutputFormatterV2, save_mineru_format
- from .normalize_financial_numbers import (
- normalize_financial_numbers,
- normalize_json_table,
- normalize_markdown_table,
- normalize_json_file
- )
- from .file_utils import (
- get_input_files,
- collect_pid_files,
- get_image_files_from_dir,
- get_image_files_from_list,
- get_image_files_from_csv,
- convert_pdf_to_images,
- split_files,
- create_temp_file_list,
- parse_page_range
- )
- from .log_utils import setup_logging
- from .device_utils import get_device, get_device_name
- from .image_utils import (
- img_decode,
- check_img,
- alpha_to_color,
- preprocess_image,
- bbox_to_points,
- points_to_bbox,
- rotate_image_and_coordinates
- )
- from .html_utils import (
- find_image_in_multiple_locations,
- process_html_images,
- process_markdown_images,
- process_all_images_in_content,
- convert_html_table_to_markdown,
- parse_html_tables
- )
- from .number_utils import (
- parse_number,
- normalize_text_number
- )
# Public API of the package.
# Entries marked "(lazy)" are not bound by the import statements of this
# module; they are resolved on first access by the module-level __getattr__.
__all__ = [
    # PDF tools (lazy)
    "PDFUtils",
    "extract_pdf_pages",
    # JSON formatting
    "JSONFormatters",
    # Markdown generation
    "MarkdownGenerator",
    # HTML generation
    "HTMLGenerator",
    # Visualization
    "VisualizationUtils",
    # Output formatting
    "OutputFormatterV2",
    "save_mineru_format",
    # Financial number normalization
    "normalize_financial_numbers",
    "normalize_json_table",
    "normalize_markdown_table",
    "normalize_json_file",
    # File helpers
    "get_input_files",
    "collect_pid_files",
    "get_image_files_from_dir",
    "get_image_files_from_list",
    "get_image_files_from_csv",
    "convert_pdf_to_images",
    "split_files",
    "create_temp_file_list",
    "parse_page_range",
    # Logging helpers
    "setup_logging",
    # Bounding-box tools (lazy)
    "BBoxExtractor",
    # Device helpers
    "get_device",
    "get_device_name",
    # Image processing helpers
    "img_decode",
    "check_img",
    "alpha_to_color",
    "preprocess_image",
    "bbox_to_points",
    "points_to_bbox",
    "rotate_image_and_coordinates",
    # HTML / Markdown processing helpers
    "find_image_in_multiple_locations",
    "process_html_images",
    "process_markdown_images",
    "process_all_images_in_content",
    "convert_html_table_to_markdown",
    "parse_html_tables",
    # Number parsing helpers
    "parse_number",
    "normalize_text_number",
]
def __getattr__(name: str):
    """Resolve lazily exported names on first attribute access (PEP 562).

    ``PDFUtils`` and ``extract_pdf_pages`` are imported only when they are
    first requested, so the MinerU import check is not triggered merely by
    importing this package. This keeps the package importable in a PaddleX
    environment even when MinerU is unavailable. ``BBoxExtractor`` is
    likewise imported only when it is actually used.

    The resolved object is stored in the module namespace, so later lookups
    of the same name find it directly and do not re-enter this hook.

    Args:
        name: Attribute name that normal module attribute lookup did not find.

    Returns:
        The requested class or function.

    Raises:
        AttributeError: If ``name`` is not one of the lazily exported names.
        ImportError: Propagated unchanged when the backing submodule cannot
            be imported.
    """
    if name == 'PDFUtils':
        from .pdf_utils import PDFUtils as resolved
    elif name == 'extract_pdf_pages':
        from .pdf_extractor import extract_pdf_pages as resolved
    elif name == 'BBoxExtractor':
        from .bbox_utils import BBoxExtractor as resolved
    else:
        raise AttributeError(f"module '{__name__}' has no attribute '{name}'")

    # Cache the result: module __getattr__ is consulted only when the name is
    # missing from the module namespace, so this makes the import a one-off.
    globals()[name] = resolved
    return resolved
# Package metadata: maintainer handle and release version string.
__author__ = "zhch158"
__version__ = "1.0.0"
|