| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071 |
- """
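- OCR toolkit.
- Collects utility functions for document processing, including:
- - PDF processing tools
- - JSON/Markdown/HTML formatting tools
- - Visualization and output-formatting tools
- - File handling tools
- - Financial-number normalization tools
- - Logging setup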
- """
- from .pdf_utils import PDFUtils
- from .json_formatters import JSONFormatters
- from .markdown_generator import MarkdownGenerator
- from .html_generator import HTMLGenerator
- from .visualization_utils import VisualizationUtils
- from .output_formatter_v2 import OutputFormatterV2, save_mineru_format
- from .pdf_extractor import extract_pdf_pages
- from .normalize_financial_numbers import (
- normalize_financial_numbers,
- normalize_json_table,
- normalize_markdown_table,
- normalize_json_file
- )
- from .file_utils import (
- get_input_files,
- collect_pid_files,
- get_image_files_from_dir,
- get_image_files_from_list,
- get_image_files_from_csv,
- convert_pdf_to_images,
- split_files,
- create_temp_file_list
- )
- from .log_utils import setup_logging
- __all__ = [  # Public API: the names exported by a star-import of this package; each one is imported above.
- # PDF utilities
- 'PDFUtils',
- 'extract_pdf_pages',
- # JSON formatting
- 'JSONFormatters',
- # Markdown generation
- 'MarkdownGenerator',
- # HTML generation
- 'HTMLGenerator',
- # Visualization
- 'VisualizationUtils',
- # Output formatting
- 'OutputFormatterV2',
- 'save_mineru_format',
- # Number normalization
- 'normalize_financial_numbers',
- 'normalize_json_table',
- 'normalize_markdown_table',
- 'normalize_json_file',
- # File utilities
- 'get_input_files',
- 'collect_pid_files',
- 'get_image_files_from_dir',
- 'get_image_files_from_list',
- 'get_image_files_from_csv',
- 'convert_pdf_to_images',
- 'split_files',
- 'create_temp_file_list',
- # Logging utilities
- 'setup_logging',
- ]
- __version__ = "1.0.0"  # Package version string.
- __author__ = "zhch158"  # Package author identifier.
|