__init__.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139
"""
OCR utilities package.

Bundles helper functions for document processing, including:
- PDF processing tools
- JSON/Markdown/HTML formatting tools
- File handling tools
- Number normalization tools
"""
# PDFUtils and extract_pdf_pages are imported lazily so that importing this
# package in a PaddleX environment does not trigger the MinerU import check.
# from .pdf_utils import PDFUtils  # removed; now imported lazily
# from .pdf_extractor import extract_pdf_pages  # removed; now imported lazily (it depends on PDFUtils)
  12. from .json_formatters import JSONFormatters
  13. from .markdown_generator import MarkdownGenerator
  14. from .html_generator import HTMLGenerator
  15. from .visualization_utils import VisualizationUtils
  16. from .output_formatter_v2 import OutputFormatterV2, save_mineru_format
  17. from .normalize_financial_numbers import (
  18. normalize_financial_numbers,
  19. normalize_json_table,
  20. normalize_markdown_table,
  21. normalize_json_file
  22. )
  23. from .file_utils import (
  24. get_input_files,
  25. collect_pid_files,
  26. get_image_files_from_dir,
  27. get_image_files_from_list,
  28. get_image_files_from_csv,
  29. convert_pdf_to_images,
  30. split_files,
  31. create_temp_file_list,
  32. parse_page_range
  33. )
  34. from .log_utils import setup_logging
  35. from .device_utils import get_device, get_device_name
  36. from .image_utils import (
  37. img_decode,
  38. check_img,
  39. alpha_to_color,
  40. preprocess_image,
  41. bbox_to_points,
  42. points_to_bbox,
  43. rotate_image_and_coordinates
  44. )
  45. from .html_utils import (
  46. find_image_in_multiple_locations,
  47. process_html_images,
  48. process_markdown_images,
  49. process_all_images_in_content,
  50. convert_html_table_to_markdown,
  51. parse_html_tables
  52. )
  53. from .number_utils import (
  54. parse_number,
  55. normalize_text_number
  56. )
# Public API of the package. 'PDFUtils', 'extract_pdf_pages' and
# 'BBoxExtractor' are not imported at module load time; they are resolved on
# first access by the module-level __getattr__ defined in this file.
__all__ = [
    # PDF utilities
    'PDFUtils',
    'extract_pdf_pages',
    # JSON formatting
    'JSONFormatters',
    # Markdown generation
    'MarkdownGenerator',
    # HTML generation
    'HTMLGenerator',
    # Visualization
    'VisualizationUtils',
    # Output formatting
    'OutputFormatterV2',
    'save_mineru_format',
    # Number normalization
    'normalize_financial_numbers',
    'normalize_json_table',
    'normalize_markdown_table',
    'normalize_json_file',
    # File utilities
    'get_input_files',
    'collect_pid_files',
    'get_image_files_from_dir',
    'get_image_files_from_list',
    'get_image_files_from_csv',
    'convert_pdf_to_images',
    'split_files',
    'create_temp_file_list',
    'parse_page_range',
    # Logging utilities
    'setup_logging',
    # Bounding-box utilities
    'BBoxExtractor',
    # Device utilities
    'get_device',
    'get_device_name',
    # Image processing utilities
    'img_decode',
    'check_img',
    'alpha_to_color',
    'preprocess_image',
    'bbox_to_points',
    'points_to_bbox',
    'rotate_image_and_coordinates',
    # HTML/Markdown processing utilities
    'find_image_in_multiple_locations',
    'process_html_images',
    'process_markdown_images',
    'process_all_images_in_content',
    'convert_html_table_to_markdown',
    'parse_html_tables',
    # Number parsing utilities
    'parse_number',
    'normalize_text_number',
]
  113. def __getattr__(name: str):
  114. """
  115. 延迟导入 PDFUtils 和 extract_pdf_pages,只有在实际使用时才触发 MinerU 导入检查。
  116. 这样可以在 PaddleX 环境中正常导入 ocr_utils,即使 MinerU 不可用。
  117. """
  118. if name == 'PDFUtils':
  119. from .pdf_utils import PDFUtils
  120. return PDFUtils
  121. elif name == 'extract_pdf_pages':
  122. from .pdf_extractor import extract_pdf_pages
  123. return extract_pdf_pages
  124. elif name == 'BBoxExtractor':
  125. """
  126. 延迟导入 BBoxExtractor,只有在实际使用时才导入。
  127. """
  128. from .bbox_utils import BBoxExtractor
  129. return BBoxExtractor
  130. raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
# Package metadata.
__version__ = "1.0.0"
__author__ = "zhch158"