__init__.py 1.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071
  1. """
  2. OCR 工具包
  3. 整合了文档处理相关的工具函数,包括:
  4. - PDF 处理工具
  5. - JSON/Markdown/HTML 格式化工具
  6. - 文件处理工具
  7. - 数字标准化工具
  8. """
  9. from .pdf_utils import PDFUtils
  10. from .json_formatters import JSONFormatters
  11. from .markdown_generator import MarkdownGenerator
  12. from .html_generator import HTMLGenerator
  13. from .visualization_utils import VisualizationUtils
  14. from .output_formatter_v2 import OutputFormatterV2, save_mineru_format
  15. from .pdf_extractor import extract_pdf_pages
  16. from .normalize_financial_numbers import (
  17. normalize_financial_numbers,
  18. normalize_json_table,
  19. normalize_markdown_table,
  20. normalize_json_file
  21. )
  22. from .file_utils import (
  23. get_input_files,
  24. collect_pid_files,
  25. get_image_files_from_dir,
  26. get_image_files_from_list,
  27. get_image_files_from_csv,
  28. convert_pdf_to_images,
  29. split_files,
  30. create_temp_file_list
  31. )
  32. from .log_utils import setup_logging
  33. __all__ = [
  34. # PDF 工具
  35. 'PDFUtils',
  36. 'extract_pdf_pages',
  37. # JSON 格式化
  38. 'JSONFormatters',
  39. # Markdown 生成
  40. 'MarkdownGenerator',
  41. # HTML 生成
  42. 'HTMLGenerator',
  43. # 可视化
  44. 'VisualizationUtils',
  45. # 输出格式化
  46. 'OutputFormatterV2',
  47. 'save_mineru_format',
  48. # 数字标准化
  49. 'normalize_financial_numbers',
  50. 'normalize_json_table',
  51. 'normalize_markdown_table',
  52. 'normalize_json_file',
  53. # 文件工具
  54. 'get_input_files',
  55. 'collect_pid_files',
  56. 'get_image_files_from_dir',
  57. 'get_image_files_from_list',
  58. 'get_image_files_from_csv',
  59. 'convert_pdf_to_images',
  60. 'split_files',
  61. 'create_temp_file_list',
  62. # 日志工具
  63. 'setup_logging',
  64. ]
  65. __version__ = "1.0.0"
  66. __author__ = "zhch158"