merger_core.py 2.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677
  1. """
  2. 核心合并模块
  3. 整合各个子模块,提供统一的合并接口
  4. """
  5. import json
  6. from typing import List, Dict
  7. try:
  8. from .text_matcher import TextMatcher
  9. from .bbox_extractor import BBoxExtractor
  10. from .data_processor import DataProcessor
  11. from .markdown_generator import MarkdownGenerator
  12. except ImportError:
  13. from text_matcher import TextMatcher
  14. from bbox_extractor import BBoxExtractor
  15. from data_processor import DataProcessor
  16. from markdown_generator import MarkdownGenerator
  17. class MinerUPaddleOCRMerger:
  18. """MinerU 和 PaddleOCR 结果合并器"""
  19. def __init__(self, look_ahead_window: int = 10, similarity_threshold: int = 90):
  20. """
  21. Args:
  22. look_ahead_window: 向前查找的窗口大小
  23. similarity_threshold: 文本相似度阈值
  24. """
  25. self.look_ahead_window = look_ahead_window
  26. self.similarity_threshold = similarity_threshold
  27. # 初始化子模块
  28. self.text_matcher = TextMatcher(similarity_threshold)
  29. self.bbox_extractor = BBoxExtractor()
  30. self.data_processor = DataProcessor(self.text_matcher, look_ahead_window)
  31. self.markdown_generator = MarkdownGenerator()
  32. def merge_table_with_bbox(self, mineru_json_path: str,
  33. paddle_json_path: str) -> List[Dict]:
  34. """
  35. 合并 MinerU 和 PaddleOCR 的结果
  36. Args:
  37. mineru_json_path: MinerU 输出的 JSON 路径
  38. paddle_json_path: PaddleOCR 输出的 JSON 路径
  39. Returns:
  40. 合并后的结果
  41. """
  42. # 加载数据
  43. with open(mineru_json_path, 'r', encoding='utf-8') as f:
  44. mineru_data = json.load(f)
  45. with open(paddle_json_path, 'r', encoding='utf-8') as f:
  46. paddle_data = json.load(f)
  47. # 提取 PaddleOCR 的文字框信息
  48. paddle_text_boxes = self.bbox_extractor.extract_paddle_text_boxes(paddle_data)
  49. # 处理 MinerU 的数据
  50. merged_data = self.data_processor.process_mineru_data(
  51. mineru_data, paddle_text_boxes
  52. )
  53. return merged_data
  54. def generate_enhanced_markdown(self, merged_data: List[Dict],
  55. output_path: str = None,
  56. mineru_file: str = None) -> str:
  57. """生成增强的 Markdown"""
  58. return self.markdown_generator._generate_mineru_markdown(
  59. merged_data, output_path, mineru_file
  60. )
  61. def extract_table_cells_with_bbox(self, merged_data: List[Dict]) -> List[Dict]:
  62. """提取所有表格单元格及其 bbox 信息"""
  63. return self.bbox_extractor.extract_table_cells_with_bbox(merged_data)