"""
Core merge module.

Integrates the individual sub-modules and exposes a single, unified merge
interface.
"""
import json
from typing import Dict, List, Optional

try:
    from .text_matcher import TextMatcher
    from .bbox_extractor import BBoxExtractor
    from .data_processor import DataProcessor
    from .markdown_generator import MarkdownGenerator
except ImportError:
    from text_matcher import TextMatcher
    from bbox_extractor import BBoxExtractor
    from data_processor import DataProcessor
    from markdown_generator import MarkdownGenerator
class MinerUPaddleOCRMerger:
    """Merge MinerU results with PaddleOCR results.

    The class is a thin facade: it wires together a ``TextMatcher``, a
    ``BBoxExtractor``, a ``DataProcessor`` and a ``MarkdownGenerator`` and
    forwards each public call to the matching sub-module.
    """

    def __init__(self, look_ahead_window: int = 10, similarity_threshold: int = 90):
        """Store the tuning parameters and build the sub-modules.

        Args:
            look_ahead_window: Size of the look-ahead search window; forwarded
                to ``DataProcessor``.
            similarity_threshold: Text similarity threshold; forwarded to
                ``TextMatcher``.
        """
        self.look_ahead_window = look_ahead_window
        self.similarity_threshold = similarity_threshold

        # Initialise the sub-modules. DataProcessor shares the TextMatcher
        # instance created here rather than building its own.
        self.text_matcher = TextMatcher(similarity_threshold)
        self.bbox_extractor = BBoxExtractor()
        self.data_processor = DataProcessor(self.text_matcher, look_ahead_window)
        self.markdown_generator = MarkdownGenerator()

    @staticmethod
    def _load_json(path: str):
        """Read the UTF-8 encoded JSON file at ``path`` and return the parsed value."""
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def merge_table_with_bbox(self, mineru_json_path: str,
                              paddle_json_path: str) -> List[Dict]:
        """Merge the MinerU output with the PaddleOCR output.

        Args:
            mineru_json_path: Path to the JSON file produced by MinerU.
            paddle_json_path: Path to the JSON file produced by PaddleOCR.

        Returns:
            Whatever ``DataProcessor.process_mineru_data`` returns for the
            loaded MinerU data and the extracted PaddleOCR text boxes.

        Raises:
            OSError: If either file cannot be opened.
            json.JSONDecodeError: If either file does not contain valid JSON.
        """
        # Load both inputs (MinerU first, matching the original read order).
        mineru_data = self._load_json(mineru_json_path)
        paddle_data = self._load_json(paddle_json_path)

        # Extract the text-box information from the PaddleOCR data.
        paddle_text_boxes = self.bbox_extractor.extract_paddle_text_boxes(paddle_data)

        # Process the MinerU data against the PaddleOCR text boxes.
        return self.data_processor.process_mineru_data(mineru_data, paddle_text_boxes)

    def generate_enhanced_markdown(self, merged_data: List[Dict],
                                   output_path: Optional[str] = None,
                                   mineru_file: Optional[str] = None) -> str:
        """Generate the enhanced Markdown for ``merged_data``.

        Args:
            merged_data: Merged data, e.g. the result of
                ``merge_table_with_bbox``.
            output_path: Forwarded unchanged to the Markdown generator
                (presumably the destination file; confirm against
                ``MarkdownGenerator``). ``None`` by default.
            mineru_file: Forwarded unchanged to the Markdown generator
                (presumably the source MinerU file; confirm against
                ``MarkdownGenerator``). ``None`` by default.

        Returns:
            The value returned by the Markdown generator.
        """
        # NOTE(review): this relies on an underscore-prefixed method of
        # MarkdownGenerator; switch to a public entry point if one exists.
        return self.markdown_generator._generate_mineru_markdown(
            merged_data, output_path, mineru_file
        )

    def extract_table_cells_with_bbox(self, merged_data: List[Dict]) -> List[Dict]:
        """Extract all table cells together with their bbox information.

        Delegates directly to ``BBoxExtractor.extract_table_cells_with_bbox``.
        """
        return self.bbox_extractor.extract_table_cells_with_bbox(merged_data)
|