""" PaddleOCR_VL 和 PaddleOCR 合并模块 """ import json from typing import List, Dict try: from .text_matcher import TextMatcher from .bbox_extractor import BBoxExtractor from .data_processor import DataProcessor from .markdown_generator import MarkdownGenerator from .unified_output_converter import UnifiedOutputConverter except ImportError: from text_matcher import TextMatcher from bbox_extractor import BBoxExtractor from data_processor import DataProcessor from markdown_generator import MarkdownGenerator from unified_output_converter import UnifiedOutputConverter class PaddleOCRVLMerger: """PaddleOCR_VL 和 PaddleOCR 结果合并器""" def __init__(self, look_ahead_window: int = 10, similarity_threshold: int = 90): """ Args: look_ahead_window: 向前查找的窗口大小 similarity_threshold: 文本相似度阈值 """ self.look_ahead_window = look_ahead_window self.similarity_threshold = similarity_threshold # 初始化子模块 self.text_matcher = TextMatcher(similarity_threshold) self.bbox_extractor = BBoxExtractor() self.data_processor = DataProcessor(self.text_matcher, look_ahead_window) self.markdown_generator = MarkdownGenerator() self.output_converter = UnifiedOutputConverter() def merge_table_with_bbox(self, paddleocr_vl_json_path: str, paddle_json_path: str, data_format: str = 'mineru') -> List[Dict]: """ 合并 PaddleOCR_VL 和 PaddleOCR 的结果 Args: paddleocr_vl_json_path: PaddleOCR_VL 输出的 JSON 路径 paddle_json_path: PaddleOCR 输出的 JSON 路径 data_format: 输出格式 ('mineru' 或 'paddleocr_vl') Returns: 合并后的结果 (默认MinerU格式) """ # 加载数据 with open(paddleocr_vl_json_path, 'r', encoding='utf-8') as f: paddleocr_vl_data = json.load(f) with open(paddle_json_path, 'r', encoding='utf-8') as f: paddle_data = json.load(f) # 提取 PaddleOCR 的文字框信息 paddle_text_boxes = self.bbox_extractor.extract_paddle_text_boxes(paddle_data) # 处理 PaddleOCR_VL 的数据, merge后已是minerU json格式 merged_data = self.data_processor.process_paddleocr_vl_data( paddleocr_vl_data, paddle_text_boxes ) # 不用再转换, # 转换为指定格式 # if data_format == 'mineru': # merged_data = self.output_converter.convert_to_mineru_format( # merged_data, data_source='paddleocr_vl' # ) return merged_data def generate_enhanced_markdown(self, merged_data: List[Dict], output_path: str = None, source_file: str = None, data_format: str = None) -> str: """ 生成增强的 Markdown Args: merged_data: 合并后的数据 output_path: 输出路径 source_file: 源文件路径 data_format: 数据格式,None 则自动检测 """ # 如果data_format未指定,自动检测 if data_format is None: data_format = self.markdown_generator.detect_data_format(merged_data) # 如果是PaddleOCR_VL格式,先转换为MinerU格式 if data_format == 'paddleocr_vl': merged_data = self.output_converter.convert_to_mineru_format( merged_data, data_source='paddleocr_vl' ) data_format = 'mineru' return self.markdown_generator.generate_enhanced_markdown( merged_data, output_path, source_file, data_format ) def extract_table_cells_with_bbox(self, merged_data: List[Dict]) -> List[Dict]: """提取所有表格单元格及其 bbox 信息""" # 确保数据是MinerU格式 if self.output_converter._detect_data_source(merged_data) != 'mineru': merged_data = self.output_converter.convert_to_mineru_format(merged_data) return self.bbox_extractor.extract_table_cells_with_bbox(merged_data)