|
|
@@ -0,0 +1,76 @@
|
|
|
+"""
|
|
|
+PaddleOCR_VL 和 PaddleOCR 合并模块
|
|
|
+"""
|
|
|
+import json
|
|
|
+from typing import List, Dict
|
|
|
+
|
|
|
+try:
|
|
|
+ from .text_matcher import TextMatcher
|
|
|
+ from .bbox_extractor import BBoxExtractor
|
|
|
+ from .data_processor import DataProcessor
|
|
|
+ from .markdown_generator import MarkdownGenerator
|
|
|
+except ImportError:
|
|
|
+ from text_matcher import TextMatcher
|
|
|
+ from bbox_extractor import BBoxExtractor
|
|
|
+ from data_processor import DataProcessor
|
|
|
+ from markdown_generator import MarkdownGenerator
|
|
|
+
|
|
|
+
|
|
|
class PaddleOCRVLMerger:
    """Merge PaddleOCR_VL results with PaddleOCR results.

    This class is a thin facade. It builds a ``TextMatcher``, a
    ``BBoxExtractor``, a ``DataProcessor`` and a ``MarkdownGenerator`` and
    forwards every public call to one of them.
    """

    def __init__(self, look_ahead_window: int = 10, similarity_threshold: int = 90):
        """Record the tuning parameters and create the collaborators.

        Args:
            look_ahead_window: Size of the look-ahead search window; it is
                passed on to ``DataProcessor``.
            similarity_threshold: Text similarity threshold; it is passed
                on to ``TextMatcher``.
        """
        self.look_ahead_window = look_ahead_window
        self.similarity_threshold = similarity_threshold

        # Build the sub-modules. The data processor receives the very same
        # matcher instance that is stored on this object.
        matcher = TextMatcher(similarity_threshold)
        self.text_matcher = matcher
        self.bbox_extractor = BBoxExtractor()
        self.data_processor = DataProcessor(matcher, look_ahead_window)
        self.markdown_generator = MarkdownGenerator()

    @staticmethod
    def _read_json(json_path: str):
        """Return the parsed content of the UTF-8 JSON file at *json_path*."""
        with open(json_path, 'r', encoding='utf-8') as handle:
            return json.load(handle)

    def merge_table_with_bbox(self, paddleocr_vl_json_path: str,
                              paddle_json_path: str) -> List[Dict]:
        """Merge the PaddleOCR_VL result with the PaddleOCR result.

        Args:
            paddleocr_vl_json_path: Path of the JSON file written by
                PaddleOCR_VL.
            paddle_json_path: Path of the JSON file written by PaddleOCR.

        Returns:
            Whatever ``DataProcessor.process_paddleocr_vl_data`` produces
            for the two inputs (described by the original author as the
            merged result in MinerU format).
        """
        # The PaddleOCR_VL file is read first, then the PaddleOCR file.
        vl_payload = self._read_json(paddleocr_vl_json_path)
        ocr_payload = self._read_json(paddle_json_path)

        # Text boxes come from the PaddleOCR payload only.
        text_boxes = self.bbox_extractor.extract_paddle_text_boxes(ocr_payload)

        # The PaddleOCR_VL payload is processed together with those boxes.
        return self.data_processor.process_paddleocr_vl_data(
            vl_payload, text_boxes
        )

    def generate_enhanced_markdown(self, merged_data: List[Dict],
                                   output_path: str = None,
                                   source_file: str = None) -> str:
        """Generate the enhanced Markdown by delegating to the generator.

        All three arguments are forwarded positionally and unchanged to
        ``MarkdownGenerator._generate_paddleocr_vl_markdown``; its return
        value is returned as is.
        """
        generator = self.markdown_generator
        return generator._generate_paddleocr_vl_markdown(
            merged_data, output_path, source_file
        )

    def extract_table_cells_with_bbox(self, merged_data: List[Dict]) -> List[Dict]:
        """Extract all table cells with their bbox information.

        The work is delegated to
        ``BBoxExtractor.extract_table_cells_with_bbox``.
        """
        extractor = self.bbox_extractor
        return extractor.extract_table_cells_with_bbox(merged_data)
|