1
0

7 Коммиты a59da04cec ... 788e93532b

Автор SHA1 Сообщение Дата
  zhch158_admin 788e93532b feat: 添加文件路径检查,确保切换数据源时路径有效 1 неделя назад
  zhch158_admin d451e66d4c feat: 添加 DotsOCR 和 PaddleOCR 合并程序,支持单文件和批量处理,输出为统一的MinerU格式 1 неделя назад
  zhch158_admin 6e82eedf30 feat: 添加 DotsOCR 和 PaddleOCR 合并模块,支持 JSON 数据合并和 Markdown 生成 1 неделя назад
  zhch158_admin 7018b3372e feat: 添加 DotsOCR 数据处理功能,支持转换为 MinerU 格式并添加 bbox 信息 1 неделя назад
  zhch158_admin 810f8e84a7 feat: 添加 DotsOCR (带 cell bbox) 工具配置,支持结果目录和描述 1 неделя назад
  zhch158_admin 18549b7bc5 feat: 添加 2023年度报告母公司.pdf 到 PDF 列表 1 неделя назад
  zhch158_admin 1f38e81a65 fix: 修正 DotsOCR 的合并脚本映射,确保正确使用合并脚本 1 неделя назад

+ 3 - 1
batch_ocr/batch_merge_results.py

@@ -40,7 +40,7 @@ class BatchMerger:
     MERGER_SCRIPTS = {
         'paddleocr_vl': 'merge_paddleocr_vl_paddleocr.py',
         'mineru': 'merge_mineru_paddle_ocr.py',
-        'dotsocr': 'merge_mineru_paddle_ocr.py',  # DotsOCR 也用 MinerU 格式
+        'dotsocr': 'merge_dotsocr_paddleocr.py'
     }
     
     def __init__(self, config_file: str, base_dir: str = None):
@@ -405,6 +405,8 @@ class BatchMerger:
             return 'paddleocr-vl'
         elif 'mineru' in script_name:
             return 'mineru'
+        elif 'dotsocr' in script_name:
+            return 'dotsocr'
         else:
             return 'vl'
     

+ 1 - 0
batch_ocr/pdf_list.txt

@@ -2,3 +2,4 @@
 对公_招商银行图.pdf
 A用户_单元格扫描流水.pdf
 B用户_扫描流水.pdf
+2023年度报告母公司.pdf

+ 7 - 0
config/A用户_单元格扫描流水.yaml

@@ -45,4 +45,11 @@ document:
       image_dir: "dotsocr_vllm_results/{{name}}"
       description: "Dots OCR 图片合成结果"
       enabled: true
+  
+    # DotsOCR (带 cell bbox)
+    - tool: "mineru"
+      result_dir: "dotsocr_vllm_results_cell_bbox"
+      image_dir: "dotsocr_vllm_results/{{name}}"
+      description: "Dots OCR + PaddleOCR 坐标"
+      enabled: true
   

+ 7 - 0
config/B用户_扫描流水.yaml

@@ -44,4 +44,11 @@ document:
       result_dir: "dotsocr_vllm_results"
       image_dir: "dotsocr_vllm_results/{{name}}"
       description: "Dots OCR 图片合成结果"
+      enabled: true
+  
+    # DotsOCR (带 cell bbox)
+    - tool: "mineru"
+      result_dir: "dotsocr_vllm_results_cell_bbox"
+      image_dir: "dotsocr_vllm_results/{{name}}"
+      description: "Dots OCR + PaddleOCR 坐标"
       enabled: true

+ 7 - 0
config/对公_招商银行图.yaml

@@ -44,4 +44,11 @@ document:
       result_dir: "dotsocr_vllm_results"
       image_dir: "dotsocr_vllm_results/{{name}}"
       description: "Dots OCR 图片合成结果"
+      enabled: true
+  
+    # DotsOCR (带 cell bbox)
+    - tool: "mineru"
+      result_dir: "dotsocr_vllm_results_cell_bbox"
+      image_dir: "dotsocr_vllm_results/{{name}}"
+      description: "Dots OCR + PaddleOCR 坐标"
       enabled: true

+ 7 - 0
config/德_内蒙古银行照.yaml

@@ -45,4 +45,11 @@ document:
       result_dir: "dotsocr_vllm_results"
       image_dir: "dotsocr_vllm_results/{{name}}"
       description: "Dots OCR 图片合成结果"
+      enabled: true
+  
+    # DotsOCR (带 cell bbox)
+    - tool: "mineru"
+      result_dir: "dotsocr_vllm_results_cell_bbox"
+      image_dir: "dotsocr_vllm_results/{{name}}"
+      description: "Dots OCR + PaddleOCR 坐标"
       enabled: true

+ 9 - 2
config/至远彩色_2023年报.yaml

@@ -1,6 +1,6 @@
 document:
-  name: "至远彩色_2023年报"
-  base_dir: "/Users/zhch158/workspace/data/流水分析/至远彩色_2023年报"
+  name: "2023年告母公司"
+  base_dir: "/Users/zhch158/workspace/data/流水分析/2023年告母公司"
   
   # 🎯 关键改进:定义该文档使用的 OCR 工具及其结果目录
   ocr_results:
@@ -44,4 +44,11 @@ document:
       result_dir: "dotsocr_vllm_results"
       image_dir: "dotsocr_vllm_results/{{name}}"
       description: "Dots OCR 图片合成结果"
+      enabled: true
+  
+    # DotsOCR (带 cell bbox)
+    - tool: "mineru"
+      result_dir: "dotsocr_vllm_results_cell_bbox"
+      image_dir: "dotsocr_vllm_results/{{name}}"
+      description: "Dots OCR + PaddleOCR 坐标"
       enabled: true

+ 257 - 56
merger/data_processor.py

@@ -1,6 +1,6 @@
 """
 数据处理模块
-负责处理 MinerU/PaddleOCR_VL 数据,添加 bbox 信息
+负责处理 MinerU/PaddleOCR_VL/DotsOCR 数据,添加 bbox 信息
 """
 from typing import List, Dict, Tuple
 from bs4 import BeautifulSoup
@@ -71,56 +71,292 @@ class DataProcessor:
         
         return merged_data
     
+    def process_dotsocr_data(self, dotsocr_data: List[Dict],
+                            paddle_text_boxes: List[Dict]) -> List[Dict]:
+        """
+        🎯 处理 DotsOCR 数据,转换为 MinerU 格式并添加 bbox 信息
+        
+        Args:
+            dotsocr_data: DotsOCR 数据
+            paddle_text_boxes: PaddleOCR 文字框列表
+        
+        Returns:
+            MinerU 格式的合并数据
+        """
+        merged_data = []
+        paddle_pointer = 0
+        last_matched_index = 0
+        
+        # 按 bbox 排序
+        dotsocr_data.sort(
+            key=lambda x: (x['bbox'][1], x['bbox'][0])
+            if 'bbox' in x else (float('inf'), float('inf'))
+        )
+        
+        for item in dotsocr_data:
+            # 🎯 转换为 MinerU 格式
+            mineru_item = self._convert_dotsocr_to_mineru(item)
+            category = mineru_item.get('type', '')
+            
+            # 🎯 根据类型处理
+            if category.lower() == 'table':
+                merged_item, paddle_pointer = self._process_dotsocr_table(
+                    mineru_item, paddle_text_boxes, paddle_pointer
+                )
+                merged_data.append(merged_item)
+            
+            elif category.lower() in ['text', 'title', 'header', 'footer']:
+                merged_item, paddle_pointer, last_matched_index = self._process_text(
+                    mineru_item, paddle_text_boxes, paddle_pointer, last_matched_index
+                )
+                merged_data.append(merged_item)
+            
+            elif category.lower() == 'list':
+                merged_item, paddle_pointer, last_matched_index = self._process_list(
+                    mineru_item, paddle_text_boxes, paddle_pointer, last_matched_index
+                )
+                merged_data.append(merged_item)
+            
+            else:
+                # Page-header, Page-footer, Picture 等
+                merged_data.append(mineru_item)
+        
+        return merged_data
+    
+    def _convert_dotsocr_to_mineru(self, dotsocr_item: Dict) -> Dict:
+        """
+        Convert a single DotsOCR item to a MinerU-style item.
+        
+        DotsOCR:
+        {
+            "category": "Table",
+            "bbox": [x1, y1, x2, y2],
+            "text": "..."
+        }
+        
+        MinerU:
+        {
+            "type": "table",
+            "bbox": [x1, y1, x2, y2],
+            "table_body": "...",
+            "page_idx": 0
+        }
+
+        Args:
+            dotsocr_item: DotsOCR item; ``category``, ``bbox`` and ``text`` are
+                read, each with a default when the key is missing.
+
+        Returns:
+            A new dict with ``type``, ``bbox`` and ``page_idx``, plus
+            ``table_body`` for tables or ``text`` for every other type.
+            ``Section-header`` items additionally get ``text_level`` 1.
+        """
+        category = dotsocr_item.get('category', '')
+        
+        # DotsOCR category -> MinerU type
+        category_map = {
+            'Page-header': 'header',
+            'Page-footer': 'footer',
+            'Picture': 'image',
+            'Figure': 'image',
+            'Section-header': 'title',
+            'Table': 'table',
+            'Text': 'text',
+            'Title': 'title',
+            'List': 'list',
+            'Caption': 'title'
+        }
+        
+        # Categories not listed above fall back to 'text'
+        mineru_type = category_map.get(category, 'text')
+        
+        # Fields common to every type
+        mineru_item = {
+            'type': mineru_type,
+            'bbox': dotsocr_item.get('bbox', []),
+            'page_idx': 0  # always 0: DotsOCR results are treated as single-page
+        }
+        
+        # Place the text content under the key that matches the type
+        text = dotsocr_item.get('text', '')
+        
+        if mineru_type == 'table':
+            # Table: text -> table_body
+            mineru_item['table_body'] = text
+        else:
+            # Every other type keeps the content under 'text'
+            mineru_item['text'] = text
+            
+            # Only Section-header gets a heading level; Title and Caption do not
+            if category == 'Section-header':
+                mineru_item['text_level'] = 1
+        
+        return mineru_item
+    
+    def _process_dotsocr_table(self, item: Dict, paddle_text_boxes: List[Dict],
+                              start_pointer: int) -> Tuple[Dict, int]:
+        """
+        Add bbox information to a DotsOCR table that is already in MinerU format.
+
+        The table HTML is read from ``table_body``.
+
+        Args:
+            item: MinerU-format table item.
+            paddle_text_boxes: PaddleOCR text boxes used for matching.
+            start_pointer: Position in ``paddle_text_boxes`` to start matching from.
+
+        Returns:
+            ``(merged_item, pointer)``. When ``table_body`` is empty the item
+            copy and ``start_pointer`` are returned unchanged.
+        """
+        # Shallow copy so the keys added below do not touch the caller's dict
+        merged_item = item.copy()
+        table_html = item.get('table_body', '')
+        
+        if not table_html:
+            return merged_item, start_pointer
+        
+        # Reuse the shared table-enhancement helper
+        enhanced_html, cells, new_pointer = self._enhance_table_html_with_bbox(
+            table_html, paddle_text_boxes, start_pointer
+        )
+        
+        # The enhanced HTML is stored under both keys
+        merged_item['table_body'] = enhanced_html
+        merged_item['table_body_with_bbox'] = enhanced_html
+        merged_item['bbox_mapping'] = 'merged_from_paddle_ocr'
+        merged_item['table_cells'] = cells if cells else []
+        
+        return merged_item, new_pointer
+    
     def process_paddleocr_vl_data(self, paddleocr_vl_data: Dict,
                                   paddle_text_boxes: List[Dict]) -> List[Dict]:
         """
         处理 PaddleOCR_VL 数据,添加 bbox 信息
-        
+    
         Args:
             paddleocr_vl_data: PaddleOCR_VL 数据 (JSON 对象)
             paddle_text_boxes: PaddleOCR 文字框列表
-        
+    
         Returns:
-            合并后的数据 (PPStruct3 格式, cell信息在parsing_res_list)
+            MinerU 格式的合并数据(统一输出格式)
         """
         merged_data = []
         paddle_pointer = 0
         last_matched_index = 0
-        
+    
         # 提取 parsing_res_list
         parsing_res_list = paddleocr_vl_data.get('parsing_res_list', [])
-        
+    
         # 按 bbox 排序
         parsing_res_list.sort(
             key=lambda x: (x['block_bbox'][1], x['block_bbox'][0])
             if 'block_bbox' in x else (float('inf'), float('inf'))
         )
-        
+    
         for item in parsing_res_list:
-            block_label = item.get('block_label', '')
+            # 🎯 统一转换为 MinerU 格式
+            mineru_item = self._convert_paddleocr_vl_to_mineru(item)
+            item_type = mineru_item.get('type', '')
             
-            # PPStruct3 格式, cell信息在parsing_res_list
-            if block_label == 'table':
-                merged_item, paddle_pointer = self._process_paddleocr_vl_table(
-                    item, paddle_text_boxes, paddle_pointer
+            # 🎯 根据类型处理(复用 MinerU 的通用方法)
+            if item_type == 'table':
+                merged_item, paddle_pointer = self._process_table(
+                    mineru_item, paddle_text_boxes, paddle_pointer
                 )
                 merged_data.append(merged_item)
-
-            elif 'title' in block_label or block_label in ['text', 'number']:
-                merged_item, paddle_pointer, last_matched_index = self._process_paddleocr_vl_text(
-                    item, paddle_text_boxes, paddle_pointer, last_matched_index
+        
+            elif item_type in ['text', 'title', 'header', 'footer', 'equation']:
+                merged_item, paddle_pointer, last_matched_index = self._process_text(
+                    mineru_item, paddle_text_boxes, paddle_pointer, last_matched_index
                 )
                 merged_data.append(merged_item)
-            
-            else:
-                # 其他类型直接转换
-                merged_data.append(item.copy())
         
+            elif item_type == 'list':
+                merged_item, paddle_pointer, last_matched_index = self._process_list(
+                    mineru_item, paddle_text_boxes, paddle_pointer, last_matched_index
+                )
+                merged_data.append(merged_item)
+        
+            else:
+                # 其他类型(image 等)直接添加
+                merged_data.append(mineru_item)
+    
         return merged_data
     
+    def _convert_paddleocr_vl_to_mineru(self, paddleocr_vl_item: Dict) -> Dict:
+        """
+        Convert a single PaddleOCR_VL block to a MinerU-style item.
+        
+        PaddleOCR_VL (PP-DocLayout_plus-L):
+        {
+            "block_label": "paragraph_title",  # or "doc_title", "text", ...
+            "block_bbox": [172, 151, 547, 184],
+            "block_content": "...",
+            "block_id": 0
+        }
+        
+        MinerU:
+        {
+            "type": "title",
+            "bbox": [172, 151, 547, 184],
+            "text": "...",
+            "text_level": 1,
+            "page_idx": 0
+        }
+
+        Args:
+            paddleocr_vl_item: PaddleOCR_VL block; ``block_label``,
+                ``block_bbox`` and ``block_content`` are read, each with a
+                default when the key is missing.
+
+        Returns:
+            A new dict with ``type``, ``bbox`` and ``page_idx``, plus
+            ``table_body`` for tables or ``text`` for every other type. The
+            three title labels additionally get a ``text_level`` of 1-3.
+        """
+        block_label = paddleocr_vl_item.get('block_label', '')
+        
+        # PP-DocLayout_plus-L label -> MinerU type
+        label_map = {
+            # Titles
+            'paragraph_title': 'title',      # paragraph heading -> title (level 2)
+            'doc_title': 'title',            # document title -> title (level 1)
+            'figure_table_chart_title': 'title',  # figure/table/chart caption -> title (level 3)
+            
+            # Text
+            'text': 'text',
+            'number': 'text',
+            'content': 'text',
+            'abstract': 'text',
+            'footnote': 'text',
+            'aside_text': 'text',
+            'algorithm': 'text',
+            
+            # References
+            'reference': 'text',
+            'reference_content': 'text',
+            
+            # Page header / footer
+            'header': 'header',
+            'footer': 'footer',
+            
+            # Tables
+            'table': 'table',
+            
+            # Images
+            'image': 'image',
+            'chart': 'image',
+            
+            # Formulas
+            'formula': 'equation',
+            'formula_number': 'equation',
+            
+            # Seals
+            'seal': 'image'
+        }
+        
+        # Labels not listed above fall back to 'text'
+        mineru_type = label_map.get(block_label, 'text')
+        
+        # Fields common to every type
+        mineru_item = {
+            'type': mineru_type,
+            'bbox': paddleocr_vl_item.get('block_bbox', []),
+            'page_idx': 0
+        }
+        
+        # Place the block content under the key that matches the type
+        content = paddleocr_vl_item.get('block_content', '')
+        
+        if mineru_type == 'table':
+            # Table: block_content -> table_body
+            mineru_item['table_body'] = content
+        else:
+            # Every other type: block_content -> text
+            mineru_item['text'] = content
+            
+            # Heading level is derived from the original label, not the mapped type
+            if block_label == 'doc_title':
+                mineru_item['text_level'] = 1  # document title
+            elif block_label == 'paragraph_title':
+                mineru_item['text_level'] = 2  # paragraph heading
+            elif block_label == 'figure_table_chart_title':
+                mineru_item['text_level'] = 3  # figure/table/chart caption
+    
+        return mineru_item
+    
     def _process_table(self, item: Dict, paddle_text_boxes: List[Dict],
                       start_pointer: int) -> Tuple[Dict, int]:
-        """处理表格"""
+        """处理 MinerU 表格"""
         merged_item = item.copy()
         table_html = item.get('table_body', '')
         
@@ -170,41 +406,6 @@ class DataProcessor:
         
         return merged_item, paddle_pointer, last_matched_index
     
-    def _process_paddleocr_vl_table(self, item: Dict, paddle_text_boxes: List[Dict],
-                                    start_pointer: int) -> Tuple[Dict, int]:
-        """处理 PaddleOCR_VL 表格"""
-        merged_item = item.copy()
-        table_html = item.get('block_content', '')
-        
-        enhanced_html, cells, new_pointer = self._enhance_table_html_with_bbox(
-            table_html, paddle_text_boxes, start_pointer
-        )
-        
-        # merge item使用item的所有信息,但重写block_content为增强后的html,增加单元格信息
-        merged_item['block_content'] = enhanced_html
-        merged_item['block_content_with_bbox'] = enhanced_html
-        merged_item['bbox_mapping'] = 'merged_from_paddle_ocr'
-        merged_item['table_cells'] = cells if cells else []
-        
-        return merged_item, new_pointer
-    
-    def _process_paddleocr_vl_text(self, item: Dict, paddle_text_boxes: List[Dict],
-                                   paddle_pointer: int, last_matched_index: int) -> Tuple[Dict, int, int]:
-        """处理 PaddleOCR_VL 文本"""
-        merged_item = item.copy()        
-        text = item.get('block_content', '')
-        
-        matched_bbox, paddle_pointer, last_matched_index = \
-            self.text_matcher.find_matching_bbox(
-                text, paddle_text_boxes, paddle_pointer, last_matched_index,
-                self.look_ahead_window
-            )
-        
-        if matched_bbox:
-            matched_bbox['used'] = True
-        
-        return merged_item, paddle_pointer, last_matched_index
-    
     def _enhance_table_html_with_bbox(self, html: str, paddle_text_boxes: List[Dict],
                                       start_pointer: int) -> Tuple[str, List[Dict], int]:
         """为 HTML 表格添加 bbox 信息"""

+ 92 - 0
merger/dotsocr_merger.py

@@ -0,0 +1,92 @@
+"""
+DotsOCR 和 PaddleOCR 合并模块
+"""
+import json
+from typing import List, Dict
+
+try:
+    from .text_matcher import TextMatcher
+    from .bbox_extractor import BBoxExtractor
+    from .data_processor import DataProcessor
+    from .markdown_generator import MarkdownGenerator
+    from .unified_output_converter import UnifiedOutputConverter
+except ImportError:
+    from text_matcher import TextMatcher
+    from bbox_extractor import BBoxExtractor
+    from data_processor import DataProcessor
+    from markdown_generator import MarkdownGenerator
+    from unified_output_converter import UnifiedOutputConverter
+
+
+class DotsOCRMerger:
+    """Merges DotsOCR results with PaddleOCR results."""
+    
+    def __init__(self, look_ahead_window: int = 10, similarity_threshold: int = 90):
+        """
+        Args:
+            look_ahead_window: Size of the forward search window; passed to
+                DataProcessor.
+            similarity_threshold: Text similarity threshold; passed to
+                TextMatcher.
+        """
+        self.look_ahead_window = look_ahead_window
+        self.similarity_threshold = similarity_threshold
+        
+        # Build the collaborating components
+        self.text_matcher = TextMatcher(similarity_threshold)
+        self.bbox_extractor = BBoxExtractor()
+        self.data_processor = DataProcessor(self.text_matcher, look_ahead_window)
+        self.markdown_generator = MarkdownGenerator()
+        self.output_converter = UnifiedOutputConverter()
+    
+    def merge_table_with_bbox(self, dotsocr_json_path: str, 
+                             paddle_json_path: str,
+                             data_format: str = 'mineru') -> List[Dict]:
+        """
+        Merge the DotsOCR and PaddleOCR results of one page.
+        
+        Args:
+            dotsocr_json_path: Path to the DotsOCR JSON output.
+            paddle_json_path: Path to the PaddleOCR JSON output.
+            data_format: Not read by this method; the result is always
+                produced by ``DataProcessor.process_dotsocr_data``.
+        
+        Returns:
+            The merged data in MinerU format.
+        """
+        # Load both inputs
+        with open(dotsocr_json_path, 'r', encoding='utf-8') as f:
+            dotsocr_data = json.load(f)
+        
+        with open(paddle_json_path, 'r', encoding='utf-8') as f:
+            paddle_data = json.load(f)
+        
+        # Extract the PaddleOCR text boxes
+        paddle_text_boxes = self.bbox_extractor.extract_paddle_text_boxes(paddle_data)
+        
+        # DotsOCR-specific processing (converts to MinerU format)
+        merged_data = self.data_processor.process_dotsocr_data(
+            dotsocr_data, paddle_text_boxes
+        )
+        
+        return merged_data
+    
+    def generate_enhanced_markdown(self, merged_data: List[Dict], 
+                                   output_path: str = None,
+                                   source_file: str = None,
+                                   data_format: str = 'mineru') -> str:
+        """
+        Generate enhanced Markdown from MinerU-format data.
+        
+        Args:
+            merged_data: Merged data in MinerU format.
+            output_path: Output path, forwarded to the Markdown generator.
+            source_file: Source file path, forwarded to the Markdown generator.
+            data_format: Ignored; 'mineru' is always passed to the generator.
+
+        Returns:
+            Whatever ``MarkdownGenerator.generate_enhanced_markdown`` returns.
+        """
+        # Always generate with the MinerU format, regardless of data_format
+        return self.markdown_generator.generate_enhanced_markdown(
+            merged_data, output_path, source_file, data_format='mineru'
+        )
+    
+    def extract_table_cells_with_bbox(self, merged_data: List[Dict]) -> List[Dict]:
+        """Extract all table cells with their bbox information."""
+        # Delegates directly to BBoxExtractor
+        return self.bbox_extractor.extract_table_cells_with_bbox(merged_data)

+ 313 - 0
merger/merge_dotsocr_paddleocr.py

@@ -0,0 +1,313 @@
+"""
+合并 DotsOCR 和 PaddleOCR 的结果
+主程序入口
+"""
+import json
+import argparse
+from pathlib import Path
+
+try:
+    from .dotsocr_merger import DotsOCRMerger
+except ImportError:
+    from dotsocr_merger import DotsOCRMerger
+
+
+def merge_single_file(dotsocr_file: Path, paddle_file: Path, output_dir: Path, 
+                     output_type: str, merger: DotsOCRMerger) -> bool:
+    """
+    合并单个文件
+    
+    Args:
+        dotsocr_file: DotsOCR JSON 文件路径
+        paddle_file: PaddleOCR JSON 文件路径
+        output_dir: 输出目录
+        output_type: 输出格式
+        merger: 合并器实例
+    
+    Returns:
+        是否成功
+    """
+    print(f"📄 处理: {dotsocr_file.name}")
+    
+    # 输出文件路径
+    merged_md_path = output_dir / f"{dotsocr_file.stem}.md"
+    merged_json_path = output_dir / f"{dotsocr_file.stem}.json"
+    
+    try:
+        # ✅ 合并数据 (统一输出为MinerU格式)
+        merged_data = merger.merge_table_with_bbox(
+            str(dotsocr_file),
+            str(paddle_file),
+            data_format='mineru'  # 强制使用MinerU格式
+        )
+        
+        # ✅ 生成 Markdown (基于MinerU格式)
+        if output_type in ['markdown', 'both']:
+            markdown = merger.generate_enhanced_markdown(
+                merged_data, 
+                str(merged_md_path), 
+                str(dotsocr_file),
+                data_format='mineru'  # 强制使用MinerU格式
+            )
+        
+        # ✅ 保存 JSON (MinerU格式)
+        if output_type in ['json', 'both']:
+            with open(merged_json_path, 'w', encoding='utf-8') as f:
+                json.dump(merged_data, f, ensure_ascii=False, indent=2)
+
+        print(f"  ✅ 合并完成 (MinerU格式)")
+        print(f"  📊 共处理了 {len(merged_data)} 个对象")
+        print(f"  💾 输出文件:")
+        if output_type in ['markdown', 'both']:
+            print(f"    - {merged_md_path.name}")
+        if output_type in ['json', 'both']:
+            print(f"    - {merged_json_path.name}")
+
+        return True
+        
+    except Exception as e:
+        print(f"  ❌ 处理失败: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+def merge_dotsocr_batch(dotsocr_dir: str, paddle_dir: str, output_dir: str,
+                       output_type: str = 'both',
+                       look_ahead_window: int = 10, 
+                       similarity_threshold: int = 80):
+    """
+    批量合并 DotsOCR 和 PaddleOCR 的结果
+    
+    Args:
+        dotsocr_dir: DotsOCR 结果目录
+        paddle_dir: PaddleOCR 结果目录
+        output_dir: 输出目录
+        output_type: 输出格式
+        look_ahead_window: 向前查找窗口大小
+        similarity_threshold: 相似度阈值
+    """
+    dotsocr_path = Path(dotsocr_dir)
+    paddle_path = Path(paddle_dir)
+    output_path = Path(output_dir)
+    output_path.mkdir(parents=True, exist_ok=True)
+    
+    merger = DotsOCRMerger(look_ahead_window, similarity_threshold)
+    
+    # 查找所有 DotsOCR 的 JSON 文件
+    dotsocr_files = list(dotsocr_path.glob('*_page_*[0-9].json'))
+    dotsocr_files.sort()
+    
+    print(f"\n🔍 找到 {len(dotsocr_files)} 个 DotsOCR 文件")
+    print(f"📂 DotsOCR 目录: {dotsocr_dir}")
+    print(f"📂 PaddleOCR 目录: {paddle_dir}")
+    print(f"📂 输出目录: {output_dir}")
+    print(f"⚙️  查找窗口: {look_ahead_window}")
+    print(f"⚙️  相似度阈值: {similarity_threshold}%\n")
+    
+    success_count = 0
+    failed_count = 0
+    
+    for dotsocr_file in dotsocr_files:
+        # 查找对应的 PaddleOCR 文件
+        paddle_file = paddle_path / dotsocr_file.name
+        
+        if not paddle_file.exists():
+            print(f"⚠️  跳过: 未找到对应的 PaddleOCR 文件: {paddle_file.name}\n")
+            failed_count += 1
+            continue
+
+        if merge_single_file(dotsocr_file, paddle_file, output_path, output_type, merger):
+            success_count += 1
+        else:
+            failed_count += 1
+        
+        print()
+    
+    print("=" * 60)
+    print(f"✅ 处理完成!")
+    print(f"📊 统计信息:")
+    print(f"  - 总文件数: {len(dotsocr_files)}")
+    print(f"  - 成功: {success_count}")
+    print(f"  - 失败: {failed_count}")
+    print("=" * 60)
+
+
+def main():
+    """主函数"""
+    parser = argparse.ArgumentParser(
+        description='合并 DotsOCR 和 PaddleOCR 的识别结果,统一输出为MinerU格式',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+示例用法:
+
+  1. 批量处理整个目录:
+     python merge_dotsocr_paddleocr.py \\
+         --dotsocr-dir /path/to/dotsocr/results \\
+         --paddle-dir /path/to/paddle/results \\
+         --output-dir /path/to/output
+
+  2. 处理单个文件:
+     python merge_dotsocr_paddleocr.py \\
+         --dotsocr-file /path/to/file_page_001.json \\
+         --paddle-file /path/to/file_page_001.json \\
+         --output-dir /path/to/output
+
+  3. 自定义参数:
+     python merge_dotsocr_paddleocr.py \\
+         --dotsocr-dir /path/to/dotsocr \\
+         --paddle-dir /path/to/paddle \\
+         --output-dir /path/to/output \\
+         --window 15 \\
+         --threshold 85
+        
+输出格式说明:
+  - JSON: 统一的MinerU格式JSON文件
+  - Markdown: 基于MinerU格式生成的Markdown文件
+        """
+    )
+    
+    # 文件/目录参数
+    file_group = parser.add_argument_group('文件参数')
+    file_group.add_argument(
+        '--dotsocr-file', 
+        type=str,
+        help='DotsOCR 输出的 JSON 文件路径(单文件模式)'
+    )
+    file_group.add_argument(
+        '--paddle-file', 
+        type=str,
+        help='PaddleOCR 输出的 JSON 文件路径(单文件模式)'
+    )
+    
+    dir_group = parser.add_argument_group('目录参数')
+    dir_group.add_argument(
+        '--dotsocr-dir', 
+        type=str,
+        help='DotsOCR 结果目录(批量模式)'
+    )
+    dir_group.add_argument(
+        '--paddle-dir', 
+        type=str,
+        help='PaddleOCR 结果目录(批量模式)'
+    )
+    
+    # 输出参数
+    output_group = parser.add_argument_group('输出参数')
+    output_group.add_argument(
+        '-o', '--output-dir',
+        type=str,
+        required=True,
+        help='输出目录(必需)'
+    )
+    output_group.add_argument(
+        '-f', '--output-type', 
+        choices=['json', 'markdown', 'both'], 
+        default='both', 
+        help='输出格式'
+    )
+
+    # 算法参数
+    algo_group = parser.add_argument_group('算法参数')
+    algo_group.add_argument(
+        '-w', '--window',
+        type=int,
+        default=15,
+        help='向前查找的窗口大小(默认: 15)'
+    )
+    algo_group.add_argument(
+        '-t', '--threshold',
+        type=int,
+        default=80,
+        help='文本相似度阈值(0-100,默认: 80)'
+    )
+    
+    args = parser.parse_args()
+    output_type = args.output_type.lower()
+    
+    # 验证参数
+    if args.dotsocr_file and args.paddle_file:
+        # 单文件模式
+        dotsocr_file = Path(args.dotsocr_file)
+        paddle_file = Path(args.paddle_file)
+        output_dir = Path(args.output_dir)
+        
+        if not dotsocr_file.exists():
+            print(f"❌ 错误: DotsOCR 文件不存在: {dotsocr_file}")
+            return
+        
+        if not paddle_file.exists():
+            print(f"❌ 错误: PaddleOCR 文件不存在: {paddle_file}")
+            return
+        
+        output_dir.mkdir(parents=True, exist_ok=True)
+        
+        print("\n🔧 单文件处理模式")
+        print(f"📄 DotsOCR 文件: {dotsocr_file}")
+        print(f"📄 PaddleOCR 文件: {paddle_file}")
+        print(f"📂 输出目录: {output_dir}\n")
+        
+        merger = DotsOCRMerger(
+            look_ahead_window=args.window,
+            similarity_threshold=args.threshold
+        )
+        
+        success = merge_single_file(dotsocr_file, paddle_file, output_dir, output_type, merger)
+        
+        if success:
+            print("\n✅ 处理完成!")
+        else:
+            print("\n❌ 处理失败!")
+    
+    elif args.dotsocr_dir and args.paddle_dir:
+        # 批量模式
+        if not Path(args.dotsocr_dir).exists():
+            print(f"❌ 错误: DotsOCR 目录不存在: {args.dotsocr_dir}")
+            return
+        
+        if not Path(args.paddle_dir).exists():
+            print(f"❌ 错误: PaddleOCR 目录不存在: {args.paddle_dir}")
+            return
+        
+        print("\n🔧 批量处理模式")
+        
+        merge_dotsocr_batch(
+            args.dotsocr_dir,
+            args.paddle_dir,
+            args.output_dir,
+            output_type=output_type,
+            look_ahead_window=args.window,
+            similarity_threshold=args.threshold
+        )
+    
+    else:
+        parser.print_help()
+        print("\n❌ 错误: 请指定单文件模式或批量模式的参数")
+
+
+if __name__ == "__main__":
+    print("🚀 启动 DotsOCR + PaddleOCR 合并程序 (统一输出MinerU格式)...")
+    
+    import sys
+    
+    # With no command-line arguments, fall back to a built-in single-file run
+    if len(sys.argv) == 1:
+        # Default configuration
+        # NOTE(review): these are absolute paths on one developer machine, and
+        # the threshold here (85) differs from the argparse default (80).
+        default_config = {
+            "dotsocr-file": "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/dotsocr_vllm_results/A用户_单元格扫描流水_page_002.json",
+            "paddle-file": "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/ppstructurev3_client_results/A用户_单元格扫描流水_page_002.json",
+            "output-dir": "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/dotsocr_vllm_results_cell_bbox",
+            "output-type": "both",
+            "window": "15",
+            "threshold": "85"
+        }
+        
+        print("ℹ️  未提供命令行参数,使用默认配置运行...")
+        print("⚙️  默认参数:")
+        for key, value in default_config.items():
+            print(f"  --{key}: {value}")
+        
+        # Rebuild sys.argv as "--key value" pairs so main() parses the defaults
+        sys.argv = [sys.argv[0]]
+        for key, value in default_config.items():
+            sys.argv.extend([f"--{key}", str(value)])
+    
+    # The process exit status is whatever main() returns
+    sys.exit(main())

+ 4 - 0
streamlit_validator_core.py

@@ -81,6 +81,8 @@ class StreamlitOCRValidator:
                 print(f"✅ 切换到OCR数据源: {source_key}")
             else:
                 print(f"⚠️ 数据源 {source_key} 没有可用文件")
+        else:
+            raise FileNotFoundError(f"找不到文件路径: {source_key}")
     
     def switch_to_verify_source(self, source_key: str):
         """切换到指定验证数据源"""
@@ -96,6 +98,8 @@ class StreamlitOCRValidator:
                 print(f"✅ 切换到验证数据源: {source_key}")
             else:
                 print(f"⚠️ 验证数据源 {source_key} 没有可用文件")
+        else:
+            raise FileNotFoundError(f"找不到文件路径: {source_key}")
 
     def load_ocr_data(self, json_path: str, md_path: Optional[str] = None, image_path: Optional[str] = None):
         """加载OCR相关数据"""