Browse Source

feat: Add MinerU and PaddleOCR result merging functionality

- Implemented the main merging script `merge_mineru_paddle_ocr.py` to handle both single file and batch processing of MinerU and PaddleOCR results.
- Created core merging module `merger_core.py` to integrate various submodules for text matching, bounding box extraction, data processing, and Markdown generation.
- Developed `text_matcher.py` for text normalization and similarity calculations, utilizing fuzzy matching techniques.
- Enhanced error handling and user feedback during the merging process.
- Added command-line interface for flexible usage and configuration.
zhch158_admin 4 weeks ago
parent
commit
f137faec73

+ 17 - 0
merger/__init__.py

@@ -0,0 +1,17 @@
+"""
+MinerU + PaddleOCR 合并工具包
+"""
+
+from .merger_core import MinerUPaddleOCRMerger
+from .text_matcher import TextMatcher
+from .bbox_extractor import BBoxExtractor
+from .data_processor import DataProcessor
+from .markdown_generator import MarkdownGenerator
+
+__all__ = [
+    'MinerUPaddleOCRMerger',
+    'TextMatcher',
+    'BBoxExtractor',
+    'DataProcessor',
+    'MarkdownGenerator'
+]

+ 97 - 0
merger/bbox_extractor.py

@@ -0,0 +1,97 @@
+"""
+bbox 提取模块
+负责从 PaddleOCR 结果中提取文字框信息
+"""
+from typing import List, Dict
+
+
class BBoxExtractor:
    """Extract text-box geometry from PaddleOCR output.

    Static helpers that (a) flatten PaddleOCR recognition results into a
    list of text boxes and (b) recover bbox-annotated cells from merged
    table HTML.
    """

    @staticmethod
    def extract_paddle_text_boxes(paddle_data: Dict) -> List[Dict]:
        """Collect recognized text boxes from PaddleOCR output.

        Args:
            paddle_data: Raw PaddleOCR result dict.

        Returns:
            One dict per non-blank recognized text, carrying the text, its
            axis-aligned bbox, the source polygon, the recognition score,
            the index within the PaddleOCR result, and a ``used`` flag.
        """
        if 'overall_ocr_res' not in paddle_data:
            return []

        ocr_res = paddle_data['overall_ocr_res']
        triples = zip(ocr_res.get('rec_texts', []),
                      ocr_res.get('rec_polys', []),
                      ocr_res.get('rec_scores', []))

        boxes = []
        for idx, (txt, polygon, conf) in enumerate(triples):
            # Skip entries whose recognized text is empty or whitespace-only.
            if not (txt and txt.strip()):
                continue
            boxes.append({
                'text': txt,
                'bbox': BBoxExtractor._poly_to_bbox(polygon),
                'poly': polygon,
                'score': conf,
                'paddle_bbox_index': idx,
                'used': False,
            })
        return boxes

    @staticmethod
    def _poly_to_bbox(poly: List[List[float]]) -> List[float]:
        """Collapse a polygon into [x_min, y_min, x_max, y_max]."""
        x_coords = [pt[0] for pt in poly]
        y_coords = [pt[1] for pt in poly]
        return [min(x_coords), min(y_coords), max(x_coords), max(y_coords)]

    @staticmethod
    def extract_table_cells_with_bbox(merged_data: List[Dict]) -> List[Dict]:
        """Pull bbox-annotated cells back out of merged table HTML.

        Args:
            merged_data: Items produced by the merge step; only ``table``
                items are inspected, via their ``data-*`` cell attributes.

        Returns:
            One dict per cell that carries a parseable ``data-bbox``.
            NOTE(review): row/col here are 0-based, while the cells emitted
            by DataProcessor are 1-based — confirm which is intended.
        """
        import json
        from bs4 import BeautifulSoup

        collected = []
        for table_item in (it for it in merged_data if it['type'] == 'table'):
            markup = table_item.get('table_body_with_bbox',
                                    table_item.get('table_body', ''))
            parsed = BeautifulSoup(markup, 'html.parser')

            for r, tr in enumerate(parsed.find_all('tr')):
                for c, td in enumerate(tr.find_all(['td', 'th'])):
                    raw_bbox = td.get('data-bbox', '')
                    if not raw_bbox:
                        continue
                    try:
                        collected.append({
                            'text': td.get_text(strip=True),
                            'bbox': json.loads(raw_bbox),
                            'row': r,
                            'col': c,
                            'score': float(td.get('data-score', 0)),
                            'paddle_index': int(td.get('data-paddle-index', -1)),
                        })
                    except (json.JSONDecodeError, ValueError):
                        # Malformed attribute values: drop this cell silently.
                        pass

        return collected

+ 165 - 0
merger/data_processor.py

@@ -0,0 +1,165 @@
+"""
+数据处理模块
+负责处理 MinerU 数据,添加 bbox 信息
+"""
+from typing import List, Dict, Tuple
+from bs4 import BeautifulSoup
+
+try:
+    from .text_matcher import TextMatcher
+except ImportError:
+    from text_matcher import TextMatcher
+
+
class DataProcessor:
    """Merge MinerU layout items with PaddleOCR text-box coordinates.

    Walks MinerU items in reading order and, for each piece of text, asks
    the TextMatcher to locate the corresponding PaddleOCR box so its bbox
    can be attached (tables) or simply consumed (text / lists).
    """

    def __init__(self, text_matcher: TextMatcher, look_ahead_window: int = 10):
        """
        Args:
            text_matcher: Matcher used to align MinerU text with OCR boxes.
            look_ahead_window: How many OCR boxes around the current pointer
                may be scanned for a match.
        """
        self.text_matcher = text_matcher
        self.look_ahead_window = look_ahead_window

    def process_mineru_data(self, mineru_data: List[Dict],
                           paddle_text_boxes: List[Dict]) -> List[Dict]:
        """
        Attach PaddleOCR bbox information to MinerU items.

        Args:
            mineru_data: MinerU layout items. The list itself is NOT
                modified (items are processed in sorted reading order).
            paddle_text_boxes: OCR boxes from BBoxExtractor; their ``used``
                flags are mutated as matches are consumed.

        Returns:
            New list of (shallow-copied) items, tables enriched with bbox
            data in their HTML.
        """
        # Fix: sort a *copy* top-to-bottom then left-to-right instead of
        # sorting in place, which mutated the caller's list as a side
        # effect. Items without a bbox sort last.
        ordered = sorted(
            mineru_data,
            key=lambda x: (x['bbox'][1], x['bbox'][0])
            if 'bbox' in x else (float('inf'), float('inf'))
        )

        merged_data = []
        paddle_pointer = 0       # next unconsumed OCR box
        last_matched_index = 0   # index of the last successful match

        for item in ordered:
            item_type = item.get('type', '')

            if item_type == 'table':
                merged_item, paddle_pointer = self._process_table(
                    item, paddle_text_boxes, paddle_pointer
                )
                merged_data.append(merged_item)

            elif item_type in ['text', 'title']:
                merged_item, paddle_pointer, last_matched_index = self._process_text(
                    item, paddle_text_boxes, paddle_pointer, last_matched_index
                )
                merged_data.append(merged_item)

            elif item_type == 'list':
                merged_item, paddle_pointer, last_matched_index = self._process_list(
                    item, paddle_text_boxes, paddle_pointer, last_matched_index
                )
                merged_data.append(merged_item)

            else:
                # Unknown types pass through untouched (shallow copy).
                merged_data.append(item.copy())

        return merged_data

    def _process_table(self, item: Dict, paddle_text_boxes: List[Dict],
                      start_pointer: int) -> Tuple[Dict, int]:
        """Enhance a table item: annotate its HTML cells with bbox data."""
        merged_item = item.copy()
        table_html = item.get('table_body', '')

        enhanced_html, cells, new_pointer = self._enhance_table_html_with_bbox(
            table_html, paddle_text_boxes, start_pointer
        )

        merged_item['table_body'] = enhanced_html
        merged_item['table_body_with_bbox'] = enhanced_html
        merged_item['bbox_mapping'] = 'merged_from_paddle_ocr'
        merged_item['table_cells'] = cells if cells else []

        return merged_item, new_pointer

    def _process_text(self, item: Dict, paddle_text_boxes: List[Dict],
                     paddle_pointer: int, last_matched_index: int) -> Tuple[Dict, int, int]:
        """Match a text/title item against the OCR boxes.

        The item keeps MinerU's own bbox; the match only advances the
        pointers and marks the OCR box as consumed.
        """
        merged_item = item.copy()
        text = item.get('text', '')

        matched_bbox, paddle_pointer, last_matched_index = \
            self.text_matcher.find_matching_bbox(
                text, paddle_text_boxes, paddle_pointer, last_matched_index,
                self.look_ahead_window
            )

        if matched_bbox:
            matched_bbox['used'] = True

        return merged_item, paddle_pointer, last_matched_index

    def _process_list(self, item: Dict, paddle_text_boxes: List[Dict],
                     paddle_pointer: int, last_matched_index: int) -> Tuple[Dict, int, int]:
        """Match each list entry individually, consuming OCR boxes."""
        merged_item = item.copy()
        list_items = item.get('list_items', [])

        for list_item in list_items:
            matched_bbox, paddle_pointer, last_matched_index = \
                self.text_matcher.find_matching_bbox(
                    list_item, paddle_text_boxes, paddle_pointer, last_matched_index,
                    self.look_ahead_window
                )

            if matched_bbox:
                matched_bbox['used'] = True

        return merged_item, paddle_pointer, last_matched_index

    def _enhance_table_html_with_bbox(self, html: str, paddle_text_boxes: List[Dict],
                                      start_pointer: int) -> Tuple[str, List[Dict], int]:
        """Stamp matched OCR bbox data onto each non-empty table cell.

        Returns (enhanced HTML, cell records with 1-based row/col, new
        pointer position).
        """
        soup = BeautifulSoup(html, 'html.parser')
        current_pointer = start_pointer
        last_matched_index = start_pointer
        cells = []

        for row_idx, row in enumerate(soup.find_all('tr')):
            for col_idx, cell in enumerate(row.find_all(['td', 'th'])):
                cell_text = cell.get_text(strip=True)

                if not cell_text:
                    continue

                matched_bbox, current_pointer, last_matched_index = \
                    self.text_matcher.find_matching_bbox(
                        cell_text, paddle_text_boxes, current_pointer,
                        last_matched_index, self.look_ahead_window
                    )

                if matched_bbox:
                    # Expose the match as data-* attributes on the cell.
                    bbox = matched_bbox['bbox']
                    cell['data-bbox'] = f"[{bbox[0]},{bbox[1]},{bbox[2]},{bbox[3]}]"
                    cell['data-score'] = f"{matched_bbox['score']:.4f}"
                    cell['data-paddle-index'] = str(matched_bbox['paddle_bbox_index'])

                    cells.append({
                        'type': 'table_cell',
                        'text': cell_text,
                        'bbox': bbox,
                        'row': row_idx + 1,
                        'col': col_idx + 1,
                        'score': matched_bbox['score'],
                        'paddle_bbox_index': matched_bbox['paddle_bbox_index']
                    })

                    matched_bbox['used'] = True

        return str(soup), cells, current_pointer

+ 232 - 0
merger/markdown_generator.py

@@ -0,0 +1,232 @@
+"""
+Markdown 生成模块
+负责将合并后的数据生成 Markdown 文件
+"""
+import shutil
+from pathlib import Path
+from typing import List, Dict, Optional
+
+
class MarkdownGenerator:
    """Render merged MinerU/PaddleOCR items as Markdown.

    Each item type has a dedicated formatter; bbox coordinates are kept as
    HTML comments so downstream tools can recover positions from the
    generated Markdown.
    """

    @staticmethod
    def generate_enhanced_markdown(merged_data: List[Dict],
                                   output_path: Optional[str] = None,
                                   mineru_file: Optional[str] = None) -> str:
        """
        Build the enhanced Markdown (with bbox comments) for *merged_data*.

        Args:
            merged_data: Items produced by the merge step.
            output_path: If given, the Markdown is also written here (UTF-8).
            mineru_file: Path of the MinerU source file; used to resolve and
                copy referenced images next to *output_path*.

        Returns:
            The full Markdown document as a single string.
        """
        gen = MarkdownGenerator
        pieces: List[str] = []

        for entry in merged_data:
            kind = entry.get('type', '')

            if kind == 'image':
                # Images also need the paths so the file can be copied over.
                pieces.extend(gen._format_image(entry, output_path, mineru_file))
            elif kind in ('page_number', 'header', 'footer'):
                pieces.extend(gen._format_metadata(entry, kind))
            else:
                formatter = {
                    'title': gen._format_title,
                    'text': gen._format_text,
                    'list': gen._format_list,
                    'table': gen._format_table,
                    'equation': gen._format_equation,
                    'inline_equation': gen._format_inline_equation,
                    'reference': gen._format_reference,
                }.get(kind, gen._format_unknown)
                pieces.extend(formatter(entry))

        document = '\n'.join(pieces)

        if output_path:
            with open(output_path, 'w', encoding='utf-8') as fh:
                fh.write(document)

        return document

    @staticmethod
    def _add_bbox_comment(bbox: List) -> str:
        """Render a bbox as an HTML comment."""
        return f"<!-- bbox: {bbox} -->"

    @staticmethod
    def _bbox_prefix(item: Dict) -> List[str]:
        """Start an output chunk with the item's bbox comment, if any."""
        bbox = item.get('bbox', [])
        return [MarkdownGenerator._add_bbox_comment(bbox)] if bbox else []

    @staticmethod
    def _format_title(item: Dict) -> List[str]:
        """Render a title as a Markdown heading (capped at level 6)."""
        out = MarkdownGenerator._bbox_prefix(item)
        marks = '#' * min(item.get('text_level', 1), 6)
        out.append(f"{marks} {item.get('text', '')}\n")
        return out

    @staticmethod
    def _format_text(item: Dict) -> List[str]:
        """Render text; promoted to a heading when text_level > 0."""
        out = MarkdownGenerator._bbox_prefix(item)
        body = item.get('text', '')
        level = item.get('text_level', 0)
        if level > 0:
            out.append(f"{'#' * min(level, 6)} {body}\n")
        else:
            out.append(f"{body}\n")
        return out

    @staticmethod
    def _format_list(item: Dict) -> List[str]:
        """Render list entries verbatim, one per line, then a blank line."""
        out = MarkdownGenerator._bbox_prefix(item)
        out.extend(f"{entry}\n" for entry in item.get('list_items', []))
        out.append("")
        return out

    @staticmethod
    def _format_table(item: Dict) -> List[str]:
        """Render caption(s), the (bbox-enhanced) HTML body and footnotes."""
        out = MarkdownGenerator._bbox_prefix(item)

        out.extend(f"**{cap}**\n" for cap in item.get('table_caption', []) if cap)

        body = item.get('table_body_with_bbox', item.get('table_body', ''))
        if body:
            out.extend([body, ""])

        notes = item.get('table_footnote', [])
        out.extend(f"*{note}*" for note in notes if note)
        if notes:
            out.append("")

        return out

    @staticmethod
    def _format_image(item: Dict, output_path: Optional[str],
                     mineru_file: Optional[str]) -> List[str]:
        """Render an image reference; copies the file when paths allow."""
        out = MarkdownGenerator._bbox_prefix(item)
        src = item.get('img_path', '')

        # Copy only when both the image source and output location are known.
        if src and mineru_file and output_path:
            MarkdownGenerator._copy_image(src, mineru_file, output_path)

        out.extend(f"**{cap}**\n" for cap in item.get('image_caption', []) if cap)

        out.append(f"![Image]({src})\n")

        notes = item.get('image_footnote', [])
        out.extend(f"*{note}*" for note in notes if note)
        if notes:
            out.append("")

        return out

    @staticmethod
    def _copy_image(img_path: str, mineru_file: str, output_path: str):
        """Copy *img_path* (relative to the MinerU file) next to the output."""
        source = Path(mineru_file).parent / img_path
        if not source.exists():
            return
        target = Path(output_path).parent / img_path
        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(source, target)

    @staticmethod
    def _format_equation(item: Dict) -> List[str]:
        """Render a display equation as a $$ block."""
        latex = item.get('latex', '')
        return [f"$$\n{latex}\n$$\n"] if latex else []

    @staticmethod
    def _format_inline_equation(item: Dict) -> List[str]:
        """Render an inline equation wrapped in single dollars."""
        latex = item.get('latex', '')
        return [f"${latex}$\n"] if latex else []

    @staticmethod
    def _format_metadata(item: Dict, item_type: str) -> List[str]:
        """Render page number / header / footer as an HTML comment."""
        label = {
            'page_number': '页码',
            'header': '页眉',
            'footer': '页脚'
        }.get(item_type, item_type)
        text = item.get('text', '')
        return [f"<!-- {label}: {text} -->\n"] if text else []

    @staticmethod
    def _format_reference(item: Dict) -> List[str]:
        """Render a reference as a blockquote (even when text is empty)."""
        return [f"> {item.get('text', '')}\n"]

    @staticmethod
    def _format_unknown(item: Dict) -> List[str]:
        """Fallback for unrecognized types: emit the raw text if any."""
        text = item.get('text', '')
        return [f"{text}\n"] if text else []

+ 816 - 0
merger/merge_mineru_paddle_ocr.1.py

@@ -0,0 +1,816 @@
+"""
+合并 MinerU 和 PaddleOCR 的结果
+使用 MinerU 的表格结构识别 + PaddleOCR 的文字框坐标
+"""
+import json
+import re
+import argparse
+from pathlib import Path
+from typing import List, Dict, Tuple, Optional
+from bs4 import BeautifulSoup
+from fuzzywuzzy import fuzz
+import shutil
+
+class MinerUPaddleOCRMerger:
+    """合并 MinerU 和 PaddleOCR 的结果"""
+    
    def __init__(self, look_ahead_window: int = 10, similarity_threshold: int = 90):
        """
        Args:
            look_ahead_window: How far (in unused OCR boxes) to search around
                the current pointer when aligning a piece of text.
            similarity_threshold: Minimum fuzzy-match score (0-100) for a
                text/box pair to count as a match.
        """
        self.look_ahead_window = look_ahead_window
        self.similarity_threshold = similarity_threshold
+    
+    def merge_table_with_bbox(self, mineru_json_path: str, paddle_json_path: str) -> List[Dict]:
+        """
+        合并 MinerU 和 PaddleOCR 的结果
+        
+        Args:
+            mineru_json_path: MinerU 输出的 JSON 路径
+            paddle_json_path: PaddleOCR 输出的 JSON 路径
+            output_path: 输出路径(可选)
+        
+        Returns:
+            合并后的结果字典
+        """
+        merged_data = None
+        # 加载数据
+        with open(mineru_json_path, 'r', encoding='utf-8') as f:
+            mineru_data = json.load(f)
+        
+        with open(paddle_json_path, 'r', encoding='utf-8') as f:
+            paddle_data = json.load(f)
+        
+        # 提取 PaddleOCR 的文字框信息
+        paddle_text_boxes = self._extract_paddle_text_boxes(paddle_data)
+        
+        # 处理 MinerU 的数据
+        merged_data = self._process_mineru_data(mineru_data, paddle_text_boxes)
+        
+        return merged_data
+    
+    def _extract_paddle_text_boxes(self, paddle_data: Dict) -> List[Dict]:
+        """提取 PaddleOCR 的文字框信息"""
+        text_boxes = []
+        
+        if 'overall_ocr_res' in paddle_data:
+            ocr_res = paddle_data['overall_ocr_res']
+            rec_texts = ocr_res.get('rec_texts', [])
+            rec_polys = ocr_res.get('rec_polys', [])
+            rec_scores = ocr_res.get('rec_scores', [])
+
+            for i, (text, poly, score) in enumerate(zip(rec_texts, rec_polys, rec_scores)):
+                if text and text.strip():
+                    # 计算 bbox (x_min, y_min, x_max, y_max)
+                    xs = [p[0] for p in poly]
+                    ys = [p[1] for p in poly]
+                    bbox = [min(xs), min(ys), max(xs), max(ys)]
+                    
+                    text_boxes.append({
+                        'text': text,
+                        'bbox': bbox,
+                        'poly': poly,
+                        'score': score,
+                        'paddle_bbox_index': i,
+                        'used': False  # 标记是否已被使用
+                    })
+
+        return text_boxes
+    
    def _process_mineru_data(self, mineru_data: List[Dict],
                            paddle_text_boxes: List[Dict]) -> List[Dict]:
        """Walk MinerU items and consume matching PaddleOCR boxes.

        Tables get bbox-enhanced HTML; text/title/list items keep MinerU's
        own bbox but still advance the OCR pointer so later matches stay
        aligned. NOTE: sorts *mineru_data* in place.

        Args:
            mineru_data: MinerU layout items.
            paddle_text_boxes: OCR boxes; their ``used`` flags are mutated.

        Returns:
            New list of (shallow-copied) items in reading order.
        """

        merged_data = []
        cells = None  # most recent table's cell records (set per table below)
        paddle_pointer = 0  # next unconsumed PaddleOCR box
        last_matched_index = 0  # index of the last successful match

        # Sort by bbox top-to-bottom then left-to-right so the OCR pointer
        # can move forward monotonically; items without a bbox sort last.
        mineru_data.sort(key=lambda x: (x['bbox'][1], x['bbox'][0]) if 'bbox' in x else (float('inf'), float('inf')))

        for item in mineru_data:
            if item['type'] == 'table':
                # Tables: rewrite the HTML so each cell carries its bbox.
                merged_item = item.copy()
                table_html = item.get('table_body', '')

                enhanced_html, cells, paddle_pointer = self._enhance_table_html_with_bbox(
                    table_html, paddle_text_boxes, paddle_pointer
                )

                merged_item['table_body'] = enhanced_html
                merged_item['table_body_with_bbox'] = enhanced_html
                merged_item['bbox_mapping'] = 'merged_from_paddle_ocr'
                merged_item['table_cells'] = cells if cells else []

                merged_data.append(merged_item)

            elif item['type'] in ['text', 'title']:
                # Plain text / titles.
                merged_item = item.copy()
                text = item.get('text', '')

                # Find the matching OCR box (only to advance the pointers).
                matched_bbox, paddle_pointer, last_matched_index = self._find_matching_bbox(
                    text, paddle_text_boxes, paddle_pointer, last_matched_index
                )

                if matched_bbox:
                    # Keep MinerU's own bbox; the match only moves
                    # paddle_pointer / last_matched_index forward.
                    # Mark the OCR box as consumed.
                    matched_bbox['used'] = True

                merged_data.append(merged_item)
            elif item['type'] == 'list':
                # List items: match each entry individually.
                merged_item = item.copy()
                list_items = item.get('list_items', [])
                sub_type = item.get('sub_type', 'unordered')  # ordered/unordered (currently unused)

                for list_item in list_items:
                    # Find the matching OCR box for this entry.
                    matched_bbox, paddle_pointer, last_matched_index = self._find_matching_bbox(
                        list_item, paddle_text_boxes, paddle_pointer, last_matched_index
                    )

                    if matched_bbox:
                        # Keep MinerU's bbox; just mark the OCR box consumed.
                        matched_bbox['used'] = True

                merged_data.append(merged_item)
            else:
                # Any other type passes through as a shallow copy.
                merged_data.append(item.copy())

        return merged_data
+    
    def _enhance_table_html_with_bbox(self, html: str, paddle_text_boxes: List[Dict],
                                      start_pointer: int) -> Tuple[str, List[Dict], int]:
        """
        Annotate an HTML table's cells with PaddleOCR bbox data.

        Args:
            html: Original HTML table markup.
            paddle_text_boxes: PaddleOCR text-box list.
            start_pointer: Pointer position to resume matching from.

        Returns:
            (enhanced HTML, cell records with 1-based row/col, new pointer)
        """
        # Must also handle the case where MinerU reads two adjacent cells
        # (e.g. '-741.00' | '357,259.63') that Paddle fused into one string
        # ('-741.00357,259.63').
        soup = BeautifulSoup(html, 'html.parser')
        current_pointer = start_pointer
        last_matched_index = start_pointer
        cells = []  # bbox records, one per matched cell

        # Walk every row...
        for row_idx, row in enumerate(soup.find_all('tr')):
            # ...and every cell within it.
            for col_idx, cell in enumerate(row.find_all(['td', 'th'])):
                cell_text = cell.get_text(strip=True)

                if not cell_text:
                    continue

                # Locate the OCR box for this cell's text.
                matched_bbox, current_pointer, last_matched_index = self._find_matching_bbox(
                    cell_text, paddle_text_boxes, current_pointer, last_matched_index
                )

                if matched_bbox:
                    # Expose the match as data-* attributes on the cell.
                    bbox = matched_bbox['bbox']
                    cell['data-bbox'] = f"[{bbox[0]},{bbox[1]},{bbox[2]},{bbox[3]}]"
                    cell['data-score'] = f"{matched_bbox['score']:.4f}"
                    cell['data-paddle-index'] = str(matched_bbox['paddle_bbox_index'])

                    cells.append({
                        'type': 'table_cell',
                        'text': cell_text,
                        'bbox': bbox,
                        'row': row_idx+1,  # 1-based
                        'col': col_idx+1,  # 1-based
                        'score': matched_bbox['score'],
                        'paddle_bbox_index': matched_bbox['paddle_bbox_index']
                    })
                    # Consume the box.
                    matched_bbox['used'] = True

        return str(soup), cells, current_pointer
+    
    def _find_matching_bbox(self, target_text: str, text_boxes: List[Dict],
                           start_index: int, last_match_index: int) -> tuple[Optional[Dict], int, int]:
        """
        Find the OCR text box that matches *target_text*.

        Args:
            target_text: Text to locate.
            text_boxes: OCR box list (with 'used' flags).
            start_index: Scan pointer; one past the last used=True position.
            last_match_index: Index of the previous successful match — may be
                smaller than start_index.

        Returns:
            (matched box or None, new pointer position, last_match_index)
        """
        target_text = self._normalize_text(target_text)

        # Targets shorter than 2 chars are too ambiguous to match.
        if len(target_text) < 2:
            return None, start_index, last_match_index

        # MinerU and Paddle mostly agree on ordering but not always, so the
        # scan starts a little *before* last_match_index, covering up to
        # look_ahead_window still-unused boxes, and extends a window past
        # start_index so misordered/misread boxes are not missed.
        # Both engines can mis-segment, e.g. MinerU reads '扫二维码付款' as one
        # cell while Paddle fuses a row into '12024-08-11扫二维码付'.
        # Character order is almost always preserved, so when one string is a
        # substring of the other the similarity score gets a bonus below.

        search_start = last_match_index - 1
        unused_count = 0
        while search_start >= 0:
            if text_boxes[search_start]['used'] == False:
                unused_count += 1
            if unused_count >= self.look_ahead_window:
                break
            search_start -= 1
        if search_start < 0:
            # Fell off the front: restart at the first unused box instead.
            search_start = 0
            while search_start < start_index and text_boxes[search_start]['used']:
                search_start += 1
        search_end = min(start_index + self.look_ahead_window, len(text_boxes))

        best_match = None
        best_index = start_index

        for i in range(search_start, search_end):
            if text_boxes[i]['used']:
                continue

            box_text = self._normalize_text(text_boxes[i]['text'])
            # Exact match wins immediately.
            if target_text == box_text:
                if i >= start_index:
                    return text_boxes[i], i + 1, i
                else:
                    return text_boxes[i], start_index, i

            # Skip near-empty candidates (avoid single-character matches).
            if len(box_text) < 2:
                continue

            # Length-ratio gate — reject pairs whose lengths differ too much.
            length_ratio = min(len(target_text), len(box_text)) / max(len(target_text), len(box_text))
            if length_ratio < 0.3:  # more than 70% length difference
                continue

            # Substring containment check (see segmentation note above).
            shorter = target_text if len(target_text) < len(box_text) else box_text
            longer = box_text if len(target_text) < len(box_text) else target_text
            is_substring = shorter in longer

            # Fuzzy similarity; containment adds a fixed bonus.
            # token_sort_ratio = fuzz.token_sort_ratio(target_text, box_text)
            partial_ratio = fuzz.partial_ratio(target_text, box_text)
            if is_substring:
                partial_ratio += 10  # boost when one side contains the other

            # Accept once the (boosted) partial ratio clears the threshold.
            if (partial_ratio >= self.similarity_threshold):
                if i >= start_index:
                    return text_boxes[i], i + 1, last_match_index
                else:
                    return text_boxes[i], start_index, last_match_index

        return best_match, best_index, last_match_index
+
+    def _normalize_text(self, text: str) -> str:
+        """标准化文本(去除空格、标点等)"""
+        # 移除所有空白字符
+        text = re.sub(r'\s+', '', text)
+        # 转换全角数字和字母为半角
+        text = self._full_to_half(text)
+        return text.lower()
+    
+    def _full_to_half(self, text: str) -> str:
+        """全角转半角"""
+        result = []
+        for char in text:
+            code = ord(char)
+            if code == 0x3000:  # 全角空格
+                code = 0x0020
+            elif 0xFF01 <= code <= 0xFF5E:  # 全角字符
+                code -= 0xFEE0
+            result.append(chr(code))
+        return ''.join(result)
+    
+    def generate_enhanced_markdown(self, merged_data: List[Dict], 
+                                   output_path: Optional[str] = None, mineru_file: Optional[str] = None) -> str:
+        """
+        生成增强的 Markdown(包含 bbox 信息的注释)
+        参考 MinerU 的实现,支持标题、列表、表格标题等
+        
+        Args:
+            merged_data: 合并后的数据
+            output_path: 输出路径(可选)
+            mineru_file: MinerU 源文件路径(用于复制图片)
+        
+        Returns:
+            Markdown 内容
+        """
+        md_lines = []
+        
+        for item in merged_data:
+            item_type = item.get('type', '')
+            bbox = item.get('bbox', [])
+            
+            # 添加 bbox 注释
+            if bbox:
+                md_lines.append(f"<!-- bbox: {bbox} -->")
+            
+            # 根据类型处理
+            if item_type == 'title':
+                # 标题 - 使用 text_level 确定标题级别
+                text = item.get('text', '')
+                text_level = item.get('text_level', 1)
+                heading = '#' * min(text_level, 6)  # 最多6级标题
+                md_lines.append(f"{heading} {text}\n")
+            
+            elif item_type == 'text':
+                # 普通文本 - 可能也有 text_level
+                text = item.get('text', '')
+                text_level = item.get('text_level', 0)
+                
+                if text_level > 0:
+                    # 作为标题处理
+                    heading = '#' * min(text_level, 6)
+                    md_lines.append(f"{heading} {text}\n")
+                else:
+                    # 普通段落
+                    md_lines.append(f"{text}\n")
+            
+            elif item_type == 'list':
+                # 列表
+                sub_type = item.get('sub_type', 'text')
+                list_items = item.get('list_items', [])
+                
+                for list_item in list_items:
+                    md_lines.append(f"{list_item}\n")
+                
+                md_lines.append("")  # 列表后添加空行
+            
+            elif item_type == 'table':
+                # 表格标题
+                table_caption = item.get('table_caption', [])
+                if table_caption:
+                    for caption in table_caption:
+                        if caption:  # 跳过空标题
+                            md_lines.append(f"**{caption}**\n")
+                
+                # 表格内容
+                table_body = item.get('table_body_with_bbox', item.get('table_body', ''))
+                if table_body:
+                    md_lines.append(table_body)
+                    md_lines.append("")
+                
+                # 表格脚注
+                table_footnote = item.get('table_footnote', [])
+                if table_footnote:
+                    for footnote in table_footnote:
+                        if footnote:
+                            md_lines.append(f"*{footnote}*")
+                    md_lines.append("")
+            
+            elif item_type == 'image':
+                # 图片
+                img_path = item.get('img_path', '')
+                
+                # 复制图片到输出目录
+                if img_path and mineru_file and output_path:
+                    mineru_dir = Path(mineru_file).parent
+                    img_full_path = mineru_dir / img_path
+                    if img_full_path.exists():
+                        output_img_path = Path(output_path).parent / img_path
+                        output_img_path.parent.mkdir(parents=True, exist_ok=True)
+                        shutil.copy(img_full_path, output_img_path)
+                
+                # 图片标题
+                image_caption = item.get('image_caption', [])
+                if image_caption:
+                    for caption in image_caption:
+                        if caption:
+                            md_lines.append(f"**{caption}**\n")
+                
+                # 插入图片
+                md_lines.append(f"![Image]({img_path})\n")
+                
+                # 图片脚注
+                image_footnote = item.get('image_footnote', [])
+                if image_footnote:
+                    for footnote in image_footnote:
+                        if footnote:
+                            md_lines.append(f"*{footnote}*")
+                    md_lines.append("")
+            
+            elif item_type == 'equation':
+                # 公式
+                latex = item.get('latex', '')
+                if latex:
+                    md_lines.append(f"$$\n{latex}\n$$\n")
+            
+            elif item_type == 'inline_equation':
+                # 行内公式
+                latex = item.get('latex', '')
+                if latex:
+                    md_lines.append(f"${latex}$\n")
+            
+            elif item_type == 'page_number':
+                # 页码 - 通常跳过或作为注释
+                text = item.get('text', '')
+                md_lines.append(f"<!-- 页码: {text} -->\n")
+            
+            elif item_type == 'header':
+                # 页眉
+                text = item.get('text', '')
+                md_lines.append(f"<!-- 页眉: {text} -->\n")
+            
+            elif item_type == 'footer':
+                # 页脚
+                text = item.get('text', '')
+                if text:
+                    md_lines.append(f"<!-- 页脚: {text} -->\n")
+            
+            elif item_type == 'reference':
+                # 参考文献
+                text = item.get('text', '')
+                md_lines.append(f"> {text}\n")
+            
+            else:
+                # 未知类型 - 尝试提取文本
+                text = item.get('text', '')
+                if text:
+                    md_lines.append(f"{text}\n")
+        
+        markdown_content = '\n'.join(md_lines)
+        
+        # 保存文件
+        if output_path:
+            with open(output_path, 'w', encoding='utf-8') as f:
+                f.write(markdown_content)
+        
+        return markdown_content
+
+    def extract_table_cells_with_bbox(self, merged_data: List[Dict]) -> List[Dict]:
+        """
+        提取所有表格单元格及其 bbox 信息
+        
+        Returns:
+            单元格列表,每个包含 text, bbox, row, col 等信息
+        """
+        cells = []
+        
+        for item in merged_data:
+            if item['type'] != 'table':
+                continue
+            
+            html = item.get('table_body_with_bbox', item.get('table_body', ''))
+            soup = BeautifulSoup(html, 'html.parser')
+            
+            # 遍历所有行
+            for row_idx, row in enumerate(soup.find_all('tr')):
+                # 遍历所有单元格
+                for col_idx, cell in enumerate(row.find_all(['td', 'th'])):
+                    cell_text = cell.get_text(strip=True)
+                    bbox_str = cell.get('data-bbox', '')
+                    
+                    if bbox_str:
+                        try:
+                            bbox = json.loads(bbox_str)
+                            cells.append({
+                                'text': cell_text,
+                                'bbox': bbox,
+                                'row': row_idx,
+                                'col': col_idx,
+                                'score': float(cell.get('data-score', 0)),
+                                'paddle_index': int(cell.get('data-paddle-index', -1))
+                            })
+                        except (json.JSONDecodeError, ValueError):
+                            pass
+        
+        return cells
+
+
+def merge_single_file(mineru_file: Path, paddle_file: Path, output_dir: Path, 
+                     output_format: str, merger: MinerUPaddleOCRMerger) -> bool:
+    """
+    合并单个文件
+    
+    Args:
+        mineru_file: MinerU JSON 文件路径
+        paddle_file: PaddleOCR JSON 文件路径
+        output_dir: 输出目录
+        merger: 合并器实例
+    
+    Returns:
+        是否成功
+    """
+    print(f"📄 处理: {mineru_file.name}")
+    
+    # 输出文件路径
+    merged_md_path = output_dir / f"{mineru_file.stem}.md"
+    merged_json_path = output_dir / f"{mineru_file.stem}.json"
+    
+    try:
+        # 合并数据
+        merged_data = merger.merge_table_with_bbox(
+            str(mineru_file),
+            str(paddle_file)
+        )
+        
+        # 生成 Markdown
+        if output_format in ['markdown', 'both']:
+            merger.generate_enhanced_markdown(merged_data, str(merged_md_path), mineru_file)
+        
+        # 提取单元格信息
+        # cells = merger.extract_table_cells_with_bbox(merged_data)
+        if output_format in ['json', 'both']:
+            with open(merged_json_path, 'w', encoding='utf-8') as f:
+                json.dump(merged_data, f, ensure_ascii=False, indent=2)
+
+        print(f"  ✅ 合并完成")
+        print(f"  📊 共处理了 {len(merged_data)} 个对象")
+        print(f"  💾 输出文件:")
+        if output_format in ['markdown', 'both']:
+            print(f"    - {merged_md_path.name}")
+        if output_format in ['json', 'both']:
+            print(f"    - {merged_json_path.name}")
+
+        return True
+        
+    except Exception as e:
+        print(f"  ❌ 处理失败: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+def merge_mineru_paddle_batch(mineru_dir: str, paddle_dir: str, output_dir: str, output_format: str = 'both',
+                              look_ahead_window: int = 10, 
+                              similarity_threshold: int = 80):
+    """
+    批量合并 MinerU 和 PaddleOCR 的结果
+    
+    Args:
+        mineru_dir: MinerU 结果目录
+        paddle_dir: PaddleOCR 结果目录
+        output_dir: 输出目录
+        look_ahead_window: 向前查找窗口大小
+        similarity_threshold: 相似度阈值
+    """
+    mineru_path = Path(mineru_dir)
+    paddle_path = Path(paddle_dir)
+    output_path = Path(output_dir)
+    output_path.mkdir(parents=True, exist_ok=True)
+    
+    merger = MinerUPaddleOCRMerger(
+        look_ahead_window=look_ahead_window, 
+        similarity_threshold=similarity_threshold
+    )
+    
+    # 查找所有 MinerU 的 JSON 文件
+    mineru_files = list(mineru_path.glob('*_page_*[0-9].json'))
+    mineru_files.sort()
+    
+    print(f"\n🔍 找到 {len(mineru_files)} 个 MinerU 文件")
+    print(f"📂 MinerU 目录: {mineru_dir}")
+    print(f"📂 PaddleOCR 目录: {paddle_dir}")
+    print(f"📂 输出目录: {output_dir}")
+    print(f"⚙️  查找窗口: {look_ahead_window}")
+    print(f"⚙️  相似度阈值: {similarity_threshold}%\n")
+    
+    success_count = 0
+    failed_count = 0
+    
+    for mineru_file in mineru_files:
+        # 查找对应的 PaddleOCR 文件
+        paddle_file = paddle_path / mineru_file.name
+        
+        if not paddle_file.exists():
+            print(f"⚠️  跳过: 未找到对应的 PaddleOCR 文件: {paddle_file.name}\n")
+            failed_count += 1
+            continue
+
+        if merge_single_file(mineru_file, paddle_file, output_path, output_format, merger):
+            success_count += 1
+        else:
+            failed_count += 1
+        
+        print()  # 空行分隔
+    
+    # 打印统计信息
+    print("=" * 60)
+    print(f"✅ 处理完成!")
+    print(f"📊 统计信息:")
+    print(f"  - 总文件数: {len(mineru_files)}")
+    print(f"  - 成功: {success_count}")
+    print(f"  - 失败: {failed_count}")
+    print("=" * 60)
+
+
+def main():
+    """主函数"""
+    parser = argparse.ArgumentParser(
+        description='合并 MinerU 和 PaddleOCR 的识别结果,添加 bbox 坐标信息',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+示例用法:
+
+  1. 批量处理整个目录:
+     python merge_mineru_paddle_ocr.py \\
+         --mineru-dir /path/to/mineru/results \\
+         --paddle-dir /path/to/paddle/results \\
+         --output-dir /path/to/output
+
+  2. 处理单个文件:
+     python merge_mineru_paddle_ocr.py \\
+         --mineru-file /path/to/file_page_001.json \\
+         --paddle-file /path/to/file_page_001.json \\
+         --output-dir /path/to/output
+
+  3. 自定义参数:
+     python merge_mineru_paddle_ocr.py \\
+         --mineru-dir /path/to/mineru \\
+         --paddle-dir /path/to/paddle \\
+         --output-dir /path/to/output \\
+         --window 15 \\
+         --threshold 85
+        """
+    )
+    
+    # 文件/目录参数
+    file_group = parser.add_argument_group('文件参数')
+    file_group.add_argument(
+        '--mineru-file', 
+        type=str,
+        help='MinerU 输出的 JSON 文件路径(单文件模式)'
+    )
+    file_group.add_argument(
+        '--paddle-file', 
+        type=str,
+        help='PaddleOCR 输出的 JSON 文件路径(单文件模式)'
+    )
+    
+    dir_group = parser.add_argument_group('目录参数')
+    dir_group.add_argument(
+        '--mineru-dir', 
+        type=str,
+        help='MinerU 结果目录(批量模式)'
+    )
+    dir_group.add_argument(
+        '--paddle-dir', 
+        type=str,
+        help='PaddleOCR 结果目录(批量模式)'
+    )
+    
+    # 输出参数
+    output_group = parser.add_argument_group('输出参数')
+    output_group.add_argument(
+        '-o', '--output-dir',
+        type=str,
+        required=True,
+        help='输出目录(必需)'
+    )
+    output_group.add_argument(
+        '-f', '--format', 
+        choices=['json', 'markdown', 'both'], 
+        default='both', help='输出格式'
+    )
+
+    # 算法参数
+    algo_group = parser.add_argument_group('算法参数')
+    algo_group.add_argument(
+        '-w', '--window',
+        type=int,
+        default=15,
+        help='向前查找的窗口大小(默认: 10)'
+    )
+    algo_group.add_argument(
+        '-t', '--threshold',
+        type=int,
+        default=80,
+        help='文本相似度阈值(0-100,默认: 80)'
+    )
+    
+    args = parser.parse_args()
+    output_format = args.format.lower()
+    
+    # 验证参数
+    if args.mineru_file and args.paddle_file:
+        # 单文件模式
+        mineru_file = Path(args.mineru_file)
+        paddle_file = Path(args.paddle_file)
+        output_dir = Path(args.output_dir)
+        
+        if not mineru_file.exists():
+            print(f"❌ 错误: MinerU 文件不存在: {mineru_file}")
+            return
+        
+        if not paddle_file.exists():
+            print(f"❌ 错误: PaddleOCR 文件不存在: {paddle_file}")
+            return
+        
+        output_dir.mkdir(parents=True, exist_ok=True)
+        
+        print("\n🔧 单文件处理模式")
+        print(f"📄 MinerU 文件: {mineru_file}")
+        print(f"📄 PaddleOCR 文件: {paddle_file}")
+        print(f"📂 输出目录: {output_dir}")
+        print(f"⚙️  查找窗口: {args.window}")
+        print(f"⚙️  相似度阈值: {args.threshold}%\n")
+        
+        merger = MinerUPaddleOCRMerger(
+            look_ahead_window=args.window,
+            similarity_threshold=args.threshold
+        )
+        
+        success = merge_single_file(mineru_file, paddle_file, output_dir, output_format, merger)
+        
+        if success:
+            print("\n✅ 处理完成!")
+        else:
+            print("\n❌ 处理失败!")
+    
+    elif args.mineru_dir and args.paddle_dir:
+        # 批量模式
+        if not Path(args.mineru_dir).exists():
+            print(f"❌ 错误: MinerU 目录不存在: {args.mineru_dir}")
+            return
+        
+        if not Path(args.paddle_dir).exists():
+            print(f"❌ 错误: PaddleOCR 目录不存在: {args.paddle_dir}")
+            return
+        
+        print("\n🔧 批量处理模式")
+        
+        merge_mineru_paddle_batch(
+            args.mineru_dir,
+            args.paddle_dir,
+            args.output_dir,
+            output_format=output_format,
+            look_ahead_window=args.window,
+            similarity_threshold=args.threshold
+        )
+    
+    else:
+        parser.print_help()
+        print("\n❌ 错误: 请指定单文件模式或批量模式的参数")
+        print("  单文件模式: --mineru-file 和 --paddle-file")
+        print("  批量模式: --mineru-dir 和 --paddle-dir")
+
+if __name__ == "__main__":
+    print("🚀 启动 MinerU + PaddleOCR 合并程序...")
+    
+    import sys
+    
+    if len(sys.argv) == 1:
+        # 如果没有命令行参数,使用默认配置运行
+        print("ℹ️  未提供命令行参数,使用默认配置运行...")
+        
+        # 默认配置
+        default_config = {
+            "mineru-file": "/Users/zhch158/workspace/data/流水分析/对公_招商银行图/mineru-vlm-2.5.3_Results/对公_招商银行图_page_001.json",
+            "paddle-file": "/Users/zhch158/workspace/data/流水分析/对公_招商银行图/data_PPStructureV3_Results/对公_招商银行图_page_001.json",
+            "output-dir": "/Users/zhch158/workspace/data/流水分析/对公_招商银行图/merged_results",
+            # "mineru-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/mineru-vlm-2.5.3_Results",
+            # "paddle-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/data_PPStructureV3_Results",
+            # "output-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/merged_results",
+            "format": "both",
+            "window": "15",
+            "threshold": "85"
+        }
+        
+        print("⚙️  默认参数:")
+        for key, value in default_config.items():
+            print(f"  --{key}: {value}")
+        # 构造参数
+        sys.argv = [sys.argv[0]]
+        for key, value in default_config.items():
+            sys.argv.extend([f"--{key}", str(value)])
+    
+    sys.exit(main())

+ 309 - 0
merger/merge_mineru_paddle_ocr.py

@@ -0,0 +1,309 @@
+"""
+合并 MinerU 和 PaddleOCR 的结果
+主程序入口
+"""
+import json
+import argparse
+from pathlib import Path
+
+try:
+    from .merger_core import MinerUPaddleOCRMerger
+except ImportError:
+    from merger_core import MinerUPaddleOCRMerger
+
+
+def merge_single_file(mineru_file: Path, paddle_file: Path, output_dir: Path, 
+                     output_format: str, merger: MinerUPaddleOCRMerger) -> bool:
+    """
+    合并单个文件
+    
+    Args:
+        mineru_file: MinerU JSON 文件路径
+        paddle_file: PaddleOCR JSON 文件路径
+        output_dir: 输出目录
+        merger: 合并器实例
+    
+    Returns:
+        是否成功
+    """
+    print(f"📄 处理: {mineru_file.name}")
+    
+    # 输出文件路径
+    merged_md_path = output_dir / f"{mineru_file.stem}.md"
+    merged_json_path = output_dir / f"{mineru_file.stem}.json"
+    
+    try:
+        # 合并数据
+        merged_data = merger.merge_table_with_bbox(
+            str(mineru_file),
+            str(paddle_file)
+        )
+        
+        # 生成 Markdown
+        if output_format in ['markdown', 'both']:
+            merger.generate_enhanced_markdown(merged_data, str(merged_md_path), mineru_file)
+        
+        # 保存 JSON
+        if output_format in ['json', 'both']:
+            with open(merged_json_path, 'w', encoding='utf-8') as f:
+                json.dump(merged_data, f, ensure_ascii=False, indent=2)
+
+        print(f"  ✅ 合并完成")
+        print(f"  📊 共处理了 {len(merged_data)} 个对象")
+        print(f"  💾 输出文件:")
+        if output_format in ['markdown', 'both']:
+            print(f"    - {merged_md_path.name}")
+        if output_format in ['json', 'both']:
+            print(f"    - {merged_json_path.name}")
+
+        return True
+        
+    except Exception as e:
+        print(f"  ❌ 处理失败: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+def merge_mineru_paddle_batch(mineru_dir: str, paddle_dir: str, output_dir: str,
+                              output_format: str = 'both',
+                              look_ahead_window: int = 10, 
+                              similarity_threshold: int = 80):
+    """
+    批量合并 MinerU 和 PaddleOCR 的结果
+    
+    Args:
+        mineru_dir: MinerU 结果目录
+        paddle_dir: PaddleOCR 结果目录
+        output_dir: 输出目录
+        look_ahead_window: 向前查找窗口大小
+        similarity_threshold: 相似度阈值
+    """
+    mineru_path = Path(mineru_dir)
+    paddle_path = Path(paddle_dir)
+    output_path = Path(output_dir)
+    output_path.mkdir(parents=True, exist_ok=True)
+    
+    merger = MinerUPaddleOCRMerger(look_ahead_window, similarity_threshold)
+    
+    # 查找所有 MinerU 的 JSON 文件
+    mineru_files = list(mineru_path.glob('*_page_*[0-9].json'))
+    mineru_files.sort()
+    
+    print(f"\n🔍 找到 {len(mineru_files)} 个 MinerU 文件")
+    print(f"📂 MinerU 目录: {mineru_dir}")
+    print(f"📂 PaddleOCR 目录: {paddle_dir}")
+    print(f"📂 输出目录: {output_dir}")
+    print(f"⚙️  查找窗口: {look_ahead_window}")
+    print(f"⚙️  相似度阈值: {similarity_threshold}%\n")
+    
+    success_count = 0
+    failed_count = 0
+    
+    for mineru_file in mineru_files:
+        # 查找对应的 PaddleOCR 文件
+        paddle_file = paddle_path / mineru_file.name
+        
+        if not paddle_file.exists():
+            print(f"⚠️  跳过: 未找到对应的 PaddleOCR 文件: {paddle_file.name}\n")
+            failed_count += 1
+            continue
+
+        if merge_single_file(mineru_file, paddle_file, output_path, output_format, merger):
+            success_count += 1
+        else:
+            failed_count += 1
+        
+        print()  # 空行分隔
+    
+    # 打印统计信息
+    print("=" * 60)
+    print(f"✅ 处理完成!")
+    print(f"📊 统计信息:")
+    print(f"  - 总文件数: {len(mineru_files)}")
+    print(f"  - 成功: {success_count}")
+    print(f"  - 失败: {failed_count}")
+    print("=" * 60)
+
+
+def main():
+    """主函数"""
+    parser = argparse.ArgumentParser(
+        description='合并 MinerU 和 PaddleOCR 的识别结果,添加 bbox 坐标信息',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+示例用法:
+
+  1. 批量处理整个目录:
+     python merge_mineru_paddle_ocr.py \\
+         --mineru-dir /path/to/mineru/results \\
+         --paddle-dir /path/to/paddle/results \\
+         --output-dir /path/to/output
+
+  2. 处理单个文件:
+     python merge_mineru_paddle_ocr.py \\
+         --mineru-file /path/to/file_page_001.json \\
+         --paddle-file /path/to/file_page_001.json \\
+         --output-dir /path/to/output
+
+  3. 自定义参数:
+     python merge_mineru_paddle_ocr.py \\
+         --mineru-dir /path/to/mineru \\
+         --paddle-dir /path/to/paddle \\
+         --output-dir /path/to/output \\
+         --window 15 \\
+         --threshold 85
+        """
+    )
+    
+    # 文件/目录参数
+    file_group = parser.add_argument_group('文件参数')
+    file_group.add_argument(
+        '--mineru-file', 
+        type=str,
+        help='MinerU 输出的 JSON 文件路径(单文件模式)'
+    )
+    file_group.add_argument(
+        '--paddle-file', 
+        type=str,
+        help='PaddleOCR 输出的 JSON 文件路径(单文件模式)'
+    )
+    
+    dir_group = parser.add_argument_group('目录参数')
+    dir_group.add_argument(
+        '--mineru-dir', 
+        type=str,
+        help='MinerU 结果目录(批量模式)'
+    )
+    dir_group.add_argument(
+        '--paddle-dir', 
+        type=str,
+        help='PaddleOCR 结果目录(批量模式)'
+    )
+    
+    # 输出参数
+    output_group = parser.add_argument_group('输出参数')
+    output_group.add_argument(
+        '-o', '--output-dir',
+        type=str,
+        required=True,
+        help='输出目录(必需)'
+    )
+    output_group.add_argument(
+        '-f', '--format', 
+        choices=['json', 'markdown', 'both'], 
+        default='both', help='输出格式'
+    )
+
+    # 算法参数
+    algo_group = parser.add_argument_group('算法参数')
+    algo_group.add_argument(
+        '-w', '--window',
+        type=int,
+        default=15,
+        help='向前查找的窗口大小(默认: 10)'
+    )
+    algo_group.add_argument(
+        '-t', '--threshold',
+        type=int,
+        default=80,
+        help='文本相似度阈值(0-100,默认: 80)'
+    )
+    
+    args = parser.parse_args()
+    output_format = args.format.lower()
+    
+    # 验证参数
+    if args.mineru_file and args.paddle_file:
+        # 单文件模式
+        mineru_file = Path(args.mineru_file)
+        paddle_file = Path(args.paddle_file)
+        output_dir = Path(args.output_dir)
+        
+        if not mineru_file.exists():
+            print(f"❌ 错误: MinerU 文件不存在: {mineru_file}")
+            return
+        
+        if not paddle_file.exists():
+            print(f"❌ 错误: PaddleOCR 文件不存在: {paddle_file}")
+            return
+        
+        output_dir.mkdir(parents=True, exist_ok=True)
+        
+        print("\n🔧 单文件处理模式")
+        print(f"📄 MinerU 文件: {mineru_file}")
+        print(f"📄 PaddleOCR 文件: {paddle_file}")
+        print(f"📂 输出目录: {output_dir}")
+        print(f"⚙️  查找窗口: {args.window}")
+        print(f"⚙️  相似度阈值: {args.threshold}%\n")
+        
+        merger = MinerUPaddleOCRMerger(
+            look_ahead_window=args.window,
+            similarity_threshold=args.threshold
+        )
+        
+        success = merge_single_file(mineru_file, paddle_file, output_dir, output_format, merger)
+        
+        if success:
+            print("\n✅ 处理完成!")
+        else:
+            print("\n❌ 处理失败!")
+    
+    elif args.mineru_dir and args.paddle_dir:
+        # 批量模式
+        if not Path(args.mineru_dir).exists():
+            print(f"❌ 错误: MinerU 目录不存在: {args.mineru_dir}")
+            return
+        
+        if not Path(args.paddle_dir).exists():
+            print(f"❌ 错误: PaddleOCR 目录不存在: {args.paddle_dir}")
+            return
+        
+        print("\n🔧 批量处理模式")
+        
+        merge_mineru_paddle_batch(
+            args.mineru_dir,
+            args.paddle_dir,
+            args.output_dir,
+            output_format=output_format,
+            look_ahead_window=args.window,
+            similarity_threshold=args.threshold
+        )
+    
+    else:
+        parser.print_help()
+        print("\n❌ 错误: 请指定单文件模式或批量模式的参数")
+        print("  单文件模式: --mineru-file 和 --paddle-file")
+        print("  批量模式: --mineru-dir 和 --paddle-dir")
+
+if __name__ == "__main__":
+    print("🚀 启动 MinerU + PaddleOCR 合并程序...")
+    
+    import sys
+    
+    if len(sys.argv) == 1:
+        # 如果没有命令行参数,使用默认配置运行
+        print("ℹ️  未提供命令行参数,使用默认配置运行...")
+        
+        # 默认配置
+        default_config = {
+            "mineru-file": "/Users/zhch158/workspace/data/流水分析/对公_招商银行图/mineru-vlm-2.5.3_Results/对公_招商银行图_page_001.json",
+            "paddle-file": "/Users/zhch158/workspace/data/流水分析/对公_招商银行图/data_PPStructureV3_Results/对公_招商银行图_page_001.json",
+            "output-dir": "/Users/zhch158/workspace/data/流水分析/对公_招商银行图/merged_results",
+            # "mineru-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/mineru-vlm-2.5.3_Results",
+            # "paddle-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/data_PPStructureV3_Results",
+            # "output-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/merged_results",
+            "format": "both",
+            "window": "15",
+            "threshold": "85"
+        }
+        
+        print("⚙️  默认参数:")
+        for key, value in default_config.items():
+            print(f"  --{key}: {value}")
+        # 构造参数
+        sys.argv = [sys.argv[0]]
+        for key, value in default_config.items():
+            sys.argv.extend([f"--{key}", str(value)])
+    
+    sys.exit(main())

+ 77 - 0
merger/merger_core.py

@@ -0,0 +1,77 @@
+"""
+核心合并模块
+整合各个子模块,提供统一的合并接口
+"""
+import json
+from typing import Dict, List, Optional
+
+try:
+    from .text_matcher import TextMatcher
+    from .bbox_extractor import BBoxExtractor
+    from .data_processor import DataProcessor
+    from .markdown_generator import MarkdownGenerator
+except ImportError:
+    from text_matcher import TextMatcher
+    from bbox_extractor import BBoxExtractor
+    from data_processor import DataProcessor
+    from markdown_generator import MarkdownGenerator
+
+
+class MinerUPaddleOCRMerger:
+    """MinerU 和 PaddleOCR 结果合并器"""
+    
+    def __init__(self, look_ahead_window: int = 10, similarity_threshold: int = 90):
+        """
+        Args:
+            look_ahead_window: 向前查找的窗口大小
+            similarity_threshold: 文本相似度阈值
+        """
+        self.look_ahead_window = look_ahead_window
+        self.similarity_threshold = similarity_threshold
+        
+        # 初始化子模块
+        self.text_matcher = TextMatcher(similarity_threshold)
+        self.bbox_extractor = BBoxExtractor()
+        self.data_processor = DataProcessor(self.text_matcher, look_ahead_window)
+        self.markdown_generator = MarkdownGenerator()
+    
+    def merge_table_with_bbox(self, mineru_json_path: str, 
+                             paddle_json_path: str) -> List[Dict]:
+        """
+        合并 MinerU 和 PaddleOCR 的结果
+        
+        Args:
+            mineru_json_path: MinerU 输出的 JSON 路径
+            paddle_json_path: PaddleOCR 输出的 JSON 路径
+        
+        Returns:
+            合并后的结果
+        """
+        # 加载数据
+        with open(mineru_json_path, 'r', encoding='utf-8') as f:
+            mineru_data = json.load(f)
+        
+        with open(paddle_json_path, 'r', encoding='utf-8') as f:
+            paddle_data = json.load(f)
+        
+        # 提取 PaddleOCR 的文字框信息
+        paddle_text_boxes = self.bbox_extractor.extract_paddle_text_boxes(paddle_data)
+        
+        # 处理 MinerU 的数据
+        merged_data = self.data_processor.process_mineru_data(
+            mineru_data, paddle_text_boxes
+        )
+        
+        return merged_data
+    
+    def generate_enhanced_markdown(self, merged_data: List[Dict], 
+                                   output_path: str = None,
+                                   mineru_file: str = None) -> str:
+        """生成增强的 Markdown"""
+        return self.markdown_generator.generate_enhanced_markdown(
+            merged_data, output_path, mineru_file
+        )
+    
+    def extract_table_cells_with_bbox(self, merged_data: List[Dict]) -> List[Dict]:
+        """提取所有表格单元格及其 bbox 信息"""
+        return self.bbox_extractor.extract_table_cells_with_bbox(merged_data)

+ 137 - 0
merger/text_matcher.py

@@ -0,0 +1,137 @@
+"""
+文本匹配工具模块
+负责文本标准化、相似度计算等
+"""
+import re
+from typing import Optional, List, Dict
+from fuzzywuzzy import fuzz
+
+
+class TextMatcher:
+    """Text matcher: normalization and fuzzy similarity between OCR strings."""
+    
+    def __init__(self, similarity_threshold: int = 90):
+        """
+        Args:
+            similarity_threshold: text similarity threshold (0-100)
+        """
+        self.similarity_threshold = similarity_threshold
+    
+    def normalize_text(self, text: str) -> str:
+        """Normalize text for comparison (strip whitespace, fold width and case)."""
+        # Remove all whitespace characters.
+        text = re.sub(r'\s+', '', text)
+        # Convert full-width digits/letters/punctuation to half-width.
+        text = self._full_to_half(text)
+        return text.lower()
+    
+    def _full_to_half(self, text: str) -> str:
+        """Convert full-width characters to their half-width forms."""
+        result = []
+        for char in text:
+            code = ord(char)
+            if code == 0x3000:  # full-width (ideographic) space
+                code = 0x0020
+            elif 0xFF01 <= code <= 0xFF5E:  # full-width ASCII block
+                code -= 0xFEE0
+            result.append(chr(code))
+        return ''.join(result)
+    
+    def find_matching_bbox(self, target_text: str, text_boxes: List[Dict], 
+                          start_index: int, last_match_index: int,
+                          look_ahead_window: int = 10) -> tuple[Optional[Dict], int, int]:
+        """
+        Find the text box matching target_text.
+        
+        Args:
+            target_text: text to look for
+            text_boxes: candidate boxes; each is expected to carry 'text' and
+                a 'used' flag (maintained by the caller once a box is consumed)
+            start_index: index to start searching from
+            last_match_index: index of the previous successful match
+            look_ahead_window: how far ahead (and behind) to search
+        
+        Returns:
+            (matched box or None, new cursor position, new last_match_index)
+        """
+        target_text = self.normalize_text(target_text)
+        
+        # Reject overly short targets: they would match almost anything.
+        if len(target_text) < 2:
+            return None, start_index, last_match_index
+
+        # Determine the search range: a little behind the last match, up to
+        # look_ahead_window boxes past start_index.
+        search_start = self._find_search_start(
+            text_boxes, last_match_index, start_index, look_ahead_window
+        )
+        search_end = min(start_index + look_ahead_window, len(text_boxes))
+        
+        # Scan the window for the first acceptable match.
+        for i in range(search_start, search_end):
+            if text_boxes[i]['used']:
+                continue
+            
+            box_text = self.normalize_text(text_boxes[i]['text'])
+            
+            # Exact match wins immediately.
+            if target_text == box_text:
+                return self._return_match(text_boxes[i], i, start_index)
+            
+            # Skip overly short candidates.
+            if len(box_text) < 2:
+                continue
+            
+            # Cheap length-ratio gate before the fuzzy comparison.
+            if not self._check_length_ratio(target_text, box_text):
+                continue
+            
+            # Fuzzy similarity check.
+            if self._is_similar(target_text, box_text):
+                return self._return_match(text_boxes[i], i, start_index)
+        
+        return None, start_index, last_match_index
+    
+    def _find_search_start(self, text_boxes: List[Dict], last_match_index: int,
+                          start_index: int, look_ahead_window: int) -> int:
+        """Determine the search start: walk backwards from the last match
+        until look_ahead_window unused boxes have been seen."""
+        search_start = last_match_index - 1
+        unused_count = 0
+        
+        while search_start >= 0:
+            if not text_boxes[search_start]['used']:
+                unused_count += 1
+            if unused_count >= look_ahead_window:
+                break
+            search_start -= 1
+        
+        if search_start < 0:
+            # Fell off the front: start at the first unused box before start_index.
+            search_start = 0
+            while search_start < start_index and text_boxes[search_start]['used']:
+                search_start += 1
+        
+        return search_start
+    
+    def _check_length_ratio(self, text1: str, text2: str) -> bool:
+        """Check that the two lengths are within a 0.3 ratio of each other."""
+        length_ratio = min(len(text1), len(text2)) / max(len(text1), len(text2))
+        return length_ratio >= 0.3
+    
+    def _is_similar(self, text1: str, text2: str) -> bool:
+        """Decide whether two normalized texts are similar enough."""
+        # Substring relationship earns a similarity bonus below.
+        shorter = text1 if len(text1) < len(text2) else text2
+        longer = text2 if len(text1) < len(text2) else text1
+        is_substring = shorter in longer
+        
+        # fuzzywuzzy partial ratio (0-100).
+        partial_ratio = fuzz.partial_ratio(text1, text2)
+        if is_substring:
+            partial_ratio += 10  # boost when one text contains the other
+        
+        return partial_ratio >= self.similarity_threshold
+    
+    def _return_match(self, text_box: Dict, index: int, start_index: int) -> tuple:
+        """Package a match result: advance the cursor only for forward hits."""
+        if index >= start_index:
+            return text_box, index + 1, index
+        else:
+            return text_box, start_index, index