1 month ago · da2c59ec1b
--- a/merge_mineru_paddle_ocr.py
+++ b/merge_mineru_paddle_ocr.py
@@ -117,7 +117,7 @@ class MinerUPaddleOCRMerger:
 
				                 
			
 
				                 merged_data.append(merged_item)
			
 
				             
			
 
				-            elif item['type'] in ['text', 'header']:
			
 
				+            elif item['type'] in ['text', 'title']:
			
 
				                 # 处理普通文本
			
 
				                 merged_item = item.copy()
			
 
				                 text = item.get('text', '')
			
@@ -128,14 +128,33 @@ class MinerUPaddleOCRMerger:
 
				                 )
			
 
				                 
			
 
				                 if matched_bbox:
			
 
				-                    merged_item['bbox'] = matched_bbox['bbox']
			
 
				-                    merged_item['bbox_source'] = 'paddle_ocr'
			
 
				-                    merged_item['text_score'] = matched_bbox['score']
			
 
				+                    # merged_item['bbox'] = matched_bbox['bbox']
			
 
				+                    # merged_item['bbox_source'] = 'paddle_ocr'
			
 
				+                    # merged_item['text_score'] = matched_bbox['score']
			
 
				+
			
 
				+                    # 保留 MinerU 的 bbox；匹配结果只用于推进 paddle_pointer 和 last_matched_index
			
 
				                     # 标记为已使用
			
 
				                     matched_bbox['used'] = True
			
 
				                 
			
 
				                 merged_data.append(merged_item)
			
 
				-            
			
 
				+            elif item['type'] == 'list':
			
 
				+                # 处理列表项
			
 
				+                merged_item = item.copy()
			
 
				+                list_items = item.get('list_items', [])
			
 
				+                sub_type = item.get('sub_type', 'unordered')  # 有序或无序
			
 
				+
			
 
				+                for list_item in list_items:
			
 
				+                    # 查找匹配的 bbox
			
 
				+                    matched_bbox, paddle_pointer, last_matched_index = self._find_matching_bbox(
			
 
				+                        list_item, paddle_text_boxes, paddle_pointer, last_matched_index
			
 
				+                    )
			
 
				+                    
			
 
				+                    if matched_bbox:
			
 
				+                        # 保留 MinerU 的 bbox；匹配结果只用于推进 paddle_pointer 和 last_matched_index
			
 
				+                        # 标记为已使用
			
 
				+                        matched_bbox['used'] = True
			
 
				+                
			
 
				+                merged_data.append(merged_item)
			
 
				             else:
			
 
				                 # 其他类型直接复制
			
 
				                 merged_data.append(item.copy())
			
@@ -218,6 +237,11 @@ class MinerUPaddleOCRMerger:
 
				 
			
 
				         # 由于minerU和Paddle的顺序基本一致, 也有不一致的地方, 所以需要向前找第一个未使用的位置
			
 
				         # MinerU和Paddle都可能识别错误，所以需要一个look_ahead_window来避免漏掉匹配
			
 
				+        # 匹配时会遇到一些特殊情况，比如Paddle把两个连着的cell识别为一个字符串，MinerU将单元格上下2行识别为一行
			
 
				+        # 	'1|2024-08-11|扫二维码付'   minerU识别为“扫二维码付款”，Paddle识别为'12024-08-11扫二维码付'  
			
 
				+        #                  款
			
 
				+        # 字符串的顺序极大概率是一致的，所以如果短字符串是长字符串的子串，可以增加相似权重
			
 
				+
			
 
				         search_start = last_match_index - 1
			
 
				         unused_count = 0
			
 
				         while search_start >= 0:
			
@@ -238,6 +262,12 @@ class MinerUPaddleOCRMerger:
 
				                 continue
			
 
				             
			
 
				             box_text = self._normalize_text(text_boxes[i]['text'])
			
 
				+            # 精确匹配优先
			
 
				+            if target_text == box_text:
			
 
				+                if i >= start_index:
			
 
				+                    return text_boxes[i], i + 1, i
			
 
				+                else:
			
 
				+                    return text_boxes[i], start_index, i
			
 
				             
			
 
				             # 过滤过短的候选文本(避免单字符匹配)
			
 
				             if len(box_text) < 2:
			
@@ -247,21 +277,20 @@ class MinerUPaddleOCRMerger:
 
				             length_ratio = min(len(target_text), len(box_text)) / max(len(target_text), len(box_text))
			
 
				             if length_ratio < 0.3:  # 长度差异超过70%则跳过
			
 
				                 continue
			
 
				-            
			
 
				-            # 精确匹配优先
			
 
				-            if target_text == box_text:
			
 
				-                if i >= start_index:
			
 
				-                    return text_boxes[i], i + 1, i
			
 
				-                else:
			
 
				-                    return text_boxes[i], start_index, i
			
 
				-            
			
 
				+
			
 
				+            # 子串检查
			
 
				+            shorter = target_text if len(target_text) < len(box_text) else box_text
			
 
				+            longer = box_text if len(target_text) < len(box_text) else target_text
			
 
				+            is_substring = shorter in longer        
			
 
				+
			
 
				             # 计算多种相似度
			
 
				+            # token_sort_ratio = fuzz.token_sort_ratio(target_text, box_text)
			
 
				             partial_ratio = fuzz.partial_ratio(target_text, box_text)
			
 
				-            token_sort_ratio = fuzz.token_sort_ratio(target_text, box_text)
			
 
				+            if is_substring:
			
 
				+                partial_ratio += 10  # 子串时提升相似度
			
 
				             
			
 
				             # 相似度判定 - 仅比较 partial_ratio（子串命中时已加 10 分）与阈值
			
 
				-            if (partial_ratio >= self.similarity_threshold and 
			
 
				-                token_sort_ratio >= 50):  # token_sort 阈值稍低
			
 
				+            if (partial_ratio >= self.similarity_threshold):
			
 
				                 return text_boxes[i], start_index, last_match_index
			
 
				 
			
 
				         return best_match, best_index, last_match_index
			
@@ -290,10 +319,12 @@ class MinerUPaddleOCRMerger:
 
				                                    output_path: Optional[str] = None, mineru_file: Optional[str] = None) -> str:
			
 
				         """
			
 
				         生成增强的 Markdown（包含 bbox 信息的注释）
			
 
				+        参考 MinerU 的实现,支持标题、列表、表格标题等
			
 
				         
			
 
				         Args:
			
 
				             merged_data: 合并后的数据
			
 
				             output_path: 输出路径（可选）
			
 
				+            mineru_file: MinerU 源文件路径（用于复制图片）
			
 
				         
			
 
				         Returns:
			
 
				             Markdown 内容
			
@@ -301,49 +332,145 @@ class MinerUPaddleOCRMerger:
 
				         md_lines = []
			
 
				         
			
 
				         for item in merged_data:
			
 
				-            if item['type'] == 'header':
			
 
				-                text = item.get('text', '')
			
 
				-                bbox = item.get('bbox', [])
			
 
				+            item_type = item.get('type', '')
			
 
				+            bbox = item.get('bbox', [])
			
 
				+            
			
 
				+            # 添加 bbox 注释
			
 
				+            if bbox:
			
 
				                 md_lines.append(f"<!-- bbox: {bbox} -->")
			
 
				-                md_lines.append(f"# {text}\n")
			
 
				             
			
 
				-            elif item['type'] == 'text':
			
 
				+            # 根据类型处理
			
 
				+            if item_type == 'title':
			
 
				+                # 标题 - 使用 text_level 确定标题级别
			
 
				                 text = item.get('text', '')
			
 
				-                bbox = item.get('bbox', [])
			
 
				-                if bbox:
			
 
				-                    md_lines.append(f"<!-- bbox: {bbox} -->")
			
 
				-                md_lines.append(f"{text}\n")
			
 
				+                text_level = item.get('text_level', 1)
			
 
				+                heading = '#' * min(text_level, 6)  # 最多6级标题
			
 
				+                md_lines.append(f"{heading} {text}\n")
			
 
				             
			
 
				-            elif item['type'] == 'table':
			
 
				-                md_lines.append("<!-- 表格单元格包含 data-bbox 属性 -->\n")
			
 
				-                md_lines.append(item.get('table_body_with_bbox', item.get('table_body', '')))
			
 
				-                md_lines.append("\n")
			
 
				+            elif item_type == 'text':
			
 
				+                # 普通文本 - 可能也有 text_level
			
 
				+                text = item.get('text', '')
			
 
				+                text_level = item.get('text_level', 0)
			
 
				+                
			
 
				+                if text_level > 0:
			
 
				+                    # 作为标题处理
			
 
				+                    heading = '#' * min(text_level, 6)
			
 
				+                    md_lines.append(f"{heading} {text}\n")
			
 
				+                else:
			
 
				+                    # 普通段落
			
 
				+                    md_lines.append(f"{text}\n")
			
 
				+            
			
 
				+            elif item_type == 'list':
			
 
				+                # 列表
			
 
				+                sub_type = item.get('sub_type', 'text')
			
 
				+                list_items = item.get('list_items', [])
			
 
				+                
			
 
				+                for list_item in list_items:
			
 
				+                    md_lines.append(f"{list_item}\n")
			
 
				+                
			
 
				+                md_lines.append("")  # 列表后添加空行
			
 
				+            
			
 
				+            elif item_type == 'table':
			
 
				+                # 表格标题
			
 
				+                table_caption = item.get('table_caption', [])
			
 
				+                if table_caption:
			
 
				+                    for caption in table_caption:
			
 
				+                        if caption:  # 跳过空标题
			
 
				+                            md_lines.append(f"**{caption}**\n")
			
 
				+                
			
 
				+                # 表格内容
			
 
				+                table_body = item.get('table_body_with_bbox', item.get('table_body', ''))
			
 
				+                if table_body:
			
 
				+                    md_lines.append(table_body)
			
 
				+                    md_lines.append("")
			
 
				+                
			
 
				+                # 表格脚注
			
 
				+                table_footnote = item.get('table_footnote', [])
			
 
				+                if table_footnote:
			
 
				+                    for footnote in table_footnote:
			
 
				+                        if footnote:
			
 
				+                            md_lines.append(f"*{footnote}*")
			
 
				+                    md_lines.append("")
			
 
				             
			
 
				-            elif item['type'] == 'image':
			
 
				+            elif item_type == 'image':
			
 
				+                # 图片
			
 
				                 img_path = item.get('img_path', '')
			
 
				-                # 需要将minerU图像路径下的图片拷贝到输出目录
			
 
				-                if img_path and mineru_file:
			
 
				+                
			
 
				+                # 复制图片到输出目录
			
 
				+                if img_path and mineru_file and output_path:
			
 
				                     mineru_dir = Path(mineru_file).parent
			
 
				                     img_full_path = mineru_dir / img_path
			
 
				                     if img_full_path.exists():
			
 
				-                        # 需要将图片拷贝到输出目录
			
 
				                         output_img_path = Path(output_path).parent / img_path
			
 
				                         output_img_path.parent.mkdir(parents=True, exist_ok=True)
			
 
				                         shutil.copy(img_full_path, output_img_path)
			
 
				-
			
 
				-                bbox = item.get('bbox', [])
			
 
				-                if bbox:
			
 
				-                    md_lines.append(f"<!-- bbox: {bbox} -->")
			
 
				+                
			
 
				+                # 图片标题
			
 
				+                image_caption = item.get('image_caption', [])
			
 
				+                if image_caption:
			
 
				+                    for caption in image_caption:
			
 
				+                        if caption:
			
 
				+                            md_lines.append(f"**{caption}**\n")
			
 
				+                
			
 
				+                # 插入图片
			
 
				                 md_lines.append(f"![Image]({img_path})\n")
			
 
				+                
			
 
				+                # 图片脚注
			
 
				+                image_footnote = item.get('image_footnote', [])
			
 
				+                if image_footnote:
			
 
				+                    for footnote in image_footnote:
			
 
				+                        if footnote:
			
 
				+                            md_lines.append(f"*{footnote}*")
			
 
				+                    md_lines.append("")
			
 
				+            
			
 
				+            elif item_type == 'equation':
			
 
				+                # 公式
			
 
				+                latex = item.get('latex', '')
			
 
				+                if latex:
			
 
				+                    md_lines.append(f"$$\n{latex}\n$$\n")
			
 
				+            
			
 
				+            elif item_type == 'inline_equation':
			
 
				+                # 行内公式
			
 
				+                latex = item.get('latex', '')
			
 
				+                if latex:
			
 
				+                    md_lines.append(f"${latex}$\n")
			
 
				+            
			
 
				+            elif item_type == 'page_number':
			
 
				+                # 页码 - 通常跳过或作为注释
			
 
				+                text = item.get('text', '')
			
 
				+                md_lines.append(f"<!-- 页码: {text} -->\n")
			
 
				+            
			
 
				+            elif item_type == 'header':
			
 
				+                # 页眉
			
 
				+                text = item.get('text', '')
			
 
				+                md_lines.append(f"<!-- 页眉: {text} -->\n")
			
 
				+            
			
 
				+            elif item_type == 'footer':
			
 
				+                # 页脚
			
 
				+                text = item.get('text', '')
			
 
				+                if text:
			
 
				+                    md_lines.append(f"<!-- 页脚: {text} -->\n")
			
 
				+            
			
 
				+            elif item_type == 'reference':
			
 
				+                # 参考文献
			
 
				+                text = item.get('text', '')
			
 
				+                md_lines.append(f"> {text}\n")
			
 
				+            
			
 
				+            else:
			
 
				+                # 未知类型 - 尝试提取文本
			
 
				+                text = item.get('text', '')
			
 
				+                if text:
			
 
				+                    md_lines.append(f"{text}\n")
			
 
				         
			
 
				         markdown_content = '\n'.join(md_lines)
			
 
				         
			
 
				+        # 保存文件
			
 
				         if output_path:
			
 
				             with open(output_path, 'w', encoding='utf-8') as f:
			
 
				                 f.write(markdown_content)
			
 
				         
			
 
				         return markdown_content
			
 
				-    
			
 
				+
			
 
				     def extract_table_cells_with_bbox(self, merged_data: List[Dict]) -> List[Dict]:
			
 
				         """
			
 
				         提取所有表格单元格及其 bbox 信息
			
@@ -581,7 +708,7 @@ def main():
 
				     algo_group.add_argument(
			
 
				         '-t', '--threshold',
			
 
				         type=int,
			
 
				-        default=85,
			
 
				+        default=80,
			
 
				         help='文本相似度阈值（0-100，默认: 80）'
			
 
				     )
			
 
				     
			
@@ -662,9 +789,9 @@ if __name__ == "__main__":
 
				         
			
 
				         # 默认配置
			
 
				         default_config = {
			
 
				-            "mineru-file": "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/mineru-vlm-2.5.3_Results/B用户_扫描流水_page_001.json",
			
 
				-            "paddle-file": "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/data_PPStructureV3_Results/B用户_扫描流水_page_001.json",
			
 
				-            "output-dir": "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/merged_results",
			
 
				+            "mineru-file": "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/mineru-vlm-2.5.3_Results/A用户_单元格扫描流水_page_001.json",
			
 
				+            "paddle-file": "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/data_PPStructureV3_Results/A用户_单元格扫描流水_page_001.json",
			
 
				+            "output-dir": "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/merged_results",
			
 
				             # "mineru-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/mineru-vlm-2.5.3_Results",
			
 
				             # "paddle-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/data_PPStructureV3_Results",
			
 
				             # "output-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/merged_results",