
Two-pointer text matching; some cases are still missed

zhch158_admin, 1 month ago
Commit 16b5f43cba
1 changed file with 70 additions and 48 deletions
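
For orientation: the "two-pointer" matching the commit message refers to is the forward scan that pairs each table cell's text with the next PaddleOCR text box. Below is a minimal, self-contained sketch of the behaviour after this commit; the function name find_matching_box, the simplified normalization, and the (None, start_pointer) fallback are illustrative assumptions, not the code in this commit. The key change mirrored here is that the scan now returns the first box whose fuzz.partial_ratio score reaches the raised threshold (90) instead of tracking the best candidate.

from fuzzywuzzy import fuzz

def find_matching_box(target_text, text_boxes, start_pointer,
                      look_ahead_window=10, similarity_threshold=90):
    """Scan at most look_ahead_window boxes from start_pointer; return (box, next_pointer)."""
    end = min(start_pointer + look_ahead_window, len(text_boxes))
    for i in range(start_pointer, end):
        box_text = text_boxes[i]['text'].strip()  # simplified stand-in for _normalize_text
        if target_text == box_text:
            return text_boxes[i], i + 1           # exact match always wins
        if fuzz.partial_ratio(target_text, box_text) >= similarity_threshold:
            return text_boxes[i], i + 1           # first box at/above threshold; no best-match search
    return None, start_pointer                    # no match: pointer unchanged, cell left unannotated

Because the scan stops at the window boundary and never backtracks, boxes that appear out of reading order inside the window can still be skipped, which matches the "some cases are still missed" note in the commit message.
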
merge_mineru_paddle_ocr.py

@@ -14,7 +14,7 @@ from fuzzywuzzy import fuzz
 class MinerUPaddleOCRMerger:
     """合并 MinerU 和 PaddleOCR 的结果"""
     
-    def __init__(self, look_ahead_window: int = 10, similarity_threshold: int = 80):
+    def __init__(self, look_ahead_window: int = 10, similarity_threshold: int = 90):
         """
         Args:
             look_ahead_window: size of the look-ahead window
@@ -85,7 +85,10 @@ class MinerUPaddleOCRMerger:
         merged_data = []
         cells = None  # stores info for all table cells
         paddle_pointer = 0  # pointer into the PaddleOCR text boxes
-        
+
+        # Sort mineru_data by bbox top-to-bottom, then left-to-right, to keep the order consistent
+        mineru_data.sort(key=lambda x: (x['bbox'][1], x['bbox'][0]) if 'bbox' in x else (float('inf'), float('inf')))
+
         for item in mineru_data:
             if item['type'] == 'table':
                 # Process the table
@@ -148,34 +151,38 @@ class MinerUPaddleOCRMerger:
         current_pointer = start_pointer
         cells = []  # stores bbox info for the matched cells
 
-        # Iterate over all cells
-        for cell in soup.find_all(['td', 'th']):
-            cell_text = cell.get_text(strip=True)
+        # Iterate over all rows
+        for row_idx, row in enumerate(soup.find_all('tr')):
+            # Iterate over all cells in the row
+            for col_idx, cell in enumerate(row.find_all(['td', 'th'])):
+                cell_text = cell.get_text(strip=True)
             
-            if not cell_text:
-                continue
-            
-            # Find the matching bbox
-            matched_bbox, current_pointer = self._find_matching_bbox(
-                cell_text, paddle_text_boxes, current_pointer
-            )
-            
-            if matched_bbox:
-                # Add the data-bbox attribute
-                bbox = matched_bbox['bbox']
-                cell['data-bbox'] = f"[{bbox[0]},{bbox[1]},{bbox[2]},{bbox[3]}]"
-                cell['data-score'] = f"{matched_bbox['score']:.4f}"
-                cell['data-paddle-index'] = str(matched_bbox['paddle_bbox_index'])
+                if not cell_text:
+                    continue
+                
+                # Find the matching bbox
+                matched_bbox, current_pointer = self._find_matching_bbox(
+                    cell_text, paddle_text_boxes, current_pointer
+                )
+                
+                if matched_bbox:
+                    # Add the data-bbox attribute
+                    bbox = matched_bbox['bbox']
+                    cell['data-bbox'] = f"[{bbox[0]},{bbox[1]},{bbox[2]},{bbox[3]}]"
+                    cell['data-score'] = f"{matched_bbox['score']:.4f}"
+                    cell['data-paddle-index'] = str(matched_bbox['paddle_bbox_index'])
 
-                cells.append({
-                    'type': 'table_cell',
-                    'text': cell_text,
-                    'bbox': bbox,
-                    'score': matched_bbox['score'],
-                    'paddle_bbox_index': matched_bbox['paddle_bbox_index']
-                })
-                # Mark as used
-                matched_bbox['used'] = True
+                    cells.append({
+                        'type': 'table_cell',
+                        'text': cell_text,
+                        'bbox': bbox,
+                        'row': row_idx+1,
+                        'col': col_idx+1,
+                        'score': matched_bbox['score'],
+                        'paddle_bbox_index': matched_bbox['paddle_bbox_index']
+                    })
+                    # Mark as used
+                    matched_bbox['used'] = True
         
         return str(soup), cells, current_pointer
     
@@ -208,17 +215,16 @@ class MinerUPaddleOCRMerger:
             box_text = self._normalize_text(text_boxes[i]['text'])
             
             # Compute similarity
-            similarity = fuzz.token_set_ratio(target_text, box_text)
+            similarity = fuzz.partial_ratio(target_text, box_text)
             
             # Exact matches take priority
             if target_text == box_text:
                 return text_boxes[i], i + 1
             
-            # Record the best match
-            if similarity > best_similarity and similarity >= self.similarity_threshold:
-                best_similarity = similarity
-                best_match = text_boxes[i]
-                best_index = i + 1
+            # Return as soon as the threshold is met instead of searching for the best match
+            # if similarity > best_similarity and similarity >= self.similarity_threshold:
+            if similarity >= self.similarity_threshold:
+                return text_boxes[i], i + 1
 
         return best_match, best_index
 
@@ -271,10 +277,16 @@ class MinerUPaddleOCRMerger:
                 md_lines.append(f"{text}\n")
             
             elif item['type'] == 'table':
-                md_lines.append("\n## 表格\n")
                 md_lines.append("<!-- 表格单元格包含 data-bbox 属性 -->\n")
                 md_lines.append(item.get('table_body_with_bbox', item.get('table_body', '')))
                 md_lines.append("\n")
+            
+            elif item['type'] == 'image':
+                img_path = item.get('img_path', '')
+                bbox = item.get('bbox', [])
+                if bbox:
+                    md_lines.append(f"<!-- bbox: {bbox} -->")
+                md_lines.append(f"![Image]({img_path})\n")
         
         markdown_content = '\n'.join(md_lines)
         
@@ -325,7 +337,7 @@ class MinerUPaddleOCRMerger:
 
 
 def merge_single_file(mineru_file: Path, paddle_file: Path, output_dir: Path, 
-                     merger: MinerUPaddleOCRMerger) -> bool:
+                     output_format: str, merger: MinerUPaddleOCRMerger) -> bool:
     """
     Merge a single file
     
@@ -341,6 +353,7 @@ def merge_single_file(mineru_file: Path, paddle_file: Path, output_dir: Path,
     print(f"📄 处理: {mineru_file.name}")
     
     # Output file paths
+    merged_md_path = output_dir / f"{mineru_file.stem}.md"
     merged_json_path = output_dir / f"{mineru_file.stem}.json"
     
     try:
@@ -351,13 +364,14 @@ def merge_single_file(mineru_file: Path, paddle_file: Path, output_dir: Path,
         )
         
         # Generate Markdown
-        # merger.generate_enhanced_markdown(merged_data, str(merged_md_path))
+        if output_format in ['markdown', 'both']:
+            merger.generate_enhanced_markdown(merged_data, str(merged_md_path))
         
         # Extract cell information
         # cells = merger.extract_table_cells_with_bbox(merged_data)
-        
-        with open(merged_json_path, 'w', encoding='utf-8') as f:
-            json.dump(merged_data, f, ensure_ascii=False, indent=2)
+        if output_format in ['json', 'both']:
+            with open(merged_json_path, 'w', encoding='utf-8') as f:
+                json.dump(merged_data, f, ensure_ascii=False, indent=2)
 
         print(f"  ✅ 合并完成")
         print(f"  📊 共处理了 {len(merged_data)} 个对象")
@@ -373,7 +387,7 @@ def merge_single_file(mineru_file: Path, paddle_file: Path, output_dir: Path,
         return False
 
 
-def merge_mineru_paddle_batch(mineru_dir: str, paddle_dir: str, output_dir: str,
+def merge_mineru_paddle_batch(mineru_dir: str, paddle_dir: str, output_dir: str, output_format: str = 'both',
                               look_ahead_window: int = 10, 
                               similarity_threshold: int = 80):
     """
@@ -418,8 +432,8 @@ def merge_mineru_paddle_batch(mineru_dir: str, paddle_dir: str, output_dir: str,
             print(f"⚠️  跳过: 未找到对应的 PaddleOCR 文件: {paddle_file.name}\n")
             failed_count += 1
             continue
-        
-        if merge_single_file(mineru_file, paddle_file, output_path, merger):
+
+        if merge_single_file(mineru_file, paddle_file, output_path, output_format, merger):
             success_count += 1
         else:
             failed_count += 1
@@ -499,7 +513,12 @@ def main():
         required=True,
         help='输出目录(必需)'
     )
-    
+    output_group.add_argument(
+        '-f', '--format', 
+        choices=['json', 'markdown', 'both'], 
+        default='json', help='输出格式'
+    )
+
     # Algorithm parameters
     algo_group = parser.add_argument_group('算法参数')
     algo_group.add_argument(
@@ -516,6 +535,7 @@ def main():
     )
     
     args = parser.parse_args()
+    output_format = args.format.lower()
     
     # Validate arguments
     if args.mineru_file and args.paddle_file:
@@ -546,7 +566,7 @@ def main():
             similarity_threshold=args.threshold
         )
         
-        success = merge_single_file(mineru_file, paddle_file, output_dir, merger)
+        success = merge_single_file(mineru_file, paddle_file, output_dir, output_format, merger)
         
         if success:
             print("\n✅ 处理完成!")
@@ -569,6 +589,7 @@ def main():
             args.mineru_dir,
             args.paddle_dir,
             args.output_dir,
+            output_format=output_format,
             look_ahead_window=args.window,
             similarity_threshold=args.threshold
         )
@@ -590,9 +611,10 @@ if __name__ == "__main__":
         
         # Default configuration
         default_config = {
-            "mineru-dir": "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/mineru-vlm-2.5.3_Results",
-            "paddle-dir": "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/data_PPStructureV3_Results",
-            "output-dir": "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/merged_results",
+            "mineru-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/mineru-vlm-2.5.3_Results",
+            "paddle-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/data_PPStructureV3_Results",
+            "output-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/merged_results",
+            "format": "both",
             "window": "15",
             "threshold": "85"
         }
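
For reference, a minimal usage sketch of the new output format option added in this commit, calling the batch entry point from Python instead of the CLI; the directory paths are placeholders, and the import assumes merge_mineru_paddle_ocr.py is importable as a module.

from merge_mineru_paddle_ocr import merge_mineru_paddle_batch

merge_mineru_paddle_batch(
    mineru_dir="path/to/mineru_results",      # placeholder paths
    paddle_dir="path/to/ppstructure_results",
    output_dir="path/to/merged_results",
    output_format="both",                     # 'json', 'markdown', or 'both'; the CLI flag -f/--format defaults to 'json'
    look_ahead_window=15,
    similarity_threshold=85,
)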