
Two-pointer text matching; some cases are still missed

zhch158_admin, 1 month ago
Commit 16b5f43cba
1 changed file with 70 additions and 48 deletions
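
For orientation: the "two-pointer" matching the commit message refers to is the forward scan that pairs each table cell's text with the next PaddleOCR text box. Below is a minimal, self-contained sketch of the behaviour after this commit; the function name find_matching_box, the simplified normalization, and the (None, start_pointer) fallback are illustrative assumptions, not the code in this commit. The key change mirrored here is that the scan now returns the first box whose fuzz.partial_ratio score reaches the raised threshold (90) instead of tracking the best candidate.

from fuzzywuzzy import fuzz

def find_matching_box(target_text, text_boxes, start_pointer,
                      look_ahead_window=10, similarity_threshold=90):
    """Scan at most look_ahead_window boxes from start_pointer; return (box, next_pointer)."""
    end = min(start_pointer + look_ahead_window, len(text_boxes))
    for i in range(start_pointer, end):
        box_text = text_boxes[i]['text'].strip()  # simplified stand-in for _normalize_text
        if target_text == box_text:
            return text_boxes[i], i + 1           # exact match always wins
        if fuzz.partial_ratio(target_text, box_text) >= similarity_threshold:
            return text_boxes[i], i + 1           # first box at/above threshold; no best-match search
    return None, start_pointer                    # no match: pointer unchanged, cell left unannotated

Because the scan stops at the window boundary and never backtracks, boxes that appear out of reading order inside the window can still be skipped, which matches the "some cases are still missed" note in the commit message.
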
merge_mineru_paddle_ocr.py

@@ -14,7 +14,7 @@ from fuzzywuzzy import fuzz
 class MinerUPaddleOCRMerger:
     """合并 MinerU 和 PaddleOCR 的结果"""
     
-    def __init__(self, look_ahead_window: int = 10, similarity_threshold: int = 80):
+    def __init__(self, look_ahead_window: int = 10, similarity_threshold: int = 90):
         """
         Args:
             look_ahead_window: size of the look-ahead window
@@ -85,7 +85,10 @@ class MinerUPaddleOCRMerger:
         merged_data = []
         cells = None  # stores info for all table cells
         paddle_pointer = 0  # pointer into the PaddleOCR text boxes
-        
+
+        # Sort mineru_data by bbox top-to-bottom, then left-to-right, to keep the order consistent
+        mineru_data.sort(key=lambda x: (x['bbox'][1], x['bbox'][0]) if 'bbox' in x else (float('inf'), float('inf')))
+
         for item in mineru_data:
             if item['type'] == 'table':
                 # Process the table
@@ -148,34 +151,38 @@ class MinerUPaddleOCRMerger:
         current_pointer = start_pointer
         cells = []  # stores bbox info for the matched cells
 
-        # Iterate over all cells
-        for cell in soup.find_all(['td', 'th']):
-            cell_text = cell.get_text(strip=True)
+        # Iterate over all rows
+        for row_idx, row in enumerate(soup.find_all('tr')):
+            # Iterate over all cells in the row
+            for col_idx, cell in enumerate(row.find_all(['td', 'th'])):
+                cell_text = cell.get_text(strip=True)
             
-            if not cell_text:
-                continue
-            
-            # Find the matching bbox
-            matched_bbox, current_pointer = self._find_matching_bbox(
-                cell_text, paddle_text_boxes, current_pointer
-            )
-            
-            if matched_bbox:
-                # Add the data-bbox attribute
-                bbox = matched_bbox['bbox']
-                cell['data-bbox'] = f"[{bbox[0]},{bbox[1]},{bbox[2]},{bbox[3]}]"
-                cell['data-score'] = f"{matched_bbox['score']:.4f}"
-                cell['data-paddle-index'] = str(matched_bbox['paddle_bbox_index'])
+                if not cell_text:
+                    continue
+                
+                # Find the matching bbox
+                matched_bbox, current_pointer = self._find_matching_bbox(
+                    cell_text, paddle_text_boxes, current_pointer
+                )
+                
+                if matched_bbox:
+                    # Add the data-bbox attribute
+                    bbox = matched_bbox['bbox']
+                    cell['data-bbox'] = f"[{bbox[0]},{bbox[1]},{bbox[2]},{bbox[3]}]"
+                    cell['data-score'] = f"{matched_bbox['score']:.4f}"
+                    cell['data-paddle-index'] = str(matched_bbox['paddle_bbox_index'])
 
-                cells.append({
-                    'type': 'table_cell',
-                    'text': cell_text,
-                    'bbox': bbox,
-                    'score': matched_bbox['score'],
-                    'paddle_bbox_index': matched_bbox['paddle_bbox_index']
-                })
-                # Mark as used
-                matched_bbox['used'] = True
+                    cells.append({
+                        'type': 'table_cell',
+                        'text': cell_text,
+                        'bbox': bbox,
+                        'row': row_idx+1,
+                        'col': col_idx+1,
+                        'score': matched_bbox['score'],
+                        'paddle_bbox_index': matched_bbox['paddle_bbox_index']
+                    })
+                    # Mark as used
+                    matched_bbox['used'] = True
         
         return str(soup), cells, current_pointer
     
@@ -208,17 +215,16 @@ class MinerUPaddleOCRMerger:
             box_text = self._normalize_text(text_boxes[i]['text'])
             
             # Compute similarity
-            similarity = fuzz.token_set_ratio(target_text, box_text)
+            similarity = fuzz.partial_ratio(target_text, box_text)
             
             # Exact matches take priority
             if target_text == box_text:
                 return text_boxes[i], i + 1
             
-            # Record the best match
-            if similarity > best_similarity and similarity >= self.similarity_threshold:
-                best_similarity = similarity
-                best_match = text_boxes[i]
-                best_index = i + 1
+            # Return as soon as the threshold is met instead of searching for the best match
+            # if similarity > best_similarity and similarity >= self.similarity_threshold:
+            if similarity >= self.similarity_threshold:
+                return text_boxes[i], i + 1
 
         return best_match, best_index
 
@@ -271,10 +277,16 @@ class MinerUPaddleOCRMerger:
                 md_lines.append(f"{text}\n")
             
             elif item['type'] == 'table':
-                md_lines.append("\n## 表格\n")
                 md_lines.append("<!-- 表格单元格包含 data-bbox 属性 -->\n")
                 md_lines.append(item.get('table_body_with_bbox', item.get('table_body', '')))
                 md_lines.append("\n")
+            
+            elif item['type'] == 'image':
+                img_path = item.get('img_path', '')
+                bbox = item.get('bbox', [])
+                if bbox:
+                    md_lines.append(f"<!-- bbox: {bbox} -->")
+                md_lines.append(f"![Image]({img_path})\n")
         
         markdown_content = '\n'.join(md_lines)
         
@@ -325,7 +337,7 @@ class MinerUPaddleOCRMerger:
 
 
 def merge_single_file(mineru_file: Path, paddle_file: Path, output_dir: Path, 
-                     merger: MinerUPaddleOCRMerger) -> bool:
+                     output_format: str, merger: MinerUPaddleOCRMerger) -> bool:
     """
     Merge a single file
     
@@ -341,6 +353,7 @@ def merge_single_file(mineru_file: Path, paddle_file: Path, output_dir: Path,
     print(f"📄 处理: {mineru_file.name}")
     
     # Output file paths
+    merged_md_path = output_dir / f"{mineru_file.stem}.md"
     merged_json_path = output_dir / f"{mineru_file.stem}.json"
     
     try:
@@ -351,13 +364,14 @@ def merge_single_file(mineru_file: Path, paddle_file: Path, output_dir: Path,
         )
         
         # Generate Markdown
-        # merger.generate_enhanced_markdown(merged_data, str(merged_md_path))
+        if output_format in ['markdown', 'both']:
+            merger.generate_enhanced_markdown(merged_data, str(merged_md_path))
         
         # Extract cell information
         # cells = merger.extract_table_cells_with_bbox(merged_data)
-        
-        with open(merged_json_path, 'w', encoding='utf-8') as f:
-            json.dump(merged_data, f, ensure_ascii=False, indent=2)
+        if output_format in ['json', 'both']:
+            with open(merged_json_path, 'w', encoding='utf-8') as f:
+                json.dump(merged_data, f, ensure_ascii=False, indent=2)
 
         print(f"  ✅ 合并完成")
         print(f"  📊 共处理了 {len(merged_data)} 个对象")
@@ -373,7 +387,7 @@ def merge_single_file(mineru_file: Path, paddle_file: Path, output_dir: Path,
         return False
 
 
-def merge_mineru_paddle_batch(mineru_dir: str, paddle_dir: str, output_dir: str,
+def merge_mineru_paddle_batch(mineru_dir: str, paddle_dir: str, output_dir: str, output_format: str = 'both',
                               look_ahead_window: int = 10, 
                               similarity_threshold: int = 80):
     """
@@ -418,8 +432,8 @@ def merge_mineru_paddle_batch(mineru_dir: str, paddle_dir: str, output_dir: str,
             print(f"⚠️  跳过: 未找到对应的 PaddleOCR 文件: {paddle_file.name}\n")
             failed_count += 1
             continue
-        
-        if merge_single_file(mineru_file, paddle_file, output_path, merger):
+
+        if merge_single_file(mineru_file, paddle_file, output_path, output_format, merger):
             success_count += 1
         else:
             failed_count += 1
@@ -499,7 +513,12 @@ def main():
         required=True,
         help='输出目录(必需)'
     )
-    
+    output_group.add_argument(
+        '-f', '--format', 
+        choices=['json', 'markdown', 'both'], 
+        default='json', help='输出格式'
+    )
+
     # Algorithm parameters
     algo_group = parser.add_argument_group('算法参数')
     algo_group.add_argument(
@@ -516,6 +535,7 @@ def main():
     )
     
     args = parser.parse_args()
+    output_format = args.format.lower()
     
     # Validate arguments
     if args.mineru_file and args.paddle_file:
@@ -546,7 +566,7 @@ def main():
             similarity_threshold=args.threshold
         )
         
-        success = merge_single_file(mineru_file, paddle_file, output_dir, merger)
+        success = merge_single_file(mineru_file, paddle_file, output_dir, output_format, merger)
         
         if success:
             print("\n✅ 处理完成!")
@@ -569,6 +589,7 @@ def main():
             args.mineru_dir,
             args.paddle_dir,
             args.output_dir,
+            output_format=output_format,
             look_ahead_window=args.window,
             similarity_threshold=args.threshold
         )
@@ -590,9 +611,10 @@ if __name__ == "__main__":
         
         # Default configuration
         default_config = {
-            "mineru-dir": "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/mineru-vlm-2.5.3_Results",
-            "paddle-dir": "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/data_PPStructureV3_Results",
-            "output-dir": "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/merged_results",
+            "mineru-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/mineru-vlm-2.5.3_Results",
+            "paddle-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/data_PPStructureV3_Results",
+            "output-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/merged_results",
+            "format": "both",
             "window": "15",
             "threshold": "85"
         }
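
For reference, a minimal usage sketch of the new output format option added in this commit, calling the batch entry point from Python instead of the CLI; the directory paths are placeholders, and the import assumes merge_mineru_paddle_ocr.py is importable as a module.

from merge_mineru_paddle_ocr import merge_mineru_paddle_batch

merge_mineru_paddle_batch(
    mineru_dir="path/to/mineru_results",      # placeholder paths
    paddle_dir="path/to/ppstructure_results",
    output_dir="path/to/merged_results",
    output_format="both",                     # 'json', 'markdown', or 'both'; the CLI flag -f/--format defaults to 'json'
    look_ahead_window=15,
    similarity_threshold=85,
)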