浏览代码

feat: 更新处理 MinerU 数据的函数,添加参数说明和返回值描述,优化匹配算法以提高准确性

zhch158_admin 1 月之前
父节点
当前提交
17272a1868
共有 1 个文件被更改,包括 73 次插入和 32 次删除
  1. 73 32
      merge_mineru_paddle_ocr.py

+ 73 - 32
merge_mineru_paddle_ocr.py

@@ -81,10 +81,20 @@ class MinerUPaddleOCRMerger:
     
     def _process_mineru_data(self, mineru_data: List[Dict], 
                             paddle_text_boxes: List[Dict]) -> List[Dict]:
-        """处理 MinerU 数据,添加 bbox 信息"""
+        """处理 MinerU 数据,添加 bbox 信息
+
+        Args:
+            mineru_data (List[Dict]): _description_
+            paddle_text_boxes (List[Dict]): _description_
+
+        Returns:
+            List[Dict]: _description_
+        """ 
+
         merged_data = []
         cells = None  # 存储所有表格单元格信息
         paddle_pointer = 0  # PaddleOCR 文字框指针
+        last_matched_index = 0  # 上次匹配成功的索引
 
         # 对mineru_data按bbox从上到下排序,从左到右确保顺序一致
         mineru_data.sort(key=lambda x: (x['bbox'][1], x['bbox'][0]) if 'bbox' in x else (float('inf'), float('inf')))
@@ -103,6 +113,7 @@ class MinerUPaddleOCRMerger:
                 merged_item['table_body'] = enhanced_html
                 merged_item['table_body_with_bbox'] = enhanced_html
                 merged_item['bbox_mapping'] = 'merged_from_paddle_ocr'
+                merged_item['table_cells'] = cells if cells else []
                 
                 merged_data.append(merged_item)
             
@@ -112,8 +123,8 @@ class MinerUPaddleOCRMerger:
                 text = item.get('text', '')
                 
                 # 查找匹配的 bbox
-                matched_bbox, paddle_pointer = self._find_matching_bbox(
-                    text, paddle_text_boxes, paddle_pointer
+                matched_bbox, paddle_pointer, last_matched_index = self._find_matching_bbox(
+                    text, paddle_text_boxes, paddle_pointer, last_matched_index
                 )
                 
                 if matched_bbox:
@@ -129,9 +140,6 @@ class MinerUPaddleOCRMerger:
                 # 其他类型直接复制
                 merged_data.append(item.copy())
         
-        if cells:
-            merged_data.extend(cells)
-
         return merged_data
     
     def _enhance_table_html_with_bbox(self, html: str, paddle_text_boxes: List[Dict], 
@@ -147,8 +155,10 @@ class MinerUPaddleOCRMerger:
         Returns:
             (增强后的 HTML, 单元格数组, 新的指针位置)
         """
+        # 需要处理minerU识别为2个连着的cell,如: -741.00|357,259.63, paddle识别为一个cell,如: -741.00357,259.63
         soup = BeautifulSoup(html, 'html.parser')
         current_pointer = start_pointer
+        last_matched_index = start_pointer
         cells = []  # 存储单元格的 bbox 信息
 
         # 遍历所有行
@@ -161,8 +171,8 @@ class MinerUPaddleOCRMerger:
                     continue
                 
                 # 查找匹配的 bbox
-                matched_bbox, current_pointer = self._find_matching_bbox(
-                    cell_text, paddle_text_boxes, current_pointer
+                matched_bbox, current_pointer, last_matched_index = self._find_matching_bbox(
+                    cell_text, paddle_text_boxes, current_pointer, last_matched_index
                 )
                 
                 if matched_bbox:
@@ -187,22 +197,37 @@ class MinerUPaddleOCRMerger:
         return str(soup), cells, current_pointer
     
     def _find_matching_bbox(self, target_text: str, text_boxes: List[Dict], 
-                           start_index: int) -> tuple[Optional[Dict], int]:
+                           start_index: int, last_match_index: int) -> tuple[Optional[Dict], int, int]:
         """
         查找匹配的文字框
         
         Args:
             target_text: 目标文本
             text_boxes: 文字框列表
-            start_index: 起始索引
+            start_index: 起始索引, 是最后一个used=True的位置+1 
+            last_match_index: 上次匹配成功的索引, 可能比start_index小
         
         Returns:
-            (匹配的文字框信息, 新的指针位置)
+            (匹配的文字框信息, 新的指针位置, last_match_index)
         """
         target_text = self._normalize_text(target_text)
+        
+        # 过滤过短的目标文本
+        if len(target_text) < 2:
+            return None, start_index, last_match_index
 
-        # 在窗口范围内查找, 窗口是start_index往回移动窗口的1/3到start_index + look_ahead_window
-        search_start = max(0, int(start_index - self.look_ahead_window/3))
+        # 由于minerU和Paddle的顺序基本一致, 也有不一致的地方, 所以需要向前找第一个未使用的位置
+        # MinerU和Paddle都可能识别错误,所以需要一个look_ahead_window来避免漏掉匹配
+        search_start = last_match_index - 1
+        unused_count = 0
+        while search_start >= 0:
+            if text_boxes[search_start]['used'] == False:
+                unused_count += 1
+            if unused_count >= self.look_ahead_window:
+                break
+            search_start -= 1
+        if search_start < 0:
+            search_start = 0
         search_end = min(start_index + self.look_ahead_window, len(text_boxes))
         
         best_match = None
@@ -214,19 +239,32 @@ class MinerUPaddleOCRMerger:
             
             box_text = self._normalize_text(text_boxes[i]['text'])
             
-            # 计算相似度
-            similarity = fuzz.partial_ratio(target_text, box_text)
+            # 过滤过短的候选文本(避免单字符匹配)
+            if len(box_text) < 2:
+                continue
+            
+            # 长度比例检查 - 避免长度差异过大的匹配
+            length_ratio = min(len(target_text), len(box_text)) / max(len(target_text), len(box_text))
+            if length_ratio < 0.3:  # 长度差异超过70%则跳过
+                continue
             
             # 精确匹配优先
             if target_text == box_text:
-                return text_boxes[i], i + 1
+                if i >= start_index:
+                    return text_boxes[i], i + 1, i
+                else:
+                    return text_boxes[i], start_index, i
+            
+            # 计算多种相似度
+            partial_ratio = fuzz.partial_ratio(target_text, box_text)
+            token_sort_ratio = fuzz.token_sort_ratio(target_text, box_text)
             
-            # 大于阈值就返回,不找最佳
-            # if similarity > best_similarity and similarity >= self.similarity_threshold:
-            if similarity >= self.similarity_threshold:
-                return text_boxes[i], i + 1
+            # 综合相似度 - 两种算法都要达到阈值
+            if (partial_ratio >= self.similarity_threshold and 
+                token_sort_ratio >= 50):  # token_sort 阈值稍低
+                return text_boxes[i], start_index, last_match_index
 
-        return best_match, best_index
+        return best_match, best_index, last_match_index
 
     def _normalize_text(self, text: str) -> str:
         """标准化文本(去除空格、标点等)"""
@@ -386,8 +424,11 @@ def merge_single_file(mineru_file: Path, paddle_file: Path, output_dir: Path,
         print(f"  ✅ 合并完成")
         print(f"  📊 共处理了 {len(merged_data)} 个对象")
         print(f"  💾 输出文件:")
-        print(f"    - {merged_json_path.name}")
-        
+        if output_format in ['markdown', 'both']:
+            print(f"    - {merged_md_path.name}")
+        if output_format in ['json', 'both']:
+            print(f"    - {merged_json_path.name}")
+
         return True
         
     except Exception as e:
@@ -621,20 +662,20 @@ if __name__ == "__main__":
         
         # 默认配置
         default_config = {
-            "mineru-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/mineru-vlm-2.5.3_Results",
-            "paddle-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/data_PPStructureV3_Results",
-            "output-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/merged_results",
+            "mineru-file": "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/mineru-vlm-2.5.3_Results/B用户_扫描流水_page_001.json",
+            "paddle-file": "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/data_PPStructureV3_Results/B用户_扫描流水_page_001.json",
+            "output-dir": "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/merged_results",
+            # "mineru-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/mineru-vlm-2.5.3_Results",
+            # "paddle-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/data_PPStructureV3_Results",
+            # "output-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/merged_results",
             "format": "both",
             "window": "15",
             "threshold": "85"
         }
         
-        print(f"📂 MinerU 目录: {default_config['mineru-dir']}")
-        print(f"📂 PaddleOCR 目录: {default_config['paddle-dir']}")
-        print(f"📂 输出目录: {default_config['output-dir']}")
-        print(f"⚙️  查找窗口: {default_config['window']}")
-        print(f"⚙️  相似度阈值: {default_config['threshold']}%\n")
-        
+        print("⚙️  默认参数:")
+        for key, value in default_config.items():
+            print(f"  --{key}: {value}")
         # 构造参数
         sys.argv = [sys.argv[0]]
         for key, value in default_config.items():