|
|
@@ -81,10 +81,20 @@ class MinerUPaddleOCRMerger:
|
|
|
|
|
|
def _process_mineru_data(self, mineru_data: List[Dict],
|
|
|
paddle_text_boxes: List[Dict]) -> List[Dict]:
|
|
|
- """处理 MinerU 数据,添加 bbox 信息"""
|
|
|
+ """处理 MinerU 数据,添加 bbox 信息
|
|
|
+
|
|
|
+ Args:
|
|
|
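+ mineru_data (List[Dict]): MinerU items; sorted in place by bbox (top-to-bottom, then left-to-right), with items lacking a 'bbox' key placed last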
|
|
|
+ paddle_text_boxes (List[Dict]): PaddleOCR text boxes that are searched for a bbox matching each item's text
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ List[Dict]: the merged items with bbox information added
|
|
|
+ """
|
|
|
+
|
|
|
merged_data = []
|
|
|
cells = None # 存储所有表格单元格信息
|
|
|
paddle_pointer = 0 # PaddleOCR 文字框指针
|
|
|
+ last_matched_index = 0 # 上次匹配成功的索引
|
|
|
|
|
|
# 对mineru_data按bbox从上到下排序,从左到右确保顺序一致
|
|
|
mineru_data.sort(key=lambda x: (x['bbox'][1], x['bbox'][0]) if 'bbox' in x else (float('inf'), float('inf')))
|
|
|
@@ -103,6 +113,7 @@ class MinerUPaddleOCRMerger:
|
|
|
merged_item['table_body'] = enhanced_html
|
|
|
merged_item['table_body_with_bbox'] = enhanced_html
|
|
|
merged_item['bbox_mapping'] = 'merged_from_paddle_ocr'
|
|
|
+ merged_item['table_cells'] = cells if cells else []
|
|
|
|
|
|
merged_data.append(merged_item)
|
|
|
|
|
|
@@ -112,8 +123,8 @@ class MinerUPaddleOCRMerger:
|
|
|
text = item.get('text', '')
|
|
|
|
|
|
# 查找匹配的 bbox
|
|
|
- matched_bbox, paddle_pointer = self._find_matching_bbox(
|
|
|
- text, paddle_text_boxes, paddle_pointer
|
|
|
+ matched_bbox, paddle_pointer, last_matched_index = self._find_matching_bbox(
|
|
|
+ text, paddle_text_boxes, paddle_pointer, last_matched_index
|
|
|
)
|
|
|
|
|
|
if matched_bbox:
|
|
|
@@ -129,9 +140,6 @@ class MinerUPaddleOCRMerger:
|
|
|
# 其他类型直接复制
|
|
|
merged_data.append(item.copy())
|
|
|
|
|
|
- if cells:
|
|
|
- merged_data.extend(cells)
|
|
|
-
|
|
|
return merged_data
|
|
|
|
|
|
def _enhance_table_html_with_bbox(self, html: str, paddle_text_boxes: List[Dict],
|
|
|
@@ -147,8 +155,10 @@ class MinerUPaddleOCRMerger:
|
|
|
Returns:
|
|
|
(增强后的 HTML, 单元格数组, 新的指针位置)
|
|
|
"""
|
|
|
+ # 需要处理minerU识别为2个连着的cell,如: -741.00|357,259.63, paddle识别为一个cell,如: -741.00357,259.63
|
|
|
soup = BeautifulSoup(html, 'html.parser')
|
|
|
current_pointer = start_pointer
|
|
|
+ last_matched_index = start_pointer
|
|
|
cells = [] # 存储单元格的 bbox 信息
|
|
|
|
|
|
# 遍历所有行
|
|
|
@@ -161,8 +171,8 @@ class MinerUPaddleOCRMerger:
|
|
|
continue
|
|
|
|
|
|
# 查找匹配的 bbox
|
|
|
- matched_bbox, current_pointer = self._find_matching_bbox(
|
|
|
- cell_text, paddle_text_boxes, current_pointer
|
|
|
+ matched_bbox, current_pointer, last_matched_index = self._find_matching_bbox(
|
|
|
+ cell_text, paddle_text_boxes, current_pointer, last_matched_index
|
|
|
)
|
|
|
|
|
|
if matched_bbox:
|
|
|
@@ -187,22 +197,37 @@ class MinerUPaddleOCRMerger:
|
|
|
return str(soup), cells, current_pointer
|
|
|
|
|
|
def _find_matching_bbox(self, target_text: str, text_boxes: List[Dict],
|
|
|
- start_index: int) -> tuple[Optional[Dict], int]:
|
|
|
+ start_index: int, last_match_index: int) -> tuple[Optional[Dict], int, int]:
|
|
|
"""
|
|
|
查找匹配的文字框
|
|
|
|
|
|
Args:
|
|
|
target_text: 目标文本
|
|
|
text_boxes: 文字框列表
|
|
|
- start_index: 起始索引
|
|
|
+ start_index: 起始索引, 是最后一个used=True的位置+1
|
|
|
+ last_match_index: 上次匹配成功的索引, 可能比start_index小
|
|
|
|
|
|
Returns:
|
|
|
- (匹配的文字框信息, 新的指针位置)
|
|
|
+ (匹配的文字框信息, 新的指针位置, last_match_index)
|
|
|
"""
|
|
|
target_text = self._normalize_text(target_text)
|
|
|
+
|
|
|
+ # 过滤过短的目标文本
|
|
|
+ if len(target_text) < 2:
|
|
|
+ return None, start_index, last_match_index
|
|
|
|
|
|
- # 在窗口范围内查找, 窗口是start_index往回移动窗口的1/3到start_index + look_ahead_window
|
|
|
- search_start = max(0, int(start_index - self.look_ahead_window/3))
|
|
|
+ # 由于minerU和Paddle的顺序基本一致, 也有不一致的地方, 所以需要向前找第一个未使用的位置
|
|
|
+ # MinerU和Paddle都可能识别错误,所以需要一个look_ahead_window来避免漏掉匹配
|
|
|
+ search_start = last_match_index - 1
|
|
|
+ unused_count = 0
|
|
|
+ while search_start >= 0:
|
|
|
+ if text_boxes[search_start]['used'] == False:
|
|
|
+ unused_count += 1
|
|
|
+ if unused_count >= self.look_ahead_window:
|
|
|
+ break
|
|
|
+ search_start -= 1
|
|
|
+ if search_start < 0:
|
|
|
+ search_start = 0
|
|
|
search_end = min(start_index + self.look_ahead_window, len(text_boxes))
|
|
|
|
|
|
best_match = None
|
|
|
@@ -214,19 +239,32 @@ class MinerUPaddleOCRMerger:
|
|
|
|
|
|
box_text = self._normalize_text(text_boxes[i]['text'])
|
|
|
|
|
|
- # 计算相似度
|
|
|
- similarity = fuzz.partial_ratio(target_text, box_text)
|
|
|
+ # 过滤过短的候选文本(避免单字符匹配)
|
|
|
+ if len(box_text) < 2:
|
|
|
+ continue
|
|
|
+
|
|
|
+ # 长度比例检查 - 避免长度差异过大的匹配
|
|
|
+ length_ratio = min(len(target_text), len(box_text)) / max(len(target_text), len(box_text))
|
|
|
+ if length_ratio < 0.3: # 长度差异超过70%则跳过
|
|
|
+ continue
|
|
|
|
|
|
# 精确匹配优先
|
|
|
if target_text == box_text:
|
|
|
- return text_boxes[i], i + 1
|
|
|
+ if i >= start_index:
|
|
|
+ return text_boxes[i], i + 1, i
|
|
|
+ else:
|
|
|
+ return text_boxes[i], start_index, i
|
|
|
+
|
|
|
+ # 计算多种相似度
|
|
|
+ partial_ratio = fuzz.partial_ratio(target_text, box_text)
|
|
|
+ token_sort_ratio = fuzz.token_sort_ratio(target_text, box_text)
|
|
|
|
|
|
- # 大于阈值就返回,不找最佳
|
|
|
- # if similarity > best_similarity and similarity >= self.similarity_threshold:
|
|
|
- if similarity >= self.similarity_threshold:
|
|
|
- return text_boxes[i], i + 1
|
|
|
+ # 综合相似度 - 两种算法都要达到阈值
|
|
|
+ if (partial_ratio >= self.similarity_threshold and
|
|
|
+ token_sort_ratio >= 50): # token_sort 阈值稍低
|
|
|
+ return text_boxes[i], start_index, last_match_index
|
|
|
|
|
|
- return best_match, best_index
|
|
|
+ return best_match, best_index, last_match_index
|
|
|
|
|
|
def _normalize_text(self, text: str) -> str:
|
|
|
"""标准化文本(去除空格、标点等)"""
|
|
|
@@ -386,8 +424,11 @@ def merge_single_file(mineru_file: Path, paddle_file: Path, output_dir: Path,
|
|
|
print(f" ✅ 合并完成")
|
|
|
print(f" 📊 共处理了 {len(merged_data)} 个对象")
|
|
|
print(f" 💾 输出文件:")
|
|
|
- print(f" - {merged_json_path.name}")
|
|
|
-
|
|
|
+ if output_format in ['markdown', 'both']:
|
|
|
+ print(f" - {merged_md_path.name}")
|
|
|
+ if output_format in ['json', 'both']:
|
|
|
+ print(f" - {merged_json_path.name}")
|
|
|
+
|
|
|
return True
|
|
|
|
|
|
except Exception as e:
|
|
|
@@ -621,20 +662,20 @@ if __name__ == "__main__":
|
|
|
|
|
|
# 默认配置
|
|
|
default_config = {
|
|
|
- "mineru-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/mineru-vlm-2.5.3_Results",
|
|
|
- "paddle-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/data_PPStructureV3_Results",
|
|
|
- "output-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/merged_results",
|
|
|
+ "mineru-file": "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/mineru-vlm-2.5.3_Results/B用户_扫描流水_page_001.json",
|
|
|
+ "paddle-file": "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/data_PPStructureV3_Results/B用户_扫描流水_page_001.json",
|
|
|
+ "output-dir": "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/merged_results",
|
|
|
+ # "mineru-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/mineru-vlm-2.5.3_Results",
|
|
|
+ # "paddle-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/data_PPStructureV3_Results",
|
|
|
+ # "output-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/merged_results",
|
|
|
"format": "both",
|
|
|
"window": "15",
|
|
|
"threshold": "85"
|
|
|
}
|
|
|
|
|
|
- print(f"📂 MinerU 目录: {default_config['mineru-dir']}")
|
|
|
- print(f"📂 PaddleOCR 目录: {default_config['paddle-dir']}")
|
|
|
- print(f"📂 输出目录: {default_config['output-dir']}")
|
|
|
- print(f"⚙️ 查找窗口: {default_config['window']}")
|
|
|
- print(f"⚙️ 相似度阈值: {default_config['threshold']}%\n")
|
|
|
-
|
|
|
+ print("⚙️ 默认参数:")
|
|
|
+ for key, value in default_config.items():
|
|
|
+ print(f" --{key}: {value}")
|
|
|
# 构造参数
|
|
|
sys.argv = [sys.argv[0]]
|
|
|
for key, value in default_config.items():
|