|
|
@@ -14,7 +14,7 @@ from fuzzywuzzy import fuzz
|
|
|
class MinerUPaddleOCRMerger:
|
|
|
"""合并 MinerU 和 PaddleOCR 的结果"""
|
|
|
|
|
|
- def __init__(self, look_ahead_window: int = 10, similarity_threshold: int = 80):
|
|
|
+ def __init__(self, look_ahead_window: int = 10, similarity_threshold: int = 90):
|
|
|
"""
|
|
|
Args:
|
|
|
look_ahead_window: 向前查找的窗口大小
|
|
|
@@ -85,7 +85,10 @@ class MinerUPaddleOCRMerger:
|
|
|
merged_data = []
|
|
|
cells = None # 存储所有表格单元格信息
|
|
|
paddle_pointer = 0 # PaddleOCR 文字框指针
|
|
|
-
|
|
|
+
|
|
|
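+ # Sort mineru_data in place by bbox: top-to-bottom first, then left-to-right, so the processing order is consistent; items without a bbox sort last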
|
|
|
+ mineru_data.sort(key=lambda x: (x['bbox'][1], x['bbox'][0]) if 'bbox' in x else (float('inf'), float('inf')))
|
|
|
+
|
|
|
for item in mineru_data:
|
|
|
if item['type'] == 'table':
|
|
|
# 处理表格
|
|
|
@@ -148,34 +151,38 @@ class MinerUPaddleOCRMerger:
|
|
|
current_pointer = start_pointer
|
|
|
cells = [] # 存储单元格的 bbox 信息
|
|
|
|
|
|
- # 遍历所有单元格
|
|
|
- for cell in soup.find_all(['td', 'th']):
|
|
|
- cell_text = cell.get_text(strip=True)
|
|
|
+ # 遍历所有行
|
|
|
+ for row_idx, row in enumerate(soup.find_all('tr')):
|
|
|
+ # 遍历所有单元格
|
|
|
+ for col_idx, cell in enumerate(row.find_all(['td', 'th'])):
|
|
|
+ cell_text = cell.get_text(strip=True)
|
|
|
|
|
|
- if not cell_text:
|
|
|
- continue
|
|
|
-
|
|
|
- # 查找匹配的 bbox
|
|
|
- matched_bbox, current_pointer = self._find_matching_bbox(
|
|
|
- cell_text, paddle_text_boxes, current_pointer
|
|
|
- )
|
|
|
-
|
|
|
- if matched_bbox:
|
|
|
- # 添加 data-bbox 属性
|
|
|
- bbox = matched_bbox['bbox']
|
|
|
- cell['data-bbox'] = f"[{bbox[0]},{bbox[1]},{bbox[2]},{bbox[3]}]"
|
|
|
- cell['data-score'] = f"{matched_bbox['score']:.4f}"
|
|
|
- cell['data-paddle-index'] = str(matched_bbox['paddle_bbox_index'])
|
|
|
+ if not cell_text:
|
|
|
+ continue
|
|
|
+
|
|
|
+ # 查找匹配的 bbox
|
|
|
+ matched_bbox, current_pointer = self._find_matching_bbox(
|
|
|
+ cell_text, paddle_text_boxes, current_pointer
|
|
|
+ )
|
|
|
+
|
|
|
+ if matched_bbox:
|
|
|
+ # 添加 data-bbox 属性
|
|
|
+ bbox = matched_bbox['bbox']
|
|
|
+ cell['data-bbox'] = f"[{bbox[0]},{bbox[1]},{bbox[2]},{bbox[3]}]"
|
|
|
+ cell['data-score'] = f"{matched_bbox['score']:.4f}"
|
|
|
+ cell['data-paddle-index'] = str(matched_bbox['paddle_bbox_index'])
|
|
|
|
|
|
- cells.append({
|
|
|
- 'type': 'table_cell',
|
|
|
- 'text': cell_text,
|
|
|
- 'bbox': bbox,
|
|
|
- 'score': matched_bbox['score'],
|
|
|
- 'paddle_bbox_index': matched_bbox['paddle_bbox_index']
|
|
|
- })
|
|
|
- # 标记为已使用
|
|
|
- matched_bbox['used'] = True
|
|
|
+ cells.append({
|
|
|
+ 'type': 'table_cell',
|
|
|
+ 'text': cell_text,
|
|
|
+ 'bbox': bbox,
|
|
|
+ 'row': row_idx+1,
|
|
|
+ 'col': col_idx+1,
|
|
|
+ 'score': matched_bbox['score'],
|
|
|
+ 'paddle_bbox_index': matched_bbox['paddle_bbox_index']
|
|
|
+ })
|
|
|
+ # 标记为已使用
|
|
|
+ matched_bbox['used'] = True
|
|
|
|
|
|
return str(soup), cells, current_pointer
|
|
|
|
|
|
@@ -208,17 +215,16 @@ class MinerUPaddleOCRMerger:
|
|
|
box_text = self._normalize_text(text_boxes[i]['text'])
|
|
|
|
|
|
# 计算相似度
|
|
|
- similarity = fuzz.token_set_ratio(target_text, box_text)
|
|
|
+ similarity = fuzz.partial_ratio(target_text, box_text)
|
|
|
|
|
|
# 精确匹配优先
|
|
|
if target_text == box_text:
|
|
|
return text_boxes[i], i + 1
|
|
|
|
|
|
- # 记录最佳匹配
|
|
|
- if similarity > best_similarity and similarity >= self.similarity_threshold:
|
|
|
- best_similarity = similarity
|
|
|
- best_match = text_boxes[i]
|
|
|
- best_index = i + 1
|
|
|
+ # Return the first box whose similarity reaches the threshold instead of searching for the best match
|
|
|
+ # if similarity > best_similarity and similarity >= self.similarity_threshold:
|
|
|
+ if similarity >= self.similarity_threshold:
|
|
|
+ return text_boxes[i], i + 1
|
|
|
|
|
|
return best_match, best_index
|
|
|
|
|
|
@@ -271,10 +277,16 @@ class MinerUPaddleOCRMerger:
|
|
|
md_lines.append(f"{text}\n")
|
|
|
|
|
|
elif item['type'] == 'table':
|
|
|
- md_lines.append("\n## 表格\n")
|
|
|
md_lines.append("<!-- 表格单元格包含 data-bbox 属性 -->\n")
|
|
|
md_lines.append(item.get('table_body_with_bbox', item.get('table_body', '')))
|
|
|
md_lines.append("\n")
|
|
|
+
|
|
|
+ elif item['type'] == 'image':
|
|
|
+ img_path = item.get('img_path', '')
|
|
|
+ bbox = item.get('bbox', [])
|
|
|
+ if bbox:
|
|
|
+ md_lines.append(f"<!-- bbox: {bbox} -->")
|
|
|
+ md_lines.append(f"\n")
|
|
|
|
|
|
markdown_content = '\n'.join(md_lines)
|
|
|
|
|
|
@@ -325,7 +337,7 @@ class MinerUPaddleOCRMerger:
|
|
|
|
|
|
|
|
|
def merge_single_file(mineru_file: Path, paddle_file: Path, output_dir: Path,
|
|
|
- merger: MinerUPaddleOCRMerger) -> bool:
|
|
|
+ output_format: str, merger: MinerUPaddleOCRMerger) -> bool:
|
|
|
"""
|
|
|
合并单个文件
|
|
|
|
|
|
@@ -341,6 +353,7 @@ def merge_single_file(mineru_file: Path, paddle_file: Path, output_dir: Path,
|
|
|
print(f"📄 处理: {mineru_file.name}")
|
|
|
|
|
|
# 输出文件路径
|
|
|
+ merged_md_path = output_dir / f"{mineru_file.stem}.md"
|
|
|
merged_json_path = output_dir / f"{mineru_file.stem}.json"
|
|
|
|
|
|
try:
|
|
|
@@ -351,13 +364,14 @@ def merge_single_file(mineru_file: Path, paddle_file: Path, output_dir: Path,
|
|
|
)
|
|
|
|
|
|
# 生成 Markdown
|
|
|
- # merger.generate_enhanced_markdown(merged_data, str(merged_md_path))
|
|
|
+ if output_format in ['markdown', 'both']:
|
|
|
+ merger.generate_enhanced_markdown(merged_data, str(merged_md_path))
|
|
|
|
|
|
# 提取单元格信息
|
|
|
# cells = merger.extract_table_cells_with_bbox(merged_data)
|
|
|
-
|
|
|
- with open(merged_json_path, 'w', encoding='utf-8') as f:
|
|
|
- json.dump(merged_data, f, ensure_ascii=False, indent=2)
|
|
|
+ if output_format in ['json', 'both']:
|
|
|
+ with open(merged_json_path, 'w', encoding='utf-8') as f:
|
|
|
+ json.dump(merged_data, f, ensure_ascii=False, indent=2)
|
|
|
|
|
|
print(f" ✅ 合并完成")
|
|
|
print(f" 📊 共处理了 {len(merged_data)} 个对象")
|
|
|
@@ -373,7 +387,7 @@ def merge_single_file(mineru_file: Path, paddle_file: Path, output_dir: Path,
|
|
|
return False
|
|
|
|
|
|
|
|
|
-def merge_mineru_paddle_batch(mineru_dir: str, paddle_dir: str, output_dir: str,
|
|
|
+def merge_mineru_paddle_batch(mineru_dir: str, paddle_dir: str, output_dir: str, output_format: str = 'both',
|
|
|
look_ahead_window: int = 10,
|
|
|
similarity_threshold: int = 80):
|
|
|
"""
|
|
|
@@ -418,8 +432,8 @@ def merge_mineru_paddle_batch(mineru_dir: str, paddle_dir: str, output_dir: str,
|
|
|
print(f"⚠️ 跳过: 未找到对应的 PaddleOCR 文件: {paddle_file.name}\n")
|
|
|
failed_count += 1
|
|
|
continue
|
|
|
-
|
|
|
- if merge_single_file(mineru_file, paddle_file, output_path, merger):
|
|
|
+
|
|
|
+ if merge_single_file(mineru_file, paddle_file, output_path, output_format, merger):
|
|
|
success_count += 1
|
|
|
else:
|
|
|
failed_count += 1
|
|
|
@@ -499,7 +513,12 @@ def main():
|
|
|
required=True,
|
|
|
help='输出目录(必需)'
|
|
|
)
|
|
|
-
|
|
|
+ output_group.add_argument(
|
|
|
+ '-f', '--format',
|
|
|
+ choices=['json', 'markdown', 'both'],
|
|
|
+ default='json', help='输出格式'
|
|
|
+ )
|
|
|
+
|
|
|
# 算法参数
|
|
|
algo_group = parser.add_argument_group('算法参数')
|
|
|
algo_group.add_argument(
|
|
|
@@ -516,6 +535,7 @@ def main():
|
|
|
)
|
|
|
|
|
|
args = parser.parse_args()
|
|
|
+ output_format = args.format.lower()
|
|
|
|
|
|
# 验证参数
|
|
|
if args.mineru_file and args.paddle_file:
|
|
|
@@ -546,7 +566,7 @@ def main():
|
|
|
similarity_threshold=args.threshold
|
|
|
)
|
|
|
|
|
|
- success = merge_single_file(mineru_file, paddle_file, output_dir, merger)
|
|
|
+ success = merge_single_file(mineru_file, paddle_file, output_dir, output_format, merger)
|
|
|
|
|
|
if success:
|
|
|
print("\n✅ 处理完成!")
|
|
|
@@ -569,6 +589,7 @@ def main():
|
|
|
args.mineru_dir,
|
|
|
args.paddle_dir,
|
|
|
args.output_dir,
|
|
|
+ output_format=output_format,
|
|
|
look_ahead_window=args.window,
|
|
|
similarity_threshold=args.threshold
|
|
|
)
|
|
|
@@ -590,9 +611,10 @@ if __name__ == "__main__":
|
|
|
|
|
|
# 默认配置
|
|
|
default_config = {
|
|
|
- "mineru-dir": "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/mineru-vlm-2.5.3_Results",
|
|
|
- "paddle-dir": "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/data_PPStructureV3_Results",
|
|
|
- "output-dir": "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/merged_results",
|
|
|
+ "mineru-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/mineru-vlm-2.5.3_Results",
|
|
|
+ "paddle-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/data_PPStructureV3_Results",
|
|
|
+ "output-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/merged_results",
|
|
|
+ "format": "both",
|
|
|
"window": "15",
|
|
|
"threshold": "85"
|
|
|
}
|