|
|
@@ -13,7 +13,7 @@ except ImportError:
|
|
|
|
|
|
|
|
|
def merge_single_file(paddleocr_vl_file: Path, paddle_file: Path, output_dir: Path,
|
|
|
- output_format: str, merger: PaddleOCRVLMerger) -> bool:
|
|
|
+ output_type: str, merger: PaddleOCRVLMerger) -> bool:
|
|
|
"""
|
|
|
合并单个文件
|
|
|
|
|
|
@@ -34,29 +34,33 @@ def merge_single_file(paddleocr_vl_file: Path, paddle_file: Path, output_dir: Pa
|
|
|
merged_json_path = output_dir / f"{paddleocr_vl_file.stem}.json"
|
|
|
|
|
|
try:
|
|
|
- # 合并数据
|
|
|
+ # ✅ 合并数据 (统一输出为MinerU格式)
|
|
|
merged_data = merger.merge_table_with_bbox(
|
|
|
str(paddleocr_vl_file),
|
|
|
- str(paddle_file)
|
|
|
+ str(paddle_file),
|
|
|
+ data_format='mineru' # 强制使用MinerU格式
|
|
|
)
|
|
|
|
|
|
- # 生成 Markdown
|
|
|
- if output_format in ['markdown', 'both']:
|
|
|
- merger.generate_enhanced_markdown(
|
|
|
- merged_data, str(merged_md_path), str(paddleocr_vl_file)
|
|
|
+ # ✅ 生成 Markdown (基于MinerU格式)
|
|
|
+ if output_type in ['markdown', 'both']:
|
|
|
+ markdown = merger.generate_enhanced_markdown(
|
|
|
+ merged_data,
|
|
|
+ str(merged_md_path),
|
|
|
+ str(paddleocr_vl_file),
|
|
|
+ data_format='mineru' # 强制使用MinerU格式
|
|
|
)
|
|
|
|
|
|
- # 保存 JSON
|
|
|
- if output_format in ['json', 'both']:
|
|
|
+ # ✅ 保存 JSON (MinerU格式)
|
|
|
+ if output_type in ['json', 'both']:
|
|
|
with open(merged_json_path, 'w', encoding='utf-8') as f:
|
|
|
json.dump(merged_data, f, ensure_ascii=False, indent=2)
|
|
|
|
|
|
- print(f" ✅ 合并完成")
|
|
|
+ print(f" ✅ 合并完成 (MinerU格式)")
|
|
|
print(f" 📊 共处理了 {len(merged_data)} 个对象")
|
|
|
print(f" 💾 输出文件:")
|
|
|
- if output_format in ['markdown', 'both']:
|
|
|
+ if output_type in ['markdown', 'both']:
|
|
|
print(f" - {merged_md_path.name}")
|
|
|
- if output_format in ['json', 'both']:
|
|
|
+ if output_type in ['json', 'both']:
|
|
|
print(f" - {merged_json_path.name}")
|
|
|
|
|
|
return True
|
|
|
@@ -69,7 +73,7 @@ def merge_single_file(paddleocr_vl_file: Path, paddle_file: Path, output_dir: Pa
|
|
|
|
|
|
|
|
|
def merge_paddleocr_vl_batch(paddleocr_vl_dir: str, paddle_dir: str, output_dir: str,
|
|
|
- output_format: str = 'both',
|
|
|
+ output_type: str = 'both',
|
|
|
look_ahead_window: int = 10,
|
|
|
similarity_threshold: int = 80):
|
|
|
"""
|
|
|
@@ -113,7 +117,7 @@ def merge_paddleocr_vl_batch(paddleocr_vl_dir: str, paddle_dir: str, output_dir:
|
|
|
failed_count += 1
|
|
|
continue
|
|
|
|
|
|
- if merge_single_file(paddleocr_vl_file, paddle_file, output_path, output_format, merger):
|
|
|
+ if merge_single_file(paddleocr_vl_file, paddle_file, output_path, output_type, merger):
|
|
|
success_count += 1
|
|
|
else:
|
|
|
failed_count += 1
|
|
|
@@ -132,7 +136,7 @@ def merge_paddleocr_vl_batch(paddleocr_vl_dir: str, paddle_dir: str, output_dir:
|
|
|
def main():
|
|
|
"""主函数"""
|
|
|
parser = argparse.ArgumentParser(
|
|
|
- description='合并 PaddleOCR_VL 和 PaddleOCR 的识别结果,添加 bbox 坐标信息',
|
|
|
+ description='合并 PaddleOCR_VL 和 PaddleOCR 的识别结果,统一输出为MinerU格式',
|
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
|
epilog="""
|
|
|
示例用法:
|
|
|
@@ -148,6 +152,10 @@ def main():
|
|
|
--paddleocr-vl-file /path/to/file_page_001.json \\
|
|
|
--paddle-file /path/to/file_page_001.json \\
|
|
|
--output-dir /path/to/output
|
|
|
+
|
|
|
+输出格式说明:
|
|
|
+ - JSON: 统一的MinerU格式JSON文件
|
|
|
+ - Markdown: 基于MinerU格式生成的Markdown文件
|
|
|
"""
|
|
|
)
|
|
|
|
|
|
@@ -185,7 +193,7 @@ def main():
|
|
|
help='输出目录(必需)'
|
|
|
)
|
|
|
output_group.add_argument(
|
|
|
- '-f', '--format',
|
|
|
+ '-f', '--output-type',
|
|
|
choices=['json', 'markdown', 'both'],
|
|
|
default='both',
|
|
|
help='输出格式'
|
|
|
@@ -207,7 +215,7 @@ def main():
|
|
|
)
|
|
|
|
|
|
args = parser.parse_args()
|
|
|
- output_format = args.format.lower()
|
|
|
+ output_type = args.output_type.lower()
|
|
|
|
|
|
# 验证参数
|
|
|
if args.paddleocr_vl_file and args.paddle_file:
|
|
|
@@ -236,7 +244,7 @@ def main():
|
|
|
similarity_threshold=args.threshold
|
|
|
)
|
|
|
|
|
|
- success = merge_single_file(paddleocr_vl_file, paddle_file, output_dir, output_format, merger)
|
|
|
+ success = merge_single_file(paddleocr_vl_file, paddle_file, output_dir, output_type, merger)
|
|
|
|
|
|
if success:
|
|
|
print("\n✅ 处理完成!")
|
|
|
@@ -259,7 +267,7 @@ def main():
|
|
|
args.paddleocr_vl_dir,
|
|
|
args.paddle_dir,
|
|
|
args.output_dir,
|
|
|
- output_format=output_format,
|
|
|
+ output_type=output_type,
|
|
|
look_ahead_window=args.window,
|
|
|
similarity_threshold=args.threshold
|
|
|
)
|
|
|
@@ -270,7 +278,7 @@ def main():
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
- print("🚀 启动 PaddleOCR_VL + PaddleOCR 合并程序...")
|
|
|
+ print("🚀 启动 PaddleOCR_VL + PaddleOCR 合并程序 (统一输出MinerU格式)...")
|
|
|
|
|
|
import sys
|
|
|
|
|
|
@@ -280,7 +288,7 @@ if __name__ == "__main__":
|
|
|
"paddleocr-vl-file": "/Users/zhch158/workspace/data/流水分析/对公_招商银行图/PaddleOCR_VL_Results/对公_招商银行图_page_001.json",
|
|
|
"paddle-file": "/Users/zhch158/workspace/data/流水分析/对公_招商银行图/data_PPStructureV3_Results/对公_招商银行图_page_001.json",
|
|
|
"output-dir": "/Users/zhch158/workspace/data/流水分析/对公_招商银行图/PaddleOCR_VL_Results_cell_bbox",
|
|
|
- "format": "both",
|
|
|
+ "output-type": "both",
|
|
|
"window": "15",
|
|
|
"threshold": "85"
|
|
|
}
|