|
@@ -13,7 +13,7 @@ except ImportError:
|
|
|
|
|
|
|
|
|
|
|
|
|
def merge_single_file(mineru_file: Path, paddle_file: Path, output_dir: Path,
|
|
def merge_single_file(mineru_file: Path, paddle_file: Path, output_dir: Path,
|
|
|
- output_format: str, merger: MinerUPaddleOCRMerger) -> bool:
|
|
|
|
|
|
|
+ output_type: str, merger: MinerUPaddleOCRMerger) -> bool:
|
|
|
"""
|
|
"""
|
|
|
合并单个文件
|
|
合并单个文件
|
|
|
|
|
|
|
@@ -40,20 +40,20 @@ def merge_single_file(mineru_file: Path, paddle_file: Path, output_dir: Path,
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
# 生成 Markdown
|
|
# 生成 Markdown
|
|
|
- if output_format in ['markdown', 'both']:
|
|
|
|
|
|
|
+ if output_type in ['markdown', 'both']:
|
|
|
merger.generate_enhanced_markdown(merged_data, str(merged_md_path), mineru_file)
|
|
merger.generate_enhanced_markdown(merged_data, str(merged_md_path), mineru_file)
|
|
|
|
|
|
|
|
# 保存 JSON
|
|
# 保存 JSON
|
|
|
- if output_format in ['json', 'both']:
|
|
|
|
|
|
|
+ if output_type in ['json', 'both']:
|
|
|
with open(merged_json_path, 'w', encoding='utf-8') as f:
|
|
with open(merged_json_path, 'w', encoding='utf-8') as f:
|
|
|
json.dump(merged_data, f, ensure_ascii=False, indent=2)
|
|
json.dump(merged_data, f, ensure_ascii=False, indent=2)
|
|
|
|
|
|
|
|
print(f" ✅ 合并完成")
|
|
print(f" ✅ 合并完成")
|
|
|
print(f" 📊 共处理了 {len(merged_data)} 个对象")
|
|
print(f" 📊 共处理了 {len(merged_data)} 个对象")
|
|
|
print(f" 💾 输出文件:")
|
|
print(f" 💾 输出文件:")
|
|
|
- if output_format in ['markdown', 'both']:
|
|
|
|
|
|
|
+ if output_type in ['markdown', 'both']:
|
|
|
print(f" - {merged_md_path.name}")
|
|
print(f" - {merged_md_path.name}")
|
|
|
- if output_format in ['json', 'both']:
|
|
|
|
|
|
|
+ if output_type in ['json', 'both']:
|
|
|
print(f" - {merged_json_path.name}")
|
|
print(f" - {merged_json_path.name}")
|
|
|
|
|
|
|
|
return True
|
|
return True
|
|
@@ -66,7 +66,7 @@ def merge_single_file(mineru_file: Path, paddle_file: Path, output_dir: Path,
|
|
|
|
|
|
|
|
|
|
|
|
|
def merge_mineru_paddle_batch(mineru_dir: str, paddle_dir: str, output_dir: str,
|
|
def merge_mineru_paddle_batch(mineru_dir: str, paddle_dir: str, output_dir: str,
|
|
|
- output_format: str = 'both',
|
|
|
|
|
|
|
+ output_type: str = 'both',
|
|
|
look_ahead_window: int = 10,
|
|
look_ahead_window: int = 10,
|
|
|
similarity_threshold: int = 80):
|
|
similarity_threshold: int = 80):
|
|
|
"""
|
|
"""
|
|
@@ -109,7 +109,7 @@ def merge_mineru_paddle_batch(mineru_dir: str, paddle_dir: str, output_dir: str,
|
|
|
failed_count += 1
|
|
failed_count += 1
|
|
|
continue
|
|
continue
|
|
|
|
|
|
|
|
- if merge_single_file(mineru_file, paddle_file, output_path, output_format, merger):
|
|
|
|
|
|
|
+ if merge_single_file(mineru_file, paddle_file, output_path, output_type, merger):
|
|
|
success_count += 1
|
|
success_count += 1
|
|
|
else:
|
|
else:
|
|
|
failed_count += 1
|
|
failed_count += 1
|
|
@@ -190,7 +190,7 @@ def main():
|
|
|
help='输出目录(必需)'
|
|
help='输出目录(必需)'
|
|
|
)
|
|
)
|
|
|
output_group.add_argument(
|
|
output_group.add_argument(
|
|
|
- '-f', '--format',
|
|
|
|
|
|
|
+ '-f', '--output-type',
|
|
|
choices=['json', 'markdown', 'both'],
|
|
choices=['json', 'markdown', 'both'],
|
|
|
default='both', help='输出格式'
|
|
default='both', help='输出格式'
|
|
|
)
|
|
)
|
|
@@ -211,7 +211,7 @@ def main():
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
args = parser.parse_args()
|
|
args = parser.parse_args()
|
|
|
- output_format = args.format.lower()
|
|
|
|
|
|
|
+ output_type = args.output_type.lower()
|
|
|
|
|
|
|
|
# 验证参数
|
|
# 验证参数
|
|
|
if args.mineru_file and args.paddle_file:
|
|
if args.mineru_file and args.paddle_file:
|
|
@@ -242,7 +242,7 @@ def main():
|
|
|
similarity_threshold=args.threshold
|
|
similarity_threshold=args.threshold
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
- success = merge_single_file(mineru_file, paddle_file, output_dir, output_format, merger)
|
|
|
|
|
|
|
+ success = merge_single_file(mineru_file, paddle_file, output_dir, output_type, merger)
|
|
|
|
|
|
|
|
if success:
|
|
if success:
|
|
|
print("\n✅ 处理完成!")
|
|
print("\n✅ 处理完成!")
|
|
@@ -265,7 +265,7 @@ def main():
|
|
|
args.mineru_dir,
|
|
args.mineru_dir,
|
|
|
args.paddle_dir,
|
|
args.paddle_dir,
|
|
|
args.output_dir,
|
|
args.output_dir,
|
|
|
- output_format=output_format,
|
|
|
|
|
|
|
+ output_type=output_type,
|
|
|
look_ahead_window=args.window,
|
|
look_ahead_window=args.window,
|
|
|
similarity_threshold=args.threshold
|
|
similarity_threshold=args.threshold
|
|
|
)
|
|
)
|
|
@@ -293,7 +293,7 @@ if __name__ == "__main__":
|
|
|
# "mineru-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/mineru-vlm-2.5.3_Results",
|
|
# "mineru-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/mineru-vlm-2.5.3_Results",
|
|
|
# "paddle-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/data_PPStructureV3_Results",
|
|
# "paddle-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/data_PPStructureV3_Results",
|
|
|
# "output-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/merged_results",
|
|
# "output-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/merged_results",
|
|
|
- "format": "both",
|
|
|
|
|
|
|
+ "output-type": "both",
|
|
|
"window": "15",
|
|
"window": "15",
|
|
|
"threshold": "85"
|
|
"threshold": "85"
|
|
|
}
|
|
}
|