|
|
@@ -9,7 +9,7 @@ from pathlib import Path
|
|
|
from typing import List, Dict, Tuple, Optional
|
|
|
from bs4 import BeautifulSoup
|
|
|
from fuzzywuzzy import fuzz
|
|
|
-
|
|
|
+import shutil
|
|
|
|
|
|
class MinerUPaddleOCRMerger:
|
|
|
"""合并 MinerU 和 PaddleOCR 的结果"""
|
|
|
@@ -249,7 +249,7 @@ class MinerUPaddleOCRMerger:
|
|
|
return ''.join(result)
|
|
|
|
|
|
def generate_enhanced_markdown(self, merged_data: List[Dict],
|
|
|
- output_path: Optional[str] = None) -> str:
|
|
|
+ output_path: Optional[str] = None, mineru_file: Optional[str] = None) -> str:
|
|
|
"""
|
|
|
生成增强的 Markdown(包含 bbox 信息的注释)
|
|
|
|
|
|
@@ -283,6 +283,16 @@ class MinerUPaddleOCRMerger:
|
|
|
|
|
|
elif item['type'] == 'image':
|
|
|
img_path = item.get('img_path', '')
|
|
|
+ # 需要将minerU图像路径下的图片拷贝到输出目录
|
|
|
+ if img_path and mineru_file:
|
|
|
+ mineru_dir = Path(mineru_file).parent
|
|
|
+ img_full_path = mineru_dir / img_path
|
|
|
+ if img_full_path.exists():
|
|
|
+ # 需要将图片拷贝到输出目录
|
|
|
+ output_img_path = Path(output_path).parent / img_path
|
|
|
+ output_img_path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
+ shutil.copy(img_full_path, output_img_path)
|
|
|
+
|
|
|
bbox = item.get('bbox', [])
|
|
|
if bbox:
|
|
|
md_lines.append(f"<!-- bbox: {bbox} -->")
|
|
|
@@ -365,7 +375,7 @@ def merge_single_file(mineru_file: Path, paddle_file: Path, output_dir: Path,
|
|
|
|
|
|
# 生成 Markdown
|
|
|
if output_format in ['markdown', 'both']:
|
|
|
- merger.generate_enhanced_markdown(merged_data, str(merged_md_path))
|
|
|
+ merger.generate_enhanced_markdown(merged_data, str(merged_md_path), mineru_file)
|
|
|
|
|
|
# 提取单元格信息
|
|
|
# cells = merger.extract_table_cells_with_bbox(merged_data)
|
|
|
@@ -516,7 +526,7 @@ def main():
|
|
|
output_group.add_argument(
|
|
|
'-f', '--format',
|
|
|
choices=['json', 'markdown', 'both'],
|
|
|
- default='json', help='输出格式'
|
|
|
+ default='both', help='输出格式'
|
|
|
)
|
|
|
|
|
|
# 算法参数
|
|
|
@@ -524,13 +534,13 @@ def main():
|
|
|
algo_group.add_argument(
|
|
|
'-w', '--window',
|
|
|
type=int,
|
|
|
- default=10,
|
|
|
+ default=15,
|
|
|
help='向前查找的窗口大小(默认: 10)'
|
|
|
)
|
|
|
algo_group.add_argument(
|
|
|
'-t', '--threshold',
|
|
|
type=int,
|
|
|
- default=80,
|
|
|
+ default=85,
|
|
|
help='文本相似度阈值(0-100,默认: 80)'
|
|
|
)
|
|
|
|