Sfoglia il codice sorgente

feat: 添加图像拷贝功能,支持将MinerU图像复制到输出目录并更新Markdown生成函数参数

zhch158_admin 1 mese fa
parent
commit
fb37465548
1 file modificato con 16 aggiunte e 6 eliminazioni
  1. 16 6
      merge_mineru_paddle_ocr.py

+ 16 - 6
merge_mineru_paddle_ocr.py

@@ -9,7 +9,7 @@ from pathlib import Path
 from typing import List, Dict, Tuple, Optional
 from bs4 import BeautifulSoup
 from fuzzywuzzy import fuzz
-
+import shutil
 
 class MinerUPaddleOCRMerger:
     """合并 MinerU 和 PaddleOCR 的结果"""
@@ -249,7 +249,7 @@ class MinerUPaddleOCRMerger:
         return ''.join(result)
     
     def generate_enhanced_markdown(self, merged_data: List[Dict], 
-                                   output_path: Optional[str] = None) -> str:
+                                   output_path: Optional[str] = None, mineru_file: Optional[str] = None) -> str:
         """
         生成增强的 Markdown(包含 bbox 信息的注释)
         
@@ -283,6 +283,16 @@ class MinerUPaddleOCRMerger:
             
             elif item['type'] == 'image':
                 img_path = item.get('img_path', '')
+                # 需要将minerU图像路径下的图片拷贝到输出目录
+                if img_path and mineru_file:
+                    mineru_dir = Path(mineru_file).parent
+                    img_full_path = mineru_dir / img_path
+                    if img_full_path.exists():
+                        # 需要将图片拷贝到输出目录
+                        output_img_path = Path(output_path).parent / img_path
+                        output_img_path.parent.mkdir(parents=True, exist_ok=True)
+                        shutil.copy(img_full_path, output_img_path)
+
                 bbox = item.get('bbox', [])
                 if bbox:
                     md_lines.append(f"<!-- bbox: {bbox} -->")
@@ -365,7 +375,7 @@ def merge_single_file(mineru_file: Path, paddle_file: Path, output_dir: Path,
         
         # 生成 Markdown
         if output_format in ['markdown', 'both']:
-            merger.generate_enhanced_markdown(merged_data, str(merged_md_path))
+            merger.generate_enhanced_markdown(merged_data, str(merged_md_path), mineru_file)
         
         # 提取单元格信息
         # cells = merger.extract_table_cells_with_bbox(merged_data)
@@ -516,7 +526,7 @@ def main():
     output_group.add_argument(
         '-f', '--format', 
         choices=['json', 'markdown', 'both'], 
-        default='json', help='输出格式'
+        default='both', help='输出格式'
     )
 
     # 算法参数
@@ -524,13 +534,13 @@ def main():
     algo_group.add_argument(
         '-w', '--window',
         type=int,
-        default=10,
+        default=15,
         help='向前查找的窗口大小(默认: 10)'
     )
     algo_group.add_argument(
         '-t', '--threshold',
         type=int,
-        default=80,
+        default=85,
         help='文本相似度阈值(0-100,默认: 80)'
     )