Explorar el Código

refactor: Implement lazy loading for PDFUtils and extract_pdf_pages

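- Changed the import strategy for PDFUtils and extract_pdf_pages to lazy loading (a module-level `__getattr__` in ocr_utils/__init__.py), so that importing ocr_utils no longer triggers the MinerU import check in PaddleX environments; the check now runs only when one of these two names is first accessed.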
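- Added a new parse_page_range function in file_utils for page range parsing. It takes over the implementation that previously lived in PDFUtils, for better modularity; PDFUtils.parse_page_range is kept as a thin backward-compatible wrapper that delegates to it.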
- Updated relevant modules to utilize the new parse_page_range function, ensuring consistent page range handling across the codebase.
zhch158_admin hace 1 semana
padre
commit
499120d8c4
Se han modificado 4 ficheros con 181 adiciones y 92 borrados
  1. 20 3
      ocr_utils/__init__.py
  2. 149 50
      ocr_utils/file_utils.py
  3. 2 2
      ocr_utils/pdf_extractor.py
  4. 10 37
      ocr_utils/pdf_utils.py

+ 20 - 3
ocr_utils/__init__.py

@@ -8,13 +8,14 @@ OCR 工具包
 - 数字标准化工具
 """
 
-from .pdf_utils import PDFUtils
+# PDFUtils 和 extract_pdf_pages 使用延迟导入,避免在 PaddleX 环境中触发 MinerU 导入检查
+# from .pdf_utils import PDFUtils  # 已移除,改为延迟导入
+# from .pdf_extractor import extract_pdf_pages  # 已移除,改为延迟导入(因为它依赖 PDFUtils)
 from .json_formatters import JSONFormatters
 from .markdown_generator import MarkdownGenerator
 from .html_generator import HTMLGenerator
 from .visualization_utils import VisualizationUtils
 from .output_formatter_v2 import OutputFormatterV2, save_mineru_format
-from .pdf_extractor import extract_pdf_pages
 from .normalize_financial_numbers import (
     normalize_financial_numbers,
     normalize_json_table,
@@ -29,7 +30,8 @@ from .file_utils import (
     get_image_files_from_csv,
     convert_pdf_to_images,
     split_files,
-    create_temp_file_list
+    create_temp_file_list,
+    parse_page_range
 )
 from .log_utils import setup_logging
 
@@ -62,6 +64,7 @@ __all__ = [
     'convert_pdf_to_images',
     'split_files',
     'create_temp_file_list',
+    'parse_page_range',
     # 日志工具
     'setup_logging',
 ]
@@ -69,3 +72,17 @@ __all__ = [
 __version__ = "1.0.0"
 __author__ = "zhch158"
 
+
+def __getattr__(name: str):
+    """
+    延迟导入 PDFUtils 和 extract_pdf_pages,只有在实际使用时才触发 MinerU 导入检查。
+    这样可以在 PaddleX 环境中正常导入 ocr_utils,即使 MinerU 不可用。
+    """
+    if name == 'PDFUtils':
+        from .pdf_utils import PDFUtils
+        return PDFUtils
+    elif name == 'extract_pdf_pages':
+        from .pdf_extractor import extract_pdf_pages
+        return extract_pdf_pages
+    raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
+

+ 149 - 50
ocr_utils/file_utils.py

@@ -7,8 +7,9 @@
 - 文件列表处理
 """
 import tempfile
+import re
 from pathlib import Path
-from typing import List, Tuple
+from typing import List, Tuple, Optional, Set
 import json
 import traceback
 from loguru import logger
@@ -23,6 +24,60 @@ except ImportError:
     ImageType = None
 
 
+def parse_page_range(page_range: Optional[str], total_pages: int) -> Set[int]:
+    """
+    解析页面范围字符串
+    
+    支持格式:
+    - "1-5" → {0, 1, 2, 3, 4}(页码从1开始,内部转为0-based索引)
+    - "3" → {2}
+    - "1-5,7,9-12" → {0, 1, 2, 3, 4, 6, 8, 9, 10, 11}
+    - "1-" → 从第1页到最后
+    - "-5" → 从第1页到第5页
+    
+    Args:
+        page_range: 页面范围字符串(页码从1开始)
+        total_pages: 总页数
+        
+    Returns:
+        页面索引集合(0-based)
+    """
+    if not page_range or not page_range.strip():
+        return set(range(total_pages))
+    
+    pages = set()
+    parts = page_range.replace(' ', '').split(',')
+    
+    for part in parts:
+        part = part.strip()
+        if not part:
+            continue
+        
+        if '-' in part:
+            # 范围格式
+            match = re.match(r'^(\d*)-(\d*)$', part)
+            if match:
+                start_str, end_str = match.groups()
+                start = int(start_str) if start_str else 1
+                end = int(end_str) if end_str else total_pages
+                
+                # 转换为 0-based 索引
+                start = max(0, start - 1)
+                end = min(total_pages, end)
+                
+                pages.update(range(start, end))
+        else:
+            # 单页
+            try:
+                page_num = int(part)
+                if 1 <= page_num <= total_pages:
+                    pages.add(page_num - 1)  # 转换为 0-based 索引
+            except ValueError:
+                logger.warning(f"Invalid page number: {part}")
+    
+    return pages
+
+
 def split_files(file_list: List[str], num_splits: int) -> List[List[str]]:
     """
     将文件列表分割成指定数量的子列表
@@ -235,62 +290,107 @@ def convert_pdf_to_images(
     output_path.mkdir(parents=True, exist_ok=True)
 
     try:
-        # 使用MinerU的函数加载PDF图像
-        if not MINERU_AVAILABLE or load_images_from_pdf is None or ImageType is None:
-            logger.error("❌ MinerU components not available for PDF to image conversion")
-            return []
-        
-        images, _ = load_images_from_pdf(
-            pdf_path.read_bytes(),
-            dpi=dpi,
-            image_type=ImageType.PIL  # 返回包含 img_pil 的字典列表
-        )
-        
-        # 应用页面范围过滤
-        selected_pages = None
-        if page_range:
-            from .pdf_utils import PDFUtils
-            total_pages = len(images)
-            selected_pages = PDFUtils.parse_page_range(page_range, total_pages)
-            if selected_pages:
-                images = [images[i] for i in sorted(selected_pages)]
-                logger.info(f"📋 PDF 共 {total_pages} 页,选择处理 {len(images)} 页")
+        # 优先使用 MinerU 的函数(如果可用)
+        if MINERU_AVAILABLE and load_images_from_pdf is not None and ImageType is not None:
+            images, _ = load_images_from_pdf(
+                pdf_path.read_bytes(),
+                dpi=dpi,
+                image_type=ImageType.PIL  # 返回包含 img_pil 的字典列表
+            )
+            
+            # 应用页面范围过滤
+            selected_pages = None
+            if page_range:
+                total_pages = len(images)
+                selected_pages = parse_page_range(page_range, total_pages)
+                if selected_pages:
+                    images = [images[i] for i in sorted(selected_pages)]
+                    logger.info(f"📋 PDF 共 {total_pages} 页,选择处理 {len(images)} 页")
+                else:
+                    logger.warning(f"⚠️ 页面范围 '{page_range}' 没有匹配到任何有效页面")
+                    return []
             else:
-                logger.warning(f"⚠️ 页面范围 '{page_range}' 没有匹配到任何有效页面")
-                return []
-        
-        image_paths = []
-        # 需要跟踪原始页码索引,以便正确命名文件
-        original_indices = sorted(selected_pages) if selected_pages else list(range(len(images)))
-        
-        for idx, image in enumerate(images):
-            # 获取原始页码索引(用于文件命名)
-            original_idx = original_indices[idx] if selected_pages else idx
-            # 生成图像文件名(使用原始页码,从1开始)
-            image_filename = f"{pdf_path.stem}_page_{original_idx + 1:03d}.png"
-            image_path = output_path / image_filename
+                selected_pages = None
+            
+            image_paths = []
+            # 需要跟踪原始页码索引,以便正确命名文件
+            original_indices = sorted(selected_pages) if selected_pages else list(range(len(images)))
+            
+            for idx, image in enumerate(images):
+                # 获取原始页码索引(用于文件命名)
+                original_idx = original_indices[idx] if selected_pages else idx
+                # 生成图像文件名(使用原始页码,从1开始)
+                image_filename = f"{pdf_path.stem}_page_{original_idx + 1:03d}.png"
+                image_path = output_path / image_filename
 
-            # 保存图像 - 从字典中提取 img_pil
-            if isinstance(image, dict):
-                pil_image = image.get('img_pil')
-                if pil_image is None:
-                    logger.error(f"❌ Image dict at index {idx} does not contain 'img_pil' key")
-                    continue
-                pil_image.save(str(image_path))
-            else:
-                # 如果不是字典,假设是直接的 PIL Image
-                image.save(str(image_path))
-            image_paths.append(str(image_path))
+                # 保存图像 - 从字典中提取 img_pil
+                if isinstance(image, dict):
+                    pil_image = image.get('img_pil')
+                    if pil_image is None:
+                        logger.error(f"❌ Image dict at index {idx} does not contain 'img_pil' key")
+                        continue
+                    pil_image.save(str(image_path))
+                else:
+                    # 如果不是字典,假设是直接的 PIL Image
+                    image.save(str(image_path))
+                image_paths.append(str(image_path))
             
-        logger.info(f"✅ Converted {len(images)} pages from {pdf_path.name} to images")
-        return image_paths
+            logger.info(f"✅ Converted {len(images)} pages from {pdf_path.name} to images (using MinerU)")
+            return image_paths
+        
+        else:
+            # Fallback: 使用 pypdfium2(PaddleX 环境中可用)
+            logger.info("ℹ️  MinerU 不可用,使用 pypdfium2 进行 PDF 转图像")
+            try:
+                import pypdfium2 as pdfium
+            except ImportError:
+                logger.error("❌ pypdfium2 未安装,无法转换 PDF。请安装: pip install pypdfium2")
+                return []
+            
+            pdf_doc = pdfium.PdfDocument(pdf_path)
+            try:
+                total_pages = len(pdf_doc)
+                
+                # 解析页面范围(使用本地函数,不依赖 PDFUtils)
+                selected_pages = parse_page_range(page_range, total_pages)
+                if not selected_pages:
+                    logger.warning(f"⚠️ 页面范围 '{page_range}' 没有匹配到任何有效页面")
+                    return []
+                
+                if page_range:
+                    logger.info(f"📋 PDF 共 {total_pages} 页,选择处理 {len(selected_pages)} 页")
+                
+                # 计算缩放比例(DPI 转换)
+                # pypdfium2 的 scale 参数:1.0 = 72 DPI,所以 dpi/72 = scale
+                scale = dpi / 72.0
+                
+                image_paths = []
+                for page_idx in sorted(selected_pages):
+                    page = pdf_doc[page_idx]
+                    
+                    # 渲染页面为图像
+                    bitmap = page.render(scale=scale)
+                    pil_image = bitmap.to_pil()
+                    
+                    # 生成图像文件名(页码从1开始)
+                    image_filename = f"{pdf_path.stem}_page_{page_idx + 1:03d}.png"
+                    image_path = output_path / image_filename
+                    
+                    # 保存图像
+                    pil_image.save(str(image_path))
+                    image_paths.append(str(image_path))
+                
+                logger.info(f"✅ Converted {len(image_paths)} pages from {pdf_path.name} to images (using pypdfium2)")
+                return image_paths
+                
+            finally:
+                pdf_doc.close()
         
     except Exception as e:
         logger.error(f"❌ Error converting PDF {pdf_path}: {e}")
         traceback.print_exc()
         return []
 
-
 def get_input_files(args, page_range: str | None = None) -> List[str]:
     """
     获取输入文件列表,统一处理PDF和图像文件,支持页面范围过滤
@@ -356,9 +456,8 @@ def get_input_files(args, page_range: str | None = None) -> List[str]:
         
         # 对于图片目录,应用页面范围过滤
         if page_range and image_files:
-            from .pdf_utils import PDFUtils
             total_pages = len(image_files)
-            selected_pages = PDFUtils.parse_page_range(page_range, total_pages)
+            selected_pages = parse_page_range(page_range, total_pages)
             if selected_pages:
                 image_files = [image_files[i] for i in sorted(selected_pages)]
                 logger.info(f"📋 图片目录共 {total_pages} 张,选择处理 {len(image_files)} 张")

+ 2 - 2
ocr_utils/pdf_extractor.py

@@ -23,7 +23,7 @@ except ImportError:
     pdfium = None
 
 from loguru import logger
-from .pdf_utils import PDFUtils
+from .file_utils import parse_page_range
 
 
 def extract_pdf_pages(
@@ -197,7 +197,7 @@ def main():
         sys.exit(1)
     
     # 解析页面范围
-    page_set = PDFUtils.parse_page_range(args.pages, total_pages)
+    page_set = parse_page_range(args.pages, total_pages)
     page_indices = sorted(list(page_set))
     
     if not page_indices:

+ 10 - 37
ocr_utils/pdf_utils.py

@@ -13,6 +13,9 @@ from PIL import Image
 from loguru import logger
 import re
 
+# 导入页面范围解析函数(不依赖 MinerU)
+from .file_utils import parse_page_range
+
 # 导入 MinerU 组件
 try:
     from mineru.utils.pdf_classify import classify as pdf_classify
@@ -29,7 +32,10 @@ class PDFUtils:
     @staticmethod
     def parse_page_range(page_range: Optional[str], total_pages: int) -> Set[int]:
         """
-        解析页面范围字符串
+        解析页面范围字符串(向后兼容包装函数)
+        
+        此方法是对 file_utils.parse_page_range 的包装,保持向后兼容性。
+        新代码应直接使用 file_utils.parse_page_range。
         
         支持格式:
         - "1-5" → {0, 1, 2, 3, 4}(页码从1开始,内部转为0-based索引)
@@ -45,40 +51,7 @@ class PDFUtils:
         Returns:
             页面索引集合(0-based)
         """
-        if not page_range or not page_range.strip():
-            return set(range(total_pages))
-        
-        pages = set()
-        parts = page_range.replace(' ', '').split(',')
-        
-        for part in parts:
-            part = part.strip()
-            if not part:
-                continue
-            
-            if '-' in part:
-                # 范围格式
-                match = re.match(r'^(\d*)-(\d*)$', part)
-                if match:
-                    start_str, end_str = match.groups()
-                    start = int(start_str) if start_str else 1
-                    end = int(end_str) if end_str else total_pages
-                    
-                    # 转换为 0-based 索引
-                    start = max(0, start - 1)
-                    end = min(total_pages, end)
-                    
-                    pages.update(range(start, end))
-            else:
-                # 单页
-                try:
-                    page_num = int(part)
-                    if 1 <= page_num <= total_pages:
-                        pages.add(page_num - 1)  # 转换为 0-based 索引
-                except ValueError:
-                    logger.warning(f"Invalid page number: {part}")
-        
-        return pages
+        return parse_page_range(page_range, total_pages)
     
     @staticmethod
     def load_and_classify_document(
@@ -116,7 +89,7 @@ class PDFUtils:
             
             # 解析页面范围
             total_pages = len(image_files)
-            selected_pages = PDFUtils.parse_page_range(page_range, total_pages)
+            selected_pages = parse_page_range(page_range, total_pages)
             
             if page_range:
                 logger.info(f"📋 图片目录共 {total_pages} 张,选择处理 {len(selected_pages)} 张")
@@ -159,7 +132,7 @@ class PDFUtils:
             
             # 解析页面范围
             total_pages = len(images_list)
-            selected_pages = PDFUtils.parse_page_range(page_range, total_pages)
+            selected_pages = parse_page_range(page_range, total_pages)
             
             if page_range:
                 logger.info(f"📋 PDF 共 {total_pages} 页,选择处理 {len(selected_pages)} 页")