6 месяцев назад · db1a81a141
--- a/ocr_utils/pdf_utils.py
+++ b/ocr_utils/pdf_utils.py
@@ -52,13 +52,41 @@ class PDFUtils:
 
				             页面索引集合（0-based）
			
 
				         """
			
 
				         return parse_page_range(page_range, total_pages)
			
 
				+
			
 
    @staticmethod
    def _detect_pdf_doc_type(pdf_doc: Any) -> str:
        """
        Identify which PDF backend a document object belongs to.

        Signals are checked from most to least reliable: the module that
        defines the object's class, then the class name, then duck-typed
        attributes.

        Args:
            pdf_doc: PDF document object.

        Returns:
            'pypdfium2' or 'fitz'. An object that matches no signal at all is
            reported as 'pypdfium2'.
        """
        doc_cls = type(pdf_doc)
        module_name = (doc_cls.__module__ or '').lower()
        class_name = doc_cls.__name__

        # The defining module is the strongest signal. PyMuPDF can be imported
        # as either `fitz` (legacy name) or `pymupdf`, so accept both.
        if 'pdfium' in module_name:
            return 'pypdfium2'
        if 'fitz' in module_name or 'pymupdf' in module_name:
            return 'fitz'

        # Class-name fallback. 'PdfDocument' has to be tested first because it
        # also contains the substring 'Document'.
        if 'PdfDocument' in class_name:
            return 'pypdfium2'
        if 'Document' in class_name:
            return 'fitz'

        # Duck-typing fallback for wrappers/proxies. `page_count` and
        # `load_page` belong to the PyMuPDF Document API; `get_page` is a
        # pypdfium2 PdfDocument method, so it must not be read as a fitz marker.
        if hasattr(pdf_doc, 'page_count') or hasattr(pdf_doc, 'load_page'):
            return 'fitz'
        return 'pypdfium2'
			
 
				     
			
 
				     @staticmethod
			
 
				     def load_and_classify_document(
			
 
				         document_path: Path,
			
 
				         dpi: int = 200,
			
 
				-        page_range: Optional[str] = None
			
 
				-    ) -> Tuple[List[Dict], str, Optional[Any]]:
			
 
				+        page_range: Optional[str] = None,
			
 
				+        renderer: str = "fitz"  # 新增参数，默认 fitz
			
 
				+    ) -> Tuple[List[Dict], str, Optional[Any], str]:
			
 
				         """
			
 
				         加载文档并分类，支持页面范围过滤
			
 
				         
			
@@ -68,12 +96,14 @@ class PDFUtils:
 
				             page_range: 页面范围字符串，如 "1-5,7,9-12"
			
 
				                        - PDF：按页码（从1开始）
			
 
				                        - 图片目录：按文件名排序后的位置（从1开始）
			
 
				+            renderer: PDF渲染引擎，"fitz" 或 "pypdfium2"
			
 
				             
			
 
				         Returns:
			
 
				             (images_list, pdf_type, pdf_doc)
			
 
				             - images_list: 图像列表，每个元素包含 {'img_pil': PIL.Image, 'scale': float, 'page_idx': int}
			
 
				             - pdf_type: 'ocr' 或 'txt'
			
 
				-            - pdf_doc: PDF文档对象（如果是PDF）
			
 
				+            - pdf_doc: PDF文档对象（如果PDF）
			
 
				+            - renderer_used: 实际使用的渲染器类型
			
 
				         """
			
 
				         pdf_doc = None
			
 
				         pdf_type = 'ocr'  # 默认使用OCR模式
			
@@ -128,7 +158,7 @@ class PDFUtils:
 
				                 pdf_bytes, 
			
 
				                 dpi=dpi,
			
 
				                 image_type=ImageType.PIL,
			
 
				-                renderer='fitz'
			
 
				+                renderer=renderer   # 使用指定的渲染引擎
			
 
				             )
			
 
				             
			
 
				             # 解析页面范围
			
@@ -167,7 +197,7 @@ class PDFUtils:
 
				         else:
			
 
				             raise ValueError(f"Unsupported file format: {document_path.suffix}")
			
 
				         
			
 
				-        return all_images, pdf_type, pdf_doc
			
 
				+        return all_images, pdf_type, pdf_doc, renderer
			
 
				     
			
 
				     @staticmethod
			
 
				     def extract_text_from_pdf(
			
@@ -177,10 +207,10 @@ class PDFUtils:
 
				         scale: float
			
 
				     ) -> Tuple[str, bool]:
			
 
				         """
			
 
				-        从PDF直接提取文本（使用 MinerU 的 pypdfium2 方式）
			
 
				+        从PDF直接提取文本（支持 pypdfium2 和 fitz）
			
 
				         
			
 
				         Args:
			
 
				-            pdf_doc: pypdfium2 的 PdfDocument 对象
			
 
				+            pdf_doc: PDF文档对象 (pypdfium2.PdfDocument 或 fitz.Document)
			
 
				             page_idx: 页码索引
			
 
				             bbox: 目标区域的bbox（图像坐标）
			
 
				             scale: 图像与PDF的缩放比例
			
@@ -188,8 +218,24 @@ class PDFUtils:
 
				         Returns:
			
 
				             (text, success)
			
 
				         """
			
 
				+        # 检测 PDF 文档类型
			
 
				+        doc_type = PDFUtils._detect_pdf_doc_type(pdf_doc)
			
 
				+        
			
 
				+        if doc_type == 'fitz':
			
 
				+            return PDFUtils._extract_text_from_pdf_fitz(pdf_doc, page_idx, bbox, scale)
			
 
				+        else:  # pypdfium2
			
 
				+            return PDFUtils._extract_text_from_pdf_pypdfium2(pdf_doc, page_idx, bbox, scale)
			
 
				+    
			
 
				+    @staticmethod
			
 
				+    def _extract_text_from_pdf_pypdfium2(
			
 
				+        pdf_doc: Any,
			
 
				+        page_idx: int,
			
 
				+        bbox: List[float],
			
 
				+        scale: float
			
 
				+    ) -> Tuple[str, bool]:
			
 
				+        """使用 pypdfium2 提取文本（原有实现）"""
			
 
				         if not MINERU_AVAILABLE or pdf_get_page_text is None:
			
 
				-            logger.debug("MinerU pdf_text_tool not available")
			
 
				+            logger.error("MinerU pdf_text_tool not available")
			
 
				             return "", False
			
 
				             
			
 
				         try:
			
@@ -212,13 +258,12 @@ class PDFUtils:
 
				                 for line in block.get('lines', []):
			
 
				                     line_bbox = line.get('bbox')
			
 
				                     if line_bbox and hasattr(line_bbox, 'bbox'):
			
 
				-                        line_bbox = line_bbox.bbox  # pdftext 的 BBox 对象
			
 
				+                        line_bbox = line_bbox.bbox
			
 
				                     elif isinstance(line_bbox, (list, tuple)) and len(line_bbox) >= 4:
			
 
				                         line_bbox = list(line_bbox)
			
 
				                     else:
			
 
				                         continue
			
 
				                     
			
 
				-                    # 检查 line 是否与目标 bbox 重叠
			
 
				                     if PDFUtils._bbox_overlap(pdf_bbox, line_bbox):
			
 
				                         for span in line.get('spans', []):
			
 
				                             span_text = span.get('text', '')
			
@@ -230,11 +275,187 @@ class PDFUtils:
 
				             
			
 
				         except Exception as e:
			
 
				             import traceback
			
 
				-            logger.debug(f"PDF text extraction error: {e}")
			
 
				+            logger.debug(f"pypdfium2 text extraction error: {e}")
			
 
				             logger.debug(traceback.format_exc())
			
 
				             return "", False
			
 
				     
			
 
				     @staticmethod
			
 
				+    def _extract_text_from_pdf_fitz(
			
 
				+        pdf_doc: Any,
			
 
				+        page_idx: int,
			
 
				+        bbox: List[float],
			
 
				+        scale: float
			
 
				+    ) -> Tuple[str, bool]:
			
 
				+        """使用 fitz 提取文本"""
			
 
				+        try:
			
 
				+            import fitz
			
 
				+        except ImportError:
			
 
				+            logger.error("PyMuPDF (fitz) not available")
			
 
				+            return "", False
			
 
				+        
			
 
				+        try:
			
 
				+            page = pdf_doc[page_idx]
			
 
				+            
			
 
				+            # 将图像坐标转换为PDF坐标
			
 
				+            pdf_bbox = fitz.Rect(
			
 
				+                bbox[0] / scale,
			
 
				+                bbox[1] / scale,
			
 
				+                bbox[2] / scale,
			
 
				+                bbox[3] / scale
			
 
				+            )
			
 
				+            
			
 
				+            # 提取区域内的文本
			
 
				+            text = page.get_text("text", clip=pdf_bbox)
			
 
				+            
			
 
				+            return text.strip(), bool(text.strip())
			
 
				+            
			
 
				+        except Exception as e:
			
 
				+            import traceback
			
 
				+            logger.debug(f"fitz text extraction error: {e}")
			
 
				+            logger.debug(traceback.format_exc())
			
 
				+            return "", False
			
 
				+    
			
 
    @staticmethod
    def extract_all_text_blocks(
        pdf_doc: Any,
        page_idx: int,
        scale: float
    ) -> List[Dict[str, Any]]:
        """
        Extract every text block of a page (pypdfium2 and fitz documents).

        The backend is chosen with PDFUtils._detect_pdf_doc_type and the work
        is delegated to the matching backend-specific extractor.

        Args:
            pdf_doc: PDF document object.
            page_idx: Page index.
            scale: Scale factor handed through to the extractor.

        Returns:
            List of text blocks [{'text': str, 'bbox': [x1, y1, x2, y2]}, ...]
        """
        if PDFUtils._detect_pdf_doc_type(pdf_doc) == 'fitz':
            extractor = PDFUtils._extract_all_text_blocks_fitz
        else:
            # Anything not detected as fitz is handled as pypdfium2.
            extractor = PDFUtils._extract_all_text_blocks_pypdfium2
        return extractor(pdf_doc, page_idx, scale)
			
 
				+    
			
 
    @staticmethod
    def _extract_all_text_blocks_pypdfium2(
        pdf_doc: Any,
        page_idx: int,
        scale: float
    ) -> List[Dict[str, Any]]:
        """
        Extract all text lines of a page through pdf_get_page_text.

        Args:
            pdf_doc: Document object indexable by page number.
            page_idx: Page index.
            scale: Factor that line bbox coordinates are multiplied by to
                obtain the 'bbox' entry of each result.

        Returns:
            One dict per non-blank line with keys 'text', 'bbox' (scaled
            coordinates) and 'origin_bbox' (unscaled coordinates). An empty
            list when MinerU's text tool is unavailable or extraction raises.
        """
        mineru_ready = MINERU_AVAILABLE and pdf_get_page_text is not None
        if not mineru_ready:
            return []

        try:
            page_dict = pdf_get_page_text(pdf_doc[page_idx])

            # Walk every line of every block, lazily and in document order.
            all_lines = (
                ln
                for blk in page_dict.get('blocks', [])
                for ln in blk.get('lines', [])
            )

            results = []
            for ln in all_lines:
                content = ""
                for piece in ln.get('spans', []):
                    content += piece.get('text', "")

                # Whitespace-only lines are dropped.
                if not content.strip():
                    continue

                raw_bbox = ln.get('bbox')
                if raw_bbox and hasattr(raw_bbox, 'bbox'):
                    # Wrapper object that carries its coordinates in `.bbox`.
                    coords = raw_bbox.bbox
                elif not isinstance(raw_bbox, (list, tuple)) or len(raw_bbox) < 4:
                    continue
                else:
                    coords = list(raw_bbox)

                scaled = [coords[i] * scale for i in range(4)]
                results.append({
                    'text': content,
                    'bbox': scaled,
                    'origin_bbox': coords
                })

            return results

        except Exception as e:
            logger.warning(f"pypdfium2 extract_all_text_blocks failed: {e}")
            import traceback
            logger.debug(traceback.format_exc())
            return []
			
 
				+    
			
 
    @staticmethod
    def _extract_all_text_blocks_fitz(
        pdf_doc: Any,
        page_idx: int,
        scale: float
    ) -> List[Dict[str, Any]]:
        """
        Extract all text lines of a page using fitz.

        Args:
            pdf_doc: Document object indexable by page number; the page it
                yields must provide get_text().
            page_idx: Page index.
            scale: Factor that line bbox coordinates are multiplied by to
                obtain the 'bbox' entry of each result.

        Returns:
            One dict per non-blank line with keys 'text', 'bbox' (scaled
            coordinates) and 'origin_bbox' (unscaled coordinates, as a list).
            An empty list when fitz cannot be imported or extraction raises.
        """
        try:
            import fitz
        except ImportError:
            logger.warning("PyMuPDF (fitz) not available")
            return []

        try:
            text_dict = pdf_doc[page_idx].get_text("dict")

            # Keep only blocks whose type is 0 (text) and walk their lines
            # lazily, in document order.
            text_lines = (
                ln
                for blk in text_dict.get("blocks", [])
                if blk.get("type") == 0
                for ln in blk.get("lines", [])
            )

            results = []
            for ln in text_lines:
                coords = ln.get("bbox")

                content = ""
                for piece in ln.get("spans", []):
                    content += piece.get("text", "")

                # Skip blank lines and lines without a bounding box.
                if not (content.strip() and coords):
                    continue

                # PDF coordinates -> image coordinates.
                scaled = [coords[i] * scale for i in range(4)]
                results.append({
                    'text': content,
                    'bbox': scaled,
                    'origin_bbox': list(coords)
                })

            return results

        except Exception as e:
            logger.warning(f"fitz extract_all_text_blocks failed: {e}")
            import traceback
            logger.debug(traceback.format_exc())
            return []
			
 
				+
			
 
				+    @staticmethod
			
 
				     def _bbox_overlap(bbox1: List[float], bbox2: List[float]) -> bool:
			
 
				         """检查两个 bbox 是否重叠"""
			
 
				         if len(bbox1) < 4 or len(bbox2) < 4: