3 місяців тому · b6b75a00ba
--- a/ocr_utils/pdf_utils.py
+++ b/ocr_utils/pdf_utils.py
@@ -116,7 +116,8 @@ class PDFUtils:
 
				         document_path: Path,
			
 
				         dpi: int = 200,
			
 
				         page_range: Optional[str] = None,
			
 
				-        renderer: str = "fitz"
			
 
				+        renderer: str = "fitz",
			
 
				+        pdf_bytes: Optional[bytes] = None,
			
 
				     ) -> Tuple[List[Dict], str, Optional[Any], str]:
			
 
				         """
			
 
				         加载文档并分类，支持页面范围过滤
			
@@ -128,6 +129,7 @@ class PDFUtils:
 
				                        - PDF：按页码（从1开始）
			
 
				                        - 图片目录：按文件名排序后的位置（从1开始）
			
 
				             renderer: PDF渲染引擎，"fitz" 或 "pypdfium2"
			
 
				+            pdf_bytes: 可选的 PDF 字节数据；若提供则跳过从文件读取（用于内存中预处理后的 PDF）
			
 
				             
			
 
				         Returns:
			
 
				             (images_list, pdf_type, pdf_doc, renderer_used)
			
@@ -177,8 +179,9 @@ class PDFUtils:
 
				             if not MINERU_AVAILABLE:
			
 
				                 raise RuntimeError("MinerU components not available for PDF processing")
			
 
				             
			
 
				-            with open(document_path, 'rb') as f:
			
 
				-                pdf_bytes = f.read()
			
 
				+            if pdf_bytes is None:
			
 
				+                with open(document_path, 'rb') as f:
			
 
				+                    pdf_bytes = f.read()
			
 
				             
			
 
				             # PDF分类
			
 
				             pdf_type = pdf_classify(pdf_bytes)