|
|
@@ -116,7 +116,8 @@ class PDFUtils:
|
|
|
document_path: Path,
|
|
|
dpi: int = 200,
|
|
|
page_range: Optional[str] = None,
|
|
|
- renderer: str = "fitz"
|
|
|
+ renderer: str = "fitz",
|
|
|
+ pdf_bytes: Optional[bytes] = None,
|
|
|
) -> Tuple[List[Dict], str, Optional[Any], str]:
|
|
|
"""
|
|
|
加载文档并分类,支持页面范围过滤
|
|
|
@@ -128,6 +129,7 @@ class PDFUtils:
|
|
|
- PDF:按页码(从1开始)
|
|
|
- 图片目录:按文件名排序后的位置(从1开始)
|
|
|
renderer: PDF渲染引擎,"fitz" 或 "pypdfium2"
|
|
|
+ pdf_bytes: 可选的 PDF 字节数据;若提供则跳过从文件读取(用于内存中预处理后的 PDF)
|
|
|
|
|
|
Returns:
|
|
|
(images_list, pdf_type, pdf_doc, renderer_used)
|
|
|
@@ -177,8 +179,9 @@ class PDFUtils:
|
|
|
if not MINERU_AVAILABLE:
|
|
|
raise RuntimeError("MinerU components not available for PDF processing")
|
|
|
|
|
|
- with open(document_path, 'rb') as f:
|
|
|
- pdf_bytes = f.read()
|
|
|
+ if pdf_bytes is None:
|
|
|
+ with open(document_path, 'rb') as f:
|
|
|
+ pdf_bytes = f.read()
|
|
|
|
|
|
# PDF分类
|
|
|
pdf_type = pdf_classify(pdf_bytes)
|