|
|
@@ -201,7 +201,11 @@ class EnhancedDocPipeline:
|
|
|
处理结果字典
|
|
|
"""
|
|
|
doc_path = Path(document_path)
|
|
|
+ doc_name = doc_path.stem
|
|
|
|
|
|
+ # 判断输入类型
|
|
|
+ is_pdf = doc_path.suffix.lower() == '.pdf'
|
|
|
+
|
|
|
results = {
|
|
|
'scene': self.scene_name,
|
|
|
'document_path': str(doc_path),
|
|
|
@@ -219,6 +223,11 @@ class EnhancedDocPipeline:
|
|
|
results['metadata']['pdf_type'] = pdf_type
|
|
|
results['metadata']['page_count'] = len(images)
|
|
|
results['metadata']['page_range'] = page_range
|
|
|
+
|
|
|
+ # 保存文档元数据
|
|
|
+ self.doc_name = doc_name
|
|
|
+ self.is_pdf = is_pdf
|
|
|
+ self.total_pages = len(images)
|
|
|
|
|
|
logger.info(f"📄 Loaded {len(images)} pages, type: {pdf_type}")
|
|
|
|
|
|
@@ -227,6 +236,12 @@ class EnhancedDocPipeline:
|
|
|
# 使用原始页码索引(支持页面范围过滤)
|
|
|
page_idx = image_dict.get('page_idx', idx)
|
|
|
page_name = image_dict.get('page_name', f'page_{page_idx + 1:03d}')
|
|
|
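+ # When the input is a PDF and config output.save_pdf_images is enabled, save this page as a PNG under <output_dir>/<doc_name>/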
|
|
|
+ if is_pdf and self.config.get('output', {}).get('save_pdf_images', False):
|
|
|
+ image_path = Path(output_dir).resolve() / f"{doc_name}" / f"{doc_name}_page_{page_idx + 1:03d}.png"
|
|
|
+ image_path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
+ image_dict.get('img_pil').save(image_path)
|
|
|
+
|
|
|
logger.info(f"🔍 Processing page {idx + 1}/{len(images)} (original index: {page_idx})")
|
|
|
|
|
|
page_result = self._process_single_page(
|