ソースを参照

fix: 添加PDF文档处理时保存页面图像的功能

zhch158_admin 6 時間 前
コミット
95e2272ed9

+ 15 - 0
ocr_tools/universal_doc_parser/core/pipeline_manager_v2.py

@@ -201,7 +201,11 @@ class EnhancedDocPipeline:
             处理结果字典
         """
         doc_path = Path(document_path)
+        doc_name = doc_path.stem
         
+        # 判断输入类型
+        is_pdf = doc_path.suffix.lower() == '.pdf'        
+
         results = {
             'scene': self.scene_name,
             'document_path': str(doc_path),
@@ -219,6 +223,11 @@ class EnhancedDocPipeline:
             results['metadata']['pdf_type'] = pdf_type
             results['metadata']['page_count'] = len(images)
             results['metadata']['page_range'] = page_range
+
+            # 保存文档元数据
+            self.doc_name = doc_name
+            self.is_pdf = is_pdf
+            self.total_pages = len(images)            
             
             logger.info(f"📄 Loaded {len(images)} pages, type: {pdf_type}")
             
@@ -227,6 +236,12 @@ class EnhancedDocPipeline:
                 # 使用原始页码索引(支持页面范围过滤)
                 page_idx = image_dict.get('page_idx', idx)
                 page_name = image_dict.get('page_name', f'page_{page_idx + 1:03d}')
+                # 若输入为 PDF 且配置 output.save_pdf_images 已启用,则将当前页面保存为 PNG 图片
+                if is_pdf and self.config.get('output', {}).get('save_pdf_images', False):
+                    image_path = Path(output_dir).resolve() / f"{doc_name}" / f"{doc_name}_page_{page_idx + 1:03d}.png"
+                    image_path.parent.mkdir(parents=True, exist_ok=True)
+                    image_dict.get('img_pil').save(image_path)
+                
                 logger.info(f"🔍 Processing page {idx + 1}/{len(images)} (original index: {page_idx})")
                 
                 page_result = self._process_single_page(

+ 5 - 0
ocr_tools/universal_doc_parser/core/pipeline_manager_v2_streaming.py

@@ -148,6 +148,11 @@ class StreamingDocPipeline(EnhancedDocPipeline):
             for idx, image_dict in enumerate(images):
                 page_idx = image_dict.get('page_idx', idx)
                 page_name = image_dict.get('page_name', f'page_{page_idx + 1:03d}')
+                # 若输入为 PDF 且输出配置 save_pdf_images 已启用,则将当前页面保存为 PNG 图片
+                if is_pdf and output_config.get('save_pdf_images', False):
+                    image_path = self.output_dir / f"{doc_name}" / f"{doc_name}_page_{page_idx + 1:03d}.png"
+                    image_path.parent.mkdir(parents=True, exist_ok=True)
+                    image_dict.get('img_pil').save(image_path)
                 
                 logger.info(f"🔍 Processing page {idx + 1}/{len(images)} (original index: {page_idx})")