Эх сурвалжийг харах

feat: 实现PDF文件转换为图像的功能,统一处理PDF和图像输入

zhch158_admin 2 сар өмнө
parent
commit
9e128d998e

+ 158 - 65
zhch/ppstructurev3_single_process.py

@@ -1,4 +1,4 @@
-"""单进程运行稳定"""
+"""PDF转图像后统一处理"""
 import json
 import time
 import os
@@ -7,7 +7,7 @@ import argparse
 import sys
 import warnings
 from pathlib import Path
-from typing import List, Dict, Any
+from typing import List, Dict, Any, Union
 import cv2
 import numpy as np
 
@@ -27,24 +27,127 @@ from utils import (
     get_image_files_from_dir,
     get_image_files_from_list,
     get_image_files_from_csv,
-    collect_pid_files
+    collect_pid_files,
+    load_images_from_pdf
 )
 
-def process_images_single_process(image_paths: List[str],
-                                pipeline_name: str = "PP-StructureV3",
-                                device: str = "gpu:0",
-                                output_dir: str = "./output") -> List[Dict[str, Any]]:
+def convert_pdf_to_images(pdf_file: str, output_dir: Union[str, None] = None, dpi: int = 200) -> List[str]:
     """
-    单进程版本的图像处理函数
+    Render each page of a PDF to a PNG file and return the PNG paths.
+
+    Pages are written into a sub-directory named after the PDF stem, as
+    ``<stem>_page_NNN.png`` with a 1-based, zero-padded page number.
     
     Args:
-        image_paths: 图像路径列表
-        pipeline_name: Pipeline名称
-        device: 设备字符串,如"gpu:0"或"cpu"
-        output_dir: 输出目录
+        pdf_file: Path to the PDF file.
+        output_dir: Parent directory for the per-PDF sub-directory; when
+            None, the directory containing the PDF is used.
+        dpi: Resolution forwarded to ``load_images_from_pdf``.
         
     Returns:
-        处理结果列表
+        Paths of the written images in page order; an empty list when the
+        input is not an existing ``.pdf`` file or the conversion fails.
+    """
+    pdf_path = Path(pdf_file)
+    # is_file() rather than exists(): a directory whose name ends in ".pdf"
+    # is not a convertible input and should be rejected up front.
+    if not pdf_path.is_file() or pdf_path.suffix.lower() != '.pdf':
+        print(f"❌ Invalid PDF file: {pdf_path}")
+        return []
+
+    # Default to a directory next to the PDF, named after its stem.
+    base_dir = pdf_path.parent if output_dir is None else Path(output_dir)
+    output_path = (base_dir / pdf_path.stem).resolve()
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    try:
+        # NOTE(review): the returned page objects are assumed to expose a
+        # PIL-style ``save(path)`` method -- confirm against utils.
+        images = load_images_from_pdf(str(pdf_path), dpi=dpi)
+
+        image_paths = []
+        for page_number, image in enumerate(images, start=1):
+            image_path = output_path / f"{pdf_path.stem}_page_{page_number:03d}.png"
+            image.save(str(image_path))
+            image_paths.append(str(image_path))
+
+        # Count what was actually written; this also works if the loader
+        # returns an iterator instead of a sized sequence.
+        print(f"✅ Converted {len(image_paths)} pages from {pdf_path.name} to images")
+        return image_paths
+
+    except Exception as e:
+        # Best effort: one unreadable PDF must not abort the whole batch.
+        print(f"❌ Error converting PDF {pdf_path}: {e}")
+        traceback.print_exc()
+        return []
+
+def get_input_files(args) -> List[str]:
+    """
+    Build the list of image paths to process from the CLI arguments.
+
+    Exactly one source is read: ``args.input_csv`` (rows with status
+    "fail"), ``args.input_file_list``, or every supported file directly
+    inside ``args.input_dir``. PDF inputs are expanded into one image per
+    page via ``convert_pdf_to_images``; image inputs are passed through.
+
+    Args:
+        args: Parsed command-line arguments. Reads ``input_csv``,
+            ``input_file_list``, ``input_dir``, ``output_dir`` and
+            ``pdf_dpi``.
+
+    Returns:
+        Image file paths ready for the pipeline; an empty list when the
+        input directory is missing or nothing usable was found.
+    """
+    # Resolve the raw input list from the selected source.
+    if args.input_csv:
+        raw_files = get_image_files_from_csv(args.input_csv, "fail")
+    elif args.input_file_list:
+        raw_files = get_image_files_from_list(args.input_file_list)
+    else:
+        input_dir = Path(args.input_dir).resolve()
+        if not input_dir.is_dir():
+            print(f"❌ Input directory does not exist: {input_dir}")
+            return []
+
+        # Match on the lower-cased suffix instead of globbing each extension
+        # in lower and upper case: globbing twice returns every file twice on
+        # case-insensitive filesystems and misses mixed-case names (".Pdf").
+        supported_suffixes = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif', '.pdf'}
+        # Sorted so that runs (and the --test_mode slice) are reproducible.
+        raw_files = sorted(
+            str(entry)
+            for entry in input_dir.iterdir()
+            if entry.is_file() and entry.suffix.lower() in supported_suffixes
+        )
+
+    input_files = []
+    pdf_count = 0
+    pdf_skipped = 0
+    image_count = 0
+
+    for raw_file in raw_files:
+        file_path = Path(raw_file)
+
+        if file_path.suffix.lower() == '.pdf':
+            print(f"📄 Processing PDF: {file_path.name}")
+            pdf_images = convert_pdf_to_images(
+                str(file_path),
+                args.output_dir,
+                dpi=args.pdf_dpi
+            )
+            # An empty result means the PDF was invalid, failed to convert,
+            # or had no pages; do not report it as processed.
+            if pdf_images:
+                input_files.extend(pdf_images)
+                pdf_count += 1
+            else:
+                pdf_skipped += 1
+        elif file_path.exists():
+            input_files.append(str(file_path))
+            image_count += 1
+        else:
+            # Entries from a CSV or file list may be stale; say so instead
+            # of dropping them silently.
+            print(f"⚠️ Skipping missing input file: {file_path}")
+
+    print(f"📊 Input summary:")
+    print(f"  PDF files processed: {pdf_count}")
+    if pdf_skipped:
+        print(f"  PDF files skipped (no pages produced): {pdf_skipped}")
+    print(f"  Image files found: {image_count}")
+    print(f"  Total image files to process: {len(input_files)}")
+
+    return input_files
+
+def process_images_unified(image_paths: List[str],
+                         pipeline_name: str = "PP-StructureV3",
+                         device: str = "gpu:0",
+                         output_dir: str = "./output") -> List[Dict[str, Any]]:
+    """
+    统一的图像处理函数(修改自ppstructurev3_single_process.py)
     """
     # 创建输出目录
     output_path = Path(output_dir)
@@ -81,12 +184,12 @@ def process_images_single_process(image_paths: List[str],
             try:
                 # 使用pipeline预测单个图像
                 results = pipeline.predict(
-                    img_path,  # 传入单个文件路径
+                    img_path,
                     use_doc_orientation_classify=True,
                     use_doc_unwarping=False,
                     use_seal_recognition=True,
                     use_table_recognition=True,
-                    use_formula_recognition=False,  # 暂时关闭公式识别以避免错误
+                    use_formula_recognition=False,
                     use_chart_recognition=True,
                 )
                 
@@ -117,7 +220,8 @@ def process_images_single_process(image_paths: List[str],
                             "success": True,
                             "device": device,
                             "output_json": json_output_path,
-                            "output_md": md_output_path
+                            "output_md": md_output_path,
+                            "is_pdf_page": "_page_" in input_path.name  # 标记是否为PDF页面
                         })
                         
                     except Exception as e:
@@ -157,102 +261,91 @@ def process_images_single_process(image_paths: List[str],
     
     return all_results
 
-
 def main():
     """主函数"""
-    parser = argparse.ArgumentParser(description="PaddleX PP-StructureV3 Single Process Processing")
+    parser = argparse.ArgumentParser(description="PaddleX PP-StructureV3 Unified PDF/Image Processor")
     
     # 参数定义
     input_group = parser.add_mutually_exclusive_group(required=True)
-    input_group.add_argument("--input_dir", type=str, help="Input directory")
+    input_group.add_argument("--input_dir", type=str, help="Input directory (supports both PDF and image files)")
     input_group.add_argument("--input_file_list", type=str, help="Input file list (one file per line)")
     input_group.add_argument("--input_csv", type=str, help="Input CSV file with image_path and status columns")
 
-    parser.add_argument("--output_dir", type=str, help="Output directory")
+    parser.add_argument("--output_dir", type=str, required=True, help="Output directory")
     parser.add_argument("--pipeline", type=str, default="PP-StructureV3", help="Pipeline name")
     parser.add_argument("--device", type=str, default="gpu:0", help="Device string (e.g., 'gpu:0', 'cpu')")
-    parser.add_argument("--input_pattern", type=str, default="*", help="Input file pattern")
-    parser.add_argument("--test_mode", action="store_true", help="Test mode (process only 20 images)")
-    parser.add_argument("--collect_results",type=str, help="收集处理结果到指定CSV文件")
+    parser.add_argument("--pdf_dpi", type=int, default=200, help="DPI for PDF to image conversion")
+    parser.add_argument("--test_mode", action="store_true", help="Test mode (process only 20 files)")
+    parser.add_argument("--collect_results", type=str, help="收集处理结果到指定CSV文件")
 
     args = parser.parse_args()
     
     try:
-        # 获取图像文件列表
-        if args.input_csv:
-            # 从CSV文件读取
-            image_files = get_image_files_from_csv(args.input_csv, "fail")
-            print(f"📊 Loaded {len(image_files)} files from CSV with status filter: fail")
-        elif args.input_file_list:
-            # 从文件列表读取
-            image_files = get_image_files_from_list(args.input_file_list)
-        else:
-            # 从目录读取
-            input_dir = Path(args.input_dir).resolve()
-            print(f"📁 Input dir: {input_dir}")
-            
-            if not input_dir.exists():
-                print(f"❌ Input directory does not exist: {input_dir}")
-                return 1
-
-            print(f"Input dir: {input_dir}")
-            image_files = get_image_files_from_dir(input_dir)
-
-        output_dir = Path(args.output_dir).resolve()
-        print(f"Output dir: {output_dir}")
-        print(f"Found {len(image_files)} image files")
+        # 获取并预处理输入文件
+        print("🔄 Preprocessing input files...")
+        input_files = get_input_files(args)
+        
+        if not input_files:
+            print("❌ No input files found or processed")
+            return 1
         
         if args.test_mode:
-            image_files = image_files[:20]
-            print(f"Test mode: processing only {len(image_files)} images")
+            input_files = input_files[:20]
+            print(f"Test mode: processing only {len(input_files)} images")
         
         print(f"Using device: {args.device}")
         
-        # 开始处理(删除了 batch_size 参数)
+        # 开始处理
         start_time = time.time()
-        results = process_images_single_process(
-            image_files,
+        results = process_images_unified(
+            input_files,
             args.pipeline,
             args.device,
-            str(output_dir)
+            args.output_dir
         )
         total_time = time.time() - start_time
         
         # 统计结果
         success_count = sum(1 for r in results if r.get('success', False))
         error_count = len(results) - success_count
+        pdf_page_count = sum(1 for r in results if r.get('is_pdf_page', False))
         
         print(f"\n" + "="*60)
         print(f"✅ Processing completed!")
         print(f"📊 Statistics:")
-        print(f"  Total files: {len(image_files)}")
+        print(f"  Total files processed: {len(input_files)}")
+        print(f"  PDF pages processed: {pdf_page_count}")
+        print(f"  Regular images processed: {len(input_files) - pdf_page_count}")
         print(f"  Successful: {success_count}")
         print(f"  Failed: {error_count}")
-        if len(image_files) > 0:
-            print(f"  Success rate: {success_count / len(image_files) * 100:.2f}%")
+        if len(input_files) > 0:
+            print(f"  Success rate: {success_count / len(input_files) * 100:.2f}%")
         print(f"⏱️ Performance:")
         print(f"  Total time: {total_time:.2f} seconds")
         if total_time > 0:
-            print(f"  Throughput: {len(image_files) / total_time:.2f} images/second")
-            print(f"  Avg time per image: {total_time / len(image_files):.2f} seconds")
+            print(f"  Throughput: {len(input_files) / total_time:.2f} files/second")
+            print(f"  Avg time per file: {total_time / len(input_files):.2f} seconds")
         
-        # 保存结果统计(删除了 batch_size 统计)
+        # 保存结果统计
         stats = {
-            "total_files": len(image_files),
+            "total_files": len(input_files),
+            "pdf_pages": pdf_page_count,
+            "regular_images": len(input_files) - pdf_page_count,
             "success_count": success_count,
             "error_count": error_count,
-            "success_rate": success_count / len(image_files) if len(image_files) > 0 else 0,
+            "success_rate": success_count / len(input_files) if len(input_files) > 0 else 0,
             "total_time": total_time,
-            "throughput": len(image_files) / total_time if total_time > 0 else 0,
-            "avg_time_per_image": total_time / len(image_files) if len(image_files) > 0 else 0,
+            "throughput": len(input_files) / total_time if total_time > 0 else 0,
+            "avg_time_per_file": total_time / len(input_files) if len(input_files) > 0 else 0,
             "device": args.device,
             "pipeline": args.pipeline,
+            "pdf_dpi": args.pdf_dpi,
             "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
         }
         
         # 保存最终结果
-        output_file_name = Path(output_dir).name
-        output_file = os.path.join(output_dir, f"{output_file_name}.json")
+        output_file_name = Path(args.output_dir).name
+        output_file = os.path.join(args.output_dir, f"{output_file_name}_unified.json")
         final_results = {
             "stats": stats,
             "results": results
@@ -281,7 +374,7 @@ def main():
 
 
 if __name__ == "__main__":
-    print(f"🚀 启动单进程OCR程序...")
+    print(f"🚀 启动统一PDF/图像处理程序...")
     print(f"🔧 CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'Not set')}")
     
     if len(sys.argv) == 1:
@@ -294,7 +387,7 @@ if __name__ == "__main__":
             "output_dir": "./OmniDocBench_PPStructureV3_Results",
             "pipeline": "./my_config/PP-StructureV3.yaml",
             "device": "gpu:0",
-            "collect_results": "./OmniDocBench_PPStructureV3_Results/processed_files.csv",
+            "collect_results": f"./OmniDocBench_PPStructureV3_Results/processed_files_{time.strftime('%Y%m%d_%H%M%S')}.csv",
         }
         
         # default_config = {