|
|
@@ -1,4 +1,4 @@
|
|
|
-"""单进程运行稳定"""
|
|
|
+"""PDF转图像后统一处理"""
|
|
|
import json
|
|
|
import time
|
|
|
import os
|
|
|
@@ -7,7 +7,8 @@ import argparse
|
|
|
import sys
|
|
|
+import traceback
|
|
|
import warnings
|
|
|
from pathlib import Path
|
|
|
-from typing import List, Dict, Any
|
|
|
+from typing import List, Dict, Any, Union
|
|
|
import cv2
|
|
|
import numpy as np
|
|
|
|
|
|
@@ -27,24 +27,127 @@ from utils import (
|
|
|
get_image_files_from_dir,
|
|
|
get_image_files_from_list,
|
|
|
get_image_files_from_csv,
|
|
|
- collect_pid_files
|
|
|
+ collect_pid_files,
|
|
|
+ load_images_from_pdf
|
|
|
)
|
|
|
|
|
|
-def process_images_single_process(image_paths: List[str],
|
|
|
- pipeline_name: str = "PP-StructureV3",
|
|
|
- device: str = "gpu:0",
|
|
|
- output_dir: str = "./output") -> List[Dict[str, Any]]:
|
|
|
+def convert_pdf_to_images(pdf_file: str, output_dir: Union[str, None] = None, dpi: int = 200) -> List[str]:
|
|
|
"""
|
|
|
- 单进程版本的图像处理函数
|
|
|
+ 将PDF转换为图像文件
|
|
|
|
|
|
Args:
|
|
|
- image_paths: 图像路径列表
|
|
|
- pipeline_name: Pipeline名称
|
|
|
- device: 设备字符串,如"gpu:0"或"cpu"
|
|
|
+ pdf_file: PDF文件路径
|
|
|
output_dir: 输出目录
|
|
|
+ dpi: 图像分辨率
|
|
|
|
|
|
Returns:
|
|
|
- 处理结果列表
|
|
|
+ 生成的图像文件路径列表
|
|
|
+ """
|
|
|
+ pdf_path = Path(pdf_file)
|
|
|
+ if not pdf_path.exists() or pdf_path.suffix.lower() != '.pdf':
|
|
|
+ print(f"❌ Invalid PDF file: {pdf_path}")
|
|
|
+ return []
|
|
|
+
|
|
|
+ # 如果没有指定输出目录,使用PDF同名目录
|
|
|
+ if output_dir is None:
|
|
|
+ output_path = pdf_path.parent / f"{pdf_path.stem}"
|
|
|
+ else:
|
|
|
+ output_path = Path(output_dir) / f"{pdf_path.stem}"
|
|
|
+ output_path = output_path.resolve()
|
|
|
+ output_path.mkdir(parents=True, exist_ok=True)
|
|
|
+
|
|
|
+ try:
|
|
|
+ # 使用doc_utils中的函数加载PDF图像
|
|
|
+ images = load_images_from_pdf(str(pdf_path), dpi=dpi)
|
|
|
+
|
|
|
+ image_paths = []
|
|
|
+ for i, image in enumerate(images):
|
|
|
+ # 生成图像文件名
|
|
|
+ image_filename = f"{pdf_path.stem}_page_{i+1:03d}.png"
|
|
|
+ image_path = output_path / image_filename
|
|
|
+
|
|
|
+ # 保存图像
|
|
|
+ image.save(str(image_path))
|
|
|
+ image_paths.append(str(image_path))
|
|
|
+
|
|
|
+ print(f"✅ Converted {len(images)} pages from {pdf_path.name} to images")
|
|
|
+ return image_paths
|
|
|
+
|
|
|
+ except Exception as e:
|
|
|
+ print(f"❌ Error converting PDF {pdf_path}: {e}")
|
|
|
+ traceback.print_exc()
|
|
|
+ return []
|
|
|
+
|
|
|
+def get_input_files(args) -> List[str]:
|
|
|
+ """
|
|
|
+ 获取输入文件列表,统一处理PDF和图像文件
|
|
|
+
|
|
|
+ Args:
|
|
|
+ args: 命令行参数
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ 处理后的图像文件路径列表
|
|
|
+ """
|
|
|
+ input_files = []
|
|
|
+
|
|
|
+ # 获取原始输入文件
|
|
|
+ if args.input_csv:
|
|
|
+ raw_files = get_image_files_from_csv(args.input_csv, "fail")
|
|
|
+ elif args.input_file_list:
|
|
|
+ raw_files = get_image_files_from_list(args.input_file_list)
|
|
|
+ else:
|
|
|
+ input_dir = Path(args.input_dir).resolve()
|
|
|
+ if not input_dir.exists():
|
|
|
+ print(f"❌ Input directory does not exist: {input_dir}")
|
|
|
+ return []
|
|
|
+
|
|
|
+ # 获取所有支持的文件(图像和PDF)
|
|
|
+ image_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif']
|
|
|
+ pdf_extensions = ['.pdf']
|
|
|
+
|
|
|
+ raw_files = []
|
|
|
+ for ext in image_extensions + pdf_extensions:
|
|
|
+ raw_files.extend(list(input_dir.glob(f"*{ext}")))
|
|
|
+ raw_files.extend(list(input_dir.glob(f"*{ext.upper()}")))
|
|
|
+
|
|
|
+ raw_files = [str(f) for f in raw_files]
|
|
|
+
|
|
|
+ # 分别处理PDF和图像文件
|
|
|
+ pdf_count = 0
|
|
|
+ image_count = 0
|
|
|
+
|
|
|
+ for file_path in raw_files:
|
|
|
+ file_path = Path(file_path)
|
|
|
+
|
|
|
+ if file_path.suffix.lower() == '.pdf':
|
|
|
+ # 转换PDF为图像
|
|
|
+ print(f"📄 Processing PDF: {file_path.name}")
|
|
|
+ pdf_images = convert_pdf_to_images(
|
|
|
+ str(file_path),
|
|
|
+ args.output_dir,
|
|
|
+ dpi=args.pdf_dpi
|
|
|
+ )
|
|
|
+ input_files.extend(pdf_images)
|
|
|
+ pdf_count += 1
|
|
|
+ else:
|
|
|
+ # 直接添加图像文件
|
|
|
+ if file_path.exists():
|
|
|
+ input_files.append(str(file_path))
|
|
|
+ image_count += 1
|
|
|
+
|
|
|
+ print(f"📊 Input summary:")
|
|
|
+ print(f" PDF files processed: {pdf_count}")
|
|
|
+ print(f" Image files found: {image_count}")
|
|
|
+ print(f" Total image files to process: {len(input_files)}")
|
|
|
+
|
|
|
+ return input_files
|
|
|
+
|
|
|
+def process_images_unified(image_paths: List[str],
|
|
|
+ pipeline_name: str = "PP-StructureV3",
|
|
|
+ device: str = "gpu:0",
|
|
|
+ output_dir: str = "./output") -> List[Dict[str, Any]]:
|
|
|
+ """
|
|
|
+ 统一的图像处理函数(修改自ppstructurev3_single_process.py)
|
|
|
"""
|
|
|
# 创建输出目录
|
|
|
output_path = Path(output_dir)
|
|
|
@@ -81,12 +184,12 @@ def process_images_single_process(image_paths: List[str],
|
|
|
try:
|
|
|
# 使用pipeline预测单个图像
|
|
|
results = pipeline.predict(
|
|
|
- img_path, # 传入单个文件路径
|
|
|
+ img_path,
|
|
|
use_doc_orientation_classify=True,
|
|
|
use_doc_unwarping=False,
|
|
|
use_seal_recognition=True,
|
|
|
use_table_recognition=True,
|
|
|
- use_formula_recognition=False, # 暂时关闭公式识别以避免错误
|
|
|
+ use_formula_recognition=False,
|
|
|
use_chart_recognition=True,
|
|
|
)
|
|
|
|
|
|
@@ -117,7 +220,8 @@ def process_images_single_process(image_paths: List[str],
|
|
|
"success": True,
|
|
|
"device": device,
|
|
|
"output_json": json_output_path,
|
|
|
- "output_md": md_output_path
|
|
|
+ "output_md": md_output_path,
|
|
|
+ "is_pdf_page": "_page_" in input_path.name # 标记是否为PDF页面
|
|
|
})
|
|
|
|
|
|
except Exception as e:
|
|
|
@@ -157,102 +261,91 @@ def process_images_single_process(image_paths: List[str],
|
|
|
|
|
|
return all_results
|
|
|
|
|
|
-
|
|
|
def main():
|
|
|
"""主函数"""
|
|
|
- parser = argparse.ArgumentParser(description="PaddleX PP-StructureV3 Single Process Processing")
|
|
|
+ parser = argparse.ArgumentParser(description="PaddleX PP-StructureV3 Unified PDF/Image Processor")
|
|
|
|
|
|
# 参数定义
|
|
|
input_group = parser.add_mutually_exclusive_group(required=True)
|
|
|
- input_group.add_argument("--input_dir", type=str, help="Input directory")
|
|
|
+ input_group.add_argument("--input_dir", type=str, help="Input directory (supports both PDF and image files)")
|
|
|
input_group.add_argument("--input_file_list", type=str, help="Input file list (one file per line)")
|
|
|
input_group.add_argument("--input_csv", type=str, help="Input CSV file with image_path and status columns")
|
|
|
|
|
|
- parser.add_argument("--output_dir", type=str, help="Output directory")
|
|
|
+ parser.add_argument("--output_dir", type=str, required=True, help="Output directory")
|
|
|
parser.add_argument("--pipeline", type=str, default="PP-StructureV3", help="Pipeline name")
|
|
|
parser.add_argument("--device", type=str, default="gpu:0", help="Device string (e.g., 'gpu:0', 'cpu')")
|
|
|
- parser.add_argument("--input_pattern", type=str, default="*", help="Input file pattern")
|
|
|
- parser.add_argument("--test_mode", action="store_true", help="Test mode (process only 20 images)")
|
|
|
- parser.add_argument("--collect_results",type=str, help="收集处理结果到指定CSV文件")
|
|
|
+ parser.add_argument("--pdf_dpi", type=int, default=200, help="DPI for PDF to image conversion")
|
|
|
+ parser.add_argument("--test_mode", action="store_true", help="Test mode (process only 20 files)")
|
|
|
+ parser.add_argument("--collect_results", type=str, help="收集处理结果到指定CSV文件")
|
|
|
|
|
|
args = parser.parse_args()
|
|
|
|
|
|
try:
|
|
|
- # 获取图像文件列表
|
|
|
- if args.input_csv:
|
|
|
- # 从CSV文件读取
|
|
|
- image_files = get_image_files_from_csv(args.input_csv, "fail")
|
|
|
- print(f"📊 Loaded {len(image_files)} files from CSV with status filter: fail")
|
|
|
- elif args.input_file_list:
|
|
|
- # 从文件列表读取
|
|
|
- image_files = get_image_files_from_list(args.input_file_list)
|
|
|
- else:
|
|
|
- # 从目录读取
|
|
|
- input_dir = Path(args.input_dir).resolve()
|
|
|
- print(f"📁 Input dir: {input_dir}")
|
|
|
-
|
|
|
- if not input_dir.exists():
|
|
|
- print(f"❌ Input directory does not exist: {input_dir}")
|
|
|
- return 1
|
|
|
-
|
|
|
- print(f"Input dir: {input_dir}")
|
|
|
- image_files = get_image_files_from_dir(input_dir)
|
|
|
-
|
|
|
- output_dir = Path(args.output_dir).resolve()
|
|
|
- print(f"Output dir: {output_dir}")
|
|
|
- print(f"Found {len(image_files)} image files")
|
|
|
+ # 获取并预处理输入文件
|
|
|
+ print("🔄 Preprocessing input files...")
|
|
|
+ input_files = get_input_files(args)
|
|
|
+
|
|
|
+ if not input_files:
|
|
|
+ print("❌ No input files found or processed")
|
|
|
+ return 1
|
|
|
|
|
|
if args.test_mode:
|
|
|
- image_files = image_files[:20]
|
|
|
- print(f"Test mode: processing only {len(image_files)} images")
|
|
|
+ input_files = input_files[:20]
|
|
|
+ print(f"Test mode: processing only {len(input_files)} images")
|
|
|
|
|
|
print(f"Using device: {args.device}")
|
|
|
|
|
|
- # 开始处理(删除了 batch_size 参数)
|
|
|
+ # 开始处理
|
|
|
start_time = time.time()
|
|
|
- results = process_images_single_process(
|
|
|
- image_files,
|
|
|
+ results = process_images_unified(
|
|
|
+ input_files,
|
|
|
args.pipeline,
|
|
|
args.device,
|
|
|
- str(output_dir)
|
|
|
+ args.output_dir
|
|
|
)
|
|
|
total_time = time.time() - start_time
|
|
|
|
|
|
# 统计结果
|
|
|
success_count = sum(1 for r in results if r.get('success', False))
|
|
|
error_count = len(results) - success_count
|
|
|
+ pdf_page_count = sum(1 for r in results if r.get('is_pdf_page', False))
|
|
|
|
|
|
print(f"\n" + "="*60)
|
|
|
print(f"✅ Processing completed!")
|
|
|
print(f"📊 Statistics:")
|
|
|
- print(f" Total files: {len(image_files)}")
|
|
|
+ print(f" Total files processed: {len(input_files)}")
|
|
|
+ print(f" PDF pages processed: {pdf_page_count}")
|
|
|
+ print(f" Regular images processed: {len(input_files) - pdf_page_count}")
|
|
|
print(f" Successful: {success_count}")
|
|
|
print(f" Failed: {error_count}")
|
|
|
- if len(image_files) > 0:
|
|
|
- print(f" Success rate: {success_count / len(image_files) * 100:.2f}%")
|
|
|
+ if len(input_files) > 0:
|
|
|
+ print(f" Success rate: {success_count / len(input_files) * 100:.2f}%")
|
|
|
print(f"⏱️ Performance:")
|
|
|
print(f" Total time: {total_time:.2f} seconds")
|
|
|
if total_time > 0:
|
|
|
- print(f" Throughput: {len(image_files) / total_time:.2f} images/second")
|
|
|
- print(f" Avg time per image: {total_time / len(image_files):.2f} seconds")
|
|
|
+ print(f" Throughput: {len(input_files) / total_time:.2f} files/second")
|
|
|
+ print(f" Avg time per file: {total_time / len(input_files):.2f} seconds")
|
|
|
|
|
|
- # 保存结果统计(删除了 batch_size 统计)
|
|
|
+ # 保存结果统计
|
|
|
stats = {
|
|
|
- "total_files": len(image_files),
|
|
|
+ "total_files": len(input_files),
|
|
|
+ "pdf_pages": pdf_page_count,
|
|
|
+ "regular_images": len(input_files) - pdf_page_count,
|
|
|
"success_count": success_count,
|
|
|
"error_count": error_count,
|
|
|
- "success_rate": success_count / len(image_files) if len(image_files) > 0 else 0,
|
|
|
+ "success_rate": success_count / len(input_files) if len(input_files) > 0 else 0,
|
|
|
"total_time": total_time,
|
|
|
- "throughput": len(image_files) / total_time if total_time > 0 else 0,
|
|
|
- "avg_time_per_image": total_time / len(image_files) if len(image_files) > 0 else 0,
|
|
|
+ "throughput": len(input_files) / total_time if total_time > 0 else 0,
|
|
|
+ "avg_time_per_file": total_time / len(input_files) if len(input_files) > 0 else 0,
|
|
|
"device": args.device,
|
|
|
"pipeline": args.pipeline,
|
|
|
+ "pdf_dpi": args.pdf_dpi,
|
|
|
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
|
|
|
}
|
|
|
|
|
|
# 保存最终结果
|
|
|
- output_file_name = Path(output_dir).name
|
|
|
- output_file = os.path.join(output_dir, f"{output_file_name}.json")
|
|
|
+ output_file_name = Path(args.output_dir).name
|
|
|
+ output_file = os.path.join(args.output_dir, f"{output_file_name}_unified.json")
|
|
|
final_results = {
|
|
|
"stats": stats,
|
|
|
"results": results
|
|
|
@@ -281,7 +374,7 @@ def main():
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
- print(f"🚀 启动单进程OCR程序...")
|
|
|
+ print(f"🚀 启动统一PDF/图像处理程序...")
|
|
|
print(f"🔧 CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'Not set')}")
|
|
|
|
|
|
if len(sys.argv) == 1:
|
|
|
@@ -294,7 +387,7 @@ if __name__ == "__main__":
|
|
|
"output_dir": "./OmniDocBench_PPStructureV3_Results",
|
|
|
"pipeline": "./my_config/PP-StructureV3.yaml",
|
|
|
"device": "gpu:0",
|
|
|
- "collect_results": "./OmniDocBench_PPStructureV3_Results/processed_files.csv",
|
|
|
+ "collect_results": f"./OmniDocBench_PPStructureV3_Results/processed_files_{time.strftime('%Y%m%d_%H%M%S')}.csv",
|
|
|
}
|
|
|
|
|
|
# default_config = {
|