|
|
@@ -65,7 +65,7 @@ class PPStructureV3ParallelPredictor:
|
|
|
# 运行PaddleX pipeline
|
|
|
start_time = time.time()
|
|
|
|
|
|
- output = list(self.pipeline.predict(
|
|
|
+ output = self.pipeline.predict(
|
|
|
input=image_path,
|
|
|
device="gpu" if self.use_gpu else "cpu",
|
|
|
use_doc_orientation_classify=True,
|
|
|
@@ -74,10 +74,12 @@ class PPStructureV3ParallelPredictor:
|
|
|
use_chart_recognition=True,
|
|
|
use_table_recognition=True,
|
|
|
use_formula_recognition=True,
|
|
|
- ))
|
|
|
+ )
|
|
|
+ # Save each prediction result as JSON and Markdown
|
|
|
+ for res in output:
|
|
|
+ res.save_to_json(save_path=self.output_path) # 保存所有结果到指定路径
|
|
|
+ res.save_to_markdown(save_path=self.output_path) # 保存所有结果到指定路径
|
|
|
|
|
|
- output.save_to_json(save_path=self.output_path) # 保存JSON结果
|
|
|
- output.save_to_markdown(save_path=self.output_path) # 保存Markdown结果
|
|
|
process_time = time.time() - start_time
|
|
|
|
|
|
# 添加处理时间信息
|
|
|
@@ -289,19 +291,22 @@ def main():
|
|
|
"""主函数 - 并行处理OmniDocBench数据集"""
|
|
|
|
|
|
# 配置参数
|
|
|
- dataset_path = "/Users/zhch158/workspace/repository.git/OmniDocBench/OpenDataLab___OmniDocBench/images"
|
|
|
- output_dir = "/Users/zhch158/workspace/repository.git/PaddleX/zhch/OmniDocBench_Results"
|
|
|
+ dataset_path = "../../OmniDocBench/OpenDataLab___OmniDocBench/images"
|
|
|
+ output_dir = "./OmniDocBench_Results"
|
|
|
pipeline_config = "PP-StructureV3"
|
|
|
|
|
|
# 并行处理参数
|
|
|
batch_size = 4 # 批处理大小
|
|
|
max_workers = 4 # 最大工作进程/线程数
|
|
|
use_gpu = True # 是否使用GPU
|
|
|
- use_multiprocessing = False # False=多线程(GPU推荐), True=多进程(CPU推荐)
|
|
|
+ use_multiprocessing = True # False=多线程(GPU推荐), True=多进程(CPU推荐)
|
|
|
|
|
|
# 确保输出目录存在
|
|
|
+ print(f"输出目录: {Path(output_dir).absolute()}")
|
|
|
os.makedirs(output_dir, exist_ok=True)
|
|
|
|
|
|
+ dataset_path = Path(dataset_path).resolve()
|
|
|
+ output_dir = Path(output_dir).resolve()
|
|
|
print("="*60)
|
|
|
print("OmniDocBench 并行评估开始")
|
|
|
print("="*60)
|
|
|
@@ -333,7 +338,7 @@ def main():
|
|
|
# 多进程处理(推荐用于CPU)
|
|
|
print("使用多进程并行处理...")
|
|
|
results = parallel_process_with_multiprocessing(
|
|
|
- image_files, batch_size, max_workers
|
|
|
+ image_files, batch_size, max_workers, pipeline_config, output_dir, use_gpu
|
|
|
)
|
|
|
else:
|
|
|
# 多线程处理(推荐用于GPU)
|