"""Convert a scanned PDF to Markdown with a PaddleX PP-StructureV3 pipeline.

The script builds the pipeline from ``./PP-StructureV3-zhch.yaml``, runs it on
one input PDF, saves every result object into an output directory next to the
input, writes a single concatenated Markdown file together with the images it
references, and finally prints the elapsed time.
"""
import time
from pathlib import Path

from paddlex import create_pipeline

# Timing starts after the imports, so it covers pipeline construction,
# prediction and export. perf_counter() is monotonic, unlike time.time(),
# so the measurement is not skewed by system clock adjustments.
start_time = time.perf_counter()

pipeline = create_pipeline(pipeline="./PP-StructureV3-zhch.yaml")

input_file = "./sample_data/300674-财报数据-扫描.pdf"
# Results go to "<input stem>_output" under ./sample_data.
output_path = Path("./sample_data") / f"{Path(input_file).stem}_output"

# NOTE(review): PaddleX documents `device` as a `create_pipeline` argument;
# confirm that `predict` actually honors `device` / `enable_mkldnn` here
# rather than silently ignoring them.
output = pipeline.predict(
    input=input_file,
    device="gpu",  # run on GPU; switch to "cpu" when no GPU is available
    enable_mkldnn=False,  # MKLDNN optimization disabled
    use_doc_orientation_classify=True,  # document orientation classification on
    use_doc_unwarping=False,  # document unwarping off
    use_seal_recognition=True,  # seal recognition on
    use_formula_recognition=False,  # formula recognition off
    use_chart_recognition=True,  # chart recognition on
    use_table_recognition=True,  # table recognition on
)

markdown_list = []
markdown_images = []

# Consume the results one by one (presumably one per page - TODO confirm),
# persisting each and collecting its Markdown payload for concatenation.
for res in output:
    res.save_all(save_path=output_path)
    md_info = res.markdown
    markdown_list.append(md_info)
    markdown_images.append(md_info.get("markdown_images", {}))

markdown_texts = pipeline.concatenate_markdown_pages(markdown_list)

# Write the merged Markdown document as UTF-8.
mkd_file_path = output_path / f"{Path(input_file).stem}.md"
mkd_file_path.parent.mkdir(parents=True, exist_ok=True)
mkd_file_path.write_text(markdown_texts, encoding="utf-8")

# Save the images referenced by the Markdown under the same output directory,
# keeping their relative paths so the links in the document resolve.
for item in markdown_images:
    if not item:
        # Nothing to save for this result (missing, None or empty mapping).
        continue
    for path, image in item.items():
        file_path = output_path / path
        file_path.parent.mkdir(parents=True, exist_ok=True)
        image.save(file_path)

# Report the total elapsed time.
end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Total time taken: {elapsed_time:.2f} seconds")