| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152 |
# Run the PP-StructureV3 document-parsing pipeline on one scanned PDF.
#
# For every result yielded by the pipeline the script saves all artifacts
# via ``save_all``, then writes one concatenated Markdown file plus the
# images referenced by the Markdown into the same output directory, and
# finally prints the total elapsed time.
import time
from pathlib import Path

from paddlex import create_pipeline

# perf_counter() is monotonic, so the measured interval cannot be skewed by
# system clock adjustments. The timer also covers pipeline construction.
start_time = time.perf_counter()

pipeline = create_pipeline(pipeline="./PP-StructureV3-zhch.yaml")

input_file = "./sample_data/300674-财报数据-扫描.pdf"
input_stem = Path(input_file).stem
output_path = Path("./sample_data") / f"{input_stem}_output"

# NOTE(review): PaddleX documents `device` as a create_pipeline() argument;
# confirm that predict() actually honours `device` / `enable_mkldnn` here.
output = pipeline.predict(
    input=input_file,
    device="gpu",  # switch to "cpu" when no GPU is available
    enable_mkldnn=False,  # MKL-DNN optimisation turned off
    use_doc_orientation_classify=True,  # document orientation classification on
    use_doc_unwarping=False,  # document unwarping off
    # layout_detection_model_name=None,  # original author's hint: set this to
    # disable layout analysis, or rely on the default behaviour
    use_seal_recognition=True,  # seal recognition on
    use_formula_recognition=False,  # formula recognition off
    use_chart_recognition=True,  # chart recognition on
    use_table_recognition=True,  # table recognition on
)

markdown_list = []
markdown_images = []
for res in output:
    res.save_all(save_path=output_path)  # save every artifact of this result
    md_info = res.markdown
    markdown_list.append(md_info)
    markdown_images.append(md_info.get("markdown_images", {}))

# Merge the per-result Markdown into a single document.
markdown_texts = pipeline.concatenate_markdown_pages(markdown_list)

mkd_file_path = output_path / f"{input_stem}.md"
mkd_file_path.parent.mkdir(parents=True, exist_ok=True)
mkd_file_path.write_text(markdown_texts, encoding="utf-8")

# Save the images next to the Markdown file, under the relative paths that
# serve as keys of each "markdown_images" mapping.
for item in markdown_images:
    if not item:
        # Skip results that carry no images (empty or missing mapping).
        continue
    for path, image in item.items():
        file_path = output_path / path
        file_path.parent.mkdir(parents=True, exist_ok=True)
        image.save(file_path)

# Report the total elapsed time.
end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Total time taken: {elapsed_time:.2f} seconds")
|