# pdf2md.py
  1. from pathlib import Path
  2. from paddlex import create_pipeline
  3. # 统计耗时
  4. import time
  5. start_time = time.time()
  6. pipeline = create_pipeline(pipeline="./PP-StructureV3-zhch.yaml")
  7. input_file = "./sample_data/300674-财报数据-扫描.pdf"
  8. output_path = Path("./sample_data") / f"{Path(input_file).stem}_output"
  9. output = pipeline.predict(
  10. input=input_file,
  11. device="gpu", # 或者 "gpu" 如果你有 GPU 支持
  12. enable_mkldnn=False, # 如果你不需要 MKLDNN 优化,可以设置为 False
  13. use_doc_orientation_classify=True, # 开启文档方向分类
  14. use_doc_unwarping=False, # 开启文档去畸变
  15. # layout_detection_model_name=None, # 如果要禁用版面分析,可以这样设置,或者依赖其默认行为
  16. use_seal_recognition=True, # 跳过印章识别
  17. use_formula_recognition=False, # 跳过公式识别
  18. use_chart_recognition=True, # 跳过图表识别
  19. use_table_recognition=True, # 开启表格识别
  20. )
  21. markdown_list = []
  22. markdown_images = []
  23. for res in output:
  24. res.save_all(save_path=output_path) # 保存所有结果到指定路径
  25. md_info = res.markdown
  26. markdown_list.append(md_info)
  27. markdown_images.append(md_info.get("markdown_images", {}))
  28. markdown_texts = pipeline.concatenate_markdown_pages(markdown_list)
  29. mkd_file_path = output_path / f"{Path(input_file).stem}.md"
  30. mkd_file_path.parent.mkdir(parents=True, exist_ok=True)
  31. with open(mkd_file_path, "w", encoding="utf-8") as f:
  32. f.write(markdown_texts)
  33. for item in markdown_images:
  34. if item:
  35. for path, image in item.items():
  36. file_path = output_path / path
  37. file_path.parent.mkdir(parents=True, exist_ok=True)
  38. image.save(file_path)
  39. # 统计耗时
  40. end_time = time.time()
  41. elapsed_time = end_time - start_time
  42. print(f"Total time taken: {elapsed_time:.2f} seconds")