Jelajahi Sumber

新增脚本:pdf2md.py,用于从PDF文件提取数据并生成Markdown文件

zhch158_admin 4 bulan lalu
induk
melakukan
511d561c36
1 mengubah file dengan 52 tambahan dan 0 penghapusan
  1. 52 0
      zhch/pdf2md.py

+ 52 - 0
zhch/pdf2md.py

@@ -0,0 +1,52 @@
+from pathlib import Path
+from paddlex import create_pipeline
+
+# Start a wall-clock timer covering the whole run: pipeline creation, prediction, and writing the Markdown/image outputs
+import time
+start_time = time.time()
+
+pipeline = create_pipeline(pipeline="./PP-StructureV3-zhch.yaml")
+
+input_file = "./sample_data/300674-财报数据-扫描.pdf"
+output_path = Path("./sample_data") / f"{Path(input_file).stem}_output"
+
+output = pipeline.predict(
+    input=input_file,
+    device="gpu",  # requests the "gpu" device; NOTE(review): the original note offered "gpu" as its own alternative, presumably "cpu" was meant for machines without GPU support - confirm
+    enable_mkldnn=False,  # MKLDNN flag passed as False; per the original note, False is appropriate when MKLDNN optimization is not needed
+    use_doc_orientation_classify=True, # document orientation classification is enabled
+    use_doc_unwarping=False, # document unwarping is disabled (the original comment said "enable", contradicting the value)
+    # layout_detection_model_name=None, # left commented out; original note: set it like this to disable layout analysis, or rely on the default behavior - unverified
+    use_seal_recognition=True,         # seal recognition is enabled (the original comment said "skip", contradicting the value)
+    use_formula_recognition=False,      # formula recognition is skipped
+    use_chart_recognition=True,         # chart recognition is enabled (the original comment said "skip", contradicting the value)
+    use_table_recognition=True,        # table recognition is enabled
+)
+markdown_list = []
+markdown_images = []
+
+for res in output:
+    res.save_all(save_path=output_path)  # save every output of this result under output_path
+    md_info = res.markdown
+    markdown_list.append(md_info)
+    markdown_images.append(md_info.get("markdown_images", {}))
+
+markdown_texts = pipeline.concatenate_markdown_pages(markdown_list)
+
+mkd_file_path = output_path / f"{Path(input_file).stem}.md"
+mkd_file_path.parent.mkdir(parents=True, exist_ok=True)
+
+with open(mkd_file_path, "w", encoding="utf-8") as f:
+    f.write(markdown_texts)
+
+for item in markdown_images:
+    if item:
+        for path, image in item.items():
+            file_path = output_path / path
+            file_path.parent.mkdir(parents=True, exist_ok=True)
+            image.save(file_path)
+
+# Stop the timer and print the total elapsed time in seconds
+end_time = time.time()
+elapsed_time = end_time - start_time
+print(f"Total time taken: {elapsed_time:.2f} seconds")