|
@@ -0,0 +1,52 @@
|
|
|
|
|
# Run the PP-StructureV3 pipeline on one scanned PDF, save every result via
# save_all(), and export the concatenated Markdown together with its images.
import time
from pathlib import Path

from paddlex import create_pipeline

# Time the whole run, pipeline construction included. perf_counter() is a
# monotonic clock, so the measured duration is not distorted by system clock
# adjustments the way time.time() differences can be.
start_time = time.perf_counter()

pipeline = create_pipeline(pipeline="./PP-StructureV3-zhch.yaml")

input_file = "./sample_data/300674-财报数据-扫描.pdf"
output_path = Path("./sample_data") / f"{Path(input_file).stem}_output"
# Create the output directory once, before anything is written into it.
output_path.mkdir(parents=True, exist_ok=True)

# NOTE(review): `device` and `enable_mkldnn` are passed to predict() here;
# confirm that the installed PaddleX version honours them at predict time
# rather than expecting them at create_pipeline() time.
output = pipeline.predict(
    input=input_file,
    device="gpu",  # run on GPU; presumably "cpu" is the fallback without one
    enable_mkldnn=False,  # MKLDNN optimisation turned off
    use_doc_orientation_classify=True,  # document orientation classification on
    use_doc_unwarping=False,  # document unwarping off
    # Original author's note (not verified here): layout analysis could be
    # disabled with the line below, or left to its default behaviour.
    # layout_detection_model_name=None,
    use_seal_recognition=True,  # seal recognition on
    use_formula_recognition=False,  # formula recognition skipped
    use_chart_recognition=True,  # chart recognition on
    use_table_recognition=True,  # table recognition on
)

# Persist each result and keep its markdown payload for concatenation below.
markdown_list = []
for res in output:
    res.save_all(save_path=output_path)
    markdown_list.append(res.markdown)

# One entry per result: the "markdown_images" mapping, or {} when absent.
markdown_images = [
    md_info.get("markdown_images", {}) for md_info in markdown_list
]

markdown_texts = pipeline.concatenate_markdown_pages(markdown_list)

# Write the combined Markdown next to the other saved artefacts.
mkd_file_path = output_path / f"{Path(input_file).stem}.md"
mkd_file_path.write_text(markdown_texts, encoding="utf-8")

# Save every image referenced by the Markdown under its relative path.
for page_images in markdown_images:
    if not page_images:
        continue
    for rel_path, image in page_images.items():
        file_path = output_path / rel_path
        file_path.parent.mkdir(parents=True, exist_ok=True)
        image.save(file_path)

# Report the total elapsed time.
end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Total time taken: {elapsed_time:.2f} seconds")
|