4 bulan lalu · 511d561c36
--- a/zhch/pdf2md.py
+++ b/zhch/pdf2md.py
@@ -0,0 +1,52 @@
 
				+from pathlib import Path
			
 
				+from paddlex import create_pipeline
			
 
				+
			
 
				+# 统计耗时
			
 
				+import time
			
 
				+start_time = time.time()
			
 
				+
			
 
				+pipeline = create_pipeline(pipeline="./PP-StructureV3-zhch.yaml")
			
 
				+
			
 
				+input_file = "./sample_data/300674-财报数据-扫描.pdf"
			
 
				+output_path = Path("./sample_data") / f"{Path(input_file).stem}_output"
			
 
				+
			
 
				+output = pipeline.predict(
			
 
				+    input=input_file,
			
 
				+    device="gpu",  # 或者 "gpu" 如果你有 GPU 支持
			
 
				+    enable_mkldnn=False,  # 如果你不需要 MKLDNN 优化，可以设置为 False
			
 
				+    use_doc_orientation_classify=True, # 开启文档方向分类
			
 
				+    use_doc_unwarping=False, # 开启文档去畸变
			
 
				+    # layout_detection_model_name=None, # 如果要禁用版面分析，可以这样设置，或者依赖其默认行为
			
 
				+    use_seal_recognition=True,         # 跳过印章识别
			
 
				+    use_formula_recognition=False,      # 跳过公式识别
			
 
				+    use_chart_recognition=True,         # 跳过图表识别
			
 
				+    use_table_recognition=True,        # 开启表格识别
			
 
				+)
			
 
				+markdown_list = []
			
 
				+markdown_images = []
			
 
				+
			
 
				+for res in output:
			
 
				+    res.save_all(save_path=output_path)  # 保存所有结果到指定路径
			
 
				+    md_info = res.markdown
			
 
				+    markdown_list.append(md_info)
			
 
				+    markdown_images.append(md_info.get("markdown_images", {}))
			
 
				+
			
 
				+markdown_texts = pipeline.concatenate_markdown_pages(markdown_list)
			
 
				+
			
 
				+mkd_file_path = output_path / f"{Path(input_file).stem}.md"
			
 
				+mkd_file_path.parent.mkdir(parents=True, exist_ok=True)
			
 
				+
			
 
				+with open(mkd_file_path, "w", encoding="utf-8") as f:
			
 
				+    f.write(markdown_texts)
			
 
				+
			
 
				+for item in markdown_images:
			
 
				+    if item:
			
 
				+        for path, image in item.items():
			
 
				+            file_path = output_path / path
			
 
				+            file_path.parent.mkdir(parents=True, exist_ok=True)
			
 
				+            image.save(file_path)
			
 
				+
			
 
				+# 统计耗时
			
 
				+end_time = time.time()
			
 
				+elapsed_time = end_time - start_time
			
 
				+print(f"Total time taken: {elapsed_time:.2f} seconds")