ocr_demo.py 1.2 KB

12345678910111213141516171819202122232425262728293031323334353637
  1. import json
  2. import os
  3. from loguru import logger
  4. from magic_pdf.dict2md.ocr_mkcontent import mk_nlp_markdown
  5. from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
  6. def save_markdown(markdown_text, input_filepath):
  7. # 获取输入文件的目录
  8. directory = os.path.dirname(input_filepath)
  9. # 获取输入文件的文件名(不带扩展名)
  10. base_name = os.path.basename(input_filepath)
  11. file_name_without_ext = os.path.splitext(base_name)[0]
  12. # 定义输出文件的路径
  13. output_filepath = os.path.join(directory, f"{file_name_without_ext}.md")
  14. # 将Markdown文本写入.md文件
  15. with open(output_filepath, 'w', encoding='utf-8') as file:
  16. file.write(markdown_text)
  17. def read_json_file(file_path):
  18. with open(file_path, 'r') as f:
  19. data = json.load(f)
  20. return data
  21. if __name__ == '__main__':
  22. ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_0.json"
  23. ocr_pdf_info = read_json_file(ocr_json_file_path)
  24. pdf_info_dict = parse_pdf_by_ocr(ocr_pdf_info)
  25. markdown_text = mk_nlp_markdown(pdf_info_dict)
  26. logger.info(markdown_text)
  27. save_markdown(markdown_text, ocr_json_file_path)