# ocr_demo.py - demo script: build Markdown from an OCR JSON file and save it next to the input.
import os
import sys

from loguru import logger
from magic_pdf.dict2md.ocr_mkcontent import mk_nlp_markdown
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
  5. def save_markdown(markdown_text, input_filepath):
  6. # 获取输入文件的目录
  7. directory = os.path.dirname(input_filepath)
  8. # 获取输入文件的文件名(不带扩展名)
  9. base_name = os.path.basename(input_filepath)
  10. file_name_without_ext = os.path.splitext(base_name)[0]
  11. # 定义输出文件的路径
  12. output_filepath = os.path.join(directory, f"{file_name_without_ext}.md")
  13. # 将Markdown文本写入.md文件
  14. with open(output_filepath, 'w', encoding='utf-8') as file:
  15. file.write(markdown_text)
  16. if __name__ == '__main__':
  17. ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_0.json"
  18. pdf_info_dict = parse_pdf_by_ocr(ocr_json_file_path)
  19. markdown_text = mk_nlp_markdown(pdf_info_dict)
  20. logger.info(markdown_text)
  21. save_markdown(markdown_text, ocr_json_file_path)