demo.py 993 B

123456789101112131415161718192021222324252627
  1. import os
  2. from loguru import logger
  3. from magic_pdf.data.data_reader_writer import FileBasedDataWriter
  4. from magic_pdf.data.dataset import PymuDocDataset
  5. from magic_pdf.pipe.UNIPipe import UNIPipe
  6. try:
  7. current_script_dir = os.path.dirname(os.path.abspath(__file__))
  8. demo_name = 'demo1'
  9. pdf_path = os.path.join(current_script_dir, f'{demo_name}.pdf')
  10. pdf_bytes = open(pdf_path, 'rb').read()
  11. jso_useful_key = {'_pdf_type': '', 'model_list': []}
  12. local_image_dir = os.path.join(current_script_dir, 'images')
  13. image_dir = str(os.path.basename(local_image_dir))
  14. image_writer = FileBasedDataWriter(local_image_dir)
  15. pipe = UNIPipe(PymuDocDataset(pdf_bytes), jso_useful_key, image_writer)
  16. pipe.pipe_classify()
  17. pipe.pipe_analyze()
  18. pipe.pipe_parse()
  19. md_content = pipe.pipe_mk_markdown(image_dir, drop_mode='none')
  20. with open(f'{demo_name}.md', 'w', encoding='utf-8') as f:
  21. f.write(md_content)
  22. except Exception as e:
  23. logger.exception(e)