convert_pdf.rst 1.2 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849
  1. Convert PDF
  2. ============
  3. Command Line
  4. ^^^^^^^^^^^^^
  5. .. code:: bash
  6. # make sure the file has the correct suffix
  7. magic-pdf -p a.pdf -o output -m auto
  8. API
  9. ^^^^^^
  10. .. code:: python
  11. import os
  12. from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
  13. from magic_pdf.data.dataset import PymuDocDataset
  14. from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
  15. # args
  16. pdf_file_name = "abc.pdf" # replace with the real pdf path; the output name below is the text before the first "."
  17. name_without_suff = pdf_file_name.split(".")[0]
  18. # prepare env
  19. local_image_dir, local_md_dir = "output/images", "output"
  20. image_dir = str(os.path.basename(local_image_dir))
  21. os.makedirs(local_image_dir, exist_ok=True)
  22. image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
  23. local_md_dir
  24. )
  25. # read bytes
  26. reader1 = FileBasedDataReader("")
  27. pdf_bytes = reader1.read(pdf_file_name) # read the pdf content
  28. # proc
  29. ## Create Dataset Instance
  30. ds = PymuDocDataset(pdf_bytes)
  31. ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)