convert_pdf.rst 1.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556
  1. Convert PDF
  2. ============
  3. Command Line
  4. ^^^^^^^^^^^^^
  5. .. code:: bash
  6. # make sure the input file has the correct suffix (.pdf)
  7. magic-pdf -p a.pdf -o output -m auto
  8. API
  9. ^^^^^^
  10. .. code:: python
  # Example: convert a single PDF to Markdown with the magic_pdf Python API.
  import os

  from magic_pdf.config.enums import SupportedPdfParseMethod
  from magic_pdf.data.data_reader_writer import FileBasedDataReader, FileBasedDataWriter
  from magic_pdf.data.dataset import PymuDocDataset
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

  # args
  pdf_file_name = "abc.pdf"  # replace with the real pdf path
  # Strip the directory and only the final extension, so inputs such as
  # "./docs/report.v2.pdf" yield "report.v2" instead of an empty or
  # truncated name.
  name_without_suff = os.path.splitext(os.path.basename(pdf_file_name))[0]

  # prepare env
  local_image_dir, local_md_dir = "output/images", "output"
  # Last path component of the image directory ("images"); passed to
  # dump_md together with the markdown writer.
  image_dir = os.path.basename(local_image_dir)
  os.makedirs(local_image_dir, exist_ok=True)

  image_writer = FileBasedDataWriter(local_image_dir)
  md_writer = FileBasedDataWriter(local_md_dir)

  # read bytes
  reader1 = FileBasedDataReader("")
  pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content

  # proc
  ## Create Dataset Instance
  ds = PymuDocDataset(pdf_bytes)

  ## inference
  # classify() decides which branch runs: the OCR result selects the OCR
  # pipeline, anything else selects the text pipeline.
  use_ocr = ds.classify() == SupportedPdfParseMethod.OCR
  infer_result = ds.apply(doc_analyze, ocr=use_ocr)
  if use_ocr:
      pipe_result = infer_result.pipe_ocr_mode(image_writer)
  else:
      pipe_result = infer_result.pipe_txt_mode(image_writer)

  # Write "<name_without_suff>.md" through md_writer.
  pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)