pdf_parse_by_txt.py 762 B

123456789101112131415161718192021222324
  1. from magic_pdf.config.enums import SupportedPdfParseMethod
  2. from magic_pdf.data.dataset import PymuDocDataset
  3. from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
  4. def parse_pdf_by_txt(
  5. pdf_bytes,
  6. model_list,
  7. imageWriter,
  8. start_page_id=0,
  9. end_page_id=None,
  10. debug_mode=False,
  11. lang=None,
  12. ):
  13. dataset = PymuDocDataset(pdf_bytes)
  14. return pdf_parse_union(dataset,
  15. model_list,
  16. imageWriter,
  17. SupportedPdfParseMethod.TXT,
  18. start_page_id=start_page_id,
  19. end_page_id=end_page_id,
  20. debug_mode=debug_mode,
  21. lang=lang,
  22. )