pdf_parse_by_ocr.py 880 B

1234567891011121314151617181920212223
  1. from magic_pdf.config.enums import SupportedPdfParseMethod
  2. from magic_pdf.data.dataset import PymuDocDataset
  3. from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
  4. def parse_pdf_by_ocr(pdf_bytes,
  5. model_list,
  6. imageWriter,
  7. start_page_id=0,
  8. end_page_id=None,
  9. debug_mode=False,
  10. lang=None,
  11. ):
  12. dataset = PymuDocDataset(pdf_bytes)
  13. return pdf_parse_union(dataset,
  14. model_list,
  15. imageWriter,
  16. SupportedPdfParseMethod.OCR,
  17. start_page_id=start_page_id,
  18. end_page_id=end_page_id,
  19. debug_mode=debug_mode,
  20. lang=lang,
  21. )