TXTPipe.py 1.7 KB

12345678910111213141516171819202122232425262728293031323334353637
  1. from loguru import logger
  2. from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
  3. from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
  4. from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
  5. from magic_pdf.libs.json_compressor import JsonCompressor
  6. from magic_pdf.pipe.AbsPipe import AbsPipe
  7. from magic_pdf.user_api import parse_txt_pdf
  8. class TXTPipe(AbsPipe):
  9. def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
  10. start_page_id=0, end_page_id=None, lang=None):
  11. super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang)
  12. def pipe_classify(self):
  13. pass
  14. def pipe_analyze(self):
  15. self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
  16. start_page_id=self.start_page_id, end_page_id=self.end_page_id,
  17. lang=self.lang)
  18. def pipe_parse(self):
  19. self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
  20. start_page_id=self.start_page_id, end_page_id=self.end_page_id)
  21. def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
  22. result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
  23. logger.info("txt_pipe mk content list finished")
  24. return result
  25. def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
  26. result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
  27. logger.info(f"txt_pipe mk {md_make_mode} finished")
  28. return result