OCRPipe.py 1.2 KB

12345678910111213141516171819202122232425262728
  1. from magic_pdf.libs.MakeContentConfig import DropMode
  2. from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
  3. from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
  4. from magic_pdf.pipe.AbsPipe import AbsPipe
  5. from magic_pdf.user_api import parse_ocr_pdf
  6. class OCRPipe(AbsPipe):
  7. def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool=False):
  8. super().__init__(pdf_bytes, model_list, image_writer, is_debug)
  9. def pipe_classify(self):
  10. pass
  11. def pipe_analyze(self):
  12. self.model_list = doc_analyze(self.pdf_bytes, ocr=True)
  13. def pipe_parse(self):
  14. self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
  15. def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
  16. content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
  17. return content_list
  18. def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
  19. md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
  20. return md_content