| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980 |
- from loguru import logger
- from magic_pdf.config.make_content_config import DropMode, MakeMode
- from magic_pdf.data.data_reader_writer import DataWriter
- from magic_pdf.data.dataset import Dataset
- from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
- from magic_pdf.pipe.AbsPipe import AbsPipe
- from magic_pdf.user_api import parse_ocr_pdf
- class OCRPipe(AbsPipe):
- def __init__(
- self,
- dataset: Dataset,
- model_list: list,
- image_writer: DataWriter,
- is_debug: bool = False,
- start_page_id=0,
- end_page_id=None,
- lang=None,
- layout_model=None,
- formula_enable=None,
- table_enable=None,
- ):
- super().__init__(
- dataset,
- model_list,
- image_writer,
- is_debug,
- start_page_id,
- end_page_id,
- lang,
- layout_model,
- formula_enable,
- table_enable,
- )
- def pipe_classify(self):
- pass
- def pipe_analyze(self):
- self.infer_res = doc_analyze(
- self.dataset,
- ocr=True,
- start_page_id=self.start_page_id,
- end_page_id=self.end_page_id,
- lang=self.lang,
- layout_model=self.layout_model,
- formula_enable=self.formula_enable,
- table_enable=self.table_enable,
- )
- def pipe_parse(self):
- self.pdf_mid_data = parse_ocr_pdf(
- self.dataset,
- self.infer_res,
- self.image_writer,
- is_debug=self.is_debug,
- start_page_id=self.start_page_id,
- end_page_id=self.end_page_id,
- lang=self.lang,
- layout_model=self.layout_model,
- formula_enable=self.formula_enable,
- table_enable=self.table_enable,
- )
- def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
- result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
- logger.info('ocr_pipe mk content list finished')
- return result
- def pipe_mk_markdown(
- self,
- img_parent_path: str,
- drop_mode=DropMode.WHOLE_PDF,
- md_make_mode=MakeMode.MM_MD,
- ):
- result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
- logger.info(f'ocr_pipe mk {md_make_mode} finished')
- return result
|