OCRPipe.py 2.3 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980
  1. from loguru import logger
  2. from magic_pdf.config.make_content_config import DropMode, MakeMode
  3. from magic_pdf.data.data_reader_writer import DataWriter
  4. from magic_pdf.data.dataset import Dataset
  5. from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
  6. from magic_pdf.pipe.AbsPipe import AbsPipe
  7. from magic_pdf.user_api import parse_ocr_pdf
  8. class OCRPipe(AbsPipe):
  9. def __init__(
  10. self,
  11. dataset: Dataset,
  12. model_list: list,
  13. image_writer: DataWriter,
  14. is_debug: bool = False,
  15. start_page_id=0,
  16. end_page_id=None,
  17. lang=None,
  18. layout_model=None,
  19. formula_enable=None,
  20. table_enable=None,
  21. ):
  22. super().__init__(
  23. dataset,
  24. model_list,
  25. image_writer,
  26. is_debug,
  27. start_page_id,
  28. end_page_id,
  29. lang,
  30. layout_model,
  31. formula_enable,
  32. table_enable,
  33. )
  34. def pipe_classify(self):
  35. pass
  36. def pipe_analyze(self):
  37. self.infer_res = doc_analyze(
  38. self.dataset,
  39. ocr=True,
  40. start_page_id=self.start_page_id,
  41. end_page_id=self.end_page_id,
  42. lang=self.lang,
  43. layout_model=self.layout_model,
  44. formula_enable=self.formula_enable,
  45. table_enable=self.table_enable,
  46. )
  47. def pipe_parse(self):
  48. self.pdf_mid_data = parse_ocr_pdf(
  49. self.dataset,
  50. self.infer_res,
  51. self.image_writer,
  52. is_debug=self.is_debug,
  53. start_page_id=self.start_page_id,
  54. end_page_id=self.end_page_id,
  55. lang=self.lang,
  56. layout_model=self.layout_model,
  57. formula_enable=self.formula_enable,
  58. table_enable=self.table_enable,
  59. )
  60. def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
  61. result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
  62. logger.info('ocr_pipe mk content list finished')
  63. return result
  64. def pipe_mk_markdown(
  65. self,
  66. img_parent_path: str,
  67. drop_mode=DropMode.WHOLE_PDF,
  68. md_make_mode=MakeMode.MM_MD,
  69. ):
  70. result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
  71. logger.info(f'ocr_pipe mk {md_make_mode} finished')
  72. return result