operators.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178
  1. import copy
  2. import json
  3. import os
  4. from typing import Callable
  5. from magic_pdf.config.enums import SupportedPdfParseMethod
  6. from magic_pdf.data.data_reader_writer import DataWriter
  7. from magic_pdf.data.dataset import Dataset
  8. from magic_pdf.filter import classify
  9. from magic_pdf.libs.draw_bbox import draw_model_bbox
  10. from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
  11. from magic_pdf.pipe.operators import PipeResult
  12. from magic_pdf.model import InferenceResultBase
  13. class InferenceResult(InferenceResultBase):
  14. def __init__(self, inference_results: list, dataset: Dataset):
  15. """Initialized method.
  16. Args:
  17. inference_results (list): the inference result generated by model
  18. dataset (Dataset): the dataset related with model inference result
  19. """
  20. self._infer_res = inference_results
  21. self._dataset = dataset
  22. def draw_model(self, file_path: str) -> None:
  23. """Draw model inference result.
  24. Args:
  25. file_path (str): the output file path
  26. """
  27. dir_name = os.path.dirname(file_path)
  28. base_name = os.path.basename(file_path)
  29. if not os.path.exists(dir_name):
  30. os.makedirs(dir_name, exist_ok=True)
  31. draw_model_bbox(
  32. copy.deepcopy(self._infer_res), self._dataset, dir_name, base_name
  33. )
  34. def dump_model(self, writer: DataWriter, file_path: str):
  35. """Dump model inference result to file.
  36. Args:
  37. writer (DataWriter): writer handle
  38. file_path (str): the location of target file
  39. """
  40. writer.write_string(
  41. file_path, json.dumps(self._infer_res, ensure_ascii=False, indent=4)
  42. )
  43. def get_infer_res(self):
  44. """Get the inference result.
  45. Returns:
  46. list: the inference result generated by model
  47. """
  48. return self._infer_res
  49. def apply(self, proc: Callable, *args, **kwargs):
  50. """Apply callable method which.
  51. Args:
  52. proc (Callable): invoke proc as follows:
  53. proc(inference_result, *args, **kwargs)
  54. Returns:
  55. Any: return the result generated by proc
  56. """
  57. return proc(copy.deepcopy(self._infer_res), *args, **kwargs)
  58. def pipe_auto_mode(
  59. self,
  60. imageWriter: DataWriter,
  61. start_page_id=0,
  62. end_page_id=None,
  63. debug_mode=False,
  64. lang=None,
  65. ) -> PipeResult:
  66. """Post-proc the model inference result.
  67. step1: classify the dataset type
  68. step2: based the result of step1, using `pipe_txt_mode` or `pipe_ocr_mode`
  69. Args:
  70. imageWriter (DataWriter): the image writer handle
  71. start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
  72. end_page_id (int, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process
  73. debug_mode (bool, optional): Defaults to False. will dump more log if enabled
  74. lang (str, optional): Defaults to None.
  75. Returns:
  76. PipeResult: the result
  77. """
  78. pdf_proc_method = classify(self._dataset.data_bits())
  79. if pdf_proc_method == SupportedPdfParseMethod.TXT:
  80. return self.pipe_txt_mode(
  81. imageWriter, start_page_id, end_page_id, debug_mode, lang
  82. )
  83. else:
  84. return self.pipe_ocr_mode(
  85. imageWriter, start_page_id, end_page_id, debug_mode, lang
  86. )
  87. def pipe_txt_mode(
  88. self,
  89. imageWriter: DataWriter,
  90. start_page_id=0,
  91. end_page_id=None,
  92. debug_mode=False,
  93. lang=None,
  94. ) -> PipeResult:
  95. """Post-proc the model inference result, Extract the text using the
  96. third library, such as `pymupdf`
  97. Args:
  98. imageWriter (DataWriter): the image writer handle
  99. start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
  100. end_page_id (int, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process
  101. debug_mode (bool, optional): Defaults to False. will dump more log if enabled
  102. lang (str, optional): Defaults to None.
  103. Returns:
  104. PipeResult: the result
  105. """
  106. def proc(*args, **kwargs) -> PipeResult:
  107. res = pdf_parse_union(*args, **kwargs)
  108. return PipeResult(res, self._dataset)
  109. return self.apply(
  110. proc,
  111. self._dataset,
  112. imageWriter,
  113. SupportedPdfParseMethod.TXT,
  114. start_page_id=start_page_id,
  115. end_page_id=end_page_id,
  116. debug_mode=debug_mode,
  117. lang=lang,
  118. )
  119. def pipe_ocr_mode(
  120. self,
  121. imageWriter: DataWriter,
  122. start_page_id=0,
  123. end_page_id=None,
  124. debug_mode=False,
  125. lang=None,
  126. ) -> PipeResult:
  127. """Post-proc the model inference result, Extract the text using `OCR`
  128. technical.
  129. Args:
  130. imageWriter (DataWriter): the image writer handle
  131. start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
  132. end_page_id (int, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process
  133. debug_mode (bool, optional): Defaults to False. will dump more log if enabled
  134. lang (str, optional): Defaults to None.
  135. Returns:
  136. PipeResult: the result
  137. """
  138. def proc(*args, **kwargs) -> PipeResult:
  139. res = pdf_parse_union(*args, **kwargs)
  140. return PipeResult(res, self._dataset)
  141. return self.apply(
  142. proc,
  143. self._dataset,
  144. imageWriter,
  145. SupportedPdfParseMethod.TXT,
  146. start_page_id=start_page_id,
  147. end_page_id=end_page_id,
  148. debug_mode=debug_mode,
  149. lang=lang,
  150. )