operators.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177
  1. import copy
  2. import json
  3. import os
  4. from typing import Callable
  5. from magic_pdf.config.enums import SupportedPdfParseMethod
  6. from magic_pdf.data.data_reader_writer import DataWriter
  7. from magic_pdf.data.dataset import Dataset
  8. from magic_pdf.filter import classify
  9. from magic_pdf.libs.draw_bbox import draw_model_bbox
  10. from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
  11. from magic_pdf.pipe.operators import PipeResult
  12. class InferenceResult:
  13. def __init__(self, inference_results: list, dataset: Dataset):
  14. """Initialized method.
  15. Args:
  16. inference_results (list): the inference result generated by model
  17. dataset (Dataset): the dataset related with model inference result
  18. """
  19. self._infer_res = inference_results
  20. self._dataset = dataset
  21. def draw_model(self, file_path: str) -> None:
  22. """Draw model inference result.
  23. Args:
  24. file_path (str): the output file path
  25. """
  26. dir_name = os.path.dirname(file_path)
  27. base_name = os.path.basename(file_path)
  28. if not os.path.exists(dir_name):
  29. os.makedirs(dir_name, exist_ok=True)
  30. draw_model_bbox(
  31. copy.deepcopy(self._infer_res), self._dataset, dir_name, base_name
  32. )
  33. def dump_model(self, writer: DataWriter, file_path: str):
  34. """Dump model inference result to file.
  35. Args:
  36. writer (DataWriter): writer handle
  37. file_path (str): the location of target file
  38. """
  39. writer.write_string(
  40. file_path, json.dumps(self._infer_res, ensure_ascii=False, indent=4)
  41. )
  42. def get_infer_res(self):
  43. """Get the inference result.
  44. Returns:
  45. list[dict]: the inference result generated by model
  46. """
  47. return self._infer_res
  48. def apply(self, proc: Callable, *args, **kwargs):
  49. """Apply callable method which.
  50. Args:
  51. proc (Callable): invoke proc as follows:
  52. proc(inference_result, *args, **kwargs)
  53. Returns:
  54. Any: return the result generated by proc
  55. """
  56. return proc(copy.deepcopy(self._infer_res), *args, **kwargs)
  57. def pipe_auto_mode(
  58. self,
  59. imageWriter: DataWriter,
  60. start_page_id=0,
  61. end_page_id=None,
  62. debug_mode=False,
  63. lang=None,
  64. ) -> PipeResult:
  65. """Post-proc the model inference result.
  66. step1: classify the dataset type
  67. step2: based the result of step1, using `pipe_txt_mode` or `pipe_ocr_mode`
  68. Args:
  69. imageWriter (DataWriter): the image writer handle
  70. start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
  71. end_page_id (_type_, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process
  72. debug_mode (bool, optional): Defaults to False. will dump more log if enabled
  73. lang (_type_, optional): Defaults to None.
  74. Returns:
  75. PipeResult: the result
  76. """
  77. pdf_proc_method = classify(self._dataset.data_bits())
  78. if pdf_proc_method == SupportedPdfParseMethod.TXT:
  79. return self.pipe_txt_mode(
  80. imageWriter, start_page_id, end_page_id, debug_mode, lang
  81. )
  82. else:
  83. return self.pipe_ocr_mode(
  84. imageWriter, start_page_id, end_page_id, debug_mode, lang
  85. )
  86. def pipe_txt_mode(
  87. self,
  88. imageWriter: DataWriter,
  89. start_page_id=0,
  90. end_page_id=None,
  91. debug_mode=False,
  92. lang=None,
  93. ) -> PipeResult:
  94. """Post-proc the model inference result, Extract the text using the
  95. third library, such as `pymupdf`
  96. Args:
  97. imageWriter (DataWriter): the image writer handle
  98. start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
  99. end_page_id (_type_, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process
  100. debug_mode (bool, optional): Defaults to False. will dump more log if enabled
  101. lang (_type_, optional): Defaults to None.
  102. Returns:
  103. PipeResult: the result
  104. """
  105. def proc(*args, **kwargs) -> PipeResult:
  106. res = pdf_parse_union(*args, **kwargs)
  107. return PipeResult(res, self._dataset)
  108. return self.apply(
  109. proc,
  110. self._dataset,
  111. imageWriter,
  112. SupportedPdfParseMethod.TXT,
  113. start_page_id=start_page_id,
  114. end_page_id=end_page_id,
  115. debug_mode=debug_mode,
  116. lang=lang,
  117. )
  118. def pipe_ocr_mode(
  119. self,
  120. imageWriter: DataWriter,
  121. start_page_id=0,
  122. end_page_id=None,
  123. debug_mode=False,
  124. lang=None,
  125. ) -> PipeResult:
  126. """Post-proc the model inference result, Extract the text using `OCR`
  127. technical.
  128. Args:
  129. imageWriter (DataWriter): the image writer handle
  130. start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
  131. end_page_id (_type_, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process
  132. debug_mode (bool, optional): Defaults to False. will dump more log if enabled
  133. lang (_type_, optional): Defaults to None.
  134. Returns:
  135. PipeResult: the result
  136. """
  137. def proc(*args, **kwargs) -> PipeResult:
  138. res = pdf_parse_union(*args, **kwargs)
  139. return PipeResult(res, self._dataset)
  140. return self.apply(
  141. proc,
  142. self._dataset,
  143. imageWriter,
  144. SupportedPdfParseMethod.TXT,
  145. start_page_id=start_page_id,
  146. end_page_id=end_page_id,
  147. debug_mode=debug_mode,
  148. lang=lang,
  149. )