pipes.py 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191
  1. import copy
  2. import json
  3. import os
  4. from typing import Callable
  5. from magic_pdf.config.make_content_config import DropMode, MakeMode
  6. from magic_pdf.data.data_reader_writer import DataWriter
  7. from magic_pdf.data.dataset import Dataset
  8. from magic_pdf.dict2md.ocr_mkcontent import union_make
  9. from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
  10. draw_span_bbox)
  11. from magic_pdf.libs.json_compressor import JsonCompressor
  12. class PipeResult:
  13. def __init__(self, pipe_res, dataset: Dataset):
  14. """Initialized.
  15. Args:
  16. pipe_res (list[dict]): the pipeline processed result of model inference result
  17. dataset (Dataset): the dataset associated with pipe_res
  18. """
  19. self._pipe_res = pipe_res
  20. self._dataset = dataset
  21. def get_markdown(
  22. self,
  23. img_dir_or_bucket_prefix: str,
  24. drop_mode=DropMode.NONE,
  25. md_make_mode=MakeMode.MM_MD,
  26. ) -> str:
  27. """Get markdown content.
  28. Args:
  29. img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
  30. drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
  31. md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.
  32. Returns:
  33. str: return markdown content
  34. """
  35. pdf_info_list = self._pipe_res['pdf_info']
  36. md_content = union_make(
  37. pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix
  38. )
  39. return md_content
  40. def dump_md(
  41. self,
  42. writer: DataWriter,
  43. file_path: str,
  44. img_dir_or_bucket_prefix: str,
  45. drop_mode=DropMode.NONE,
  46. md_make_mode=MakeMode.MM_MD,
  47. ):
  48. """Dump The Markdown.
  49. Args:
  50. writer (DataWriter): File writer handle
  51. file_path (str): The file location of markdown
  52. img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
  53. drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
  54. md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.
  55. """
  56. md_content = self.get_markdown(
  57. img_dir_or_bucket_prefix, drop_mode=drop_mode, md_make_mode=md_make_mode
  58. )
  59. writer.write_string(file_path, md_content)
  60. def get_content_list(
  61. self,
  62. image_dir_or_bucket_prefix: str,
  63. drop_mode=DropMode.NONE,
  64. ) -> str:
  65. """Get Content List.
  66. Args:
  67. image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
  68. drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
  69. Returns:
  70. str: content list content
  71. """
  72. pdf_info_list = self._pipe_res['pdf_info']
  73. content_list = union_make(
  74. pdf_info_list,
  75. MakeMode.STANDARD_FORMAT,
  76. drop_mode,
  77. image_dir_or_bucket_prefix,
  78. )
  79. return content_list
  80. def dump_content_list(
  81. self,
  82. writer: DataWriter,
  83. file_path: str,
  84. image_dir_or_bucket_prefix: str,
  85. drop_mode=DropMode.NONE,
  86. ):
  87. """Dump Content List.
  88. Args:
  89. writer (DataWriter): File writer handle
  90. file_path (str): The file location of content list
  91. image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
  92. drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
  93. """
  94. content_list = self.get_content_list(
  95. image_dir_or_bucket_prefix, drop_mode=drop_mode,
  96. )
  97. writer.write_string(
  98. file_path, json.dumps(content_list, ensure_ascii=False, indent=4)
  99. )
  100. def get_middle_json(self) -> str:
  101. """Get middle json.
  102. Returns:
  103. str: The content of middle json
  104. """
  105. return json.dumps(self._pipe_res, ensure_ascii=False, indent=4)
  106. def dump_middle_json(self, writer: DataWriter, file_path: str):
  107. """Dump the result of pipeline.
  108. Args:
  109. writer (DataWriter): File writer handler
  110. file_path (str): The file location of middle json
  111. """
  112. middle_json = self.get_middle_json()
  113. writer.write_string(file_path, middle_json)
  114. def draw_layout(self, file_path: str) -> None:
  115. """Draw the layout.
  116. Args:
  117. file_path (str): The file location of layout result file
  118. """
  119. dir_name = os.path.dirname(file_path)
  120. base_name = os.path.basename(file_path)
  121. if not os.path.exists(dir_name):
  122. os.makedirs(dir_name, exist_ok=True)
  123. pdf_info = self._pipe_res['pdf_info']
  124. draw_layout_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
  125. def draw_span(self, file_path: str):
  126. """Draw the Span.
  127. Args:
  128. file_path (str): The file location of span result file
  129. """
  130. dir_name = os.path.dirname(file_path)
  131. base_name = os.path.basename(file_path)
  132. if not os.path.exists(dir_name):
  133. os.makedirs(dir_name, exist_ok=True)
  134. pdf_info = self._pipe_res['pdf_info']
  135. draw_span_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
  136. def draw_line_sort(self, file_path: str):
  137. """Draw line sort.
  138. Args:
  139. file_path (str): The file location of line sort result file
  140. """
  141. dir_name = os.path.dirname(file_path)
  142. base_name = os.path.basename(file_path)
  143. if not os.path.exists(dir_name):
  144. os.makedirs(dir_name, exist_ok=True)
  145. pdf_info = self._pipe_res['pdf_info']
  146. draw_line_sort_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
  147. def get_compress_pdf_mid_data(self):
  148. """Compress the pipeline result.
  149. Returns:
  150. str: compress the pipeline result and return
  151. """
  152. return JsonCompressor.compress_json(self._pipe_res)
  153. def apply(self, proc: Callable, *args, **kwargs):
  154. """Apply callable method which.
  155. Args:
  156. proc (Callable): invoke proc as follows:
  157. proc(pipeline_result, *args, **kwargs)
  158. Returns:
  159. Any: return the result generated by proc
  160. """
  161. return proc(copy.deepcopy(self._pipe_res), *args, **kwargs)