# pipes.py (7.0 KB)
  1. import copy
  2. import json
  3. import os
  4. from typing import Callable
  5. from magic_pdf.config.make_content_config import DropMode, MakeMode
  6. from magic_pdf.data.data_reader_writer import DataWriter
  7. from magic_pdf.data.dataset import Dataset
  8. from magic_pdf.dict2md.ocr_mkcontent import union_make
  9. from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
  10. draw_span_bbox)
  11. from magic_pdf.libs.json_compressor import JsonCompressor
  12. class PipeResult:
  13. def __init__(self, pipe_res, dataset: Dataset):
  14. """Initialized.
  15. Args:
  16. pipe_res (list[dict]): the pipeline processed result of model inference result
  17. dataset (Dataset): the dataset associated with pipe_res
  18. """
  19. self._pipe_res = pipe_res
  20. self._dataset = dataset
  21. def get_markdown(
  22. self,
  23. img_dir_or_bucket_prefix: str,
  24. drop_mode=DropMode.WHOLE_PDF,
  25. md_make_mode=MakeMode.MM_MD,
  26. ) -> str:
  27. """Get markdown content.
  28. Args:
  29. img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
  30. drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF.
  31. md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.
  32. Returns:
  33. str: return markdown content
  34. """
  35. pdf_info_list = self._pipe_res['pdf_info']
  36. md_content = union_make(
  37. pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix
  38. )
  39. return md_content
  40. def dump_md(
  41. self,
  42. writer: DataWriter,
  43. file_path: str,
  44. img_dir_or_bucket_prefix: str,
  45. drop_mode=DropMode.WHOLE_PDF,
  46. md_make_mode=MakeMode.MM_MD,
  47. ):
  48. """Dump The Markdown.
  49. Args:
  50. writer (DataWriter): File writer handle
  51. file_path (str): The file location of markdown
  52. img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
  53. drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF.
  54. md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.
  55. """
  56. md_content = self.get_markdown(
  57. img_dir_or_bucket_prefix, drop_mode=drop_mode, md_make_mode=md_make_mode
  58. )
  59. writer.write_string(file_path, md_content)
  60. def get_content_list(
  61. self,
  62. image_dir_or_bucket_prefix: str,
  63. drop_mode=DropMode.NONE,
  64. md_make_mode=MakeMode.STANDARD_FORMAT,
  65. ) -> str:
  66. """Get Content List.
  67. Args:
  68. image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
  69. drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
  70. md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.STANDARD_FORMAT.
  71. Returns:
  72. str: content list content
  73. """
  74. pdf_info_list = self._pipe_res['pdf_info']
  75. content_list = union_make(
  76. pdf_info_list,
  77. md_make_mode,
  78. drop_mode,
  79. image_dir_or_bucket_prefix,
  80. )
  81. return content_list
  82. def dump_content_list(
  83. self,
  84. writer: DataWriter,
  85. file_path: str,
  86. image_dir_or_bucket_prefix: str,
  87. drop_mode=DropMode.NONE,
  88. md_make_mode=MakeMode.STANDARD_FORMAT,
  89. ):
  90. """Dump Content List.
  91. Args:
  92. writer (DataWriter): File writer handle
  93. file_path (str): The file location of content list
  94. image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
  95. drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
  96. md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.STANDARD_FORMAT.
  97. """
  98. content_list = self.get_content_list(
  99. image_dir_or_bucket_prefix, drop_mode=drop_mode, md_make_mode=md_make_mode
  100. )
  101. writer.write_string(
  102. file_path, json.dumps(content_list, ensure_ascii=False, indent=4)
  103. )
  104. def get_middle_json(self) -> str:
  105. """Get middle json.
  106. Returns:
  107. str: The content of middle json
  108. """
  109. return json.dumps(self._pipe_res, ensure_ascii=False, indent=4)
  110. def dump_middle_json(self, writer: DataWriter, file_path: str):
  111. """Dump the result of pipeline.
  112. Args:
  113. writer (DataWriter): File writer handler
  114. file_path (str): The file location of middle json
  115. """
  116. middle_json = self.get_middle_json()
  117. writer.write_string(file_path, middle_json)
  118. def draw_layout(self, file_path: str) -> None:
  119. """Draw the layout.
  120. Args:
  121. file_path (str): The file location of layout result file
  122. """
  123. dir_name = os.path.dirname(file_path)
  124. base_name = os.path.basename(file_path)
  125. if not os.path.exists(dir_name):
  126. os.makedirs(dir_name, exist_ok=True)
  127. pdf_info = self._pipe_res['pdf_info']
  128. draw_layout_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
  129. def draw_span(self, file_path: str):
  130. """Draw the Span.
  131. Args:
  132. file_path (str): The file location of span result file
  133. """
  134. dir_name = os.path.dirname(file_path)
  135. base_name = os.path.basename(file_path)
  136. if not os.path.exists(dir_name):
  137. os.makedirs(dir_name, exist_ok=True)
  138. pdf_info = self._pipe_res['pdf_info']
  139. draw_span_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
  140. def draw_line_sort(self, file_path: str):
  141. """Draw line sort.
  142. Args:
  143. file_path (str): The file location of line sort result file
  144. """
  145. dir_name = os.path.dirname(file_path)
  146. base_name = os.path.basename(file_path)
  147. if not os.path.exists(dir_name):
  148. os.makedirs(dir_name, exist_ok=True)
  149. pdf_info = self._pipe_res['pdf_info']
  150. draw_line_sort_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
  151. def get_compress_pdf_mid_data(self):
  152. """Compress the pipeline result.
  153. Returns:
  154. str: compress the pipeline result and return
  155. """
  156. return JsonCompressor.compress_json(self._pipe_res)
  157. def apply(self, proc: Callable, *args, **kwargs):
  158. """Apply callable method which.
  159. Args:
  160. proc (Callable): invoke proc as follows:
  161. proc(pipeline_result, *args, **kwargs)
  162. Returns:
  163. Any: return the result generated by proc
  164. """
  165. return proc(copy.deepcopy(self._pipe_res), *args, **kwargs)