operators.py 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180
  1. import copy
  2. import json
  3. import os
  4. from typing import Callable
  5. from magic_pdf.config.make_content_config import DropMode, MakeMode
  6. from magic_pdf.data.data_reader_writer import DataWriter
  7. from magic_pdf.data.dataset import Dataset
  8. from magic_pdf.dict2md.ocr_mkcontent import union_make
  9. from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
  10. draw_span_bbox)
  11. from magic_pdf.libs.json_compressor import JsonCompressor
  12. class PipeResult:
  13. def __init__(self, pipe_res, dataset: Dataset):
  14. """Initialized.
  15. Args:
  16. pipe_res (list[dict]): the pipeline processed result of model inference result
  17. dataset (Dataset): the dataset associated with pipe_res
  18. """
  19. self._pipe_res = pipe_res
  20. self._dataset = dataset
  21. def get_markdown(self,
  22. img_dir_or_bucket_prefix: str,
  23. drop_mode=DropMode.WHOLE_PDF,
  24. md_make_mode=MakeMode.MM_MD) -> str:
  25. """Get markdown content.
  26. Args:
  27. img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
  28. drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF.
  29. md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.
  30. Returns:
  31. str: return markdown content
  32. """
  33. pdf_info_list = self._pipe_res['pdf_info']
  34. md_content = union_make(
  35. pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix
  36. )
  37. return md_content
  38. def dump_md(
  39. self,
  40. writer: DataWriter,
  41. file_path: str,
  42. img_dir_or_bucket_prefix: str,
  43. drop_mode=DropMode.WHOLE_PDF,
  44. md_make_mode=MakeMode.MM_MD,
  45. ):
  46. """Dump The Markdown.
  47. Args:
  48. writer (DataWriter): File writer handle
  49. file_path (str): The file location of markdown
  50. img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
  51. drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF.
  52. md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.
  53. """
  54. md_content = self.get_markdown(img_dir_or_bucket_prefix, drop_mode=drop_mode, md_make_mode=md_make_mode)
  55. writer.write_string(file_path, md_content)
  56. def get_content_list(self,
  57. image_dir_or_bucket_prefix: str,
  58. drop_mode=DropMode.NONE,
  59. md_make_mode=MakeMode.STANDARD_FORMAT) -> str:
  60. """Get Content List.
  61. Args:
  62. image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
  63. drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
  64. md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.STANDARD_FORMAT.
  65. Returns:
  66. str: content list content
  67. """
  68. pdf_info_list = self._pipe_res['pdf_info']
  69. content_list = union_make(
  70. pdf_info_list,
  71. md_make_mode,
  72. drop_mode,
  73. image_dir_or_bucket_prefix,
  74. )
  75. return content_list
  76. def dump_content_list(
  77. self,
  78. writer: DataWriter,
  79. file_path: str,
  80. image_dir_or_bucket_prefix: str,
  81. drop_mode=DropMode.NONE,
  82. md_make_mode=MakeMode.STANDARD_FORMAT
  83. ):
  84. """Dump Content List.
  85. Args:
  86. writer (DataWriter): File writer handle
  87. file_path (str): The file location of content list
  88. image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
  89. drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
  90. md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.STANDARD_FORMAT.
  91. """
  92. content_list = self.get_content_list(image_dir_or_bucket_prefix, drop_mode=drop_mode, md_make_mode=md_make_mode)
  93. writer.write_string(
  94. file_path, json.dumps(content_list, ensure_ascii=False, indent=4)
  95. )
  96. def dump_middle_json(self, writer: DataWriter, file_path: str):
  97. """Dump the result of pipeline.
  98. Args:
  99. writer (DataWriter): File writer handler
  100. file_path (str): The file location of middle json
  101. """
  102. writer.write_string(
  103. file_path, json.dumps(self._pipe_res, ensure_ascii=False, indent=4)
  104. )
  105. def draw_layout(self, file_path: str) -> None:
  106. """Draw the layout.
  107. Args:
  108. file_path (str): The file location of layout result file
  109. """
  110. dir_name = os.path.dirname(file_path)
  111. base_name = os.path.basename(file_path)
  112. if not os.path.exists(dir_name):
  113. os.makedirs(dir_name, exist_ok=True)
  114. pdf_info = self._pipe_res['pdf_info']
  115. draw_layout_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
  116. def draw_span(self, file_path: str):
  117. """Draw the Span.
  118. Args:
  119. file_path (str): The file location of span result file
  120. """
  121. dir_name = os.path.dirname(file_path)
  122. base_name = os.path.basename(file_path)
  123. if not os.path.exists(dir_name):
  124. os.makedirs(dir_name, exist_ok=True)
  125. pdf_info = self._pipe_res['pdf_info']
  126. draw_span_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
  127. def draw_line_sort(self, file_path: str):
  128. """Draw line sort.
  129. Args:
  130. file_path (str): The file location of line sort result file
  131. """
  132. dir_name = os.path.dirname(file_path)
  133. base_name = os.path.basename(file_path)
  134. if not os.path.exists(dir_name):
  135. os.makedirs(dir_name, exist_ok=True)
  136. pdf_info = self._pipe_res['pdf_info']
  137. draw_line_sort_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
  138. def get_compress_pdf_mid_data(self):
  139. """Compress the pipeline result.
  140. Returns:
  141. str: compress the pipeline result and return
  142. """
  143. return JsonCompressor.compress_json(self._pipe_res)
  144. def apply(self, proc: Callable, *args, **kwargs):
  145. """Apply callable method which.
  146. Args:
  147. proc (Callable): invoke proc as follows:
  148. proc(pipeline_result, *args, **kwargs)
  149. Returns:
  150. Any: return the result generated by proc
  151. """
  152. return proc(copy.deepcopy(self._pipe_res), *args, **kwargs)