operators.py 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138
  1. import json
  2. import os
  3. from typing import Callable
  4. import copy
  5. from magic_pdf.config.make_content_config import DropMode, MakeMode
  6. from magic_pdf.data.data_reader_writer import DataWriter
  7. from magic_pdf.data.dataset import Dataset
  8. from magic_pdf.dict2md.ocr_mkcontent import union_make
  9. from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
  10. draw_span_bbox)
  11. from magic_pdf.libs.json_compressor import JsonCompressor
  12. class PipeResult:
  13. def __init__(self, pipe_res, dataset: Dataset):
  14. """Initialized.
  15. Args:
  16. pipe_res (list[dict]): the pipeline processed result of model inference result
  17. dataset (Dataset): the dataset associated with pipe_res
  18. """
  19. self._pipe_res = pipe_res
  20. self._dataset = dataset
  21. def dump_md(
  22. self,
  23. writer: DataWriter,
  24. file_path: str,
  25. img_dir_or_bucket_prefix: str,
  26. drop_mode=DropMode.WHOLE_PDF,
  27. md_make_mode=MakeMode.MM_MD,
  28. ):
  29. """Dump The Markdown.
  30. Args:
  31. writer (DataWriter): File writer handle
  32. file_path (str): The file location of markdown
  33. img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
  34. drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF.
  35. md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.
  36. """
  37. pdf_info_list = self._pipe_res['pdf_info']
  38. md_content = union_make(
  39. pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix
  40. )
  41. writer.write_string(file_path, md_content)
  42. def dump_content_list(
  43. self, writer: DataWriter, file_path: str, image_dir_or_bucket_prefix: str
  44. ):
  45. """Dump Content List.
  46. Args:
  47. writer (DataWriter): File writer handle
  48. file_path (str): The file location of content list
  49. image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
  50. """
  51. pdf_info_list = self._pipe_res['pdf_info']
  52. content_list = union_make(
  53. pdf_info_list,
  54. MakeMode.STANDARD_FORMAT,
  55. DropMode.NONE,
  56. image_dir_or_bucket_prefix,
  57. )
  58. writer.write_string(
  59. file_path, json.dumps(content_list, ensure_ascii=False, indent=4)
  60. )
  61. def dump_middle_json(self, writer: DataWriter, file_path: str):
  62. """Dump the result of pipeline.
  63. Args:
  64. writer (DataWriter): File writer handler
  65. file_path (str): The file location of middle json
  66. """
  67. writer.write_string(
  68. file_path, json.dumps(self._pipe_res, ensure_ascii=False, indent=4)
  69. )
  70. def draw_layout(self, file_path: str) -> None:
  71. """Draw the layout.
  72. Args:
  73. file_path (str): The file location of layout result file
  74. """
  75. dir_name = os.path.dirname(file_path)
  76. base_name = os.path.basename(file_path)
  77. if not os.path.exists(dir_name):
  78. os.makedirs(dir_name, exist_ok=True)
  79. pdf_info = self._pipe_res['pdf_info']
  80. draw_layout_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
  81. def draw_span(self, file_path: str):
  82. """Draw the Span.
  83. Args:
  84. file_path (str): The file location of span result file
  85. """
  86. dir_name = os.path.dirname(file_path)
  87. base_name = os.path.basename(file_path)
  88. if not os.path.exists(dir_name):
  89. os.makedirs(dir_name, exist_ok=True)
  90. pdf_info = self._pipe_res['pdf_info']
  91. draw_span_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
  92. def draw_line_sort(self, file_path: str):
  93. """Draw line sort.
  94. Args:
  95. file_path (str): The file location of line sort result file
  96. """
  97. dir_name = os.path.dirname(file_path)
  98. base_name = os.path.basename(file_path)
  99. if not os.path.exists(dir_name):
  100. os.makedirs(dir_name, exist_ok=True)
  101. pdf_info = self._pipe_res['pdf_info']
  102. draw_line_sort_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
  103. def get_compress_pdf_mid_data(self):
  104. """Compress the pipeline result.
  105. Returns:
  106. str: compress the pipeline result and return
  107. """
  108. return JsonCompressor.compress_json(self.pdf_mid_data)
  109. def apply(self, proc: Callable, *args, **kwargs):
  110. """Apply callable method which.
  111. Args:
  112. proc (Callable): invoke proc as follows:
  113. proc(pipeline_result, *args, **kwargs)
  114. Returns:
  115. Any: return the result generated by proc
  116. """
  117. return proc(copy.deepcopy(self._pipe_res), *args, **kwargs)