operators.py 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124
  1. import json
  2. import os
  3. from magic_pdf.config.make_content_config import DropMode, MakeMode
  4. from magic_pdf.data.data_reader_writer import DataWriter
  5. from magic_pdf.data.dataset import Dataset
  6. from magic_pdf.dict2md.ocr_mkcontent import union_make
  7. from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
  8. draw_span_bbox)
  9. from magic_pdf.libs.json_compressor import JsonCompressor
  10. class PipeResult:
  11. def __init__(self, pipe_res, dataset: Dataset):
  12. """Initialized.
  13. Args:
  14. pipe_res (list[dict]): the pipeline processed result of model inference result
  15. dataset (Dataset): the dataset associated with pipe_res
  16. """
  17. self._pipe_res = pipe_res
  18. self._dataset = dataset
  19. def dump_md(
  20. self,
  21. writer: DataWriter,
  22. file_path: str,
  23. img_dir_or_bucket_prefix: str,
  24. drop_mode=DropMode.WHOLE_PDF,
  25. md_make_mode=MakeMode.MM_MD,
  26. ):
  27. """Dump The Markdown.
  28. Args:
  29. writer (DataWriter): File writer handle
  30. file_path (str): The file location of markdown
  31. img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
  32. drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF.
  33. md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.
  34. """
  35. pdf_info_list = self._pipe_res['pdf_info']
  36. md_content = union_make(
  37. pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix
  38. )
  39. writer.write_string(file_path, md_content)
  40. def dump_content_list(
  41. self, writer: DataWriter, file_path: str, image_dir_or_bucket_prefix: str
  42. ):
  43. """Dump Content List.
  44. Args:
  45. writer (DataWriter): File writer handle
  46. file_path (str): The file location of content list
  47. image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
  48. """
  49. pdf_info_list = self._pipe_res['pdf_info']
  50. content_list = union_make(
  51. pdf_info_list,
  52. MakeMode.STANDARD_FORMAT,
  53. DropMode.NONE,
  54. image_dir_or_bucket_prefix,
  55. )
  56. writer.write_string(
  57. file_path, json.dumps(content_list, ensure_ascii=False, indent=4)
  58. )
  59. def dump_middle_json(self, writer: DataWriter, file_path: str):
  60. """Dump the result of pipeline.
  61. Args:
  62. writer (DataWriter): File writer handler
  63. file_path (str): The file location of middle json
  64. """
  65. writer.write_string(
  66. file_path, json.dumps(self._pipe_res, ensure_ascii=False, indent=4)
  67. )
  68. def draw_layout(self, file_path: str) -> None:
  69. """Draw the layout.
  70. Args:
  71. file_path (str): The file location of layout result file
  72. """
  73. dir_name = os.path.dirname(file_path)
  74. base_name = os.path.basename(file_path)
  75. if not os.path.exists(dir_name):
  76. os.makedirs(dir_name, exist_ok=True)
  77. pdf_info = self._pipe_res['pdf_info']
  78. draw_layout_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
  79. def draw_span(self, file_path: str):
  80. """Draw the Span.
  81. Args:
  82. file_path (str): The file location of span result file
  83. """
  84. dir_name = os.path.dirname(file_path)
  85. base_name = os.path.basename(file_path)
  86. if not os.path.exists(dir_name):
  87. os.makedirs(dir_name, exist_ok=True)
  88. pdf_info = self._pipe_res['pdf_info']
  89. draw_span_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
  90. def draw_line_sort(self, file_path: str):
  91. """Draw line sort.
  92. Args:
  93. file_path (str): The file location of line sort result file
  94. """
  95. dir_name = os.path.dirname(file_path)
  96. base_name = os.path.basename(file_path)
  97. if not os.path.exists(dir_name):
  98. os.makedirs(dir_name, exist_ok=True)
  99. pdf_info = self._pipe_res['pdf_info']
  100. draw_line_sort_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
  101. def get_compress_pdf_mid_data(self):
  102. """Compress the pipeline result.
  103. Returns:
  104. str: compress the pipeline result and return
  105. """
  106. return JsonCompressor.compress_json(self.pdf_mid_data)