types.py 3.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162
  1. import json
  2. import os
  3. from magic_pdf.config.make_content_config import DropMode, MakeMode
  4. from magic_pdf.data.data_reader_writer import DataWriter
  5. from magic_pdf.data.dataset import Dataset
  6. from magic_pdf.dict2md.ocr_mkcontent import union_make
  7. from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
  8. draw_span_bbox)
  9. from magic_pdf.libs.json_compressor import JsonCompressor
  10. class PipeResult:
  11. def __init__(self, pipe_res, dataset: Dataset):
  12. self._pipe_res = pipe_res
  13. self._dataset = dataset
  14. def dump_md(self, writer: DataWriter, file_path: str, img_dir_or_bucket_prefix: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
  15. pdf_info_list = self._pipe_res['pdf_info']
  16. md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix)
  17. writer.write_string(file_path, md_content)
  18. def dump_content_list(self, writer: DataWriter, file_path: str, image_dir_or_bucket_prefix: str, drop_mode=DropMode.NONE):
  19. pdf_info_list = self._pipe_res['pdf_info']
  20. content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, image_dir_or_bucket_prefix)
  21. writer.write_string(file_path, json.dumps(content_list, ensure_ascii=False, indent=4))
  22. def dump_middle_json(self, writer: DataWriter, file_path: str):
  23. writer.write_string(file_path, json.dumps(self._pipe_res, ensure_ascii=False, indent=4))
  24. def draw_layout(self, file_path: str) -> None:
  25. dir_name = os.path.dirname(file_path)
  26. base_name = os.path.basename(file_path)
  27. if not os.path.exists(dir_name):
  28. os.makedirs(dir_name, exist_ok=True)
  29. pdf_info = self._pipe_res['pdf_info']
  30. draw_layout_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
  31. def draw_span(self, file_path: str):
  32. dir_name = os.path.dirname(file_path)
  33. base_name = os.path.basename(file_path)
  34. if not os.path.exists(dir_name):
  35. os.makedirs(dir_name, exist_ok=True)
  36. pdf_info = self._pipe_res['pdf_info']
  37. draw_span_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
  38. def draw_line_sort(self, file_path: str):
  39. dir_name = os.path.dirname(file_path)
  40. base_name = os.path.basename(file_path)
  41. if not os.path.exists(dir_name):
  42. os.makedirs(dir_name, exist_ok=True)
  43. pdf_info = self._pipe_res['pdf_info']
  44. draw_line_sort_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
  45. def draw_content_list(self, writer: DataWriter, file_path: str, img_dir_or_bucket_prefix: str, drop_mode=DropMode.WHOLE_PDF):
  46. pdf_info_list = self._pipe_res['pdf_info']
  47. content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_dir_or_bucket_prefix)
  48. writer.write_string(file_path, json.dumps(content_list, ensure_ascii=False, indent=4))
  49. def get_compress_pdf_mid_data(self):
  50. return JsonCompressor.compress_json(self.pdf_mid_data)