| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162 |
- import json
- import os
- from magic_pdf.config.make_content_config import DropMode, MakeMode
- from magic_pdf.data.data_reader_writer import DataWriter
- from magic_pdf.data.dataset import Dataset
- from magic_pdf.dict2md.ocr_mkcontent import union_make
- from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
- draw_span_bbox)
- from magic_pdf.libs.json_compressor import JsonCompressor
class PipeResult:
    """Wrap a pipeline result together with the dataset it was produced from.

    Every dump/draw method reads the ``'pdf_info'`` entry of the stored
    pipeline result, so the result must support ``pipe_res['pdf_info']``.
    """

    def __init__(self, pipe_res, dataset: Dataset):
        """Store the pipeline result and its source dataset.

        Args:
            pipe_res: pipeline result; indexed with ``'pdf_info'`` by the
                dump/draw methods and serialized as a whole by
                ``dump_middle_json``.
            dataset (Dataset): dataset whose ``data_bits()`` output is handed
                to the drawing helpers.
        """
        self._pipe_res = pipe_res
        self._dataset = dataset

    @staticmethod
    def _split_output_path(file_path: str):
        """Split ``file_path`` into ``(directory, file name)`` and ensure the directory exists.

        A bare file name such as ``'layout.pdf'`` has an empty directory
        component; ``os.makedirs('')`` raises ``FileNotFoundError``, so the
        current directory is used instead.
        """
        dir_name, base_name = os.path.split(file_path)
        if not dir_name:
            dir_name = os.curdir
        # exist_ok=True already tolerates an existing directory, so no
        # separate (racy) os.path.exists() pre-check is needed.
        os.makedirs(dir_name, exist_ok=True)
        return dir_name, base_name

    def dump_md(
        self,
        writer: DataWriter,
        file_path: str,
        img_dir_or_bucket_prefix: str,
        drop_mode=DropMode.WHOLE_PDF,
        md_make_mode=MakeMode.MM_MD,
    ):
        """Build content with ``union_make`` and write it through ``writer``.

        Args:
            writer (DataWriter): receives the generated content via ``write_string``.
            file_path (str): destination passed to ``writer.write_string``.
            img_dir_or_bucket_prefix (str): image location prefix forwarded to ``union_make``.
            drop_mode: drop mode forwarded to ``union_make``. Defaults to ``DropMode.WHOLE_PDF``.
            md_make_mode: make mode forwarded to ``union_make``. Defaults to ``MakeMode.MM_MD``.
        """
        md_content = union_make(
            self._pipe_res['pdf_info'],
            md_make_mode,
            drop_mode,
            img_dir_or_bucket_prefix,
        )
        writer.write_string(file_path, md_content)

    def dump_content_list(
        self,
        writer: DataWriter,
        file_path: str,
        image_dir_or_bucket_prefix: str,
        drop_mode=DropMode.NONE,
    ):
        """Build the ``MakeMode.STANDARD_FORMAT`` content list and write it as JSON.

        Args:
            writer (DataWriter): receives the JSON text via ``write_string``.
            file_path (str): destination passed to ``writer.write_string``.
            image_dir_or_bucket_prefix (str): image location prefix forwarded to ``union_make``.
            drop_mode: drop mode forwarded to ``union_make``. Defaults to ``DropMode.NONE``.
        """
        content_list = union_make(
            self._pipe_res['pdf_info'],
            MakeMode.STANDARD_FORMAT,
            drop_mode,
            image_dir_or_bucket_prefix,
        )
        writer.write_string(
            file_path, json.dumps(content_list, ensure_ascii=False, indent=4)
        )

    def dump_middle_json(self, writer: DataWriter, file_path: str):
        """Serialize the whole pipeline result as indented JSON and write it.

        Args:
            writer (DataWriter): receives the JSON text via ``write_string``.
            file_path (str): destination passed to ``writer.write_string``.
        """
        writer.write_string(
            file_path, json.dumps(self._pipe_res, ensure_ascii=False, indent=4)
        )

    def draw_layout(self, file_path: str) -> None:
        """Render layout boxes with ``draw_layout_bbox``.

        Args:
            file_path (str): output path; its directory is created if missing
                and its file name is passed to the drawing helper.
        """
        dir_name, base_name = self._split_output_path(file_path)
        draw_layout_bbox(
            self._pipe_res['pdf_info'], self._dataset.data_bits(), dir_name, base_name
        )

    def draw_span(self, file_path: str):
        """Render span boxes with ``draw_span_bbox``.

        Args:
            file_path (str): output path; its directory is created if missing
                and its file name is passed to the drawing helper.
        """
        dir_name, base_name = self._split_output_path(file_path)
        draw_span_bbox(
            self._pipe_res['pdf_info'], self._dataset.data_bits(), dir_name, base_name
        )

    def draw_line_sort(self, file_path: str):
        """Render line-sort boxes with ``draw_line_sort_bbox``.

        Args:
            file_path (str): output path; its directory is created if missing
                and its file name is passed to the drawing helper.
        """
        dir_name, base_name = self._split_output_path(file_path)
        draw_line_sort_bbox(
            self._pipe_res['pdf_info'], self._dataset.data_bits(), dir_name, base_name
        )

    def draw_content_list(
        self,
        writer: DataWriter,
        file_path: str,
        img_dir_or_bucket_prefix: str,
        drop_mode=DropMode.WHOLE_PDF,
    ):
        """Write the ``MakeMode.STANDARD_FORMAT`` content list as JSON.

        Performs the same work as ``dump_content_list``; the only difference
        is the default ``drop_mode`` (``DropMode.WHOLE_PDF`` here).

        Args:
            writer (DataWriter): receives the JSON text via ``write_string``.
            file_path (str): destination passed to ``writer.write_string``.
            img_dir_or_bucket_prefix (str): image location prefix forwarded to ``union_make``.
            drop_mode: drop mode forwarded to ``union_make``. Defaults to ``DropMode.WHOLE_PDF``.
        """
        self.dump_content_list(writer, file_path, img_dir_or_bucket_prefix, drop_mode)

    def get_compress_pdf_mid_data(self):
        """Compress the stored pipeline result with ``JsonCompressor.compress_json``.

        Returns:
            The value returned by ``JsonCompressor.compress_json``
            (presumably a compressed string -- TODO confirm against JsonCompressor).
        """
        # The result is stored as ``_pipe_res`` in __init__; the previously
        # referenced ``pdf_mid_data`` attribute is never set on this class.
        return JsonCompressor.compress_json(self._pipe_res)
|