| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191 |
- import copy
- import json
- import os
- from typing import Callable
- from magic_pdf.config.make_content_config import DropMode, MakeMode
- from magic_pdf.data.data_reader_writer import DataWriter
- from magic_pdf.data.dataset import Dataset
- from magic_pdf.dict2md.ocr_mkcontent import union_make
- from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
- draw_span_bbox)
- from magic_pdf.libs.json_compressor import JsonCompressor
class PipeResult:
    """Wrap a pipeline result together with the dataset it was computed from.

    The wrapped result is read through its ``'pdf_info'`` entry to build
    markdown / content-list output and to render debug drawings, and is
    serialized as a whole for the "middle json" and compressed outputs.
    """

    def __init__(self, pipe_res, dataset: Dataset):
        """Store the pipeline result and its dataset.

        Args:
            pipe_res (dict): the pipeline processed result of the model inference
                result; the methods of this class read its ``'pdf_info'`` entry
            dataset (Dataset): the dataset associated with pipe_res
        """
        self._pipe_res = pipe_res
        self._dataset = dataset

    @staticmethod
    def _prepare_output_location(file_path: str) -> tuple:
        """Split ``file_path`` into (directory, file name) and create the directory.

        Args:
            file_path (str): The location of the file about to be written

        Returns:
            tuple: ``(dir_name, base_name)``; ``dir_name`` is never empty
        """
        # A bare file name has an empty dirname, and os.makedirs('') raises
        # FileNotFoundError; treat that case as "the current directory".
        dir_name = os.path.dirname(file_path) or os.curdir
        base_name = os.path.basename(file_path)
        # exist_ok=True already tolerates an existing directory, so no separate
        # (race-prone) existence check is needed.
        os.makedirs(dir_name, exist_ok=True)
        return dir_name, base_name

    def get_markdown(
        self,
        img_dir_or_bucket_prefix: str,
        drop_mode=DropMode.NONE,
        md_make_mode=MakeMode.MM_MD,
    ) -> str:
        """Get markdown content.

        Args:
            img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
            drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
            md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.

        Returns:
            str: return markdown content
        """
        pdf_info_list = self._pipe_res['pdf_info']
        return union_make(
            pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix
        )

    def dump_md(
        self,
        writer: DataWriter,
        file_path: str,
        img_dir_or_bucket_prefix: str,
        drop_mode=DropMode.NONE,
        md_make_mode=MakeMode.MM_MD,
    ) -> None:
        """Dump The Markdown.

        Args:
            writer (DataWriter): File writer handle
            file_path (str): The file location of markdown
            img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
            drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
            md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.
        """
        md_content = self.get_markdown(
            img_dir_or_bucket_prefix, drop_mode=drop_mode, md_make_mode=md_make_mode
        )
        writer.write_string(file_path, md_content)

    def get_content_list(
        self,
        image_dir_or_bucket_prefix: str,
        drop_mode=DropMode.NONE,
    ) -> list:
        """Get Content List.

        Args:
            image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
            drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.

        Returns:
            list: the content list built by ``union_make`` in
                ``MakeMode.STANDARD_FORMAT``. It is a JSON-serializable
                structure, not a string; ``dump_content_list`` is what
                serializes it with ``json.dumps``.
        """
        pdf_info_list = self._pipe_res['pdf_info']
        return union_make(
            pdf_info_list,
            MakeMode.STANDARD_FORMAT,
            drop_mode,
            image_dir_or_bucket_prefix,
        )

    def dump_content_list(
        self,
        writer: DataWriter,
        file_path: str,
        image_dir_or_bucket_prefix: str,
        drop_mode=DropMode.NONE,
    ) -> None:
        """Dump Content List.

        Args:
            writer (DataWriter): File writer handle
            file_path (str): The file location of content list
            image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
            drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
        """
        content_list = self.get_content_list(
            image_dir_or_bucket_prefix, drop_mode=drop_mode,
        )
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        writer.write_string(
            file_path, json.dumps(content_list, ensure_ascii=False, indent=4)
        )

    def get_middle_json(self) -> str:
        """Get middle json.

        Returns:
            str: The whole pipeline result serialized as indented JSON
        """
        return json.dumps(self._pipe_res, ensure_ascii=False, indent=4)

    def dump_middle_json(self, writer: DataWriter, file_path: str) -> None:
        """Dump the result of pipeline.

        Args:
            writer (DataWriter): File writer handler
            file_path (str): The file location of middle json
        """
        writer.write_string(file_path, self.get_middle_json())

    def draw_layout(self, file_path: str) -> None:
        """Draw the layout.

        The target directory is created when missing; a bare file name is
        written relative to the current directory.

        Args:
            file_path (str): The file location of layout result file
        """
        dir_name, base_name = self._prepare_output_location(file_path)
        pdf_info = self._pipe_res['pdf_info']
        draw_layout_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)

    def draw_span(self, file_path: str) -> None:
        """Draw the Span.

        The target directory is created when missing; a bare file name is
        written relative to the current directory.

        Args:
            file_path (str): The file location of span result file
        """
        dir_name, base_name = self._prepare_output_location(file_path)
        pdf_info = self._pipe_res['pdf_info']
        draw_span_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)

    def draw_line_sort(self, file_path: str) -> None:
        """Draw line sort.

        The target directory is created when missing; a bare file name is
        written relative to the current directory.

        Args:
            file_path (str): The file location of line sort result file
        """
        dir_name, base_name = self._prepare_output_location(file_path)
        pdf_info = self._pipe_res['pdf_info']
        draw_line_sort_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)

    def get_compress_pdf_mid_data(self):
        """Compress the pipeline result.

        Returns:
            The value returned by ``JsonCompressor.compress_json`` for the
            pipeline result (presumably a compressed string; confirm against
            JsonCompressor).
        """
        return JsonCompressor.compress_json(self._pipe_res)

    def apply(self, proc: Callable, *args, **kwargs):
        """Apply a callable to a copy of the pipeline result.

        ``proc`` receives a deep copy, so whatever it mutates does not affect
        the result held by this object.

        Args:
            proc (Callable): invoke proc as follows:
                proc(pipeline_result, *args, **kwargs)

        Returns:
            Any: return the result generated by proc
        """
        return proc(copy.deepcopy(self._pipe_res), *args, **kwargs)
|