| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122 |
- import copy
- import json
- import os
- from typing import Callable
- from magic_pdf.config.enums import SupportedPdfParseMethod
- from magic_pdf.data.data_reader_writer import DataWriter
- from magic_pdf.data.dataset import Dataset
- from magic_pdf.filter import classify
- from magic_pdf.libs.draw_bbox import draw_model_bbox
- from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
- from magic_pdf.pipe.types import PipeResult
class InferenceResult:
    """Hold raw model inference output together with its source dataset.

    The instance keeps the list of inference results and the dataset and
    offers helpers to visualise or dump the results and to run the PDF
    parsing pipeline on them in auto, text, or OCR mode.
    """

    def __init__(self, inference_results: list, dataset: Dataset):
        """Store the inference results and the dataset they belong to.

        Args:
            inference_results: Model inference output; stored by reference.
            dataset: Dataset the inference results were produced from.
        """
        self._infer_res = inference_results
        self._dataset = dataset

    def draw_model(self, file_path: str) -> None:
        """Draw the model bounding boxes and save them to ``file_path``.

        The target directory is created when it does not exist yet. A bare
        file name (no directory component) is written relative to the
        current working directory.

        Args:
            file_path: Destination path of the rendered output file.
        """
        # ``os.path.dirname`` returns '' for a bare file name, and
        # ``os.makedirs('')`` raises FileNotFoundError; fall back to '.'.
        dir_name = os.path.dirname(file_path) or "."
        base_name = os.path.basename(file_path)
        if not os.path.exists(dir_name):
            os.makedirs(dir_name, exist_ok=True)
        # Hand over a deep copy so the stored results cannot be modified
        # through the drawing helper.
        draw_model_bbox(
            copy.deepcopy(self._infer_res), self._dataset, dir_name, base_name
        )

    def dump_model(self, writer: DataWriter, file_path: str):
        """Serialise the inference results as JSON and write them out.

        Args:
            writer: Writer whose ``write_string`` method receives the JSON.
            file_path: Path passed to ``writer.write_string``.
        """
        writer.write_string(
            file_path, json.dumps(self._infer_res, ensure_ascii=False, indent=4)
        )

    def get_infer_res(self):
        """Return the stored inference results (the same object, not a copy)."""
        return self._infer_res

    def apply(self, proc: Callable, *args, **kwargs):
        """Call ``proc`` on a deep copy of the inference results.

        Args:
            proc: Callable invoked as ``proc(results_copy, *args, **kwargs)``.
            *args: Extra positional arguments forwarded to ``proc``.
            **kwargs: Extra keyword arguments forwarded to ``proc``.

        Returns:
            Whatever ``proc`` returns.
        """
        return proc(copy.deepcopy(self._infer_res), *args, **kwargs)

    def _run_pipe(
        self,
        parse_method,
        imageWriter: DataWriter,
        start_page_id,
        end_page_id,
        debug_mode,
        lang,
    ) -> PipeResult:
        """Run ``pdf_parse_union`` with ``parse_method`` and wrap the result."""

        def proc(*args, **kwargs) -> PipeResult:
            res = pdf_parse_union(*args, **kwargs)
            return PipeResult(res, self._dataset)

        # Go through ``apply`` so the parser works on a deep copy of the
        # inference results, and forward the caller's options unchanged.
        return self.apply(
            proc,
            self._dataset,
            imageWriter,
            parse_method,
            start_page_id=start_page_id,
            end_page_id=end_page_id,
            debug_mode=debug_mode,
            lang=lang,
        )

    def pipe_auto_mode(
        self,
        imageWriter: DataWriter,
        start_page_id=0,
        end_page_id=None,
        debug_mode=False,
        lang=None,
    ) -> PipeResult:
        """Classify the dataset and run the matching parsing pipeline.

        The dataset bytes are passed to ``classify``; a ``TXT`` verdict runs
        the text pipeline, anything else runs the OCR pipeline.

        Args:
            imageWriter: Writer forwarded to ``pdf_parse_union``.
            start_page_id: Forwarded to ``pdf_parse_union``.
            end_page_id: Forwarded to ``pdf_parse_union``.
            debug_mode: Forwarded to ``pdf_parse_union``.
            lang: Forwarded to ``pdf_parse_union``.

        Returns:
            The parse output wrapped in a ``PipeResult``.
        """
        if classify(self._dataset.data_bits()) == SupportedPdfParseMethod.TXT:
            parse_method = SupportedPdfParseMethod.TXT
        else:
            parse_method = SupportedPdfParseMethod.OCR
        return self._run_pipe(
            parse_method, imageWriter, start_page_id, end_page_id, debug_mode, lang
        )

    def pipe_txt_mode(
        self,
        imageWriter: DataWriter,
        start_page_id=0,
        end_page_id=None,
        debug_mode=False,
        lang=None,
    ) -> PipeResult:
        """Run the parsing pipeline with ``SupportedPdfParseMethod.TXT``.

        Args:
            imageWriter: Writer forwarded to ``pdf_parse_union``.
            start_page_id: Forwarded to ``pdf_parse_union``.
            end_page_id: Forwarded to ``pdf_parse_union``.
            debug_mode: Forwarded to ``pdf_parse_union``.
            lang: Forwarded to ``pdf_parse_union``.

        Returns:
            The parse output wrapped in a ``PipeResult``.
        """
        return self._run_pipe(
            SupportedPdfParseMethod.TXT,
            imageWriter,
            start_page_id,
            end_page_id,
            debug_mode,
            lang,
        )

    def pipe_ocr_mode(
        self,
        imageWriter: DataWriter,
        start_page_id=0,
        end_page_id=None,
        debug_mode=False,
        lang=None,
    ) -> PipeResult:
        """Run the parsing pipeline with ``SupportedPdfParseMethod.OCR``.

        Args:
            imageWriter: Writer forwarded to ``pdf_parse_union``.
            start_page_id: Forwarded to ``pdf_parse_union``.
            end_page_id: Forwarded to ``pdf_parse_union``.
            debug_mode: Forwarded to ``pdf_parse_union``.
            lang: Forwarded to ``pdf_parse_union``.

        Returns:
            The parse output wrapped in a ``PipeResult``.
        """
        return self._run_pipe(
            SupportedPdfParseMethod.OCR,
            imageWriter,
            start_page_id,
            end_page_id,
            debug_mode,
            lang,
        )
|