|
|
@@ -3,17 +3,17 @@ import json
|
|
|
import os
|
|
|
from typing import Callable
|
|
|
|
|
|
+from magic_pdf.config.constants import PARSE_TYPE_OCR, PARSE_TYPE_TXT
|
|
|
from magic_pdf.config.enums import SupportedPdfParseMethod
|
|
|
from magic_pdf.data.data_reader_writer import DataWriter
|
|
|
from magic_pdf.data.dataset import Dataset
|
|
|
-from magic_pdf.libs.version import __version__
|
|
|
from magic_pdf.filter import classify
|
|
|
from magic_pdf.libs.draw_bbox import draw_model_bbox
|
|
|
+from magic_pdf.libs.version import __version__
|
|
|
+from magic_pdf.model import InferenceResultBase
|
|
|
from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
|
|
|
from magic_pdf.pipe.operators import PipeResult
|
|
|
-from magic_pdf.model import InferenceResultBase
|
|
|
-from magic_pdf.libs.version import __version__
|
|
|
-from magic_pdf.config.constants import PARSE_TYPE_TXT, PARSE_TYPE_OCR
|
|
|
+
|
|
|
|
|
|
class InferenceResult(InferenceResultBase):
|
|
|
def __init__(self, inference_results: list, dataset: Dataset):
|
|
|
@@ -129,6 +129,10 @@ class InferenceResult(InferenceResultBase):
|
|
|
|
|
|
def proc(*args, **kwargs) -> PipeResult:
|
|
|
res = pdf_parse_union(*args, **kwargs)
|
|
|
+ res['_parse_type'] = PARSE_TYPE_TXT
|
|
|
+ res['_version_name'] = __version__
|
|
|
+ if 'lang' in kwargs and kwargs['lang'] is not None:
|
|
|
+ res['lang'] = kwargs['lang']
|
|
|
return PipeResult(res, self._dataset)
|
|
|
|
|
|
res = self.apply(
|
|
|
@@ -141,11 +145,7 @@ class InferenceResult(InferenceResultBase):
|
|
|
debug_mode=debug_mode,
|
|
|
lang=lang,
|
|
|
)
|
|
|
- res['_parse_type'] = PARSE_TYPE_TXT
|
|
|
- res['_version_name'] = __version__
|
|
|
-
|
|
|
return res
|
|
|
-
|
|
|
|
|
|
def pipe_ocr_mode(
|
|
|
self,
|
|
|
@@ -171,19 +171,20 @@ class InferenceResult(InferenceResultBase):
|
|
|
|
|
|
def proc(*args, **kwargs) -> PipeResult:
|
|
|
res = pdf_parse_union(*args, **kwargs)
|
|
|
+ res['_parse_type'] = PARSE_TYPE_OCR
|
|
|
+ res['_version_name'] = __version__
|
|
|
+ if 'lang' in kwargs and kwargs['lang'] is not None:
|
|
|
+ res['lang'] = kwargs['lang']
|
|
|
return PipeResult(res, self._dataset)
|
|
|
|
|
|
res = self.apply(
|
|
|
proc,
|
|
|
self._dataset,
|
|
|
imageWriter,
|
|
|
- SupportedPdfParseMethod.TXT,
|
|
|
+ SupportedPdfParseMethod.OCR,
|
|
|
start_page_id=start_page_id,
|
|
|
end_page_id=end_page_id,
|
|
|
debug_mode=debug_mode,
|
|
|
lang=lang,
|
|
|
)
|
|
|
- res['_parse_type'] = PARSE_TYPE_OCR
|
|
|
-
|
|
|
- res['_version_name'] = __version__
|
|
|
- return res
|
|
|
+ return res
|