فهرست منبع

fix: add parse_pdf_type and version

icecraft 11 ماه پیش
والد
کامیت
57f9f9dcf9
1 فایل تغییر یافته به همراه 14 افزوده شده و 13 حذف شده
  1. 14 13
      magic_pdf/model/operators.py

+ 14 - 13
magic_pdf/model/operators.py

@@ -3,17 +3,17 @@ import json
 import os
 from typing import Callable
 
+from magic_pdf.config.constants import PARSE_TYPE_OCR, PARSE_TYPE_TXT
 from magic_pdf.config.enums import SupportedPdfParseMethod
 from magic_pdf.data.data_reader_writer import DataWriter
 from magic_pdf.data.dataset import Dataset
-from magic_pdf.libs.version import __version__
 from magic_pdf.filter import classify
 from magic_pdf.libs.draw_bbox import draw_model_bbox
+from magic_pdf.libs.version import __version__
+from magic_pdf.model import InferenceResultBase
 from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
 from magic_pdf.pipe.operators import PipeResult
-from magic_pdf.model import InferenceResultBase
-from magic_pdf.libs.version import __version__
-from magic_pdf.config.constants import PARSE_TYPE_TXT, PARSE_TYPE_OCR
+
 
 class InferenceResult(InferenceResultBase):
     def __init__(self, inference_results: list, dataset: Dataset):
@@ -129,6 +129,10 @@ class InferenceResult(InferenceResultBase):
 
         def proc(*args, **kwargs) -> PipeResult:
             res = pdf_parse_union(*args, **kwargs)
+            res['_parse_type'] = PARSE_TYPE_TXT
+            res['_version_name'] = __version__
+            if 'lang' in kwargs and kwargs['lang'] is not None:
+                res['lang'] = kwargs['lang']
             return PipeResult(res, self._dataset)
 
         res = self.apply(
@@ -141,11 +145,7 @@ class InferenceResult(InferenceResultBase):
             debug_mode=debug_mode,
             lang=lang,
         )
-        res['_parse_type'] = PARSE_TYPE_TXT
-        res['_version_name'] = __version__
-    
         return res
-        
 
     def pipe_ocr_mode(
         self,
@@ -171,19 +171,20 @@ class InferenceResult(InferenceResultBase):
 
         def proc(*args, **kwargs) -> PipeResult:
             res = pdf_parse_union(*args, **kwargs)
+            res['_parse_type'] = PARSE_TYPE_OCR
+            res['_version_name'] = __version__
+            if 'lang' in kwargs and kwargs['lang'] is not None:
+                res['lang'] = kwargs['lang']
             return PipeResult(res, self._dataset)
 
         res = self.apply(
             proc,
             self._dataset,
             imageWriter,
-            SupportedPdfParseMethod.TXT,
+            SupportedPdfParseMethod.OCR,
             start_page_id=start_page_id,
             end_page_id=end_page_id,
             debug_mode=debug_mode,
             lang=lang,
         )
-        res['_parse_type'] = PARSE_TYPE_OCR
-
-        res['_version_name'] = __version__
-        return res
+        return res