Forráskód Böngészése

fix(UNIPipe): add language parameter support for document analysis

Introduce a new parameter `lang` to the `parse_union_pdf` function called by the `UNIPipe` class, to allow
specifying the language for document analysis. This enhancement enables users to customize the language setting for better processing of multilingual documents.
myhloli 1 éve
szülő
commit
ef39b8251c
2 módosított fájl, 7 hozzáadás és 4 törlés
  1. 2 1
      magic_pdf/pipe/UNIPipe.py
  2. 5 3
      magic_pdf/user_api.py

+ 2 - 1
magic_pdf/pipe/UNIPipe.py

@@ -39,7 +39,8 @@ class UNIPipe(AbsPipe):
         if self.pdf_type == self.PIP_TXT:
             self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
                                                 is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty,
-                                                start_page_id=self.start_page_id, end_page_id=self.end_page_id)
+                                                start_page_id=self.start_page_id, end_page_id=self.end_page_id,
+                                                lang=self.lang)
         elif self.pdf_type == self.PIP_OCR:
             self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
                                               is_debug=self.is_debug,

+ 5 - 3
magic_pdf/user_api.py

@@ -71,7 +71,7 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
 
 def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
                     input_model_is_empty: bool = False,
-                    start_page_id=0, end_page_id=None,
+                    start_page_id=0, end_page_id=None, lang=None,
                     *args, **kwargs):
     """
     ocr和文本混合的pdf,全部解析出来
@@ -95,9 +95,11 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
     if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
         logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
         if input_model_is_empty:
-            pdf_models = doc_analyze(pdf_bytes, ocr=True,
+            pdf_models = doc_analyze(pdf_bytes,
+                                     ocr=True,
                                      start_page_id=start_page_id,
-                                     end_page_id=end_page_id)
+                                     end_page_id=end_page_id,
+                                     lang=lang)
         pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
         if pdf_info_dict is None:
             raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")