Browse Source

修正几个常量

kernel.h@qq.com 1 year ago
parent
commit
f702defe5a
2 changed files with 13 additions and 10 deletions
  1. 6 6
      magic_pdf/pipe/AbsPipe.py
  2. 7 4
      magic_pdf/user_api.py

+ 6 - 6
magic_pdf/pipe/AbsPipe.py

@@ -78,9 +78,9 @@ class AbsPipe(ABC):
                     pdf_meta["text_layout_per_page"],
                 )
                 if is_text_pdf:
-                    return "txt"
+                    return AbsPipe.PIP_TXT
                 else:
-                    return "ocr"
+                    return AbsPipe.PIP_OCR
 
     @staticmethod
     def mk_uni_format(compressed_pdf_mid_data: str, img_buket_path: str) -> list:
@@ -90,9 +90,9 @@ class AbsPipe(ABC):
         pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
         parse_type = pdf_mid_data["_parse_type"]
         pdf_info_list = pdf_mid_data["pdf_info"]
-        if parse_type == "txt":
+        if parse_type == AbsPipe.PIP_TXT:
             content_list = mk_universal_format(pdf_info_list, img_buket_path)
-        elif parse_type == "ocr":
+        elif parse_type == AbsPipe.PIP_OCR:
             content_list = make_standard_format_with_para(pdf_info_list, img_buket_path)
         return content_list
 
@@ -104,10 +104,10 @@ class AbsPipe(ABC):
         pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
         parse_type = pdf_mid_data["_parse_type"]
         pdf_info_list = pdf_mid_data["pdf_info"]
-        if parse_type == "txt":
+        if parse_type == AbsPipe.PIP_TXT:
             content_list = mk_universal_format(pdf_info_list, img_buket_path)
             md_content = mk_mm_markdown(content_list)
-        elif parse_type == "ocr":
+        elif parse_type == AbsPipe.PIP_OCR:
             md_content = ocr_mk_mm_markdown_with_para(pdf_info_list, img_buket_path)
         return md_content
 

+ 7 - 4
magic_pdf/user_api.py

@@ -19,6 +19,9 @@ from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
 from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
 
 
+PARSE_TYPE_TXT = "txt"
+PARSE_TYPE_OCR = "ocr"
+
 def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args,
                   **kwargs):
     """
@@ -32,7 +35,7 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
         debug_mode=is_debug,
     )
 
-    pdf_info_dict["parse_type"] = "txt"
+    pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT
 
     return pdf_info_dict
 
@@ -50,7 +53,7 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
         debug_mode=is_debug,
     )
 
-    pdf_info_dict["_parse_type"] = "ocr"
+    pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR
 
     return pdf_info_dict
 
@@ -82,8 +85,8 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
         if pdf_info_dict is None:
             raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
         else:
-            pdf_info_dict["_parse_type"] = "ocr"
+            pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR
     else:
-        pdf_info_dict["_parse_type"] = "txt"
+        pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT
 
     return pdf_info_dict