Browse Source

fix(pdf-extract): ensure table recognition config defaults to disabled

If 'table-config' is not present in the configuration file, the table recognition
feature will default to being disabled to ensure consistent behavior. This change
adds a warning log and sets a default configuration for table recognition when the
expected config is missing.
myhloli 1 year ago
parent
commit
52156eaee9
2 changed files with 9 additions and 4 deletions
  1. +6 -1
      magic_pdf/libs/config_reader.py
  2. +3 -3
      magic_pdf/model/pdf_extract_kit.py

+ 6 - 1
magic_pdf/libs/config_reader.py

@@ -76,10 +76,15 @@ def get_device():
     else:
         return device
 
+
 def get_table_recog_config():
     config = read_config()
     table_config = config.get("table-config")
-    return table_config
+    if table_config is None:
+        logger.warning(f"'table-config' not found in {CONFIG_FILE_NAME}, use 'False' as default")
+        return json.loads('{"is_table_recog_enable": false, "max_time": 400}')
+    else:
+        return table_config
 
 
 if __name__ == "__main__":

+ 3 - 3
magic_pdf/model/pdf_extract_kit.py

@@ -1,7 +1,7 @@
 from loguru import logger
 import os
 import time
-from pypandoc import convert_text
+
 
 os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # 禁止albumentations检查更新
 try:
@@ -107,8 +107,8 @@ class CustomPEKModel:
         self.apply_table = self.table_config.get("is_table_recog_enable", False)
         self.apply_ocr = ocr
         logger.info(
-            "DocAnalysis init, this may take some times. apply_layout: {}, apply_formula: {}, apply_ocr: {}".format(
-                self.apply_layout, self.apply_formula, self.apply_ocr
+            "DocAnalysis init, this may take some times. apply_layout: {}, apply_formula: {}, apply_ocr: {}, apply_table: {}".format(
+                self.apply_layout, self.apply_formula, self.apply_ocr, self.apply_table
             )
         )
         assert self.apply_layout, "DocAnalysis must contain layout model."