فهرست منبع

add table recognition and conversion to LaTeX

liukaiwen 1 سال پیش
والد
کامیت
b9667fd3e3
2 فایلهای تغییر یافته به همراه 2 افزوده شده و 2 حذف شده
  1. 0 1
      magic_pdf/dict2md/ocr_mkcontent.py
  2. 2 1
      magic_pdf/model/pdf_extract_kit.py

+ 0 - 1
magic_pdf/dict2md/ocr_mkcontent.py

@@ -124,7 +124,6 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
                 for block in para_block['blocks']:  # 1st.拼table_caption
                     if block['type'] == BlockType.TableCaption:
                         table_caption = merge_para_with_text(block)
-                        para_text += table_caption
                 for block in para_block['blocks']:  # 2nd.拼table_body
                     if block['type'] == BlockType.TableBody:
                         for line in block['lines']:

+ 2 - 1
magic_pdf/model/pdf_extract_kit.py

@@ -104,6 +104,7 @@ class CustomPEKModel:
         self.apply_layout = kwargs.get("apply_layout", self.configs["config"]["layout"])
         self.apply_formula = kwargs.get("apply_formula", self.configs["config"]["formula"])
         self.table_config = kwargs.get("table_config", self.configs["config"]["table_config"])
+        self.apply_table = self.table_config.get("is_table_recog_enable", False)
         self.apply_ocr = ocr
         logger.info(
             "DocAnalysis init, this may take some times. apply_layout: {}, apply_formula: {}, apply_ocr: {}".format(
@@ -139,7 +140,7 @@ class CustomPEKModel:
             self.ocr_model = ModifiedPaddleOCR(show_log=show_log)
 
         # init structeqtable
-        if self.table_config.get("is_table_recog_enable", False):
+        if self.apply_table:
             max_time = self.table_config.get("max_time", 400)
             self.table_model = table_model_init(str(os.path.join(models_dir, self.configs["weights"]["table"])),
                                                 max_time=max_time, _device_=self.device)