Explorar el Código

fix: refactor table merging logic and add cross-page table merge utility

myhloli hace 3 semanas
padre
commit
90ed311198

+ 7 - 3
mineru/backend/pipeline/model_init.py

@@ -17,10 +17,14 @@ from ...model.table.rec.unet_table.main import UnetTableModel
 from ...utils.enum_class import ModelPath
 from ...utils.models_download_utils import auto_download_and_get_model_root_path
 
-MFR_MODEL = os.getenv('MINERU_MFR_MODEL', None)
-if MFR_MODEL is None:
+MFR_MODEL = os.getenv('MINERU_FORMULA_CH_SUPPORT', 'False')
+if MFR_MODEL.lower() in ['true', '1', 'yes']:
+    MFR_MODEL = "pp_formulanet_plus_m"
+elif MFR_MODEL.lower() in ['false', '0', 'no']:
+    MFR_MODEL = "unimernet_small"
+else:
+    logger.warning(f"Invalid MINERU_FORMULA_CH_SUPPORT value: {MFR_MODEL}, set to default 'False'")
     MFR_MODEL = "unimernet_small"
-    # MFR_MODEL = "pp_formulanet_plus_m"
 
 
 def img_orientation_cls_model_init():

+ 2 - 9
mineru/backend/pipeline/model_json_to_middle_json.py

@@ -5,6 +5,7 @@ import time
 from loguru import logger
 from tqdm import tqdm
 
+from mineru.backend.utils import cross_page_table_merge
 from mineru.utils.config_reader import get_device, get_llm_aided_config, get_formula_enable
 from mineru.backend.pipeline.model_init import AtomModelSingleton
 from mineru.backend.pipeline.para_split import para_split
@@ -20,7 +21,6 @@ from mineru.utils.ocr_utils import OcrConfidence
 from mineru.utils.span_block_fix import fill_spans_in_blocks, fix_discarded_block, fix_block_spans
 from mineru.utils.span_pre_proc import remove_outside_spans, remove_overlaps_low_confidence_spans, \
     remove_overlaps_min_spans, txt_spans_extract
-from mineru.utils.table_merge import merge_table
 from mineru.version import __version__
 from mineru.utils.hash_utils import bytes_md5
 
@@ -231,14 +231,7 @@ def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=N
     para_split(middle_json["pdf_info"])
 
     """表格跨页合并"""
-    is_merge_table = os.getenv('MINERU_MERGE_TABLE', 'true')
-    if is_merge_table.lower() == 'true':
-        merge_table(middle_json["pdf_info"])
-    elif is_merge_table.lower() == 'false':
-        pass
-    else:
-        logger.warning(f'unknown MINERU_MERGE_TABLE config: {is_merge_table}, pass')
-        pass
+    cross_page_table_merge(middle_json["pdf_info"])
 
     """llm优化"""
     llm_aided_config = get_llm_aided_config()

+ 24 - 0
mineru/backend/utils.py

@@ -0,0 +1,24 @@
+import os
+
+from loguru import logger
+
+from mineru.utils.table_merge import merge_table
+
+
+def cross_page_table_merge(pdf_info: list[dict]):
+    """Run cross-page table merging on ``pdf_info`` unless the environment disables it.
+
+    Reads ``MINERU_MERGE_TABLE`` (default 'true', compared case-insensitively): 'true'/'1'/'yes' calls
+    ``merge_table(pdf_info)``; 'false'/'0'/'no' skips it; any other value logs a warning and skips it.
+
+    Args:
+        pdf_info (list[dict]): One dict of page information per PDF page; handed to ``merge_table`` as-is.
+    """
+    is_merge_table = os.getenv('MINERU_MERGE_TABLE', 'true')  # unset variable means merging is enabled
+    if is_merge_table.lower() in ['true', '1', 'yes']:
+        merge_table(pdf_info)
+    elif is_merge_table.lower() in ['false', '0', 'no']:
+        pass  # merging explicitly switched off
+    else:
+        logger.warning(f'unknown MINERU_MERGE_TABLE config: {is_merge_table}, pass')
+        pass  # unrecognized value: warn (above), then skip merging

+ 2 - 9
mineru/backend/vlm/model_output_to_middle_json.py

@@ -5,13 +5,13 @@ import cv2
 import numpy as np
 from loguru import logger
 
+from mineru.backend.utils import cross_page_table_merge
 from mineru.backend.vlm.vlm_magic_model import MagicModel
 from mineru.utils.config_reader import get_table_enable, get_llm_aided_config
 from mineru.utils.cut_image import cut_image_and_table
 from mineru.utils.enum_class import ContentType
 from mineru.utils.hash_utils import bytes_md5
 from mineru.utils.pdf_image_tools import get_crop_img
-from mineru.utils.table_merge import merge_table
 from mineru.version import __version__
 
 
@@ -110,14 +110,7 @@ def result_to_middle_json(model_output_blocks_list, images_list, pdf_doc, image_
     """表格跨页合并"""
     table_enable = get_table_enable(os.getenv('MINERU_VLM_TABLE_ENABLE', 'True').lower() == 'true')
     if table_enable:
-        is_merge_table = os.getenv('MINERU_MERGE_TABLE', 'true')
-        if is_merge_table.lower() == 'true':
-            merge_table(middle_json["pdf_info"])
-        elif is_merge_table.lower() == 'false':
-            pass
-        else:
-            logger.warning(f'unknown MINERU_MERGE_TABLE config: {is_merge_table}, pass')
-            pass
+        cross_page_table_merge(middle_json["pdf_info"])
 
     """llm优化标题分级"""
     if heading_level_import_success: