Răsfoiți Sursa

feat: enhance make_blocks_to_content_list to include page size and bbox calculations

myhloli 2 luni în urmă
părinte
comite
9003f50a22

+ 14 - 2
mineru/backend/pipeline/pipeline_middle_json_mkcontent.py

@@ -188,7 +188,7 @@ def merge_para_with_text(para_block):
     return para_text
 
 
-def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
+def make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size):
     para_type = para_block['type']
     para_content = {}
     if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX]:
@@ -245,6 +245,17 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
             if block['type'] == BlockType.TABLE_FOOTNOTE:
                 para_content[BlockType.TABLE_FOOTNOTE].append(merge_para_with_text(block))
 
+    page_weight, page_height = page_size
+    para_bbox = para_block.get('bbox')
+    if para_bbox:
+        x0, y0, x1, y1 = para_bbox
+        para_content['bbox'] = [
+            int(x0 * 1000 / page_weight),
+            int(y0 * 1000 / page_height),
+            int(x1 * 1000 / page_weight),
+            int(y1 * 1000 / page_height),
+        ]
+
     para_content['page_idx'] = page_idx
 
     return para_content
@@ -258,6 +269,7 @@ def union_make(pdf_info_dict: list,
     for page_info in pdf_info_dict:
         paras_of_layout = page_info.get('para_blocks')
         page_idx = page_info.get('page_idx')
+        page_size = page_info.get('page_size')
         if not paras_of_layout:
             continue
         if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
@@ -265,7 +277,7 @@ def union_make(pdf_info_dict: list,
             output_content.extend(page_markdown)
         elif make_mode == MakeMode.CONTENT_LIST:
             for para_block in paras_of_layout:
-                para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx)
+                para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
                 if para_content:
                     output_content.append(para_content)
 

+ 5 - 2
mineru/backend/vlm/token_to_middle_json.py

@@ -1,8 +1,9 @@
+import os
 import time
 from loguru import logger
 import numpy as np
 import cv2
-from mineru.utils.config_reader import get_llm_aided_config
+from mineru.utils.config_reader import get_llm_aided_config, get_table_enable
 from mineru.utils.cut_image import cut_image_and_table
 from mineru.utils.enum_class import ContentType
 from mineru.utils.hash_utils import str_md5
@@ -94,7 +95,9 @@ def result_to_middle_json(token_list, images_list, pdf_doc, image_writer):
         middle_json["pdf_info"].append(page_info)
 
     """表格跨页合并"""
-    merge_table(middle_json["pdf_info"])
+    table_enable = get_table_enable(os.getenv('MINERU_VLM_TABLE_ENABLE', 'True').lower() == 'true')
+    if table_enable:
+        merge_table(middle_json["pdf_info"])
 
     """llm优化标题分级"""
     if heading_level_import_success:

+ 14 - 2
mineru/backend/vlm/vlm_middle_json_mkcontent.py

@@ -125,7 +125,7 @@ def mk_blocks_to_markdown(para_blocks, make_mode, formula_enable, table_enable,
 
 
 
-def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
+def make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size):
     para_type = para_block['type']
     para_content = {}
     if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX]:
@@ -179,6 +179,17 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
             if block['type'] == BlockType.TABLE_FOOTNOTE:
                 para_content[BlockType.TABLE_FOOTNOTE].append(merge_para_with_text(block))
 
+    page_weight, page_height = page_size
+    para_bbox = para_block.get('bbox')
+    if para_bbox:
+        x0, y0, x1, y1 = para_bbox
+        para_content['bbox'] = [
+            int(x0 * 1000 / page_weight),
+            int(y0 * 1000 / page_height),
+            int(x1 * 1000 / page_weight),
+            int(y1 * 1000 / page_height),
+        ]
+
     para_content['page_idx'] = page_idx
 
     return para_content
@@ -195,6 +206,7 @@ def union_make(pdf_info_dict: list,
     for page_info in pdf_info_dict:
         paras_of_layout = page_info.get('para_blocks')
         page_idx = page_info.get('page_idx')
+        page_size = page_info.get('page_size')
         if not paras_of_layout:
             continue
         if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
@@ -202,7 +214,7 @@ def union_make(pdf_info_dict: list,
             output_content.extend(page_markdown)
         elif make_mode == MakeMode.CONTENT_LIST:
             for para_block in paras_of_layout:
-                para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx)
+                para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
                 output_content.append(para_content)
 
     if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]: