Browse Source

fix: enhance handling of discarded blocks in content generation

myhloli 3 tuần trước
mục cha
commit
1037fd56bc

+ 1 - 1
mineru/backend/pipeline/model_json_to_middle_json.py

@@ -148,7 +148,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
     fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans)
 
     """如果当前页面没有有效的bbox则跳过"""
-    if len(all_bboxes) == 0:
+    if len(all_bboxes) == 0 and len(fix_discarded_blocks) == 0:
         return None
 
     """对image/table/interline_equation截图"""

+ 16 - 4
mineru/backend/pipeline/pipeline_middle_json_mkcontent.py

@@ -191,11 +191,20 @@ def merge_para_with_text(para_block):
 def make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size):
     para_type = para_block['type']
     para_content = {}
-    if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX]:
+    if para_type in [
+        BlockType.TEXT,
+        BlockType.LIST,
+        BlockType.INDEX,
+    ]:
         para_content = {
             'type': ContentType.TEXT,
             'text': merge_para_with_text(para_block),
         }
+    elif para_type == BlockType.DISCARDED:
+        para_content = {
+            'type': para_type,
+            'text': merge_para_with_text(para_block),
+        }
     elif para_type == BlockType.TITLE:
         para_content = {
             'type': ContentType.TEXT,
@@ -268,15 +277,18 @@ def union_make(pdf_info_dict: list,
     output_content = []
     for page_info in pdf_info_dict:
         paras_of_layout = page_info.get('para_blocks')
+        paras_of_discarded = page_info.get('discarded_blocks')
         page_idx = page_info.get('page_idx')
         page_size = page_info.get('page_size')
-        if not paras_of_layout:
-            continue
         if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
+            if not paras_of_layout:
+                continue
             page_markdown = make_blocks_to_markdown(paras_of_layout, make_mode, img_buket_path)
             output_content.extend(page_markdown)
         elif make_mode == MakeMode.CONTENT_LIST:
-            for para_block in paras_of_layout:
+            if not paras_of_layout + paras_of_discarded:
+                continue
+            for para_block in paras_of_layout + paras_of_discarded:
                 para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
                 if para_content:
                     output_content.append(para_content)

+ 5 - 3
mineru/backend/vlm/vlm_middle_json_mkcontent.py

@@ -248,13 +248,15 @@ def union_make(pdf_info_dict: list,
         paras_of_discarded = page_info.get('discarded_blocks')
         page_idx = page_info.get('page_idx')
         page_size = page_info.get('page_size')
-        if not paras_of_layout:
-            continue
         if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
+            if not paras_of_layout:
+                continue
             page_markdown = mk_blocks_to_markdown(paras_of_layout, make_mode, formula_enable, table_enable, img_buket_path)
             output_content.extend(page_markdown)
         elif make_mode == MakeMode.CONTENT_LIST:
-            for para_block in paras_of_layout+paras_of_discarded:
+            if not paras_of_layout + paras_of_discarded:
+                continue
+            for para_block in paras_of_layout + paras_of_discarded:
                 para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
                 output_content.append(para_content)