Sfoglia il codice sorgente

Merge pull request #3737 from myhloli/dev

Refactor block processing to handle non-contiguous indices in captions and footnotes
Xiaomeng Zhao 1 mese fa
parent
commit
17a9921ba9
1 file modificato con 42 aggiunte e 1 eliminazione
  1. 42 1
      mineru/backend/vlm/vlm_magic_model.py

+ 42 - 1
mineru/backend/vlm/vlm_magic_model.py

@@ -483,6 +483,44 @@ def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table", "code"]):
                 # 没找到合适的body,作为普通block处理
                 not_include_blocks.append(footnote)
 
+        # 第四步:将每个block的caption_list和footnote_list中不连续index的元素提出来作为普通block处理
+        for block in need_fix_blocks:
+            caption_list = block[f"{fix_type}_caption_list"]
+            footnote_list = block[f"{fix_type}_footnote_list"]
+            body_index = block[f"{fix_type}_body"]["index"]
+
+            # 处理caption_list (从body往前看,caption在body之前)
+            if caption_list:
+                # 按index降序排列,从最接近body的开始检查
+                caption_list.sort(key=lambda x: x["index"], reverse=True)
+                filtered_captions = [caption_list[0]]
+                for i in range(1, len(caption_list)):
+                    # 检查是否与前一个caption连续(降序所以是-1)
+                    if caption_list[i]["index"] == caption_list[i - 1]["index"] - 1:
+                        filtered_captions.append(caption_list[i])
+                    else:
+                        # 出现gap,后续所有caption都作为普通block
+                        not_include_blocks.extend(caption_list[i:])
+                        break
+                # 恢复升序
+                filtered_captions.reverse()
+                block[f"{fix_type}_caption_list"] = filtered_captions
+
+            # 处理footnote_list (从body往后看,footnote在body之后)
+            if footnote_list:
+                # 按index升序排列,从最接近body的开始检查
+                footnote_list.sort(key=lambda x: x["index"])
+                filtered_footnotes = [footnote_list[0]]
+                for i in range(1, len(footnote_list)):
+                    # 检查是否与前一个footnote连续
+                    if footnote_list[i]["index"] == footnote_list[i - 1]["index"] + 1:
+                        filtered_footnotes.append(footnote_list[i])
+                    else:
+                        # 出现gap,后续所有footnote都作为普通block
+                        not_include_blocks.extend(footnote_list[i:])
+                        break
+                block[f"{fix_type}_footnote_list"] = filtered_footnotes
+
     # 构建两层结构blocks
     for block in need_fix_blocks:
         body = block[f"{fix_type}_body"]
@@ -506,12 +544,15 @@ def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table", "code"]):
             "index": body["index"],
         }
         two_layer_block["blocks"].extend([*caption_list, *footnote_list])
+        # 对blocks按index排序
+        two_layer_block["blocks"].sort(key=lambda x: x["index"])
 
         fixed_blocks.append(two_layer_block)
 
     # 添加未处理的blocks
     for block in blocks:
-        if block["index"] not in processed_indices:
+        block.pop("type", None)
+        if block["index"] not in processed_indices and block not in not_include_blocks:
             not_include_blocks.append(block)
 
     return fixed_blocks, not_include_blocks