瀏覽代碼

更新了para_split

liukaiwen 1 年之前
父節點
當前提交
4cc88d2b6b
共有 1 個文件被更改,包括 3 次插入和 0 次删除
  1. 3 0
      magic_pdf/para/para_split_v2.py

+ 3 - 0
magic_pdf/para/para_split_v2.py

@@ -133,6 +133,7 @@ def __valign_lines(blocks, layout_bboxes):
     for layout_box in layout_bboxes:
         blocks_in_layoutbox = [b for b in blocks if b["type"] == BlockType.Text and is_in_layout(b['bbox'], layout_box['layout_bbox'])]
         if len(blocks_in_layoutbox) == 0 or len(blocks_in_layoutbox[0]["lines"]) == 0:
+            new_layout_bboxes.append(layout_box['layout_bbox'])
             continue
 
         x0_lst = np.array([[line['bbox'][0], 0] for block in blocks_in_layoutbox for line in block['lines']])
@@ -400,6 +401,8 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
     """
     if len(pre_page_paras) == 0 or len(next_page_paras) == 0:  # 0的时候最后的return 会出错
         return False
+    if len(pre_page_paras[-1]) == 0 or len(next_page_paras[0]) == 0:
+        return False
     if pre_page_paras[-1][-1]["type"] != BlockType.Text or next_page_paras[0][0]["type"] != BlockType.Text:
         return False
     if pre_page_list_info[1] and not next_page_list_info[0]:  # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进