Browse Source

Merge pull request #45 from papayalove/master

更新了para_split
myhloli 1 year ago
parent
commit
82421f7cf0
1 changed file with 4 additions and 1 deletion
  1. magic_pdf/para/para_split_v2.py (+4 -1)

+ 4 - 1
magic_pdf/para/para_split_v2.py

@@ -132,7 +132,8 @@ def __valign_lines(blocks, layout_bboxes):
 
     for layout_box in layout_bboxes:
         blocks_in_layoutbox = [b for b in blocks if b["type"] == BlockType.Text and is_in_layout(b['bbox'], layout_box['layout_bbox'])]
-        if len(blocks_in_layoutbox) == 0:
+        if len(blocks_in_layoutbox) == 0 or len(blocks_in_layoutbox[0]["lines"]) == 0:
+            new_layout_bboxes.append(layout_box['layout_bbox'])
             continue
 
         x0_lst = np.array([[line['bbox'][0], 0] for block in blocks_in_layoutbox for line in block['lines']])
@@ -400,6 +401,8 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
     """
     if len(pre_page_paras) == 0 or len(next_page_paras) == 0:  # 0的时候最后的return 会出错
         return False
+    if len(pre_page_paras[-1]) == 0 or len(next_page_paras[0]) == 0:
+        return False
     if pre_page_paras[-1][-1]["type"] != BlockType.Text or next_page_paras[0][0]["type"] != BlockType.Text:
         return False
     if pre_page_list_info[1] and not next_page_list_info[0]:  # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进