Browse Source

Merge pull request #99 from papayalove/master

解决分段span丢失问题
myhloli 1 year ago
parent
commit
6a993c1671
1 changed file with 4 additions and 1 deletion
  1. 4 1
      magic_pdf/para/para_split_v2.py

+ 4 - 1
magic_pdf/para/para_split_v2.py

@@ -489,7 +489,10 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox, lang):
     connected_layout_blocks.append(blocks_group[0])
     connected_layout_blocks.append(blocks_group[0])
     for i in range(1, len(blocks_group)):
     for i in range(1, len(blocks_group)):
         try:
         try:
-            if len(blocks_group[i]) == 0 or len(blocks_group[i - 1]) == 0:  # TODO 考虑连接问题,
+            if len(blocks_group[i]) == 0:
+                continue
+            if  len(blocks_group[i - 1]) == 0:  # TODO 考虑连接问题,
+                connected_layout_blocks.append(blocks_group[i])
                 continue
                 continue
             # text类型的段才需要考虑layout间的合并
             # text类型的段才需要考虑layout间的合并
             if blocks_group[i - 1][-1]["type"] != BlockType.Text or blocks_group[i][0]["type"] != BlockType.Text:
             if blocks_group[i - 1][-1]["type"] != BlockType.Text or blocks_group[i][0]["type"] != BlockType.Text: