Kaynağa Gözat

feat(para_split_v3): improve list identification with block aspect ratio

- Add block_height calculation to determine block aspect ratio
- Update list identification condition to include aspect ratio check
- Improve code readability with better formatting and line breaks
myhloli 1 yıl önce
ebeveyn
işleme
cf0d76c094
1 değiştirilmiş dosya ile 15 ekleme ve 6 silme
  1. 15 6
      magic_pdf/para/para_split_v3.py

+ 15 - 6
magic_pdf/para/para_split_v3.py

@@ -63,6 +63,7 @@ def __is_list_or_index_block(block):
         first_line = block['lines'][0]
         line_height = first_line['bbox'][3] - first_line['bbox'][1]
         block_weight = block['bbox_fs'][2] - block['bbox_fs'][0]
+        block_height = block['bbox_fs'][3] - block['bbox_fs'][1]
 
         left_close_num = 0
         left_not_close_num = 0
@@ -86,10 +87,12 @@ def __is_list_or_index_block(block):
 
             line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
             block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
-            if (line['bbox'][0] - block['bbox_fs'][0] > 0.8 * line_height and
-                block['bbox_fs'][2] - line['bbox'][2] > 0.8 * line_height):
+            if (
+                    line['bbox'][0] - block['bbox_fs'][0] > 0.8 * line_height and
+                    block['bbox_fs'][2] - line['bbox'][2] > 0.8 * line_height
+            ):
                 external_sides_not_close_num += 1
-            if abs(line_mid_x - block_mid_x) < line_height/2:
+            if abs(line_mid_x - block_mid_x) < line_height / 2:
                 center_close_num += 1
 
             line_text = ""
@@ -142,7 +145,7 @@ def __is_list_or_index_block(block):
                 line_num_flag = True
 
         # 有的目录右侧不贴边, 目前认为左边或者右边有一边全贴边,且符合数字规则极为index
-        if ((left_close_num/len(block['lines']) >= 0.8 or right_close_num/len(block['lines']) >= 0.8)
+        if ((left_close_num / len(block['lines']) >= 0.8 or right_close_num / len(block['lines']) >= 0.8)
                 and line_num_flag
         ):
             for line in block['lines']:
@@ -150,7 +153,13 @@ def __is_list_or_index_block(block):
             return BlockType.Index
 
         # 全部line都居中的特殊list识别,每行都需要换行,特征是多行,且大多数行都前后not_close,每line中点x坐标接近
-        elif external_sides_not_close_num >= 2 and center_close_num == len(block['lines']) and external_sides_not_close_num / len(block['lines']) >= 0.5:
+        # 补充条件block的长宽比有要求
+        elif (
+                external_sides_not_close_num >= 2 and
+                center_close_num == len(block['lines']) and
+                external_sides_not_close_num / len(block['lines']) >= 0.5 and
+                block_height / block_weight > 0.4
+        ):
             for line in block['lines']:
                 line[ListLineTag.IS_LIST_START_LINE] = True
             return BlockType.List
@@ -170,7 +179,7 @@ def __is_list_or_index_block(block):
                         if lines_text_list[i][-1] in LIST_END_FLAG:
                             line[ListLineTag.IS_LIST_END_LINE] = True
                             if i + 1 < len(block['lines']):
-                                block['lines'][i+1][ListLineTag.IS_LIST_START_LINE] = True
+                                block['lines'][i + 1][ListLineTag.IS_LIST_START_LINE] = True
                 # line item基本没有结束标识符,而且也没有缩进,按右侧空隙判断哪些是item end
                 else:
                     line_start_flag = False