Explorar el Código

feat(pdf_parse): add line start flag detection and optimize line stop flag logic
- Add LINE_START_FLAG tuple to identify starting flags of a line
- Modify calculate_char_in_span function to handle both line start and stop flags
- Remove redundant char_is_line_stop_flag variable and simplify logic
- Improve line flag detection to enhance text extraction accuracy

myhloli hace 11 meses
padre
commit
949d0867fb
Se ha modificado 1 fichero con 14 adiciones y 8 borrados
  1. 14 8
      magic_pdf/pdf_parse_union_core_v2.py

+ 14 - 8
magic_pdf/pdf_parse_union_core_v2.py

@@ -90,6 +90,9 @@ def chars_to_content(span):
 
 
 LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
+LINE_START_FLAG = ('(', '(', '"', '“', '【', '{', '《', '<', '「', '『', '【', '[',)
+
+
 def fill_char_in_spans(spans, all_chars):
 
     # 简单从上到下排一下序
@@ -97,12 +100,7 @@ def fill_char_in_spans(spans, all_chars):
 
     for char in all_chars:
         for span in spans:
-            # 判断char是否属于LINE_STOP_FLAG
-            if char['c'] in LINE_STOP_FLAG:
-                char_is_line_stop_flag = True
-            else:
-                char_is_line_stop_flag = False
-            if calculate_char_in_span(char['bbox'], span['bbox'], char_is_line_stop_flag):
+            if calculate_char_in_span(char['bbox'], span['bbox'], char['c']):
                 span['chars'].append(char)
                 break
 
@@ -119,7 +117,7 @@ def fill_char_in_spans(spans, all_chars):
 
 
 # 使用鲁棒性更强的中心点坐标判断
-def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
+def calculate_char_in_span(char_bbox, span_bbox, char):
     char_center_x = (char_bbox[0] + char_bbox[2]) / 2
     char_center_y = (char_bbox[1] + char_bbox[3]) / 2
     span_center_y = (span_bbox[1] + span_bbox[3]) / 2
@@ -134,7 +132,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
     else:
         # 如果char是LINE_STOP_FLAG,就不用中心点判定,换一种方案(左边界在span区域内,高度判定和之前逻辑一致)
         # 主要是给结尾符号一个进入span的机会,这个char还应该离span右边界较近
-        if char_is_line_stop_flag:
+        if char in LINE_STOP_FLAG:
             if (
                 (span_bbox[2] - span_height) < char_bbox[0] < span_bbox[2]
                 and char_center_x > span_bbox[0]
@@ -142,6 +140,14 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
                 and abs(char_center_y - span_center_y) < span_height / 4
             ):
                 return True
+        elif char in LINE_START_FLAG:
+            if (
+                span_bbox[0] < char_bbox[2] < (span_bbox[0] + span_height)
+                and char_center_x < span_bbox[2]
+                and span_bbox[1] < char_center_y < span_bbox[3]
+                and abs(char_center_y - span_center_y) < span_height / 4
+            ):
+                return True
         else:
             return False