Преглед на файлове

refactor(pre_proc): improve character overlap handling in spans

- Remove remove_overlaps_chars function
- Add check_chars_is_overlap_in_span function
- Update span processing logic to handle character overlaps
- Improve efficiency and readability of overlap detection
myhloli преди 11 месеца
родител
ревизия
15e876677d
променени са 2 файла, в които са добавени 9 реда и са изтрити 28 реда
  1. 3 5
      magic_pdf/pdf_parse_union_core_v2.py
  2. 6 23
      magic_pdf/pre_proc/ocr_span_list_modify.py

+ 3 - 5
magic_pdf/pdf_parse_union_core_v2.py

@@ -35,7 +35,7 @@ from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
 from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split_v2
 from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block
 from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, \
-    remove_overlaps_min_spans, remove_overlaps_chars
+    remove_overlaps_min_spans, check_chars_is_overlap_in_span
 
 os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # 禁止albumentations检查更新
 
@@ -78,6 +78,8 @@ def chars_to_content(span):
     if len(span['chars']) == 0:
         pass
         # span['content'] = ''
+    elif check_chars_is_overlap_in_span(span['chars']):
+        pass
     else:
         # 先给chars按char['bbox']的中心点的x坐标排序
         span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
@@ -121,10 +123,6 @@ def fill_char_in_spans(spans, all_chars):
     empty_spans = []
 
     for span in spans:
-
-        # 移除同一个span中重叠的char
-        span['chars'] = remove_overlaps_chars(span['chars'])
-
         chars_to_content(span)
         # 有的span中虽然没有字但有一两个空的占位符,用宽高和content长度过滤
         if len(span['content']) * span['height'] < span['width'] * 0.5:

+ 6 - 23
magic_pdf/pre_proc/ocr_span_list_modify.py

@@ -33,29 +33,12 @@ def remove_overlaps_low_confidence_spans(spans):
     return spans, dropped_spans
 
 
-def remove_overlaps_chars(chars):
-    dropped_chars = []
-    #  删除重叠的char
-    for char1 in chars:
-        for char2 in chars:
-            if char1 != char2:
-                # char1 或 char2 任何一个都不应该在 dropped_chars 中
-                if char1 in dropped_chars or char2 in dropped_chars:
-                    continue
-                else:
-                    if calculate_iou(char1['bbox'], char2['bbox']) > 0.95:
-                        char_need_remove = char1
-                        if (
-                            char_need_remove is not None
-                            and char_need_remove not in dropped_chars
-                        ):
-                            dropped_chars.append(char_need_remove)
-
-    if len(dropped_chars) > 0:
-        for char_need_remove in dropped_chars:
-            chars.remove(char_need_remove)
-
-    return chars
+def check_chars_is_overlap_in_span(chars):
+    for i in range(len(chars)):
+        for j in range(i + 1, len(chars)):
+            if calculate_iou(chars[i]['bbox'], chars[j]['bbox']) > 0.9:
+                return True
+    return False
 
 
 def remove_overlaps_min_spans(spans):