Просмотр исходного кода

feat(pre_proc): add function to remove x-overlapping characters in spans

- Implement `remove_x_overlapping_chars` function in `ocr_span_list_modify.py`
- Integrate the new function in `pdf_parse_union_core_v2.py` to process spans
- Remove unnecessary character replacement functions and comments
myhloli 8 месяцев назад
Родитель
Коммит
3f2bafa88f
Изменено файлов: 2 (добавлено строк: 59, удалено строк: 22)
  1. 10 22
      magic_pdf/pdf_parse_union_core_v2.py
  2. 49 0
      magic_pdf/pre_proc/ocr_span_list_modify.py

+ 10 - 22
magic_pdf/pdf_parse_union_core_v2.py

@@ -34,7 +34,7 @@ from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
 from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split_v2
 from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block
 from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, \
-    remove_overlaps_min_spans, check_chars_is_overlap_in_span
+    remove_overlaps_min_spans, remove_x_overlapping_chars
 
 os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # 禁止albumentations检查更新
 
@@ -56,14 +56,6 @@ def __replace_STX_ETX(text_str: str):
     return text_str
 
 
-def __replace_0xfffd(text_str: str):
-    """Replace \ufffd, as these characters become garbled when extracted using pymupdf."""
-    if text_str:
-        s = text_str.replace('\ufffd', " ")
-        return s
-    return text_str
-
-
 # 连写字符拆分
 def __replace_ligatures(text: str):
     ligatures = {
@@ -76,16 +68,17 @@ def chars_to_content(span):
     # 检查span中的char是否为空
     if len(span['chars']) == 0:
         pass
-        # span['content'] = ''
-    elif check_chars_is_overlap_in_span(span['chars']):
-        pass
     else:
         # 先给chars按char['bbox']的中心点的x坐标排序
         span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
 
-        # 求char的平均宽度
-        char_width_sum = sum([char['bbox'][2] - char['bbox'][0] for char in span['chars']])
-        char_avg_width = char_width_sum / len(span['chars'])
+        # Calculate the width of each character
+        char_widths = [char['bbox'][2] - char['bbox'][0] for char in span['chars']]
+        # Calculate the median width
+        median_width = statistics.median(char_widths)
+
+        # 通过x轴重叠比率移除一部分char
+        span = remove_x_overlapping_chars(span, median_width)
 
         content = ''
         for char in span['chars']:
@@ -93,13 +86,12 @@ def chars_to_content(span):
             # 如果下一个char的x0和上一个char的x1距离超过0.25个字符宽度,则需要在中间插入一个空格
             char1 = char
             char2 = span['chars'][span['chars'].index(char) + 1] if span['chars'].index(char) + 1 < len(span['chars']) else None
-            if char2 and char2['bbox'][0] - char1['bbox'][2] > char_avg_width * 0.25 and char['c'] != ' ' and char2['c'] != ' ':
+            if char2 and char2['bbox'][0] - char1['bbox'][2] > median_width * 0.25 and char['c'] != ' ' and char2['c'] != ' ':
                 content += f"{char['c']} "
             else:
                 content += char['c']
 
-        content = __replace_ligatures(content)
-        span['content'] = __replace_0xfffd(content)
+        span['content'] = __replace_ligatures(content)
 
     del span['chars']
 
@@ -114,10 +106,6 @@ def fill_char_in_spans(spans, all_chars):
     spans = sorted(spans, key=lambda x: x['bbox'][1])
 
     for char in all_chars:
-        # 跳过非法bbox的char
-        # x1, y1, x2, y2 = char['bbox']
-        # if abs(x1 - x2) <= 0.01 or abs(y1 - y2) <= 0.01:
-        #     continue
 
         for span in spans:
             if calculate_char_in_span(char['bbox'], span['bbox'], char['c']):

+ 49 - 0
magic_pdf/pre_proc/ocr_span_list_modify.py

@@ -41,6 +41,55 @@ def check_chars_is_overlap_in_span(chars):
     return False
 
 
+def remove_x_overlapping_chars(span, median_width):
+    """
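+    Drop the narrower char (the second on a tie) of adjacent chars that overlap significantly on the x-axis.
+
+    Args:
+        span (dict): A span containing a list of chars, each with bbox coordinates
+                    in the format [x0, y0, x1, y1]
+        median_width (float): Median char width of the span; overlaps wider than 0.3 of it are removed
+
+    Returns:
+        dict: The same span, modified in place, with overlapping characters removed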
+    """
+    if 'chars' not in span or len(span['chars']) < 2:
+        return span
+
+    overlap_threshold = median_width * 0.3
+
+    i = 0
+    while i < len(span['chars']) - 1:
+        char1 = span['chars'][i]
+        char2 = span['chars'][i + 1]
+
+        # Calculate overlap width
+        x_left = max(char1['bbox'][0], char2['bbox'][0])
+        x_right = min(char1['bbox'][2], char2['bbox'][2])
+
+        if x_right > x_left:  # There is overlap
+            overlap_width = x_right - x_left
+
+            if overlap_width > overlap_threshold:
+                # Determine which character to remove
+                width1 = char1['bbox'][2] - char1['bbox'][0]
+                width2 = char2['bbox'][2] - char2['bbox'][0]
+
+                if width1 < width2:
+                    # Remove the narrower character
+                    span['chars'].pop(i)
+                else:
+                    span['chars'].pop(i + 1)
+
+                # Don't increment i since we need to check the new pair
+            else:
+                i += 1
+        else:
+            i += 1
+
+    return span
+
+
 def remove_overlaps_min_spans(spans):
     dropped_spans = []
     #  删除重叠spans中较小的那些