浏览代码

fix(pre_proc): improve character overlap handling in OCR processing

- Add condition to check for identical or space characters when resolving overlaps
- Skip non-conflicting character pairs to prevent unnecessary removals
myhloli 7 月之前
父节点
当前提交
be505a958c
共有 1 个文件被更改，包括 10 次插入 和 8 次删除
  1. 10 8
      magic_pdf/pre_proc/ocr_span_list_modify.py

+ 10 - 8
magic_pdf/pre_proc/ocr_span_list_modify.py

@@ -71,15 +71,17 @@ def remove_x_overlapping_chars(span, median_width):
             overlap_width = x_right - x_left
 
             if overlap_width > overlap_threshold:
-                # Determine which character to remove
-                width1 = char1['bbox'][2] - char1['bbox'][0]
-                width2 = char2['bbox'][2] - char2['bbox'][0]
-
-                if width1 < width2:
-                    # Remove the narrower character
-                    span['chars'].pop(i)
+                if char1['c'] == char2['c'] or char1['c'] == ' ' or char2['c'] == ' ':
+                    # Determine which character to remove
+                    width1 = char1['bbox'][2] - char1['bbox'][0]
+                    width2 = char2['bbox'][2] - char2['bbox'][0]
+                    if width1 < width2:
+                        # Remove the narrower character
+                        span['chars'].pop(i)
+                    else:
+                        span['chars'].pop(i + 1)
                 else:
-                    span['chars'].pop(i + 1)
+                    i += 1
 
                 # Don't increment i since we need to check the new pair
             else: