7 月之前 · be505a958c
--- a/magic_pdf/pre_proc/ocr_span_list_modify.py
+++ b/magic_pdf/pre_proc/ocr_span_list_modify.py
@@ -71,15 +71,17 @@ def remove_x_overlapping_chars(span, median_width):
 
				             overlap_width = x_right - x_left
			
 
				 
			
 
				             if overlap_width > overlap_threshold:
			
 
				-                # Determine which character to remove
			
 
				-                width1 = char1['bbox'][2] - char1['bbox'][0]
			
 
				-                width2 = char2['bbox'][2] - char2['bbox'][0]
			
 
				-
			
 
				-                if width1 < width2:
			
 
				-                    # Remove the narrower character
			
 
				-                    span['chars'].pop(i)
			
 
				+                if char1['c'] == char2['c'] or char1['c'] == ' ' or char2['c'] == ' ':
			
 
				+                    # Determine which character to remove
			
 
				+                    width1 = char1['bbox'][2] - char1['bbox'][0]
			
 
				+                    width2 = char2['bbox'][2] - char2['bbox'][0]
			
 
				+                    if width1 < width2:
			
 
				+                        # Remove the narrower character
			
 
				+                        span['chars'].pop(i)
			
 
				+                    else:
			
 
				+                        span['chars'].pop(i + 1)
			
 
				                 else:
			
 
				-                    span['chars'].pop(i + 1)
			
 
				+                    i += 1
			
 
				 
			
 
				                 # Don't increment i since we need to check the new pair
			
 
				             else: