Просмотр исходного кода

refactor: improve character span calculation and sorting logic

myhloli 5 месяцев назад
Родитель
Коммит
99192002dd
1 изменённый файл: 8 добавлено и 4 удалено
  1. 8 4
      mineru/utils/span_pre_proc.py

+ 8 - 4
mineru/utils/span_pre_proc.py

@@ -344,7 +344,8 @@ def fill_char_in_spans(spans, all_chars):
 LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
 LINE_START_FLAG = ('(', '(', '"', '“', '【', '{', '《', '<', '「', '『', '【', '[',)
 
-def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33):
+Span_Height_Radio = 0.33  # 字符的中轴和span的中轴高度差不能超过1/3span高度
+def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=Span_Height_Radio):
     char_center_x = (char_bbox[0] + char_bbox[2]) / 2
     char_center_y = (char_bbox[1] + char_bbox[3]) / 2
     span_center_y = (span_bbox[1] + span_bbox[3]) / 2
@@ -353,7 +354,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33):
     if (
         span_bbox[0] < char_center_x < span_bbox[2]
         and span_bbox[1] < char_center_y < span_bbox[3]
-        and abs(char_center_y - span_center_y) < span_height * span_height_radio  # 字符的中轴和span的中轴高度差不能超过1/4span高度
+        and abs(char_center_y - span_center_y) < span_height * span_height_radio  # 字符的中轴和span的中轴高度差不能超过Span_Height_Radio
     ):
         return True
     else:
@@ -385,7 +386,10 @@ def chars_to_content(span):
         pass
     else:
         # 先给chars按char['bbox']的中心点的x坐标排序
-        span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
+        # span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
+
+        # 给chars按char_idx排序
+        span['chars'] = sorted(span['chars'], key=lambda x: x['char_idx'])
 
         # Calculate the width of each character
         char_widths = [char['bbox'][2] - char['bbox'][0] for char in span['chars']]
@@ -393,7 +397,7 @@ def chars_to_content(span):
         median_width = statistics.median(char_widths)
 
         # 通过x轴重叠比率移除一部分char
-        span = remove_x_overlapping_chars(span, median_width)
+        # span = remove_x_overlapping_chars(span, median_width)
 
         content = ''
         for char in span['chars']: