|
@@ -90,6 +90,9 @@ def chars_to_content(span):
|
|
|
|
|
|
|
|
|
|
|
|
|
LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
|
|
LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
|
|
|
|
|
+LINE_START_FLAG = ('(', '(', '"', '“', '【', '{', '《', '<', '「', '『', '【', '[',)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
def fill_char_in_spans(spans, all_chars):
|
|
def fill_char_in_spans(spans, all_chars):
|
|
|
|
|
|
|
|
# 简单从上到下排一下序
|
|
# 简单从上到下排一下序
|
|
@@ -97,12 +100,7 @@ def fill_char_in_spans(spans, all_chars):
|
|
|
|
|
|
|
|
for char in all_chars:
|
|
for char in all_chars:
|
|
|
for span in spans:
|
|
for span in spans:
|
|
|
- # 判断char是否属于LINE_STOP_FLAG
|
|
|
|
|
- if char['c'] in LINE_STOP_FLAG:
|
|
|
|
|
- char_is_line_stop_flag = True
|
|
|
|
|
- else:
|
|
|
|
|
- char_is_line_stop_flag = False
|
|
|
|
|
- if calculate_char_in_span(char['bbox'], span['bbox'], char_is_line_stop_flag):
|
|
|
|
|
|
|
+ if calculate_char_in_span(char['bbox'], span['bbox'], char['c']):
|
|
|
span['chars'].append(char)
|
|
span['chars'].append(char)
|
|
|
break
|
|
break
|
|
|
|
|
|
|
@@ -119,7 +117,7 @@ def fill_char_in_spans(spans, all_chars):
|
|
|
|
|
|
|
|
|
|
|
|
|
# 使用鲁棒性更强的中心点坐标判断
|
|
# 使用鲁棒性更强的中心点坐标判断
|
|
|
-def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
|
|
|
|
|
|
|
+def calculate_char_in_span(char_bbox, span_bbox, char):
|
|
|
char_center_x = (char_bbox[0] + char_bbox[2]) / 2
|
|
char_center_x = (char_bbox[0] + char_bbox[2]) / 2
|
|
|
char_center_y = (char_bbox[1] + char_bbox[3]) / 2
|
|
char_center_y = (char_bbox[1] + char_bbox[3]) / 2
|
|
|
span_center_y = (span_bbox[1] + span_bbox[3]) / 2
|
|
span_center_y = (span_bbox[1] + span_bbox[3]) / 2
|
|
@@ -134,7 +132,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
|
|
|
else:
|
|
else:
|
|
|
# 如果char是LINE_STOP_FLAG,就不用中心点判定,换一种方案(左边界在span区域内,高度判定和之前逻辑一致)
|
|
# 如果char是LINE_STOP_FLAG,就不用中心点判定,换一种方案(左边界在span区域内,高度判定和之前逻辑一致)
|
|
|
# 主要是给结尾符号一个进入span的机会,这个char还应该离span右边界较近
|
|
# 主要是给结尾符号一个进入span的机会,这个char还应该离span右边界较近
|
|
|
- if char_is_line_stop_flag:
|
|
|
|
|
|
|
+ if char in LINE_STOP_FLAG:
|
|
|
if (
|
|
if (
|
|
|
(span_bbox[2] - span_height) < char_bbox[0] < span_bbox[2]
|
|
(span_bbox[2] - span_height) < char_bbox[0] < span_bbox[2]
|
|
|
and char_center_x > span_bbox[0]
|
|
and char_center_x > span_bbox[0]
|
|
@@ -142,6 +140,14 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
|
|
|
and abs(char_center_y - span_center_y) < span_height / 4
|
|
and abs(char_center_y - span_center_y) < span_height / 4
|
|
|
):
|
|
):
|
|
|
return True
|
|
return True
|
|
|
|
|
+ elif char in LINE_START_FLAG:
|
|
|
|
|
+ if (
|
|
|
|
|
+ span_bbox[0] < char_bbox[2] < (span_bbox[0] + span_height)
|
|
|
|
|
+ and char_center_x < span_bbox[2]
|
|
|
|
|
+ and span_bbox[1] < char_center_y < span_bbox[3]
|
|
|
|
|
+ and abs(char_center_y - span_center_y) < span_height / 4
|
|
|
|
|
+ ):
|
|
|
|
|
+ return True
|
|
|
else:
|
|
else:
|
|
|
return False
|
|
return False
|
|
|
|
|
|