|
|
@@ -89,29 +89,29 @@ def __replace_STX_ETX(text_str: str):
|
|
|
|
|
|
|
|
|
def chars_to_content(span):
    """Collapse ``span['chars']`` into a single ``span['content']`` string.

    The characters are ordered left to right by the horizontal centre of
    their bounding box and then concatenated. A single space is inserted
    between two neighbouring characters when the gap between the right edge
    of the previous one and the left edge of the next one is wider than the
    average character width of the span.

    The span is modified in place: ``span['content']`` is set and
    ``span['chars']`` is removed. Nothing is returned.

    Args:
        span: dict with a ``'chars'`` list. Each char is a dict holding
            ``'bbox'`` (indices 0 and 2 are read as the left and right x
            coordinates) and ``'c'`` (the text of the character).
    """
    # Order by the x coordinate of the bbox centre rather than the left edge.
    chars = sorted(
        span['chars'],
        key=lambda ch: (ch['bbox'][0] + ch['bbox'][2]) / 2,
    )

    # Nothing to join: store an empty content string and drop the raw chars.
    if not chars:
        span['content'] = ''
        del span['chars']
        return

    # Average character width, used as the threshold for inserting a space.
    avg_width = sum(ch['bbox'][2] - ch['bbox'][0] for ch in chars) / len(chars)

    pieces = []
    prev_x1 = None  # right edge of the previously emitted char
    for ch in chars:
        x0, x1 = ch['bbox'][0], ch['bbox'][2]
        # Track the predecessor directly instead of looking it up with
        # list.index(): that lookup is quadratic, matches by equality (so a
        # duplicated char dict picks the wrong neighbour) and wraps around
        # to the last char for the first element.
        if prev_x1 is not None and x0 - prev_x1 > avg_width:
            pieces.append(' ')
        pieces.append(ch['c'])
        prev_x1 = x1

    span['content'] = __replace_STX_ETX(''.join(pieces))
    del span['chars']
|
|
|
|
|
|
|
|
|
# Punctuation marks collected in one tuple; judging by the name these are
# presumably treated as line-ending markers -- confirm against the callers.
LINE_STOP_FLAG = (
    '.', '!', '?', '。', '!', '?',
    ')', ')', '"', '”',
    ':', ':', ';', ';',
    ']', '】', '}', '}', '>', '》',
    '、', ',', ',',
    '-', '—', '–',
)
|