11 달 전 · 7964ae45d2
--- a/magic_pdf/pdf_parse_union_core_v2.py
+++ b/magic_pdf/pdf_parse_union_core_v2.py
@@ -89,29 +89,29 @@ def __replace_STX_ETX(text_str: str):
 
				 
			
 
				 
			
 
				 def chars_to_content(span):
			
 
				-        # # 先给chars按char['bbox']的x坐标排序
			
 
				-        # span['chars'] = sorted(span['chars'], key=lambda x: x['bbox'][0])
			
 
				-
			
 
				-        # 先给chars按char['bbox']的中心点的x坐标排序
			
 
				-        span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
			
 
				-        content = ''
			
 
				-
			
 
				-        # 求char的平均宽度
			
 
				-        if len(span['chars']) == 0:
			
 
				-            span['content'] = content
			
 
				-            del span['chars']
			
 
				-            return
			
 
				-        else:
			
 
				-            char_width_sum = sum([char['bbox'][2] - char['bbox'][0] for char in span['chars']])
			
 
				-            char_avg_width = char_width_sum / len(span['chars'])
			
 
				-
			
 
				-        for char in span['chars']:
			
 
				-            # 如果下一个char的x0和上一个char的x1距离超过一个字符宽度，则需要在中间插入一个空格
			
 
				-            if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width:
			
 
				-                content += ' '
			
 
				-            content += char['c']
			
 
				-        span['content'] = __replace_STX_ETX(content)
			
 
				+    # # 先给chars按char['bbox']的x坐标排序
			
 
				+    # span['chars'] = sorted(span['chars'], key=lambda x: x['bbox'][0])
			
 
				+
			
 
				+    # 先给chars按char['bbox']的中心点的x坐标排序
			
 
				+    span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
			
 
				+    content = ''
			
 
				+
			
 
				+    # 求char的平均宽度
			
 
				+    if len(span['chars']) == 0:
			
 
				+        span['content'] = content
			
 
				         del span['chars']
			
 
				+        return
			
 
				+    else:
			
 
				+        char_width_sum = sum([char['bbox'][2] - char['bbox'][0] for char in span['chars']])
			
 
				+        char_avg_width = char_width_sum / len(span['chars'])
			
 
				+
			
 
				+    for char in span['chars']:
			
 
				+        # 如果下一个char的x0和上一个char的x1距离超过一个字符宽度，则需要在中间插入一个空格
			
 
				+        if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width:
			
 
				+            content += ' '
			
 
				+        content += char['c']
			
 
				+    span['content'] = __replace_STX_ETX(content)
			
 
				+    del span['chars']
			
 
				 
			
 
				 
			
 
				 LINE_STOP_FLAG = ('.', '!', '?', '。', '！', '？', ')', '）', '"', '”', ':', '：', ';', '；', ']', '】', '}', '}', '>', '》', '、', ',', '，', '-', '—', '–',)