11 tháng trước cách đây · 14656085f5
--- a/magic_pdf/pdf_parse_union_core_v2.py
+++ b/magic_pdf/pdf_parse_union_core_v2.py
@@ -89,28 +89,25 @@ def __replace_STX_ETX(text_str: str):
 
				 
			
 
				 
			
 
				 def chars_to_content(span):
			
 
				-    # # 先给chars按char['bbox']的x坐标排序
			
 
				-    # span['chars'] = sorted(span['chars'], key=lambda x: x['bbox'][0])
			
 
				-
			
 
				-    # 先给chars按char['bbox']的中心点的x坐标排序
			
 
				-    span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
			
 
				-    content = ''
			
 
				-
			
 
				-    # 求char的平均宽度
			
 
				+    # 检查span中的char是否为空
			
 
				     if len(span['chars']) == 0:
			
 
				-        span['content'] = content
			
 
				-        del span['chars']
			
 
				-        return
			
 
				+        span['content'] = ''
			
 
				     else:
			
 
				+        # 先给chars按char['bbox']的中心点的x坐标排序
			
 
				+        span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
			
 
				+
			
 
				+        # 求char的平均宽度
			
 
				         char_width_sum = sum([char['bbox'][2] - char['bbox'][0] for char in span['chars']])
			
 
				         char_avg_width = char_width_sum / len(span['chars'])
			
 
				 
			
 
				-    for char in span['chars']:
			
 
				-        # 如果下一个char的x0和上一个char的x1距离超过一个字符宽度，则需要在中间插入一个空格
			
 
				-        if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width:
			
 
				-            content += ' '
			
 
				-        content += char['c']
			
 
				-    span['content'] = __replace_STX_ETX(content)
			
 
				+        content = ''
			
 
				+        for char in span['chars']:
			
 
				+            # 如果下一个char的x0和上一个char的x1距离超过一个字符宽度，则需要在中间插入一个空格
			
 
				+            if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width:
			
 
				+                content += ' '
			
 
				+            content += char['c']
			
 
				+        span['content'] = __replace_STX_ETX(content)
			
 
				+
			
 
				     del span['chars']
			
 
				 
			
 
				 
			
@@ -218,6 +215,8 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
 
				                     ocr_text, ocr_score = ocr_res[0][0]
			
 
				                     if ocr_score > 0.5 and len(ocr_text) > 0:
			
 
				                         span['content'] = ocr_text
			
 
				+                    else:
			
 
				+                        spans.remove(span)
			
 
				 
			
 
				     return spans