Browse Source

refactor(pdf_parse): improve text content extraction from PDF spans

- Optimize character sorting for accurate text assembly
- Handle empty char scenarios to prevent errors
- Remove unnecessary comments and improve code readability
- Enhance OCR text content handling by removing low-confidence spans
myhloli 11 tháng trước
mục cha
commit
14656085f5
1 tập tin đã thay đổi với 16 bổ sung và 17 xóa
  1. 16 17
      magic_pdf/pdf_parse_union_core_v2.py

+ 16 - 17
magic_pdf/pdf_parse_union_core_v2.py

@@ -89,28 +89,25 @@ def __replace_STX_ETX(text_str: str):
 
 
 def chars_to_content(span):
-    # # 先给chars按char['bbox']的x坐标排序
-    # span['chars'] = sorted(span['chars'], key=lambda x: x['bbox'][0])
-
-    # 先给chars按char['bbox']的中心点的x坐标排序
-    span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
-    content = ''
-
-    # 求char的平均宽度
+    # 检查span中的char是否为空
     if len(span['chars']) == 0:
-        span['content'] = content
-        del span['chars']
-        return
+        span['content'] = ''
     else:
+        # 先给chars按char['bbox']的中心点的x坐标排序
+        span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
+
+        # 求char的平均宽度
         char_width_sum = sum([char['bbox'][2] - char['bbox'][0] for char in span['chars']])
         char_avg_width = char_width_sum / len(span['chars'])
 
-    for char in span['chars']:
-        # 如果下一个char的x0和上一个char的x1距离超过一个字符宽度,则需要在中间插入一个空格
-        if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width:
-            content += ' '
-        content += char['c']
-    span['content'] = __replace_STX_ETX(content)
+        content = ''
+        for char in span['chars']:
+            # 如果下一个char的x0和上一个char的x1距离超过一个字符宽度,则需要在中间插入一个空格
+            if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width:
+                content += ' '
+            content += char['c']
+        span['content'] = __replace_STX_ETX(content)
+
     del span['chars']
 
 
@@ -218,6 +215,8 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
                     ocr_text, ocr_score = ocr_res[0][0]
                     if ocr_score > 0.5 and len(ocr_text) > 0:
                         span['content'] = ocr_text
+                    else:
+                        spans.remove(span)
 
     return spans