
fix(pdf): improve ligature handling and text extraction

- Move ligature replacement function to pdf_parse_union_core_v2.py
- Optimize ligature replacement using a more efficient approach
- Modify text extraction flags to preserve ligatures in PDF content
- Remove unnecessary function from ocr_mkcontent.py
myhloli, 11 months ago
commit c638fc5d1f
2 files changed, 15 insertions(+), 16 deletions(-)
  1. magic_pdf/dict2md/ocr_mkcontent.py (+0, -10)
  2. magic_pdf/pdf_parse_union_core_v2.py (+15, -6)

+ 0 - 10
magic_pdf/dict2md/ocr_mkcontent.py

@@ -125,16 +125,6 @@ def detect_language(text):
         return 'empty'
 
 
-# Split ligature characters
-def __replace_ligatures(text: str):
-    text = re.sub(r'ﬁ', 'fi', text)  # replace the fi ligature
-    text = re.sub(r'ﬂ', 'fl', text)  # replace the fl ligature
-    text = re.sub(r'ﬀ', 'ff', text)  # replace the ff ligature
-    text = re.sub(r'ﬃ', 'ffi', text)  # replace the ffi ligature
-    text = re.sub(r'ﬄ', 'ffl', text)  # replace the ffl ligature
-    return text
-
-
 def merge_para_with_text(para_block):
     block_text = ''
     for line in para_block['lines']:

+ 15 - 6
magic_pdf/pdf_parse_union_core_v2.py

@@ -1,5 +1,6 @@
 import copy
 import os
+import re
 import statistics
 import time
 from typing import List
@@ -63,6 +64,15 @@ def __replace_0xfffd(text_str: str):
         return s
     return text_str
 
+
+# Split ligature characters
+def __replace_ligatures(text: str):
+    ligatures = {
+        'ﬁ': 'fi', 'ﬂ': 'fl', 'ﬀ': 'ff', 'ﬃ': 'ffi', 'ﬄ': 'ffl', 'ﬅ': 'ft', 'ﬆ': 'st'
+    }
+    return re.sub('|'.join(map(re.escape, ligatures.keys())), lambda m: ligatures[m.group()], text)
+
+
 def chars_to_content(span):
     # Check whether the chars in the span are empty
     if len(span['chars']) == 0:
@@ -83,6 +93,7 @@ def chars_to_content(span):
                 content += ' '
             content += char['c']
 
+        content = __replace_ligatures(content)
         span['content'] = __replace_0xfffd(content)
 
     del span['chars']
@@ -152,9 +163,11 @@ def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33):
 
 
 def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
+    # cids are rendered as 0xfffd; ligatures are split apart
+    # text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
 
-    text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
-
+    # cids are rendered as 0xfffd; ligatures are kept intact
+    text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
     all_pymu_chars = []
     for block in text_blocks_raw:
         for line in block['lines']:
@@ -255,10 +268,6 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
     return spans
 
 
-def replace_text_span(pymu_spans, ocr_spans):
-    return list(filter(lambda x: x['type'] != ContentType.Text, ocr_spans)) + pymu_spans
-
-
 def model_init(model_name: str):
     from transformers import LayoutLMv3ForTokenClassification
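
For reference, a minimal usage sketch of the changed extraction path: with `fitz.TEXT_PRESERVE_LIGATURES` set, PyMuPDF reports ligatures as single glyphs (e.g. 'ﬁ') instead of splitting them itself, and the parser expands them afterwards with the ligature mapping. The file name and the `replace_ligatures` helper (from the sketch above) are placeholders, not part of this commit:

```python
import fitz  # PyMuPDF

# Same flag combination as the updated txt_spans_extract_v2: keep ligature
# glyphs intact so they can be expanded explicitly after extraction.
FLAGS = (fitz.TEXT_PRESERVE_LIGATURES
         | fitz.TEXT_PRESERVE_WHITESPACE
         | fitz.TEXT_MEDIABOX_CLIP)

doc = fitz.open('example.pdf')  # placeholder path
page = doc[0]
blocks = page.get_text('rawdict', flags=FLAGS)['blocks']

# Walk the rawdict structure block -> line -> span -> char, as the parser does.
chars = [
    char['c']
    for block in blocks
    for line in block.get('lines', [])
    for span in line['spans']
    for char in span['chars']
]
print(replace_ligatures(''.join(chars)))
```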