Pārlūkot izejas kodu

Merge pull request #1534 from myhloli/dev

Update pdf_parse_union_core_v2.py
Xiaomeng Zhao 10 mēneši atpakaļ
vecāks
revīzija
1b0ef29aa2
1 mainīts fails ar 5 papildinājumiem un 1 dzēšanu
  1. 5 1
      magic_pdf/pdf_parse_union_core_v2.py

+ 5 - 1
magic_pdf/pdf_parse_union_core_v2.py

@@ -178,7 +178,11 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
     # text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
 
     # cid用0xfffd表示,连字符不拆开
-    text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
+    #text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
+
+    # 自定义flags出现较多0xfffd,可能是pymupdf可以自行处理内置字典的pdf,不再使用
+    text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
+    # text_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
     all_pymu_chars = []
     for block in text_blocks_raw:
         for line in block['lines']: