Pārlūkot izejas kodu

refactor(ocr): improve text processing and span handling

- Remove line-level language detection and the CJK-specific spacing branch it controlled
- Simplify text content processing logic
- Update span sorting and text extraction in pdf_parse_union_core_v2.py
myhloli 11 mēneši atpakaļ
vecāks
revīzija
88c0854a65

+ 9 - 21
magic_pdf/dict2md/ocr_mkcontent.py

@@ -136,14 +136,11 @@ def merge_para_with_text(para_block):
             para_text += '  \n'
 
         line_text = ''
-        line_lang = ''
         for span in line['spans']:
             span_type = span['type']
             if span_type == ContentType.Text:
                 line_text += span['content'].strip()
 
-        if line_text != '':
-            line_lang = detect_lang(line_text)
         for j, span in enumerate(line['spans']):
 
             span_type = span['type']
@@ -157,27 +154,18 @@ def merge_para_with_text(para_block):
 
             content = content.strip()
             if content != '':
-                langs = ['zh', 'ja', 'ko']
-                if line_lang in langs:  # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断
-                    if span_type in [ContentType.Text, ContentType.InterlineEquation]:
-                        para_text += content  # 中文/日语/韩文语境下,content间不需要空格分隔
-                    elif span_type == ContentType.InlineEquation:
-                        para_text += f' {content} '
-                else:
-                    if span_type in [ContentType.Text, ContentType.InlineEquation]:
-                        # 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
-                        if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
-                            para_text += content[:-1]
-                        elif len(content) == 1 and content not in ['A', 'I', 'a', 'i'] and not content.isdigit():
-                            para_text += content
-                        else:  # 西方文本语境下 content间需要空格分隔
-                            para_text += f'{content} '
-                    elif span_type == ContentType.InterlineEquation:
-                        para_text += content
+                if span_type in [ContentType.Text, ContentType.InlineEquation]:
+                    # 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
+                    if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
+                        para_text += content[:-1]
+                    else:  # content间需要空格分隔
+                        para_text += f'{content} '
+                elif span_type == ContentType.InterlineEquation:
+                    para_text += content
             else:
                 continue
     # 连写字符拆分
-    para_text = __replace_ligatures(para_text)
+    # para_text = __replace_ligatures(para_text)
 
     return para_text
 

+ 4 - 1
magic_pdf/pdf_parse_union_core_v2.py

@@ -84,6 +84,9 @@ def chars_to_content(span):
 LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
 def fill_char_in_spans(spans, all_chars):
 
+    # 简单从上到下排一下序
+    spans = sorted(spans, key=lambda x: x['bbox'][1])
+
     for char in all_chars:
         for span in spans:
             # 判断char是否属于LINE_STOP_FLAG
@@ -137,7 +140,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
 
 def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
 
-    text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
+    text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP | fitz.TEXT_CID_FOR_UNKNOWN_UNICODE)['blocks']
 
     all_pymu_chars = []
     for block in text_blocks_raw: