Kaynağa Gözat

refactor: improve text processing by adding ligature and unicode replacement functions

myhloli 5 ay önce
ebeveyn
işleme
f211554137

+ 0 - 1
mineru/backend/pipeline/batch_analyze.py

@@ -132,7 +132,6 @@ class BatchAnalyze:
                 # 获取OCR模型
                 ocr_model = atom_model_manager.get_atom_model(
                     atom_model_name='ocr',
-                    ocr_show_log=False,
                     det_db_box_thresh=0.3,
                     lang=lang
                 )

+ 1 - 1
mineru/utils/span_block_fix.py

@@ -38,7 +38,7 @@ def fill_spans_in_blocks(blocks, spans, radio):
 
 
 def span_block_type_compatible(span_type, block_type):
-    if span_type in [ContentType.TEXT, ContentType.INTERLINE_EQUATION]:
+    if span_type in [ContentType.TEXT, ContentType.INLINE_EQUATION]:
         return block_type in [
             BlockType.TEXT,
             BlockType.TITLE,

+ 16 - 0
mineru/utils/span_pre_proc.py

@@ -1,4 +1,5 @@
 # Copyright (c) Opendatalab. All rights reserved.
+import re
 import cv2
 import numpy as np
 
@@ -100,6 +101,19 @@ def remove_overlaps_min_spans(spans):
     return spans, dropped_spans
 
 
+def __replace_ligatures(text: str):
+    """Expand Latin ligature code points into the letters they stand for.
+
+    Handles the seven single-code-point ligatures U+FB00..U+FB06 (ff, fi,
+    fl, ffi, ffl, long-s-t and st). Every other character is left as is.
+
+    Args:
+        text: The string to clean up.
+
+    Returns:
+        A copy of ``text`` in which each ligature code point has been
+        replaced by its plain-letter spelling.
+    """
+    # Keys are spelled as escapes on purpose: if this source is ever passed
+    # through Unicode normalisation, literal ligature characters collapse to
+    # plain ASCII and the table degenerates into identity mappings.
+    ligatures = {
+        '\ufb00': 'ff',
+        '\ufb01': 'fi',
+        '\ufb02': 'fl',
+        '\ufb03': 'ffi',
+        '\ufb04': 'ffl',
+        '\ufb05': 'st',  # long s + t; the long s is an "s", not an "f"
+        '\ufb06': 'st',
+    }
+    # Every key is exactly one code point, so a single translate() pass
+    # replaces them all without any ordering concerns between entries.
+    return text.translate(str.maketrans(ligatures))
+
+def __replace_unicode(text: str):
+    """Strip CR+LF pairs and turn U+0002 control characters into hyphens.
+
+    Args:
+        text: The string to clean up.
+
+    Returns:
+        A copy of ``text`` with every carriage-return/line-feed pair removed
+        and every U+0002 character replaced by ``-``. A lone CR or a lone LF
+        is kept unchanged.
+    """
+    # The two substitutions cannot create or destroy each other's matches,
+    # so running them back to back equals one combined left-to-right pass.
+    without_crlf = text.replace('\r\n', '')
+    return without_crlf.replace('\u0002', '-')
+
+
 def txt_spans_extract(pdf_page, spans, pil_img, scale):
 
     textpage = pdf_page.get_textpage()
@@ -117,6 +131,8 @@ def txt_spans_extract(pdf_page, spans, pil_img, scale):
         text = textpage.get_text_bounded(left=rect_box[0], top=rect_box[1],
                                          right=rect_box[2], bottom=rect_box[3])
         if text and len(text) > 0:
+            text = __replace_unicode(text)
+            text = __replace_ligatures(text)
             span['content'] = text.strip()
             span['score'] = 1.0
         else: