Переглянути джерело

fix: replace \u0002, \u0003 in common text (#521)

* fix replace \u0002, \u0003 in common text

* fix(para): When an English line ends with a hyphen, do not add a space at the end.
drunkpig 1 рік тому
батько
коміт
83e0d55a34

+ 1 - 1
.gitignore

@@ -30,10 +30,10 @@ tmp/
 tmp
 .vscode
 .vscode/
-/tests/
 ocr_demo
 
 /app/common/__init__.py
 /magic_pdf/config/__init__.py
 source.dev.env
 
+tmp

+ 21 - 0
magic_pdf/dict2md/ocr_mkcontent.py

@@ -9,6 +9,20 @@ import wordninja
 import re
 
 
+def __is_hyphen_at_line_end(line):
+    """
+    Check whether a line ends with ASCII letters followed by a hyphen.
+
+    Args:
+        line (str): The line of text to check.
+
+    Returns:
+        bool: True if the text ends with one or more ASCII letters (A-Z, a-z), then a hyphen, then optional trailing whitespace; False otherwise.
+    """
+    # Only [A-Za-z] counts as a letter here, so a digit, symbol or CJK character right before the hyphen does not match.
+    return bool(re.search(r'[A-Za-z]+-\s*$', line))
+
+
 def split_long_words(text):
     segments = text.split(' ')
     for i in range(len(segments)):
@@ -184,10 +198,17 @@ def merge_para_with_text(para_block):
                 content = f" ${span['content']}$ "
             elif span_type == ContentType.InterlineEquation:
                 content = f"\n$$\n{span['content']}\n$$\n"
+
             if content != '':
                 langs = ['zh', 'ja', 'ko']
                 if line_lang in langs:  # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断
                     para_text += content  # 中文/日语/韩文语境下,content间不需要空格分隔
+                elif line_lang == 'en':
+                    # 如果是前一行带有-连字符,那么末尾不应该加空格
+                    if __is_hyphen_at_line_end(para_text):
+                        para_text += content
+                    else:
+                        para_text += content + ' '
                 else:
                     para_text += content + ' '  # 西方文本语境下 content间需要空格分隔
     return para_text

+ 18 - 1
magic_pdf/pdf_parse_union_core.py

@@ -41,6 +41,23 @@ def remove_horizontal_overlap_block_which_smaller(all_bboxes):
     return is_useful_block_horz_overlap, all_bboxes
 
 
+def __replace_STX_ETX(text_str:str):
+    """Replace STX (U+0002) and ETX (U+0003) with an apostrophe ('); pymupdf extracts them as garbage, but they were originally quotation marks.
+    Limitation: this issue has only been observed in English text; it has not been found in Chinese text so far.
+
+    Args:
+        text_str (str): raw text; falsy values (empty string, None) are returned unchanged.
+
+    Returns:
+        str: the text with every U+0002 and U+0003 replaced by an apostrophe.
+    """
+    if text_str:
+        s = text_str.replace('\u0002', "'")
+        s = s.replace("\u0003", "'")
+        return s
+    return text_str
+
+
 def txt_spans_extract(pdf_page, inline_equations, interline_equations):
     text_raw_blocks = pdf_page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
     char_level_text_blocks = pdf_page.get_text("rawdict", flags=fitz.TEXTFLAGS_TEXT)[
@@ -63,7 +80,7 @@ def txt_spans_extract(pdf_page, inline_equations, interline_equations):
                     spans.append(
                         {
                             "bbox": list(span["bbox"]),
-                            "content": span["text"],
+                            "content": __replace_STX_ETX(span["text"]),
                             "type": ContentType.Text,
                             "score": 1.0,
                         }

+ 28 - 0
tests/test_para/test_hyphen_at_line_end.py

@@ -0,0 +1,28 @@
+
+from magic_pdf.dict2md.ocr_mkcontent import __is_hyphen_at_line_end
+
+
+def test_hyphen_at_line_end():
+    """
+    Check that __is_hyphen_at_line_end only accepts text ending in ASCII letters plus a hyphen.
+    """
+    test_cases_ok = [  # ASCII letters directly before the final hyphen (a trailing space is allowed)
+        "I am zhang-",
+        "you are zhang- ",
+        "math-",
+        "This is a TEST-",
+        "This is a TESTing-",
+        "美国人 hello-",
+    ]
+    test_cases_bad = [  # a symbol, digit or CJK character directly before the final hyphen
+        "This is a TEST$-",
+        "This is a TEST21-",
+        "中国人-",
+        "美国人 hello人-",
+        "this is 123-",
+    ]
+    for test_case in test_cases_ok:
+        assert __is_hyphen_at_line_end(test_case)
+
+    for test_case in test_cases_bad:
+        assert not __is_hyphen_at_line_end(test_case)