1 年之前 · 83e0d55a34
--- a/.gitignore
+++ b/.gitignore
@@ -30,10 +30,10 @@ tmp/
 
				 tmp

			
 
				 .vscode

			
 
				 .vscode/

			
 
				-/tests/

			
 
				 ocr_demo

			
 
				 

			
 
				 /app/common/__init__.py

			
 
				 /magic_pdf/config/__init__.py

			
 
				 source.dev.env

			
 
				 

			
 
				+tmp

			
--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -9,6 +9,20 @@ import wordninja
 
				 import re
			
 
				 
			
 
				 
			
 
				+def __is_hyphen_at_line_end(line):
			
 
				+    """
			
 
				+    Check if a line ends with one or more ASCII letters followed by a hyphen, ignoring trailing whitespace.
			
 
				+    
			
 
				+    Args:
			
 
				+    line (str): The line of text to check.
			
 
				+    
			
 
				+    Returns:
			
 
				+    bool: True if the line ends with one or more ASCII letters followed by a hyphen (trailing whitespace allowed), False otherwise.
			
 
				+    """
			
 
				+    # Use regex to check if the line ends with one or more letters followed by a hyphen
			
 
				+    return bool(re.search(r'[A-Za-z]+-\s*$', line))
			
 
				+
			
 
				+
			
 
				 def split_long_words(text):
			
 
				     segments = text.split(' ')
			
 
				     for i in range(len(segments)):
			
@@ -184,10 +198,17 @@ def merge_para_with_text(para_block):
 
				                 content = f" ${span['content']}$ "
			
 
				             elif span_type == ContentType.InterlineEquation:
			
 
				                 content = f"\n$$\n{span['content']}\n$$\n"
			
 
				+
			
 
				             if content != '':
			
 
				                 langs = ['zh', 'ja', 'ko']
			
 
				                 if line_lang in langs:  # 遇到一些一个字一个span的文档，这种单字语言判断不准，需要用整行文本判断
			
 
				                     para_text += content  # 中文/日语/韩文语境下，content间不需要空格分隔
			
 
				+                elif line_lang == 'en':
			
 
				+                    # 如果是前一行带有-连字符，那么末尾不应该加空格
			
 
				+                    if __is_hyphen_at_line_end(para_text):
			
 
				+                        para_text += content
			
 
				+                    else:
			
 
				+                        para_text += content + ' '
			
 
				                 else:
			
 
				                     para_text += content + ' '  # 西方文本语境下 content间需要空格分隔
			
 
				     return para_text
			
--- a/magic_pdf/pdf_parse_union_core.py
+++ b/magic_pdf/pdf_parse_union_core.py
@@ -41,6 +41,23 @@ def remove_horizontal_overlap_block_which_smaller(all_bboxes):
 
				     return is_useful_block_horz_overlap, all_bboxes
			
 
				 
			
 
				 
			
 
				+def __replace_STX_ETX(text_str:str):
			
 
				+    """Replace U+0002 (STX) and U+0003 (ETX) with apostrophes; these characters were originally quotation marks that become garbled when extracted using pymupdf.
			
 
				+    Note: this issue has only been observed in English text; it has not been found in Chinese text so far.
			
 
				+
			
 
				+    Args:
			
 
				+        text_str (str): raw text
			
 
				+
			
 
				+    Returns:
			
 
				+        str: the text with both control characters replaced; empty or None input is returned unchanged
			
 
				+    """
			
 
				+    if text_str:
			
 
				+        s = text_str.replace('\u0002', "'")
			
 
				+        s = s.replace("\u0003", "'")
			
 
				+        return s
			
 
				+    return text_str
			
 
				+
			
 
				+
			
 
				 def txt_spans_extract(pdf_page, inline_equations, interline_equations):
			
 
				     text_raw_blocks = pdf_page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
			
 
				     char_level_text_blocks = pdf_page.get_text("rawdict", flags=fitz.TEXTFLAGS_TEXT)[
			
@@ -63,7 +80,7 @@ def txt_spans_extract(pdf_page, inline_equations, interline_equations):
 
				                     spans.append(
			
 
				                         {
			
 
				                             "bbox": list(span["bbox"]),
			
 
				-                            "content": span["text"],
			
 
				+                            "content": __replace_STX_ETX(span["text"]),
			
 
				                             "type": ContentType.Text,
			
 
				                             "score": 1.0,
			
 
				                         }
			
--- a/tests/test_para/test_hyphen_at_line_end.py
+++ b/tests/test_para/test_hyphen_at_line_end.py
@@ -0,0 +1,28 @@
 
				+
			
 
				+from magic_pdf.dict2md.ocr_mkcontent import __is_hyphen_at_line_end
			
 
				+
			
 
				+
			
 
				+def test_hyphen_at_line_end():
			
 
				+    """
			
 
				+    测试行尾是不是一个连字符
			
 
				+    """
			
 
				+    test_cases_ok = [
			
 
				+        "I am zhang-",
			
 
				+        "you are zhang- ",
			
 
				+        "math-",
			
 
				+        "This is a TEST-",
			
 
				+        "This is a TESTing-",
			
 
				+        "美国人 hello-",
			
 
				+    ]
			
 
				+    test_cases_bad = [
			
 
				+        "This is a TEST$-",
			
 
				+        "This is a TEST21-",
			
 
				+        "中国人-",
			
 
				+        "美国人 hello人-",
			
 
				+        "this is 123-",
			
 
				+    ]
			
 
				+    for test_case in test_cases_ok:
			
 
				+        assert __is_hyphen_at_line_end(test_case)
			
 
				+
			
 
				+    for test_case in test_cases_bad:
			
 
				+        assert not __is_hyphen_at_line_end(test_case)