瀏覽代碼

make markdown时特殊符号转义

赵小蒙 1 年之前
父節點
當前提交
59b0b0c3da
共有 2 個文件被更改,包括 14 次插入和 2 次刪除
  1. 3 2
      magic_pdf/dict2md/ocr_mkcontent.py
  2. 11 0
      magic_pdf/libs/markdown_utils.py

+ 3 - 2
magic_pdf/dict2md/ocr_mkcontent.py

@@ -1,3 +1,4 @@
+from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
 from magic_pdf.libs.ocr_content_type import ContentType
 
 
@@ -14,7 +15,7 @@ def ocr_mk_nlp_markdown(pdf_info_dict: dict):
                 for span in line['spans']:
                     if not span.get('content'):
                         continue
-                    content = span['content'].replace('$', '\$')  # 转义$
+                    content = ocr_escape_special_markdown_char(span['content'])  # 转义特殊符号
                     if span['type'] == ContentType.InlineEquation:
                         content = f"${content}$"
                     elif span['type'] == ContentType.InterlineEquation:
@@ -43,7 +44,7 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
                         else:
                             content = f"![]({span['image_path']})"
                     else:
-                        content = span['content'].replace('$', '\$')  # 转义$
+                        content = ocr_escape_special_markdown_char(span['content'])  # 转义特殊符号
                         if span['type'] == ContentType.InlineEquation:
                             content = f"${content}$"
                         elif span['type'] == ContentType.InterlineEquation:

+ 11 - 0
magic_pdf/libs/markdown_utils.py

@@ -18,3 +18,14 @@ def escape_special_markdown_char(pymu_blocks):
                         span['text'] = span['text'].replace(char, "\\" + char)
 
     return pymu_blocks
+
+
+def ocr_escape_special_markdown_char(content):
+    """
+    Escape characters in body text that have special meaning in Markdown syntax.
+    """
+    special_chars = ["*", "`", "~", "$"]  # the only characters this helper escapes
+    for char in special_chars:
+        content = content.replace(char, "\\" + char)  # prefix every occurrence with one backslash
+
+    return content