浏览代码

feat(latex): enhance LaTeX delimiter support and configurability

- Add support for \(\) and \[\] delimiters in addition to $$ and $
- Make LaTeX delimiter configuration more flexible and user-defined
- Update configuration file to include LaTeX delimiter settings
- Modify OCR content generation to use configurable delimiters
myhloli 6 月之前
父节点
当前提交
100e9c17a5
共有 4 个文件被更改,包括 44 次插入和 6 次删除
  1. 11 1
      magic-pdf.template.json
  2. 16 2
      magic_pdf/dict2md/ocr_mkcontent.py
  3. 9 0
      magic_pdf/libs/config_reader.py
  4. 8 3
      projects/gradio_app/app.py

+ 11 - 1
magic-pdf.template.json

@@ -20,6 +20,16 @@
         "enable": true,
         "max_time": 400
     },
+    "latex-delimiter-config": {
+        "display": {
+            "left": "$$",
+            "right": "$$"
+        },
+        "inline": {
+            "left": "$",
+            "right": "$"
+        }
+    },
     "llm-aided-config": {
         "formula_aided": {
             "api_key": "your_api_key",
@@ -40,5 +50,5 @@
             "enable": false
         }
     },
-    "config_version": "1.2.0"
+    "config_version": "1.2.1"
 }

+ 16 - 2
magic_pdf/dict2md/ocr_mkcontent.py

@@ -5,6 +5,7 @@ from loguru import logger
 from magic_pdf.config.make_content_config import DropMode, MakeMode
 from magic_pdf.config.ocr_content_type import BlockType, ContentType
 from magic_pdf.libs.commons import join_path
+from magic_pdf.libs.config_reader import get_latex_delimiter_config
 from magic_pdf.libs.language import detect_lang
 from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
 from magic_pdf.post_proc.para_split_v3 import ListLineTag
@@ -145,6 +146,19 @@ def full_to_half(text: str) -> str:
             result.append(char)
     return ''.join(result)
 
+latex_delimiters_config = get_latex_delimiter_config()
+
+default_delimiters = {
+    'display': {'left': '$$', 'right': '$$'},
+    'inline': {'left': '$', 'right': '$'}
+}
+
+delimiters = latex_delimiters_config if latex_delimiters_config else default_delimiters
+
+display_left_delimiter = delimiters['display']['left']
+display_right_delimiter = delimiters['display']['right']
+inline_left_delimiter = delimiters['inline']['left']
+inline_right_delimiter = delimiters['inline']['right']
 
 def merge_para_with_text(para_block):
     block_text = ''
@@ -168,9 +182,9 @@ def merge_para_with_text(para_block):
             if span_type == ContentType.Text:
                 content = ocr_escape_special_markdown_char(span['content'])
             elif span_type == ContentType.InlineEquation:
-                content = f"${span['content']}$"
+                content = f"{inline_left_delimiter}{span['content']}{inline_right_delimiter}"
             elif span_type == ContentType.InterlineEquation:
-                content = f"\n$$\n{span['content']}\n$$\n"
+                content = f"\n{display_left_delimiter}\n{span['content']}\n{display_right_delimiter}\n"
 
             content = content.strip()
 

+ 9 - 0
magic_pdf/libs/config_reader.py

@@ -125,6 +125,15 @@ def get_llm_aided_config():
     else:
         return llm_aided_config
 
+def get_latex_delimiter_config():
+    config = read_config()
+    latex_delimiter_config = config.get('latex-delimiter-config')
+    if latex_delimiter_config is None:
+        logger.warning(f"'latex-delimiter-config' not found in {CONFIG_FILE_NAME}, use 'None' as default")
+        return None
+    else:
+        return latex_delimiter_config
+
 
 if __name__ == '__main__':
     ak, sk, endpoint = get_s3_config('llm-raw')

+ 8 - 3
projects/gradio_app/app.py

@@ -117,8 +117,12 @@ def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table
     return md_content, txt_content, archive_zip_path, new_pdf_path
 
 
-latex_delimiters = [{'left': '$$', 'right': '$$', 'display': True},
-                    {'left': '$', 'right': '$', 'display': False}]
+latex_delimiters = [
+    {'left': '$$', 'right': '$$', 'display': True},
+    {'left': '$', 'right': '$', 'display': False},
+    {'left': '\\(', 'right': '\\)', 'display': False},
+    {'left': '\\[', 'right': '\\]', 'display': True},
+]
 
 
 def init_model():
@@ -218,7 +222,8 @@ if __name__ == '__main__':
                 with gr.Tabs():
                     with gr.Tab('Markdown rendering'):
                         md = gr.Markdown(label='Markdown rendering', height=1100, show_copy_button=True,
-                                         latex_delimiters=latex_delimiters, line_breaks=True)
+                                         latex_delimiters=latex_delimiters,
+                                         line_breaks=True)
                     with gr.Tab('Markdown text'):
                         md_text = gr.TextArea(lines=45, show_copy_button=True)
         file.change(fn=to_pdf, inputs=file, outputs=pdf_show)