瀏覽代碼

Merge pull request #2404 from opendatalab/release-1.3.10

Release 1.3.10
Xiaomeng Zhao 6 月之前
父節點
當前提交
8802687934

+ 3 - 0
README.md

@@ -48,6 +48,9 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
 </div>
 
 # Changelog
+- 2025/04/29 1.3.10 Released
+  - Added support for custom formula delimiters, configurable via the `latex-delimiter-config` item in the `magic-pdf.json` file in the user directory.
+  - Pinned `pdfminer.six` to version `20250324` to prevent parsing failures caused by newer versions.
 - 2025/04/27 1.3.9 Released  
   - Optimized the formula parsing function to improve the success rate of formula rendering  
   - Updated `pdfminer.six` to the latest version, fixing some abnormal PDF parsing issues

+ 3 - 0
README_zh-CN.md

@@ -47,6 +47,9 @@
 </div>
 
 # 更新记录
+- 2025/04/29 1.3.10 发布
+  - 支持使用自定义公式标识符,可通过修改用户目录下的`magic-pdf.json`文件中的`latex-delimiter-config`项实现。
+  - 锁定`pdfminer.six`至`20250324`版本,以避免新版本导致的解析失败问题。
 - 2025/04/27 1.3.9 发布
   - 优化公式解析功能,提升公式渲染的成功率
   - 更新`pdfminer.six`到最新版本,修复了部分pdf解析异常问题

+ 11 - 1
magic-pdf.template.json

@@ -20,6 +20,16 @@
         "enable": true,
         "max_time": 400
     },
+    "latex-delimiter-config": {
+        "display": {
+            "left": "$$",
+            "right": "$$"
+        },
+        "inline": {
+            "left": "$",
+            "right": "$"
+        }
+    },
     "llm-aided-config": {
         "formula_aided": {
             "api_key": "your_api_key",
@@ -40,5 +50,5 @@
             "enable": false
         }
     },
-    "config_version": "1.2.0"
+    "config_version": "1.2.1"
 }

+ 16 - 2
magic_pdf/dict2md/ocr_mkcontent.py

@@ -5,6 +5,7 @@ from loguru import logger
 from magic_pdf.config.make_content_config import DropMode, MakeMode
 from magic_pdf.config.ocr_content_type import BlockType, ContentType
 from magic_pdf.libs.commons import join_path
+from magic_pdf.libs.config_reader import get_latex_delimiter_config
 from magic_pdf.libs.language import detect_lang
 from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
 from magic_pdf.post_proc.para_split_v3 import ListLineTag
@@ -145,6 +146,19 @@ def full_to_half(text: str) -> str:
             result.append(char)
     return ''.join(result)
 
+latex_delimiters_config = get_latex_delimiter_config()
+
+default_delimiters = {
+    'display': {'left': '$$', 'right': '$$'},
+    'inline': {'left': '$', 'right': '$'}
+}
+
+delimiters = latex_delimiters_config if latex_delimiters_config else default_delimiters
+
+display_left_delimiter = delimiters['display']['left']
+display_right_delimiter = delimiters['display']['right']
+inline_left_delimiter = delimiters['inline']['left']
+inline_right_delimiter = delimiters['inline']['right']
 
 def merge_para_with_text(para_block):
     block_text = ''
@@ -168,9 +182,9 @@ def merge_para_with_text(para_block):
             if span_type == ContentType.Text:
                 content = ocr_escape_special_markdown_char(span['content'])
             elif span_type == ContentType.InlineEquation:
-                content = f"${span['content']}$"
+                content = f"{inline_left_delimiter}{span['content']}{inline_right_delimiter}"
             elif span_type == ContentType.InterlineEquation:
-                content = f"\n$$\n{span['content']}\n$$\n"
+                content = f"\n{display_left_delimiter}\n{span['content']}\n{display_right_delimiter}\n"
 
             content = content.strip()
 

+ 9 - 0
magic_pdf/libs/config_reader.py

@@ -125,6 +125,15 @@ def get_llm_aided_config():
     else:
         return llm_aided_config
 
+def get_latex_delimiter_config():
+    config = read_config()
+    latex_delimiter_config = config.get('latex-delimiter-config')
+    if latex_delimiter_config is None:
+        logger.warning(f"'latex-delimiter-config' not found in {CONFIG_FILE_NAME}, use 'None' as default")
+        return None
+    else:
+        return latex_delimiter_config
+
 
 if __name__ == '__main__':
     ak, sk, endpoint = get_s3_config('llm-raw')

+ 4 - 1
magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py

@@ -342,7 +342,10 @@ REPLACEMENTS_PATTERNS = {
     re.compile(r'\\Tilde'): r'\\tilde',
     re.compile(r'\\slash'): r'/',
     re.compile(r'\\textperthousand'): r'‰',
-    re.compile(r'\\sun'): r'☉'
+    re.compile(r'\\sun'): r'☉',
+    re.compile(r'\\textunderscore'): r'\\_',
+    re.compile(r'\\fint'): r'⨏',
+    re.compile(r'\\up '): r'\\ ',
 }
 QQUAD_PATTERN = re.compile(r'\\qquad(?!\s)')
 

+ 2 - 2
magic_pdf/model/sub_modules/model_utils.py

@@ -172,8 +172,8 @@ def filter_nested_tables(table_res_list, overlap_threshold=0.8, area_threshold=0
         tables_inside = [j for j in range(len(table_res_list))
                          if i != j and is_inside(table_info[j], table_info[i], overlap_threshold)]
 
-        # Continue if there are at least 2 tables inside
-        if len(tables_inside) >= 2:
+        # Continue if there are at least 3 tables inside
+        if len(tables_inside) >= 3:
             # Check if inside tables overlap with each other
             tables_overlap = any(do_overlap(table_info[tables_inside[idx1]], table_info[tables_inside[idx2]])
                                  for idx1 in range(len(tables_inside))

+ 8 - 3
projects/gradio_app/app.py

@@ -117,8 +117,12 @@ def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table
     return md_content, txt_content, archive_zip_path, new_pdf_path
 
 
-latex_delimiters = [{'left': '$$', 'right': '$$', 'display': True},
-                    {'left': '$', 'right': '$', 'display': False}]
+latex_delimiters = [
+    {'left': '$$', 'right': '$$', 'display': True},
+    {'left': '$', 'right': '$', 'display': False},
+    {'left': '\\(', 'right': '\\)', 'display': False},
+    {'left': '\\[', 'right': '\\]', 'display': True},
+]
 
 
 def init_model():
@@ -218,7 +222,8 @@ if __name__ == '__main__':
                 with gr.Tabs():
                     with gr.Tab('Markdown rendering'):
                         md = gr.Markdown(label='Markdown rendering', height=1100, show_copy_button=True,
-                                         latex_delimiters=latex_delimiters, line_breaks=True)
+                                         latex_delimiters=latex_delimiters,
+                                         line_breaks=True)
                     with gr.Tab('Markdown text'):
                         md_text = gr.TextArea(lines=45, show_copy_button=True)
         file.change(fn=to_pdf, inputs=file, outputs=pdf_show)

+ 1 - 1
requirements.txt

@@ -10,6 +10,6 @@ scikit-learn>=1.0.2
 torch>=2.2.2,!=2.5.0,!=2.5.1
 torchvision
 transformers>=4.49.0,!=4.51.0,<5.0.0
-pdfminer.six>=20250416
+pdfminer.six==20250324
 tqdm>=4.67.1
 # The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.