瀏覽代碼

Merge pull request #2857 from myhloli/dev

fix: add content existence checks for inline and interline equations in pipeline_middle_json_mkcontent.py
Xiaomeng Zhao 4 月之前
父節點
當前提交
1012dcb9d4
共有 2 個文件被更改,包括 13 次插入和 4 次刪除
  1. 4 2
      mineru/backend/pipeline/pipeline_middle_json_mkcontent.py
  2. 9 2
      mineru/cli/gradio_app.py

+ 4 - 2
mineru/backend/pipeline/pipeline_middle_json_mkcontent.py

@@ -157,9 +157,11 @@ def merge_para_with_text(para_block):
             if span_type == ContentType.TEXT:
                 content = escape_special_markdown_char(span['content'])
             elif span_type == ContentType.INLINE_EQUATION:
-                content = f"{inline_left_delimiter}{span['content']}{inline_right_delimiter}"
+                if span.get('content', ''):
+                    content = f"{inline_left_delimiter}{span['content']}{inline_right_delimiter}"
             elif span_type == ContentType.INTERLINE_EQUATION:
-                content = f"\n{display_left_delimiter}\n{span['content']}\n{display_right_delimiter}\n"
+                if span.get('content', ''):
+                    content = f"\n{display_left_delimiter}\n{span['content']}\n{display_right_delimiter}\n"
 
             content = content.strip()
 

+ 9 - 2
mineru/cli/gradio_app.py

@@ -225,7 +225,14 @@ def update_interface(backend_choice):
     help="Enable gradio API for serving the application.",
     default=True,
 )
-def main(example_enable, sglang_engine_enable, mem_fraction_static, torch_compile_enable, api_enable):
+@click.option(
+    '--max-convert-pages',
+    'max_convert_pages',
+    type=int,
+    help="Set the maximum number of pages to convert from PDF to Markdown.",
+    default=1000,
+)
+def main(example_enable, sglang_engine_enable, mem_fraction_static, torch_compile_enable, api_enable, max_convert_pages):
     if sglang_engine_enable:
         try:
             print("Start init SgLang engine...")
@@ -257,7 +264,7 @@ def main(example_enable, sglang_engine_enable, mem_fraction_static, torch_compil
                 with gr.Row():
                     input_file = gr.File(label='Please upload a PDF or image', file_types=suffixes)
                 with gr.Row():
-                    max_pages = gr.Slider(1, 20, 10, step=1, label='Max convert pages')
+                    max_pages = gr.Slider(1, max_convert_pages, int(max_convert_pages/2), step=1, label='Max convert pages')
                 with gr.Row():
                     if sglang_engine_enable:
                         drop_list = ["pipeline", "vlm-sglang-engine"]