浏览代码

refactor: alias the union_make import as vlm_union_make and set the pipeline middle JSON "_backend" field to "pipeline" for consistency

myhloli 5 月之前
父节点
当前提交
51393aa814

+ 1 - 1
mineru/backend/pipeline/model_json_to_middle_json.py

@@ -117,7 +117,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
 
 
 def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=None, ocr=False):
-    middle_json = {"pdf_info": [], "_backend":"vlm", "_version_name": __version__}
+    middle_json = {"pdf_info": [], "_backend":"pipeline", "_version_name": __version__}
     for page_index, page_model_info in enumerate(model_list):
         page = pdf_doc[page_index]
         image_dict = images_list[page_index]

+ 0 - 2
mineru/backend/pipeline/pipeline_analyze.py

@@ -5,8 +5,6 @@ import torch
 
 from .model_init import MineruPipelineModel
 from .config_reader import get_local_models_dir, get_device, get_formula_config, get_table_recog_config
-from .model_json_to_middle_json import result_to_middle_json
-from ...data.data_reader_writer import DataWriter
 from ...utils.pdf_classify import classify
 from ...utils.pdf_image_tools import load_images_from_pdf
 

+ 20 - 19
mineru/cli/common.py

@@ -2,13 +2,14 @@
 import io
 import json
 import os
+import copy
 from pathlib import Path
 
 import pypdfium2 as pdfium
 from loguru import logger
 
 from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
-from mineru.api.vlm_middle_json_mkcontent import union_make
+from mineru.api.vlm_middle_json_mkcontent import union_make as vlm_union_make
 from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
 from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
 from mineru.data.data_reader_writer import FileBasedDataWriter
@@ -98,8 +99,8 @@ def do_parse(
         infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = pipeline_doc_analyze(pdf_bytes_list, p_lang_list, parse_method=parse_method, formula_enable=p_formula_enable,table_enable=p_table_enable)
 
         for idx, model_list in enumerate(infer_results):
+            model_json = copy.deepcopy(model_list)
             pdf_file_name = pdf_file_names[idx]
-            model_json = infer_results[idx]
             local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
             image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
 
@@ -124,21 +125,21 @@ def do_parse(
                     pdf_bytes,
                 )
 
-            if f_dump_md:
-                image_dir = str(os.path.basename(local_image_dir))
-                md_content_str = union_make(pdf_info, f_make_md_mode, image_dir)
-                md_writer.write_string(
-                    f"{pdf_file_name}.md",
-                    md_content_str,
-                )
-
-            if f_dump_content_list:
-                image_dir = str(os.path.basename(local_image_dir))
-                content_list = union_make(pdf_info, MakeMode.STANDARD_FORMAT, image_dir)
-                md_writer.write_string(
-                    f"{pdf_file_name}_content_list.json",
-                    json.dumps(content_list, ensure_ascii=False, indent=4),
-                )
+            # if f_dump_md:
+            #     image_dir = str(os.path.basename(local_image_dir))
+            #     md_content_str = union_make(pdf_info, f_make_md_mode, image_dir)
+            #     md_writer.write_string(
+            #         f"{pdf_file_name}.md",
+            #         md_content_str,
+            #     )
+
+            # if f_dump_content_list:
+            #     image_dir = str(os.path.basename(local_image_dir))
+            #     content_list = union_make(pdf_info, MakeMode.STANDARD_FORMAT, image_dir)
+            #     md_writer.write_string(
+            #         f"{pdf_file_name}_content_list.json",
+            #         json.dumps(content_list, ensure_ascii=False, indent=4),
+            #     )
 
             if f_dump_middle_json:
                 md_writer.write_string(
@@ -179,7 +180,7 @@ def do_parse(
 
             if f_dump_md:
                 image_dir = str(os.path.basename(local_image_dir))
-                md_content_str = union_make(pdf_info, f_make_md_mode, image_dir)
+                md_content_str = vlm_union_make(pdf_info, f_make_md_mode, image_dir)
                 md_writer.write_string(
                     f"{pdf_file_name}.md",
                     md_content_str,
@@ -187,7 +188,7 @@ def do_parse(
 
             if f_dump_content_list:
                 image_dir = str(os.path.basename(local_image_dir))
-                content_list = union_make(pdf_info, MakeMode.STANDARD_FORMAT, image_dir)
+                content_list = vlm_union_make(pdf_info, MakeMode.STANDARD_FORMAT, image_dir)
                 md_writer.write_string(
                     f"{pdf_file_name}_content_list.json",
                     json.dumps(content_list, ensure_ascii=False, indent=4),