Parcourir la source

fix: use deepcopy to keep the original model json

赵小蒙 il y a 1 an
Parent
commit
d5e30f8da5
1 fichier modifié avec 7 ajouts et 5 suppressions
  1. 7 5
      magic_pdf/cli/magicpdf.py

+ 7 - 5
magic_pdf/cli/magicpdf.py

@@ -17,8 +17,8 @@
     
 
 效果:
-python magicpdf.py --json  s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350 
-python magicpdf.py --pdf  /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json  或者 python magicpdf.py --pdf  /home/llm/Downloads/xxxx.pdf
+python magicpdf.py json-command --json  s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
+python magicpdf.py pdf-command --pdf  /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json  或者 python magicpdf.py --pdf  /home/llm/Downloads/xxxx.pdf
 """
 
 import os
@@ -45,6 +45,7 @@ from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
 from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 import csv
+import copy
 
 parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
 
@@ -81,6 +82,7 @@ def do_parse(
     f_dump_orig_pdf=True,
     f_dump_content_list=True,
 ):
+    orig_model_list = copy.deepcopy(model_list)
 
     local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method)
     image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(local_md_dir)
@@ -130,7 +132,7 @@ def do_parse(
     if f_dump_model_json:
         """写model_json"""
         md_writer.write(
-            content=json_parse.dumps(pipe.model_list, ensure_ascii=False, indent=4),
+            content=json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4),
             path=f"{pdf_file_name}_model.json",
             mode=AbsReaderWriter.MODE_TXT,
         )
@@ -143,7 +145,7 @@ def do_parse(
             mode=AbsReaderWriter.MODE_BIN,
         )
 
-    content_list = pipe.pipe_mk_uni_format(str(image_dir), drop_mode=DropMode.NONE)
+    content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
     if f_dump_content_list:
         """写content_list"""
         md_writer.write(
@@ -278,7 +280,7 @@ def pdf_command(pdf, model, method):
             model_path = pdf.replace(".pdf", ".json")
             if not os.path.exists(model_path):
                 logger.warning(
-                    f"not found json {model_path} existed, use paddle analyze"
+                    f"not found json {model_path} existed"
                 )
                 # 本地无模型数据则调用内置paddle分析,先传空list,在内部识别到空list再调用paddle
                 model_json = "[]"