Browse Source

feat: update magic cli

blue 1 year ago
parent
commit
cc568d5efc
1 changed file with 83 additions and 51 deletions
  1. 83 51
      magic_pdf/cli/magicpdf.py

+ 83 - 51
magic_pdf/cli/magicpdf.py

@@ -50,9 +50,7 @@ parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
 
 
 def prepare_env(pdf_file_name, method):
-    local_parent_dir = os.path.join(
-        get_local_dir(), "magic-pdf", pdf_file_name, method
-    )
+    local_parent_dir = os.path.join(get_local_dir(), "magic-pdf", pdf_file_name, method)
 
     local_image_dir = os.path.join(str(local_parent_dir), "images")
     local_md_dir = local_parent_dir
@@ -62,7 +60,7 @@ def prepare_env(pdf_file_name, method):
 
 
 def write_to_csv(csv_file_path, csv_data):
-    with open(csv_file_path, mode='a', newline='', encoding='utf-8') as csvfile:
+    with open(csv_file_path, mode="a", newline="", encoding="utf-8") as csvfile:
         # 创建csv writer对象
         csv_writer = csv.writer(csvfile)
         # 写入数据
@@ -70,12 +68,28 @@ def write_to_csv(csv_file_path, csv_data):
     print(f"数据已成功追加到 '{csv_file_path}'")
 
 
-def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir, local_md_dir):
+def do_parse(
+    pdf_file_name,
+    pdf_bytes,
+    model_list,
+    parse_method,
+    f_draw_span_bbox=True,
+    f_draw_layout_bbox=True,
+    f_dump_md=True,
+    f_dump_middle_json=True,
+    f_dump_model_json=True,
+    f_dump_orig_pdf=True,
+    f_dump_content_list=True,
+):
+
+    local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method)
+    image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(
+        local_md_dir
+    )
+    image_dir = (os.path.basename(local_image_dir),)
+
     if parse_method == "auto":
-        jso_useful_key = {
-            "_pdf_type": "",
-            "model_list": model_list
-        }
+        jso_useful_key = {"_pdf_type": "", "model_list": model_list}
         pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
     elif parse_method == "txt":
         pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
@@ -87,48 +101,62 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
 
     pipe.pipe_classify()
 
-    '''如果没有传入有效的模型数据,则使用内置paddle解析'''
+    """如果没有传入有效的模型数据,则使用内置paddle解析"""
     if len(model_list) == 0:
         pipe.pipe_analyze()
 
     pipe.pipe_parse()
-    pdf_info = pipe.pdf_mid_data['pdf_info']
-    draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
-    draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
+    pdf_info = pipe.pdf_mid_data["pdf_info"]
+    if f_draw_layout_bbox:
+        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
+    if f_draw_span_bbox:
+        draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
 
     # write_to_csv(r"D:\project\20231108code-clean\linshixuqiu\pdf_dev\新模型\新建文件夹\luanma.csv",
     #              [pdf_file_name, pipe.pdf_mid_data['not_common_character_rate'], pipe.pdf_mid_data['not_printable_rate']])
 
     md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE)
-    '''写markdown'''
-    md_writer.write(
-        content=md_content, path=f"{pdf_file_name}.md", mode=AbsReaderWriter.MODE_TXT
-    )
-    '''写middle_json'''
-    md_writer.write(
-        content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
-        path=f"{pdf_file_name}_middle.json",
-        mode=AbsReaderWriter.MODE_TXT,
-    )
-    '''写model_json'''
-    md_writer.write(
-        content=json_parse.dumps(pipe.model_list, ensure_ascii=False, indent=4),
-        path=f"{pdf_file_name}_model.json",
-        mode=AbsReaderWriter.MODE_TXT,
-    )
-    '''写源pdf'''
-    md_writer.write(
-        content=pdf_bytes,
-        path=f"{pdf_file_name}_origin.pdf",
-        mode=AbsReaderWriter.MODE_BIN,
-    )
+
+    if f_dump_md:
+        """写markdown"""
+        md_writer.write(
+            content=md_content,
+            path=f"{pdf_file_name}.md",
+            mode=AbsReaderWriter.MODE_TXT,
+        )
+
+    if f_dump_middle_json:
+        """写middle_json"""
+        md_writer.write(
+            content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
+            path=f"{pdf_file_name}_middle.json",
+            mode=AbsReaderWriter.MODE_TXT,
+        )
+
+    if f_dump_model_json:
+        """写model_json"""
+        md_writer.write(
+            content=json_parse.dumps(pipe.model_list, ensure_ascii=False, indent=4),
+            path=f"{pdf_file_name}_model.json",
+            mode=AbsReaderWriter.MODE_TXT,
+        )
+
+    if f_dump_orig_pdf:
+        """写源pdf"""
+        md_writer.write(
+            content=pdf_bytes,
+            path=f"{pdf_file_name}_origin.pdf",
+            mode=AbsReaderWriter.MODE_BIN,
+        )
     content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
-    '''写content_list'''
-    md_writer.write(
-        content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
-        path=f"{pdf_file_name}_content_list.json",
-        mode=AbsReaderWriter.MODE_TXT
-    )
+
+    if f_dump_content_list:
+        """写content_list"""
+        md_writer.write(
+            content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
+            path=f"{pdf_file_name}_content_list.json",
+            mode=AbsReaderWriter.MODE_TXT,
+        )
 
 
 @click.group()
@@ -177,13 +205,14 @@ def json_command(json, method):
         s3_file_path = jso.get("path")
     pdf_file_name = Path(s3_file_path).stem
     pdf_data = read_s3_path(s3_file_path)
+
     local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
 
     local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
         local_md_dir
     )
 
-    _do_parse(
+    do_parse(
         pdf_file_name,
         pdf_data,
         jso["doc_layout_result"],
@@ -191,7 +220,7 @@ def json_command(json, method):
         local_image_rw,
         local_md_rw,
         os.path.basename(local_image_dir),
-        local_md_dir
+        local_md_dir,
     )
 
 
@@ -235,11 +264,11 @@ def local_json_command(local_json, method):
             pdf_data = read_s3_path(s3_file_path)
             local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
 
-            local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
-                local_md_dir
-            )
+            local_image_rw, local_md_rw = DiskReaderWriter(
+                local_image_dir
+            ), DiskReaderWriter(local_md_dir)
 
-            _do_parse(
+            do_parse(
                 pdf_file_name,
                 pdf_data,
                 jso["doc_layout_result"],
@@ -247,7 +276,7 @@ def local_json_command(local_json, method):
                 local_image_rw,
                 local_md_rw,
                 os.path.basename(local_image_dir),
-                local_md_dir
+                local_md_dir,
             )
 
 
@@ -274,7 +303,9 @@ def pdf_command(pdf, model, method):
         if model_path is None:
             model_path = pdf.replace(".pdf", ".json")
             if not os.path.exists(model_path):
-                logger.warning(f"not found json {model_path} existed, use paddle analyze")
+                logger.warning(
+                    f"not found json {model_path} existed, use paddle analyze"
+                )
                 # 本地无模型数据则调用内置paddle分析,先传空list,在内部识别到空list再调用paddle
                 model_json = "[]"
             else:
@@ -286,11 +317,12 @@ def pdf_command(pdf, model, method):
 
     jso = json_parse.loads(get_model_json(model))
     pdf_file_name = Path(pdf).stem
+
     local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
     local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
         local_md_dir
     )
-    _do_parse(
+    do_parse(
         pdf_file_name,
         pdf_data,
         jso,
@@ -298,7 +330,7 @@ def pdf_command(pdf, model, method):
         local_image_rw,
         local_md_rw,
         os.path.basename(local_image_dir),
-        local_md_dir
+        local_md_dir,
     )