Browse Source

update cli

赵小蒙 1 năm trước cách đây
mục cha
commit
88f2245d86
1 tập tin đã thay đổi với 57 bổ sung52 xóa
  1. 57 52
      magic_pdf/cli/magicpdf.py

+ 57 - 52
magic_pdf/cli/magicpdf.py

@@ -27,6 +27,7 @@ import sys
 import click
 from loguru import logger
 from pathlib import Path
+from magic_pdf.libs.version import __version__
 
 from magic_pdf.libs.MakeContentConfig import DropMode
 from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
@@ -52,7 +53,7 @@ def prepare_env(pdf_file_name, method):
         get_local_dir(), "magic-pdf", pdf_file_name, method
     )
 
-    local_image_dir = os.path.join(local_parent_dir, "images")
+    local_image_dir = os.path.join(str(local_parent_dir), "images")
     local_md_dir = local_parent_dir
     os.makedirs(local_image_dir, exist_ok=True)
     os.makedirs(local_md_dir, exist_ok=True)
@@ -102,6 +103,7 @@ def cli():
 
 
 @cli.command()
+@click.version_option(__version__, "--version", "-v", help="显示版本信息")
 @click.option("--json", type=str, help="输入一个S3路径")
 @click.option(
     "--method",
@@ -158,63 +160,65 @@ def json_command(json, method):
     )
 
 
-    @cli.command()
-    @click.option("--local_json", type=str, help="输入一个本地jsonl路径")
-    @click.option(
-        "--method",
-        type=parse_pdf_methods,
-        help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
-        default="auto",
-    )
-    def local_json_command(local_json, method):
-        def read_s3_path(s3path):
-            bucket, key = parse_s3path(s3path)
+@cli.command()
+@click.version_option(__version__, "--version", "-v", help="显示版本信息")
+@click.option("--local_json", type=str, help="输入一个本地jsonl路径")
+@click.option(
+    "--method",
+    type=parse_pdf_methods,
+    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
+    default="auto",
+)
+def local_json_command(local_json, method):
+    def read_s3_path(s3path):
+        bucket, key = parse_s3path(s3path)
 
-            s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
-            s3_rw = S3ReaderWriter(
-                s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
-            )
-            may_range_params = parse_s3_range_params(s3path)
-            if may_range_params is None or 2 != len(may_range_params):
-                byte_start, byte_end = 0, None
-            else:
-                byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
-                byte_end += byte_start - 1
-            return s3_rw.read_jsonl(
-                remove_non_official_s3_args(s3path),
-                byte_start,
-                byte_end,
-                AbsReaderWriter.MODE_BIN,
+        s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
+        s3_rw = S3ReaderWriter(
+            s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
+        )
+        may_range_params = parse_s3_range_params(s3path)
+        if may_range_params is None or 2 != len(may_range_params):
+            byte_start, byte_end = 0, None
+        else:
+            byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
+            byte_end += byte_start - 1
+        return s3_rw.read_jsonl(
+            remove_non_official_s3_args(s3path),
+            byte_start,
+            byte_end,
+            AbsReaderWriter.MODE_BIN,
+        )
+
+    with open(local_json, "r", encoding="utf-8") as f:
+        for json_line in f:
+            jso = json_parse.loads(json_line)
+
+            s3_file_path = jso.get("file_location")
+            if s3_file_path is None:
+                s3_file_path = jso.get("path")
+            pdf_file_name = Path(s3_file_path).stem
+            pdf_data = read_s3_path(s3_file_path)
+            local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
+
+            local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
+                local_md_dir
             )
 
-        with open(local_json, "r", encoding="utf-8") as f:
-            for json_line in f:
-                jso = json_parse.loads(json_line)
-
-                s3_file_path = jso.get("file_location")
-                if s3_file_path is None:
-                    s3_file_path = jso.get("path")
-                pdf_file_name = Path(s3_file_path).stem
-                pdf_data = read_s3_path(s3_file_path)
-                local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
-
-                local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
-                    local_md_dir
-                )
-
-                _do_parse(
-                    pdf_file_name,
-                    pdf_data,
-                    jso["doc_layout_result"],
-                    method,
-                    local_image_rw,
-                    local_md_rw,
-                    os.path.basename(local_image_dir),
-                    local_md_dir
-                )
+            _do_parse(
+                pdf_file_name,
+                pdf_data,
+                jso["doc_layout_result"],
+                method,
+                local_image_rw,
+                local_md_rw,
+                os.path.basename(local_image_dir),
+                local_md_dir
+            )
 
 
 @cli.command()
+@click.version_option(__version__, "--version", "-v", help="显示版本信息")
 @click.option(
     "--pdf", type=click.Path(exists=True), required=True, help="PDF文件的路径"
 )
@@ -256,6 +260,7 @@ def pdf_command(pdf, model, method):
     )
 
 
+
 if __name__ == "__main__":
     """
     python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551