许瑞 1 год назад
Родитель
Сommit
55cba1f4ed

+ 78 - 9
magic_pdf/cli/magicpdf.py

@@ -21,28 +21,97 @@ python magicpdf.py --json  s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
 python magicpdf.py --pdf  /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json  或者 python magicpdf.py --pdf  /home/llm/Downloads/xxxx.pdf
 """
 
+import click
+from magic_pdf.libs.config_reader import get_s3_config
+from magic_pdf.libs.path_utils import (
+    parse_s3path,
+    parse_s3_range_params,
+    remove_non_official_s3_args,
+)
+from magic_pdf.libs.config_reader import get_local_dir
+from magic_pdf.io.S3ReaderWriter import S3ReaderWriter, MODE_BIN
+from magic_pdf.io.DiskReaderWriter import DiskReaderWriter
+from magic_pdf.spark.spark_api import parse_union_pdf
+import os
+import json as json_parse
+from datetime import datetime
 
 
+def prepare_env():
+    local_parent_dir = os.path.join(
+        get_local_dir(), "magic-pdf", datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+    )
+
+    local_image_dir = os.path.join(local_parent_dir, "images")
+    local_md_dir = os.path.join(local_parent_dir, "md")
+    os.makedirs(local_image_dir, exist_ok=True)
+    os.makedirs(local_md_dir, exist_ok=True)
+    return local_image_dir, local_md_dir
 
-import click
 
 @click.group()
 def cli():
+    # Root click command group; it does no work itself, the subcommands
+    # registered with @cli.command() carry the logic. Kept as a comment
+    # because a docstring would become the group's --help text.
     pass
 
+
 @cli.command()
-@click.option('--json', type=str, help='输入一个S3路径')
+@click.option("--json", type=str, help="输入一个S3路径")
 def json_command(json):
-    # 这里处理json相关的逻辑
-    print(f'处理JSON: {json}')
+    if not json.startswith("s3://"):
+        print("usage: python magipdf.py --json s3://some_bucket/some_path")
+        os.exit(1)
+
+    def read_s3_path(s3path):
+        bucket, key = parse_s3path(s3path)
+
+        s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
+        s3_rw = S3ReaderWriter(
+            s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
+        )
+        may_range_params = parse_s3_range_params(json)
+        if may_range_params is None or 2 != len(may_range_params):
+            byte_start, byte_end = 0, None
+        else:
+            byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
+        return s3_rw.read_jsonl(
+            remove_non_official_s3_args(s3path), byte_start, byte_end, MODE_BIN
+        )
+
+    jso = json_parse.loads(read_s3_path(json).decode("utf-8"))
+    pdf_data = read_s3_path(jso["file_location"])
+    local_image_dir, _ = prepare_env()
+
+    local_image_rw = DiskReaderWriter(local_image_dir)
+    parse_union_pdf(pdf_data, jso["doc_layout_result"], local_image_rw, is_debug=True)
+
 
 @cli.command()
-@click.option('--pdf', type=click.Path(exists=True), required=True, help='PDF文件的路径')
-@click.option('--model', type=click.Path(exists=True), help='模型的路径')
+@click.option(
+    "--pdf", type=click.Path(exists=True), required=True, help="PDF文件的路径"
+)
+@click.option("--model", type=click.Path(exists=True), help="模型的路径")
 def pdf_command(pdf, model):
     # 这里处理pdf和模型相关的逻辑
-    print(f'处理PDF: {pdf}')
-    print(f'加载模型: {model}')
+    if model is None:
+        model = pdf.replace(".pdf", ".json")
+        if not os.path.exists(model):
+            print(f"make sure json file existed and place under {os.dirname(pdf)}")
+            os.eixt(1)
+
+    def read_fn(path):
+        disk_rw = DiskReaderWriter(os.path.dirname(path))
+        return disk_rw.read(os.path.basename(path), MODE_BIN)
+
+    pdf_data = read_fn(pdf)
+    jso = json_parse.loads(read_fn(model).decode("utf-8"))
+
+    local_image_dir, _ = prepare_env()
+    local_image_rw = DiskReaderWriter(local_image_dir)
+    parse_union_pdf(pdf_data, jso["doc_layout_result"], local_image_rw, is_debug=True)
+
 
-if __name__ == '__main__':
+if __name__ == "__main__":
+    # The string below is a bare expression with no effect at runtime; it is
+    # kept as an example invocation of the json-command subcommand.
+    """
+    python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/format/v070/part-66028dd46437-000076.jsonl?bytes=0,308393
+    """
+    # Hand control to the click group, which dispatches to the subcommand.
     cli()

+ 9 - 8
magic_pdf/io/DiskReaderWriter.py

@@ -5,9 +5,11 @@ from loguru import logger
 
 MODE_TXT = "text"
 MODE_BIN = "binary"
+
+
 class DiskReaderWriter(AbsReaderWriter):
 
-    def __init__(self, parent_path, encoding='utf-8'):
+    def __init__(self, parent_path, encoding="utf-8"):
+        # parent_path: base directory stored as self.path; paths given to the
+        #   other methods are joined onto it (os.path.join(self.path, path)).
+        # encoding: text encoding passed to open() for text-mode reads/writes.
         self.path = parent_path
         self.encoding = encoding
 
@@ -20,10 +22,10 @@ class DiskReaderWriter(AbsReaderWriter):
             logger.error(f"文件 {abspath} 不存在")
             raise Exception(f"文件 {abspath} 不存在")
         if mode == MODE_TXT:
-            with open(abspath, 'r', encoding = self.encoding) as f:
+            with open(abspath, "r", encoding=self.encoding) as f:
                 return f.read()
         elif mode == MODE_BIN:
-            with open(abspath, 'rb') as f:
+            with open(abspath, "rb") as f:
                 return f.read()
         else:
             raise ValueError("Invalid mode. Use 'text' or 'binary'.")
@@ -34,20 +36,21 @@ class DiskReaderWriter(AbsReaderWriter):
         else:
             abspath = os.path.join(self.path, path)
         if mode == MODE_TXT:
-            with open(abspath, 'w', encoding=self.encoding) as f:
+            with open(abspath, "w", encoding=self.encoding) as f:
                 f.write(content)
                 logger.info(f"内容已成功写入 {abspath}")
 
         elif mode == MODE_BIN:
-            with open(abspath, 'wb') as f:
+            with open(abspath, "wb") as f:
                 f.write(content)
                 logger.info(f"内容已成功写入 {abspath}")
         else:
             raise ValueError("Invalid mode. Use 'text' or 'binary'.")
 
-    def read_jsonl(self, path: str, byte_start=0, byte_end=None, encoding='utf-8'):
+    def read_jsonl(self, path: str, byte_start=0, byte_end=None, encoding="utf-8"):
+        # Delegates straight to self.read(path).
+        # NOTE(review): byte_start, byte_end and encoding are accepted but not
+        # used here, so the whole file is read with read()'s default mode --
+        # confirm callers do not rely on the byte range for local files.
         return self.read(path)
 
+
 # 使用示例
 if __name__ == "__main__":
     file_path = "io/example.txt"
@@ -60,5 +63,3 @@ if __name__ == "__main__":
     content = drw.read(path=file_path)
     if content:
         logger.info(f"从 {file_path} 读取的内容: {content}")
-
-

+ 16 - 6
magic_pdf/libs/config_reader.py

@@ -2,6 +2,7 @@
 根据bucket的名字返回对应的s3 AK, SK,endpoint三元组
 
 """
+
 import json
 import os
 
@@ -10,11 +11,7 @@ from loguru import logger
 from magic_pdf.libs.commons import parse_bucket_key
 
 
-def get_s3_config(bucket_name: str):
-    """
-    ~/magic-pdf.json 读出来
-    """
-
+def read_config():
     home_dir = os.path.expanduser("~")
 
     config_file = os.path.join(home_dir, "magic-pdf.json")
@@ -24,6 +21,14 @@ def get_s3_config(bucket_name: str):
 
     with open(config_file, "r") as f:
         config = json.load(f)
+    return config
+
+
+def get_s3_config(bucket_name: str):
+    """
+    ~/magic-pdf.json 读出来
+    """
+    config = read_config()
 
     bucket_info = config.get("bucket_info")
     if bucket_name not in bucket_info:
@@ -49,5 +54,10 @@ def get_bucket_name(path):
     return bucket
 
 
-if __name__ == '__main__':
+def get_local_dir():
+    config = read_config()
+    return config.get("temp-output-dir", "/tmp")
+
+
+if __name__ == "__main__":
     ak, sk, endpoint = get_s3_config("llm-raw")

+ 23 - 0
magic_pdf/libs/path_utils.py

@@ -0,0 +1,23 @@
+
+
+from s3pathlib import S3Path
+
+def remove_non_official_s3_args(s3path):
+    """
+    example: s3://abc/xxxx.json?bytes=0,81350 ==> s3://abc/xxxx.json
+    """
+    arr = s3path.split("?")
+    return arr[0]
+
+def parse_s3path(s3path: str):
+    p = S3Path(remove_non_official_s3_args(s3path))
+    return p.bucket, p.key
+
+def parse_s3_range_params(s3path: str):
+    """
+    example: s3://abc/xxxx.json?bytes=0,81350 ==> [0, 81350]
+    """
+    arr = s3path.split("?bytes=")
+    if len(arr) == 1:
+        return None
+    return arr[1].split(",")

+ 2 - 1
requirements.txt

@@ -15,4 +15,5 @@ wordninja>=2.0.0
 en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
 zh_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.7.0/zh_core_web_sm-3.7.0-py3-none-any.whl
 scikit-learn==1.4.1.post1
-nltk==3.8.1
+nltk==3.8.1
+s3pathlib>=2.1.1