|
|
@@ -27,6 +27,7 @@ import sys
|
|
|
import click
|
|
|
from loguru import logger
|
|
|
from pathlib import Path
|
|
|
+from magic_pdf.libs.version import __version__
|
|
|
|
|
|
from magic_pdf.libs.MakeContentConfig import DropMode
|
|
|
from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
|
|
|
@@ -52,7 +53,7 @@ def prepare_env(pdf_file_name, method):
|
|
|
get_local_dir(), "magic-pdf", pdf_file_name, method
|
|
|
)
|
|
|
|
|
|
- local_image_dir = os.path.join(local_parent_dir, "images")
|
|
|
+ local_image_dir = os.path.join(str(local_parent_dir), "images")
|
|
|
local_md_dir = local_parent_dir
|
|
|
os.makedirs(local_image_dir, exist_ok=True)
|
|
|
os.makedirs(local_md_dir, exist_ok=True)
|
|
|
@@ -102,6 +103,7 @@ def cli():
|
|
|
|
|
|
|
|
|
@cli.command()
|
|
|
+@click.version_option(__version__, "--version", "-v", help="显示版本信息")
|
|
|
@click.option("--json", type=str, help="输入一个S3路径")
|
|
|
@click.option(
|
|
|
"--method",
|
|
|
@@ -158,63 +160,65 @@ def json_command(json, method):
|
|
|
)
|
|
|
|
|
|
|
|
|
- @cli.command()
|
|
|
- @click.option("--local_json", type=str, help="输入一个本地jsonl路径")
|
|
|
- @click.option(
|
|
|
- "--method",
|
|
|
- type=parse_pdf_methods,
|
|
|
- help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
|
|
|
- default="auto",
|
|
|
- )
|
|
|
- def local_json_command(local_json, method):
|
|
|
- def read_s3_path(s3path):
|
|
|
- bucket, key = parse_s3path(s3path)
|
|
|
+@cli.command()
|
|
|
+@click.version_option(__version__, "--version", "-v", help="显示版本信息")
|
|
|
+@click.option("--local_json", type=str, help="输入一个本地jsonl路径")
|
|
|
+@click.option(
|
|
|
+ "--method",
|
|
|
+ type=parse_pdf_methods,
|
|
|
+ help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
|
|
|
+ default="auto",
|
|
|
+)
|
|
|
+def local_json_command(local_json, method):
|
|
|
+ def read_s3_path(s3path):
|
|
|
+ bucket, key = parse_s3path(s3path)
|
|
|
|
|
|
- s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
|
|
|
- s3_rw = S3ReaderWriter(
|
|
|
- s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
|
|
|
- )
|
|
|
- may_range_params = parse_s3_range_params(s3path)
|
|
|
- if may_range_params is None or 2 != len(may_range_params):
|
|
|
- byte_start, byte_end = 0, None
|
|
|
- else:
|
|
|
- byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
|
|
|
- byte_end += byte_start - 1
|
|
|
- return s3_rw.read_jsonl(
|
|
|
- remove_non_official_s3_args(s3path),
|
|
|
- byte_start,
|
|
|
- byte_end,
|
|
|
- AbsReaderWriter.MODE_BIN,
|
|
|
+ s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
|
|
|
+ s3_rw = S3ReaderWriter(
|
|
|
+ s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
|
|
|
+ )
|
|
|
+ may_range_params = parse_s3_range_params(s3path)
|
|
|
+ if may_range_params is None or 2 != len(may_range_params):
|
|
|
+ byte_start, byte_end = 0, None
|
|
|
+ else:
|
|
|
+ byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
|
|
|
+ byte_end += byte_start - 1
|
|
|
+ return s3_rw.read_jsonl(
|
|
|
+ remove_non_official_s3_args(s3path),
|
|
|
+ byte_start,
|
|
|
+ byte_end,
|
|
|
+ AbsReaderWriter.MODE_BIN,
|
|
|
+ )
|
|
|
+
|
|
|
+ with open(local_json, "r", encoding="utf-8") as f:
|
|
|
+ for json_line in f:
|
|
|
+ jso = json_parse.loads(json_line)
|
|
|
+
|
|
|
+ s3_file_path = jso.get("file_location")
|
|
|
+ if s3_file_path is None:
|
|
|
+ s3_file_path = jso.get("path")
|
|
|
+ pdf_file_name = Path(s3_file_path).stem
|
|
|
+ pdf_data = read_s3_path(s3_file_path)
|
|
|
+ local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
|
|
|
+
|
|
|
+ local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
|
|
|
+ local_md_dir
|
|
|
)
|
|
|
|
|
|
- with open(local_json, "r", encoding="utf-8") as f:
|
|
|
- for json_line in f:
|
|
|
- jso = json_parse.loads(json_line)
|
|
|
-
|
|
|
- s3_file_path = jso.get("file_location")
|
|
|
- if s3_file_path is None:
|
|
|
- s3_file_path = jso.get("path")
|
|
|
- pdf_file_name = Path(s3_file_path).stem
|
|
|
- pdf_data = read_s3_path(s3_file_path)
|
|
|
- local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
|
|
|
-
|
|
|
- local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
|
|
|
- local_md_dir
|
|
|
- )
|
|
|
-
|
|
|
- _do_parse(
|
|
|
- pdf_file_name,
|
|
|
- pdf_data,
|
|
|
- jso["doc_layout_result"],
|
|
|
- method,
|
|
|
- local_image_rw,
|
|
|
- local_md_rw,
|
|
|
- os.path.basename(local_image_dir),
|
|
|
- local_md_dir
|
|
|
- )
|
|
|
+ _do_parse(
|
|
|
+ pdf_file_name,
|
|
|
+ pdf_data,
|
|
|
+ jso["doc_layout_result"],
|
|
|
+ method,
|
|
|
+ local_image_rw,
|
|
|
+ local_md_rw,
|
|
|
+ os.path.basename(local_image_dir),
|
|
|
+ local_md_dir
|
|
|
+ )
|
|
|
|
|
|
|
|
|
@cli.command()
|
|
|
+@click.version_option(__version__, "--version", "-v", help="显示版本信息")
|
|
|
@click.option(
|
|
|
"--pdf", type=click.Path(exists=True), required=True, help="PDF文件的路径"
|
|
|
)
|
|
|
@@ -256,6 +260,7 @@ def pdf_command(pdf, model, method):
|
|
|
)
|
|
|
|
|
|
|
|
|
+
|
|
|
if __name__ == "__main__":
|
|
|
"""
|
|
|
python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551
|