| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293 |
- import os
- import sys
- from pathlib import Path
- import click
- from loguru import logger
- from magic_pdf.libs import join_path
- from magic_pdf.dict2md.mkcontent import mk_mm_markdown
- from magic_pdf.pipeline import parse_pdf_by_model
def main(s3_pdf_path: str, s3_pdf_profile: str, pdf_model_path: str, pdf_model_profile: str, start_page_num=0, debug_mode=True):
    """Parse one PDF with its model output and write the result as markdown.

    The output file is ``<dir of this file>/../../../tmp/unittest/md/<pdf file
    name>/book.md``. When the parse result has a truthy ``need_drop`` entry,
    its ``drop_reason`` is written instead of generated markdown.

    Any exception raised while parsing or writing is printed to stderr and
    logged; it is not re-raised.

    Args:
        s3_pdf_path: Location of the PDF. Its last path component (including
            the suffix) names the output directory.
        s3_pdf_profile: Passed through to ``parse_pdf_by_model``.
        pdf_model_path: Passed through to ``parse_pdf_by_model``.
        pdf_model_profile: Passed through to ``parse_pdf_by_model``.
        start_page_num: Passed through to ``parse_pdf_by_model``.
        debug_mode: Passed through as the ``debug_mode`` keyword.
    """
    book_name = Path(s3_pdf_path).name
    tmp_root = os.path.join(os.path.dirname(__file__), "../..", "..", "tmp", "unittest")
    md_root = join_path(tmp_root, "md")
    output_file = f"{md_root}/{book_name}/book.md"
    output_dir = os.path.dirname(output_file)

    try:
        parsed = parse_pdf_by_model(
            s3_pdf_path,
            s3_pdf_profile,
            pdf_model_path,
            md_root,
            book_name,
            pdf_model_profile,
            start_page_num,
            debug_mode=debug_mode,
        )

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # A dropped document gets its drop reason written in place of markdown.
        content = parsed['drop_reason'] if parsed.get('need_drop') else mk_mm_markdown(parsed)

        with open(output_file, "w", encoding="utf-8") as out:
            out.write(content)
    except Exception as err:
        # Report on both channels and keep going so batch callers can continue.
        print(f"ERROR: {s3_pdf_path}, {err}", file=sys.stderr)
        logger.exception(err)
@click.command()
@click.option("--pdf-file-path", required=True, help="s3上pdf文件的路径")
@click.option("--save-path", help="解析出来的图片,文本的保存父目录")
def main_shell(pdf_file_path: str, save_path: str):
    """CLI entry point: parse a single PDF located under the fixed S3 roots.

    ``pdf_file_path`` is split into a parent part and a file name. The parent
    part is appended to the hard-coded PDF root and to the hard-coded model
    root, and the resulting PDF/model locations are handed to ``main``.

    Args:
        pdf_file_path: Path of the PDF relative to the S3 roots below.
        save_path: Accepted for command-line compatibility but not used:
            ``main`` has no output-directory parameter. It was previously
            passed positionally and ended up in ``main``'s ``start_page_num``
            slot, which is a page number, not a directory.
    """
    # Example of a full PDF directory:
    #   s3://llm-raw-snew/llm-raw-scihub/scimag07865000-07865999/10.1007/
    pdf_bin_file_parent_path = "s3://llm-raw-snew/llm-raw-scihub/"
    pdf_bin_file_profile = "s2"
    pdf_model_parent_dir = "s3://llm-pdf-text/eval_1k/layout_res/"
    pdf_model_profile = "langchao"

    if save_path is not None:
        # Be explicit rather than silently dropping a user-supplied option.
        logger.warning("--save-path is currently ignored: main() does not accept an output directory")

    relative_pdf = Path(pdf_file_path)
    pdf_parent_path = relative_pdf.parent
    pdf_file_name = relative_pdf.name  # file name including the suffix

    pdf_bin_file_path = join_path(pdf_bin_file_parent_path, pdf_parent_path)
    pdf_model_dir = join_path(pdf_model_parent_dir, pdf_parent_path)

    # Only the four arguments main() actually expects; start_page_num keeps
    # its default of 0.
    main(
        join_path(pdf_bin_file_path, pdf_file_name),
        pdf_bin_file_profile,
        join_path(pdf_model_dir, pdf_file_name),
        pdf_model_profile,
    )
@click.command()
@click.option("--pdf-dir", required=True, help="s3上pdf文件的路径")
@click.option("--model-dir", required=True, help="与pdf同名的模型输出所在的目录")
@click.option("--start-page-num", default=0, help="从第几页开始解析")
def main_shell2(pdf_dir: str, model_dir: str, start_page_num: int):
    """CLI entry point: run ``main`` for one PDF or for every PDF in a directory.

    If ``pdf_dir`` points at a file, only that file is processed and its
    parent directory is used as the PDF directory. Otherwise every ``*.pdf``
    directly inside ``pdf_dir`` is processed, in sorted name order. For each
    PDF the model path is ``model_dir`` joined with the same file name. Both
    profile arguments of ``main`` are passed as ``None``.

    Args:
        pdf_dir: A PDF file or a directory containing PDF files.
        model_dir: Directory holding the model output, one entry per PDF,
            named like the PDF file.
        start_page_num: Forwarded to ``main`` as the page to start from.
    """
    pdf_root = Path(pdf_dir)
    model_root = Path(model_dir)

    # First collect the names of the PDF files to process.
    if pdf_root.is_file():
        pdf_file_names = [pdf_root.name]
        pdf_root = pdf_root.parent
    else:
        # Sorted so that runs are reproducible regardless of filesystem order.
        pdf_file_names = sorted(entry.name for entry in pdf_root.glob("*.pdf"))

    for pdf_file in pdf_file_names:
        pdf_file_path = os.path.join(pdf_root, pdf_file)
        model_file_path = os.path.join(model_root, pdf_file)
        main(pdf_file_path, None, model_file_path, None, start_page_num)
# When executed as a script, run the directory-based command; click parses
# the command-line arguments itself.
if __name__ == "__main__":
    main_shell2()
|