| 12345678910111213141516171819202122232425262728293031323334353637 |
- from pathlib import Path
- import click
- import json
- from demo.pdf2md import main
- @click.command()
- @click.option("--pdf-file-path", help="s3上pdf文件的路径")
- @click.option("--pdf-name", help="pdf name")
- def main_shell(pdf_file_path: str, pdf_name: str):
- with open('/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset_final_rotated_formulafix_highdpi_scihub.json', 'r') as f:
- samples = json.load(f)
- for sample in samples:
- pdf_file_path = sample['s3_path']
- pdf_bin_file_profile = "outsider"
- pdf_name = sample['pdf_name']
- pdf_model_dir = f"s3://llm-pdf-text/eval_1k/layout_res/{pdf_name}"
- pdf_model_profile = "langchao"
- p = Path(pdf_file_path)
- pdf_file_name = p.name # pdf文件名字,含后缀
- #pdf_model_dir = join_path(pdf_model_parent_dir, pdf_file_name)
- main(
- pdf_file_path,
- pdf_bin_file_profile,
- pdf_model_dir,
- pdf_model_profile,
- debug_mode=True,
- )
-
-
- if __name__ == "__main__":
- main_shell()
|