s3pdf2md.py 1.1 KB

12345678910111213141516171819202122232425262728293031323334353637
  1. from pathlib import Path
  2. import click
  3. import json
  4. from demo.pdf2md import main
  5. @click.command()
  6. @click.option("--pdf-file-path", help="s3上pdf文件的路径")
  7. @click.option("--pdf-name", help="pdf name")
  8. def main_shell(pdf_file_path: str, pdf_name: str):
  9. with open('/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset_final_rotated_formulafix_highdpi_scihub.json', 'r') as f:
  10. samples = json.load(f)
  11. for sample in samples:
  12. pdf_file_path = sample['s3_path']
  13. pdf_bin_file_profile = "outsider"
  14. pdf_name = sample['pdf_name']
  15. pdf_model_dir = f"s3://llm-pdf-text/eval_1k/layout_res/{pdf_name}"
  16. pdf_model_profile = "langchao"
  17. p = Path(pdf_file_path)
  18. pdf_file_name = p.name # pdf文件名字,含后缀
  19. #pdf_model_dir = join_path(pdf_model_parent_dir, pdf_file_name)
  20. main(
  21. pdf_file_path,
  22. pdf_bin_file_profile,
  23. pdf_model_dir,
  24. pdf_model_profile,
  25. debug_mode=True,
  26. )
  27. if __name__ == "__main__":
  28. main_shell()