# download.bak (955 B)

import json
import os
import subprocess

from magic_pdf.libs.commons import join_path
from tqdm import tqdm
  5. with open('/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset.json', 'r') as f:
  6. samples = json.load(f)
  7. pdf_model_dir = 's3://llm-pdf-text/eval_1k/layout_res/'
  8. labels = []
  9. det_res = []
  10. edit_distance_list = []
  11. for sample in tqdm(samples):
  12. pdf_name = sample['pdf_name']
  13. page_num = sample['page']
  14. pdf_model_path = join_path(pdf_model_dir, pdf_name)
  15. model_output_json = join_path(pdf_model_path, f"page_{page_num}.json") # 模型输出的页面编号从1开始的
  16. save_root_path = '/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_val_docxchain/'
  17. save_path = join_path(save_root_path, pdf_name)
  18. os.makedirs(save_path, exist_ok=True)
  19. # print("s3c cp {} {}".format(model_output_json, save_path))
  20. os.system("aws --profile langchao --endpoint-url=http://10.140.85.161:80 s3 cp {} {}".format(model_output_json, save_path))