# benchmark.py
import os
import shutil
import zipfile
  4. code_path = os.environ.get('GITHUB_WORKSPACE')
  5. pdf_dev_path = "/home/quyuan/data"
  6. pdf_res_path = "/home/quyuan/code/Magic-PDF/Magic-PDF/Magic-PDF/ci/magic-pdf"
  7. def test_cli():
  8. cmd = 'cd %s && export PYTHONPATH=. && find %s -type f -name "*.pdf" | xargs -I{} python magic_pdf/cli/magicpdf.py pdf-command --pdf {}' % (code_path, pdf_dev_path)
  9. os.system(cmd)
  10. if not os.path.exists(os.path.join(pdf_dev_path, "output")):
  11. os.makedirs(os.path.join(pdf_dev_path, "output"))
  12. for annotaion_name in os.listdir(os.path.join(pdf_dev_path, "output")):
  13. if annotaion_name.endswith('.pdf'):
  14. for pdf_res_path in os.listdir(pdf_res_path):
  15. if ".md" in os.path.join(pdf_res_path, annotaion_name, "auto"):
  16. prefix = annotaion_name.split('_')[-2]
  17. if not os.path.exists(os.join(pdf_dev_path, prefix)):
  18. os.makedirs(os.path.join(pdf_dev_path, prefix))
  19. shutil.copy(os.path.join(pdf_res_path, annotaion_name, "auto", annotaion_name + ".md"), os.join(pdf_dev_path, prefix, annotaion_name + ".md"))
  20. def calculate_score():
  21. cmd = "cd %s && export PYTHONPATH=. && python tools/clean_photo.py --tool_name annotations --download_dir %s" % (code_path, pdf_dev_path)
  22. os.system(cmd)
  23. cmd = "cd %s && export PYTHONPATH=. && python tools/clean_photo.py --tool_name magicpdf --download_dir %s" % (code_path, pdf_dev_path)
  24. os.system(cmd)
  25. cmd = "cd %s && export PYTHONPATH=. && python tools/markdown_calculate.py --tool_name pdf-command --download_dir %s --results %s" % (code_path, pdf_dev_path, os.path.join(pdf_dev_path, "result.json"))
  26. os.system(cmd)
  27. def extrat_zip(zip_file_path, extract_to_path):
  28. if zipfile.is_zipfile(zip_file_path):
  29. with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
  30. zip_ref.extractall(extract_to_path)
  31. print(f'Files extracted to {extract_to_path}')
  32. else:
  33. print(f'{zip_file_path} is not a zip file')
  34. if __name__ == "__main__":
  35. extrat_zip(os.path.join(pdf_dev_path, 'output.zip'), os.path.join(pdf_dev_path,'datasets'))
  36. test_cli()
  37. calculate_score()