# benchmark.py (3.2 KB)

  1. import zipfile
  2. import os
  3. import shutil
  4. import json
  5. import markdown_calculate
  6. code_path = os.environ.get('GITHUB_WORKSPACE')
  7. #数据集存放路径
  8. pdf_dev_path = "/share/quyuan/mineru/data/"
  9. #magicpdf最终结果
  10. pdf_res_path = "/share/quyuan/mineru/data/mineru"
  11. file_types = ["academic_literature", "atlas", "courseware", "colorful_textbook", "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper", "ordinary_textbook", "research_report", "special_exam_paper"]
  12. def test_cli():
  13. #magicpdf模型输出结果
  14. magicpdf_path = os.path.join(pdf_dev_path, "output")
  15. rm_cmd = "rm -rf %s" % (pdf_res_path)
  16. os.system(rm_cmd)
  17. os.makedirs(pdf_res_path)
  18. cmd = 'cd %s && export PYTHONPATH=. && find %s -type f -name "*.pdf" | xargs -I{} python magic_pdf/cli/magicpdf.py pdf-command --pdf {}' % (code_path, magicpdf_path)
  19. os.system(cmd)
  20. for root, dirs, files in os.walk(pdf_res_path):
  21. for magic_file in files:
  22. for file_type in file_types:
  23. target_dir = os.path.join(pdf_dev_path, "ci", file_type, "magicpdf")
  24. if magic_file.endswith(".md") and magic_file.startswith(file_type):
  25. source_file = os.path.join(root, magic_file)
  26. target_file = os.path.join(pdf_dev_path, "ci", file_type, "magicpdf", magic_file)
  27. if not os.path.exists(target_dir):
  28. os.makedirs(target_dir)
  29. shutil.copy(source_file, target_file)
  30. def calculate_score():
  31. data_path = os.path.join(pdf_dev_path, "ci")
  32. cmd = "cd %s && export PYTHONPATH=. && python tools/clean_photo.py --tool_name annotations --download_dir %s" % (code_path, data_path)
  33. os.system(cmd)
  34. cmd = "cd %s && export PYTHONPATH=. && python tools/clean_photo.py --tool_name magicpdf --download_dir %s" % (code_path, data_path)
  35. os.system(cmd)
  36. score = markdown_calculate.Scoring(os.path.join(data_path, "result.json"))
  37. score.calculate_similarity_total("magicpdf", file_types, data_path)
  38. res = score.summary_scores()
  39. return res
  40. def extrat_zip(zip_file_path, extract_to_path):
  41. if zipfile.is_zipfile(zip_file_path):
  42. with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
  43. zip_ref.extractall(extract_to_path)
  44. print(f'Files extracted to {extract_to_path}')
  45. else:
  46. print(f'{zip_file_path} is not a zip file')
  47. def ci_ben():
  48. fr = open(os.path.join(pdf_dev_path, "ci", "result.json"), "r")
  49. lines = fr.readlines()
  50. last_line = lines[-1].strip()
  51. last_score = json.loads(last_line)
  52. print ("last_score:", last_score)
  53. last_simscore = last_score["average_sim_score"]
  54. last_editdistance = last_score["average_edit_distance"]
  55. last_bleu = last_score["average_bleu_score"]
  56. extrat_zip(os.path.join(pdf_dev_path, 'output.zip'), os.path.join(pdf_dev_path))
  57. test_cli()
  58. now_score = calculate_score()
  59. print ("now_score:", now_score)
  60. now_simscore = now_score["average_sim_score"]
  61. now_editdistance = now_score["average_edit_distance"]
  62. now_bleu = now_score["average_bleu_score"]
  63. assert last_simscore <= now_simscore
  64. assert last_editdistance <= now_editdistance
  65. assert last_bleu <= now_bleu
  66. if __name__ == "__main__":
  67. ci_ben()