markdown_calculate.py

import os
from Levenshtein import distance
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu
from nltk.tokenize import word_tokenize
import json
import re
import scoring
import argparse

parser = argparse.ArgumentParser(description="get directory")
parser.add_argument('--document_types',
                    nargs='+',
                    choices=["academic_literature", "atlas", "courseware", "colorful_textbook", "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper", "ordinary_textbook", "research_report", "special_exam_paper"],
                    help='Choose one or more document_types',
                    default=["academic_literature", "atlas", "courseware", "colorful_textbook", "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper", "ordinary_textbook", "research_report", "special_exam_paper"]
                    )
parser.add_argument(
    "--tool_name",
    type=str,
    required=True,
    help="tool name",
)
parser.add_argument(
    "--download_dir",
    type=str,
    required=True,
    help="input download dir",
)
parser.add_argument(
    "--results",
    type=str,
    required=True,
    help="results path (end with .json)",
)
args = parser.parse_args()
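# Example invocation (a sketch; "marker", the paths and the type names below are
# placeholders for whatever tool output and directory layout you actually have):
#   python markdown_calculate.py \
#       --tool_name marker \
#       --download_dir ./download \
#       --results ./results.json \
#       --document_types notes ordinary_books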
fw = open(args.results, 'w+', encoding='utf-8')


# Initialize lists to store the edit distances and BLEU scores
class Scoring:
    def __init__(self):
        self.edit_distances = []
        self.bleu_scores = []
        self.sim_scores = []
        self.filenames = []
        self.score_dict = {}
        self.anntion_cnt = 0

    def simple_bleu_score(self, candidate, reference):
        candidate_tokens = word_tokenize(candidate)
        reference_tokens = word_tokenize(reference)
        return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=SmoothingFunction().method1)
    def preprocess_string(self, s):
        # Collapse consecutive newlines and runs of spaces before comparing texts
        sub_enter = re.sub(r'\n+', '\n', s)
        return re.sub(r' +', ' ', sub_enter)
    def calculate_similarity(self, annotion, actual, tool_type):
        class_dict = {}
        edit_distances = []
        bleu_scores = []
        sim_scores = list()
        total_file = 0
        for filename in os.listdir(annotion):
            if filename.endswith('.md') and not filename.startswith('.'):  # skip hidden files
                total_file = total_file + 1
                # Read the annotation (ground-truth) file
                with open(os.path.join(annotion, filename), 'r', encoding='utf-8') as file_a:
                    content_a = file_a.read()
                    self.anntion_cnt = self.anntion_cnt + 1
                filepath_b = os.path.join(actual, filename)
                if os.path.exists(filepath_b):
                    with open(filepath_b, 'r', encoding='utf-8') as file_b:
                        content_b = file_b.read()
                        self.filenames.append(filename)
                        # Edit distance, normalized by the length of the longer text
                        edit_dist = distance(self.preprocess_string(content_b), self.preprocess_string(content_a)) / max(len(content_a), len(content_b))
                        self.edit_distances.append(edit_dist)
                        edit_distances.append(edit_dist)
                        # BLEU score
                        bleu_score = self.simple_bleu_score(content_b, content_a)
                        bleu_scores.append(bleu_score)
                        self.bleu_scores.append(bleu_score)
                        # marker score
                        score = scoring.score_text(content_b, content_a)
                        sim_scores.append(score)
                        self.sim_scores.append(score)
                        class_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}
                        self.score_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}
                else:
                    print(f"File {filename} not found in actual directory.")
        # Per-class averages
        class_average_edit_distance = sum(edit_distances) / len(edit_distances) if edit_distances else 0
        class_average_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
        class_average_sim_score = sum(sim_scores) / len(sim_scores) if sim_scores else 0
        fw.write(json.dumps(class_dict, ensure_ascii=False) + "\n")
        ratio = len(class_dict) / total_file
        fw.write(f"{tool_type} extract ratio: {ratio}" + "\n")
        fw.write(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}" + "\n")
        fw.write(f"{tool_type} Average BLEU Score: {class_average_bleu_score}" + "\n")
        fw.write(f"{tool_type} Average Sim Score: {class_average_sim_score}" + "\n")
        print(f"{tool_type} extract ratio: {ratio}")
        print(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}")
        print(f"{tool_type} Average BLEU Score: {class_average_bleu_score}")
        print(f"{tool_type} Average Sim Score: {class_average_sim_score}")
        return self.score_dict
    def summary_scores(self):
        # Overall averages across all document types
        average_edit_distance = sum(self.edit_distances) / len(self.edit_distances) if self.edit_distances else 0
        average_bleu_score = sum(self.bleu_scores) / len(self.bleu_scores) if self.bleu_scores else 0
        average_sim_score = sum(self.sim_scores) / len(self.sim_scores) if self.sim_scores else 0
        # self.fw.write(json.dumps(self.score_dict, ensure_ascii=False) + "\n")
        fw.write(f"Overall extract ratio: {len(self.score_dict)/self.anntion_cnt}" + "\n")
        fw.write(f"Overall Average Levenshtein Distance: {average_edit_distance}" + "\n")
        fw.write(f"Overall Average BLEU Score: {average_bleu_score}" + "\n")
        fw.write(f"Overall Average Marker Score: {average_sim_score}" + "\n")
        print("Overall extract ratio: ", len(self.score_dict)/self.anntion_cnt)
        print(f"Overall Average Levenshtein Distance: {average_edit_distance}")
        print(f"Overall Average BLEU Score: {average_bleu_score}")
        print(f"Overall Average Marker Score: {average_sim_score}")
        fw.close()
    def calculate_similarity_total(self, tool_type, file_types, download_dir):
        for file_type in file_types:
            annotion = os.path.join(download_dir, file_type, "annotations", "cleaned")
            actual = os.path.join(download_dir, file_type, tool_type, "cleaned")
            self.calculate_similarity(annotion, actual, file_type)
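# Directory layout assumed by calculate_similarity_total (derived from the path joins
# above); <tool_name> is whatever was passed via --tool_name:
#   <download_dir>/<file_type>/annotations/cleaned/*.md   ground-truth markdown
#   <download_dir>/<file_type>/<tool_name>/cleaned/*.md   markdown produced by the tool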
if __name__ == "__main__":
    file_types = list()
    tool_type = args.tool_name
    download_dir = args.download_dir
    if args.document_types:
        print("Selected types:", args.document_types)
        for type_ in args.document_types:
            file_types.append(type_)
    else:
        print("No types selected")
    print(f"Type {file_types} is selected. Executing related operations...")
    score = Scoring()
    score.calculate_similarity_total(tool_type, file_types, download_dir)
    score.summary_scores()
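# Sketch of what the --results file ends up containing, based on the fw.write calls
# above (filenames and numbers are illustrative, not measured): one JSON line of
# per-file scores per document type, followed by that type's summary lines, then the
# overall summary:
#   {"doc_0001.md": {"edit_dist": 0.12, "bleu_score": 0.45, "sim_score": 0.78}, ...}
#   notes extract ratio: 0.95
#   notes Average Levenshtein Distance: 0.12
#   notes Average BLEU Score: 0.45
#   notes Average Sim Score: 0.78
#   ...
#   Overall extract ratio: 0.95
#   Overall Average Levenshtein Distance: 0.12
#   Overall Average BLEU Score: 0.45
#   Overall Average Marker Score: 0.78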