markdown_calculate.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899
  1. import os
  2. from Levenshtein import distance
  3. from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu
  4. from nltk.tokenize import word_tokenize
  5. import json
  6. import re
  7. import scoring
  8. import argparse
  9. import nltk
  10. nltk.download('punkt')
  11. # Initialize lists to store edit distances and BLEU scores
  12. class Scoring:
  13. def __init__(self, result_path):
  14. self.edit_distances = []
  15. self.bleu_scores = []
  16. self.sim_scores = []
  17. self.filenames = []
  18. self.score_dict = {}
  19. self.anntion_cnt = 0
  20. self.fw = open(result_path, "w+")
  21. def simple_bleu_score(self, candidate, reference):
  22. candidate_tokens = word_tokenize(candidate)
  23. reference_tokens = word_tokenize(reference)
  24. return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=SmoothingFunction().method1)
  25. def preprocess_string(self, s):
  26. sub_enter = re.sub(r'\n+', '\n', s)
  27. return re.sub(r' ', ' ', sub_enter)
  28. def calculate_similarity(self, annotion, actual, tool_type):
  29. class_dict = {}
  30. edit_distances = []
  31. bleu_scores = []
  32. sim_scores = list()
  33. total_file = 0
  34. for filename in os.listdir(annotion):
  35. if filename.endswith('.md') and not filename.startswith('.'): # 忽略隐藏文件
  36. total_file = total_file + 1
  37. # 读取A目录中的文件
  38. with open(os.path.join(annotion, filename), 'r', encoding='utf-8') as file_a:
  39. content_a = file_a.read()
  40. self.anntion_cnt = self.anntion_cnt + 1
  41. filepath_b = os.path.join(actual, filename)
  42. if os.path.exists(filepath_b):
  43. with open(filepath_b, 'r', encoding='utf-8') as file_b:
  44. content_b = file_b.read()
  45. self.filenames.append(filename)
  46. # 计算编辑距离
  47. edit_dist = distance(self.preprocess_string(content_b),self.preprocess_string(content_a)) / max(len(content_a), len(content_b))
  48. self.edit_distances.append(edit_dist)
  49. edit_distances.append(edit_dist)
  50. #计算BLUE分数
  51. bleu_score = self.simple_bleu_score(content_b, content_a)
  52. bleu_scores.append(bleu_score)
  53. self.bleu_scores.append(bleu_score)
  54. #计算marker分数
  55. score = scoring.score_text(content_b, content_a)
  56. sim_scores.append(score)
  57. self.sim_scores.append(score)
  58. class_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}
  59. self.score_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}
  60. else:
  61. print(f"File {filename} not found in actual directory.")
  62. # 计算每类平均值
  63. class_average_edit_distance = sum(edit_distances) / len(edit_distances) if edit_distances else 0
  64. class_average_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
  65. class_average_sim_score = sum(sim_scores) / len(sim_scores) if sim_scores else 0
  66. self.fw.write(json.dumps(class_dict, ensure_ascii=False) + "\n")
  67. ratio = len(class_dict)/total_file
  68. self.fw.write(f"{tool_type} extract ratio: {ratio}" + "\n")
  69. self.fw.write(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}" + "\n")
  70. self.fw.write(f"{tool_type} Average BLEU Score: {class_average_bleu_score}" + "\n")
  71. self.fw.write(f"{tool_type} Average Sim Score: {class_average_sim_score}" + "\n")
  72. print (f"{tool_type} extract ratio: {ratio}")
  73. print (f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}")
  74. print (f"{tool_type} Average BLEU Score: {class_average_bleu_score}")
  75. print (f"{tool_type} Average Sim Score: {class_average_sim_score}")
  76. return self.score_dict
  77. def summary_scores(self):
  78. # 计算整体平均值
  79. over_all_dict = dict()
  80. average_edit_distance = sum(self.edit_distances) / len(self.edit_distances) if self.edit_distances else 0
  81. average_bleu_score = sum(self.bleu_scores) / len(self.bleu_scores) if self.bleu_scores else 0
  82. average_sim_score = sum(self.sim_scores) / len(self.sim_scores) if self.sim_scores else 0
  83. over_all_dict["average_edit_distance"] = average_edit_distance
  84. over_all_dict["average_bleu_score"] = average_bleu_score
  85. over_all_dict["average_sim_score"] = average_sim_score
  86. self.fw.write(json.dumps(over_all_dict, ensure_ascii=False) + "\n")
  87. return over_all_dict
  88. def calculate_similarity_total(self, tool_type, file_types, download_dir):
  89. for file_type in file_types:
  90. annotion = os.path.join(download_dir, file_type, "annotations", "cleaned")
  91. actual = os.path.join(download_dir, file_type, tool_type, "cleaned")
  92. self.calculate_similarity(annotion, actual, file_type)