markdown_calculate.py 5.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
  1. import os
  2. from Levenshtein import distance
  3. from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu
  4. from nltk.tokenize import word_tokenize
  5. import json
  6. import re
  7. import scoring
  8. import argparse
  9. # 初始化列表来存储编辑距离和BLEU分数
  10. class Scoring:
  11. def __init__(self, result_path):
  12. self.edit_distances = []
  13. self.bleu_scores = []
  14. self.sim_scores = []
  15. self.filenames = []
  16. self.score_dict = {}
  17. self.anntion_cnt = 0
  18. self.fw = open(result_path, "w+")
  19. def simple_bleu_score(self, candidate, reference):
  20. candidate_tokens = word_tokenize(candidate)
  21. reference_tokens = word_tokenize(reference)
  22. return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=SmoothingFunction().method1)
  23. def preprocess_string(self, s):
  24. sub_enter = re.sub(r'\n+', '\n', s)
  25. return re.sub(r' ', ' ', sub_enter)
  26. def calculate_similarity(self, annotion, actual, tool_type):
  27. class_dict = {}
  28. edit_distances = []
  29. bleu_scores = []
  30. sim_scores = list()
  31. total_file = 0
  32. for filename in os.listdir(annotion):
  33. if filename.endswith('.md') and not filename.startswith('.'): # 忽略隐藏文件
  34. total_file = total_file + 1
  35. # 读取A目录中的文件
  36. with open(os.path.join(annotion, filename), 'r', encoding='utf-8') as file_a:
  37. content_a = file_a.read()
  38. self.anntion_cnt = self.anntion_cnt + 1
  39. filepath_b = os.path.join(actual, filename)
  40. if os.path.exists(filepath_b):
  41. with open(filepath_b, 'r', encoding='utf-8') as file_b:
  42. content_b = file_b.read()
  43. self.filenames.append(filename)
  44. # 计算编辑距离
  45. edit_dist = distance(self.preprocess_string(content_b),self.preprocess_string(content_a)) / max(len(content_a), len(content_b))
  46. self.edit_distances.append(edit_dist)
  47. edit_distances.append(edit_dist)
  48. #计算BLUE分数
  49. bleu_score = self.simple_bleu_score(content_b, content_a)
  50. bleu_scores.append(bleu_score)
  51. self.bleu_scores.append(bleu_score)
  52. #计算marker分数
  53. score = scoring.score_text(content_b, content_a)
  54. sim_scores.append(score)
  55. self.sim_scores.append(score)
  56. class_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}
  57. self.score_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}
  58. else:
  59. print(f"File {filename} not found in actual directory.")
  60. # 计算每类平均值
  61. class_average_edit_distance = sum(edit_distances) / len(edit_distances) if edit_distances else 0
  62. class_average_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
  63. class_average_sim_score = sum(sim_scores) / len(sim_scores) if sim_scores else 0
  64. self.fw.write(json.dumps(class_dict, ensure_ascii=False) + "\n")
  65. ratio = len(class_dict)/total_file
  66. self.fw.write(f"{tool_type} extract ratio: {ratio}" + "\n")
  67. self.fw.write(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}" + "\n")
  68. self.fw.write(f"{tool_type} Average BLEU Score: {class_average_bleu_score}" + "\n")
  69. self.fw.write(f"{tool_type} Average Sim Score: {class_average_sim_score}" + "\n")
  70. print (f"{tool_type} extract ratio: {ratio}")
  71. print (f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}")
  72. print (f"{tool_type} Average BLEU Score: {class_average_bleu_score}")
  73. print (f"{tool_type} Average Sim Score: {class_average_sim_score}")
  74. return self.score_dict
  75. def summary_scores(self):
  76. # 计算整体平均值
  77. over_all_dict = dict()
  78. average_edit_distance = sum(self.edit_distances) / len(self.edit_distances) if self.edit_distances else 0
  79. average_bleu_score = sum(self.bleu_scores) / len(self.bleu_scores) if self.bleu_scores else 0
  80. average_sim_score = sum(self.sim_scores) / len(self.sim_scores) if self.sim_scores else 0
  81. over_all_dict["average_edit_distance"] = average_edit_distance
  82. over_all_dict["average_bleu_score"] = average_bleu_score
  83. over_all_dict["average_sim_score"] = average_sim_score
  84. self.fw.write(json.dumps(over_all_dict, ensure_ascii=False) + "\n")
  85. return over_all_dict
  86. def calculate_similarity_total(self, tool_type, file_types, download_dir):
  87. for file_type in file_types:
  88. annotion = os.path.join(download_dir, file_type, "annotations", "cleaned")
  89. actual = os.path.join(download_dir, file_type, tool_type, "cleaned")
  90. self.calculate_similarity(annotion, actual, file_type)