
update require txt

quyuan committed 1 year ago
commit 4bedda7b03
2 changed files with 10 additions and 52 deletions
  1. tools/benchmark.py (+2 -2)
  2. tools/markdown_calculate.py (+8 -50)

+ 2 - 2
tools/benchmark.py

@@ -36,8 +36,8 @@ def calculate_score():
     os.system(cmd)
     cmd = "cd %s && export PYTHONPATH=. && python tools/clean_photo.py --tool_name magicpdf --download_dir %s" % (code_path, data_path)
     os.system(cmd)
-    score = markdown_calculate.Scoring()
-    score.calculate_similarity_total("magicpdf", file_types, os.path.join(data_path, "result.json"))
+    score = markdown_calculate.Scoring(os.path.join(data_path, "result.json"))
+    score.calculate_similarity_total("magicpdf", file_types, data_path)
     res = score.summary_scores()
     return res
 

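The call-site change above means the result-file path is now bound when the Scoring object is constructed, while calculate_similarity_total takes the download directory instead of the result path. A minimal usage sketch under that reading (data_path and file_types are placeholder values, not from this commit):

    import os

    from tools import markdown_calculate

    # Hypothetical inputs; a real run supplies its own paths and types.
    data_path = "/path/to/download_dir"
    file_types = ["academic_literature", "notes"]

    # New API: result.json goes to the constructor, the download dir to the method.
    score = markdown_calculate.Scoring(os.path.join(data_path, "result.json"))
    score.calculate_similarity_total("magicpdf", file_types, data_path)
    print(score.summary_scores())

One side effect worth noting: the new self.fw = open(result_path, "w+") drops the encoding='utf-8' that the deleted module-level open() used, and the handle is never closed, so callers that need deterministic flushing would have to close score.fw themselves.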
+ 8 - 50
tools/markdown_calculate.py

@@ -7,44 +7,16 @@ import re
 import scoring
 import argparse
 
-parser = argparse.ArgumentParser(description="get directory")
-parser.add_argument('--document_types', 
-    nargs='+',
-    choices=["academic_literature", "atlas", "courseware", "colorful_textbook", "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper", "ordinary_textbook", "research_report", "special_exam_paper"], 
-    help='Choose one or more document_types',
-    default=["academic_literature", "atlas", "courseware", "colorful_textbook", "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper", "ordinary_textbook", "research_report", "special_exam_paper"]
-)
-
-parser.add_argument(
-    "--tool_name",
-    type=str,
-    required=True,
-    help="tool name",
-)
-parser.add_argument(
-    "--download_dir",
-    type=str,
-    required=True,
-    help="input download dir",
-)
-parser.add_argument(
-    "--results",
-    type=str,
-    required=True,
-    help="results path(end with .json)",
-)
-args = parser.parse_args()
-fw = open(args.results, 'w+', encoding='utf-8')
 # Initialize lists to store edit distances and BLEU scores
 class Scoring:
-    def __init__(self):
+    def __init__(self, result_path):
         self.edit_distances = []
         self.bleu_scores = []
         self.sim_scores = []
         self.filenames = []
         self.score_dict = {}
         self.anntion_cnt = 0
-
+        self.fw = open(result_path, "w+")
     def simple_bleu_score(self, candidate, reference):  
         candidate_tokens = word_tokenize(candidate)  
         reference_tokens = word_tokenize(reference) 
@@ -93,12 +65,12 @@ class Scoring:
         class_average_edit_distance = sum(edit_distances) / len(edit_distances) if edit_distances else 0  
         class_average_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0  
         class_average_sim_score = sum(sim_scores) / len(sim_scores) if sim_scores else 0
-        fw.write(json.dumps(class_dict, ensure_ascii=False) + "\n")
+        self.fw.write(json.dumps(class_dict, ensure_ascii=False) + "\n")
         ratio = len(class_dict)/total_file
-        fw.write(f"{tool_type} extract ratio:  {ratio}" + "\n")
-        fw.write(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}" + "\n")
-        fw.write(f"{tool_type} Average BLEU Score: {class_average_bleu_score}" + "\n")
-        fw.write(f"{tool_type} Average Sim Score: {class_average_sim_score}" + "\n")
+        self.fw.write(f"{tool_type} extract ratio:  {ratio}" + "\n")
+        self.fw.write(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}" + "\n")
+        self.fw.write(f"{tool_type} Average BLEU Score: {class_average_bleu_score}" + "\n")
+        self.fw.write(f"{tool_type} Average Sim Score: {class_average_sim_score}" + "\n")
 
         print (f"{tool_type} extract ratio: {ratio}")
         print (f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}")
@@ -115,7 +87,7 @@ class Scoring:
         over_all_dict["average_edit_distance"] = average_edit_distance
         over_all_dict["average_bleu_score"] = average_bleu_score
         over_all_dict["average_sim_score"] = average_sim_score
-        fw.write(json.dumps(over_all_dict, ensure_ascii=False) + "\n")
+        self.fw.write(json.dumps(over_all_dict, ensure_ascii=False) + "\n")
         return over_all_dict
 
     def calculate_similarity_total(self, tool_type, file_types, download_dir):
@@ -124,17 +96,3 @@ class Scoring:
             actual = os.path.join(download_dir, file_type, tool_type, "cleaned")
             self.calculate_similarity(annotion, actual, file_type)
 
-if __name__ == "__main__":  
-  file_types = list()
-  tool_type =args.tool_name
-  download_dir = args.download_dir
-  if args.document_types:
-    print("Selected types:", args.document_types)
-    for type_ in args.document_types:
-        file_types.append(type_)
-  else:
-      print("No types selected")
-  print(f"Type {file_types} is selected. Executing related operations...")
-  score = Scoring()
-  score.calculate_similarity_total(tool_type, file_types, download_dir)
-  score.summary_scores()
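
With the argparse setup and the if __name__ == "__main__" block deleted, markdown_calculate.py can no longer be run standalone; benchmark.py is now the only entry point. If direct command-line use were still wanted, a thin wrapper along these lines would restore it (a sketch that mirrors the deleted parser; it is not part of this commit):

    # hypothetical wrapper script, not part of this commit
    import argparse

    from tools.markdown_calculate import Scoring

    DOCUMENT_TYPES = [
        "academic_literature", "atlas", "courseware", "colorful_textbook",
        "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper",
        "ordinary_textbook", "research_report", "special_exam_paper",
    ]

    def main():
        parser = argparse.ArgumentParser(description="get directory")
        parser.add_argument("--document_types", nargs="+", choices=DOCUMENT_TYPES,
                            default=DOCUMENT_TYPES,
                            help="choose one or more document types")
        parser.add_argument("--tool_name", type=str, required=True, help="tool name")
        parser.add_argument("--download_dir", type=str, required=True,
                            help="input download dir")
        parser.add_argument("--results", type=str, required=True,
                            help="results path (ends with .json)")
        args = parser.parse_args()

        # The result path now goes to the constructor instead of a module-level open().
        score = Scoring(args.results)
        score.calculate_similarity_total(args.tool_name, args.document_types,
                                         args.download_dir)
        score.summary_scores()

    if __name__ == "__main__":
        main()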