quyuan 1 rok temu
rodzic
commit
194f34af68
4 zmienionych plików z 35 dodań i 201 usunięć
  1. 0 87
      tools/base_data_ocr.json
  2. 0 88
      tools/base_data_text.json
  3. 19 14
      tools/ocr_badcase.py
  4. 16 12
      tools/text_badcase.py

+ 0 - 87
tools/base_data_ocr.json

@@ -1,87 +0,0 @@
-{
-    "accuracy": 1.0,
-    "precision": 1.0,
-    "recall": 1.0,
-    "f1_score": 1.0,
-    "pdf间的平均编辑距离": 133.10256410256412,
-    "pdf间的平均bleu": 0.28838311595434046,
-    "分段准确率": 0.07220216606498195,
-    "行内公式准确率": {
-        "accuracy": 0.004835727492533068,
-        "precision": 0.008790072388831437,
-        "recall": 0.010634970284641852,
-        "f1_score": 0.009624911535739562
-    },
-    "行内公式编辑距离": 1.6176470588235294,
-    "行内公式bleu": 0.17154724654721457,
-    "行间公式准确率": {
-        "accuracy": 0.08490566037735849,
-        "precision": 0.1836734693877551,
-        "recall": 0.13636363636363635,
-        "f1_score": 0.1565217391304348
-    },
-    "行间公式编辑距离": 113.22222222222223,
-    "行间公式bleu": 0.2531053359913409,
-    "丢弃文本准确率": {
-        "accuracy": 0.00035398230088495576,
-        "precision": 0.0006389776357827476,
-        "recall": 0.0007930214115781126,
-        "f1_score": 0.0007077140835102619
-    },
-    "丢弃文本标签准确率": {
-        "color_background_header_txt_block": {
-            "precision": 0.0,
-            "recall": 0.0,
-            "f1-score": 0.0,
-            "support": 41.0
-        },
-        "header": {
-            "precision": 0.0,
-            "recall": 0.0,
-            "f1-score": 0.0,
-            "support": 4.0
-        },
-        "footnote": {
-            "precision": 1.0,
-            "recall": 0.009708737864077669,
-            "f1-score": 0.019230769230769232,
-            "support": 103.0
-        },
-        "on-table": {
-            "precision": 0.0,
-            "recall": 0.0,
-            "f1-score": 0.0,
-            "support": 665.0
-        },
-        "rotate": {
-            "precision": 0.0,
-            "recall": 0.0,
-            "f1-score": 0.0,
-            "support": 63.0
-        },
-        "on-image": {
-            "precision": 0.0,
-            "recall": 0.0,
-            "f1-score": 0.0,
-            "support": 380.0
-        },
-        "micro avg": {
-            "precision": 1.0,
-            "recall": 0.0007961783439490446,
-            "f1-score": 0.0015910898965791568,
-            "support": 1256.0
-        }
-    },
-    "丢弃图片准确率": {
-        "accuracy": 0.0,
-        "precision": 0.0,
-        "recall": 0.0,
-        "f1_score": 0.0
-    },
-    "丢弃表格准确率": {
-        "accuracy": 0.0,
-        "precision": 0.0,
-        "recall": 0.0,
-        "f1_score": 0.0
-    }
-}

+ 0 - 88
tools/base_data_text.json

@@ -1,88 +0,0 @@
-{
-    "accuracy": 1.0,
-    "precision": 1.0,
-    "recall": 1.0,
-    "f1_score": 1.0,
-    "pdf间的平均编辑距离": 19.82051282051282,
-    "pdf间的平均bleu": 0.9002485609584511,
-    "阅读顺序编辑距离": 0.3176895306859206,
-    "分段准确率": 0.8989169675090253,
-    "行内公式准确率": {
-        "accuracy": 0.9782741738066095,
-        "precision": 0.9782741738066095,
-        "recall": 1.0,
-        "f1_score": 0.9890177880897139
-    },
-    "行内公式编辑距离": 0.0,
-    "行内公式bleu": 0.20340450120213166,
-    "行间公式准确率": {
-        "accuracy": 1.0,
-        "precision": 1.0,
-        "recall": 1.0,
-        "f1_score": 1.0
-    },
-    "行间公式编辑距离": 0.0,
-    "行间公式bleu": 0.3662262622386575,
-    "丢弃文本准确率": {
-        "accuracy": 0.867870036101083,
-        "precision": 0.9064856711915535,
-        "recall": 0.9532117367168914,
-        "f1_score": 0.9292616930807885
-    },
-    "丢弃文本标签准确率": {
-        "color_background_header_txt_block": {
-            "precision": 0.0,
-            "recall": 0.0,
-            "f1-score": 0.0,
-            "support": 41.0
-        },
-        "rotate": {
-            "precision": 1.0,
-            "recall": 0.9682539682539683,
-            "f1-score": 0.9838709677419355,
-            "support": 63.0
-        },
-        "footnote": {
-            "precision": 1.0,
-            "recall": 0.883495145631068,
-            "f1-score": 0.9381443298969072,
-            "support": 103.0
-        },
-        "header": {
-            "precision": 1.0,
-            "recall": 1.0,
-            "f1-score": 1.0,
-            "support": 4.0
-        },
-        "on-image": {
-            "precision": 0.9947643979057592,
-            "recall": 1.0,
-            "f1-score": 0.9973753280839895,
-            "support": 380.0
-        },
-        "on-table": {
-            "precision": 1.0,
-            "recall": 0.9443609022556391,
-            "f1-score": 0.97138437741686,
-            "support": 665.0
-        },
-        "micro avg": {
-            "precision": 0.9982847341337907,
-            "recall": 0.9267515923566879,
-            "f1-score": 0.9611890999174236,
-            "support": 1256.0
-        }
-    },
-    "丢弃图片准确率": {
-        "accuracy": 0.8666666666666667,
-        "precision": 0.9285714285714286,
-        "recall": 0.9285714285714286,
-        "f1_score": 0.9285714285714286
-    },
-    "丢弃表格准确率": {
-        "accuracy": 0,
-        "precision": 0,
-        "recall": 0,
-        "f1_score": 0
-    }
-}

+ 19 - 14
tools/ocr_badcase.py

@@ -756,7 +756,7 @@ def merge_json_data(json_test_df, json_standard_df):
 
     return inner_merge, standard_exist, test_exist
 
-def save_results(result_dict,overall_report_dict,badcase_path,overall_path,):
+def save_results(result_dict,overall_report_dict,badcase_path,overall_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url):
     """
     将结果字典保存为JSON文件至指定路径。
 
@@ -764,19 +764,20 @@ def save_results(result_dict,overall_report_dict,badcase_path,overall_path,):
     - result_dict: 包含计算结果的字典。
     - overall_path: 结果文件的保存路径,包括文件名。
     """
+    with open(overall_path, 'w', encoding='utf-8') as f:
+    # 将结果字典转换为JSON格式并写入文件
+        json.dump(overall_report_dict, f, ensure_ascii=False, indent=4)
+    final_overall_path = upload_to_s3(overall_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
+    overall_path_res = "OCR抽取方案整体评测指标结果请查看:" + final_overall_path
+    print(f'\033[31m{overall_path_res}\033[0m')
     # 打开指定的文件以写入
     with open(badcase_path, 'w', encoding='utf-8') as f:
         # 将结果字典转换为JSON格式并写入文件
         json.dump(result_dict, f, ensure_ascii=False, indent=4)
-    badcase_path_res = "OCR抽取方案评测badcase输出报告查看:" + badcase_path
+    final_badcase_path = upload_to_s3(badcase_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
+    badcase_path_res = "OCR抽取方案评测badcase输出报告查看:" + final_badcase_path
     print(f'\033[31m{badcase_path_res}\033[0m')
 
-    with open(overall_path, 'w', encoding='utf-8') as f:
-    # 将结果字典转换为JSON格式并写入文件
-        json.dump(overall_report_dict, f, ensure_ascii=False, indent=4)
-    overall_path_res = "OCR抽取方案整体评测指标结果请查看:" + overall_path
-    print(f'\033[31m{overall_path_res}\033[0m')
-
 def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRET_KEY, END_POINT_URL):
     """
     上传文件到Amazon S3
@@ -792,8 +793,9 @@ def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRE
         
         # 上传文件到S3
         s3.upload_file(file_path, bucket_name, s3_object_key)
-        
-        print(f"文件 {file_path} 成功上传到S3存储桶 {bucket_name} 中的目录 {s3_directory},文件名为 {file_name}")
+        s3_path = f"http://st.bigdata.shlab.tech/S3_Browser?output_path=s3://{bucket_name}/{s3_directory}/{file_name}"
+        return s3_path
+        #print(f"文件 {file_path} 成功上传到S3存储桶 {bucket_name} 中的目录 {s3_directory},文件名为 {file_name}")
     except FileNotFoundError:
         print(f"文件 {file_path} 未找到,请检查文件路径是否正确。")
     except NoCredentialsError:
@@ -801,6 +803,7 @@ def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRE
     except ClientError as e:
         print(f"上传文件时发生错误:{e}")
 
+
 def generate_filename(badcase_path,overall_path):
     """
     生成带有当前时间戳的输出文件名。
@@ -864,17 +867,19 @@ def main(standard_file, test_file, zip_file, badcase_path, overall_path,base_dat
     badcase_file,overall_file = generate_filename(badcase_path,overall_path)
 
     # 保存结果到JSON文件
-    save_results(result_dict, overall_report_dict,badcase_file,overall_file)
+    #save_results(result_dict, overall_report_dict,badcase_file,overall_file)
+    save_results(result_dict, overall_report_dict,badcase_file,overall_file,  s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
 
     result=compare_edit_distance(base_data_path, overall_report_dict)
-
+    """
     if all([s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url]):
         try:
             upload_to_s3(badcase_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
             upload_to_s3(overall_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
         except Exception as e:
-            print(f"上传到S3时发生错误: {e}")
-    print(result)
+            print(f"上传到S3时发生错误: {e}")    
+    """
+    #print(result)
     assert result == 1
 
 if __name__ == "__main__":

+ 16 - 12
tools/text_badcase.py

@@ -768,7 +768,7 @@ def merge_json_data(json_test_df, json_standard_df):
 
     return inner_merge, standard_exist, test_exist
 
-def save_results(result_dict,overall_report_dict,badcase_path,overall_path,):
+def save_results(result_dict,overall_report_dict,badcase_path,overall_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url):
     """
     将结果字典保存为JSON文件至指定路径。
 
@@ -776,18 +776,21 @@ def save_results(result_dict,overall_report_dict,badcase_path,overall_path,):
     - result_dict: 包含计算结果的字典。
     - overall_path: 结果文件的保存路径,包括文件名。
     """
+    with open(overall_path, 'w', encoding='utf-8') as f:
+    # 将结果字典转换为JSON格式并写入文件
+        json.dump(overall_report_dict, f, ensure_ascii=False, indent=4)
+    final_overall_path = upload_to_s3(overall_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
+    overall_path_res = "文本型PDF抽取方案整体评测指标结果请查看:" + final_overall_path
+    print(f'\033[31m{overall_path_res}\033[0m')
     # 打开指定的文件以写入
     with open(badcase_path, 'w', encoding='utf-8') as f:
         # 将结果字典转换为JSON格式并写入文件
         json.dump(result_dict, f, ensure_ascii=False, indent=4)
-    badcase_path_res = "文本型PDF抽取方案评测badcase输出报告查看:" + badcase_path
+    final_badcase_path = upload_to_s3(badcase_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
+    badcase_path_res = "文本型PDF抽取方案评测badcase输出报告查看:" + final_badcase_path
     print(f'\033[31m{badcase_path_res}\033[0m')
 
-    with open(overall_path, 'w', encoding='utf-8') as f:
-    # 将结果字典转换为JSON格式并写入文件
-        json.dump(overall_report_dict, f, ensure_ascii=False, indent=4)
-    overall_path_res = "文本型PDF抽取方案整体评测指标结果请查看:" + overall_path
-    print(f'\033[31m{overall_path_res}\033[0m')
+
 
     
 def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRET_KEY, END_POINT_URL):
@@ -805,8 +808,9 @@ def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRE
         
         # 上传文件到S3
         s3.upload_file(file_path, bucket_name, s3_object_key)
-        
-        print(f"文件 {file_path} 成功上传到S3存储桶 {bucket_name} 中的目录 {s3_directory},文件名为 {file_name}")
+        s3_path = f"http://st.bigdata.shlab.tech/S3_Browser?output_path=s3://{bucket_name}/{s3_directory}/{file_name}"
+        return s3_path
+        #print(f"文件 {file_path} 成功上传到S3存储桶 {bucket_name} 中的目录 {s3_directory},文件名为 {file_name}")
     except FileNotFoundError:
         print(f"文件 {file_path} 未找到,请检查文件路径是否正确。")
     except NoCredentialsError:
@@ -875,17 +879,17 @@ def main(standard_file, test_file, zip_file, badcase_path, overall_path,base_dat
     badcase_file,overall_file = generate_filename(badcase_path,overall_path)
 
     # 保存结果到JSON文件
-    save_results(result_dict, overall_report_dict,badcase_file,overall_file)
+    save_results(result_dict, overall_report_dict,badcase_file,overall_file,  s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
 
     result=compare_edit_distance(base_data_path, overall_report_dict)
-
+    """
     if all([s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url]):
         try:
             upload_to_s3(badcase_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
             upload_to_s3(overall_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
         except Exception as e:
             print(f"上传到S3时发生错误: {e}")
-    print(result)
+    """
     assert result == 1
 
 if __name__ == "__main__":