
Merge remote-tracking branch 'origin/master'

# Conflicts:
#	magic_pdf/libs/pdf_image_tools.py
赵小蒙 · 1 year ago
commit cf3e8519db

+ 7 - 4
.github/workflows/benchmark.yml

@@ -40,12 +40,15 @@ jobs:
           pip install -r requirements.txt
         fi
 
-
-    - name: benchmark
+    - name: config-net-reset
+      run: |
+        export http_proxy=""
+        export https_proxy=""
+    - name: get-benchmark-result
       run: |
         echo "start test"
-        cd tools && python text_badcase.py pdf_json_label_0306.json pdf_json_label_0229.json json_files.zip text_badcase text_overall base_data_text.json
-        python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip ocr_badcase ocr_overall base_data_ocr.json
+        cd tools && python text_badcase.py pdf_json_label_0306.json pdf_json_label_0229.json json_files.zip text_badcase text_overall base_data_text.json --s3_bucket_name llm-process-pperf --s3_file_directory qa-validate/pdf-datasets/badcase --AWS_ACCESS_KEY 7X9CWNHIVOHH3LXRD5WK  --AWS_SECRET_KEY IHLyTsv7h4ArzReLWUGZNKvwqB7CMrRi6e7ZyUt0 --END_POINT_URL http://p-ceph-norm-inside.pjlab.org.cn:80
+        python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip ocr_badcase ocr_overall base_data_ocr.json --s3_bucket_name llm-process-pperf --s3_file_directory qa-validate/pdf-datasets/badcase --AWS_ACCESS_KEY 7X9CWNHIVOHH3LXRD5WK  --AWS_SECRET_KEY IHLyTsv7h4ArzReLWUGZNKvwqB7CMrRi6e7ZyUt0 --END_POINT_URL http://p-ceph-norm-inside.pjlab.org.cn:80
   
   notify_to_feishu:
     if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}
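
The workflow now clears any inherited proxy settings before the benchmark step and passes the S3 bucket, directory, credentials, and endpoint to the two badcase scripts on the command line. Below is a hypothetical argparse sketch of that CLI surface; the option names are copied from the command above, but the real parsers in tools/text_badcase.py and tools/ocr_badcase.py may be organized differently.

# Hypothetical sketch of the CLI exercised by benchmark.yml above; argument
# names are taken from the workflow command line, not from the scripts' code.
import argparse

def build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="PDF extraction badcase evaluation")
    parser.add_argument("standard_file")      # e.g. pdf_json_label_0306.json
    parser.add_argument("test_file")          # e.g. pdf_json_label_0229.json / ocr_dataset.json
    parser.add_argument("zip_file")           # e.g. json_files.zip
    parser.add_argument("badcase_path")       # e.g. text_badcase / ocr_badcase
    parser.add_argument("overall_path")       # e.g. text_overall / ocr_overall
    parser.add_argument("base_data_path")     # e.g. base_data_text.json / base_data_ocr.json
    parser.add_argument("--s3_bucket_name", default=None)
    parser.add_argument("--s3_file_directory", default=None)
    parser.add_argument("--AWS_ACCESS_KEY", default=None)
    parser.add_argument("--AWS_SECRET_KEY", default=None)
    parser.add_argument("--END_POINT_URL", default=None)
    return parser

if __name__ == "__main__":
    print(build_parser().parse_args())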

+ 2 - 2
magic_pdf/cli/magicpdf.py

@@ -50,7 +50,7 @@ def get_pdf_parse_method(method):
 
 def prepare_env():
     local_parent_dir = os.path.join(
-        get_local_dir(), "magic-pdf", datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+        get_local_dir(), "magic-pdf", datetime.now().strftime("%Y-%m-%d")
     )
 
     local_image_dir = os.path.join(local_parent_dir, "images")
@@ -132,7 +132,7 @@ def pdf_command(pdf, model, method):
     local_image_dir, _ = prepare_env()
     local_image_rw = DiskReaderWriter(local_image_dir)
     parse = get_pdf_parse_method(method)
-    parse(pdf_data, jso["doc_layout_result"], local_image_rw, is_debug=True)
+    parse(pdf_data, jso, local_image_rw, is_debug=True)
 
 
 if __name__ == "__main__":
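
With the "%Y-%m-%d" format, every run on the same day now writes into the same dated directory instead of a fresh per-second one, and pdf_command now passes the whole jso object to the parser rather than only jso["doc_layout_result"]. A self-contained sketch of the new directory layout follows; get_local_dir() is a stand-in, and the makedirs call and second return value are assumptions not shown in the hunk.

# Self-contained sketch of the date-based output layout after this change.
import os
from datetime import datetime

def get_local_dir() -> str:
    # stand-in for magic_pdf's real config lookup
    return os.path.expanduser("~")

def prepare_env():
    local_parent_dir = os.path.join(
        get_local_dir(), "magic-pdf", datetime.now().strftime("%Y-%m-%d")
    )
    local_image_dir = os.path.join(local_parent_dir, "images")
    os.makedirs(local_image_dir, exist_ok=True)
    # pdf_command only uses the first value: local_image_dir, _ = prepare_env()
    return local_image_dir, local_parent_dir

if __name__ == "__main__":
    print(prepare_env())  # e.g. ('~/magic-pdf/2024-03-08/images', '~/magic-pdf/2024-03-08')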

+ 5 - 2
magic_pdf/io/DiskReaderWriter.py

@@ -35,6 +35,9 @@ class DiskReaderWriter(AbsReaderWriter):
             abspath = path
         else:
             abspath = os.path.join(self.path, path)
+        directory_path = os.path.dirname(abspath)
+        if not os.path.exists(directory_path):
+            os.makedirs(directory_path)
         if mode == MODE_TXT:
             with open(abspath, "w", encoding=self.encoding) as f:
                 f.write(content)
@@ -53,11 +56,11 @@ class DiskReaderWriter(AbsReaderWriter):
 
 # 使用示例
 if __name__ == "__main__":
-    file_path = "io/example.txt"
+    file_path = "io/test/example.txt"
     drw = DiskReaderWriter("D:\projects\papayfork\Magic-PDF\magic_pdf")
 
     # 写入内容到文件
-    drw.write(b"Hello, World!", path="io/example.txt", mode="binary")
+    drw.write(b"Hello, World!", path="io/test/example.txt", mode="binary")
 
     # 从文件读取内容
     content = drw.read(path=file_path)
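
write() now creates the parent directory of the target path on demand, which is what lets the updated example write to io/test/example.txt without preparing the folder first. A standalone sketch of just that behavior, assuming a text-mode write:

# Standalone sketch of the directory-creation behavior added to
# DiskReaderWriter.write: the parent directory of the target path is created
# on demand before the file is written.
import os

def write_text(root: str, rel_path: str, content: str, encoding: str = "utf-8") -> None:
    abspath = rel_path if os.path.isabs(rel_path) else os.path.join(root, rel_path)
    directory_path = os.path.dirname(abspath)
    if not os.path.exists(directory_path):
        os.makedirs(directory_path)
    with open(abspath, "w", encoding=encoding) as f:
        f.write(content)

if __name__ == "__main__":
    # "io/test" does not need to exist beforehand; it is created on first write.
    write_text("/tmp/magic_pdf_demo", "io/test/example.txt", "Hello, World!")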

+ 1 - 1
magic_pdf/libs/config_reader.py

@@ -17,7 +17,7 @@ def read_config():
     config_file = os.path.join(home_dir, "magic-pdf.json")
 
     if not os.path.exists(config_file):
-        raise Exception("magic-pdf.json not found")
+        raise Exception(f"{config_file} not found")
 
     with open(config_file, "r") as f:
         config = json.load(f)

+ 1 - 1
magic_pdf/libs/pdf_image_tools.py

@@ -5,7 +5,7 @@ from magic_pdf.libs.commons import join_path
 from magic_pdf.libs.hash_utils import compute_sha256
 
 
-def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter):
+def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter:AbsReaderWriter):
     """
     从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径
     save_path:需要同时支持s3和本地, 图片存放在save_path下,文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。
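
Only the signature is shown in this hunk (this is also the file that carried the merge conflict), so the matching AbsReaderWriter import is assumed to be handled elsewhere in the file. The stand-in classes below only illustrate the contract the new annotation expresses and the file-naming convention from the docstring; they are not the real implementations.

# Simplified stand-ins, not the real classes: they only show that imageWriter
# must satisfy the AbsReaderWriter contract and how the output file name is
# derived from page_num and the rounded bbox.
from abc import ABC, abstractmethod

class AbsReaderWriter(ABC):
    @abstractmethod
    def write(self, content, path, mode="text"):
        ...

class InMemoryWriter(AbsReaderWriter):
    def __init__(self):
        self.files = {}

    def write(self, content, path, mode="text"):
        self.files[path] = content

def cut_image_stub(bbox: tuple, page_num: int, return_path: str, imageWriter: AbsReaderWriter) -> str:
    # The real cut_image renders the bbox region of the fitz.Page as a jpg;
    # here we only emit a placeholder payload under the documented name.
    name = f"{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}.jpg"
    image_path = f"{return_path}/{name}"
    imageWriter.write(b"", image_path, mode="binary")
    return image_path

if __name__ == "__main__":
    writer = InMemoryWriter()
    print(cut_image_stub((10.2, 20.7, 110.0, 220.5), 0, "images", writer))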

+ 2 - 0
tools/README.MD

@@ -0,0 +1,2 @@
+# 工具脚本使用说明
+

+ 20 - 15
tools/ocr_badcase.py

@@ -756,7 +756,7 @@ def merge_json_data(json_test_df, json_standard_df):
 
     return inner_merge, standard_exist, test_exist
 
-def save_results(result_dict,overall_report_dict,badcase_path,overall_path,):
+def save_results(result_dict,overall_report_dict,badcase_path,overall_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url):
     """
     将结果字典保存为JSON文件至指定路径。
 
@@ -764,18 +764,19 @@ def save_results(result_dict,overall_report_dict,badcase_path,overall_path,):
     - result_dict: 包含计算结果的字典。
     - overall_path: 结果文件的保存路径,包括文件名。
     """
+    with open(overall_path, 'w', encoding='utf-8') as f:
+    # 将结果字典转换为JSON格式并写入文件
+        json.dump(overall_report_dict, f, ensure_ascii=False, indent=4)
+    final_overall_path = upload_to_s3(overall_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
+    overall_path_res = "OCR抽取方案整体评测指标结果请查看:" + final_overall_path
+    print(f'\033[31m{overall_path_res}\033[0m')
     # 打开指定的文件以写入
     with open(badcase_path, 'w', encoding='utf-8') as f:
         # 将结果字典转换为JSON格式并写入文件
         json.dump(result_dict, f, ensure_ascii=False, indent=4)
-
-    print(f"计算结果已经保存到文件:{badcase_path}")
-
-    with open(overall_path, 'w', encoding='utf-8') as f:
-    # 将结果字典转换为JSON格式并写入文件
-        json.dump(overall_report_dict, f, ensure_ascii=False, indent=4)
-
-    print(f"计算结果已经保存到文件:{overall_path}")
+    final_badcase_path = upload_to_s3(badcase_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
+    badcase_path_res = "OCR抽取方案评测badcase输出报告查看:" + final_badcase_path
+    print(f'\033[31m{badcase_path_res}\033[0m')
 
 def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRET_KEY, END_POINT_URL):
     """
@@ -792,8 +793,9 @@ def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRE
         
         # 上传文件到S3
         s3.upload_file(file_path, bucket_name, s3_object_key)
-        
-        print(f"文件 {file_path} 成功上传到S3存储桶 {bucket_name} 中的目录 {s3_directory},文件名为 {file_name}")
+        s3_path = f"http://st.bigdata.shlab.tech/S3_Browser?output_path=s3://{bucket_name}/{s3_directory}/{file_name}"
+        return s3_path
+        #print(f"文件 {file_path} 成功上传到S3存储桶 {bucket_name} 中的目录 {s3_directory},文件名为 {file_name}")
     except FileNotFoundError:
         print(f"文件 {file_path} 未找到,请检查文件路径是否正确。")
     except NoCredentialsError:
@@ -801,6 +803,7 @@ def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRE
     except ClientError as e:
         print(f"上传文件时发生错误:{e}")
 
+
 def generate_filename(badcase_path,overall_path):
     """
     生成带有当前时间戳的输出文件名。
@@ -864,17 +867,19 @@ def main(standard_file, test_file, zip_file, badcase_path, overall_path,base_dat
     badcase_file,overall_file = generate_filename(badcase_path,overall_path)
 
     # 保存结果到JSON文件
-    save_results(result_dict, overall_report_dict,badcase_file,overall_file)
+    #save_results(result_dict, overall_report_dict,badcase_file,overall_file)
+    save_results(result_dict, overall_report_dict,badcase_file,overall_file,  s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
 
     result=compare_edit_distance(base_data_path, overall_report_dict)
-
+    """
     if all([s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url]):
         try:
             upload_to_s3(badcase_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
             upload_to_s3(overall_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
         except Exception as e:
-            print(f"上传到S3时发生错误: {e}")
-    print(result)
+            print(f"上传到S3时发生错误: {e}")    
+    """
+    #print(result)
     assert result == 1
 
 if __name__ == "__main__":
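
save_results now writes each JSON report, hands it straight to upload_to_s3, and prints the returned S3 browser link in red instead of the old local-path messages; the trailing upload block in main() is disabled by turning it into a string literal. A hedged sketch of the upload path follows; the names mirror the diff, but the boto3 client construction is an assumption since it sits outside the shown hunks.

# Hedged sketch of the upload path used by save_results: a boto3 S3 client
# pointed at the internal Ceph endpoint uploads the report, and the browser
# URL is built from bucket/directory/file name as in the diff above.
import os
import boto3

def upload_report(file_path, bucket_name, s3_directory, access_key, secret_key, endpoint_url):
    s3 = boto3.client(
        "s3",
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_key,
        endpoint_url=endpoint_url,
    )
    file_name = os.path.basename(file_path)
    s3_object_key = f"{s3_directory}/{file_name}"
    s3.upload_file(file_path, bucket_name, s3_object_key)
    return f"http://st.bigdata.shlab.tech/S3_Browser?output_path=s3://{bucket_name}/{s3_directory}/{file_name}"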

+ 16 - 12
tools/text_badcase.py

@@ -768,7 +768,7 @@ def merge_json_data(json_test_df, json_standard_df):
 
     return inner_merge, standard_exist, test_exist
 
-def save_results(result_dict,overall_report_dict,badcase_path,overall_path,):
+def save_results(result_dict,overall_report_dict,badcase_path,overall_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url):
     """
     将结果字典保存为JSON文件至指定路径。
 
@@ -776,18 +776,21 @@ def save_results(result_dict,overall_report_dict,badcase_path,overall_path,):
     - result_dict: 包含计算结果的字典。
     - overall_path: 结果文件的保存路径,包括文件名。
     """
+    with open(overall_path, 'w', encoding='utf-8') as f:
+    # 将结果字典转换为JSON格式并写入文件
+        json.dump(overall_report_dict, f, ensure_ascii=False, indent=4)
+    final_overall_path = upload_to_s3(overall_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
+    overall_path_res = "文本型PDF抽取方案整体评测指标结果请查看:" + final_overall_path
+    print(f'\033[31m{overall_path_res}\033[0m')
     # 打开指定的文件以写入
     with open(badcase_path, 'w', encoding='utf-8') as f:
         # 将结果字典转换为JSON格式并写入文件
         json.dump(result_dict, f, ensure_ascii=False, indent=4)
+    final_badcase_path = upload_to_s3(badcase_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
+    badcase_path_res = "文本型PDF抽取方案评测badcase输出报告查看:" + final_badcase_path
+    print(f'\033[31m{badcase_path_res}\033[0m')
 
-    print(f"计算结果已经保存到文件:{badcase_path}")
 
-    with open(overall_path, 'w', encoding='utf-8') as f:
-    # 将结果字典转换为JSON格式并写入文件
-        json.dump(overall_report_dict, f, ensure_ascii=False, indent=4)
-
-    print(f"计算结果已经保存到文件:{overall_path}")
 
     
 def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRET_KEY, END_POINT_URL):
@@ -805,8 +808,9 @@ def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRE
         
         # 上传文件到S3
         s3.upload_file(file_path, bucket_name, s3_object_key)
-        
-        print(f"文件 {file_path} 成功上传到S3存储桶 {bucket_name} 中的目录 {s3_directory},文件名为 {file_name}")
+        s3_path = f"http://st.bigdata.shlab.tech/S3_Browser?output_path=s3://{bucket_name}/{s3_directory}/{file_name}"
+        return s3_path
+        #print(f"文件 {file_path} 成功上传到S3存储桶 {bucket_name} 中的目录 {s3_directory},文件名为 {file_name}")
     except FileNotFoundError:
         print(f"文件 {file_path} 未找到,请检查文件路径是否正确。")
     except NoCredentialsError:
@@ -875,17 +879,17 @@ def main(standard_file, test_file, zip_file, badcase_path, overall_path,base_dat
     badcase_file,overall_file = generate_filename(badcase_path,overall_path)
 
     # 保存结果到JSON文件
-    save_results(result_dict, overall_report_dict,badcase_file,overall_file)
+    save_results(result_dict, overall_report_dict,badcase_file,overall_file,  s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
 
     result=compare_edit_distance(base_data_path, overall_report_dict)
-
+    """
     if all([s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url]):
         try:
             upload_to_s3(badcase_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
             upload_to_s3(overall_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
         except Exception as e:
             print(f"上传到S3时发生错误: {e}")
-    print(result)
+    """
     assert result == 1
 
 if __name__ == "__main__":
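
text_badcase.py mirrors the ocr_badcase.py changes: the reports are uploaded from inside save_results and the standalone upload block in main() is commented out. A condensed sketch of that shared save_results flow, with upload_to_s3 abstracted into an injected callable for brevity:

# Condensed sketch of the reworked save_results flow shared by both scripts:
# write each JSON report locally, push it to S3, and print the returned
# browser link in red.
import json

def save_results_sketch(result_dict, overall_report_dict, badcase_path, overall_path, upload):
    with open(overall_path, "w", encoding="utf-8") as f:
        json.dump(overall_report_dict, f, ensure_ascii=False, indent=4)
    print(f"\033[31mOverall metrics report: {upload(overall_path)}\033[0m")

    with open(badcase_path, "w", encoding="utf-8") as f:
        json.dump(result_dict, f, ensure_ascii=False, indent=4)
    print(f"\033[31mBadcase report: {upload(badcase_path)}\033[0m")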