ソースを参照

feat: 添加表格一致性检查和警告,优化API结果处理逻辑

zhch158_admin 1 ヶ月 前
コミット
508f9d1254
1 ファイル変更、24 行追加、3 行削除
  1. 24 3
      zhch/ppstructurev3_single_client.py

+ 24 - 3
zhch/ppstructurev3_single_client.py

@@ -221,7 +221,24 @@ def convert_api_result_to_json(api_result: Dict[str, Any],
                 if original_html != normalized_html:
                     table_item['pred_html'] = normalized_html
                     changes_count += len([1 for o, n in zip(original_html, normalized_html) if o != n])
-        
+
+        # 检查是否需要修复表格一致性(这里只做统计,实际修复可能需要更复杂的逻辑)
+        # 统计表格数量
+        parsing_res_tables_count = 0
+        table_res_list_count = 0
+        if 'parsing_res_list' in converted_json:
+            parsing_res_tables_count = len([item for item in converted_json['parsing_res_list'] 
+                                          if 'block_label' in item and item['block_label'] == 'table'])
+        if 'table_res_list' in converted_json:
+            table_res_list_count = len(converted_json["table_res_list"])
+        table_consistency_fixed = False
+        if parsing_res_tables_count != table_res_list_count:
+            warnings.warn(f"⚠️ Warning: {filename} Table count mismatch - parsing_res_list has {parsing_res_tables_count} tables, "
+                          f"but table_res_list has {table_res_list_count} tables.")
+            table_consistency_fixed = True
+            # 这里可以添加实际的修复逻辑,例如根据需要添加或删除表格项
+            # 但由于缺乏具体规则,暂时只做统计和警告(注意: table_consistency_fixed=True 仅表示检测到数量不一致,并未实际修复)
+
         # 3. 标准化 overall_ocr_res 中的识别文本
         # ocr_res = converted_json.get('overall_ocr_res', {})
         # if 'rec_texts' in ocr_res:
@@ -240,7 +257,10 @@ def convert_api_result_to_json(api_result: Dict[str, Any],
         converted_json['processing_info'] = {
             "normalize_numbers": normalize_numbers,
             "changes_applied": changes_count > 0,
-            "character_changes_count": changes_count
+            "character_changes_count": changes_count,
+            "parsing_res_tables_count": parsing_res_tables_count,
+            "table_res_list_count": table_res_list_count,
+            "table_consistency_fixed": table_consistency_fixed
         }
         
         # if changes_count > 0:
@@ -656,7 +676,8 @@ if __name__ == "__main__":
         
         # 默认配置
         default_config = {
-            "input_dir": "../../OmniDocBench/OpenDataLab___OmniDocBench/images",
+            "input_file": "/Users/zhch158/workspace/data/至远彩色印刷工业有限公司/data_PPStructureV3_Results/2023年度报告母公司/2023年度报告母公司_page_027.png",
+            # "input_dir": "../../OmniDocBench/OpenDataLab___OmniDocBench/images",
             "output_dir": "./OmniDocBench_API_Results",
             "api_url": "http://10.192.72.11:8111/layout-parsing",
             "timeout": "300",