|
|
@@ -221,7 +221,24 @@ def convert_api_result_to_json(api_result: Dict[str, Any],
|
|
|
if original_html != normalized_html:
|
|
|
table_item['pred_html'] = normalized_html
|
|
|
changes_count += len([1 for o, n in zip(original_html, normalized_html) if o != n])
|
|
|
-
|
|
|
+
|
|
|
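+ # Check table consistency between parsing_res_list and table_res_list (this only gathers counts; an actual repair would need more complex logic)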
|
|
|
+ # 统计表格数量
|
|
|
+ parsing_res_tables_count = 0
|
|
|
+ table_res_list_count = 0
|
|
|
+ if 'parsing_res_list' in converted_json:
|
|
|
+ parsing_res_tables_count = len([item for item in converted_json['parsing_res_list']
|
|
|
+ if 'block_label' in item and item['block_label'] == 'table'])
|
|
|
+ if 'table_res_list' in converted_json:
|
|
|
+ table_res_list_count = len(converted_json["table_res_list"])
|
|
|
+ table_consistency_fixed = False
|
|
|
+ if parsing_res_tables_count != table_res_list_count:
|
|
|
+ warnings.warn(f"⚠️ Warning: {filename} Table count mismatch - parsing_res_list has {parsing_res_tables_count} tables, "
|
|
|
+ f"but table_res_list has {table_res_list_count} tables.")
|
|
|
+ table_consistency_fixed = True
|
|
|
+ # TODO: actual repair logic could be added here, e.g. adding or removing table items as needed
|
|
|
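+ # No concrete repair rules exist yet, so for now this only counts and warns. NOTE(review): table_consistency_fixed is set to True on a mismatch even though nothing is actually repaired.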
|
|
|
+
|
|
|
# 3. 标准化 overall_ocr_res 中的识别文本
|
|
|
# ocr_res = converted_json.get('overall_ocr_res', {})
|
|
|
# if 'rec_texts' in ocr_res:
|
|
|
@@ -240,7 +257,10 @@ def convert_api_result_to_json(api_result: Dict[str, Any],
|
|
|
converted_json['processing_info'] = {
|
|
|
"normalize_numbers": normalize_numbers,
|
|
|
"changes_applied": changes_count > 0,
|
|
|
- "character_changes_count": changes_count
|
|
|
+ "character_changes_count": changes_count,
|
|
|
+ "parsing_res_tables_count": parsing_res_tables_count,
|
|
|
+ "table_res_list_count": table_res_list_count,
|
|
|
+ "table_consistency_fixed": table_consistency_fixed
|
|
|
}
|
|
|
|
|
|
# if changes_count > 0:
|
|
|
@@ -656,7 +676,8 @@ if __name__ == "__main__":
|
|
|
|
|
|
# 默认配置
|
|
|
default_config = {
|
|
|
- "input_dir": "../../OmniDocBench/OpenDataLab___OmniDocBench/images",
|
|
|
+ "input_file": "/Users/zhch158/workspace/data/至远彩色印刷工业有限公司/data_PPStructureV3_Results/2023年度报告母公司/2023年度报告母公司_page_027.png",
|
|
|
+ # "input_dir": "../../OmniDocBench/OpenDataLab___OmniDocBench/images",
|
|
|
"output_dir": "./OmniDocBench_API_Results",
|
|
|
"api_url": "http://10.192.72.11:8111/layout-parsing",
|
|
|
"timeout": "300",
|