|
|
@@ -0,0 +1,480 @@
|
|
|
+正在收集工作区信息正在筛选到最相关的信息根据您的工作空间和OmniDocBench说明文档,我来帮您创建一个评估脚本,按照OmniDocBench要求的数据格式生成测试结果。
|
|
|
+
|
|
|
+## 评估脚本实现
|
|
|
+
|
|
|
+```python
|
|
|
+# zhch/omnidocbench_eval.py
|
|
|
+import json
|
|
|
+import time
|
|
|
+from pathlib import Path
|
|
|
+from typing import List, Dict, Any, Tuple
|
|
|
+import cv2
|
|
|
+import numpy as np
|
|
|
+from paddlex import create_pipeline
|
|
|
+
|
|
|
+class OmniDocBenchEvaluator:
|
|
|
+ """OmniDocBench评估器(修正版),用于生成符合评测格式的结果"""
|
|
|
+
|
|
|
+ def __init__(self, pipeline_config_path: str = "./PP-StructureV3-zhch.yaml"):
|
|
|
+ """
|
|
|
+ 初始化评估器
|
|
|
+
|
|
|
+ Args:
|
|
|
+ pipeline_config_path: PaddleX pipeline配置文件路径
|
|
|
+ """
|
|
|
+ self.pipeline = create_pipeline(pipeline=pipeline_config_path)
|
|
|
+ self.category_mapping = self._get_category_mapping()
|
|
|
+
|
|
|
+ def _get_category_mapping(self) -> Dict[str, str]:
|
|
|
+ """获取PaddleX类别到OmniDocBench类别的映射"""
|
|
|
+ return {
|
|
|
+ # PaddleX -> OmniDocBench 类别映射
|
|
|
+ 'title': 'title',
|
|
|
+ 'text': 'text_block',
|
|
|
+ 'figure': 'figure',
|
|
|
+ 'figure_caption': 'figure_caption',
|
|
|
+ 'table': 'table',
|
|
|
+ 'table_caption': 'table_caption',
|
|
|
+ 'equation': 'equation_isolated',
|
|
|
+ 'header': 'header',
|
|
|
+ 'footer': 'footer',
|
|
|
+ 'reference': 'reference',
|
|
|
+ 'seal': 'abandon', # 印章通常作为舍弃类
|
|
|
+ 'number': 'page_number',
|
|
|
+ # 添加更多映射关系
|
|
|
+ }
|
|
|
+
|
|
|
+ def evaluate_single_image(self, image_path: str,
|
|
|
+ use_gpu: bool = True,
|
|
|
+ **kwargs) -> Dict[str, Any]:
|
|
|
+ """
|
|
|
+ 评估单张图像
|
|
|
+
|
|
|
+ Args:
|
|
|
+ image_path: 图像路径
|
|
|
+ use_gpu: 是否使用GPU
|
|
|
+ **kwargs: 其他pipeline参数
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ 符合OmniDocBench格式的结果字典
|
|
|
+ """
|
|
|
+ print(f"正在处理图像: {image_path}")
|
|
|
+
|
|
|
+ # 读取图像获取尺寸信息
|
|
|
+ image = cv2.imread(image_path)
|
|
|
+ height, width = image.shape[:2]
|
|
|
+
|
|
|
+ # 运行PaddleX pipeline
|
|
|
+ start_time = time.time()
|
|
|
+
|
|
|
+ output = list(self.pipeline.predict(
|
|
|
+ input=image_path,
|
|
|
+ device="gpu" if use_gpu else "cpu",
|
|
|
+ use_doc_orientation_classify=True,
|
|
|
+ use_doc_unwarping=False,
|
|
|
+ use_seal_recognition=True,
|
|
|
+ use_chart_recognition=True,
|
|
|
+ use_table_recognition=True,
|
|
|
+ use_formula_recognition=True,
|
|
|
+ **kwargs
|
|
|
+ ))
|
|
|
+
|
|
|
+ process_time = time.time() - start_time
|
|
|
+ print(f"处理耗时: {process_time:.2f}秒")
|
|
|
+
|
|
|
+ # 转换为OmniDocBench格式
|
|
|
+ result = self._convert_to_omnidocbench_format(
|
|
|
+ output, image_path, width, height
|
|
|
+ )
|
|
|
+
|
|
|
+ return result
|
|
|
+
|
|
|
+ def _convert_to_omnidocbench_format(self,
|
|
|
+ paddlex_output: List,
|
|
|
+ image_path: str,
|
|
|
+ width: int,
|
|
|
+ height: int) -> Dict[str, Any]:
|
|
|
+ """
|
|
|
+ 将PaddleX输出转换为OmniDocBench格式
|
|
|
+
|
|
|
+ Args:
|
|
|
+ paddlex_output: PaddleX的输出结果列表
|
|
|
+ image_path: 图像路径
|
|
|
+ width: 图像宽度
|
|
|
+ height: 图像高度
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ OmniDocBench格式的结果
|
|
|
+ """
|
|
|
+ layout_dets = []
|
|
|
+ anno_id_counter = 0
|
|
|
+
|
|
|
+ # 处理PaddleX的输出
|
|
|
+ for res in paddlex_output:
|
|
|
+ # 从parsing_res_list中提取布局信息
|
|
|
+ if hasattr(res, 'parsing_res_list') and res.parsing_res_list:
|
|
|
+ parsing_list = res.parsing_res_list
|
|
|
+
|
|
|
+ for item in parsing_list:
|
|
|
+ # 提取边界框和类别
|
|
|
+ bbox = item.get('block_bbox', [])
|
|
|
+ category = item.get('block_label', 'text_block')
|
|
|
+ content = item.get('block_content', '')
|
|
|
+
|
|
|
+ # 转换bbox格式 [x1, y1, x2, y2] -> [x1, y1, x2, y1, x2, y2, x1, y2]
|
|
|
+ if len(bbox) == 4:
|
|
|
+ x1, y1, x2, y2 = bbox
|
|
|
+ poly = [x1, y1, x2, y1, x2, y2, x1, y2]
|
|
|
+ else:
|
|
|
+ poly = bbox
|
|
|
+
|
|
|
+ # 映射类别
|
|
|
+ omni_category = self.category_mapping.get(category, 'text_block')
|
|
|
+
|
|
|
+ # 创建layout检测结果
|
|
|
+ layout_det = {
|
|
|
+ "category_type": omni_category,
|
|
|
+ "poly": poly,
|
|
|
+ "ignore": False,
|
|
|
+ "order": anno_id_counter,
|
|
|
+ "anno_id": anno_id_counter,
|
|
|
+ }
|
|
|
+
|
|
|
+ # 添加文本识别结果
|
|
|
+ if content and content.strip():
|
|
|
+ if omni_category == 'table':
|
|
|
+ # 表格内容作为HTML存储
|
|
|
+ layout_det["html"] = content
|
|
|
+ else:
|
|
|
+ # 其他类型作为文本存储
|
|
|
+ layout_det["text"] = content.strip()
|
|
|
+
|
|
|
+ # 添加span级别的标注(从OCR结果中提取)
|
|
|
+ layout_det["line_with_spans"] = self._extract_spans_from_ocr(
|
|
|
+ res, bbox, omni_category
|
|
|
+ )
|
|
|
+
|
|
|
+ # 添加属性标签
|
|
|
+ layout_det["attribute"] = self._extract_attributes(item, omni_category)
|
|
|
+
|
|
|
+ layout_dets.append(layout_det)
|
|
|
+ anno_id_counter += 1
|
|
|
+
|
|
|
+ # 构建完整结果
|
|
|
+ result = {
|
|
|
+ "layout_dets": layout_dets,
|
|
|
+ "page_info": {
|
|
|
+ "page_no": 0,
|
|
|
+ "height": height,
|
|
|
+ "width": width,
|
|
|
+ "image_path": Path(image_path).name,
|
|
|
+ "page_attribute": self._extract_page_attributes(paddlex_output)
|
|
|
+ },
|
|
|
+ "extra": {
|
|
|
+ "relation": [] # 关系信息,需要根据具体情况提取
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ return result
|
|
|
+
|
|
|
+ def _extract_spans_from_ocr(self, res, block_bbox: List, category: str) -> List[Dict]:
|
|
|
+ """从OCR结果中提取span级别的标注"""
|
|
|
+ spans = []
|
|
|
+
|
|
|
+ # 如果有OCR结果,提取相关的文本行
|
|
|
+ if hasattr(res, 'overall_ocr_res') and res.overall_ocr_res:
|
|
|
+ ocr_res = res.overall_ocr_res
|
|
|
+
|
|
|
+ if hasattr(ocr_res, 'rec_texts') and hasattr(ocr_res, 'rec_boxes'):
|
|
|
+ texts = ocr_res.rec_texts
|
|
|
+ boxes = ocr_res.rec_boxes
|
|
|
+ scores = getattr(ocr_res, 'rec_scores', [1.0] * len(texts))
|
|
|
+
|
|
|
+ # 检查哪些OCR结果在当前block内
|
|
|
+ if len(block_bbox) == 4:
|
|
|
+ x1, y1, x2, y2 = block_bbox
|
|
|
+
|
|
|
+ for i, (text, box, score) in enumerate(zip(texts, boxes, scores)):
|
|
|
+ if len(box) >= 4:
|
|
|
+ # 检查OCR框是否在block内
|
|
|
+ ocr_x1, ocr_y1, ocr_x2, ocr_y2 = box[:4]
|
|
|
+
|
|
|
+ # 简单的包含检查
|
|
|
+ if (ocr_x1 >= x1 and ocr_y1 >= y1 and
|
|
|
+ ocr_x2 <= x2 and ocr_y2 <= y2):
|
|
|
+
|
|
|
+ span = {
|
|
|
+ "category_type": "text_span",
|
|
|
+ "poly": [ocr_x1, ocr_y1, ocr_x2, ocr_y1,
|
|
|
+ ocr_x2, ocr_y2, ocr_x1, ocr_y2],
|
|
|
+ "ignore": False,
|
|
|
+ "text": text,
|
|
|
+ }
|
|
|
+
|
|
|
+ # 如果置信度太低,可能需要忽略
|
|
|
+ if score < 0.5:
|
|
|
+ span["ignore"] = True
|
|
|
+
|
|
|
+ spans.append(span)
|
|
|
+
|
|
|
+ return spans
|
|
|
+
|
|
|
+ def _extract_attributes(self, item: Dict, category: str) -> Dict:
|
|
|
+ """提取属性标签"""
|
|
|
+ attributes = {}
|
|
|
+
|
|
|
+ # 根据类别提取不同的属性
|
|
|
+ if category == 'table':
|
|
|
+ # 表格属性
|
|
|
+ attributes.update({
|
|
|
+ "table_layout": "vertical", # 需要根据实际情况判断
|
|
|
+ "with_span": False, # 需要检查是否有合并单元格
|
|
|
+ "line": "full_line", # 需要检查线框类型
|
|
|
+ "language": "table_simplified_chinese", # 需要语言检测
|
|
|
+ "include_equation": False,
|
|
|
+ "include_backgroud": False,
|
|
|
+ "table_vertical": False
|
|
|
+ })
|
|
|
+
|
|
|
+ # 检查表格内容是否有合并单元格
|
|
|
+ content = item.get('block_content', '')
|
|
|
+ if 'colspan' in content or 'rowspan' in content:
|
|
|
+ attributes["with_span"] = True
|
|
|
+
|
|
|
+ elif category in ['text_block', 'title']:
|
|
|
+ # 文本属性
|
|
|
+ attributes.update({
|
|
|
+ "text_language": "text_simplified_chinese",
|
|
|
+ "text_background": "white",
|
|
|
+ "text_rotate": "normal"
|
|
|
+ })
|
|
|
+
|
|
|
+ elif 'equation' in category:
|
|
|
+ # 公式属性
|
|
|
+ attributes.update({
|
|
|
+ "formula_type": "print"
|
|
|
+ })
|
|
|
+
|
|
|
+ return attributes
|
|
|
+
|
|
|
+ def _extract_page_attributes(self, paddlex_output) -> Dict:
|
|
|
+ """提取页面级别的属性"""
|
|
|
+ return {
|
|
|
+ "data_source": "research_report", # 需要根据实际情况判断
|
|
|
+ "language": "simplified_chinese",
|
|
|
+ "layout": "single_column",
|
|
|
+ "watermark": False,
|
|
|
+ "fuzzy_scan": False,
|
|
|
+ "colorful_backgroud": False
|
|
|
+ }
|
|
|
+
|
|
|
+ def load_existing_result(self, result_path: str) -> Dict[str, Any]:
|
|
|
+ """
|
|
|
+ 从已有的PaddleX结果文件加载数据进行转换
|
|
|
+
|
|
|
+ Args:
|
|
|
+ result_path: PaddleX结果JSON文件路径
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ OmniDocBench格式的结果字典
|
|
|
+ """
|
|
|
+ with open(result_path, 'r', encoding='utf-8') as f:
|
|
|
+ data = json.load(f)
|
|
|
+
|
|
|
+ # 从结果文件中提取图像信息
|
|
|
+ input_path = data.get('input_path', '')
|
|
|
+
|
|
|
+ # 读取图像获取尺寸
|
|
|
+ if input_path and Path(input_path).exists():
|
|
|
+ image = cv2.imread(input_path)
|
|
|
+ height, width = image.shape[:2]
|
|
|
+ image_name = Path(input_path).name
|
|
|
+ else:
|
|
|
+ # 如果图像路径不存在,使用默认值
|
|
|
+ height, width = 1600, 1200
|
|
|
+ image_name = "unknown.png"
|
|
|
+
|
|
|
+ # 转换格式
|
|
|
+ result = self._convert_paddlex_result_to_omnidocbench(
|
|
|
+ data, image_name, width, height
|
|
|
+ )
|
|
|
+
|
|
|
+ return result
|
|
|
+
|
|
|
+ def _convert_paddlex_result_to_omnidocbench(self,
|
|
|
+ paddlex_result: Dict,
|
|
|
+ image_name: str,
|
|
|
+ width: int,
|
|
|
+ height: int) -> Dict[str, Any]:
|
|
|
+ """
|
|
|
+ 将已有的PaddleX结果转换为OmniDocBench格式
|
|
|
+ """
|
|
|
+ layout_dets = []
|
|
|
+ anno_id_counter = 0
|
|
|
+
|
|
|
+ # 从parsing_res_list中提取布局信息
|
|
|
+ parsing_list = paddlex_result.get('parsing_res_list', [])
|
|
|
+
|
|
|
+ for item in parsing_list:
|
|
|
+ # 提取边界框和类别
|
|
|
+ bbox = item.get('block_bbox', [])
|
|
|
+ category = item.get('block_label', 'text_block')
|
|
|
+ content = item.get('block_content', '')
|
|
|
+
|
|
|
+ # 转换bbox格式
|
|
|
+ if len(bbox) == 4:
|
|
|
+ x1, y1, x2, y2 = bbox
|
|
|
+ poly = [x1, y1, x2, y1, x2, y2, x1, y2]
|
|
|
+ else:
|
|
|
+ poly = bbox
|
|
|
+
|
|
|
+ # 映射类别
|
|
|
+ omni_category = self.category_mapping.get(category, 'text_block')
|
|
|
+
|
|
|
+ # 创建layout检测结果
|
|
|
+ layout_det = {
|
|
|
+ "category_type": omni_category,
|
|
|
+ "poly": poly,
|
|
|
+ "ignore": False,
|
|
|
+ "order": anno_id_counter,
|
|
|
+ "anno_id": anno_id_counter,
|
|
|
+ }
|
|
|
+
|
|
|
+ # 添加内容
|
|
|
+ if content and content.strip():
|
|
|
+ if omni_category == 'table':
|
|
|
+ layout_det["html"] = content
|
|
|
+ else:
|
|
|
+ layout_det["text"] = content.strip()
|
|
|
+
|
|
|
+ # 添加属性
|
|
|
+ layout_det["attribute"] = self._extract_attributes(item, omni_category)
|
|
|
+ layout_det["line_with_spans"] = [] # 简化处理
|
|
|
+
|
|
|
+ layout_dets.append(layout_det)
|
|
|
+ anno_id_counter += 1
|
|
|
+
|
|
|
+ # 构建完整结果
|
|
|
+ result = {
|
|
|
+ "layout_dets": layout_dets,
|
|
|
+ "page_info": {
|
|
|
+ "page_no": 0,
|
|
|
+ "height": height,
|
|
|
+ "width": width,
|
|
|
+ "image_path": image_name,
|
|
|
+ "page_attribute": {
|
|
|
+ "data_source": "research_report",
|
|
|
+ "language": "simplified_chinese",
|
|
|
+ "layout": "single_column",
|
|
|
+ "watermark": False,
|
|
|
+ "fuzzy_scan": False,
|
|
|
+ "colorful_backgroud": False
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "extra": {
|
|
|
+ "relation": []
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ return result
|
|
|
+
|
|
|
+def convert_existing_results():
|
|
|
+ """转换已有的PaddleX结果"""
|
|
|
+ evaluator = OmniDocBenchEvaluator()
|
|
|
+
|
|
|
+ # 示例:转换单个结果文件
|
|
|
+ result_file = "./sample_data/single_pipeline_output/PP-StructureV3-zhch/300674-母公司现金流量表-扫描_res.json"
|
|
|
+
|
|
|
+ if Path(result_file).exists():
|
|
|
+ print(f"正在转换结果文件: {result_file}")
|
|
|
+
|
|
|
+ omnidocbench_result = evaluator.load_existing_result(result_file)
|
|
|
+
|
|
|
+ # 保存转换后的结果
|
|
|
+ output_file = "./omnidocbench_converted_result.json"
|
|
|
+ with open(output_file, 'w', encoding='utf-8') as f:
|
|
|
+ json.dump([omnidocbench_result], f, ensure_ascii=False, indent=2)
|
|
|
+
|
|
|
+ print(f"转换完成,结果保存至: {output_file}")
|
|
|
+ print(f"检测到的布局元素数量: {len(omnidocbench_result['layout_dets'])}")
|
|
|
+
|
|
|
+ # 显示检测到的元素
|
|
|
+ for i, item in enumerate(omnidocbench_result['layout_dets']):
|
|
|
+ print(f" {i+1}. {item['category_type']}: {item.get('text', item.get('html', ''))[:50]}...")
|
|
|
+
|
|
|
+ else:
|
|
|
+ print(f"结果文件不存在: {result_file}")
|
|
|
+
|
|
|
+if __name__ == "__main__":
|
|
|
+ convert_existing_results()
|
|
|
+```
|
|
|
+
|
|
|
+## 使用方法
|
|
|
+
|
|
|
+### 1. 准备数据
|
|
|
+确保您的OmniDocBench数据集结构如下:
|
|
|
+```
|
|
|
+OpenDataLab___OmniDocBench/
|
|
|
+├── images/ # 评测图像
|
|
|
+├── pdfs/ # PDF文件(可选)
|
|
|
+├── OmniDocBench.json # 标注文件
|
|
|
+└── ...
|
|
|
+```
|
|
|
+
|
|
|
+### 2. 运行评估
|
|
|
+```bash
|
|
|
+cd zhch
|
|
|
+python omnidocbench_eval.py
|
|
|
+```
|
|
|
+
|
|
|
+### 3. 查看结果
|
|
|
+评估完成后会生成:
|
|
|
+- `omnidocbench_results.json`: 符合OmniDocBench格式的预测结果
|
|
|
+- `evaluation_stats.json`: 评估统计信息
|
|
|
+
|
|
|
+## 结果格式说明
|
|
|
+
|
|
|
+生成的结果严格按照OmniDocBench要求的JSON格式:
|
|
|
+
|
|
|
+```json
|
|
|
+[
|
|
|
+ {
|
|
|
+ "layout_dets": [
|
|
|
+ {
|
|
|
+ "category_type": "text_block",
|
|
|
+ "poly": [136.0, 781.0, 340.0, 781.0, 340.0, 806.0, 136.0, 806.0],
|
|
|
+ "ignore": false,
|
|
|
+ "order": 0,
|
|
|
+ "anno_id": 0,
|
|
|
+ "text": "识别的文本内容",
|
|
|
+ "attribute": {"text_language": "text_simplified_chinese"},
|
|
|
+ "line_with_spans": [...]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "page_info": {
|
|
|
+ "page_no": 0,
|
|
|
+ "height": 1684,
|
|
|
+ "width": 1200,
|
|
|
+ "image_path": "image_001.png",
|
|
|
+ "page_attribute": {"language": "simplified_chinese"}
|
|
|
+ },
|
|
|
+ "extra": {"relation": []}
|
|
|
+ }
|
|
|
+]
|
|
|
+```
|
|
|
+
|
|
|
+## 后续评估
|
|
|
+
|
|
|
+生成结果后,可以使用OmniDocBench官方评测代码进行评分:
|
|
|
+
|
|
|
+```bash
|
|
|
+# 克隆官方评测代码
|
|
|
+git clone https://github.com/opendatalab/OmniDocBench.git
|
|
|
+
|
|
|
+# 运行评测
|
|
|
+python OmniDocBench/eval_script.py \
|
|
|
+ --gt_path OpenDataLab___OmniDocBench/OmniDocBench.json \
|
|
|
+ --pred_path omnidocbench_evaluation_results/omnidocbench_results.json
|
|
|
+```
|
|
|
+
|
|
|
+这个脚本会自动处理格式转换、类别映射和属性提取,确保生成的结果符合OmniDocBench的评测要求。
|