| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258 |
- """
- Bounding-box (bbox) extraction module.
- Extracts text-box information from PaddleOCR results.
- """
import json
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
class BBoxExtractor:
    """Extract text boxes and table-cell boxes from OCR / merged results.

    Every method is static; the class only serves as a namespace.
    """

    # Rotation angles for which an exact inverse coordinate mapping exists.
    _RIGHT_ANGLES = (90, 180, 270)
    # Padding (pixels) added when the image size is inferred from box extents.
    _SIZE_MARGIN = 50
    # Last-resort (width, height); the original code labels this "default A4".
    _DEFAULT_IMAGE_SIZE = (2480, 3508)

    @staticmethod
    def extract_paddle_text_boxes(paddle_data: Dict) -> List[Dict]:
        """Extract the recognised text boxes from a PaddleOCR result.

        Args:
            paddle_data: PaddleOCR output. Reads ``overall_ocr_res``
                (``rec_texts``, ``rec_polys``, ``rec_scores``) and, for the
                rotation handling, ``doc_preprocessor_res`` / ``input_path``.

        Returns:
            One dict per non-blank text with keys ``text``, ``bbox``
            (``[x_min, y_min, x_max, y_max]``), ``poly``, ``score``,
            ``paddle_bbox_index`` (index into the PaddleOCR lists) and
            ``used`` (always ``False``). When the page was rotated by
            90/180/270 degrees the coordinates are mapped back to the
            unrotated image.
        """
        text_boxes: List[Dict[str, Any]] = []

        if 'overall_ocr_res' not in paddle_data:
            return text_boxes

        ocr_res = paddle_data['overall_ocr_res']
        rec_texts = ocr_res.get('rec_texts', [])
        rec_polys = ocr_res.get('rec_polys', [])
        rec_scores = ocr_res.get('rec_scores', [])

        rotation_angle = BBoxExtractor._get_rotation_angle(paddle_data)

        # Only right-angle rotations can be undone. Any other value (0, or a
        # sentinel such as -1) leaves the coordinates untouched, so there is
        # no point in reading the image size or announcing a rotation.
        needs_unrotate = rotation_angle in BBoxExtractor._RIGHT_ANGLES
        orig_image_size = None
        if needs_unrotate:
            orig_image_size = BBoxExtractor._get_original_image_size(paddle_data)
            print(f"🔄 检测到旋转角度: {rotation_angle}°")
            print(f"📐 原始图像尺寸: {orig_image_size[0]} x {orig_image_size[1]}")

        # Scores are looked up by index instead of being zipped in, so a
        # short or missing ``rec_scores`` list does not silently drop boxes.
        for i, (text, poly) in enumerate(zip(rec_texts, rec_polys)):
            if not (text and text.strip()):
                continue
            # len() instead of truthiness: polys may be numpy arrays.
            if poly is None or len(poly) == 0:
                continue

            score = rec_scores[i] if i < len(rec_scores) else 0.0

            if needs_unrotate:
                poly = BBoxExtractor._inverse_rotate_coordinates(
                    poly, rotation_angle, orig_image_size
                )

            text_boxes.append({
                'text': text,
                'bbox': BBoxExtractor._poly_to_bbox(poly),
                'poly': poly,
                'score': score,
                'paddle_bbox_index': i,
                'used': False,
            })

        return text_boxes

    @staticmethod
    def _get_rotation_angle(paddle_data: Dict) -> float:
        """Return ``doc_preprocessor_res['angle']`` as a float.

        Returns 0.0 when the section is missing, is not a dict, has no
        ``angle`` key, or the value cannot be converted to a number.
        """
        doc_res = paddle_data.get('doc_preprocessor_res')
        if not isinstance(doc_res, dict):
            return 0.0

        try:
            return float(doc_res.get('angle', 0.0))
        except (TypeError, ValueError):
            # e.g. angle is None or a non-numeric string.
            return 0.0

    @staticmethod
    def _size_from_rotated_extent(max_x: float, max_y: float,
                                  angle: float) -> Optional[Tuple[int, int]]:
        """Turn a coordinate extent into an original-image ``(width, height)``.

        The extent is measured on boxes in the rotated frame, so for 90/270
        degrees width and height are swapped to describe the unrotated image.
        Returns ``None`` when the extent is not positive.
        """
        if not (max_x > 0 and max_y > 0):
            return None

        width = int(max_x) + BBoxExtractor._SIZE_MARGIN
        height = int(max_y) + BBoxExtractor._SIZE_MARGIN
        if angle in (90, 270):
            width, height = height, width
        return (width, height)

    @staticmethod
    def _get_original_image_size(paddle_data: Dict) -> Tuple[int, int]:
        """Determine the size of the original (unrotated) image.

        Tried in order:
          1. open ``paddle_data['input_path']`` with Pillow;
          2. infer from the extent of ``layout_det_res['boxes']``;
          3. infer from the extent of ``overall_ocr_res['rec_polys']``;
          4. fall back to a fixed default size.

        Args:
            paddle_data: PaddleOCR output.

        Returns:
            ``(width, height)``.
        """
        input_path = paddle_data.get('input_path')

        if input_path and Path(input_path).exists():
            try:
                # Imported lazily and inside the try so that a missing Pillow
                # degrades to the coordinate-based fallbacks below.
                from PIL import Image

                with Image.open(input_path) as img:
                    return img.size  # (width, height)
            except Exception as e:  # best effort: any failure -> fallbacks
                print(f"⚠️ 无法读取图像文件 {input_path}: {e}")

        angle = BBoxExtractor._get_rotation_angle(paddle_data)

        # Fallback 1: extent of the layout boxes ([x1, y1, x2, y2, ...]).
        # NOTE(review): assumes layout boxes share the rotated frame of the
        # OCR polygons - confirm against the pipeline that produced the data.
        layout_res = paddle_data.get('layout_det_res')
        if isinstance(layout_res, dict):
            coords = [box.get('coordinate', []) for box in (layout_res.get('boxes') or [])]
            coords = [c for c in coords if len(c) >= 4]
            size = BBoxExtractor._size_from_rotated_extent(
                max([0] + [c[2] for c in coords]),
                max([0] + [c[3] for c in coords]),
                angle,
            )
            if size is not None:
                return size

        # Fallback 2: extent of the OCR polygons (rotated-frame coordinates).
        if 'overall_ocr_res' in paddle_data:
            rec_polys = paddle_data['overall_ocr_res'].get('rec_polys', [])
            if rec_polys is not None and len(rec_polys) > 0:
                points = [point for poly in rec_polys for point in poly]
                size = BBoxExtractor._size_from_rotated_extent(
                    max([0] + [p[0] for p in points]),
                    max([0] + [p[1] for p in points]),
                    angle,
                )
                if size is not None:
                    return size

        print("⚠️ 无法确定原始图像尺寸,使用默认值")
        return BBoxExtractor._DEFAULT_IMAGE_SIZE

    @staticmethod
    def _inverse_rotate_coordinates(poly: List[List[float]],
                                    angle: float,
                                    orig_image_size: tuple) -> List[List[float]]:
        """Map polygon points from the rotated image back to the original.

        The forward transforms being inverted (W, H = original width/height,
        rotation with canvas expansion as described in the original notes):

          * 90:  ``x_rot = y``,     ``y_rot = W - x``
          * 270: ``x_rot = H - y``, ``y_rot = x``
          * 180: ``x_rot = W - x``, ``y_rot = H - y``

        For 90 degrees the rotated image is H wide and W tall, so ``y_rot``
        spans ``[0, W]`` and the inverse must subtract from W (not H).

        Args:
            poly: Points ``[[x', y'], ...]`` on the rotated image.
            angle: Rotation angle in degrees.
            orig_image_size: ``(width, height)`` of the original image.

        Returns:
            Points ``[[x, y], ...]`` on the original image. Angles other than
            90/180/270 return the points unchanged (as a new list).
        """
        orig_width, orig_height = orig_image_size

        if angle == 90:
            return [[orig_width - p[1], p[0]] for p in poly]
        if angle == 270:
            return [[p[1], orig_height - p[0]] for p in poly]
        if angle == 180:
            return [[orig_width - p[0], orig_height - p[1]] for p in poly]

        # 0 degrees or an unsupported angle: no transformation.
        return [[p[0], p[1]] for p in poly]

    @staticmethod
    def _poly_to_bbox(poly: List[List[float]]) -> List[float]:
        """Return the axis-aligned bbox ``[x_min, y_min, x_max, y_max]`` of ``poly``."""
        xs = [p[0] for p in poly]
        ys = [p[1] for p in poly]
        return [min(xs), min(ys), max(xs), max(ys)]

    @staticmethod
    def extract_table_cells_with_bbox(merged_data: List[Dict]) -> List[Dict]:
        """Collect every table cell that carries a ``data-bbox`` attribute.

        Args:
            merged_data: Merged items; only those with ``type == 'table'``
                are inspected. Their HTML is taken from
                ``table_body_with_bbox``, falling back to ``table_body``.

        Returns:
            One dict per ``<td>``/``<th>`` with a parseable ``data-bbox``:
            ``text``, ``bbox``, ``row``, ``col``, ``score`` (from
            ``data-score``, default 0) and ``paddle_index`` (from
            ``data-paddle-index``, default -1). Cells whose attributes cannot
            be parsed are skipped.
        """
        tables = [item for item in merged_data if item.get('type') == 'table']
        if not tables:
            return []

        # Imported only when there is HTML to parse.
        from bs4 import BeautifulSoup

        cells: List[Dict[str, Any]] = []

        for item in tables:
            html = item.get('table_body_with_bbox') or item.get('table_body') or ''
            soup = BeautifulSoup(html, 'html.parser')

            for row_idx, row in enumerate(soup.find_all('tr')):
                for col_idx, cell in enumerate(row.find_all(['td', 'th'])):
                    bbox_str = cell.get('data-bbox', '')
                    if not bbox_str:
                        continue

                    try:
                        bbox = json.loads(bbox_str)
                        score = float(cell.get('data-score', 0))
                        paddle_index = int(cell.get('data-paddle-index', -1))
                    except ValueError:
                        # json.JSONDecodeError is a ValueError subclass.
                        # Malformed attributes: skip this cell on purpose.
                        continue

                    cells.append({
                        'text': cell.get_text(strip=True),
                        'bbox': bbox,
                        'row': row_idx,
                        'col': col_idx,
                        'score': score,
                        'paddle_index': paddle_index,
                    })

        return cells
|