# zhengchun/ocr_verify
"""
Bounding-box extraction module.

Extracts text-box information from PaddleOCR results.
"""
from typing import List, Dict


class BBoxExtractor:
    """Extract bounding-box records from PaddleOCR output and table HTML."""

    @staticmethod
    def extract_paddle_text_boxes(paddle_data: Dict) -> List[Dict]:
        """
        Extract the recognised text boxes from a PaddleOCR result.

        Args:
            paddle_data: PaddleOCR output. The boxes are read from the
                ``rec_texts`` / ``rec_polys`` / ``rec_scores`` lists stored
                under its ``overall_ocr_res`` key.

        Returns:
            One dict per non-blank text, with keys ``text``, ``bbox``
            (``[x_min, y_min, x_max, y_max]``), ``poly``, ``score``,
            ``paddle_bbox_index`` (position in the PaddleOCR lists) and
            ``used`` (initialised to ``False``). The list is empty when
            ``overall_ocr_res`` is missing or empty.
        """
        ocr_res = paddle_data.get('overall_ocr_res')
        if not ocr_res:
            return []

        rec_texts = ocr_res.get('rec_texts', [])
        rec_polys = ocr_res.get('rec_polys', [])
        rec_scores = ocr_res.get('rec_scores')
        if rec_scores is None:
            rec_scores = []
        score_count = len(rec_scores)

        text_boxes = []
        # Only texts and polygons are zipped: a missing or short score list
        # must not discard otherwise valid boxes, so the score is looked up
        # by index and falls back to 0.0.
        for i, (text, poly) in enumerate(zip(rec_texts, rec_polys)):
            if not (text and text.strip()):
                continue  # blank recognition result carries no information

            text_boxes.append({
                'text': text,
                'bbox': BBoxExtractor._poly_to_bbox(poly),
                'poly': poly,
                'score': rec_scores[i] if i < score_count else 0.0,
                # Index into the original PaddleOCR lists, not into the
                # filtered output list.
                'paddle_bbox_index': i,
                'used': False,
            })

        return text_boxes

    @staticmethod
    def _poly_to_bbox(poly: List[List[float]]) -> List[float]:
        """
        Convert a polygon to its axis-aligned bounding box.

        Args:
            poly: Sequence of points; index 0 of each point is x, index 1 is y.

        Returns:
            ``[x_min, y_min, x_max, y_max]``.

        Raises:
            ValueError: If ``poly`` contains no points.
        """
        xs = [p[0] for p in poly]
        ys = [p[1] for p in poly]
        return [min(xs), min(ys), max(xs), max(ys)]

    @staticmethod
    def extract_table_cells_with_bbox(merged_data: List[Dict]) -> List[Dict]:
        """
        Collect every table cell that carries a ``data-bbox`` attribute.

        Args:
            merged_data: Merged items. Only items whose ``type`` is
                ``'table'`` are inspected; their HTML is taken from
                ``table_body_with_bbox``, or from ``table_body`` when that
                key is absent.

        Returns:
            One dict per ``<td>``/``<th>`` with a parseable ``data-bbox``,
            with keys ``text``, ``bbox``, ``row``, ``col``, ``score``
            (``data-score``, default 0) and ``paddle_index``
            (``data-paddle-index``, default -1). Cells whose attributes
            cannot be parsed are skipped.
        """
        import json
        from bs4 import BeautifulSoup

        cells = []

        for item in merged_data:
            # .get() so an item without a 'type' key is skipped, not a KeyError.
            if item.get('type') != 'table':
                continue

            html = item.get('table_body_with_bbox', item.get('table_body', ''))
            if not html:
                continue  # nothing to parse (also guards against None)

            soup = BeautifulSoup(html, 'html.parser')

            for row_idx, row in enumerate(soup.find_all('tr')):
                for col_idx, cell in enumerate(row.find_all(['td', 'th'])):
                    bbox_str = cell.get('data-bbox', '')
                    if not bbox_str:
                        continue

                    try:
                        record = {
                            'text': cell.get_text(strip=True),
                            'bbox': json.loads(bbox_str),
                            'row': row_idx,
                            'col': col_idx,
                            'score': float(cell.get('data-score', 0)),
                            'paddle_index': int(cell.get('data-paddle-index', -1)),
                        }
                    except (json.JSONDecodeError, ValueError):
                        # Best effort: a cell with malformed attributes is
                        # dropped rather than aborting the whole extraction.
                        continue

                    cells.append(record)

        return cells