|
@@ -36,6 +36,12 @@ class TextFiller:
|
|
|
self.min_overlap_area: float = config.get("min_overlap_area", 50.0)
|
|
self.min_overlap_area: float = config.get("min_overlap_area", 50.0)
|
|
|
self.center_cell_ratio: float = config.get("center_cell_ratio", 0.5)
|
|
self.center_cell_ratio: float = config.get("center_cell_ratio", 0.5)
|
|
|
self.other_cell_max_ratio: float = config.get("other_cell_max_ratio", 0.3)
|
|
self.other_cell_max_ratio: float = config.get("other_cell_max_ratio", 0.3)
|
|
|
|
|
+ # OCR box 宽度超过中心单元格宽度 * 该比例 → 视为横向跨格误合并
|
|
|
|
|
+ self.ocr_bbox_width_overflow_ratio: float = config.get("ocr_bbox_width_overflow_ratio", 1.08)
|
|
|
|
|
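+ # Minimum overlap ratio at which a non-center cell blocks the center-cell exemption.
+ # The default (0.15) is below the other_cell_max_ratio default (0.3) so that neighbour overlaps of roughly 20-30% are caught.
+ # NOTE(review): the filter using this value compares overlap ratio only; it does not appear to check that the cell is a horizontal neighbour - confirm intent.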
|
|
|
|
|
+ self.horizontal_secondary_overlap_ratio: float = config.get(
|
|
|
|
|
+ "horizontal_secondary_overlap_ratio", 0.15
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
@staticmethod
|
|
@staticmethod
|
|
|
def calculate_dynamic_confidence_threshold(text: str, base_threshold: float = 0.9) -> float:
|
|
def calculate_dynamic_confidence_threshold(text: str, base_threshold: float = 0.9) -> float:
|
|
@@ -266,6 +272,34 @@ class TextFiller:
|
|
|
logger.debug(f"检测到 OCR box 跨 {len(overlapping_cells)} 个单元格[{', '.join(map(str, overlapping_cells))}]: {ocr_item['text'][:20]}...")
|
|
logger.debug(f"检测到 OCR box 跨 {len(overlapping_cells)} 个单元格[{', '.join(map(str, overlapping_cells))}]: {ocr_item['text'][:20]}...")
|
|
|
|
|
|
|
|
processed_ocr_indices.add(ocr_idx)
|
|
processed_ocr_indices.add(ocr_idx)
|
|
|
|
|
+
|
|
|
|
|
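+ # DISABLED (the block below is commented out): supplementary pass for OCR boxes that were matched to a cell
+ # but are clearly wider than that cell, intended to catch cell-spanning cases the earlier detection missed.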
|
|
|
|
|
+ # for cell_idx, cell_bbox in enumerate(bboxes):
|
|
|
|
|
+ # if not matched_boxes_list[cell_idx]:
|
|
|
|
|
+ # continue
|
|
|
|
|
+ # cell_w = cell_bbox[2] - cell_bbox[0]
|
|
|
|
|
+ # if cell_w <= 0:
|
|
|
|
|
+ # continue
|
|
|
|
|
+ # for box in matched_boxes_list[cell_idx]:
|
|
|
|
|
+ # ocr_bbox = CoordinateUtils.poly_to_bbox(box.get("bbox", []))
|
|
|
|
|
+ # if not ocr_bbox or len(ocr_bbox) < 4:
|
|
|
|
|
+ # continue
|
|
|
|
|
+ # ocr_w = ocr_bbox[2] - ocr_bbox[0]
|
|
|
|
|
+ # if ocr_w <= cell_w * self.ocr_bbox_width_overflow_ratio:
|
|
|
|
|
+ # continue
|
|
|
|
|
+ # cx = (ocr_bbox[0] + ocr_bbox[2]) / 2
|
|
|
|
|
+ # cy = (ocr_bbox[1] + ocr_bbox[3]) / 2
|
|
|
|
|
+ # spanning = self.detect_ocr_box_spanning_cells(
|
|
|
|
|
+ # ocr_bbox, bboxes, center_point=(cx, cy)
|
|
|
|
|
+ # )
|
|
|
|
|
+ # targets = spanning if len(spanning) >= 2 else [cell_idx]
|
|
|
|
|
+ # for tidx in targets:
|
|
|
|
|
+ # if tidx not in need_reocr_indices:
|
|
|
|
|
+ # need_reocr_indices.append(tidx)
|
|
|
|
|
+ # logger.debug(
|
|
|
|
|
+ # f"OCR box 宽度({ocr_w:.0f})超出单元格{cell_idx}宽度({cell_w:.0f}),"
|
|
|
|
|
+ # f"标记重识别: {targets}"
|
|
|
|
|
+ # )
|
|
|
|
|
|
|
|
return texts, scores, matched_boxes_list, need_reocr_indices
|
|
return texts, scores, matched_boxes_list, need_reocr_indices
|
|
|
|
|
|
|
@@ -383,19 +417,30 @@ class TextFiller:
|
|
|
if is_overlapping:
|
|
if is_overlapping:
|
|
|
cell_overlaps.append((idx, overlap_ratio))
|
|
cell_overlaps.append((idx, overlap_ratio))
|
|
|
|
|
|
|
|
- # 如果中心点在某个单元格内,且该单元格的重叠比例符合阈值,且没有其他单元格达到次要阈值,则不标记为跨单元格
|
|
|
|
|
|
|
+ # 中心单元格占主导时可豁免跨格标记,但横向误合并(OCR 框过宽 / 邻格有显著重叠)除外
|
|
|
if center_cell_idx is not None and cell_overlaps:
|
|
if center_cell_idx is not None and cell_overlaps:
|
|
|
- # 找到中心点所在单元格的重叠比例
|
|
|
|
|
- center_overlap = next((overlap for idx, overlap in cell_overlaps if idx == center_cell_idx), None)
|
|
|
|
|
|
|
+ center_overlap = next(
|
|
|
|
|
+ (overlap for idx, overlap in cell_overlaps if idx == center_cell_idx), None
|
|
|
|
|
+ )
|
|
|
if center_overlap is not None and center_overlap >= self.center_cell_ratio:
|
|
if center_overlap is not None and center_overlap >= self.center_cell_ratio:
|
|
|
- # 检查是否有其他单元格的重叠比例也超过次要阈值
|
|
|
|
|
- other_high_overlaps = [idx for idx, overlap in cell_overlaps
|
|
|
|
|
- if idx != center_cell_idx and overlap >= self.other_cell_max_ratio]
|
|
|
|
|
- if not other_high_overlaps:
|
|
|
|
|
- # 中心点所在单元格占主导,不应该标记为跨单元格
|
|
|
|
|
|
|
+ other_high_overlaps = [
|
|
|
|
|
+ idx for idx, overlap in cell_overlaps
|
|
|
|
|
+ if idx != center_cell_idx and overlap >= self.other_cell_max_ratio
|
|
|
|
|
+ ]
|
|
|
|
|
+ other_horizontal_overlaps = [
|
|
|
|
|
+ idx for idx, overlap in cell_overlaps
|
|
|
|
|
+ if idx != center_cell_idx
|
|
|
|
|
+ and overlap >= self.horizontal_secondary_overlap_ratio
|
|
|
|
|
+ ]
|
|
|
|
|
+ center_cell = cell_bboxes[center_cell_idx]
|
|
|
|
|
+ center_w = center_cell[2] - center_cell[0]
|
|
|
|
|
+ width_overflow = (
|
|
|
|
|
+ center_w > 0
|
|
|
|
|
+ and ocr_width > center_w * self.ocr_bbox_width_overflow_ratio
|
|
|
|
|
+ )
|
|
|
|
|
+ if not other_high_overlaps and not other_horizontal_overlaps and not width_overflow:
|
|
|
return []
|
|
return []
|
|
|
|
|
|
|
|
- # 返回所有满足阈值的单元格索引
|
|
|
|
|
return [idx for idx, _ in cell_overlaps]
|
|
return [idx for idx, _ in cell_overlaps]
|
|
|
|
|
|
|
|
def second_pass_ocr_fill(
|
|
def second_pass_ocr_fill(
|