|
|
@@ -24,26 +24,30 @@ class TableCellMatcher:
|
|
|
def __init__(self, text_matcher: TextMatcher,
|
|
|
x_tolerance: int = 3,
|
|
|
y_tolerance: int = 10,
|
|
|
- inclination_threshold: float = 0.3):
|
|
|
+ skew_threshold: float = 0.3):
|
|
|
"""
|
|
|
Args:
|
|
|
text_matcher: 文本匹配器
|
|
|
x_tolerance: X轴容差(用于列边界判断)
|
|
|
y_tolerance: Y轴容差(用于行分组)
|
|
|
- inclination_threshold: 倾斜校正阈值(度数)
|
|
|
+ skew_threshold: 倾斜校正阈值(度数)
|
|
|
"""
|
|
|
self.text_matcher = text_matcher
|
|
|
self.x_tolerance = x_tolerance
|
|
|
self.y_tolerance = y_tolerance
|
|
|
- self.inclination_threshold = inclination_threshold # 倾斜校正阈值(度数)
|
|
|
+ self.skew_threshold = skew_threshold # 倾斜校正阈值(度数)
|
|
|
|
|
|
def enhance_table_html_with_bbox(self, html: str, paddle_text_boxes: List[Dict],
|
|
|
- start_pointer: int, table_bbox: Optional[List[int]] = None) -> Tuple[str, List[Dict], int]:
|
|
|
+ start_pointer: int, table_bbox: Optional[List[int]] = None) -> Tuple[str, List[Dict], int, float]:
|
|
|
"""
|
|
|
为 HTML 表格添加 bbox 信息(优化版:使用行级动态规划)
|
|
|
+ Returns:
|
|
|
+ (enhanced_html, cells, new_pointer, skew_angle):
|
|
|
+ 增强后的HTML、单元格列表、新指针位置、倾斜角度
|
|
|
"""
|
|
|
soup = BeautifulSoup(html, 'html.parser')
|
|
|
cells = []
|
|
|
+ skew_angle = 0.0
|
|
|
|
|
|
# 🔑 第一步:筛选表格区域内的 paddle boxes
|
|
|
table_region_boxes, actual_table_bbox = self._filter_boxes_in_table_region(
|
|
|
@@ -54,16 +58,16 @@ class TableCellMatcher:
|
|
|
|
|
|
if not table_region_boxes:
|
|
|
print(f"⚠️ 未在表格区域找到 paddle boxes")
|
|
|
- return str(soup), cells, start_pointer
|
|
|
+ return str(soup), cells, start_pointer, skew_angle
|
|
|
|
|
|
print(f"📊 表格区域: {len(table_region_boxes)} 个文本框")
|
|
|
|
|
|
# 🔑 第二步:将表格区域的 boxes 按行分组
|
|
|
- grouped_boxes = self._group_paddle_boxes_by_rows(
|
|
|
+ grouped_boxes, skew_angle = self._group_paddle_boxes_by_rows(
|
|
|
table_region_boxes,
|
|
|
y_tolerance=self.y_tolerance,
|
|
|
auto_correct_skew=True,
|
|
|
- inclination_threshold=self.inclination_threshold
|
|
|
+ skew_threshold=self.skew_threshold
|
|
|
)
|
|
|
|
|
|
# 🔑 第三步:在每组内按 x 坐标排序
|
|
|
@@ -151,7 +155,7 @@ class TableCellMatcher:
|
|
|
|
|
|
print(f" 总计匹配: {len(cells)} 个单元格")
|
|
|
|
|
|
- return str(soup), cells, new_pointer
|
|
|
+ return str(soup), cells, new_pointer, skew_angle
|
|
|
|
|
|
def _merge_boxes_bbox(self, boxes: List[Dict]) -> List[int]:
|
|
|
"""辅助函数:合并多个 box 的坐标"""
|
|
|
@@ -471,7 +475,7 @@ class TableCellMatcher:
|
|
|
def _group_paddle_boxes_by_rows(self, paddle_boxes: List[Dict],
|
|
|
y_tolerance: int = 10,
|
|
|
auto_correct_skew: bool = True,
|
|
|
- inclination_threshold: float = 0.3) -> List[Dict]:
|
|
|
+ skew_threshold: float = 0.3) -> Tuple[List[Dict], float]:
|
|
|
"""
|
|
|
将 paddle_text_boxes 按 y 坐标分组(聚类)- 增强版本
|
|
|
|
|
|
@@ -483,21 +487,22 @@ class TableCellMatcher:
|
|
|
Returns:
|
|
|
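Tuple (groups, skew_angle): groups is the list of row groups, each {'y_center': float, 'boxes': List[Dict]}; skew_angle is the value from BBoxExtractor.calculate_skew_angle (returned even when it does not exceed skew_threshold), or 0.0 when paddle_boxes is empty or auto_correct_skew is False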
|
|
|
"""
|
|
|
+ skew_angle = 0.0
|
|
|
if not paddle_boxes:
|
|
|
- return []
|
|
|
+ return [], skew_angle
|
|
|
|
|
|
# 🎯 步骤 1: 检测并校正倾斜(使用 BBoxExtractor)
|
|
|
if auto_correct_skew:
|
|
|
- rotation_angle = BBoxExtractor.calculate_skew_angle(paddle_boxes)
|
|
|
+ skew_angle = BBoxExtractor.calculate_skew_angle(paddle_boxes)
|
|
|
|
|
|
- if abs(rotation_angle) > inclination_threshold:
|
|
|
+ if abs(skew_angle) > skew_threshold:
|
|
|
max_x = max(box['bbox'][2] for box in paddle_boxes)
|
|
|
max_y = max(box['bbox'][3] for box in paddle_boxes)
|
|
|
image_size = (max_x, max_y)
|
|
|
|
|
|
- print(f" 🔧 校正倾斜角度: {rotation_angle:.2f}°")
|
|
|
+ print(f" 🔧 校正倾斜角度: {skew_angle:.2f}°")
|
|
|
paddle_boxes = BBoxExtractor.correct_boxes_skew(
|
|
|
- paddle_boxes, -rotation_angle, image_size
|
|
|
+ paddle_boxes, -skew_angle, image_size
|
|
|
)
|
|
|
|
|
|
# 🎯 步骤 2: 按校正后的 y 坐标分组
|
|
|
@@ -542,7 +547,7 @@ class TableCellMatcher:
|
|
|
|
|
|
print(f" ✓ 分组完成: {len(groups)} 行")
|
|
|
|
|
|
- return groups
|
|
|
+ return groups, skew_angle
|
|
|
|
|
|
|
|
|
def _match_html_rows_to_paddle_groups(self, html_rows: List,
|