|
|
@@ -183,7 +183,7 @@ class ElementProcessors:
|
|
|
image: np.ndarray,
|
|
|
bbox: List[float],
|
|
|
pre_matched_spans: Optional[List[Dict[str, Any]]] = None
|
|
|
- ) -> Tuple[np.ndarray, List[Dict[str, Any]], int, str]:
|
|
|
+ ) -> Tuple[np.ndarray, List[Dict[str, Any]], int, str, int]:
|
|
|
"""
|
|
|
表格OCR预处理(共享逻辑)
|
|
|
|
|
|
@@ -195,10 +195,19 @@ class ElementProcessors:
|
|
|
pre_matched_spans: 预匹配的 OCR spans
|
|
|
|
|
|
Returns:
|
|
|
- (cropped_table, ocr_boxes, table_angle, ocr_source)
|
|
|
+ (cropped_table, ocr_boxes, table_angle, ocr_source, crop_padding)
|
|
|
其中 cropped_table 已经过方向检测和旋转处理
|
|
|
+ crop_padding: 裁剪时添加的 padding 值
|
|
|
"""
|
|
|
- cropped_table = CoordinateUtils.crop_region(image, bbox)
|
|
|
+ # 计算表格区域尺寸,用于确定合适的padding
|
|
|
+ table_width = bbox[2] - bbox[0]
|
|
|
+ table_height = bbox[3] - bbox[1]
|
|
|
+
|
|
|
+ # 为倾斜图片添加padding,确保角落内容不被切掉
|
|
|
+ # padding = 1% of the smaller of the table's width and height, with a minimum of 20 pixels
|
|
|
+ crop_padding = max(20, int(min(table_width, table_height) * 0.01))
|
|
|
+
|
|
|
+ cropped_table = CoordinateUtils.crop_region(image, bbox, padding=crop_padding)
|
|
|
table_angle = 0
|
|
|
|
|
|
# 1. 表格方向检测
|
|
|
@@ -214,6 +223,11 @@ class ElementProcessors:
|
|
|
ocr_boxes = []
|
|
|
ocr_source = "none"
|
|
|
|
|
|
+ # 计算裁剪后图像的起始坐标(考虑 padding)
|
|
|
+ # 裁剪后图像的 (0, 0) 对应原图的 (bbox[0] - crop_padding, bbox[1] - crop_padding)
|
|
|
+ cropped_offset_x = bbox[0] - crop_padding
|
|
|
+ cropped_offset_y = bbox[1] - crop_padding
|
|
|
+
|
|
|
if pre_matched_spans and len(pre_matched_spans) > 0 and table_angle == 0:
|
|
|
# 使用整页 OCR 的结果
|
|
|
for idx, span in enumerate(pre_matched_spans):
|
|
|
@@ -222,11 +236,11 @@ class ElementProcessors:
|
|
|
span_bbox = span.get('bbox', [])
|
|
|
|
|
|
if span_poly:
|
|
|
- # 如果有 poly 数据,直接使用(需要转换为相对坐标)
|
|
|
+ # 如果有 poly 数据,转换为相对于裁剪后图像的坐标(考虑 padding)
|
|
|
if isinstance(span_poly[0], (list, tuple)) and len(span_poly) >= 4:
|
|
|
- # 转换为相对坐标(相对于表格区域)
|
|
|
+ # 转换为相对坐标(相对于裁剪后图像的 (0, 0))
|
|
|
relative_poly = [
|
|
|
- [float(p[0]) - bbox[0], float(p[1]) - bbox[1]]
|
|
|
+ [float(p[0]) - cropped_offset_x, float(p[1]) - cropped_offset_y]
|
|
|
for p in span_poly[:4]
|
|
|
]
|
|
|
formatted_box = CoordinateUtils.convert_ocr_to_matcher_format(
|
|
|
@@ -239,12 +253,12 @@ class ElementProcessors:
|
|
|
if formatted_box:
|
|
|
ocr_boxes.append(formatted_box)
|
|
|
elif span_bbox and len(span_bbox) >= 4:
|
|
|
- # 兜底:使用 bbox 数据
|
|
|
+ # 兜底:使用 bbox 数据,转换为相对于裁剪后图像的坐标(考虑 padding)
|
|
|
relative_bbox = [
|
|
|
- span_bbox[0] - bbox[0],
|
|
|
- span_bbox[1] - bbox[1],
|
|
|
- span_bbox[2] - bbox[0],
|
|
|
- span_bbox[3] - bbox[1]
|
|
|
+ span_bbox[0] - cropped_offset_x,
|
|
|
+ span_bbox[1] - cropped_offset_y,
|
|
|
+ span_bbox[2] - cropped_offset_x,
|
|
|
+ span_bbox[3] - cropped_offset_y
|
|
|
]
|
|
|
formatted_box = CoordinateUtils.convert_ocr_to_matcher_format(
|
|
|
relative_bbox,
|
|
|
@@ -296,14 +310,16 @@ class ElementProcessors:
|
|
|
except Exception as e:
|
|
|
logger.warning(f"Table OCR failed: {e}")
|
|
|
|
|
|
- return cropped_table, ocr_boxes, table_angle, ocr_source
|
|
|
+ return cropped_table, ocr_boxes, table_angle, ocr_source, crop_padding
|
|
|
|
|
|
def process_table_element_wired(
|
|
|
self,
|
|
|
image: np.ndarray,
|
|
|
layout_item: Dict[str, Any],
|
|
|
scale: float,
|
|
|
- pre_matched_spans: Optional[List[Dict[str, Any]]] = None
|
|
|
+ pre_matched_spans: Optional[List[Dict[str, Any]]] = None,
|
|
|
+ output_dir: Optional[str] = None,
|
|
|
+ basename: Optional[str] = None
|
|
|
) -> Dict[str, Any]:
|
|
|
"""
|
|
|
使用 UNet 有线表格识别处理表格元素
|
|
|
@@ -324,8 +340,8 @@ class ElementProcessors:
|
|
|
"""
|
|
|
bbox = layout_item.get('bbox', [0, 0, 0, 0])
|
|
|
|
|
|
- # OCR 预处理(返回已旋转的表格图片 + OCR 框)
|
|
|
- cropped_table, ocr_boxes, table_angle, ocr_source = \
|
|
|
+ # OCR 预处理(返回已旋转的表格图片 + OCR 框 + padding)
|
|
|
+ cropped_table, ocr_boxes, table_angle, ocr_source, crop_padding = \
|
|
|
self._prepare_table_ocr(image, bbox, pre_matched_spans)
|
|
|
|
|
|
# 获取裁剪后表格图片的尺寸
|
|
|
@@ -340,10 +356,19 @@ class ElementProcessors:
|
|
|
if not self.wired_table_recognizer:
|
|
|
raise RuntimeError("Wired table recognizer not available")
|
|
|
|
|
|
+ # 构造调试选项覆盖
|
|
|
+ debug_opts_override = {}
|
|
|
+ if output_dir:
|
|
|
+ debug_opts_override['output_dir'] = output_dir
|
|
|
+ if basename:
|
|
|
+ # 使用完整 basename 作为前缀 (如 "filename_page_001")
|
|
|
+ debug_opts_override['prefix'] = basename
|
|
|
+
|
|
|
wired_res = self.wired_table_recognizer.recognize(
|
|
|
table_image=cropped_table,
|
|
|
# ocr_boxes=ocr_boxes_for_wired,
|
|
|
ocr_boxes=ocr_boxes,
|
|
|
+ debug_options=debug_opts_override
|
|
|
)
|
|
|
|
|
|
if not (wired_res.get('html') or wired_res.get('cells')):
|
|
|
@@ -359,26 +384,29 @@ class ElementProcessors:
|
|
|
return self._create_empty_table_result(layout_item, bbox, table_angle, ocr_source)
|
|
|
|
|
|
# 坐标转换:将旋转后的坐标转换回原图坐标
|
|
|
+ # 计算正确的偏移量:裁剪后图像的 (0, 0) 对应原图的 (bbox[0] - crop_padding, bbox[1] - crop_padding)
|
|
|
+ cropped_offset_bbox = [bbox[0] - crop_padding, bbox[1] - crop_padding, bbox[2] + crop_padding, bbox[3] + crop_padding]
|
|
|
+
|
|
|
if table_angle != 0 and MERGER_AVAILABLE:
|
|
|
cells, enhanced_html = CoordinateUtils.inverse_rotate_table_coords(
|
|
|
cells=cells,
|
|
|
html=enhanced_html,
|
|
|
rotation_angle=table_angle,
|
|
|
orig_table_size=orig_table_size,
|
|
|
- table_bbox=bbox
|
|
|
+ table_bbox=cropped_offset_bbox
|
|
|
)
|
|
|
ocr_boxes = CoordinateUtils.inverse_rotate_ocr_boxes(
|
|
|
ocr_boxes=ocr_boxes,
|
|
|
rotation_angle=table_angle,
|
|
|
orig_table_size=orig_table_size,
|
|
|
- table_bbox=bbox
|
|
|
+ table_bbox=cropped_offset_bbox
|
|
|
)
|
|
|
logger.info(f"📐 Wired table coordinates transformed back to original image")
|
|
|
else:
|
|
|
- # 没有旋转,只需要加上表格偏移量
|
|
|
- cells = CoordinateUtils.add_table_offset_to_cells(cells, bbox)
|
|
|
- enhanced_html = CoordinateUtils.add_table_offset_to_html(enhanced_html, bbox)
|
|
|
- ocr_boxes = CoordinateUtils.add_table_offset_to_ocr_boxes(ocr_boxes, bbox)
|
|
|
+ # 没有旋转,使用正确的偏移量(考虑 padding)
|
|
|
+ cells = CoordinateUtils.add_table_offset_to_cells(cells, cropped_offset_bbox)
|
|
|
+ enhanced_html = CoordinateUtils.add_table_offset_to_html(enhanced_html, cropped_offset_bbox)
|
|
|
+ ocr_boxes = CoordinateUtils.add_table_offset_to_ocr_boxes(ocr_boxes, cropped_offset_bbox)
|
|
|
|
|
|
return {
|
|
|
'type': 'table',
|
|
|
@@ -423,8 +451,8 @@ class ElementProcessors:
|
|
|
"""
|
|
|
bbox = layout_item.get('bbox', [0, 0, 0, 0])
|
|
|
|
|
|
- # OCR 预处理(返回已旋转的表格图片 + OCR 框)
|
|
|
- cropped_table, ocr_boxes, table_angle, ocr_source = \
|
|
|
+ # OCR 预处理(返回已旋转的表格图片 + OCR 框 + padding)
|
|
|
+ cropped_table, ocr_boxes, table_angle, ocr_source, crop_padding = \
|
|
|
self._prepare_table_ocr(image, bbox, pre_matched_spans)
|
|
|
|
|
|
# 获取裁剪后表格图片的尺寸
|
|
|
@@ -451,37 +479,42 @@ class ElementProcessors:
|
|
|
|
|
|
if table_html and ocr_boxes and self.table_cell_matcher:
|
|
|
try:
|
|
|
+ # table_bbox 参数是相对于裁剪后图像的,OCR 框已经是相对于裁剪后图像的
|
|
|
+ # 使用裁剪后图像的实际尺寸
|
|
|
enhanced_html, cells, _, skew_angle = self.table_cell_matcher.enhance_table_html_with_bbox(
|
|
|
html=table_html,
|
|
|
paddle_text_boxes=ocr_boxes,
|
|
|
start_pointer=0,
|
|
|
- table_bbox=[0, 0, bbox[2] - bbox[0], bbox[3] - bbox[1]]
|
|
|
+ table_bbox=[0, 0, orig_table_w, orig_table_h]
|
|
|
)
|
|
|
logger.info(f"📊 Matched {len(cells)} cells with coordinates (skew: {skew_angle:.2f}°)")
|
|
|
except Exception as e:
|
|
|
logger.warning(f"Cell coordinate matching failed: {e}")
|
|
|
|
|
|
# 坐标转换:将旋转后的坐标转换回原图坐标
|
|
|
+ # 计算正确的偏移量:裁剪后图像的 (0, 0) 对应原图的 (bbox[0] - crop_padding, bbox[1] - crop_padding)
|
|
|
+ cropped_offset_bbox = [bbox[0] - crop_padding, bbox[1] - crop_padding, bbox[2] + crop_padding, bbox[3] + crop_padding]
|
|
|
+
|
|
|
if table_angle != 0 and MERGER_AVAILABLE:
|
|
|
cells, enhanced_html = CoordinateUtils.inverse_rotate_table_coords(
|
|
|
cells=cells,
|
|
|
html=enhanced_html,
|
|
|
rotation_angle=table_angle,
|
|
|
orig_table_size=orig_table_size,
|
|
|
- table_bbox=bbox
|
|
|
+ table_bbox=cropped_offset_bbox
|
|
|
)
|
|
|
ocr_boxes = CoordinateUtils.inverse_rotate_ocr_boxes(
|
|
|
ocr_boxes=ocr_boxes,
|
|
|
rotation_angle=table_angle,
|
|
|
orig_table_size=orig_table_size,
|
|
|
- table_bbox=bbox
|
|
|
+ table_bbox=cropped_offset_bbox
|
|
|
)
|
|
|
logger.info(f"📐 VLM table coordinates transformed back to original image")
|
|
|
else:
|
|
|
- # 没有旋转,只需要加上表格偏移量
|
|
|
- cells = CoordinateUtils.add_table_offset_to_cells(cells, bbox)
|
|
|
- enhanced_html = CoordinateUtils.add_table_offset_to_html(enhanced_html, bbox)
|
|
|
- ocr_boxes = CoordinateUtils.add_table_offset_to_ocr_boxes(ocr_boxes, bbox)
|
|
|
+ # 没有旋转,使用正确的偏移量(考虑 padding)
|
|
|
+ cells = CoordinateUtils.add_table_offset_to_cells(cells, cropped_offset_bbox)
|
|
|
+ enhanced_html = CoordinateUtils.add_table_offset_to_html(enhanced_html, cropped_offset_bbox)
|
|
|
+ ocr_boxes = CoordinateUtils.add_table_offset_to_ocr_boxes(ocr_boxes, cropped_offset_bbox)
|
|
|
|
|
|
return {
|
|
|
'type': 'table',
|