瀏覽代碼

feat: Introduce new wired table processing module with enhanced skew detection, grid recovery, and output capabilities, and update pipeline to utilize it.

zhch158_admin 3 天之前
父節點
當前提交
4f32495604
共有 1 個文件被更改,包括 63 次插入30 次删除
  1. 63 30
      ocr_tools/universal_doc_parser/core/element_processors.py

+ 63 - 30
ocr_tools/universal_doc_parser/core/element_processors.py

@@ -183,7 +183,7 @@ class ElementProcessors:
         image: np.ndarray,
         bbox: List[float],
         pre_matched_spans: Optional[List[Dict[str, Any]]] = None
-    ) -> Tuple[np.ndarray, List[Dict[str, Any]], int, str]:
+    ) -> Tuple[np.ndarray, List[Dict[str, Any]], int, str, int]:
         """
         表格OCR预处理(共享逻辑)
         
@@ -195,10 +195,19 @@ class ElementProcessors:
             pre_matched_spans: 预匹配的 OCR spans
             
         Returns:
-            (cropped_table, ocr_boxes, table_angle, ocr_source)
+            (cropped_table, ocr_boxes, table_angle, ocr_source, crop_padding)
             其中 cropped_table 已经过方向检测和旋转处理
+            crop_padding: 裁剪时添加的 padding 值
         """
-        cropped_table = CoordinateUtils.crop_region(image, bbox)
+        # 计算表格区域尺寸,用于确定合适的padding
+        table_width = bbox[2] - bbox[0]
+        table_height = bbox[3] - bbox[1]
+
+        # 为倾斜图片添加padding,确保角落内容不被切掉
+        # padding = 表格宽度与高度中较小值的1%,最小20像素
+        crop_padding = max(20, int(min(table_width, table_height) * 0.01))
+
+        cropped_table = CoordinateUtils.crop_region(image, bbox, padding=crop_padding)
         table_angle = 0
         
         # 1. 表格方向检测
@@ -214,6 +223,11 @@ class ElementProcessors:
         ocr_boxes = []
         ocr_source = "none"
         
+        # 计算裁剪后图像的起始坐标(考虑 padding)
+        # 裁剪后图像的 (0, 0) 对应原图的 (bbox[0] - crop_padding, bbox[1] - crop_padding)
+        cropped_offset_x = bbox[0] - crop_padding
+        cropped_offset_y = bbox[1] - crop_padding
+        
         if pre_matched_spans and len(pre_matched_spans) > 0 and table_angle == 0:
             # 使用整页 OCR 的结果
             for idx, span in enumerate(pre_matched_spans):
@@ -222,11 +236,11 @@ class ElementProcessors:
                 span_bbox = span.get('bbox', [])
                 
                 if span_poly:
-                    # 如果有 poly 数据,直接使用(需要转换为相对坐标)
+                    # 如果有 poly 数据,转换为相对于裁剪后图像的坐标(考虑 padding)
                     if isinstance(span_poly[0], (list, tuple)) and len(span_poly) >= 4:
-                        # 转换为相对坐标(相对于表格区域
+                        # 转换为相对坐标(相对于裁剪后图像的 (0, 0))
                         relative_poly = [
-                            [float(p[0]) - bbox[0], float(p[1]) - bbox[1]]
+                            [float(p[0]) - cropped_offset_x, float(p[1]) - cropped_offset_y]
                             for p in span_poly[:4]
                         ]
                         formatted_box = CoordinateUtils.convert_ocr_to_matcher_format(
@@ -239,12 +253,12 @@ class ElementProcessors:
                         if formatted_box:
                             ocr_boxes.append(formatted_box)
                 elif span_bbox and len(span_bbox) >= 4:
-                    # 兜底:使用 bbox 数据
+                    # 兜底:使用 bbox 数据,转换为相对于裁剪后图像的坐标(考虑 padding)
                     relative_bbox = [
-                        span_bbox[0] - bbox[0],
-                        span_bbox[1] - bbox[1],
-                        span_bbox[2] - bbox[0],
-                        span_bbox[3] - bbox[1]
+                        span_bbox[0] - cropped_offset_x,
+                        span_bbox[1] - cropped_offset_y,
+                        span_bbox[2] - cropped_offset_x,
+                        span_bbox[3] - cropped_offset_y
                     ]
                     formatted_box = CoordinateUtils.convert_ocr_to_matcher_format(
                         relative_bbox,
@@ -296,14 +310,16 @@ class ElementProcessors:
             except Exception as e:
                 logger.warning(f"Table OCR failed: {e}")
         
-        return cropped_table, ocr_boxes, table_angle, ocr_source
+        return cropped_table, ocr_boxes, table_angle, ocr_source, crop_padding
     
     def process_table_element_wired(
         self,
         image: np.ndarray,
         layout_item: Dict[str, Any],
         scale: float,
-        pre_matched_spans: Optional[List[Dict[str, Any]]] = None
+        pre_matched_spans: Optional[List[Dict[str, Any]]] = None,
+        output_dir: Optional[str] = None,
+        basename: Optional[str] = None
     ) -> Dict[str, Any]:
         """
         使用 UNet 有线表格识别处理表格元素
@@ -324,8 +340,8 @@ class ElementProcessors:
         """
         bbox = layout_item.get('bbox', [0, 0, 0, 0])
         
-        # OCR 预处理(返回已旋转的表格图片 + OCR 框)
-        cropped_table, ocr_boxes, table_angle, ocr_source = \
+        # OCR 预处理(返回已旋转的表格图片 + OCR 框 + padding)
+        cropped_table, ocr_boxes, table_angle, ocr_source, crop_padding = \
             self._prepare_table_ocr(image, bbox, pre_matched_spans)
         
         # 获取裁剪后表格图片的尺寸
@@ -340,10 +356,19 @@ class ElementProcessors:
             if not self.wired_table_recognizer:
                 raise RuntimeError("Wired table recognizer not available")
             
+            # 构造调试选项覆盖
+            debug_opts_override = {}
+            if output_dir:
+                debug_opts_override['output_dir'] = output_dir
+            if basename:
+                # 使用完整 basename 作为前缀 (如 "filename_page_001")
+                debug_opts_override['prefix'] = basename
+
             wired_res = self.wired_table_recognizer.recognize(
                 table_image=cropped_table,
                 # ocr_boxes=ocr_boxes_for_wired,
                 ocr_boxes=ocr_boxes,
+                debug_options=debug_opts_override
             )
             
             if not (wired_res.get('html') or wired_res.get('cells')):
@@ -359,26 +384,29 @@ class ElementProcessors:
             return self._create_empty_table_result(layout_item, bbox, table_angle, ocr_source)
         
         # 坐标转换:将旋转后的坐标转换回原图坐标
+        # 计算正确的偏移量:裁剪后图像的 (0, 0) 对应原图的 (bbox[0] - crop_padding, bbox[1] - crop_padding)
+        cropped_offset_bbox = [bbox[0] - crop_padding, bbox[1] - crop_padding, bbox[2] + crop_padding, bbox[3] + crop_padding]
+        
         if table_angle != 0 and MERGER_AVAILABLE:
             cells, enhanced_html = CoordinateUtils.inverse_rotate_table_coords(
                 cells=cells,
                 html=enhanced_html,
                 rotation_angle=table_angle,
                 orig_table_size=orig_table_size,
-                table_bbox=bbox
+                table_bbox=cropped_offset_bbox
             )
             ocr_boxes = CoordinateUtils.inverse_rotate_ocr_boxes(
                 ocr_boxes=ocr_boxes,
                 rotation_angle=table_angle,
                 orig_table_size=orig_table_size,
-                table_bbox=bbox
+                table_bbox=cropped_offset_bbox
             )
             logger.info(f"📐 Wired table coordinates transformed back to original image")
         else:
-            # 没有旋转,只需要加上表格偏移量
-            cells = CoordinateUtils.add_table_offset_to_cells(cells, bbox)
-            enhanced_html = CoordinateUtils.add_table_offset_to_html(enhanced_html, bbox)
-            ocr_boxes = CoordinateUtils.add_table_offset_to_ocr_boxes(ocr_boxes, bbox)
+            # 没有旋转,使用正确的偏移量(考虑 padding)
+            cells = CoordinateUtils.add_table_offset_to_cells(cells, cropped_offset_bbox)
+            enhanced_html = CoordinateUtils.add_table_offset_to_html(enhanced_html, cropped_offset_bbox)
+            ocr_boxes = CoordinateUtils.add_table_offset_to_ocr_boxes(ocr_boxes, cropped_offset_bbox)
         
         return {
             'type': 'table',
@@ -423,8 +451,8 @@ class ElementProcessors:
         """
         bbox = layout_item.get('bbox', [0, 0, 0, 0])
         
-        # OCR 预处理(返回已旋转的表格图片 + OCR 框)
-        cropped_table, ocr_boxes, table_angle, ocr_source = \
+        # OCR 预处理(返回已旋转的表格图片 + OCR 框 + padding)
+        cropped_table, ocr_boxes, table_angle, ocr_source, crop_padding = \
             self._prepare_table_ocr(image, bbox, pre_matched_spans)
         
         # 获取裁剪后表格图片的尺寸
@@ -451,37 +479,42 @@ class ElementProcessors:
         
         if table_html and ocr_boxes and self.table_cell_matcher:
             try:
+                # table_bbox 参数是相对于裁剪后图像的,OCR 框已经是相对于裁剪后图像的
+                # 使用裁剪后图像的实际尺寸
                 enhanced_html, cells, _, skew_angle = self.table_cell_matcher.enhance_table_html_with_bbox(
                     html=table_html,
                     paddle_text_boxes=ocr_boxes,
                     start_pointer=0,
-                    table_bbox=[0, 0, bbox[2] - bbox[0], bbox[3] - bbox[1]]
+                    table_bbox=[0, 0, orig_table_w, orig_table_h]
                 )
                 logger.info(f"📊 Matched {len(cells)} cells with coordinates (skew: {skew_angle:.2f}°)")
             except Exception as e:
                 logger.warning(f"Cell coordinate matching failed: {e}")
         
         # 坐标转换:将旋转后的坐标转换回原图坐标
+        # 计算正确的偏移量:裁剪后图像的 (0, 0) 对应原图的 (bbox[0] - crop_padding, bbox[1] - crop_padding)
+        cropped_offset_bbox = [bbox[0] - crop_padding, bbox[1] - crop_padding, bbox[2] + crop_padding, bbox[3] + crop_padding]
+        
         if table_angle != 0 and MERGER_AVAILABLE:
             cells, enhanced_html = CoordinateUtils.inverse_rotate_table_coords(
                 cells=cells,
                 html=enhanced_html,
                 rotation_angle=table_angle,
                 orig_table_size=orig_table_size,
-                table_bbox=bbox
+                table_bbox=cropped_offset_bbox
             )
             ocr_boxes = CoordinateUtils.inverse_rotate_ocr_boxes(
                 ocr_boxes=ocr_boxes,
                 rotation_angle=table_angle,
                 orig_table_size=orig_table_size,
-                table_bbox=bbox
+                table_bbox=cropped_offset_bbox
             )
             logger.info(f"📐 VLM table coordinates transformed back to original image")
         else:
-            # 没有旋转,只需要加上表格偏移量
-            cells = CoordinateUtils.add_table_offset_to_cells(cells, bbox)
-            enhanced_html = CoordinateUtils.add_table_offset_to_html(enhanced_html, bbox)
-            ocr_boxes = CoordinateUtils.add_table_offset_to_ocr_boxes(ocr_boxes, bbox)
+            # 没有旋转,使用正确的偏移量(考虑 padding)
+            cells = CoordinateUtils.add_table_offset_to_cells(cells, cropped_offset_bbox)
+            enhanced_html = CoordinateUtils.add_table_offset_to_html(enhanced_html, cropped_offset_bbox)
+            ocr_boxes = CoordinateUtils.add_table_offset_to_ocr_boxes(ocr_boxes, cropped_offset_bbox)
         
         return {
             'type': 'table',