6 mesi fa · 4f32495604
--- a/ocr_tools/universal_doc_parser/core/element_processors.py
+++ b/ocr_tools/universal_doc_parser/core/element_processors.py
@@ -183,7 +183,7 @@ class ElementProcessors:
 
				         image: np.ndarray,
			
 
				         bbox: List[float],
			
 
				         pre_matched_spans: Optional[List[Dict[str, Any]]] = None
			
 
				-    ) -> Tuple[np.ndarray, List[Dict[str, Any]], int, str]:
			
 
				+    ) -> Tuple[np.ndarray, List[Dict[str, Any]], int, str, int]:
			
 
				         """
			
 
				         表格OCR预处理（共享逻辑）
			
 
				         
			
@@ -195,10 +195,19 @@ class ElementProcessors:
 
				             pre_matched_spans: 预匹配的 OCR spans
			
 
				             
			
 
				         Returns:
			
 
				-            (cropped_table, ocr_boxes, table_angle, ocr_source)
			
 
				+            (cropped_table, ocr_boxes, table_angle, ocr_source, crop_padding)
			
 
				             其中 cropped_table 已经过方向检测和旋转处理
			
 
				+            crop_padding: 裁剪时添加的 padding 值
			
 
				         """
			
 
				-        cropped_table = CoordinateUtils.crop_region(image, bbox)
			
 
				+        # 计算表格区域尺寸，用于确定合适的padding
			
 
				+        table_width = bbox[2] - bbox[0]
			
 
				+        table_height = bbox[3] - bbox[1]
			
 
				+
			
 
				+        # 为倾斜图片添加padding，确保角落内容不被切掉
			
 
				+        # padding = 1% of the table's shorter side (min of width and height), with a floor of 20 pixels
			
 
				+        crop_padding = max(20, int(min(table_width, table_height) * 0.01))
			
 
				+
			
 
				+        cropped_table = CoordinateUtils.crop_region(image, bbox, padding=crop_padding)
			
 
				         table_angle = 0
			
 
				         
			
 
				         # 1. 表格方向检测
			
@@ -214,6 +223,11 @@ class ElementProcessors:
 
				         ocr_boxes = []
			
 
				         ocr_source = "none"
			
 
				         
			
 
				+        # 计算裁剪后图像的起始坐标（考虑 padding）
			
 
				+        # 裁剪后图像的 (0, 0) 对应原图的 (bbox[0] - crop_padding, bbox[1] - crop_padding)
			
 
				+        cropped_offset_x = bbox[0] - crop_padding
			
 
				+        cropped_offset_y = bbox[1] - crop_padding
			
 
				+        
			
 
				         if pre_matched_spans and len(pre_matched_spans) > 0 and table_angle == 0:
			
 
				             # 使用整页 OCR 的结果
			
 
				             for idx, span in enumerate(pre_matched_spans):
			
@@ -222,11 +236,11 @@ class ElementProcessors:
 
				                 span_bbox = span.get('bbox', [])
			
 
				                 
			
 
				                 if span_poly:
			
 
				-                    # 如果有 poly 数据，直接使用（需要转换为相对坐标）
			
 
				+                    # 如果有 poly 数据，转换为相对于裁剪后图像的坐标（考虑 padding）
			
 
				                     if isinstance(span_poly[0], (list, tuple)) and len(span_poly) >= 4:
			
 
				-                        # 转换为相对坐标（相对于表格区域）
			
 
				+                        # 转换为相对坐标（相对于裁剪后图像的 (0, 0)）
			
 
				                         relative_poly = [
			
 
				-                            [float(p[0]) - bbox[0], float(p[1]) - bbox[1]]
			
 
				+                            [float(p[0]) - cropped_offset_x, float(p[1]) - cropped_offset_y]
			
 
				                             for p in span_poly[:4]
			
 
				                         ]
			
 
				                         formatted_box = CoordinateUtils.convert_ocr_to_matcher_format(
			
@@ -239,12 +253,12 @@ class ElementProcessors:
 
				                         if formatted_box:
			
 
				                             ocr_boxes.append(formatted_box)
			
 
				                 elif span_bbox and len(span_bbox) >= 4:
			
 
				-                    # 兜底：使用 bbox 数据
			
 
				+                    # 兜底：使用 bbox 数据，转换为相对于裁剪后图像的坐标（考虑 padding）
			
 
				                     relative_bbox = [
			
 
				-                        span_bbox[0] - bbox[0],
			
 
				-                        span_bbox[1] - bbox[1],
			
 
				-                        span_bbox[2] - bbox[0],
			
 
				-                        span_bbox[3] - bbox[1]
			
 
				+                        span_bbox[0] - cropped_offset_x,
			
 
				+                        span_bbox[1] - cropped_offset_y,
			
 
				+                        span_bbox[2] - cropped_offset_x,
			
 
				+                        span_bbox[3] - cropped_offset_y
			
 
				                     ]
			
 
				                     formatted_box = CoordinateUtils.convert_ocr_to_matcher_format(
			
 
				                         relative_bbox,
			
@@ -296,14 +310,16 @@ class ElementProcessors:
 
				             except Exception as e:
			
 
				                 logger.warning(f"Table OCR failed: {e}")
			
 
				         
			
 
				-        return cropped_table, ocr_boxes, table_angle, ocr_source
			
 
				+        return cropped_table, ocr_boxes, table_angle, ocr_source, crop_padding
			
 
				     
			
 
				     def process_table_element_wired(
			
 
				         self,
			
 
				         image: np.ndarray,
			
 
				         layout_item: Dict[str, Any],
			
 
				         scale: float,
			
 
				-        pre_matched_spans: Optional[List[Dict[str, Any]]] = None
			
 
				+        pre_matched_spans: Optional[List[Dict[str, Any]]] = None,
			
 
				+        output_dir: Optional[str] = None,
			
 
				+        basename: Optional[str] = None
			
 
				     ) -> Dict[str, Any]:
			
 
				         """
			
 
				         使用 UNet 有线表格识别处理表格元素
			
@@ -324,8 +340,8 @@ class ElementProcessors:
 
				         """
			
 
				         bbox = layout_item.get('bbox', [0, 0, 0, 0])
			
 
				         
			
 
				-        # OCR 预处理（返回已旋转的表格图片 + OCR 框）
			
 
				-        cropped_table, ocr_boxes, table_angle, ocr_source = \
			
 
				+        # OCR 预处理（返回已旋转的表格图片 + OCR 框 + padding）
			
 
				+        cropped_table, ocr_boxes, table_angle, ocr_source, crop_padding = \
			
 
				             self._prepare_table_ocr(image, bbox, pre_matched_spans)
			
 
				         
			
 
				         # 获取裁剪后表格图片的尺寸
			
@@ -340,10 +356,19 @@ class ElementProcessors:
 
				             if not self.wired_table_recognizer:
			
 
				                 raise RuntimeError("Wired table recognizer not available")
			
 
				             
			
 
				+            # 构造调试选项覆盖
			
 
				+            debug_opts_override = {}
			
 
				+            if output_dir:
			
 
				+                debug_opts_override['output_dir'] = output_dir
			
 
				+            if basename:
			
 
				+                # 使用完整 basename 作为前缀 (如 "filename_page_001")
			
 
				+                debug_opts_override['prefix'] = basename
			
 
				+
			
 
				             wired_res = self.wired_table_recognizer.recognize(
			
 
				                 table_image=cropped_table,
			
 
				                 # ocr_boxes=ocr_boxes_for_wired,
			
 
				                 ocr_boxes=ocr_boxes,
			
 
				+                debug_options=debug_opts_override
			
 
				             )
			
 
				             
			
 
				             if not (wired_res.get('html') or wired_res.get('cells')):
			
@@ -359,26 +384,29 @@ class ElementProcessors:
 
				             return self._create_empty_table_result(layout_item, bbox, table_angle, ocr_source)
			
 
				         
			
 
				         # 坐标转换：将旋转后的坐标转换回原图坐标
			
 
				+        # 计算正确的偏移量：裁剪后图像的 (0, 0) 对应原图的 (bbox[0] - crop_padding, bbox[1] - crop_padding)
			
 
				+        cropped_offset_bbox = [bbox[0] - crop_padding, bbox[1] - crop_padding, bbox[2] + crop_padding, bbox[3] + crop_padding]
			
 
				+        
			
 
				         if table_angle != 0 and MERGER_AVAILABLE:
			
 
				             cells, enhanced_html = CoordinateUtils.inverse_rotate_table_coords(
			
 
				                 cells=cells,
			
 
				                 html=enhanced_html,
			
 
				                 rotation_angle=table_angle,
			
 
				                 orig_table_size=orig_table_size,
			
 
				-                table_bbox=bbox
			
 
				+                table_bbox=cropped_offset_bbox
			
 
				             )
			
 
				             ocr_boxes = CoordinateUtils.inverse_rotate_ocr_boxes(
			
 
				                 ocr_boxes=ocr_boxes,
			
 
				                 rotation_angle=table_angle,
			
 
				                 orig_table_size=orig_table_size,
			
 
				-                table_bbox=bbox
			
 
				+                table_bbox=cropped_offset_bbox
			
 
				             )
			
 
				             logger.info(f"📐 Wired table coordinates transformed back to original image")
			
 
				         else:
			
 
				-            # 没有旋转，只需要加上表格偏移量
			
 
				-            cells = CoordinateUtils.add_table_offset_to_cells(cells, bbox)
			
 
				-            enhanced_html = CoordinateUtils.add_table_offset_to_html(enhanced_html, bbox)
			
 
				-            ocr_boxes = CoordinateUtils.add_table_offset_to_ocr_boxes(ocr_boxes, bbox)
			
 
				+            # 没有旋转，使用正确的偏移量（考虑 padding）
			
 
				+            cells = CoordinateUtils.add_table_offset_to_cells(cells, cropped_offset_bbox)
			
 
				+            enhanced_html = CoordinateUtils.add_table_offset_to_html(enhanced_html, cropped_offset_bbox)
			
 
				+            ocr_boxes = CoordinateUtils.add_table_offset_to_ocr_boxes(ocr_boxes, cropped_offset_bbox)
			
 
				         
			
 
				         return {
			
 
				             'type': 'table',
			
@@ -423,8 +451,8 @@ class ElementProcessors:
 
				         """
			
 
				         bbox = layout_item.get('bbox', [0, 0, 0, 0])
			
 
				         
			
 
				-        # OCR 预处理（返回已旋转的表格图片 + OCR 框）
			
 
				-        cropped_table, ocr_boxes, table_angle, ocr_source = \
			
 
				+        # OCR 预处理（返回已旋转的表格图片 + OCR 框 + padding）
			
 
				+        cropped_table, ocr_boxes, table_angle, ocr_source, crop_padding = \
			
 
				             self._prepare_table_ocr(image, bbox, pre_matched_spans)
			
 
				         
			
 
				         # 获取裁剪后表格图片的尺寸
			
@@ -451,37 +479,42 @@ class ElementProcessors:
 
				         
			
 
				         if table_html and ocr_boxes and self.table_cell_matcher:
			
 
				             try:
			
 
				+                # table_bbox 参数是相对于裁剪后图像的，OCR 框已经是相对于裁剪后图像的
			
 
				+                # 使用裁剪后图像的实际尺寸
			
 
				                 enhanced_html, cells, _, skew_angle = self.table_cell_matcher.enhance_table_html_with_bbox(
			
 
				                     html=table_html,
			
 
				                     paddle_text_boxes=ocr_boxes,
			
 
				                     start_pointer=0,
			
 
				-                    table_bbox=[0, 0, bbox[2] - bbox[0], bbox[3] - bbox[1]]
			
 
				+                    table_bbox=[0, 0, orig_table_w, orig_table_h]
			
 
				                 )
			
 
				                 logger.info(f"📊 Matched {len(cells)} cells with coordinates (skew: {skew_angle:.2f}°)")
			
 
				             except Exception as e:
			
 
				                 logger.warning(f"Cell coordinate matching failed: {e}")
			
 
				         
			
 
				         # 坐标转换：将旋转后的坐标转换回原图坐标
			
 
				+        # 计算正确的偏移量：裁剪后图像的 (0, 0) 对应原图的 (bbox[0] - crop_padding, bbox[1] - crop_padding)
			
 
				+        cropped_offset_bbox = [bbox[0] - crop_padding, bbox[1] - crop_padding, bbox[2] + crop_padding, bbox[3] + crop_padding]
			
 
				+        
			
 
				         if table_angle != 0 and MERGER_AVAILABLE:
			
 
				             cells, enhanced_html = CoordinateUtils.inverse_rotate_table_coords(
			
 
				                 cells=cells,
			
 
				                 html=enhanced_html,
			
 
				                 rotation_angle=table_angle,
			
 
				                 orig_table_size=orig_table_size,
			
 
				-                table_bbox=bbox
			
 
				+                table_bbox=cropped_offset_bbox
			
 
				             )
			
 
				             ocr_boxes = CoordinateUtils.inverse_rotate_ocr_boxes(
			
 
				                 ocr_boxes=ocr_boxes,
			
 
				                 rotation_angle=table_angle,
			
 
				                 orig_table_size=orig_table_size,
			
 
				-                table_bbox=bbox
			
 
				+                table_bbox=cropped_offset_bbox
			
 
				             )
			
 
				             logger.info(f"📐 VLM table coordinates transformed back to original image")
			
 
				         else:
			
 
				-            # 没有旋转，只需要加上表格偏移量
			
 
				-            cells = CoordinateUtils.add_table_offset_to_cells(cells, bbox)
			
 
				-            enhanced_html = CoordinateUtils.add_table_offset_to_html(enhanced_html, bbox)
			
 
				-            ocr_boxes = CoordinateUtils.add_table_offset_to_ocr_boxes(ocr_boxes, bbox)
			
 
				+            # 没有旋转，使用正确的偏移量（考虑 padding）
			
 
				+            cells = CoordinateUtils.add_table_offset_to_cells(cells, cropped_offset_bbox)
			
 
				+            enhanced_html = CoordinateUtils.add_table_offset_to_html(enhanced_html, cropped_offset_bbox)
			
 
				+            ocr_boxes = CoordinateUtils.add_table_offset_to_ocr_boxes(ocr_boxes, cropped_offset_bbox)
			
 
				         
			
 
				         return {
			
 
				             'type': 'table',