|
@@ -57,6 +57,7 @@ except ImportError:
|
|
|
TableCellMatcher = None
|
|
TableCellMatcher = None
|
|
|
TextMatcher = None
|
|
TextMatcher = None
|
|
|
|
|
|
|
|
|
|
+from ocr_utils.bbox_utils import BBoxExtractor
|
|
|
|
|
|
|
|
class EnhancedDocPipeline:
|
|
class EnhancedDocPipeline:
|
|
|
"""增强版文档处理流水线"""
|
|
"""增强版文档处理流水线"""
|
|
@@ -350,6 +351,9 @@ class EnhancedDocPipeline:
|
|
|
logger.info(f"📝 Page {page_idx}: OCR detected {len(all_ocr_spans)} text spans")
|
|
logger.info(f"📝 Page {page_idx}: OCR detected {len(all_ocr_spans)} text spans")
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
logger.warning(f"⚠️ Full-page OCR failed: {e}")
|
|
logger.warning(f"⚠️ Full-page OCR failed: {e}")
|
|
|
|
|
+
|
|
|
|
|
+ skew_angle = BBoxExtractor.calculate_skew_angle(all_ocr_spans)
|
|
|
|
|
+ logger.info(f"📊 Wired table skew angle: {skew_angle:.3f}°")
|
|
|
|
|
|
|
|
# 4. 将 OCR spans 匹配到 layout blocks
|
|
# 4. 将 OCR spans 匹配到 layout blocks
|
|
|
matched_spans = SpanMatcher.match_spans_to_blocks(
|
|
matched_spans = SpanMatcher.match_spans_to_blocks(
|