|
|
@@ -15,6 +15,7 @@ from loguru import logger
|
|
|
|
|
|
from ocr_utils.coordinate_utils import CoordinateUtils
|
|
|
from ocr_utils import PDFUtils
|
|
|
+from ocr_utils import normalize_financial_numbers
|
|
|
from .table_coordinate_utils import TableCoordinateUtils
|
|
|
|
|
|
# 导入 SpanMatcher(用于 spans 合并)
|
|
|
@@ -358,24 +359,26 @@ class ElementProcessors:
|
|
|
layout_item: Dict[str, Any],
|
|
|
scale: float,
|
|
|
pre_matched_spans: Optional[List[Dict[str, Any]]] = None,
|
|
|
- pdf_type: str = 'ocr', # 'ocr' 或 'txt'
|
|
|
+ pdf_type: str = 'ocr', # 'ocr' 或 'txt'
|
|
|
output_dir: Optional[str] = None,
|
|
|
- basename: Optional[str] = None
|
|
|
+ basename: Optional[str] = None,
|
|
|
+ normalize_numbers: bool = True,
|
|
|
) -> Dict[str, Any]:
|
|
|
"""
|
|
|
使用 UNet 有线表格识别处理表格元素
|
|
|
-
|
|
|
+
|
|
|
流程:
|
|
|
1. OCR检测获取文本框坐标
|
|
|
2. UNet 有线表格识别
|
|
|
3. 坐标逆向转换回原图坐标
|
|
|
-
|
|
|
+
|
|
|
Args:
|
|
|
image: 页面图像
|
|
|
layout_item: 布局检测项
|
|
|
scale: 缩放比例
|
|
|
pre_matched_spans: 预匹配的 OCR spans(来自整页 OCR)
|
|
|
-
|
|
|
+ normalize_numbers: 是否对表格内容做金额标准化
|
|
|
+
|
|
|
Returns:
|
|
|
处理后的元素字典
|
|
|
"""
|
|
|
@@ -446,7 +449,7 @@ class ElementProcessors:
|
|
|
enhanced_html = TableCoordinateUtils.add_table_offset_to_html(enhanced_html, cropped_offset_bbox)
|
|
|
ocr_boxes = TableCoordinateUtils.add_table_offset_to_ocr_boxes(ocr_boxes, cropped_offset_bbox)
|
|
|
|
|
|
- return {
|
|
|
+ result = {
|
|
|
'type': 'table',
|
|
|
'bbox': bbox,
|
|
|
'confidence': layout_item.get('confidence', 0.0),
|
|
|
@@ -461,29 +464,34 @@ class ElementProcessors:
|
|
|
'recognition_method': 'wired_unet',
|
|
|
},
|
|
|
}
|
|
|
-
|
|
|
+ if normalize_numbers:
|
|
|
+ self._normalize_table_content(result['content'])
|
|
|
+ return result
|
|
|
+
|
|
|
def process_table_element_vlm(
|
|
|
self,
|
|
|
image: np.ndarray,
|
|
|
layout_item: Dict[str, Any],
|
|
|
scale: float,
|
|
|
- pre_matched_spans: Optional[List[Dict[str, Any]]] = None
|
|
|
+ pre_matched_spans: Optional[List[Dict[str, Any]]] = None,
|
|
|
+ normalize_numbers: bool = True,
|
|
|
) -> Dict[str, Any]:
|
|
|
"""
|
|
|
使用 VLM 无线表格识别处理表格元素
|
|
|
-
|
|
|
+
|
|
|
流程:
|
|
|
1. OCR检测获取文本框坐标
|
|
|
2. VLM识别获取表格结构HTML
|
|
|
3. 匹配OCR坐标与VLM结构
|
|
|
4. 坐标逆向转换回原图坐标
|
|
|
-
|
|
|
+
|
|
|
Args:
|
|
|
image: 页面图像
|
|
|
layout_item: 布局检测项
|
|
|
scale: 缩放比例
|
|
|
pre_matched_spans: 预匹配的 OCR spans(来自整页 OCR)
|
|
|
-
|
|
|
+ normalize_numbers: 是否对表格内容做金额标准化
|
|
|
+
|
|
|
Returns:
|
|
|
处理后的元素字典
|
|
|
"""
|
|
|
@@ -557,7 +565,7 @@ class ElementProcessors:
|
|
|
enhanced_html = TableCoordinateUtils.add_table_offset_to_html(enhanced_html, cropped_offset_bbox)
|
|
|
ocr_boxes = TableCoordinateUtils.add_table_offset_to_ocr_boxes(ocr_boxes, cropped_offset_bbox)
|
|
|
|
|
|
- return {
|
|
|
+ result = {
|
|
|
'type': 'table',
|
|
|
'bbox': bbox,
|
|
|
'confidence': layout_item.get('confidence', 0.0),
|
|
|
@@ -572,6 +580,64 @@ class ElementProcessors:
|
|
|
'recognition_method': 'vlm',
|
|
|
},
|
|
|
}
|
|
|
+ if normalize_numbers:
|
|
|
+ self._normalize_table_content(result['content'])
|
|
|
+ return result
|
|
|
+
|
|
|
+ def _normalize_table_content(self, content: Dict[str, Any]) -> None:
|
|
|
+ """
|
|
|
+ 对表格 content 中的 html 与 cells 做金额标准化(就地修改),
|
|
|
+ 并记录变更到 content['number_normalization_changes'],供 JSON/MD 输出写入。
|
|
|
+ """
|
|
|
+ if not content or not content.get('html'):
|
|
|
+ return
|
|
|
+ table_changes: List[Dict[str, Any]] = []
|
|
|
+ try:
|
|
|
+ from ast import literal_eval
|
|
|
+ from bs4 import BeautifulSoup, Tag
|
|
|
+ html = content['html']
|
|
|
+ soup = BeautifulSoup(html, 'html.parser')
|
|
|
+ for table in soup.find_all('table'):
|
|
|
+ if not isinstance(table, Tag):
|
|
|
+ continue
|
|
|
+ for row_idx, tr in enumerate(table.find_all('tr')): # type: ignore[reportAttributeAccessIssue]
|
|
|
+ for col_idx, cell in enumerate(tr.find_all(['td', 'th'])): # type: ignore[reportAttributeAccessIssue]
|
|
|
+ if not isinstance(cell, Tag):
|
|
|
+ continue
|
|
|
+ raw = cell.get_text()
|
|
|
+ norm = normalize_financial_numbers(raw)
|
|
|
+ if norm != raw:
|
|
|
+ change: Dict[str, Any] = {
|
|
|
+ "row": row_idx,
|
|
|
+ "col": col_idx,
|
|
|
+ "old": raw,
|
|
|
+ "new": norm,
|
|
|
+ }
|
|
|
+ bbox_attr = cell.get("data-bbox")
|
|
|
+ if isinstance(bbox_attr, str):
|
|
|
+ try:
|
|
|
+ change["bbox"] = literal_eval(bbox_attr)
|
|
|
+ except Exception:
|
|
|
+ change["bbox"] = bbox_attr
|
|
|
+ table_changes.append(change)
|
|
|
+ cell.string = norm
|
|
|
+ content['html'] = str(soup)
|
|
|
+ except Exception as e:
|
|
|
+ logger.warning(f"表格 HTML 金额标准化失败: {e}")
|
|
|
+ cells = content.get('cells')
|
|
|
+ if isinstance(cells, list):
|
|
|
+ try:
|
|
|
+ for cell in cells:
|
|
|
+ if not isinstance(cell, dict):
|
|
|
+ continue
|
|
|
+ # for key in ['text', 'matched_text']:
|
|
|
+ for key in ['text']:
|
|
|
+ if key in cell and isinstance(cell[key], str):
|
|
|
+ cell[key] = normalize_financial_numbers(cell[key])
|
|
|
+ except Exception as e:
|
|
|
+ logger.warning(f"表格 cells 金额标准化失败: {e}")
|
|
|
+ if table_changes:
|
|
|
+ content['number_normalization_changes'] = table_changes
|
|
|
|
|
|
def _create_empty_table_result(
|
|
|
self,
|