|
|
@@ -1,6 +1,6 @@
|
|
|
"""
|
|
|
数据处理模块
|
|
|
-负责处理 MinerU 数据,添加 bbox 信息
|
|
|
+负责处理 MinerU/PaddleOCR_VL 数据,添加 bbox 信息
|
|
|
"""
|
|
|
from typing import List, Dict, Tuple
|
|
|
from bs4 import BeautifulSoup
|
|
|
@@ -33,7 +33,7 @@ class DataProcessor:
|
|
|
paddle_text_boxes: PaddleOCR 文字框列表
|
|
|
|
|
|
Returns:
|
|
|
- 合并后的数据
|
|
|
+ 合并后的数据, table cell使用paddle的bbox,其他类型只是移动指针,bbox还是沿用minerU的bbox
|
|
|
"""
|
|
|
merged_data = []
|
|
|
paddle_pointer = 0
|
|
|
@@ -71,6 +71,53 @@ class DataProcessor:
|
|
|
|
|
|
return merged_data
|
|
|
|
|
|
def process_paddleocr_vl_data(self, paddleocr_vl_data: Dict,
                              paddle_text_boxes: List[Dict]) -> List[Dict]:
    """
    Merge PaddleOCR_VL layout blocks with PaddleOCR text boxes.

    Args:
        paddleocr_vl_data: PaddleOCR_VL result (JSON object). Only its
            'parsing_res_list' entry is read; the object and the list it
            holds are left unmodified.
        paddle_text_boxes: List of PaddleOCR text boxes, passed through to
            the table/text helpers together with the running pointer.

    Returns:
        The merged blocks (PPStruct3 format, cell information lives in
        parsing_res_list), ordered by (block_bbox[1], block_bbox[0]) --
        presumably top-to-bottom, then left-to-right. Blocks without a
        usable 'block_bbox' are placed last.
    """
    def _sort_key(block: Dict) -> Tuple[float, float]:
        # A missing, None or too-short bbox must not crash the sort;
        # such blocks are pushed to the end instead.
        bbox = block.get('block_bbox')
        if bbox is None or len(bbox) < 2:
            return (float('inf'), float('inf'))
        return (bbox[1], bbox[0])

    merged_data = []
    paddle_pointer = 0
    last_matched_index = 0

    # `or []` also covers an explicit `"parsing_res_list": null`.
    # sorted() builds a new list, so the caller's data keeps its order.
    parsing_res_list = sorted(
        paddleocr_vl_data.get('parsing_res_list') or [],
        key=_sort_key,
    )

    for item in parsing_res_list:
        # A None label is treated like a missing one.
        block_label = item.get('block_label') or ''

        # PPStruct3 format: cell information lives in parsing_res_list.
        if block_label == 'table':
            merged_item, paddle_pointer = self._process_paddleocr_vl_table(
                item, paddle_text_boxes, paddle_pointer
            )
        elif 'title' in block_label or block_label in ('text', 'number'):
            merged_item, paddle_pointer, last_matched_index = \
                self._process_paddleocr_vl_text(
                    item, paddle_text_boxes, paddle_pointer, last_matched_index
                )
        else:
            # Every other block type is passed through as a shallow copy.
            merged_item = item.copy()

        merged_data.append(merged_item)

    return merged_data
|
|
|
+
|
|
|
def _process_table(self, item: Dict, paddle_text_boxes: List[Dict],
|
|
|
start_pointer: int) -> Tuple[Dict, int]:
|
|
|
"""处理表格"""
|
|
|
@@ -123,6 +170,41 @@ class DataProcessor:
|
|
|
|
|
|
return merged_item, paddle_pointer, last_matched_index
|
|
|
|
|
|
def _process_paddleocr_vl_table(self, item: Dict, paddle_text_boxes: List[Dict],
                                start_pointer: int) -> Tuple[Dict, int]:
    """
    Build the merged record for a PaddleOCR_VL table block.

    The block's 'block_content' (table HTML, '' when absent) is passed to
    `_enhance_table_html_with_bbox`. The result is a shallow copy of `item`
    whose 'block_content' and 'block_content_with_bbox' both hold the
    enhanced HTML, with 'bbox_mapping' and 'table_cells' added.

    Returns:
        (merged record, pointer value returned by the enhancement helper)
    """
    result = item.copy()

    html_with_bbox, cell_list, next_pointer = self._enhance_table_html_with_bbox(
        item.get('block_content', ''), paddle_text_boxes, start_pointer
    )

    # Keep everything from the source block; overwrite the HTML and attach
    # the per-cell information.
    result.update({
        'block_content': html_with_bbox,
        'block_content_with_bbox': html_with_bbox,
        'bbox_mapping': 'merged_from_paddle_ocr',
        'table_cells': cell_list or [],
    })

    return result, next_pointer
|
|
|
+
|
|
|
def _process_paddleocr_vl_text(self, item: Dict, paddle_text_boxes: List[Dict],
                               paddle_pointer: int, last_matched_index: int) -> Tuple[Dict, int, int]:
    """
    Build the merged record for a PaddleOCR_VL text-like block.

    The block's 'block_content' ('' when absent) is matched against the
    PaddleOCR boxes via `self.text_matcher.find_matching_bbox`. A matched
    box is flagged with 'used' = True; the returned record itself is an
    unmodified shallow copy of `item`.

    Returns:
        (copied record, updated pointer, updated last matched index) --
        the latter two exactly as returned by the matcher.
    """
    record = item.copy()
    content = item.get('block_content', '')

    match_result = self.text_matcher.find_matching_bbox(
        content,
        paddle_text_boxes,
        paddle_pointer,
        last_matched_index,
        self.look_ahead_window,
    )
    hit, next_pointer, next_matched_index = match_result

    # Only the matched box is marked; the record keeps its own fields.
    if hit:
        hit['used'] = True

    return record, next_pointer, next_matched_index
|
|
|
+
|
|
|
def _enhance_table_html_with_bbox(self, html: str, paddle_text_boxes: List[Dict],
|
|
|
start_pointer: int) -> Tuple[str, List[Dict], int]:
|
|
|
"""为 HTML 表格添加 bbox 信息"""
|