|
|
@@ -1,6 +1,6 @@
|
|
|
"""
|
|
|
数据处理模块
|
|
|
-负责处理 MinerU/PaddleOCR_VL 数据,添加 bbox 信息
|
|
|
+负责处理 MinerU/PaddleOCR_VL/DotsOCR 数据,添加 bbox 信息
|
|
|
"""
|
|
|
from typing import List, Dict, Tuple
|
|
|
from bs4 import BeautifulSoup
|
|
|
@@ -71,56 +71,292 @@ class DataProcessor:
|
|
|
|
|
|
return merged_data
|
|
|
|
|
|
def process_dotsocr_data(self, dotsocr_data: List[Dict],
                         paddle_text_boxes: List[Dict]) -> List[Dict]:
    """
    Convert DotsOCR blocks to MinerU-style items and attach bbox information.

    Blocks are visited in ascending ``(bbox[1], bbox[0])`` order; blocks
    whose ``bbox`` is missing or has fewer than two entries are visited
    last. Each block is converted with ``_convert_dotsocr_to_mineru`` and
    then dispatched on the converted ``type``:

    * ``table`` -> ``_process_dotsocr_table``
    * ``text`` / ``title`` / ``header`` / ``footer`` -> ``_process_text``
    * ``list`` -> ``_process_list``
    * anything else (e.g. ``image``) is appended as converted.

    Args:
        dotsocr_data: DotsOCR blocks. The list itself is left untouched;
            a sorted copy is iterated instead.
        paddle_text_boxes: PaddleOCR text boxes, handed to the per-type
            handlers unchanged.

    Returns:
        The converted (and, where applicable, enriched) items in the order
        they were processed.
    """
    no_position = (float('inf'), float('inf'))

    def reading_order(block: Dict) -> Tuple[float, float]:
        # An empty/short bbox must not raise; such blocks simply sort last.
        bbox = block.get('bbox')
        if bbox is None or len(bbox) < 2:
            return no_position
        return (bbox[1], bbox[0])

    text_types = ('text', 'title', 'header', 'footer')

    merged_data = []
    # Both cursors are threaded through the handlers so that each call
    # continues from where the previous one stopped.
    paddle_pointer = 0
    last_matched_index = 0

    # sorted() keeps the caller's list order intact (list.sort() would not).
    for item in sorted(dotsocr_data, key=reading_order):
        mineru_item = self._convert_dotsocr_to_mineru(item)
        item_type = mineru_item.get('type', '').lower()

        if item_type == 'table':
            merged_item, paddle_pointer = self._process_dotsocr_table(
                mineru_item, paddle_text_boxes, paddle_pointer
            )
        elif item_type in text_types:
            merged_item, paddle_pointer, last_matched_index = self._process_text(
                mineru_item, paddle_text_boxes, paddle_pointer, last_matched_index
            )
        elif item_type == 'list':
            merged_item, paddle_pointer, last_matched_index = self._process_list(
                mineru_item, paddle_text_boxes, paddle_pointer, last_matched_index
            )
        else:
            # Types without a dedicated handler pass through unchanged.
            merged_item = mineru_item

        merged_data.append(merged_item)

    return merged_data
|
|
|
+
|
|
|
+ def _convert_dotsocr_to_mineru(self, dotsocr_item: Dict) -> Dict:
|
|
|
+ """
|
|
|
+ 🎯 将 DotsOCR 格式转换为 MinerU 格式
|
|
|
+
|
|
|
+ DotsOCR:
|
|
|
+ {
|
|
|
+ "category": "Table",
|
|
|
+ "bbox": [x1, y1, x2, y2],
|
|
|
+ "text": "..."
|
|
|
+ }
|
|
|
+
|
|
|
+ MinerU:
|
|
|
+ {
|
|
|
+ "type": "table",
|
|
|
+ "bbox": [x1, y1, x2, y2],
|
|
|
+ "table_body": "...",
|
|
|
+ "page_idx": 0
|
|
|
+ }
|
|
|
+ """
|
|
|
+ category = dotsocr_item.get('category', '')
|
|
|
+
|
|
|
+ # 🎯 Category 映射
|
|
|
+ category_map = {
|
|
|
+ 'Page-header': 'header',
|
|
|
+ 'Page-footer': 'footer',
|
|
|
+ 'Picture': 'image',
|
|
|
+ 'Figure': 'image',
|
|
|
+ 'Section-header': 'title',
|
|
|
+ 'Table': 'table',
|
|
|
+ 'Text': 'text',
|
|
|
+ 'Title': 'title',
|
|
|
+ 'List': 'list',
|
|
|
+ 'Caption': 'title'
|
|
|
+ }
|
|
|
+
|
|
|
+ mineru_type = category_map.get(category, 'text')
|
|
|
+
|
|
|
+ # 🎯 基础转换
|
|
|
+ mineru_item = {
|
|
|
+ 'type': mineru_type,
|
|
|
+ 'bbox': dotsocr_item.get('bbox', []),
|
|
|
+ 'page_idx': 0 # DotsOCR 默认单页
|
|
|
+ }
|
|
|
+
|
|
|
+ # 🎯 处理文本内容
|
|
|
+ text = dotsocr_item.get('text', '')
|
|
|
+
|
|
|
+ if mineru_type == 'table':
|
|
|
+ # 表格:text -> table_body
|
|
|
+ mineru_item['table_body'] = text
|
|
|
+ else:
|
|
|
+ # 其他类型:保持 text
|
|
|
+ mineru_item['text'] = text
|
|
|
+
|
|
|
+ # 标题级别
|
|
|
+ if category == 'Section-header':
|
|
|
+ mineru_item['text_level'] = 1
|
|
|
+
|
|
|
+ return mineru_item
|
|
|
+
|
|
|
+ def _process_dotsocr_table(self, item: Dict, paddle_text_boxes: List[Dict],
|
|
|
+ start_pointer: int) -> Tuple[Dict, int]:
|
|
|
+ """
|
|
|
+ 🎯 处理 DotsOCR 表格(已转换为 MinerU 格式)
|
|
|
+
|
|
|
+ DotsOCR 的表格 HTML 已经在 text 字段中,需要转移到 table_body
|
|
|
+ """
|
|
|
+ merged_item = item.copy()
|
|
|
+ table_html = item.get('table_body', '')
|
|
|
+
|
|
|
+ if not table_html:
|
|
|
+ return merged_item, start_pointer
|
|
|
+
|
|
|
+ # 🎯 复用表格处理逻辑
|
|
|
+ enhanced_html, cells, new_pointer = self._enhance_table_html_with_bbox(
|
|
|
+ table_html, paddle_text_boxes, start_pointer
|
|
|
+ )
|
|
|
+
|
|
|
+ merged_item['table_body'] = enhanced_html
|
|
|
+ merged_item['table_body_with_bbox'] = enhanced_html
|
|
|
+ merged_item['bbox_mapping'] = 'merged_from_paddle_ocr'
|
|
|
+ merged_item['table_cells'] = cells if cells else []
|
|
|
+
|
|
|
+ return merged_item, new_pointer
|
|
|
+
|
|
|
def process_paddleocr_vl_data(self, paddleocr_vl_data: Dict,
                              paddle_text_boxes: List[Dict]) -> List[Dict]:
    """
    Convert PaddleOCR_VL blocks to MinerU-style items and attach bbox information.

    Blocks come from ``paddleocr_vl_data['parsing_res_list']`` and are
    visited in ascending ``(block_bbox[1], block_bbox[0])`` order; blocks
    whose ``block_bbox`` is missing or has fewer than two entries are
    visited last. Each block is converted with
    ``_convert_paddleocr_vl_to_mineru`` and dispatched on the converted
    ``type``:

    * ``table`` -> ``_process_table``
    * ``text`` / ``title`` / ``header`` / ``footer`` / ``equation``
      -> ``_process_text``
    * ``list`` -> ``_process_list``
    * anything else (e.g. ``image``) is appended as converted.

    Args:
        paddleocr_vl_data: PaddleOCR_VL result (JSON object). Its
            ``parsing_res_list`` is left in its original order.
        paddle_text_boxes: PaddleOCR text boxes, handed to the per-type
            handlers unchanged.

    Returns:
        MinerU-style items in the order they were processed.
    """
    no_position = (float('inf'), float('inf'))

    def reading_order(block: Dict) -> Tuple[float, float]:
        # An empty/short bbox must not raise; such blocks simply sort last.
        bbox = block.get('block_bbox')
        if bbox is None or len(bbox) < 2:
            return no_position
        return (bbox[1], bbox[0])

    text_types = ('text', 'title', 'header', 'footer', 'equation')

    merged_data = []
    # Both cursors are threaded through the handlers so that each call
    # continues from where the previous one stopped.
    paddle_pointer = 0
    last_matched_index = 0

    # A missing or null list is treated as "no blocks".
    parsing_res_list = paddleocr_vl_data.get('parsing_res_list') or []

    # sorted() avoids reordering the list stored inside the caller's dict.
    for item in sorted(parsing_res_list, key=reading_order):
        mineru_item = self._convert_paddleocr_vl_to_mineru(item)
        item_type = mineru_item.get('type', '')

        if item_type == 'table':
            merged_item, paddle_pointer = self._process_table(
                mineru_item, paddle_text_boxes, paddle_pointer
            )
        elif item_type in text_types:
            merged_item, paddle_pointer, last_matched_index = self._process_text(
                mineru_item, paddle_text_boxes, paddle_pointer, last_matched_index
            )
        elif item_type == 'list':
            merged_item, paddle_pointer, last_matched_index = self._process_list(
                mineru_item, paddle_text_boxes, paddle_pointer, last_matched_index
            )
        else:
            # Types without a dedicated handler pass through unchanged.
            merged_item = mineru_item

        merged_data.append(merged_item)

    return merged_data
|
|
|
|
|
|
+ def _convert_paddleocr_vl_to_mineru(self, paddleocr_vl_item: Dict) -> Dict:
|
|
|
+ """
|
|
|
+ 🎯 将 PaddleOCR_VL 格式转换为 MinerU 格式
|
|
|
+
|
|
|
+ PaddleOCR_VL (PP-DocLayout_plus-L):
|
|
|
+ {
|
|
|
+ "block_label": "paragraph_title", # 或 "doc_title", "text" 等
|
|
|
+ "block_bbox": [172, 151, 547, 184],
|
|
|
+ "block_content": "...",
|
|
|
+ "block_id": 0
|
|
|
+ }
|
|
|
+
|
|
|
+ MinerU:
|
|
|
+ {
|
|
|
+ "type": "title",
|
|
|
+ "bbox": [172, 151, 547, 184],
|
|
|
+ "text": "...",
|
|
|
+ "text_level": 1,
|
|
|
+ "page_idx": 0
|
|
|
+ }
|
|
|
+ """
|
|
|
+ block_label = paddleocr_vl_item.get('block_label', '')
|
|
|
+
|
|
|
+ # 🎯 PP-DocLayout_plus-L 类别映射
|
|
|
+ label_map = {
|
|
|
+ # 标题类
|
|
|
+ 'paragraph_title': 'title', # 段落标题 → title (level 2)
|
|
|
+ 'doc_title': 'title', # 文档标题 → title (level 1)
|
|
|
+ 'figure_table_chart_title': 'title', # 图表标题 → title (level 3)
|
|
|
+
|
|
|
+ # 文本类
|
|
|
+ 'text': 'text',
|
|
|
+ 'number': 'text',
|
|
|
+ 'content': 'text',
|
|
|
+ 'abstract': 'text',
|
|
|
+ 'footnote': 'text',
|
|
|
+ 'aside_text': 'text',
|
|
|
+ 'algorithm': 'text',
|
|
|
+
|
|
|
+ # 参考文献
|
|
|
+ 'reference': 'text',
|
|
|
+ 'reference_content': 'text',
|
|
|
+
|
|
|
+ # 页眉页脚
|
|
|
+ 'header': 'header',
|
|
|
+ 'footer': 'footer',
|
|
|
+
|
|
|
+ # 表格
|
|
|
+ 'table': 'table',
|
|
|
+
|
|
|
+ # 图片
|
|
|
+ 'image': 'image',
|
|
|
+ 'chart': 'image',
|
|
|
+
|
|
|
+ # 公式
|
|
|
+ 'formula': 'equation',
|
|
|
+ 'formula_number': 'equation',
|
|
|
+
|
|
|
+ # 印章
|
|
|
+ 'seal': 'image'
|
|
|
+ }
|
|
|
+
|
|
|
+ mineru_type = label_map.get(block_label, 'text')
|
|
|
+
|
|
|
+ # 🎯 基础转换
|
|
|
+ mineru_item = {
|
|
|
+ 'type': mineru_type,
|
|
|
+ 'bbox': paddleocr_vl_item.get('block_bbox', []),
|
|
|
+ 'page_idx': 0
|
|
|
+ }
|
|
|
+
|
|
|
+ # 🎯 处理文本内容
|
|
|
+ content = paddleocr_vl_item.get('block_content', '')
|
|
|
+
|
|
|
+ if mineru_type == 'table':
|
|
|
+ # 表格:block_content -> table_body
|
|
|
+ mineru_item['table_body'] = content
|
|
|
+ else:
|
|
|
+ # 其他类型:block_content -> text
|
|
|
+ mineru_item['text'] = content
|
|
|
+
|
|
|
+ # 🎯 处理标题级别(基于实际的类别)
|
|
|
+ if block_label == 'doc_title':
|
|
|
+ mineru_item['text_level'] = 1 # 文档标题 - 一级
|
|
|
+ elif block_label == 'paragraph_title':
|
|
|
+ mineru_item['text_level'] = 2 # 段落标题 - 二级
|
|
|
+ elif block_label == 'figure_table_chart_title':
|
|
|
+ mineru_item['text_level'] = 3 # 图表标题 - 三级
|
|
|
+
|
|
|
+ return mineru_item
|
|
|
+
|
|
|
def _process_table(self, item: Dict, paddle_text_boxes: List[Dict],
|
|
|
start_pointer: int) -> Tuple[Dict, int]:
|
|
|
- """处理表格"""
|
|
|
+ """处理 MinerU 表格"""
|
|
|
merged_item = item.copy()
|
|
|
table_html = item.get('table_body', '')
|
|
|
|
|
|
@@ -170,41 +406,6 @@ class DataProcessor:
|
|
|
|
|
|
return merged_item, paddle_pointer, last_matched_index
|
|
|
|
|
|
- def _process_paddleocr_vl_table(self, item: Dict, paddle_text_boxes: List[Dict],
|
|
|
- start_pointer: int) -> Tuple[Dict, int]:
|
|
|
- """处理 PaddleOCR_VL 表格"""
|
|
|
- merged_item = item.copy()
|
|
|
- table_html = item.get('block_content', '')
|
|
|
-
|
|
|
- enhanced_html, cells, new_pointer = self._enhance_table_html_with_bbox(
|
|
|
- table_html, paddle_text_boxes, start_pointer
|
|
|
- )
|
|
|
-
|
|
|
- # merge item使用item的所有信息,但重写block_content为增强后的html,增加单元格信息
|
|
|
- merged_item['block_content'] = enhanced_html
|
|
|
- merged_item['block_content_with_bbox'] = enhanced_html
|
|
|
- merged_item['bbox_mapping'] = 'merged_from_paddle_ocr'
|
|
|
- merged_item['table_cells'] = cells if cells else []
|
|
|
-
|
|
|
- return merged_item, new_pointer
|
|
|
-
|
|
|
- def _process_paddleocr_vl_text(self, item: Dict, paddle_text_boxes: List[Dict],
|
|
|
- paddle_pointer: int, last_matched_index: int) -> Tuple[Dict, int, int]:
|
|
|
- """处理 PaddleOCR_VL 文本"""
|
|
|
- merged_item = item.copy()
|
|
|
- text = item.get('block_content', '')
|
|
|
-
|
|
|
- matched_bbox, paddle_pointer, last_matched_index = \
|
|
|
- self.text_matcher.find_matching_bbox(
|
|
|
- text, paddle_text_boxes, paddle_pointer, last_matched_index,
|
|
|
- self.look_ahead_window
|
|
|
- )
|
|
|
-
|
|
|
- if matched_bbox:
|
|
|
- matched_bbox['used'] = True
|
|
|
-
|
|
|
- return merged_item, paddle_pointer, last_matched_index
|
|
|
-
|
|
|
def _enhance_table_html_with_bbox(self, html: str, paddle_text_boxes: List[Dict],
|
|
|
start_pointer: int) -> Tuple[str, List[Dict], int]:
|
|
|
"""为 HTML 表格添加 bbox 信息"""
|