data_processor.py 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249
"""
Data processing module.

Processes MinerU / PaddleOCR_VL data and adds bbox information.
"""
  5. from typing import List, Dict, Tuple
  6. from bs4 import BeautifulSoup
  7. try:
  8. from .text_matcher import TextMatcher
  9. except ImportError:
  10. from text_matcher import TextMatcher
  11. class DataProcessor:
  12. """数据处理器"""
  13. def __init__(self, text_matcher: TextMatcher, look_ahead_window: int = 10):
  14. """
  15. Args:
  16. text_matcher: 文本匹配器
  17. look_ahead_window: 向前查找窗口
  18. """
  19. self.text_matcher = text_matcher
  20. self.look_ahead_window = look_ahead_window
  21. def process_mineru_data(self, mineru_data: List[Dict],
  22. paddle_text_boxes: List[Dict]) -> List[Dict]:
  23. """
  24. 处理 MinerU 数据,添加 bbox 信息
  25. Args:
  26. mineru_data: MinerU 数据
  27. paddle_text_boxes: PaddleOCR 文字框列表
  28. Returns:
  29. 合并后的数据, table cell使用paddle的bbox,其他类型只是移动指针,bbox还是沿用minerU的bbox
  30. """
  31. merged_data = []
  32. paddle_pointer = 0
  33. last_matched_index = 0
  34. # 按 bbox 排序
  35. mineru_data.sort(
  36. key=lambda x: (x['bbox'][1], x['bbox'][0])
  37. if 'bbox' in x else (float('inf'), float('inf'))
  38. )
  39. for item in mineru_data:
  40. item_type = item.get('type', '')
  41. if item_type == 'table':
  42. merged_item, paddle_pointer = self._process_table(
  43. item, paddle_text_boxes, paddle_pointer
  44. )
  45. merged_data.append(merged_item)
  46. elif item_type in ['text', 'title']:
  47. merged_item, paddle_pointer, last_matched_index = self._process_text(
  48. item, paddle_text_boxes, paddle_pointer, last_matched_index
  49. )
  50. merged_data.append(merged_item)
  51. elif item_type == 'list':
  52. merged_item, paddle_pointer, last_matched_index = self._process_list(
  53. item, paddle_text_boxes, paddle_pointer, last_matched_index
  54. )
  55. merged_data.append(merged_item)
  56. else:
  57. merged_data.append(item.copy())
  58. return merged_data
  59. def process_paddleocr_vl_data(self, paddleocr_vl_data: Dict,
  60. paddle_text_boxes: List[Dict]) -> List[Dict]:
  61. """
  62. 处理 PaddleOCR_VL 数据,添加 bbox 信息
  63. Args:
  64. paddleocr_vl_data: PaddleOCR_VL 数据 (JSON 对象)
  65. paddle_text_boxes: PaddleOCR 文字框列表
  66. Returns:
  67. 合并后的数据 (PPStruct3 格式, cell信息在parsing_res_list)
  68. """
  69. merged_data = []
  70. paddle_pointer = 0
  71. last_matched_index = 0
  72. # 提取 parsing_res_list
  73. parsing_res_list = paddleocr_vl_data.get('parsing_res_list', [])
  74. # 按 bbox 排序
  75. parsing_res_list.sort(
  76. key=lambda x: (x['block_bbox'][1], x['block_bbox'][0])
  77. if 'block_bbox' in x else (float('inf'), float('inf'))
  78. )
  79. for item in parsing_res_list:
  80. block_label = item.get('block_label', '')
  81. # PPStruct3 格式, cell信息在parsing_res_list
  82. if block_label == 'table':
  83. merged_item, paddle_pointer = self._process_paddleocr_vl_table(
  84. item, paddle_text_boxes, paddle_pointer
  85. )
  86. merged_data.append(merged_item)
  87. elif 'title' in block_label or block_label in ['text', 'number']:
  88. merged_item, paddle_pointer, last_matched_index = self._process_paddleocr_vl_text(
  89. item, paddle_text_boxes, paddle_pointer, last_matched_index
  90. )
  91. merged_data.append(merged_item)
  92. else:
  93. # 其他类型直接转换
  94. merged_data.append(item.copy())
  95. return merged_data
  96. def _process_table(self, item: Dict, paddle_text_boxes: List[Dict],
  97. start_pointer: int) -> Tuple[Dict, int]:
  98. """处理表格"""
  99. merged_item = item.copy()
  100. table_html = item.get('table_body', '')
  101. enhanced_html, cells, new_pointer = self._enhance_table_html_with_bbox(
  102. table_html, paddle_text_boxes, start_pointer
  103. )
  104. merged_item['table_body'] = enhanced_html
  105. merged_item['table_body_with_bbox'] = enhanced_html
  106. merged_item['bbox_mapping'] = 'merged_from_paddle_ocr'
  107. merged_item['table_cells'] = cells if cells else []
  108. return merged_item, new_pointer
  109. def _process_text(self, item: Dict, paddle_text_boxes: List[Dict],
  110. paddle_pointer: int, last_matched_index: int) -> Tuple[Dict, int, int]:
  111. """处理文本"""
  112. merged_item = item.copy()
  113. text = item.get('text', '')
  114. matched_bbox, paddle_pointer, last_matched_index = \
  115. self.text_matcher.find_matching_bbox(
  116. text, paddle_text_boxes, paddle_pointer, last_matched_index,
  117. self.look_ahead_window
  118. )
  119. if matched_bbox:
  120. matched_bbox['used'] = True
  121. return merged_item, paddle_pointer, last_matched_index
  122. def _process_list(self, item: Dict, paddle_text_boxes: List[Dict],
  123. paddle_pointer: int, last_matched_index: int) -> Tuple[Dict, int, int]:
  124. """处理列表"""
  125. merged_item = item.copy()
  126. list_items = item.get('list_items', [])
  127. for list_item in list_items:
  128. matched_bbox, paddle_pointer, last_matched_index = \
  129. self.text_matcher.find_matching_bbox(
  130. list_item, paddle_text_boxes, paddle_pointer, last_matched_index,
  131. self.look_ahead_window
  132. )
  133. if matched_bbox:
  134. matched_bbox['used'] = True
  135. return merged_item, paddle_pointer, last_matched_index
  136. def _process_paddleocr_vl_table(self, item: Dict, paddle_text_boxes: List[Dict],
  137. start_pointer: int) -> Tuple[Dict, int]:
  138. """处理 PaddleOCR_VL 表格"""
  139. merged_item = item.copy()
  140. table_html = item.get('block_content', '')
  141. enhanced_html, cells, new_pointer = self._enhance_table_html_with_bbox(
  142. table_html, paddle_text_boxes, start_pointer
  143. )
  144. # merge item使用item的所有信息,但重写block_content为增强后的html,增加单元格信息
  145. merged_item['block_content'] = enhanced_html
  146. merged_item['block_content_with_bbox'] = enhanced_html
  147. merged_item['bbox_mapping'] = 'merged_from_paddle_ocr'
  148. merged_item['table_cells'] = cells if cells else []
  149. return merged_item, new_pointer
  150. def _process_paddleocr_vl_text(self, item: Dict, paddle_text_boxes: List[Dict],
  151. paddle_pointer: int, last_matched_index: int) -> Tuple[Dict, int, int]:
  152. """处理 PaddleOCR_VL 文本"""
  153. merged_item = item.copy()
  154. text = item.get('block_content', '')
  155. matched_bbox, paddle_pointer, last_matched_index = \
  156. self.text_matcher.find_matching_bbox(
  157. text, paddle_text_boxes, paddle_pointer, last_matched_index,
  158. self.look_ahead_window
  159. )
  160. if matched_bbox:
  161. matched_bbox['used'] = True
  162. return merged_item, paddle_pointer, last_matched_index
  163. def _enhance_table_html_with_bbox(self, html: str, paddle_text_boxes: List[Dict],
  164. start_pointer: int) -> Tuple[str, List[Dict], int]:
  165. """为 HTML 表格添加 bbox 信息"""
  166. soup = BeautifulSoup(html, 'html.parser')
  167. current_pointer = start_pointer
  168. last_matched_index = start_pointer
  169. cells = []
  170. for row_idx, row in enumerate(soup.find_all('tr')):
  171. for col_idx, cell in enumerate(row.find_all(['td', 'th'])):
  172. cell_text = cell.get_text(strip=True)
  173. if not cell_text:
  174. continue
  175. matched_bbox, current_pointer, last_matched_index = \
  176. self.text_matcher.find_matching_bbox(
  177. cell_text, paddle_text_boxes, current_pointer,
  178. last_matched_index, self.look_ahead_window
  179. )
  180. if matched_bbox:
  181. bbox = matched_bbox['bbox']
  182. cell['data-bbox'] = f"[{bbox[0]},{bbox[1]},{bbox[2]},{bbox[3]}]"
  183. cell['data-score'] = f"{matched_bbox['score']:.4f}"
  184. cell['data-paddle-index'] = str(matched_bbox['paddle_bbox_index'])
  185. # ✅ 完整记录单元格信息
  186. cells.append({
  187. 'type': 'table_cell',
  188. 'text': cell_text,
  189. 'bbox': bbox,
  190. 'row': row_idx + 1,
  191. 'col': col_idx + 1,
  192. 'score': matched_bbox['score'],
  193. 'paddle_bbox_index': matched_bbox['paddle_bbox_index']
  194. })
  195. matched_bbox['used'] = True
  196. # ✅ 如果匹配失败,不应该添加到 cells 中
  197. return str(soup), cells, current_pointer