data_processor.py
"""
Data processing module.
Handles MinerU / PaddleOCR_VL / DotsOCR data and attaches bbox information.
"""
from typing import List, Dict, Tuple, Optional
from bs4 import BeautifulSoup

try:
    from .text_matcher import TextMatcher
    from .bbox_extractor import BBoxExtractor
except ImportError:
    from text_matcher import TextMatcher
    from bbox_extractor import BBoxExtractor


class DataProcessor:
    """Data processor."""

    def __init__(self, text_matcher: TextMatcher, look_ahead_window: int = 10, x_tolerance: int = 3):
        """
        Args:
            text_matcher: text matcher
            look_ahead_window: look-ahead window size
            x_tolerance: x-axis tolerance
        """
        self.text_matcher = text_matcher
        self.look_ahead_window = look_ahead_window
        # X-axis tolerance, used to decide whether text boxes belong to the same column
        self.x_tolerance = x_tolerance

    def process_mineru_data(self, mineru_data: List[Dict],
                            paddle_text_boxes: List[Dict]) -> List[Dict]:
        """
        Process MinerU data and attach bbox information.

        Args:
            mineru_data: MinerU data
            paddle_text_boxes: list of PaddleOCR text boxes

        Returns:
            Merged data. Table cells use Paddle bboxes; other types only advance
            the pointer and keep the original MinerU bbox.
        """
        merged_data = []
        paddle_pointer = 0
        last_matched_index = 0
        # Sort by bbox (top to bottom, then left to right)
        mineru_data.sort(
            key=lambda x: (x['bbox'][1], x['bbox'][0])
            if 'bbox' in x else (float('inf'), float('inf'))
        )
        for item in mineru_data:
            item_type = item.get('type', '')
            if item_type == 'table':
                merged_item, paddle_pointer = self._process_table(
                    item, paddle_text_boxes, paddle_pointer
                )
                merged_data.append(merged_item)
            elif item_type in ['text', 'title']:
                merged_item, paddle_pointer, last_matched_index = self._process_text(
                    item, paddle_text_boxes, paddle_pointer, last_matched_index
                )
                merged_data.append(merged_item)
            elif item_type == 'list':
                merged_item, paddle_pointer, last_matched_index = self._process_list(
                    item, paddle_text_boxes, paddle_pointer, last_matched_index
                )
                merged_data.append(merged_item)
            else:
                merged_data.append(item.copy())
        return merged_data

    def process_dotsocr_data(self, dotsocr_data: List[Dict],
                             paddle_text_boxes: List[Dict]) -> List[Dict]:
        """
        🎯 Process DotsOCR data: convert to MinerU format and attach bbox information.

        Args:
            dotsocr_data: DotsOCR data
            paddle_text_boxes: list of PaddleOCR text boxes

        Returns:
            Merged data in MinerU format
        """
        merged_data = []
        paddle_pointer = 0
        last_matched_index = 0
        # Sort by bbox
        dotsocr_data.sort(
            key=lambda x: (x['bbox'][1], x['bbox'][0])
            if 'bbox' in x else (float('inf'), float('inf'))
        )
        for item in dotsocr_data:
            # 🎯 Convert to MinerU format
            mineru_item = self._convert_dotsocr_to_mineru(item)
            category = mineru_item.get('type', '')
            # 🎯 Dispatch by type
            if category.lower() == 'table':
                merged_item, paddle_pointer = self._process_table(
                    mineru_item, paddle_text_boxes, paddle_pointer
                )
                merged_data.append(merged_item)
            elif category.lower() in ['text', 'title', 'header', 'footer']:
                merged_item, paddle_pointer, last_matched_index = self._process_text(
                    mineru_item, paddle_text_boxes, paddle_pointer, last_matched_index
                )
                merged_data.append(merged_item)
            elif category.lower() == 'list':
                merged_item, paddle_pointer, last_matched_index = self._process_list(
                    mineru_item, paddle_text_boxes, paddle_pointer, last_matched_index
                )
                merged_data.append(merged_item)
            else:
                # Page-header, Page-footer, Picture, etc.
                merged_data.append(mineru_item)
        return merged_data

    def _convert_dotsocr_to_mineru(self, dotsocr_item: Dict) -> Dict:
        """
        🎯 Convert a DotsOCR item to MinerU format.

        DotsOCR:
            {
                "category": "Table",
                "bbox": [x1, y1, x2, y2],
                "text": "..."
            }
        MinerU:
            {
                "type": "table",
                "bbox": [x1, y1, x2, y2],
                "table_body": "...",
                "page_idx": 0
            }
        """
        category = dotsocr_item.get('category', '')
        # 🎯 Category mapping
        category_map = {
            'Page-header': 'header',
            'Page-footer': 'footer',
            'Picture': 'image',
            'Figure': 'image',
            'Section-header': 'title',
            'Table': 'table',
            'Text': 'text',
            'Title': 'title',
            'List': 'list',
            'Caption': 'title'
        }
        mineru_type = category_map.get(category, 'text')
        # 🎯 Base conversion
        mineru_item = {
            'type': mineru_type,
            'bbox': dotsocr_item.get('bbox', []),
            'page_idx': 0  # DotsOCR defaults to a single page
        }
        # 🎯 Text content
        text = dotsocr_item.get('text', '')
        if mineru_type == 'table':
            # Table: text -> table_body
            mineru_item['table_body'] = text
        else:
            # Other types: keep text
            mineru_item['text'] = text
        # Title level
        if category == 'Section-header':
            mineru_item['text_level'] = 1
        return mineru_item

    def process_paddleocr_vl_data(self, paddleocr_vl_data: Dict,
                                  paddle_text_boxes: List[Dict]) -> List[Dict]:
        """
        Process PaddleOCR_VL data and attach bbox information.

        Args:
            paddleocr_vl_data: PaddleOCR_VL data (JSON object)
            paddle_text_boxes: list of PaddleOCR text boxes

        Returns:
            🎯 Merged data in MinerU format (unified output format)
        """
        merged_data = []
        paddle_pointer = 0
        last_matched_index = 0
        # 🎯 Get the rotation angle and the original image size
        rotation_angle = self._get_rotation_angle_from_vl(paddleocr_vl_data)
        orig_image_size = None
        if rotation_angle != 0:
            orig_image_size = self._get_original_image_size_from_vl(paddleocr_vl_data)
            print(f"🔄 PaddleOCR_VL detected rotation angle: {rotation_angle}°")
            print(f"📐 Original image size: {orig_image_size[0]} x {orig_image_size[1]}")
        # Extract parsing_res_list
        parsing_res_list = paddleocr_vl_data.get('parsing_res_list', [])
        # Sort by bbox
        parsing_res_list.sort(
            key=lambda x: (x['block_bbox'][1], x['block_bbox'][0])
            if 'block_bbox' in x else (float('inf'), float('inf'))
        )
        for item in parsing_res_list:
            # 🎯 Transform the bbox coordinates first (if needed)
            if rotation_angle != 0 and orig_image_size:
                item = self._transform_vl_block_bbox(item, rotation_angle, orig_image_size)
            # 🎯 Convert to the unified MinerU format
            mineru_item = self._convert_paddleocr_vl_to_mineru(item)
            item_type = mineru_item.get('type', '')
            # 🎯 Dispatch by type (reuses the generic MinerU handlers)
            if item_type == 'table':
                merged_item, paddle_pointer = self._process_table(
                    mineru_item, paddle_text_boxes, paddle_pointer
                )
                merged_data.append(merged_item)
            elif item_type in ['text', 'title', 'header', 'footer', 'equation']:
                merged_item, paddle_pointer, last_matched_index = self._process_text(
                    mineru_item, paddle_text_boxes, paddle_pointer, last_matched_index
                )
                merged_data.append(merged_item)
            elif item_type == 'list':
                merged_item, paddle_pointer, last_matched_index = self._process_list(
                    mineru_item, paddle_text_boxes, paddle_pointer, last_matched_index
                )
                merged_data.append(merged_item)
            else:
                # Other types (image, etc.) are added as-is
                merged_data.append(mineru_item)
        return merged_data

    def _get_rotation_angle_from_vl(self, paddleocr_vl_data: Dict) -> float:
        """Get the rotation angle from PaddleOCR_VL data."""
        return BBoxExtractor._get_rotation_angle(paddleocr_vl_data)

    def _get_original_image_size_from_vl(self, paddleocr_vl_data: Dict) -> tuple:
        """Get the original image size from PaddleOCR_VL data."""
        return BBoxExtractor._get_original_image_size(paddleocr_vl_data)

    def _transform_vl_block_bbox(self, item: Dict, angle: float,
                                 orig_image_size: tuple) -> Dict:
        """
        Transform the block_bbox coordinates of a PaddleOCR_VL block.

        Args:
            item: PaddleOCR_VL block data
            angle: rotation angle
            orig_image_size: original image size

        Returns:
            Block data with transformed coordinates
        """
        transformed_item = item.copy()
        if 'block_bbox' not in item:
            return transformed_item
        block_bbox = item['block_bbox']
        if len(block_bbox) < 4:
            return transformed_item
        # block_bbox format: [x1, y1, x2, y2]
        # Convert to poly format for the rotation
        poly = [
            [block_bbox[0], block_bbox[1]],  # top-left
            [block_bbox[2], block_bbox[1]],  # top-right
            [block_bbox[2], block_bbox[3]],  # bottom-right
            [block_bbox[0], block_bbox[3]]   # bottom-left
        ]
        # 🎯 Use BBoxExtractor's coordinate transform
        transformed_poly = BBoxExtractor._inverse_rotate_coordinates(
            poly, angle, orig_image_size
        )
        # Convert back to bbox format
        xs = [p[0] for p in transformed_poly]
        ys = [p[1] for p in transformed_poly]
        transformed_bbox = [min(xs), min(ys), max(xs), max(ys)]
        transformed_item['block_bbox'] = transformed_bbox
        return transformed_item

    def _convert_paddleocr_vl_to_mineru(self, paddleocr_vl_item: Dict) -> Dict:
        """
        🎯 Convert a PaddleOCR_VL item to MinerU format.
        Based on the 20 categories of PP-DocLayout_plus-L.
        """
        block_label = paddleocr_vl_item.get('block_label', '')
        # 🎯 PP-DocLayout_plus-L label mapping (20 categories in total)
        label_map = {
            # Titles (3)
            'paragraph_title': 'title',
            'doc_title': 'title',
            'figure_table_chart_title': 'title',
            # Text (9)
            'text': 'text',
            'number': 'text',
            'content': 'text',
            'abstract': 'text',
            'footnote': 'text',
            'aside_text': 'text',
            'algorithm': 'text',
            'reference': 'text',
            'reference_content': 'text',
            # Header / footer (2)
            'header': 'header',
            'footer': 'footer',
            # Table (1)
            'table': 'table',
            # Images / charts (3)
            'image': 'image',
            'chart': 'image',
            'seal': 'image',
            # Formulas (2)
            'formula': 'equation',
            'formula_number': 'equation'
        }
        mineru_type = label_map.get(block_label, 'text')
        mineru_item = {
            'type': mineru_type,
            'bbox': paddleocr_vl_item.get('block_bbox', []),
            'page_idx': 0
        }
        content = paddleocr_vl_item.get('block_content', '')
        if mineru_type == 'table':
            mineru_item['table_body'] = content
        else:
            mineru_item['text'] = content
        # Title level
        if block_label == 'doc_title':
            mineru_item['text_level'] = 1
        elif block_label == 'paragraph_title':
            mineru_item['text_level'] = 2
        elif block_label == 'figure_table_chart_title':
            mineru_item['text_level'] = 3
        return mineru_item

    def _process_table(self, item: Dict, paddle_text_boxes: List[Dict],
                       start_pointer: int) -> Tuple[Dict, int]:
        """
        Process a table item (MinerU format).

        Strategy:
        - Parse the HTML table
        - Match a PaddleOCR bbox to each cell
        - Return the processed table and the new pointer position
        """
        table_body = item.get('table_body', '')
        if not table_body:
            print("⚠️ Table body is empty, skipping")
            return item, start_pointer
        try:
            # 🔑 Pass table_bbox for region filtering
            table_bbox = item.get('bbox')  # table boundary provided by MinerU
            enhanced_html, cells, new_pointer = self._enhance_table_html_with_bbox(
                table_body,
                paddle_text_boxes,
                start_pointer,
                table_bbox  # ✅ pass the bounding box
            )
            # Update the item
            item['table_body'] = enhanced_html
            item['table_cells'] = cells
            # Statistics
            matched_count = len(cells)
            total_cells = len(BeautifulSoup(table_body, 'html.parser').find_all(['td', 'th']))
            print(f"   Table cells: {matched_count}/{total_cells} matched")
            return item, new_pointer
        except Exception as e:
            print(f"⚠️ Table processing failed: {e}")
            import traceback
            traceback.print_exc()
            return item, start_pointer

    def _process_text(self, item: Dict, paddle_text_boxes: List[Dict],
                      paddle_pointer: int, last_matched_index: int) -> Tuple[Dict, int, int]:
        """Process a text item."""
        merged_item = item.copy()
        text = item.get('text', '')
        matched_bbox, paddle_pointer, last_matched_index = \
            self.text_matcher.find_matching_bbox(
                text, paddle_text_boxes, paddle_pointer, last_matched_index,
                self.look_ahead_window
            )
        if matched_bbox:
            matched_bbox['used'] = True
        return merged_item, paddle_pointer, last_matched_index

    def _process_list(self, item: Dict, paddle_text_boxes: List[Dict],
                      paddle_pointer: int, last_matched_index: int) -> Tuple[Dict, int, int]:
        """Process a list item."""
        merged_item = item.copy()
        list_items = item.get('list_items', [])
        for list_item in list_items:
            matched_bbox, paddle_pointer, last_matched_index = \
                self.text_matcher.find_matching_bbox(
                    list_item, paddle_text_boxes, paddle_pointer, last_matched_index,
                    self.look_ahead_window
                )
            if matched_bbox:
                matched_bbox['used'] = True
        return merged_item, paddle_pointer, last_matched_index

    def _enhance_table_html_with_bbox(self, html: str, paddle_text_boxes: List[Dict],
                                      start_pointer: int, table_bbox: Optional[List[int]] = None) -> Tuple[str, List[Dict], int]:
        """
        Attach bbox information to an HTML table (optimized: filter to the table region first).

        Strategy:
        1. Filter paddle_text_boxes down to the table region using table_bbox
        2. Group the filtered boxes into rows
        3. Match HTML rows against the paddle row groups
        4. Look up cells within the matched groups

        Args:
            html: HTML table
            paddle_text_boxes: all paddle OCR results
            start_pointer: starting position
            table_bbox: table bounding box [x1, y1, x2, y2]
        """
        soup = BeautifulSoup(html, 'html.parser')
        cells = []
        # 🔑 Step 1: filter the paddle boxes inside the table region
        table_region_boxes, actual_table_bbox = self._filter_boxes_in_table_region(
            paddle_text_boxes[start_pointer:],
            table_bbox,
            html
        )
        if not table_region_boxes:
            print("⚠️ No paddle boxes found in the table region")
            return str(soup), cells, start_pointer
        print(f"📊 Table region: {len(table_region_boxes)} text boxes")
        print(f"   Bounds: {actual_table_bbox}")
        # 🔑 Step 2: group the table-region boxes into rows
        grouped_boxes = self._group_paddle_boxes_by_rows(
            table_region_boxes,
            y_tolerance=20
        )
        # 🔑 Step 3: sort each group by x coordinate
        for group in grouped_boxes:
            group['boxes'].sort(key=lambda x: x['bbox'][0])
        grouped_boxes.sort(key=lambda g: g['y_center'])
        print(f"   Grouping: {len(grouped_boxes)} rows")
        # 🔑 Step 4: match HTML rows against the paddle row groups
        html_rows = soup.find_all('tr')
        row_mapping = self._match_html_rows_to_paddle_groups(html_rows, grouped_boxes)
        print(f"   HTML rows: {len(html_rows)}")
        print(f"   Mapping: {len([v for v in row_mapping.values() if v])} valid mappings")
        # 🔑 Step 5: walk the HTML table and look up cells via the mapping
        for row_idx, row in enumerate(html_rows):
            group_indices = row_mapping.get(row_idx, [])
            if not group_indices:
                continue
            # Merge the boxes of all mapped groups
            current_boxes = []
            for group_idx in group_indices:
                if group_idx < len(grouped_boxes):
                    current_boxes.extend(grouped_boxes[group_idx]['boxes'])
            current_boxes.sort(key=lambda x: x['bbox'][0])
            # 🎯 Key improvement: extract the HTML cells and pre-compute column boundaries
            html_cells = row.find_all(['td', 'th'])
            if not html_cells:
                continue
            # 🔑 Estimate column boundaries (based on the x-coordinate distribution)
            col_boundaries = self._estimate_column_boundaries(
                current_boxes,
                len(html_cells)
            )
            print(f"   Row {row_idx + 1}: {len(html_cells)} columns, boundaries: {col_boundaries}")
            # 🎯 Key improvement: sequential pointer matching
            box_pointer = 0  # pointer into the current row's boxes
            for col_idx, cell in enumerate(html_cells):
                cell_text = cell.get_text(strip=True)
                if not cell_text:
                    continue
                # 🔑 Match starting from the current pointer
                matched_result = self._match_cell_sequential(
                    cell_text,
                    current_boxes,
                    col_boundaries,
                    box_pointer
                )
                if matched_result:
                    merged_bbox = matched_result['bbox']
                    merged_text = matched_result['text']
                    cell['data-bbox'] = f"[{merged_bbox[0]},{merged_bbox[1]},{merged_bbox[2]},{merged_bbox[3]}]"
                    cell['data-score'] = f"{matched_result['score']:.4f}"
                    cell['data-paddle-indices'] = str(matched_result['paddle_indices'])
                    cells.append({
                        'type': 'table_cell',
                        'text': cell_text,
                        'matched_text': merged_text,
                        'bbox': merged_bbox,
                        'row': row_idx + 1,
                        'col': col_idx + 1,
                        'score': matched_result['score'],
                        'paddle_bbox_indices': matched_result['paddle_indices']
                    })
                    # Mark the boxes as used
                    for box in matched_result['used_boxes']:
                        box['used'] = True
                    # 🎯 Advance the pointer past the last used box
                    box_pointer = matched_result['last_used_index'] + 1
                    print(f"   Column {col_idx + 1}: '{cell_text[:20]}...' matched {len(matched_result['used_boxes'])} boxes (pointer: {box_pointer})")
        # Compute the new pointer position
        used_count = sum(1 for box in table_region_boxes if box.get('used'))
        new_pointer = start_pointer + used_count
        print(f"   Matched: {len(cells)} cells")
        return str(soup), cells, new_pointer

    def _estimate_column_boundaries(self, boxes: List[Dict],
                                    num_cols: int) -> List[Tuple[int, int]]:
        """
        Estimate column boundaries (improved: handles multiple text boxes per column).

        Args:
            boxes: all boxes of the current row (already sorted by x)
            num_cols: number of columns in the HTML table

        Returns:
            List of column boundaries [(x_start, x_end), ...]
        """
        if not boxes:
            return []
        # 🔑 Key improvement: cluster by x coordinate first (merge multiple boxes of the same column)
        x_clusters = self._cluster_boxes_by_x(boxes, x_tolerance=self.x_tolerance)
        print(f"   X clustering: {len(boxes)} boxes -> {len(x_clusters)} column clusters")
        # Overall x-coordinate range of all clusters
        x_min = min(cluster['x_min'] for cluster in x_clusters)
        x_max = max(cluster['x_max'] for cluster in x_clusters)
        # 🎯 Strategy 1: cluster count <= column count
        if len(x_clusters) <= num_cols:
            # Use the cluster boundaries directly
            boundaries = [(cluster['x_min'], cluster['x_max'])
                          for cluster in x_clusters]
            return boundaries
        # 🎯 Strategy 2: more clusters than columns (some columns contain several text clusters)
        if len(x_clusters) > num_cols:
            print(f"   ℹ️ Cluster count {len(x_clusters)} > column count {num_cols}, merging close clusters")
            # Merge close clusters
            merged_clusters = self._merge_close_clusters(x_clusters, num_cols)
            boundaries = [(cluster['x_min'], cluster['x_max'])
                          for cluster in merged_clusters]
            return boundaries
        return []

    def _cluster_boxes_by_x(self, boxes: List[Dict],
                            x_tolerance: int = 3) -> List[Dict]:
        """
        Cluster boxes by x coordinate (merge multiple text boxes of the same column).

        Args:
            boxes: list of text boxes
            x_tolerance: x-coordinate tolerance

        Returns:
            List of clusters [{'x_min': int, 'x_max': int, 'boxes': List[Dict]}, ...]
        """
        if not boxes:
            return []
        # Sort by left x coordinate
        sorted_boxes = sorted(boxes, key=lambda b: b['bbox'][0])
        clusters = []
        current_cluster = None
        for box in sorted_boxes:
            bbox = box['bbox']
            x_start = bbox[0]
            x_end = bbox[2]
            if current_cluster is None:
                # Start a new cluster
                current_cluster = {
                    'x_min': x_start,
                    'x_max': x_end,
                    'boxes': [box]
                }
            else:
                # 🔑 Check whether the box belongs to the current cluster (corrected logic):
                # 1. the x ranges overlap: x_start <= current_x_max and x_end >= current_x_min
                # 2. or the gap is within the tolerance
                has_overlap = (x_start <= current_cluster['x_max'] and
                               x_end >= current_cluster['x_min'])
                is_close = abs(x_start - current_cluster['x_max']) <= x_tolerance
                if has_overlap or is_close:
                    # Merge into the current cluster
                    current_cluster['boxes'].append(box)
                    current_cluster['x_min'] = min(current_cluster['x_min'], x_start)
                    current_cluster['x_max'] = max(current_cluster['x_max'], x_end)
                else:
                    # Close the current cluster and start a new one
                    clusters.append(current_cluster)
                    current_cluster = {
                        'x_min': x_start,
                        'x_max': x_end,
                        'boxes': [box]
                    }
        # Append the last cluster
        if current_cluster:
            clusters.append(current_cluster)
        return clusters
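
    # Illustrative example (comment only, not executed): with x_tolerance=3, boxes whose
    # x ranges are [100, 180], [105, 160] and [300, 380] form two column clusters,
    # [100, 180] (the first two boxes overlap in x) and [300, 380], because the gap
    # 300 - 180 = 120 exceeds the tolerance and there is no overlap.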

    def _merge_close_clusters(self, clusters: List[Dict],
                              target_count: int) -> List[Dict]:
        """
        Merge close clusters until the count equals the target column count.

        Args:
            clusters: list of clusters
            target_count: target column count

        Returns:
            Merged cluster list
        """
        if len(clusters) <= target_count:
            return clusters
        # Work on a copy to avoid mutating the input
        working_clusters = [c.copy() for c in clusters]
        while len(working_clusters) > target_count:
            # Find the two closest neighbouring clusters
            min_distance = float('inf')
            merge_idx = 0
            for i in range(len(working_clusters) - 1):
                distance = working_clusters[i + 1]['x_min'] - working_clusters[i]['x_max']
                if distance < min_distance:
                    min_distance = distance
                    merge_idx = i
            # Merge them
            cluster1 = working_clusters[merge_idx]
            cluster2 = working_clusters[merge_idx + 1]
            merged_cluster = {
                'x_min': cluster1['x_min'],
                'x_max': cluster2['x_max'],
                'boxes': cluster1['boxes'] + cluster2['boxes']
            }
            # Replace the pair with the merged cluster
            working_clusters[merge_idx] = merged_cluster
            working_clusters.pop(merge_idx + 1)
        return working_clusters

    def _get_boxes_in_column(self, boxes: List[Dict],
                             boundaries: List[Tuple[int, int]],
                             col_idx: int) -> List[Dict]:
        """
        Get the boxes within a given column range (improved: include overlaps).

        Args:
            boxes: all boxes of the current row
            boundaries: column boundaries
            col_idx: column index

        Returns:
            Boxes belonging to that column
        """
        if col_idx >= len(boundaries):
            return []
        x_start, x_end = boundaries[col_idx]
        col_boxes = []
        for box in boxes:
            bbox = box['bbox']
            box_x_start = bbox[0]
            box_x_end = bbox[2]
            # 🔑 Improvement: check for overlap (not just the centre point)
            overlap = not (box_x_start > x_end or box_x_end < x_start)
            if overlap:
                col_boxes.append(box)
        return col_boxes

    def _filter_boxes_in_table_region(self, paddle_boxes: List[Dict],
                                      table_bbox: Optional[List[int]],
                                      html: str) -> Tuple[List[Dict], List[int]]:
        """
        Filter the paddle boxes that fall inside the table region.

        Strategy:
        1. If table_bbox is available, filter by the (expanded) bounding box
        2. Otherwise, infer the region by matching contents

        Args:
            paddle_boxes: paddle OCR results
            table_bbox: table bounding box [x1, y1, x2, y2]
            html: HTML content (used for content validation)

        Returns:
            (filtered boxes, actual table bounding box)
        """
        if not paddle_boxes:
            return [], [0, 0, 0, 0]
        # 🎯 Strategy 1: use the provided table_bbox (with an expanded margin)
        if table_bbox and len(table_bbox) == 4:
            x1, y1, x2, y2 = table_bbox
            # Expand the bounds (to catch text just outside the border)
            margin = 20
            expanded_bbox = [
                max(0, x1 - margin),
                max(0, y1 - margin),
                x2 + margin,
                y2 + margin
            ]
            filtered = []
            for box in paddle_boxes:
                bbox = box['bbox']
                box_center_x = (bbox[0] + bbox[2]) / 2
                box_center_y = (bbox[1] + bbox[3]) / 2
                # Centre point inside the expanded region
                if (expanded_bbox[0] <= box_center_x <= expanded_bbox[2] and
                        expanded_bbox[1] <= box_center_y <= expanded_bbox[3]):
                    filtered.append(box)
            if filtered:
                # Compute the actual bounding box
                actual_bbox = [
                    min(b['bbox'][0] for b in filtered),
                    min(b['bbox'][1] for b in filtered),
                    max(b['bbox'][2] for b in filtered),
                    max(b['bbox'][3] for b in filtered)
                ]
                return filtered, actual_bbox
        # 🎯 Strategy 2: infer the region by matching contents
        print("   ℹ️ No table_bbox, inferring the table region from contents...")
        # Extract all text from the HTML
        soup = BeautifulSoup(html, 'html.parser')
        html_texts = set()
        for cell in soup.find_all(['td', 'th']):
            text = cell.get_text(strip=True)
            if text:
                html_texts.add(self.text_matcher.normalize_text(text))
        if not html_texts:
            return [], [0, 0, 0, 0]
        # Find boxes whose text matches the HTML contents
        matched_boxes = []
        for box in paddle_boxes:
            normalized_text = self.text_matcher.normalize_text(box['text'])
            # Check for a match
            if any(normalized_text in ht or ht in normalized_text
                   for ht in html_texts):
                matched_boxes.append(box)
        if not matched_boxes:
            # 🔑 Fallback: if exact matching fails, try fuzzy matching
            print("   ℹ️ Exact matching failed, trying fuzzy matching...")
            from fuzzywuzzy import fuzz
            for box in paddle_boxes:
                normalized_text = self.text_matcher.normalize_text(box['text'])
                for ht in html_texts:
                    similarity = fuzz.partial_ratio(normalized_text, ht)
                    if similarity >= 70:  # lowered threshold
                        matched_boxes.append(box)
                        break
        if matched_boxes:
            # Compute the bounding box
            actual_bbox = [
                min(b['bbox'][0] for b in matched_boxes),
                min(b['bbox'][1] for b in matched_boxes),
                max(b['bbox'][2] for b in matched_boxes),
                max(b['bbox'][3] for b in matched_boxes)
            ]
            # 🔑 Expand the bounds to include possibly missed text
            margin = 30
            expanded_bbox = [
                max(0, actual_bbox[0] - margin),
                max(0, actual_bbox[1] - margin),
                actual_bbox[2] + margin,
                actual_bbox[3] + margin
            ]
            # Re-filter (include text on the boundary)
            final_filtered = []
            for box in paddle_boxes:
                bbox = box['bbox']
                box_center_x = (bbox[0] + bbox[2]) / 2
                box_center_y = (bbox[1] + bbox[3]) / 2
                if (expanded_bbox[0] <= box_center_x <= expanded_bbox[2] and
                        expanded_bbox[1] <= box_center_y <= expanded_bbox[3]):
                    final_filtered.append(box)
            return final_filtered, actual_bbox
        # 🔑 Last fallback: return all boxes
        print("   ⚠️ Could not determine the table region, using all paddle boxes")
        if paddle_boxes:
            actual_bbox = [
                min(b['bbox'][0] for b in paddle_boxes),
                min(b['bbox'][1] for b in paddle_boxes),
                max(b['bbox'][2] for b in paddle_boxes),
                max(b['bbox'][3] for b in paddle_boxes)
            ]
            return paddle_boxes, actual_bbox
        return [], [0, 0, 0, 0]

    def _group_paddle_boxes_by_rows(self, paddle_boxes: List[Dict],
                                    y_tolerance: int = 20) -> List[Dict]:
        """
        Group paddle_text_boxes by y coordinate (clustering).

        Args:
            paddle_boxes: list of Paddle OCR text boxes
            y_tolerance: y-coordinate tolerance (pixels)

        Returns:
            List of groups, each {'y_center': float, 'boxes': List[Dict]}
        """
        if not paddle_boxes:
            return []
        # Compute the centre y coordinate of each box
        boxes_with_y = []
        for box in paddle_boxes:
            bbox = box['bbox']
            y_center = (bbox[1] + bbox[3]) / 2
            boxes_with_y.append({
                'y_center': y_center,
                'box': box
            })
        # Sort by y coordinate
        boxes_with_y.sort(key=lambda x: x['y_center'])
        # Cluster
        groups = []
        current_group = None
        for item in boxes_with_y:
            if current_group is None:
                # Start a new group
                current_group = {
                    'y_center': item['y_center'],
                    'boxes': [item['box']]
                }
            else:
                # Check whether the box belongs to the current group
                if abs(item['y_center'] - current_group['y_center']) <= y_tolerance:
                    current_group['boxes'].append(item['box'])
                    # Update the group centre (average of member centres)
                    current_group['y_center'] = sum(
                        b['bbox'][1] + b['bbox'][3] for b in current_group['boxes']
                    ) / (2 * len(current_group['boxes']))
                else:
                    # Close the current group and start a new one
                    groups.append(current_group)
                    current_group = {
                        'y_center': item['y_center'],
                        'boxes': [item['box']]
                    }
        # Append the last group
        if current_group:
            groups.append(current_group)
        return groups
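
    # Illustrative example (comment only, not executed): with y_tolerance=20, boxes whose
    # centres sit at y = 101, 108 and 185 produce two row groups: {101, 108} (the running
    # group centre stays within 20 px of both) and {185}, returned in top-to-bottom order.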

    def _match_html_rows_to_paddle_groups(self, html_rows: List,
                                          grouped_boxes: List[Dict]) -> Dict[int, List[int]]:
        """
        Match HTML rows against paddle groups (improved: handles text spanning rows).

        Strategy:
        1. First pass: exact matching based on contents
        2. Second pass: merge unused groups into the nearest matched row
        """
        if not html_rows or not grouped_boxes:
            return {}
        mapping = {}
        # 🎯 Strategy 1: equal counts, simple 1:1 mapping
        if len(html_rows) == len(grouped_boxes):
            for i in range(len(html_rows)):
                mapping[i] = [i]
            return mapping
        # 🎯 Strategy 2: first pass - content-based matching
        used_groups = set()
        for row_idx, row in enumerate(html_rows):
            row_texts = [cell.get_text(strip=True) for cell in row.find_all(['td', 'th'])]
            row_texts = [t for t in row_texts if t]
            if not row_texts:
                mapping[row_idx] = []
                continue
            row_text_normalized = [self.text_matcher.normalize_text(t) for t in row_texts]
            # Find the best-matching paddle group
            best_groups = []
            best_score = 0
            # Try to match a single group
            for group_idx, group in enumerate(grouped_boxes):
                if group_idx in used_groups:
                    continue
                group_texts = [self.text_matcher.normalize_text(b['text'])
                               for b in group['boxes'] if not b.get('used')]
                match_count = sum(1 for rt in row_text_normalized
                                  if any(rt in gt or gt in rt for gt in group_texts))
                coverage = match_count / len(row_texts) if row_texts else 0
                if coverage > best_score:
                    best_score = coverage
                    best_groups = [group_idx]
            # 🔑 If the single-group match is weak, try several consecutive groups
            if best_score < 0.5:
                # Search forward from the current position
                start_group = min([g for g in range(len(grouped_boxes)) if g not in used_groups],
                                  default=0)
                combined_texts = []
                combined_groups = []
                for group_idx in range(start_group, min(start_group + 5, len(grouped_boxes))):
                    if group_idx in used_groups:
                        continue
                    combined_groups.append(group_idx)
                    combined_texts.extend([
                        self.text_matcher.normalize_text(b['text'])
                        for b in grouped_boxes[group_idx]['boxes']
                        if not b.get('used')
                    ])
                    match_count = sum(1 for rt in row_text_normalized
                                      if any(rt in gt or gt in rt for gt in combined_texts))
                    coverage = match_count / len(row_texts) if row_texts else 0
                    if coverage > best_score:
                        best_score = coverage
                        best_groups = combined_groups.copy()
            # Record the mapping
            if best_groups and best_score > 0.3:
                mapping[row_idx] = best_groups
                used_groups.update(best_groups)
            else:
                # Fallback: positional guess
                estimated_group = min(row_idx, len(grouped_boxes) - 1)
                if estimated_group not in used_groups:
                    mapping[row_idx] = [estimated_group]
                    used_groups.add(estimated_group)
                else:
                    mapping[row_idx] = []
        # 🎯 Strategy 3: second pass - handle unused groups (important!)
        unused_groups = [i for i in range(len(grouped_boxes)) if i not in used_groups]
        if unused_groups:
            print(f"   ℹ️ Found {len(unused_groups)} unmatched paddle groups: {unused_groups}")
            # 🔑 Merge the unused groups into the nearest matched row
            for unused_idx in unused_groups:
                # 🎯 Key improvement: compare boundary distances to the neighbouring groups
                unused_group = grouped_boxes[unused_idx]
                unused_y_min = min(b['bbox'][1] for b in unused_group['boxes'])
                unused_y_max = max(b['bbox'][3] for b in unused_group['boxes'])
                # 🔑 Find the nearest used groups above and below
                above_idx = None
                below_idx = None
                above_distance = float('inf')
                below_distance = float('inf')
                # Search upwards
                for i in range(unused_idx - 1, -1, -1):
                    if i in used_groups:
                        above_idx = i
                        # 🎯 Boundary distance: unused group's min y vs. the lowest box of the group above
                        above_group = grouped_boxes[i]
                        max_y_box = max(
                            above_group['boxes'],
                            key=lambda b: b['bbox'][3]
                        )
                        above_y_center = (max_y_box['bbox'][1] + max_y_box['bbox'][3]) / 2
                        above_distance = abs(unused_y_min - above_y_center)
                        print(f"   • Group {unused_idx} distance to group {i} above: {above_distance:.1f}px")
                        break
                # Search downwards
                for i in range(unused_idx + 1, len(grouped_boxes)):
                    if i in used_groups:
                        below_idx = i
                        # 🎯 Boundary distance: the highest box of the group below vs. unused group's max y
                        below_group = grouped_boxes[i]
                        min_y_box = min(
                            below_group['boxes'],
                            key=lambda b: b['bbox'][1]
                        )
                        below_y_center = (min_y_box['bbox'][1] + min_y_box['bbox'][3]) / 2
                        below_distance = abs(below_y_center - unused_y_max)
                        print(f"   • Group {unused_idx} distance to group {i} below: {below_distance:.1f}px")
                        break
                # 🎯 Pick the closer side
                if above_idx is not None and below_idx is not None:
                    # Both exist, pick the closer one
                    if above_distance < below_distance:
                        closest_used_idx = above_idx
                        merge_direction = "above"
                    else:
                        closest_used_idx = below_idx
                        merge_direction = "below"
                    print(f"   ✓ Group {unused_idx} merges into the group {merge_direction}: {closest_used_idx}")
                elif above_idx is not None:
                    closest_used_idx = above_idx
                    merge_direction = "above"
                elif below_idx is not None:
                    closest_used_idx = below_idx
                    merge_direction = "below"
                else:
                    print(f"   ⚠️ Group {unused_idx} has no used neighbour, skipping")
                    continue
                # 🔑 Find the HTML row mapped to the chosen group
                target_html_row = None
                for html_row_idx, group_indices in mapping.items():
                    if closest_used_idx in group_indices:
                        target_html_row = html_row_idx
                        break
                if target_html_row is not None:
                    # 🎯 Attach the unused group to that HTML row
                    if unused_idx not in mapping[target_html_row]:
                        mapping[target_html_row].append(unused_idx)
                        print(f"   • Group {unused_idx} merged into HTML row {target_html_row} (row {merge_direction})")
                    used_groups.add(unused_idx)
        # 🔑 Strategy 4: third pass - sort each row's group indices by y coordinate
        for row_idx in mapping:
            if mapping[row_idx]:
                mapping[row_idx].sort(key=lambda idx: grouped_boxes[idx]['y_center'])
        return mapping
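
    # Illustrative example (comment only, not executed): for a 3-row HTML table and
    # 4 paddle row groups, the returned mapping might look like {0: [0], 1: [1, 2], 2: [3]},
    # i.e. the second HTML row absorbed an extra group produced by wrapped text.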

    def _match_cell_sequential(self, cell_text: str,
                               boxes: List[Dict],
                               col_boundaries: List[Tuple[int, int]],
                               start_idx: int) -> Optional[Dict]:
        """
        🎯 Match a cell sequentially: starting from the given position, merge boxes until a match is found.

        Strategy:
        1. Find the first unused box
        2. Try an exact match on a single box
        3. If that fails, try merging several boxes

        Args:
            cell_text: HTML cell text
            boxes: candidate boxes (already sorted by x coordinate)
            col_boundaries: list of column boundaries
            start_idx: starting index

        Returns:
            {'bbox': [x1, y1, x2, y2], 'text': str, 'score': float,
             'paddle_indices': [idx1, idx2], 'used_boxes': [box1, box2],
             'last_used_index': int}
        """
        from fuzzywuzzy import fuzz
        cell_text_normalized = self.text_matcher.normalize_text(cell_text)
        if len(cell_text_normalized) < 2:
            return None
        # 🔑 Find the first unused box
        first_unused_idx = start_idx
        while first_unused_idx < len(boxes) and boxes[first_unused_idx].get('used'):
            first_unused_idx += 1
        if first_unused_idx >= len(boxes):
            return None
        # 🔑 Strategy 1: exact match on a single box
        for box in boxes[first_unused_idx:]:
            if box.get('used'):
                continue
            box_text = self.text_matcher.normalize_text(box['text'])
            if cell_text_normalized == box_text:
                return self._build_match_result([box], box['text'], 100.0, boxes.index(box))
        # 🔑 Strategy 2: merge several boxes and match
        unused_boxes = [b for b in boxes if not b.get('used')]
        # Merge the boxes of each column
        merged_bboxes = []
        for col_idx in range(len(col_boundaries)):
            combo_boxes = self._get_boxes_in_column(unused_boxes, col_boundaries, col_idx)
            if len(combo_boxes) > 0:
                sorted_combo = sorted(combo_boxes, key=lambda b: (b['bbox'][1], b['bbox'][0]))
                merged_text = ''.join([b['text'] for b in sorted_combo])
                merged_bboxes.append({
                    'text': merged_text,
                    'sorted_combo': sorted_combo
                })
        for box in merged_bboxes:
            # 1. Exact match
            merged_text_normalized = self.text_matcher.normalize_text(box['text'])
            if cell_text_normalized == merged_text_normalized:
                last_sort_idx = boxes.index(box['sorted_combo'][-1])
                return self._build_match_result(box['sorted_combo'], box['text'], 100.0, last_sort_idx)
            # 2. Substring match
            is_substring = (cell_text_normalized in merged_text_normalized or
                            merged_text_normalized in cell_text_normalized)
            # 3. Fuzzy match
            similarity = fuzz.partial_ratio(cell_text_normalized, merged_text_normalized)
            # 🎯 Substring match bonus
            if is_substring:
                similarity = min(100, similarity + 10)
            if similarity >= self.text_matcher.similarity_threshold:
                print(f"   ✓ Match: '{cell_text[:15]}' vs '{box['text'][:15]}' (similarity: {similarity})")
                # Report the index of the last box actually used so the caller's pointer advances past it
                last_sort_idx = boxes.index(box['sorted_combo'][-1])
                return self._build_match_result(box['sorted_combo'], box['text'], similarity, last_sort_idx)
        print(f"   ✗ No match: '{cell_text[:15]}'")
        return None

    def _build_match_result(self, boxes: List[Dict], text: str,
                            score: float, last_index: int) -> Dict:
        """Build a match result."""
        merged_bbox = [
            min(b['bbox'][0] for b in boxes),
            min(b['bbox'][1] for b in boxes),
            max(b['bbox'][2] for b in boxes),
            max(b['bbox'][3] for b in boxes)
        ]
        return {
            'bbox': merged_bbox,
            'text': text,
            'score': score,
            'paddle_indices': [b['paddle_bbox_index'] for b in boxes],
            'used_boxes': boxes,
            'last_used_index': last_index
        }
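

# ---------------------------------------------------------------------------
# Minimal usage sketch (assumptions, not part of the module's API): the data
# below is made up, and text_matcher is passed as None only because the helpers
# exercised here never touch it. A real pipeline would construct a TextMatcher
# (see text_matcher.py) and call process_mineru_data / process_dotsocr_data /
# process_paddleocr_vl_data with real OCR output.
if __name__ == '__main__':
    processor = DataProcessor(text_matcher=None)  # None is sufficient for the calls below

    # DotsOCR block -> MinerU item (hypothetical sample data)
    dots_item = {'category': 'Table', 'bbox': [10, 20, 400, 120], 'text': '<table>...</table>'}
    print(processor._convert_dotsocr_to_mineru(dots_item))
    # -> {'type': 'table', 'bbox': [10, 20, 400, 120], 'page_idx': 0, 'table_body': '<table>...</table>'}

    # Row grouping of paddle text boxes (hypothetical sample data)
    sample_boxes = [
        {'text': 'A', 'bbox': [10, 100, 60, 120]},
        {'text': 'B', 'bbox': [200, 102, 260, 122]},
        {'text': 'C', 'bbox': [10, 180, 60, 200]},
    ]
    rows = processor._group_paddle_boxes_by_rows(sample_boxes, y_tolerance=20)
    print([[b['text'] for b in g['boxes']] for g in rows])  # [['A', 'B'], ['C']]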