# table_line_generator.py
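"""
Table line generation based on OCR bounding boxes.

Automatically analyses the row/column structure of borderless tables and
generates the corresponding table lines.
"""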
import json
from collections import defaultdict
from pathlib import Path
from typing import List, Dict, Tuple, Optional, Union

import cv2
import numpy as np
from bs4 import BeautifulSoup
from PIL import Image, ImageDraw
  12. class TableLineGenerator:
  13. """表格线生成器"""
  14. def __init__(self, image: Union[str, Image.Image, None], ocr_data: Dict):
  15. """
  16. 初始化表格线生成器
  17. Args:
  18. image: 图片路径(str) 或 PIL.Image 对象,或 None(仅分析结构时)
  19. ocr_data: OCR识别结果(包含bbox)
  20. """
  21. if image is None:
  22. # 🆕 无图片模式:仅用于结构分析
  23. self.image_path = None
  24. self.image = None
  25. elif isinstance(image, str):
  26. self.image_path = image
  27. self.image = Image.open(image)
  28. elif isinstance(image, Image.Image):
  29. self.image_path = None
  30. self.image = image
  31. else:
  32. raise TypeError(
  33. f"image 参数必须是 str (路径)、PIL.Image.Image 对象或 None,"
  34. f"实际类型: {type(image)}"
  35. )
  36. self.ocr_data = ocr_data
  37. # 表格结构参数
  38. self.rows = []
  39. self.columns = []
  40. self.row_height = 0
  41. self.col_widths = []
  42. @staticmethod
  43. def parse_ocr_data(ocr_result: Dict, tool: str = "ppstructv3") -> Tuple[List[int], Dict]:
  44. """
  45. 统一的 OCR 数据解析接口(第一步:仅读取数据)
  46. Args:
  47. ocr_result: OCR 识别结果(完整 JSON)
  48. tool: 工具类型 ("ppstructv3" / "mineru")
  49. Returns:
  50. (table_bbox, ocr_data): 表格边界框和文本框列表
  51. """
  52. if tool.lower() == "mineru":
  53. return TableLineGenerator._parse_mineru_data(ocr_result)
  54. elif tool.lower() in ["ppstructv3", "ppstructure"]:
  55. return TableLineGenerator._parse_ppstructure_data(ocr_result)
  56. else:
  57. raise ValueError(f"不支持的工具类型: {tool}")
  58. @staticmethod
  59. def _parse_mineru_data(mineru_result: Union[Dict, List]) -> Tuple[List[int], Dict]:
  60. """
  61. 解析 MinerU 格式数据(仅提取数据,不分析结构)
  62. Args:
  63. mineru_result: MinerU 的完整 JSON 结果
  64. Returns:
  65. (table_bbox, ocr_data): 表格边界框和文本框列表
  66. """
  67. # 🔑 提取 table 数据
  68. table_data = _extract_table_data(mineru_result)
  69. if not table_data:
  70. raise ValueError("未找到 MinerU 格式的表格数据 (type='table')")
  71. # 验证必要字段
  72. if 'table_cells' not in table_data:
  73. raise ValueError("表格数据中未找到 table_cells 字段")
  74. table_cells = table_data['table_cells']
  75. if not table_cells:
  76. raise ValueError("table_cells 为空")
  77. # 🔑 优先使用 table_body 确定准确的行列数
  78. if 'table_body' in table_data:
  79. actual_rows, actual_cols = _parse_table_body_structure(table_data['table_body'])
  80. print(f"📋 从 table_body 解析: {actual_rows} 行 × {actual_cols} 列")
  81. else:
  82. # 回退:从 table_cells 推断
  83. actual_rows = max(cell.get('row', 0) for cell in table_cells if 'row' in cell)
  84. actual_cols = max(cell.get('col', 0) for cell in table_cells if 'col' in cell)
  85. print(f"📋 从 table_cells 推断: {actual_rows} 行 × {actual_cols} 列")
  86. if not table_data or 'table_cells' not in table_data:
  87. raise ValueError("未找到有效的 MinerU 表格数据")
  88. table_cells = table_data['table_cells']
  89. # 🔑 计算表格边界框
  90. all_bboxes = [cell['bbox'] for cell in table_cells if 'bbox' in cell]
  91. if all_bboxes:
  92. x_min = min(bbox[0] for bbox in all_bboxes)
  93. y_min = min(bbox[1] for bbox in all_bboxes)
  94. x_max = max(bbox[2] for bbox in all_bboxes)
  95. y_max = max(bbox[3] for bbox in all_bboxes)
  96. table_bbox = [x_min, y_min, x_max, y_max]
  97. else:
  98. table_bbox = table_data.get('bbox', [0, 0, 2000, 2000])
  99. # 按位置排序(从上到下,从左到右)
  100. table_cells.sort(key=lambda x: (x['bbox'][1], x['bbox'][0]))
  101. # 🔑 转换为统一的 ocr_data 格式
  102. ocr_data = {
  103. 'table_bbox': table_bbox,
  104. 'actual_rows': actual_rows,
  105. 'actual_cols': actual_cols,
  106. 'text_boxes': table_cells
  107. }
  108. print(f"📊 MinerU 数据解析完成: {len(table_cells)} 个文本框")
  109. return table_bbox, ocr_data
  110. @staticmethod
  111. def _parse_ppstructure_data(ocr_result: Dict) -> Tuple[List[int], Dict]:
  112. """
  113. 解析 PPStructure V3 格式数据
  114. Args:
  115. ocr_result: PPStructure V3 的完整 JSON 结果
  116. Returns:
  117. (table_bbox, ocr_data): 表格边界框和文本框列表
  118. """
  119. # 1. 从 parsing_res_list 中找到 table 区域
  120. table_bbox = None
  121. if 'parsing_res_list' in ocr_result:
  122. for block in ocr_result['parsing_res_list']:
  123. if block.get('block_label') == 'table':
  124. table_bbox = block.get('block_bbox')
  125. break
  126. if not table_bbox:
  127. raise ValueError("未找到表格区域 (block_label='table')")
  128. # 2. 从 overall_ocr_res 中提取文本框
  129. text_boxes = []
  130. if 'overall_ocr_res' in ocr_result:
  131. rec_boxes = ocr_result['overall_ocr_res'].get('rec_boxes', [])
  132. rec_texts = ocr_result['overall_ocr_res'].get('rec_texts', [])
  133. # 过滤出表格区域内的文本框
  134. for i, bbox in enumerate(rec_boxes):
  135. if len(bbox) >= 4:
  136. x1, y1, x2, y2 = bbox[:4]
  137. # 判断文本框是否在表格区域内
  138. if (x1 >= table_bbox[0] and y1 >= table_bbox[1] and
  139. x2 <= table_bbox[2] and y2 <= table_bbox[3]):
  140. text_boxes.append({
  141. 'bbox': [int(x1), int(y1), int(x2), int(y2)],
  142. 'text': rec_texts[i] if i < len(rec_texts) else ''
  143. })
  144. # 按位置排序
  145. text_boxes.sort(key=lambda x: (x['bbox'][1], x['bbox'][0]))
  146. print(f"📊 PPStructure 数据解析完成: {len(text_boxes)} 个文本框")
  147. ocr_data = {
  148. 'table_bbox': table_bbox,
  149. 'text_boxes': text_boxes
  150. }
  151. return table_bbox, ocr_data
  152. # ==================== 统一接口:第二步 - 分析结构 ====================
  153. def analyze_table_structure(self,
  154. y_tolerance: int = 5,
  155. x_tolerance: int = 10,
  156. min_row_height: int = 20,
  157. method: str = "auto",
  158. ) -> Dict:
  159. """
  160. 分析表格结构(支持多种算法)
  161. Args:
  162. y_tolerance: Y轴聚类容差(像素)
  163. x_tolerance: X轴聚类容差(像素)
  164. min_row_height: 最小行高(像素)
  165. method: 分析方法 ("auto" / "cluster" / "mineru")
  166. use_table_body: 是否使用 table_body(仅 mineru 方法有效)
  167. Returns:
  168. 表格结构信息
  169. """
  170. if not self.ocr_data:
  171. return {}
  172. # 🔑 自动选择方法
  173. if method == "auto":
  174. # 根据数据特征自动选择
  175. has_cell_index = any('row' in item and 'col' in item for item in self.ocr_data.get('text_boxes', []))
  176. method = "mineru" if has_cell_index else "cluster"
  177. print(f"🤖 自动选择分析方法: {method}")
  178. # 🔑 根据方法选择算法
  179. if method == "mineru":
  180. return self._analyze_by_cell_index()
  181. else:
  182. return self._analyze_by_clustering(y_tolerance, x_tolerance, min_row_height)
  183. def _analyze_by_cell_index(self) -> Dict:
  184. """
  185. 基于单元格的 row/col 索引分析(MinerU 专用)
  186. Returns:
  187. 表格结构信息
  188. """
  189. if not self.ocr_data:
  190. return {}
  191. # 🔑 确定实际行列数
  192. actual_rows = self.ocr_data.get('actual_rows', 0)
  193. actual_cols = self.ocr_data.get('actual_cols', 0)
  194. print(f"📋 检测到: {actual_rows} 行 × {actual_cols} 列")
  195. ocr_data = self.ocr_data.get('text_boxes', [])
  196. # 🔑 按行列索引分组单元格
  197. cells_by_row = {}
  198. cells_by_col = {}
  199. for item in ocr_data:
  200. if 'row' not in item or 'col' not in item:
  201. continue
  202. row = item['row']
  203. col = item['col']
  204. bbox = item['bbox']
  205. if row <= actual_rows and col <= actual_cols:
  206. if row not in cells_by_row:
  207. cells_by_row[row] = []
  208. cells_by_row[row].append(bbox)
  209. if col not in cells_by_col:
  210. cells_by_col[col] = []
  211. cells_by_col[col].append(bbox)
  212. # 🔑 计算每行的 y 边界
  213. row_boundaries = {}
  214. for row_num in range(1, actual_rows + 1):
  215. if row_num in cells_by_row:
  216. bboxes = cells_by_row[row_num]
  217. y_min = min(bbox[1] for bbox in bboxes)
  218. y_max = max(bbox[3] for bbox in bboxes)
  219. row_boundaries[row_num] = (y_min, y_max)
  220. # 🔑 计算横线(现在使用的是过滤后的数据)
  221. horizontal_lines = _calculate_horizontal_lines_with_spacing(row_boundaries)
  222. # 🔑 列边界计算(同样需要过滤异常值)
  223. col_boundaries = {}
  224. for col_num in range(1, actual_cols + 1):
  225. if col_num in cells_by_col:
  226. bboxes = cells_by_col[col_num]
  227. # 🎯 过滤 x 方向的异常值(使用 IQR)
  228. if len(bboxes) > 1:
  229. x_centers = [(bbox[0] + bbox[2]) / 2 for bbox in bboxes]
  230. x_center_q1 = np.percentile(x_centers, 25)
  231. x_center_q3 = np.percentile(x_centers, 75)
  232. x_center_iqr = x_center_q3 - x_center_q1
  233. x_center_median = np.median(x_centers)
  234. # 允许偏移 3 倍 IQR 或至少 100px
  235. x_threshold = max(3 * x_center_iqr, 100)
  236. valid_bboxes = [
  237. bbox for bbox in bboxes
  238. if abs((bbox[0] + bbox[2]) / 2 - x_center_median) <= x_threshold
  239. ]
  240. else:
  241. valid_bboxes = bboxes
  242. if valid_bboxes:
  243. x_min = min(bbox[0] for bbox in valid_bboxes)
  244. x_max = max(bbox[2] for bbox in valid_bboxes)
  245. col_boundaries[col_num] = (x_min, x_max)
  246. # 🔑 计算竖线
  247. vertical_lines = _calculate_vertical_lines_with_spacing(col_boundaries)
  248. # 🔑 生成行区间
  249. self.rows = []
  250. for row_num in sorted(row_boundaries.keys()):
  251. y_min, y_max = row_boundaries[row_num]
  252. self.rows.append({
  253. 'y_start': y_min,
  254. 'y_end': y_max,
  255. 'bboxes': cells_by_row.get(row_num, []),
  256. 'row_index': row_num
  257. })
  258. # 🔑 生成列区间
  259. self.columns = []
  260. for col_num in sorted(col_boundaries.keys()):
  261. x_min, x_max = col_boundaries[col_num]
  262. self.columns.append({
  263. 'x_start': x_min,
  264. 'x_end': x_max,
  265. 'col_index': col_num
  266. })
  267. # 计算行高和列宽
  268. self.row_height = int(np.median([r['y_end'] - r['y_start'] for r in self.rows])) if self.rows else 0
  269. self.col_widths = [c['x_end'] - c['x_start'] for c in self.columns]
  270. return {
  271. 'rows': self.rows,
  272. 'columns': self.columns,
  273. 'horizontal_lines': horizontal_lines,
  274. 'vertical_lines': vertical_lines,
  275. 'row_height': self.row_height,
  276. 'col_widths': self.col_widths,
  277. 'table_bbox': self._get_table_bbox(),
  278. 'total_rows': actual_rows,
  279. 'total_cols': actual_cols,
  280. 'mode': 'hybrid', # ✅ 添加 mode 字段
  281. 'modified_h_lines': [], # ✅ 添加修改记录字段
  282. 'modified_v_lines': [] # ✅ 添加修改记录字段
  283. }
  284. def _analyze_by_clustering(self, y_tolerance: int, x_tolerance: int, min_row_height: int) -> Dict:
  285. """
  286. 基于坐标聚类分析(通用方法)
  287. Args:
  288. y_tolerance: Y轴聚类容差
  289. x_tolerance: X轴聚类容差
  290. min_row_height: 最小行高
  291. Returns:
  292. 表格结构信息
  293. """
  294. if not self.ocr_data:
  295. return {}
  296. ocr_data = self.ocr_data.get('text_boxes', [])
  297. # 1. 提取所有bbox的Y坐标(用于行检测)
  298. y_coords = []
  299. for item in ocr_data:
  300. bbox = item.get('bbox', [])
  301. if len(bbox) >= 4:
  302. y1, y2 = bbox[1], bbox[3]
  303. y_coords.append((y1, y2, bbox))
  304. # 按Y坐标排序
  305. y_coords.sort(key=lambda x: x[0])
  306. # 2. 聚类检测行
  307. self.rows = self._cluster_rows(y_coords, y_tolerance, min_row_height)
  308. # 3. 计算标准行高
  309. row_heights = [row['y_end'] - row['y_start'] for row in self.rows]
  310. self.row_height = int(np.median(row_heights)) if row_heights else 30
  311. # 4. 提取所有bbox的X坐标(用于列检测)
  312. x_coords = []
  313. for item in ocr_data:
  314. bbox = item.get('bbox', [])
  315. if len(bbox) >= 4:
  316. x1, x2 = bbox[0], bbox[2]
  317. x_coords.append((x1, x2))
  318. # 5. 聚类检测列
  319. self.columns = self._cluster_columns(x_coords, x_tolerance)
  320. # 6. 计算列宽
  321. self.col_widths = [col['x_end'] - col['x_start'] for col in self.columns]
  322. # 7. 生成横线坐标
  323. horizontal_lines = []
  324. for row in self.rows:
  325. horizontal_lines.append(row['y_start'])
  326. if self.rows:
  327. horizontal_lines.append(self.rows[-1]['y_end'])
  328. # 8. 生成竖线坐标
  329. vertical_lines = []
  330. for col in self.columns:
  331. vertical_lines.append(col['x_start'])
  332. if self.columns:
  333. vertical_lines.append(self.columns[-1]['x_end'])
  334. return {
  335. 'rows': self.rows,
  336. 'columns': self.columns,
  337. 'horizontal_lines': horizontal_lines,
  338. 'vertical_lines': vertical_lines,
  339. 'row_height': self.row_height,
  340. 'col_widths': self.col_widths,
  341. 'table_bbox': self._get_table_bbox(),
  342. 'mode': 'fixed', # ✅ 添加 mode 字段
  343. 'modified_h_lines': [], # ✅ 添加修改记录字段
  344. 'modified_v_lines': [] # ✅ 添加修改记录字段
  345. }
  346. @staticmethod
  347. def parse_mineru_table_result(mineru_result: Union[Dict, List], use_table_body: bool = True) -> Tuple[List[int], Dict]:
  348. """
  349. [已弃用] 建议使用 parse_ocr_data() + analyze_table_structure()
  350. 保留此方法是为了向后兼容
  351. """
  352. import warnings
  353. warnings.warn(
  354. "parse_mineru_table_result() 已弃用,请使用 "
  355. "parse_ocr_data() + analyze_table_structure()",
  356. DeprecationWarning
  357. )
  358. raise NotImplementedError( "parse_mineru_table_result() 已弃用,请使用 " "parse_ocr_data() + analyze_table_structure()")
  359. @staticmethod
  360. def parse_ppstructure_result(ocr_result: Dict) -> Tuple[List[int], Dict]:
  361. """
  362. [推荐] 解析 PPStructure V3 的 OCR 结果
  363. 这是第一步操作,建议继续使用
  364. """
  365. return TableLineGenerator._parse_ppstructure_data(ocr_result)
  366. def _cluster_rows(self, y_coords: List[Tuple], tolerance: int, min_height: int) -> List[Dict]:
  367. """聚类检测行"""
  368. if not y_coords:
  369. return []
  370. rows = []
  371. current_row = {
  372. 'y_start': y_coords[0][0],
  373. 'y_end': y_coords[0][1],
  374. 'bboxes': [y_coords[0][2]]
  375. }
  376. for i in range(1, len(y_coords)):
  377. y1, y2, bbox = y_coords[i]
  378. if abs(y1 - current_row['y_start']) <= tolerance:
  379. current_row['y_start'] = min(current_row['y_start'], y1)
  380. current_row['y_end'] = max(current_row['y_end'], y2)
  381. current_row['bboxes'].append(bbox)
  382. else:
  383. if current_row['y_end'] - current_row['y_start'] >= min_height:
  384. rows.append(current_row)
  385. current_row = {
  386. 'y_start': y1,
  387. 'y_end': y2,
  388. 'bboxes': [bbox]
  389. }
  390. if current_row['y_end'] - current_row['y_start'] >= min_height:
  391. rows.append(current_row)
  392. return rows
  393. def _cluster_columns(self, x_coords: List[Tuple], tolerance: int) -> List[Dict]:
  394. """聚类检测列"""
  395. if not x_coords:
  396. return []
  397. all_x = []
  398. for x1, x2 in x_coords:
  399. all_x.append(x1)
  400. all_x.append(x2)
  401. all_x = sorted(set(all_x))
  402. columns = []
  403. current_x = all_x[0]
  404. for x in all_x[1:]:
  405. if x - current_x > tolerance:
  406. columns.append(current_x)
  407. current_x = x
  408. columns.append(current_x)
  409. column_regions = []
  410. for i in range(len(columns) - 1):
  411. column_regions.append({
  412. 'x_start': columns[i],
  413. 'x_end': columns[i + 1]
  414. })
  415. return column_regions
  416. def _get_table_bbox(self) -> List[int]:
  417. """获取表格整体边界框"""
  418. if not self.rows or not self.columns:
  419. return [0, 0, self.image.width, self.image.height]
  420. y_min = min(row['y_start'] for row in self.rows)
  421. y_max = max(row['y_end'] for row in self.rows)
  422. x_min = min(col['x_start'] for col in self.columns)
  423. x_max = max(col['x_end'] for col in self.columns)
  424. return [x_min, y_min, x_max, y_max]
  425. def generate_table_lines(self,
  426. line_color: Tuple[int, int, int] = (0, 0, 255),
  427. line_width: int = 2) -> Image.Image:
  428. """在原图上绘制表格线"""
  429. if self.image is None:
  430. raise ValueError(
  431. "无图片模式下不能调用 generate_table_lines(),"
  432. "请在初始化时提供图片"
  433. )
  434. img_with_lines = self.image.copy()
  435. draw = ImageDraw.Draw(img_with_lines)
  436. x_start = self.columns[0]['x_start'] if self.columns else 0
  437. x_end = self.columns[-1]['x_end'] if self.columns else img_with_lines.width
  438. y_start = self.rows[0]['y_start'] if self.rows else 0
  439. y_end = self.rows[-1]['y_end'] if self.rows else img_with_lines.height
  440. # 绘制横线
  441. for row in self.rows:
  442. y = row['y_start']
  443. draw.line([(x_start, y), (x_end, y)], fill=line_color, width=line_width)
  444. if self.rows:
  445. y = self.rows[-1]['y_end']
  446. draw.line([(x_start, y), (x_end, y)], fill=line_color, width=line_width)
  447. # 绘制竖线
  448. for col in self.columns:
  449. x = col['x_start']
  450. draw.line([(x, y_start), (x, y_end)], fill=line_color, width=line_width)
  451. if self.columns:
  452. x = self.columns[-1]['x_end']
  453. draw.line([(x, y_start), (x, y_end)], fill=line_color, width=line_width)
  454. return img_with_lines
  455. @staticmethod
  456. def analyze_structure_only(
  457. ocr_data: Dict,
  458. y_tolerance: int = 5,
  459. x_tolerance: int = 10,
  460. min_row_height: int = 20,
  461. method: str = "auto"
  462. ) -> Dict:
  463. """
  464. 仅分析表格结构(无需图片)
  465. Args:
  466. ocr_data: OCR识别结果
  467. y_tolerance: Y轴聚类容差(像素)
  468. x_tolerance: X轴聚类容差(像素)
  469. min_row_height: 最小行高(像素)
  470. method: 分析方法 ("auto" / "cluster" / "mineru")
  471. Returns:
  472. 表格结构信息
  473. """
  474. # 🔑 创建无图片模式的生成器
  475. temp_generator = TableLineGenerator(None, ocr_data)
  476. # 🔑 分析结构
  477. return temp_generator.analyze_table_structure(
  478. y_tolerance=y_tolerance,
  479. x_tolerance=x_tolerance,
  480. min_row_height=min_row_height,
  481. method=method
  482. )
  483. def _calculate_horizontal_lines_with_spacing(row_boundaries: Dict[int, Tuple[int, int]]) -> List[int]:
  484. """
  485. 计算横线位置(考虑行间距)
  486. Args:
  487. row_boundaries: {row_num: (y_min, y_max)}
  488. Returns:
  489. 横线 y 坐标列表
  490. """
  491. if not row_boundaries:
  492. return []
  493. sorted_rows = sorted(row_boundaries.items())
  494. # 🔑 分析相邻行之间的间隔
  495. gaps = []
  496. gap_info = [] # 保存详细信息用于调试
  497. for i in range(len(sorted_rows) - 1):
  498. row_num1, (y_min1, y_max1) = sorted_rows[i]
  499. row_num2, (y_min2, y_max2) = sorted_rows[i + 1]
  500. gap = y_min2 - y_max1 # 行间距(可能为负,表示重叠)
  501. gaps.append(gap)
  502. gap_info.append({
  503. 'row1': row_num1,
  504. 'row2': row_num2,
  505. 'gap': gap
  506. })
  507. print(f"📏 行间距详情:")
  508. for info in gap_info:
  509. status = "重叠" if info['gap'] < 0 else "正常"
  510. print(f" 行 {info['row1']} → {info['row2']}: {info['gap']:.1f}px ({status})")
  511. # 🔑 过滤掉负数 gap(重叠情况)和极小的 gap
  512. valid_gaps = [g for g in gaps if g > 2] # 至少 2px 间隔才算有效
  513. if valid_gaps:
  514. gap_median = np.median(valid_gaps)
  515. gap_std = np.std(valid_gaps)
  516. print(f"📏 行间距统计: 中位数={gap_median:.1f}px, 标准差={gap_std:.1f}px")
  517. print(f" 有效间隔数: {len(valid_gaps)}/{len(gaps)}")
  518. # 🔑 生成横线坐标(在相邻行中间)
  519. horizontal_lines = []
  520. for i, (row_num, (y_min, y_max)) in enumerate(sorted_rows):
  521. if i == 0:
  522. # 第一行的上边界
  523. horizontal_lines.append(y_min)
  524. if i < len(sorted_rows) - 1:
  525. next_row_num, (next_y_min, next_y_max) = sorted_rows[i + 1]
  526. gap = next_y_min - y_max
  527. if gap > 0:
  528. # 有间隔:在间隔中间画线
  529. # separator_y = int((y_max + next_y_min) / 2)
  530. # 有间隔:更靠近下一行的位置
  531. separator_y = int(next_y_min) - max(int(gap / 4), 2)
  532. horizontal_lines.append(separator_y)
  533. else:
  534. # 重叠或紧贴:在当前行的下边界画线
  535. separator_y = int(next_y_min) - max(int(gap / 4), 2)
  536. horizontal_lines.append(separator_y)
  537. else:
  538. # 最后一行的下边界
  539. horizontal_lines.append(y_max)
  540. return sorted(set(horizontal_lines))
  541. def _calculate_vertical_lines_with_spacing(col_boundaries: Dict[int, Tuple[int, int]]) -> List[int]:
  542. """
  543. 计算竖线位置(考虑列间距和重叠)
  544. Args:
  545. col_boundaries: {col_num: (x_min, x_max)}
  546. Returns:
  547. 竖线 x 坐标列表
  548. """
  549. if not col_boundaries:
  550. return []
  551. sorted_cols = sorted(col_boundaries.items())
  552. # 🔑 分析相邻列之间的间隔
  553. gaps = []
  554. gap_info = []
  555. for i in range(len(sorted_cols) - 1):
  556. col_num1, (x_min1, x_max1) = sorted_cols[i]
  557. col_num2, (x_min2, x_max2) = sorted_cols[i + 1]
  558. gap = x_min2 - x_max1 # 列间距(可能为负)
  559. gaps.append(gap)
  560. gap_info.append({
  561. 'col1': col_num1,
  562. 'col2': col_num2,
  563. 'gap': gap
  564. })
  565. print(f"📏 列间距详情:")
  566. for info in gap_info:
  567. status = "重叠" if info['gap'] < 0 else "正常"
  568. print(f" 列 {info['col1']} → {info['col2']}: {info['gap']:.1f}px ({status})")
  569. # 🔑 过滤掉负数 gap
  570. valid_gaps = [g for g in gaps if g > 2]
  571. if valid_gaps:
  572. gap_median = np.median(valid_gaps)
  573. gap_std = np.std(valid_gaps)
  574. print(f"📏 列间距统计: 中位数={gap_median:.1f}px, 标准差={gap_std:.1f}px")
  575. # 🔑 生成竖线坐标(在相邻列中间)
  576. vertical_lines = []
  577. for i, (col_num, (x_min, x_max)) in enumerate(sorted_cols):
  578. if i == 0:
  579. # 第一列的左边界
  580. vertical_lines.append(x_min)
  581. if i < len(sorted_cols) - 1:
  582. next_col_num, (next_x_min, next_x_max) = sorted_cols[i + 1]
  583. gap = next_x_min - x_max
  584. if gap > 0:
  585. # 有间隔:在间隔中间画线
  586. separator_x = int((x_max + next_x_min) / 2)
  587. vertical_lines.append(separator_x)
  588. else:
  589. # 重叠或紧贴:在当前列的右边界画线
  590. vertical_lines.append(x_max)
  591. else:
  592. # 最后一列的右边界
  593. vertical_lines.append(x_max)
  594. return sorted(set(vertical_lines))
  595. def _extract_table_data(mineru_result: Union[Dict, List]) -> Optional[Dict]:
  596. """提取 table 数据"""
  597. if isinstance(mineru_result, list):
  598. for item in mineru_result:
  599. if isinstance(item, dict) and item.get('type') == 'table':
  600. return item
  601. elif isinstance(mineru_result, dict):
  602. if mineru_result.get('type') == 'table':
  603. return mineru_result
  604. # 递归查找
  605. for value in mineru_result.values():
  606. if isinstance(value, dict) and value.get('type') == 'table':
  607. return value
  608. elif isinstance(value, list):
  609. result = _extract_table_data(value)
  610. if result:
  611. return result
  612. return None
  613. def _parse_table_body_structure(table_body: str) -> Tuple[int, int]:
  614. """从 table_body HTML 中解析准确的行列数"""
  615. try:
  616. soup = BeautifulSoup(table_body, 'html.parser')
  617. table = soup.find('table')
  618. if not table:
  619. raise ValueError("未找到 <table> 标签")
  620. rows = table.find_all('tr')
  621. if not rows:
  622. raise ValueError("未找到 <tr> 标签")
  623. num_rows = len(rows)
  624. first_row = rows[0]
  625. num_cols = len(first_row.find_all(['td', 'th']))
  626. return num_rows, num_cols
  627. except Exception as e:
  628. print(f"⚠️ 解析 table_body 失败: {e}")
  629. return 0, 0