ocr_validator_utils.py 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724
"""
Utility functions for the OCR validation tool.

Covers data processing, image processing and statistical analysis.
"""
  5. import json
  6. import pandas as pd
  7. import numpy as np
  8. from pathlib import Path
  9. from PIL import Image, ImageDraw
  10. from typing import Dict, List, Optional, Tuple, Union
  11. from io import StringIO, BytesIO
  12. import re
  13. from html import unescape
  14. import yaml
  15. import base64
  16. from urllib.parse import urlparse
  17. import os
  18. def load_config(config_path: str = "config.yaml") -> Dict:
  19. """加载配置文件"""
  20. try:
  21. with open(config_path, 'r', encoding='utf-8') as f:
  22. return yaml.safe_load(f)
  23. except Exception as e:
  24. # 返回默认配置
  25. return get_default_config()
  26. def get_default_config() -> Dict:
  27. """获取默认配置"""
  28. return {
  29. 'styles': {
  30. 'font_sizes': {'small': 10, 'medium': 12, 'large': 14, 'extra_large': 16},
  31. 'colors': {
  32. 'primary': '#0288d1', 'secondary': '#ff9800', 'success': '#4caf50',
  33. 'error': '#f44336', 'warning': '#ff9800', 'background': '#fafafa', 'text': '#333333'
  34. },
  35. 'layout': {'default_zoom': 1.0, 'default_height': 600, 'sidebar_width': 0.3, 'content_width': 0.7}
  36. },
  37. 'ui': {
  38. 'page_title': 'OCR可视化校验工具', 'page_icon': '🔍', 'layout': 'wide',
  39. 'sidebar_state': 'expanded', 'default_font_size': 'medium', 'default_layout': '标准布局'
  40. },
  41. 'paths': {
  42. 'ocr_out_dir': './sample_data', 'src_img_dir': './sample_data',
  43. 'supported_image_formats': ['.png', '.jpg', '.jpeg']
  44. },
  45. 'ocr': {
  46. 'min_text_length': 2, 'default_confidence': 1.0, 'exclude_texts': ['Picture', ''],
  47. 'tools': {
  48. 'dots_ocr': {
  49. 'name': 'Dots OCR', 'json_structure': 'array',
  50. 'text_field': 'text', 'bbox_field': 'bbox', 'category_field': 'category'
  51. },
  52. 'ppstructv3': {
  53. 'name': 'PPStructV3', 'json_structure': 'object', 'parsing_results_field': 'parsing_res_list',
  54. 'text_field': 'block_content', 'bbox_field': 'block_bbox', 'category_field': 'block_label'
  55. }
  56. },
  57. 'auto_detection': {
  58. 'enabled': True,
  59. 'rules': [
  60. {'field_exists': 'parsing_res_list', 'tool_type': 'ppstructv3'},
  61. {'json_is_array': True, 'tool_type': 'dots_ocr'}
  62. ]
  63. }
  64. }
  65. }
  66. def load_css_styles(css_path: str = "styles.css") -> str:
  67. """加载CSS样式文件"""
  68. try:
  69. with open(css_path, 'r', encoding='utf-8') as f:
  70. return f.read()
  71. except Exception:
  72. # 返回基本样式
  73. return """
  74. .main > div { background-color: white !important; color: #333333 !important; }
  75. .stApp { background-color: white !important; }
  76. .block-container { background-color: white !important; color: #333333 !important; }
  77. """
  78. def rotate_image_and_coordinates(
  79. image: Image.Image,
  80. angle: float,
  81. coordinates_list: List[List[int]],
  82. rotate_coordinates: bool = True
  83. ) -> Tuple[Image.Image, List[List[int]]]:
  84. """
  85. 根据角度旋转图像和坐标
  86. Args:
  87. image: 原始图像
  88. angle: 旋转角度(度数)
  89. coordinates_list: 坐标列表,每个坐标为[x1, y1, x2, y2]格式
  90. rotate_coordinates: 是否需要旋转坐标(针对不同OCR工具的处理方式)
  91. Returns:
  92. rotated_image: 旋转后的图像
  93. rotated_coordinates: 处理后的坐标列表
  94. """
  95. if angle == 0:
  96. return image, coordinates_list
  97. # 标准化旋转角度
  98. if angle == 270:
  99. rotation_angle = -90 # 顺时针90度
  100. elif angle == 90:
  101. rotation_angle = 90 # 逆时针90度
  102. elif angle == 180:
  103. rotation_angle = 180 # 180度
  104. else:
  105. rotation_angle = angle
  106. # 旋转图像
  107. rotated_image = image.rotate(rotation_angle, expand=True)
  108. # 如果不需要旋转坐标,直接返回原坐标
  109. if not rotate_coordinates:
  110. return rotated_image, coordinates_list
  111. # 获取原始和旋转后的图像尺寸
  112. orig_width, orig_height = image.size
  113. new_width, new_height = rotated_image.size
  114. # 计算旋转后的坐标
  115. rotated_coordinates = []
  116. for coord in coordinates_list:
  117. if len(coord) < 4:
  118. rotated_coordinates.append(coord)
  119. continue
  120. x1, y1, x2, y2 = coord[:4]
  121. # 验证原始坐标是否有效
  122. if x1 < 0 or y1 < 0 or x2 <= x1 or y2 <= y1:
  123. print(f"警告: 无效坐标 {coord}")
  124. rotated_coordinates.append([0, 0, 50, 50]) # 使用默认坐标
  125. continue
  126. # 根据旋转角度变换坐标
  127. if rotation_angle == -90: # 顺时针90度 (270度逆时针)
  128. # 变换公式: (x, y) -> (y, orig_width - x)
  129. new_x1 = y1
  130. new_y1 = orig_width - x2
  131. new_x2 = y2
  132. new_y2 = orig_width - x1
  133. elif rotation_angle == 90: # 逆时针90度
  134. # 变换公式: (x, y) -> (orig_height - y, x)
  135. new_x1 = orig_height - y2
  136. new_y1 = x1
  137. new_x2 = orig_height - y1
  138. new_y2 = x2
  139. elif rotation_angle == 180: # 180度
  140. # 变换公式: (x, y) -> (orig_width - x, orig_height - y)
  141. new_x1 = orig_width - x2
  142. new_y1 = orig_height - y2
  143. new_x2 = orig_width - x1
  144. new_y2 = orig_height - y1
  145. else:
  146. # 对于其他角度,使用通用的旋转矩阵
  147. center_x, center_y = orig_width / 2, orig_height / 2
  148. new_center_x, new_center_y = new_width / 2, new_height / 2
  149. angle_rad = np.radians(rotation_angle)
  150. cos_angle = np.cos(angle_rad)
  151. sin_angle = np.sin(angle_rad)
  152. # 旋转四个角点
  153. corners = [
  154. (x1 - center_x, y1 - center_y),
  155. (x2 - center_x, y1 - center_y),
  156. (x2 - center_x, y2 - center_y),
  157. (x1 - center_x, y2 - center_y)
  158. ]
  159. rotated_corners = []
  160. for x, y in corners:
  161. new_x = x * cos_angle - y * sin_angle
  162. new_y = x * sin_angle + y * cos_angle
  163. rotated_corners.append((new_x + new_center_x, new_y + new_center_y))
  164. # 计算边界框
  165. x_coords = [corner[0] for corner in rotated_corners]
  166. y_coords = [corner[1] for corner in rotated_corners]
  167. new_x1 = int(min(x_coords))
  168. new_y1 = int(min(y_coords))
  169. new_x2 = int(max(x_coords))
  170. new_y2 = int(max(y_coords))
  171. # 确保坐标在有效范围内
  172. new_x1 = max(0, min(new_width, new_x1))
  173. new_y1 = max(0, min(new_height, new_y1))
  174. new_x2 = max(0, min(new_width, new_x2))
  175. new_y2 = max(0, min(new_height, new_y2))
  176. # 确保x1 < x2, y1 < y2
  177. if new_x1 > new_x2:
  178. new_x1, new_x2 = new_x2, new_x1
  179. if new_y1 > new_y2:
  180. new_y1, new_y2 = new_y2, new_y1
  181. rotated_coordinates.append([new_x1, new_y1, new_x2, new_y2])
  182. return rotated_image, rotated_coordinates
  183. def detect_ocr_tool_type(data: Union[List, Dict], config: Dict) -> str:
  184. """自动检测OCR工具类型"""
  185. if not config['ocr']['auto_detection']['enabled']:
  186. return 'dots_ocr' # 默认类型
  187. rules = config['ocr']['auto_detection']['rules']
  188. for rule in rules:
  189. if 'field_exists' in rule:
  190. field_name = rule['field_exists']
  191. if isinstance(data, dict) and field_name in data:
  192. return rule['tool_type']
  193. if 'json_is_array' in rule:
  194. if rule['json_is_array'] and isinstance(data, list):
  195. return rule['tool_type']
  196. # 默认返回dots_ocr
  197. return 'dots_ocr'
  198. def parse_dots_ocr_data(data: List, config: Dict) -> List[Dict]:
  199. """解析Dots OCR格式的数据"""
  200. tool_config = config['ocr']['tools']['dots_ocr']
  201. parsed_data = []
  202. for item in data:
  203. if not isinstance(item, dict):
  204. continue
  205. # 提取字段
  206. text = item.get(tool_config['text_field'], '')
  207. bbox = item.get(tool_config['bbox_field'], [])
  208. category = item.get(tool_config['category_field'], 'Text')
  209. confidence = item.get(tool_config.get('confidence_field', 'confidence'),
  210. config['ocr']['default_confidence'])
  211. if text and bbox and len(bbox) >= 4:
  212. parsed_data.append({
  213. 'text': str(text).strip(),
  214. 'bbox': bbox[:4], # 确保只取前4个坐标
  215. 'category': category,
  216. 'confidence': confidence,
  217. 'source_tool': 'dots_ocr'
  218. })
  219. return parsed_data
  220. def parse_ppstructv3_data(data: Dict, config: Dict) -> List[Dict]:
  221. """解析PPStructV3格式的数据"""
  222. tool_config = config['ocr']['tools']['ppstructv3']
  223. parsed_data = []
  224. # 获取解析结果列表
  225. parsing_results_field = tool_config['parsing_results_field']
  226. if parsing_results_field not in data:
  227. return parsed_data
  228. parsing_results = data[parsing_results_field]
  229. if not isinstance(parsing_results, list):
  230. return parsed_data
  231. for item in parsing_results:
  232. if not isinstance(item, dict):
  233. continue
  234. # 提取字段
  235. text = item.get(tool_config['text_field'], '')
  236. bbox = item.get(tool_config['bbox_field'], [])
  237. category = item.get(tool_config['category_field'], 'text')
  238. confidence = item.get(tool_config.get('confidence_field', 'confidence'),
  239. config['ocr']['default_confidence'])
  240. if text and bbox and len(bbox) >= 4:
  241. parsed_data.append({
  242. 'text': str(text).strip(),
  243. 'bbox': bbox[:4], # 确保只取前4个坐标
  244. 'category': category,
  245. 'confidence': confidence,
  246. 'source_tool': 'ppstructv3'
  247. })
  248. # 如果有OCR文本识别结果,也添加进来
  249. if 'overall_ocr_res' in data:
  250. ocr_res = data['overall_ocr_res']
  251. if isinstance(ocr_res, dict) and 'rec_texts' in ocr_res and 'rec_boxes' in ocr_res:
  252. texts = ocr_res['rec_texts']
  253. boxes = ocr_res['rec_boxes']
  254. scores = ocr_res.get('rec_scores', [])
  255. for i, (text, box) in enumerate(zip(texts, boxes)):
  256. if text and len(box) >= 4:
  257. confidence = scores[i] if i < len(scores) else config['ocr']['default_confidence']
  258. parsed_data.append({
  259. 'text': str(text).strip(),
  260. 'bbox': box[:4],
  261. 'category': 'OCR_Text',
  262. 'confidence': confidence,
  263. 'source_tool': 'ppstructv3_ocr'
  264. })
  265. return parsed_data
  266. def normalize_ocr_data(raw_data: Union[List, Dict], config: Dict) -> List[Dict]:
  267. """统一不同OCR工具的数据格式"""
  268. # 自动检测OCR工具类型
  269. tool_type = detect_ocr_tool_type(raw_data, config)
  270. if tool_type == 'dots_ocr':
  271. return parse_dots_ocr_data(raw_data, config)
  272. elif tool_type == 'ppstructv3':
  273. return parse_ppstructv3_data(raw_data, config)
  274. else:
  275. raise ValueError(f"不支持的OCR工具类型: {tool_type}")
  276. def get_rotation_angle_from_ppstructv3(data: Dict) -> float:
  277. """从PPStructV3数据中获取旋转角度"""
  278. if 'doc_preprocessor_res' in data:
  279. doc_res = data['doc_preprocessor_res']
  280. if isinstance(doc_res, dict) and 'angle' in doc_res:
  281. return float(doc_res['angle'])
  282. return 0.0
  283. def process_markdown_images(md_content: str, json_path: str) -> str:
  284. """
  285. 处理Markdown中的图片引用,将本地图片转换为base64
  286. """
  287. import re
  288. # 匹配Markdown图片语法: ![alt](path)
  289. img_pattern = r'!\[([^\]]*)\]\(([^)]+)\)'
  290. def replace_image(match):
  291. alt_text = match.group(1)
  292. img_path = match.group(2)
  293. # 如果已经是base64或者网络链接,直接返回
  294. if img_path.startswith('data:image') or img_path.startswith('http'):
  295. return match.group(0)
  296. # 处理相对路径
  297. if not os.path.isabs(img_path):
  298. # 相对于JSON文件的路径
  299. json_dir = os.path.dirname(json_path)
  300. full_img_path = os.path.join(json_dir, img_path)
  301. else:
  302. full_img_path = img_path
  303. # 尝试转换为base64
  304. try:
  305. if os.path.exists(full_img_path):
  306. with open(full_img_path, 'rb') as img_file:
  307. img_data = img_file.read()
  308. # 获取文件扩展名确定MIME类型
  309. ext = os.path.splitext(full_img_path)[1].lower()
  310. mime_type = {
  311. '.png': 'image/png',
  312. '.jpg': 'image/jpeg',
  313. '.jpeg': 'image/jpeg',
  314. '.gif': 'image/gif',
  315. '.bmp': 'image/bmp',
  316. '.webp': 'image/webp'
  317. }.get(ext, 'image/jpeg')
  318. # 转换为base64
  319. img_base64 = base64.b64encode(img_data).decode('utf-8')
  320. data_url = f"data:{mime_type};base64,{img_base64}"
  321. return f'![{alt_text}]({data_url})'
  322. else:
  323. # 文件不存在,返回原始链接但添加警告
  324. return f'![{alt_text} (文件不存在)]({img_path})'
  325. except Exception as e:
  326. # 转换失败,返回原始链接
  327. return f'![{alt_text} (加载失败)]({img_path})'
  328. # 替换所有图片引用
  329. processed_content = re.sub(img_pattern, replace_image, md_content)
  330. return processed_content
  331. def load_ocr_data_file(json_path: str, config: Dict) -> Tuple[List, str, str]:
  332. """加载OCR相关数据文件"""
  333. json_file = Path(json_path)
  334. ocr_data = []
  335. md_content = ""
  336. image_path = ""
  337. # 加载JSON数据
  338. try:
  339. with open(json_file, 'r', encoding='utf-8') as f:
  340. raw_data = json.load(f)
  341. # 统一数据格式
  342. ocr_data = normalize_ocr_data(raw_data, config)
  343. # 检查是否需要处理图像旋转
  344. rotation_angle = 0.0
  345. if isinstance(raw_data, dict):
  346. rotation_angle = get_rotation_angle_from_ppstructv3(raw_data)
  347. # 如果有旋转角度,记录下来供后续图像处理使用
  348. if rotation_angle != 0:
  349. for item in ocr_data:
  350. item['rotation_angle'] = rotation_angle
  351. except Exception as e:
  352. raise Exception(f"加载JSON文件失败: {e}")
  353. # 加载MD文件
  354. md_file = json_file.with_suffix('.md')
  355. if md_file.exists():
  356. with open(md_file, 'r', encoding='utf-8') as f:
  357. raw_md_content = f.read()
  358. # 处理Markdown中的图片引用
  359. md_content = process_markdown_images(raw_md_content, str(json_file))
  360. # 推断图片路径
  361. image_name = json_file.stem
  362. src_img_dir = Path(config['paths']['src_img_dir'])
  363. image_candidates = []
  364. for ext in config['paths']['supported_image_formats']:
  365. image_candidates.extend([
  366. src_img_dir / f"{image_name}{ext}",
  367. json_file.parent / f"{image_name}{ext}",
  368. # 对于PPStructV3,可能图片名包含page信息 # 去掉page后缀的通用匹配
  369. src_img_dir / f"{image_name.split('_page_')[0]}{ext}" if '_page_' in image_name else None,
  370. ])
  371. # 移除None值
  372. image_candidates = [candidate for candidate in image_candidates if candidate is not None]
  373. for candidate in image_candidates:
  374. if candidate.exists():
  375. image_path = str(candidate)
  376. break
  377. return ocr_data, md_content, image_path
  378. def process_ocr_data(ocr_data: List, config: Dict) -> Dict[str, List]:
  379. """处理OCR数据,建立文本到bbox的映射"""
  380. text_bbox_mapping = {}
  381. exclude_texts = config['ocr']['exclude_texts']
  382. min_text_length = config['ocr']['min_text_length']
  383. if not isinstance(ocr_data, list):
  384. return text_bbox_mapping
  385. for i, item in enumerate(ocr_data):
  386. if not isinstance(item, dict):
  387. continue
  388. text = str(item['text']).strip()
  389. if text and text not in exclude_texts and len(text) >= min_text_length:
  390. bbox = item['bbox']
  391. if isinstance(bbox, list) and len(bbox) == 4:
  392. if text not in text_bbox_mapping:
  393. text_bbox_mapping[text] = []
  394. text_bbox_mapping[text].append({
  395. 'bbox': bbox,
  396. 'category': item.get('category', 'Text'),
  397. 'index': i,
  398. 'confidence': item.get('confidence', config['ocr']['default_confidence']),
  399. 'source_tool': item.get('source_tool', 'unknown'),
  400. 'rotation_angle': item.get('rotation_angle', 0.0) # 添加旋转角度信息
  401. })
  402. return text_bbox_mapping
  403. def find_available_ocr_files(ocr_out_dir: str) -> List[str]:
  404. """查找可用的OCR文件"""
  405. available_files = []
  406. # 搜索多个可能的目录
  407. search_dirs = [
  408. Path(ocr_out_dir),
  409. ]
  410. for search_dir in search_dirs:
  411. if search_dir.exists():
  412. # 递归搜索JSON文件
  413. for json_file in search_dir.rglob("*.json"):
  414. available_files.append(str(json_file))
  415. # 去重并排序
  416. available_files = sorted(list(set(available_files)))
  417. return available_files
  418. def get_ocr_tool_info(ocr_data: List) -> Dict:
  419. """获取OCR工具信息统计"""
  420. tool_counts = {}
  421. for item in ocr_data:
  422. if isinstance(item, dict):
  423. source_tool = item.get('source_tool', 'unknown')
  424. tool_counts[source_tool] = tool_counts.get(source_tool, 0) + 1
  425. return tool_counts
  426. def draw_bbox_on_image(image: Image.Image, bbox: List[int], color: str = "red", width: int = 3) -> Image.Image:
  427. """在图片上绘制bbox框"""
  428. img_copy = image.copy()
  429. draw = ImageDraw.Draw(img_copy)
  430. x1, y1, x2, y2 = bbox
  431. # 绘制矩形框
  432. draw.rectangle([x1, y1, x2, y2], outline=color, width=width)
  433. # 添加半透明填充
  434. overlay = Image.new('RGBA', img_copy.size, (0, 0, 0, 0))
  435. overlay_draw = ImageDraw.Draw(overlay)
  436. color_map = {
  437. "red": (255, 0, 0, 30),
  438. "blue": (0, 0, 255, 30),
  439. "green": (0, 255, 0, 30)
  440. }
  441. fill_color = color_map.get(color, (255, 255, 0, 30))
  442. overlay_draw.rectangle([x1, y1, x2, y2], fill=fill_color)
  443. img_copy = Image.alpha_composite(img_copy.convert('RGBA'), overlay).convert('RGB')
  444. return img_copy
  445. def get_ocr_statistics(ocr_data: List, text_bbox_mapping: Dict, marked_errors: set) -> Dict:
  446. """获取OCR数据统计信息"""
  447. if not isinstance(ocr_data, list) or not ocr_data:
  448. return {
  449. 'total_texts': 0, 'clickable_texts': 0, 'marked_errors': 0,
  450. 'categories': {}, 'accuracy_rate': 0, 'tool_info': {}
  451. }
  452. total_texts = len(ocr_data)
  453. clickable_texts = len(text_bbox_mapping)
  454. marked_errors_count = len(marked_errors)
  455. # 按类别统计
  456. categories = {}
  457. for item in ocr_data:
  458. if isinstance(item, dict):
  459. category = item.get('category', 'Unknown')
  460. categories[category] = categories.get(category, 0) + 1
  461. # OCR工具信息统计
  462. tool_info = get_ocr_tool_info(ocr_data)
  463. accuracy_rate = (clickable_texts - marked_errors_count) / clickable_texts * 100 if clickable_texts > 0 else 0
  464. return {
  465. 'total_texts': total_texts,
  466. 'clickable_texts': clickable_texts,
  467. 'marked_errors': marked_errors_count,
  468. 'categories': categories,
  469. 'accuracy_rate': accuracy_rate,
  470. 'tool_info': tool_info
  471. }
  472. def convert_html_table_to_markdown(content: str) -> str:
  473. """将HTML表格转换为Markdown表格格式"""
  474. def replace_table(match):
  475. table_html = match.group(0)
  476. # 提取所有行
  477. rows = re.findall(r'<tr>(.*?)</tr>', table_html, re.DOTALL | re.IGNORECASE)
  478. if not rows:
  479. return table_html
  480. markdown_rows = []
  481. for i, row in enumerate(rows):
  482. # 提取单元格
  483. cells = re.findall(r'<td[^>]*>(.*?)</td>', row, re.DOTALL | re.IGNORECASE)
  484. if cells:
  485. # 清理单元格内容
  486. clean_cells = []
  487. for cell in cells:
  488. cell_text = re.sub(r'<[^>]+>', '', cell).strip()
  489. cell_text = unescape(cell_text)
  490. clean_cells.append(cell_text)
  491. # 构建Markdown行
  492. markdown_row = '| ' + ' | '.join(clean_cells) + ' |'
  493. markdown_rows.append(markdown_row)
  494. # 在第一行后添加分隔符
  495. if i == 0:
  496. separator = '| ' + ' | '.join(['---'] * len(clean_cells)) + ' |'
  497. markdown_rows.append(separator)
  498. return '\n'.join(markdown_rows) if markdown_rows else table_html
  499. # 替换所有HTML表格
  500. converted = re.sub(r'<table[^>]*>.*?</table>', replace_table, content, flags=re.DOTALL | re.IGNORECASE)
  501. return converted
  502. def parse_html_tables(html_content: str) -> List[pd.DataFrame]:
  503. """解析HTML内容中的表格为DataFrame列表"""
  504. try:
  505. tables = pd.read_html(StringIO(html_content))
  506. return tables if tables else []
  507. except Exception:
  508. return []
  509. def create_dynamic_css(config: Dict, font_size_key: str, height: int) -> str:
  510. """根据配置动态创建CSS样式"""
  511. colors = config['styles']['colors']
  512. font_size = config['styles']['font_sizes'][font_size_key]
  513. return f"""
  514. <style>
  515. .dynamic-content {{
  516. height: {height}px;
  517. font-size: {font_size}px !important;
  518. line-height: 1.4;
  519. background-color: {colors['background']} !important;
  520. color: {colors['text']} !important;
  521. border: 1px solid #ddd;
  522. padding: 10px;
  523. border-radius: 5px;
  524. }}
  525. .highlight-selected {{
  526. background-color: {colors['success']} !important;
  527. color: white !important;
  528. }}
  529. .highlight-error {{
  530. background-color: {colors['error']} !important;
  531. color: white !important;
  532. }}
  533. </style>
  534. """
  535. def export_tables_to_excel(tables: List[pd.DataFrame], filename: str = "ocr_tables.xlsx") -> BytesIO:
  536. """导出表格数据到Excel"""
  537. output = BytesIO()
  538. with pd.ExcelWriter(output, engine='openpyxl') as writer:
  539. for i, table in enumerate(tables):
  540. table.to_excel(writer, sheet_name=f'Table_{i+1}', index=False)
  541. return output
  542. def get_table_statistics(tables: List[pd.DataFrame]) -> List[Dict]:
  543. """获取表格统计信息"""
  544. stats = []
  545. for i, table in enumerate(tables):
  546. numeric_cols = len(table.select_dtypes(include=[np.number]).columns)
  547. stats.append({
  548. 'table_index': i + 1,
  549. 'rows': len(table),
  550. 'columns': len(table.columns),
  551. 'numeric_columns': numeric_cols
  552. })
  553. return stats
  554. def group_texts_by_category(text_bbox_mapping: Dict[str, List]) -> Dict[str, List[str]]:
  555. """按类别对文本进行分组"""
  556. categories = {}
  557. for text, info_list in text_bbox_mapping.items():
  558. category = info_list[0]['category']
  559. if category not in categories:
  560. categories[category] = []
  561. categories[category].append(text)
  562. return categories
  563. def get_ocr_tool_rotation_config(ocr_data: List, config: Dict) -> Dict:
  564. """获取OCR工具的旋转配置"""
  565. if not ocr_data or not isinstance(ocr_data, list):
  566. # 默认配置
  567. return {
  568. 'coordinates_need_rotation': True,
  569. 'coordinates_are_pre_rotated': False
  570. }
  571. # 从第一个OCR数据项获取工具类型
  572. first_item = ocr_data[0] if ocr_data else {}
  573. source_tool = first_item.get('source_tool', 'dots_ocr')
  574. # 获取工具配置
  575. tools_config = config.get('ocr', {}).get('tools', {})
  576. if source_tool in tools_config:
  577. tool_config = tools_config[source_tool]
  578. return tool_config.get('rotation', {
  579. 'coordinates_are_pre_rotated': False
  580. })
  581. else:
  582. # 默认配置
  583. return {
  584. 'coordinates_are_pre_rotated': False
  585. }